
Clean up API and fix channel URL dump

branch: master
author: terorie, 1 year ago
commit: 43063ec2fe
10 changed files with 212 additions and 150 deletions
1. api/escape.go (+1 -1)
2. api/markdown.go (+1 -1)
3. apiclassic/grab.go (+3 -3)
4. apiclassic/parsedescription.go (+3 -3)
5. cmd/channel.go (+0 -2)
6. cmd/channeldump.go (+125 -102)
7. common/httpasync.go (+0 -35)
8. main.go (+2 -2)
9. net/asynchttp.go (+76 -0)
10. net/transport.go (+1 -1)

common/escape.go → api/escape.go (+1 -1)

@@ -1,4 +1,4 @@
-package common
+package api
 
 import "bytes"

common/markdown.go → api/markdown.go (+1 -1)

@@ -1,4 +1,4 @@
-package common
+package api
 
 var MarkdownTextEscape EscapeMap
 var MarkdownLinkEscape EscapeMap

apiclassic/grab.go (+3 -3)

@@ -5,7 +5,7 @@ import (
     "errors"
     "encoding/xml"
     "github.com/PuerkitoBio/goquery"
-    "github.com/terorie/yt-mango/common"
+    "github.com/terorie/yt-mango/net"
 )
 
 const mainURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="
@@ -17,7 +17,7 @@ func GrabVideo(videoID string) (doc *goquery.Document, err error) {
     if err != nil { return }
     setHeaders(&req.Header)
 
-    res, err := common.Client.Do(req)
+    res, err := net.Client.Do(req)
     if err != nil { return }
     if res.StatusCode != 200 { return nil, errors.New("HTTP failure") }
 
@@ -34,7 +34,7 @@ func GrabSubtitleList(videoID string) (tracks *XMLSubTrackList, err error) {
     if err != nil { return }
     setHeaders(&req.Header)
 
-    res, err := common.Client.Do(req)
+    res, err := net.Client.Do(req)
     if err != nil { return }
     if res.StatusCode != 200 { return nil, errors.New("HTTP failure") }

apiclassic/parsedescription.go (+3 -3)

@@ -4,7 +4,7 @@ import (
     "errors"
     "golang.org/x/net/html"
     "bytes"
-    "github.com/terorie/yt-mango/common"
+    "github.com/terorie/yt-mango/net"
     "strings"
 )
 
@@ -24,7 +24,7 @@ func (p *parseInfo) parseDescription() error {
     case html.TextNode:
         // FIXME: "&amp;lt;" gets parsed to => "<"
         // Write text to buffer, escaping markdown
-        err := common.MarkdownTextEscape.ToBuffer(c.Data, &buffer)
+        err := net.MarkdownTextEscape.ToBuffer(c.Data, &buffer)
         if err != nil { return err }
     case html.ElementNode:
         switch c.Data {
@@ -70,7 +70,7 @@ func parseLink(c *html.Node, dest *bytes.Buffer) error {
     link, err := decodeLink(attr.Val)
     if err != nil { return err }
     // Escape to markdown
-    link, err = common.MarkdownLinkEscape.ToString(link)
+    link, err = net.MarkdownLinkEscape.ToString(link)
     if err != nil { return err }
     // Write to buffer
     dest.WriteString(fmt.Sprintf("[%s](%s)\n", text, link))

cmd/channel.go (+0 -2)

@@ -5,7 +5,6 @@ import (
 )
 
 var force bool
-var offset uint
 
 var Channel = cobra.Command{
     Use: "channel",
@@ -14,6 +13,5 @@ var Channel = cobra.Command{
 
 func init() {
     channelDumpCmd.Flags().BoolVarP(&force, "force", "f", false, "Overwrite the output file if it already exists")
-    channelDumpCmd.Flags().UintVar(&offset, "page-offset", 1, "Start getting videos at this page. (A page is usually 30 videos)")
     Channel.AddCommand(&channelDumpCmd)
 }

cmd/channeldump.go (+125 -102)

@@ -8,148 +8,171 @@ import (
     "log"
     "github.com/terorie/yt-mango/api"
     "fmt"
-    "github.com/terorie/yt-mango/common"
+    "github.com/terorie/yt-mango/net"
     "sync/atomic"
     "errors"
+    "sync"
 )
 
-var channelDumpContext = struct{
+var offset uint
+
+func init() {
+    channelDumpCmd.Flags().UintVar(&offset, "page-offset", 1, "Start getting videos at this page. (A page is usually 30 videos)")
+}
+
+// The shared context of the request and response threads
+var channelDumpContext = struct {
     startTime time.Time
     printResults bool
     writer *bufio.Writer
-    pagesDone uint64
-    errorOccured int32 // Use atomic boolean here
+    // Number of pages that have been
+    // requested but not yet received.
+    // An additional +1 is added if more
+    // pages are planned to be requested.
+    pagesToReceive sync.WaitGroup
+    // If set to non-zero, an error was received
+    errorOccurred int32
 }{}
 
 // The channel dump route lists
 var channelDumpCmd = cobra.Command{
     Use: "dumpurls <channel ID> [file]",
     Short: "Get all public video URLs from channel",
     Long: "Write all videos URLs of a channel to a file",
     Args: cobra.RangeArgs(1, 2),
-    Run: func(cmd *cobra.Command, args []string) {
-        printResults := false
-        fileName := ""
-        channelID := args[0]
-        if len(args) != 2 {
-            printResults = true
-        } else {
-            fileName = args[1]
-        }
-        channelDumpContext.printResults = printResults
-
-        channelID, err := api.GetChannelID(channelID)
-        if err != nil {
-            log.Print(err)
-            os.Exit(1)
-        }
-
-        log.Printf("Starting work on channel ID \"%s\".", channelID)
-        channelDumpContext.startTime = time.Now()
-
-        var flags int
-        if force {
-            flags = os.O_WRONLY | os.O_CREATE | os.O_TRUNC
-        } else {
-            flags = os.O_WRONLY | os.O_CREATE | os.O_EXCL
-        }
-
-        var file *os.File
-
-        if !printResults {
-            var err error
-            file, err = os.OpenFile(fileName, flags, 0640)
-            if err != nil {
-                log.Fatal(err)
-                os.Exit(1)
-            }
-            defer file.Close()
-
-            writer := bufio.NewWriter(file)
-            defer writer.Flush()
-            channelDumpContext.writer = writer
-        }
-
-        results := make(chan common.JobResult)
-        terminateSub := make(chan bool)
-
-        // TODO Clean up
-        go processResults(results, terminateSub)
-
-        page := offset
-        for {
-            // Terminate if error detected
-            if atomic.LoadInt32(&channelDumpContext.errorOccured) != 0 {
-                goto terminate
-            }
-            // Send new requests
-            req := api.Main.GrabChannelPage(channelID, page)
-            common.DoAsyncHTTP(req, results, page)
-
-            page++
-        }
-    terminate:
-
-        // Requests sent, wait for remaining requests to finish
-        for {
-            done := uint64(offset) + atomic.LoadUint64(&channelDumpContext.pagesDone)
-            target := uint64(page)
-            if done >= target { break }
-
-            // TODO use semaphore
-            time.Sleep(time.Millisecond)
-        }
-
-        // TODO Don't ignore pending results
-        duration := time.Since(channelDumpContext.startTime)
-        log.Printf("Done in %s.", duration.String())
-
-        terminateSub <- true
-    },
+    Run: doChannelDump,
 }
 
-// TODO combine channels into one
-func processResults(results chan common.JobResult, terminateSub chan bool) {
+func doChannelDump(_ *cobra.Command, args []string) {
+    if offset == 0 { offset = 1 }
+
+    printResults := false
+    fileName := ""
+    channelID := args[0]
+    if len(args) != 2 {
+        printResults = true
+    } else {
+        fileName = args[1]
+    }
+    channelDumpContext.printResults = printResults
+
+    channelID, err := api.GetChannelID(channelID)
+    if err != nil {
+        log.Print(err)
+        os.Exit(1)
+    }
+
+    log.Printf("Starting work on channel ID \"%s\".", channelID)
+    channelDumpContext.startTime = time.Now()
+
+    var flags int
+    if force {
+        flags = os.O_WRONLY | os.O_CREATE | os.O_TRUNC
+    } else {
+        flags = os.O_WRONLY | os.O_CREATE | os.O_EXCL
+    }
+    var file *os.File
+
+    if !printResults {
+        var err error
+        file, err = os.OpenFile(fileName, flags, 0640)
+        if err != nil {
+            log.Fatal(err)
+            os.Exit(1)
+        }
+        defer file.Close()
+
+        writer := bufio.NewWriter(file)
+        defer writer.Flush()
+        channelDumpContext.writer = writer
+    }
+
+    results := make(chan net.JobResult)
+    terminateSub := make(chan bool)
+
+    // TODO Clean up
+    go channelDumpResults(results, terminateSub)
+
+    page := offset
+    for {
+        // Terminate if error detected
+        if atomic.LoadInt32(&channelDumpContext.errorOccurred) != 0 {
+            goto terminate
+        }
+        // Send new requests
+        req := api.Main.GrabChannelPage(channelID, page)
+        channelDumpContext.pagesToReceive.Add(1)
+        net.DoAsyncHTTP(req, results, page)
+
+        page++
+    }
+terminate:
+
+    // Requests sent, wait for remaining requests to finish
+    channelDumpContext.pagesToReceive.Wait()
+
+    terminateSub <- true
+}
+
+// Helper goroutine that processes HTTP results.
+// HTTP results are received on "results".
+// The routine exits if a value on "terminateSub" is received.
+// For every incoming result (error or response),
+// the "pagesToReceive" counter is decreased.
+// If an error is received, the "errorOccurred" flag is set.
+func channelDumpResults(results chan net.JobResult, terminateSub chan bool) {
     totalURLs := 0
     for {
         select {
         case <-terminateSub:
-            log.Printf("Got %d URLs", totalURLs)
+            duration := time.Since(channelDumpContext.startTime)
+            log.Printf("Got %d URLs in %s.", totalURLs, duration.String())
             os.Exit(0)
+            return
         case res := <-results:
-            var err error
-            var channelURLs []string
-            page := res.ReqData.(uint)
-            if res.Err != nil {
-                err = res.Err
-                goto endError
-            }
-            channelURLs, err = api.Main.ParseChannelVideoURLs(res.Res)
-            if err != nil { goto endError }
-            if len(channelURLs) == 0 {
-                err = errors.New("returned no videos")
-                goto endError
-            }
-            totalURLs += len(channelURLs)
-            log.Printf("Received page %d: %d videos.", page, len(channelURLs))
-
-            if channelDumpContext.printResults {
-                for _, _url := range channelURLs {
-                    fmt.Println(_url)
-                }
+            page, numURLs, err := channelDumpResult(&res)
+            // Mark page as processed
+            channelDumpContext.pagesToReceive.Done()
+            // Report back error
+            if err != nil {
+                atomic.StoreInt32(&channelDumpContext.errorOccurred, 1)
+                log.Printf("Error at page %d: %v", page, err)
             } else {
-                for _, _url := range channelURLs {
-                    _, err := channelDumpContext.writer.WriteString(_url + "\n")
-                    if err != nil { panic(err) }
-                }
+                totalURLs += numURLs
             }
-            // Increment done pages count
-            atomic.AddUint64(&channelDumpContext.pagesDone, 1)
-            continue
-        endError:
-            atomic.AddUint64(&channelDumpContext.pagesDone, 1)
-            atomic.StoreInt32(&channelDumpContext.errorOccured, 1)
-            log.Printf("Error at page %d: %v", page, err)
         }
     }
 }
+
+// Processes an HTTP result
+func channelDumpResult(res *net.JobResult) (page uint, numURLs int, err error) {
+    var channelURLs []string
+
+    // Extra data is page number
+    page = res.ReqData.(uint)
+    // Abort if request failed
+    if res.Err != nil { return page, 0, res.Err }
+
+    // Parse response
+    channelURLs, err = api.Main.ParseChannelVideoURLs(res.Res)
+    if err != nil { return }
+    numURLs = len(channelURLs)
+    if numURLs == 0 { return page, 0, errors.New("returned no videos") }
+
+    // Print results
+    log.Printf("Received page %d: %d videos.", page, numURLs)
+
+    if channelDumpContext.printResults {
+        for _, _url := range channelURLs {
+            fmt.Println(_url)
+        }
+    } else {
+        for _, _url := range channelURLs {
+            _, err := channelDumpContext.writer.WriteString(_url + "\n")
+            if err != nil { panic(err) }
+        }
+    }
+
+    return
+}
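
The rewrite above replaces the old busy-wait on pagesDone with a sync.WaitGroup and an atomic error flag. Reduced to a standalone program, the pattern looks like this; fetchPage is a hypothetical stand-in for GrabChannelPage plus the async HTTP round-trip, not code from the repo:

    package main

    import (
        "errors"
        "fmt"
        "sync"
        "sync/atomic"
    )

    // Hypothetical stand-in for GrabChannelPage + DoAsyncHTTP:
    // pretend pages 1-3 exist and every later page comes back empty.
    func fetchPage(page uint) error {
        if page > 3 {
            return errors.New("returned no videos")
        }
        return nil
    }

    func main() {
        var pagesToReceive sync.WaitGroup // pages requested but not yet answered
        var errorOccurred int32           // atomic flag, as in channeldump.go

        for page := uint(1); ; page++ {
            // Stop requesting new pages once any page has failed
            if atomic.LoadInt32(&errorOccurred) != 0 {
                break
            }
            pagesToReceive.Add(1)
            go func(p uint) {
                defer pagesToReceive.Done() // always mark the page as processed
                if err := fetchPage(p); err != nil {
                    atomic.StoreInt32(&errorOccurred, 1)
                }
            }(page)
        }

        // Pages already in flight still get their answer
        pagesToReceive.Wait()
        fmt.Println("all in-flight pages accounted for")
    }

In the real command the request loop is additionally throttled by net.DoAsyncHTTP itself, which blocks while all workers are busy; without that backpressure the sketch above keeps requesting well past the first failing page before the flag is noticed.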

common/httpasync.go (+0 -35)

@@ -1,35 +0,0 @@
-package common
-
-import "net/http"
-
-type JobResult struct {
-    Res *http.Response
-    Err error
-    ReqData interface{} // job.data
-}
-
-type job struct {
-    req *http.Request
-    c chan JobResult
-    data interface{}
-}
-
-var jobs = make(chan job)
-
-func InitAsyncHTTP(nWorkers uint) {
-    for i := uint(0); i < nWorkers; i++ {
-        go asyncHTTPWorker()
-    }
-}
-
-func DoAsyncHTTP(r *http.Request, c chan JobResult, data interface{}) {
-    jobs <- job{r, c, data}
-}
-
-func asyncHTTPWorker() {
-    for {
-        job := <-jobs
-        res, err := Client.Do(job.req)
-        job.c <- JobResult{res, err, job.data}
-    }
-}

main.go (+2 -2)

@@ -10,7 +10,7 @@ import (
     "github.com/terorie/yt-mango/cmd"
     "log"
     "github.com/terorie/yt-mango/api"
-    "github.com/terorie/yt-mango/common"
+    "github.com/terorie/yt-mango/net"
 )
 
 const Version = "v0.1 -- dev"
@@ -35,7 +35,7 @@ func main() {
         }
     },
     PersistentPreRun: func(cmd *cobra.Command, args []string) {
-        common.InitAsyncHTTP(concurrentRequests)
+        net.MaxWorkers = uint32(concurrentRequests)
 
         switch forceAPI {
         case "": api.Main = &api.TempAPI
net/asynchttp.go (+76 -0)

@@ -0,0 +1,76 @@
+package net
+
+import (
+    "net/http"
+    "sync/atomic"
+    "time"
+)
+
+// Max number of HTTP workers
+var MaxWorkers uint32 = 4
+// Current number of HTTP workers.
+// Atomic variable, don't use directly
+var activeWorkers int32
+
+// Kill a worker routine if it
+// doesn't get any jobs after "timeOut"
+const timeOut = 10 * time.Second
+
+// Result of the HTTP request
+type JobResult struct {
+    // HTTP Response (can be nil)
+    Res *http.Response
+    // HTTP error (can be nil)
+    Err error
+    // data parameter from DoAsyncHTTP
+    ReqData interface{} // job.data
+}
+
+type job struct {
+    req *http.Request
+    c chan JobResult
+    data interface{}
+}
+
+// Job queue
+var jobs = make(chan job)
+
+// Enqueue a new HTTP request and send the result to "c" (send to "c" guaranteed).
+// Additional data like an ID can be passed in "data" to be returned with "c".
+func DoAsyncHTTP(r *http.Request, c chan JobResult, data interface{}) {
+    newJob := job{r, c, data}
+    select {
+    // Try to send to the channel and
+    // see if an idle worker picks the job up
+    case jobs <- newJob:
+        break
+
+    // Every routine is busy
+    default:
+        if atomic.LoadInt32(&activeWorkers) < int32(MaxWorkers) {
+            // Another thread is allowed to spawn
+            // TODO Race condition here: DoAsyncHTTP is not thread safe!
+            atomic.AddInt32(&activeWorkers, 1)
+            go asyncHTTPWorker()
+        }
+        // Block until another routine finishes
+        jobs <- newJob
+    }
+}
+
+// Routine that continually reads requests from "jobs"
+// and quits if it doesn't find any jobs for some time
+func asyncHTTPWorker() {
+    for {
+        select {
+        // Get a new job from the queue and process it
+        case job := <-jobs:
+            res, err := Client.Do(job.req)
+            job.c <- JobResult{res, err, job.data}
+        // Timeout, kill the routine
+        case <-time.After(timeOut):
+            atomic.AddInt32(&activeWorkers, -1)
+            return
+        }
+    }
+}
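
The TODO in DoAsyncHTTP is worth spelling out: between the atomic.LoadInt32 check and the atomic.AddInt32, two concurrent callers can both observe activeWorkers below the limit and both spawn, overshooting MaxWorkers. One conventional fix is a compare-and-swap loop that claims the slot atomically; this is only a sketch against the names in this file, not part of the commit:

    // Hypothetical replacement for the check-then-add block in DoAsyncHTTP.
    // CompareAndSwapInt32 claims a worker slot atomically, so two concurrent
    // callers can no longer both pass the check and overshoot MaxWorkers.
    func trySpawnWorker() {
        for {
            n := atomic.LoadInt32(&activeWorkers)
            if n >= int32(MaxWorkers) {
                return // pool is already full
            }
            if atomic.CompareAndSwapInt32(&activeWorkers, n, n+1) {
                go asyncHTTPWorker()
                return // slot claimed, worker spawned
            }
            // Lost the race against another caller: re-read and retry
        }
    }

As written in the commit, the overshoot is also self-healing: surplus workers idle for timeOut and then exit, so the race causes a temporary burst of goroutines rather than a leak.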

common/http.go → net/transport.go (+1 -1)

@@ -1,4 +1,4 @@
-package common
+package net
 
 import "net/http"

