@@ -10,8 +10,10 @@ If you don't have a Go toolchain, grab an executable from the Releases tab | |||
##### Project structure | |||
- _/common_: commonly used HTTP code | |||
- _/controller_: Manages workers (sends tasks, gets results, …) | |||
- _/common_: Commonly used HTTP code | |||
- _/data_: Data structures | |||
- _/db_: MongoDB connection | |||
- _/classic_: Extractor calling the HTML `/watch` API | |||
- _/watchapi_: Extractor calling the JSON `/watch` API | |||
@@ -0,0 +1,9 @@ | |||
package browseajax | |||
func GetPage(channelID string, page uint) error { | |||
root, err := GrabPage(channelID, page) | |||
if err != nil { return err } | |||
err = ParsePage(root) | |||
if err != nil { return err } | |||
return nil | |||
} |
@@ -0,0 +1,45 @@ | |||
package browseajax | |||
import ( | |||
"net/http" | |||
"github.com/terorie/yt-mango/common" | |||
"errors" | |||
"io/ioutil" | |||
"github.com/valyala/fastjson" | |||
) | |||
const mainURL = "https://www.youtube.com/browse_ajax?ctoken=" | |||
func GrabPage(channelID string, page uint) (*fastjson.Value, error) { | |||
// Generate page URL | |||
token := GenerateToken(channelID, uint64(page)) | |||
url := mainURL + token | |||
// Prepare request | |||
req, err := http.NewRequest("GET", url, nil) | |||
if err != nil { return nil, err } | |||
req.Header.Add("X-YouTube-Client-Name", "1") | |||
req.Header.Add("X-YouTube-Client-Version", "2.20180726") | |||
// Send request | |||
res, err := common.Client.Do(req) | |||
if err != nil { return nil, err } | |||
if res.StatusCode == 500 { | |||
defer res.Body.Close() | |||
buf, _ := ioutil.ReadAll(res.Body) | |||
println(string(buf)) | |||
} | |||
if res.StatusCode != 200 { return nil, errors.New("HTTP failure") } | |||
// Download response | |||
defer res.Body.Close() | |||
buf, err := ioutil.ReadAll(res.Body) | |||
if err != nil { return nil, err } | |||
// Parse JSON | |||
var p fastjson.Parser | |||
root, err := p.ParseBytes(buf) | |||
if err != nil { return nil, err } | |||
return root, nil | |||
} |
@@ -0,0 +1,60 @@ | |||
package browseajax | |||
import ( | |||
"github.com/valyala/fastjson" | |||
"errors" | |||
) | |||
var missingData = errors.New("missing data") | |||
func ParsePage(rootObj *fastjson.Value) error { | |||
// Root as array | |||
root, err := rootObj.Array() | |||
if err != nil { return err } | |||
// Find response container | |||
var container *fastjson.Value | |||
for _, item := range root { | |||
if item.Exists("response") { | |||
container = item | |||
break | |||
} | |||
} | |||
if container == nil { return missingData } | |||
// Get error obj | |||
// Get items from grid | |||
itemsObj := container.Get( | |||
"response", | |||
"continuationContents", | |||
"gridContinuation", | |||
"items", | |||
) | |||
if itemsObj == nil { return missingData } | |||
// Items as array | |||
items, err := itemsObj.Array() | |||
if err != nil { return err } | |||
// Enumerate | |||
for _, item := range items { | |||
// Find URL | |||
urlObj := item.Get( | |||
"gridVideoRenderer", | |||
"navigationEndpoint", | |||
"commandMetadata", | |||
"webCommandMetadata", | |||
"url", | |||
) | |||
if urlObj == nil { return missingData } | |||
// URL as string | |||
urlBytes, err := urlObj.StringBytes() | |||
if err != nil { return err } | |||
url := string(urlBytes) | |||
println(url) | |||
} | |||
return nil | |||
} |
@@ -0,0 +1,75 @@ | |||
package browseajax | |||
import ( | |||
"bytes" | |||
"strconv" | |||
"encoding/base64" | |||
) | |||
func GenerateToken(channelId string, page uint64) string { | |||
// Generate the inner token | |||
token := genInnerToken(page) | |||
// Build the inner object | |||
var inner bytes.Buffer | |||
// channelId | |||
inner.WriteByte(0x12) // type | |||
writeVarint(&inner, uint64(len(channelId))) // len | |||
inner.WriteString(channelId) // data | |||
// token | |||
inner.WriteByte(0x1a) // type | |||
writeVarint(&inner, uint64(len(token))) // len | |||
inner.WriteString(token) // data | |||
innerBytes := inner.Bytes() | |||
var root bytes.Buffer | |||
// innerBytes | |||
root.Write([]byte{0xe2, 0xa9, 0x85, 0xb2, 0x02}) // probably types | |||
writeVarint(&root, uint64(len(innerBytes))) | |||
root.Write(innerBytes) | |||
rootBytes := root.Bytes() | |||
return base64.URLEncoding.EncodeToString(rootBytes) | |||
} | |||
func genInnerToken(page uint64) string { | |||
var buf bytes.Buffer | |||
pageStr := strconv.FormatUint(page, 10) | |||
// Probably protobuf | |||
buf.Write([]byte{0x12, 0x06}) | |||
buf.WriteString("videos") | |||
buf.Write([]byte{ | |||
0x20, 0x00, 0x30, 0x01, 0x38, 0x01, 0x60, 0x01, | |||
0x6a, 0x00, 0x7a, | |||
}) | |||
// Write size-prefixed page string | |||
writeVarint(&buf, uint64(len(pageStr))) | |||
buf.WriteString(pageStr) | |||
buf.Write([]byte{0xb8, 0x01, 0x00}) | |||
return base64.URLEncoding.EncodeToString(buf.Bytes()) | |||
} | |||
// writeVarint appends n to buf as a base-128 varint: seven payload bits
// per byte, least-significant group first, with the high bit set on
// every byte except the last.
func writeVarint(buf *bytes.Buffer, n uint64) {
	// A uint64 needs at most ceil(64/7) = 10 groups.
	var scratch [10]byte
	used := 0
	for ; n >= 0x80; n >>= 7 {
		// More groups follow: keep the low 7 bits, set the continuation bit.
		scratch[used] = byte(n) | 0x80
		used++
	}
	scratch[used] = byte(n)
	used++
	buf.Write(scratch[:used])
}
@@ -4,17 +4,11 @@ import ( | |||
"net/http" | |||
"errors" | |||
"encoding/xml" | |||
"time" | |||
"github.com/PuerkitoBio/goquery" | |||
"github.com/terorie/yt-mango/data" | |||
"github.com/terorie/yt-mango/common" | |||
) | |||
// Package-private HTTP client and the transport it runs on.
var (
	// transport keeps at most 10 idle connections, each for up to 30 seconds.
	transport = http.Transport{
		MaxIdleConns:    10,
		IdleConnTimeout: 30 * time.Second,
	}

	// client sends every request through transport.
	client = http.Client{Transport: &transport}
)

// Endpoint prefixes; the video ID is appended to each.
const (
	mainURL     = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="
	subtitleURL = "https://video.google.com/timedtext?type=list&v="
)
@@ -24,7 +18,7 @@ func grab(v *data.Video) (doc *goquery.Document, err error) { | |||
if err != nil { return } | |||
requestHeader(&req.Header) | |||
res, err := client.Do(req) | |||
res, err := common.Client.Do(req) | |||
if err != nil { return } | |||
if res.StatusCode != 200 { return nil, errors.New("HTTP failure") } | |||
@@ -38,12 +32,9 @@ func grab(v *data.Video) (doc *goquery.Document, err error) { | |||
// Grabs and parses a subtitle list | |||
func grabSubtitleList(v *data.Video) (err error) { | |||
req, err := http.NewRequest("GET", subtitleURL + v.ID, nil) | |||
if err != nil { return err } | |||
requestHeader(&req.Header) | |||
res, err := client.Do(req) | |||
if err != nil { return err } | |||
if res.StatusCode != 200 { return errors.New("HTTP failure") } | |||
@@ -60,11 +51,3 @@ func grabSubtitleList(v *data.Video) (err error) { | |||
return | |||
} | |||
// requestHeader adds the default headers to h:
//   - "Accept-Language: en-US", or else the parser might break
//   - "User-Agent: youtube-mango/0.1"
//
// Values are appended with Add, so existing values for these keys are kept.
func requestHeader(h *http.Header) {
	defaults := [...][2]string{
		{"Accept-Language", "en-US"},
		{"User-Agent", "youtube-mango/0.1"},
	}
	for _, kv := range defaults {
		h.Add(kv[0], kv[1])
	}
}
@@ -2,4 +2,16 @@ package common | |||
import "net/http" | |||
var Client = http.Client{Transport: http.DefaultTransport} | |||
// transport is an http.RoundTripper that stamps the default headers on
// every outgoing request and delegates to http.DefaultTransport.
type transport struct{}

// RoundTrip sends r through http.DefaultTransport with these headers set:
//   - "Accept-Language: en-US", or else the parser might break
//   - "User-Agent: youtube-mango/0.1"
//
// The http.RoundTripper contract forbids modifying the caller's request,
// so the headers are written to a copy. Set is used instead of Add so a
// request never carries duplicate values for these headers.
func (t transport) RoundTrip(r *http.Request) (*http.Response, error) {
	// Shallow-copy the request, then deep-copy the header map so writes
	// below cannot reach the caller's request.
	out := new(http.Request)
	*out = *r
	out.Header = make(http.Header, len(r.Header)+2)
	for key, values := range r.Header {
		out.Header[key] = append([]string(nil), values...)
	}

	out.Header.Set("Accept-Language", "en-US")
	out.Header.Set("User-Agent", "youtube-mango/0.1")
	return http.DefaultTransport.RoundTrip(out)
}

// Client is the shared HTTP client; all requests pass through transport.
var Client = http.Client{Transport: transport{}}
@@ -0,0 +1,13 @@ | |||
package controller | |||
// Controller is a placeholder type; it has no fields yet.
type Controller struct{}

// NewController is currently a no-op stub.
func (c *Controller) NewController() {}

// Schedule is currently a no-op stub.
func (c *Controller) Schedule() {}
@@ -5,19 +5,35 @@ | |||
package main | |||
import ( | |||
"encoding/json" | |||
"github.com/terorie/yt-mango/data" | |||
"github.com/terorie/yt-mango/classic" | |||
"github.com/spf13/cobra" | |||
"fmt" | |||
"os" | |||
) | |||
const Version = "v0.1 -- dev" | |||
func printVersion(_ *cobra.Command, _ []string) { | |||
fmt.Println("YT-Mango archiver", Version) | |||
} | |||
// main is the program entry point.
// NOTE(review): this body appears to interleave statements from two
// revisions (a hard-coded classic.Get of a single video dumped as JSON,
// and the cobra command-line setup) — confirm which statements are live.
func main() { | |||
// Video to fetch, identified by a hard-coded ID.
v := data.Video{ID: "kj9mFK62c6E"} | |||
// Root command: carries only usage and help text, no Run function.
rootCmd := cobra.Command{ | |||
Use: "yt-mango", | |||
Short: "YT-Mango is a scalable video metadata archiver", | |||
Long: "YT-Mango is a scalable video metadata archiving utility\n" + | |||
"written by terorie with help from the-eye.eu", | |||
} | |||
// Fill v through the classic extractor; any failure aborts with a panic.
err := classic.Get(&v) | |||
if err != nil { panic(err) } | |||
// "version" subcommand, handled by printVersion.
versionCmd := cobra.Command{ | |||
Use: "version", | |||
Short: "Get the version number of yt-mango", | |||
Run: printVersion, | |||
} | |||
// Serialize v as tab-indented JSON.
jsn, err := json.MarshalIndent(v, "", "\t") | |||
if err != nil { panic(err) } | |||
rootCmd.AddCommand(&versionCmd) | |||
// The println builtin writes to standard error, not standard output.
println(string(jsn)) | |||
// Run the command line; on error, report it on stderr and exit with status 1.
if err := rootCmd.Execute(); err != nil { | |||
fmt.Fprintln(os.Stderr, err) | |||
os.Exit(1) | |||
} | |||
} | |||
@@ -0,0 +1,78 @@ | |||
package pretty | |||
import ( | |||
"bytes" | |||
) | |||
type (
	// Code is a single escape-sequence parameter, stored as its decimal text.
	Code string

	// Codes is a list of Code values applied together.
	Codes []Code
)

// Effect transforms a string, typically by decorating it.
type Effect interface {
	E(string) string
}

// nilEffect is the identity Effect: E returns its input unchanged.
type nilEffect struct{}

func (nilEffect) E(s string) string { return s }

// customEffect adapts a plain function to the Effect interface.
type customEffect func(string) string

func (fn customEffect) E(s string) string { return fn(s) }
const ( | |||
RESET = Code("0") | |||
BOLD = Code("1") | |||
DIM = Code("2") | |||
ITALIC = Code("3") | |||
UNDERL = Code("4") | |||
INV = Code("7") | |||
HIDDEN = Code("8") | |||
STRIKE = Code("9") | |||
BLACK = Code("30") | |||
RED = Code("31") | |||
GREEN = Code("32") | |||
YELLOW = Code("33") | |||
BLUE = Code("34") | |||
MGNTA = Code("35") | |||
CYAN = Code("36") | |||
WHITE = Code("37") | |||
HBLACK = Code("90") | |||
HRED = Code("91") | |||
HGREEN = Code("92") | |||
HYELLOW = Code("93") | |||
HBLUE = Code("94") | |||
HMGNTA = Code("95") | |||
HCYAN = Code("96") | |||
HWHITE = Code("97") | |||
) | |||
func Add(x... Code) Codes { | |||
return Codes(x) | |||
} | |||
func (c Code) E(x string) string { | |||
if !isTTY { return x } | |||
return "\x1b[" + string(c) + "m" + x + "\x1b[0m" | |||
} | |||
func (cs Codes) E(x string) string { | |||
if !isTTY { return x } | |||
var b bytes.Buffer | |||
b.WriteString("\x1b[") | |||
for _, c := range cs { | |||
b.WriteRune(';') | |||
b.WriteString(string(c)) | |||
} | |||
b.WriteRune('m') | |||
b.WriteString(x) | |||
b.WriteString("\x1b[0m") | |||
return b.String() | |||
} | |||
func Wrap(e Effect, wrapper string) Effect { | |||
if !isTTY { return nilEffect{} } | |||
return customEffect(func(s string) string { | |||
return e.E(wrapper[0:1]) + s + e.E(wrapper[1:2]) | |||
}) | |||
} |
@@ -0,0 +1,16 @@ | |||
package pretty | |||
import ( | |||
"os" | |||
"strings" | |||
"github.com/mattn/go-isatty" | |||
) | |||
var isTTY bool | |||
func init() { | |||
term := os.Getenv("TERM") | |||
isTTY = strings.HasPrefix(term, "xterm") || | |||
isatty.IsTerminal(os.Stdout.Fd()) | |||
} |
@@ -0,0 +1,5 @@ | |||
package version | |||
// Get returns the program's version string.
func Get() string {
	const current = "v0.1 -- dev"
	return current
}
@@ -0,0 +1,10 @@ | |||
package main | |||
import "github.com/spf13/cobra" | |||
var workCmd = cobra.Command{ | |||
Use: "work", | |||
Short: "Connect to a queue and start archiving", | |||
Long: "Get work from a Redis queue, start extracting metadata\n" + | |||
"and upload it to a Mongo database.", | |||
} |