@@ -0,0 +1,5 @@ | |||
# IntelliJ | |||
.idea/
# Apple | |||
.DS_Store
@@ -0,0 +1,8 @@ | |||
package classic | |||
// XMLSubTrackList is the decoding target for the subtitle track list XML:
// every "track" element of the document becomes one entry in Tracks.
type XMLSubTrackList struct {
	// Tracks collects the "track" child elements in document order.
	Tracks []struct {
		// LangCode is read from the "lang_code" attribute.
		LangCode string `xml:"lang_code,attr"`
		// Lang is read from the "lang_translated" attribute.
		Lang string `xml:"lang_translated,attr"`
	} `xml:"track"`
}
@@ -0,0 +1,21 @@ | |||
package classic | |||
import ( | |||
"github.com/terorie/youtube-mango/data" | |||
"errors" | |||
) | |||
func Get(v *data.Video) error { | |||
if len(v.ID) == 0 { return errors.New("no video ID") } | |||
// Download the doc tree | |||
doc, err := grab(v) | |||
if err != nil { return err } | |||
// Parse it | |||
p := parseInfo{v, doc} | |||
err = p.parse() | |||
if err != nil { return err } | |||
return nil | |||
} |
@@ -0,0 +1,70 @@ | |||
package classic | |||
import ( | |||
"net/http" | |||
"errors" | |||
"encoding/xml" | |||
"time" | |||
"github.com/PuerkitoBio/goquery" | |||
"github.com/terorie/youtube-mango/data" | |||
) | |||
// transport is the shared connection pool used by client. It keeps at most
// 10 idle connections and drops them after 30 seconds of inactivity.
var transport = http.Transport{
	MaxIdleConns:    10,
	IdleConnTimeout: 30 * time.Second,
}

// client is the HTTP client used for every request in this package.
// The overall Timeout bounds a whole request (connect, headers and body) so
// that a stalled server cannot block a caller forever.
var client = http.Client{
	Transport: &transport,
	Timeout:   30 * time.Second,
}

const (
	// mainURL is the watch page URL prefix; the video ID is appended to it.
	mainURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="
	// subtitleURL is the subtitle list URL prefix; the video ID is appended to it.
	subtitleURL = "https://video.google.com/timedtext?type=list&v="
)
// Grabs a HTML video page and returns the document tree | |||
func grab(v *data.Video) (doc *goquery.Document, err error) { | |||
req, err := http.NewRequest("GET", mainURL + v.ID, nil) | |||
if err != nil { return } | |||
requestHeader(&req.Header) | |||
res, err := client.Do(req) | |||
if err != nil { return } | |||
if res.StatusCode != 200 { return nil, errors.New("HTTP failure") } | |||
defer res.Body.Close() | |||
doc, err = goquery.NewDocumentFromReader(res.Body) | |||
if err != nil { return nil, err } | |||
return | |||
} | |||
// Grabs and parses a subtitle list | |||
func grabSubtitleList(v *data.Video) (err error) { | |||
req, err := http.NewRequest("GET", subtitleURL + v.ID, nil) | |||
if err != nil { return err } | |||
requestHeader(&req.Header) | |||
res, err := client.Do(req) | |||
if err != nil { return err } | |||
if res.StatusCode != 200 { return errors.New("HTTP failure") } | |||
defer res.Body.Close() | |||
decoder := xml.NewDecoder(res.Body) | |||
var tracks XMLSubTrackList | |||
err = decoder.Decode(&tracks) | |||
if err != nil { return err } | |||
for _, track := range tracks.Tracks { | |||
v.Subtitles = append(v.Subtitles, track.LangCode) | |||
} | |||
return | |||
} | |||
// requestHeader adds the headers every outgoing request must carry:
//   - "Accept-Language: en-US", or else the parser might break
//   - "User-Agent: youtube-mango/0.1"
//
// Values are appended with Add, so existing values for these keys are kept.
func requestHeader(h *http.Header) {
	for _, field := range [][2]string{
		{"Accept-Language", "en-US"},
		{"User-Agent", "youtube-mango/0.1"},
	} {
		h.Add(field[0], field[1])
	}
}
@@ -0,0 +1,135 @@ | |||
package classic | |||
import ( | |||
"github.com/PuerkitoBio/goquery" | |||
"errors" | |||
"strconv" | |||
"time" | |||
"github.com/terorie/youtube-mango/data" | |||
) | |||
// CSS selectors used to locate the scraped elements in the watch page.
const (
	likeBtnSelector     = ".like-button-renderer-like-button-unclicked"
	dislikeBtnSelector  = ".like-button-renderer-dislike-button-unclicked"
	viewCountSelector   = "div .watch-view-count"
	userInfoSelector    = "div .yt-user-info"
	channelNameSelector = ".yt-uix-sessionlink"
)
type parseInfo struct { | |||
v *data.Video | |||
doc *goquery.Document | |||
} | |||
func (p *parseInfo) parse() error { | |||
if err := p.parseLikeDislike(); | |||
err != nil { return err } | |||
if err := p.parseViewCount(); | |||
err != nil { return err } | |||
if err := p.parseUploader(); | |||
err != nil { return err } | |||
if err := p.parseDescription(); | |||
err != nil { return err } | |||
p.parseMetas() | |||
return nil | |||
} | |||
func (p *parseInfo) parseLikeDislike() error { | |||
likeText := p.doc.Find(likeBtnSelector).First().Text() | |||
dislikeText := p.doc.Find(dislikeBtnSelector).First().Text() | |||
if len(likeText) == 0 || len(dislikeText) == 0 { | |||
return errors.New("failed to parse like buttons") | |||
} | |||
var err error | |||
p.v.Likes, err = extractNumber(likeText) | |||
if err != nil { return err } | |||
p.v.Dislikes, err = extractNumber(dislikeText) | |||
if err != nil { return err } | |||
return nil | |||
} | |||
func (p *parseInfo) parseViewCount() error { | |||
viewCountText := p.doc.Find(viewCountSelector).First().Text() | |||
viewCount, err := extractNumber(viewCountText) | |||
if err != nil { return err } | |||
p.v.Views = viewCount | |||
return nil | |||
} | |||
func (p *parseInfo) parseUploader() error { | |||
userInfo := p.doc.Find(userInfoSelector) | |||
userLinkNode := userInfo.Find(".yt-uix-sessionlink") | |||
// get link | |||
userLink, _ := userLinkNode.Attr("href") | |||
if userLink == "" { return errors.New("couldn't find channel link") } | |||
p.v.UploaderURL = "https://www.youtube.com" + userLink | |||
// get name | |||
channelName := userInfo.Find(channelNameSelector).Text() | |||
if channelName == "" { return errors.New("could not find channel name") } | |||
p.v.Uploader = channelName | |||
return nil | |||
} | |||
func (p *parseInfo) parseMetas() { | |||
metas := p.doc.Find("meta") | |||
// For each <meta> | |||
for _, node := range metas.Nodes { | |||
// Attributes | |||
var content string | |||
var itemprop string | |||
var prop string | |||
// Parse attributes | |||
for _, attr := range node.Attr { | |||
switch attr.Key { | |||
case "property": prop = attr.Val | |||
case "itemprop": itemprop = attr.Val | |||
case "content": content = attr.Val | |||
} | |||
} | |||
// Content not set | |||
if len(content) == 0 { | |||
continue | |||
} | |||
// <meta property … | |||
if len(prop) != 0 { | |||
switch prop { | |||
case "og:title": | |||
p.v.Title = content | |||
case "og:video:tag": | |||
p.v.Tags = append(p.v.Tags, content) | |||
case "og:url": | |||
p.v.URL = content | |||
case "og:image": | |||
p.v.Thumbnail = content | |||
} | |||
continue | |||
} | |||
// <meta itemprop … | |||
if len(itemprop) != 0 { | |||
switch itemprop { | |||
case "datePublished": | |||
if val, err := time.Parse("2006-01-02", content); | |||
err == nil { p.v.UploadDate = val } | |||
case "genre": | |||
p.v.Genre = content | |||
case "channelId": | |||
p.v.UploaderID = content | |||
case "duration": | |||
if val, err := parseDuration(content); | |||
err == nil { p.v.Duration = val } | |||
case "isFamilyFriendly": | |||
if val, err := strconv.ParseBool(content); | |||
err == nil { p.v.FamilyFriendly = val } | |||
} | |||
continue | |||
} | |||
} | |||
} |
@@ -0,0 +1,103 @@ | |||
package classic | |||
import ( | |||
"errors" | |||
"golang.org/x/net/html" | |||
"bytes" | |||
"github.com/terorie/youtube-mango/common" | |||
"strings" | |||
) | |||
const descriptionSelector = "#eow-description" | |||
func (p *parseInfo) parseDescription() error { | |||
// Find description root | |||
descNode := p.doc.Find(descriptionSelector).First() | |||
if len(descNode.Nodes) == 0 { return errors.New("could not find description") } | |||
// Markdown text | |||
var buffer bytes.Buffer | |||
// Enumerate nodes | |||
for c := descNode.Nodes[0].FirstChild; c != nil; c = c.NextSibling { | |||
switch c.Type { | |||
case html.TextNode: | |||
// FIXME: "&lt;" gets parsed to => "<" | |||
// Write text to buffer, escaping markdown | |||
err := common.MarkdownTextEscape.ToBuffer(c.Data, &buffer) | |||
if err != nil { return err } | |||
case html.ElementNode: | |||
switch c.Data { | |||
// Newline | |||
case "br": | |||
err := buffer.WriteByte(0x0a) | |||
if err != nil { return err } | |||
// Link | |||
case "a": | |||
err := parseLink(c, &buffer) | |||
if err != nil { return err } | |||
} | |||
} | |||
} | |||
// Save description | |||
p.v.Description = buffer.String() | |||
println(p.v.Description) | |||
return nil | |||
} | |||
func parseLink(c *html.Node, dest *bytes.Buffer) error { | |||
// Find text | |||
if c.FirstChild == nil { return nil } // Empty link | |||
if c.FirstChild.Type != html.TextNode { | |||
return errors.New("unexpected non-text node") | |||
} | |||
text := c.FirstChild.Data | |||
// Find href | |||
for _, attr := range c.Attr { | |||
if attr.Key == "href" { | |||
switch { | |||
// hashtag | |||
case strings.HasPrefix(attr.Val, "/results"): | |||
dest.WriteString(text) | |||
// real link | |||
case strings.HasPrefix(attr.Val, "/redirect"): | |||
/* | |||
Not needed: | |||
// Decode link from href | |||
link, err := decodeLink(attr.Val) | |||
if err != nil { return err } | |||
// Escape to markdown | |||
link, err = common.MarkdownLinkEscape.ToString(link) | |||
if err != nil { return err } | |||
// Write to buffer | |||
dest.WriteString(fmt.Sprintf("[%s](%s)\n", text, link)) | |||
*/ | |||
dest.WriteString(text) | |||
default: | |||
return errors.New("unknown link") | |||
} | |||
break | |||
} | |||
} | |||
return nil | |||
} | |||
/* Not needed | |||
func decodeLink(href string) (string, error) { | |||
url, err := url2.Parse(href) | |||
if err != nil { return "", err } | |||
query := url.Query() | |||
link := query.Get("q") | |||
if link == "" { return "", errors.New("empty link") } | |||
return link, nil | |||
} | |||
*/ |
@@ -0,0 +1,44 @@ | |||
package classic | |||
import ( | |||
"time" | |||
"errors" | |||
"strings" | |||
"strconv" | |||
) | |||
// parseDuration converts a duration code of the form "PT<minutes>M<seconds>S"
// into a time.Duration, e.g. "PT6M57S" => 6 min 57 s.
//
// Both the minute and the second component are required and must be unsigned
// decimal numbers that fit in 32 bits. Input that does not have the expected
// "PT…M…S" frame yields an "unknown duration code" error; a malformed number
// yields the strconv error.
func parseDuration(d string) (time.Duration, error) {
	// Validating the frame first also protects the slicing below against
	// short input.
	if !strings.HasPrefix(d, "PT") || !strings.HasSuffix(d, "S") {
		return 0, errors.New("unknown duration code")
	}
	body := d[len("PT") : len(d)-len("S")]

	mIndex := strings.IndexByte(body, 'M')
	if mIndex == -1 {
		return 0, errors.New("unknown duration code")
	}

	minutes, err := strconv.ParseUint(body[:mIndex], 10, 32)
	if err != nil {
		return 0, err
	}
	// The seconds start after the 'M' separator, not at it.
	seconds, err := strconv.ParseUint(body[mIndex+1:], 10, 32)
	if err != nil {
		return 0, err
	}

	return time.Duration(minutes)*time.Minute + time.Duration(seconds)*time.Second, nil
}
// extractNumber parses the ASCII digits contained in s as one decimal number,
// ignoring every other byte: "137,802 views" => 137802.
//
// The strconv error is returned when s contains no digit at all or when the
// digits do not fit in a uint64.
func extractNumber(s string) (uint64, error) {
	digits := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		if c := s[i]; '0' <= c && c <= '9' {
			digits = append(digits, c)
		}
	}
	return strconv.ParseUint(string(digits), 10, 64)
}
@@ -0,0 +1,45 @@ | |||
package common | |||
import "bytes" | |||
// EscapeMap is a Markdown escape map (ASCII): a 128-bit set with one bit per
// ASCII code point. A set bit marks a character that must be escaped.
// Inspired by https://github.com/golang-commonmark/markdown/blob/master/escape.go
type EscapeMap [2]uint64

// Get reports whether the bit for index is set.
// Indexes of 128 and above are never set.
func (e *EscapeMap) Get(index uint) bool {
	if index >= 128 {
		return false
	}
	word, bit := index/64, index%64
	return e[word]&(1<<bit) != 0
}

// Set sets (x == true) or clears (x == false) the bit for index.
// Indexes of 128 and above are ignored.
func (e *EscapeMap) Set(index uint, x bool) {
	if index >= 128 {
		return
	}
	word, bit := index/64, index%64
	// Build the mask first: "&^" and "<<" share one precedence level, so
	// "w &^ 1 << bit" would clear bit 0 and then shift the whole word.
	mask := uint64(1) << bit
	if x {
		e[word] |= mask
	} else {
		e[word] &^= mask
	}
}

// ToBuffer appends src to dest, writing a backslash in front of every ASCII
// character whose bit is set in e. All other characters are written as they
// are decoded from src. Writing stops at the first error, which is returned.
func (e EscapeMap) ToBuffer(src string, dest *bytes.Buffer) (err error) {
	for _, char := range src {
		if char < 0x80 && e.Get(uint(char)) {
			// Write backslash + char
			_, err = dest.Write([]byte{'\\', byte(char)})
		} else {
			_, err = dest.WriteRune(char)
		}
		if err != nil {
			return err
		}
	}
	return nil
}

// ToString returns src with every character marked in e escaped by a
// backslash. On error the returned string is empty.
func (e EscapeMap) ToString(src string) (string, error) {
	var buffer bytes.Buffer
	if err := e.ToBuffer(src, &buffer); err != nil {
		return "", err
	}
	return buffer.String(), nil
}
@@ -0,0 +1,5 @@ | |||
package common | |||
import "net/http" | |||
// Client is a shared HTTP client that sends its requests through
// http.DefaultTransport.
var Client = http.Client{
	Transport: http.DefaultTransport,
}
@@ -0,0 +1,16 @@ | |||
package common | |||
var MarkdownTextEscape EscapeMap | |||
var MarkdownLinkEscape EscapeMap | |||
func init() { | |||
registerMap := func(eMap EscapeMap, escaped string) { | |||
for _, c := range escaped { | |||
eMap.Set(uint(c), true) | |||
} | |||
} | |||
registerMap(MarkdownTextEscape, "\\!\"#$%&()*+/;<=>?@[]^_`{|}~-") | |||
registerMap(MarkdownLinkEscape, "\\!\"#$%&'()*+,;<=>?@[]^_`{|}~-") | |||
} | |||
@@ -0,0 +1,41 @@ | |||
package data | |||
import "time" | |||
type Video struct { | |||
ID string `json:"id"` | |||
Title string `json:"title"` | |||
Description string `json:"description"` | |||
Uploader string `json:"uploader"` | |||
UploaderID string `json:"uploader_id"` | |||
UploaderURL string `json:"uploader_url"` | |||
UploadDate time.Time `json:"upload_date"` | |||
Thumbnail string `json:"thumbnail"` | |||
URL string `json:"url"` | |||
License string `json:"license,omitempty"` | |||
Genre string `json:"genre"` | |||
Tags []string `json:"tags"` | |||
Subtitles []string `json:"subtitles,omitempty"` | |||
Duration time.Duration `json:"duration"` | |||
FamilyFriendly bool `json:"family_friendly"` | |||
Views uint64 `json:"views"` | |||
Likes uint64 `json:"likes"` | |||
Dislikes uint64 `json:"dislikes"` | |||
Formats []Format `json:"formats,omitempty"` | |||
} | |||
// Subtitle describes one subtitle file by its location and file extension.
type Subtitle struct {
	URL       string
	Extension string
}
// Format describes one downloadable rendition of a video.
// NOTE(review): Abr presumably stands for the audio bitrate; the unit is not
// visible here — confirm before relying on it.
type Format struct {
	FormatID   string
	URL        string
	PlayerURL  string
	Extension  string
	Height     uint32
	FormatNote string
	AudioCodec string
	Abr        float32
}
@@ -0,0 +1,23 @@ | |||
/* youtube-ma for MongoDB | |||
* | |||
* Based on https://github.com/CorentinB/youtube-ma */ | |||
package main | |||
import ( | |||
"encoding/json" | |||
"github.com/terorie/youtube-mango/data" | |||
"github.com/terorie/youtube-mango/classic" | |||
) | |||
func main() { | |||
v := data.Video{ID: "kj9mFK62c6E"} | |||
err := classic.Get(&v) | |||
if err != nil { panic(err) } | |||
jsn, err := json.MarshalIndent(v, "", "\t") | |||
if err != nil { panic(err) } | |||
println(string(jsn)) | |||
} |