commit
3b5744c589
12 changed files with 516 additions and 0 deletions
@ -0,0 +1,5 @@
@@ -0,0 +1,5 @@
|
||||
# IntelliJ |
||||
.idea/
||||
|
||||
# Apple |
||||
.DS_STORE |
@ -0,0 +1,8 @@
@@ -0,0 +1,8 @@
|
||||
package classic |
||||
|
||||
// XMLSubTrackList is the decoding target for the XML subtitle track list.
// Every <track> child element of the document root becomes one entry
// of Tracks.
type XMLSubTrackList struct {
	Tracks []struct {
		// LangCode is read from the "lang_code" attribute.
		LangCode string `xml:"lang_code,attr"`

		// Lang is read from the "lang_translated" attribute.
		Lang string `xml:"lang_translated,attr"`
	} `xml:"track"`
}
@ -0,0 +1,21 @@
@@ -0,0 +1,21 @@
|
||||
package classic |
||||
|
||||
import ( |
||||
"github.com/terorie/youtube-mango/data" |
||||
"errors" |
||||
) |
||||
|
||||
func Get(v *data.Video) error { |
||||
if len(v.ID) == 0 { return errors.New("no video ID") } |
||||
|
||||
// Download the doc tree
|
||||
doc, err := grab(v) |
||||
if err != nil { return err } |
||||
|
||||
// Parse it
|
||||
p := parseInfo{v, doc} |
||||
err = p.parse() |
||||
if err != nil { return err } |
||||
|
||||
return nil |
||||
} |
@ -0,0 +1,70 @@
@@ -0,0 +1,70 @@
|
||||
package classic |
||||
|
||||
import ( |
||||
"net/http" |
||||
"errors" |
||||
"encoding/xml" |
||||
"time" |
||||
"github.com/PuerkitoBio/goquery" |
||||
"github.com/terorie/youtube-mango/data" |
||||
) |
||||
|
||||
// transport is the connection pool used by client.
var transport = http.Transport{
	MaxIdleConns:    10,
	IdleConnTimeout: 30 * time.Second,
}

// client performs every HTTP request of this package.
//
// The overall timeout covers connecting, sending the request and reading
// the response body, so a stalled server cannot block a caller forever.
var client = http.Client{
	Transport: &transport,
	Timeout:   30 * time.Second,
}
||||
|
||||
// URL prefixes; the video ID is appended to build the request URL.
const (
	// mainURL is the prefix of the HTML watch page.
	mainURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="

	// subtitleURL is the prefix of the XML subtitle track list.
	subtitleURL = "https://video.google.com/timedtext?type=list&v="
)
||||
|
||||
// Grabs a HTML video page and returns the document tree
|
||||
func grab(v *data.Video) (doc *goquery.Document, err error) { |
||||
req, err := http.NewRequest("GET", mainURL + v.ID, nil) |
||||
if err != nil { return } |
||||
requestHeader(&req.Header) |
||||
|
||||
res, err := client.Do(req) |
||||
if err != nil { return } |
||||
if res.StatusCode != 200 { return nil, errors.New("HTTP failure") } |
||||
|
||||
defer res.Body.Close() |
||||
doc, err = goquery.NewDocumentFromReader(res.Body) |
||||
if err != nil { return nil, err } |
||||
|
||||
return |
||||
} |
||||
|
||||
// Grabs and parses a subtitle list
|
||||
func grabSubtitleList(v *data.Video) (err error) { |
||||
req, err := http.NewRequest("GET", subtitleURL + v.ID, nil) |
||||
|
||||
if err != nil { return err } |
||||
requestHeader(&req.Header) |
||||
|
||||
res, err := client.Do(req) |
||||
|
||||
if err != nil { return err } |
||||
if res.StatusCode != 200 { return errors.New("HTTP failure") } |
||||
|
||||
defer res.Body.Close() |
||||
decoder := xml.NewDecoder(res.Body) |
||||
|
||||
var tracks XMLSubTrackList |
||||
err = decoder.Decode(&tracks) |
||||
if err != nil { return err } |
||||
|
||||
for _, track := range tracks.Tracks { |
||||
v.Subtitles = append(v.Subtitles, track.LangCode) |
||||
} |
||||
|
||||
return |
||||
} |
||||
|
||||
// requestHeader adds the headers every outgoing request must carry:
//   - "Accept-Language: en-US", or else the parser might break
//   - "User-Agent: youtube-mango/0.1"
//
// The values are added, not replaced, so values already present under
// these keys are kept.
func requestHeader(h *http.Header) {
	header := *h
	header.Add("Accept-Language", "en-US")
	header.Add("User-Agent", "youtube-mango/0.1")
}
@ -0,0 +1,135 @@
@@ -0,0 +1,135 @@
|
||||
package classic |
||||
|
||||
import ( |
||||
"github.com/PuerkitoBio/goquery" |
||||
"errors" |
||||
"strconv" |
||||
"time" |
||||
"github.com/terorie/youtube-mango/data" |
||||
) |
||||
|
||||
// CSS selectors used to locate information on the watch page.
const (
	likeBtnSelector     = ".like-button-renderer-like-button-unclicked"
	dislikeBtnSelector  = ".like-button-renderer-dislike-button-unclicked"
	viewCountSelector   = "div .watch-view-count"
	userInfoSelector    = "div .yt-user-info"
	channelNameSelector = ".yt-uix-sessionlink"
)
||||
|
||||
type parseInfo struct { |
||||
v *data.Video |
||||
doc *goquery.Document |
||||
} |
||||
|
||||
func (p *parseInfo) parse() error { |
||||
if err := p.parseLikeDislike(); |
||||
err != nil { return err } |
||||
if err := p.parseViewCount(); |
||||
err != nil { return err } |
||||
if err := p.parseUploader(); |
||||
err != nil { return err } |
||||
if err := p.parseDescription(); |
||||
err != nil { return err } |
||||
|
||||
p.parseMetas() |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (p *parseInfo) parseLikeDislike() error { |
||||
likeText := p.doc.Find(likeBtnSelector).First().Text() |
||||
dislikeText := p.doc.Find(dislikeBtnSelector).First().Text() |
||||
|
||||
if len(likeText) == 0 || len(dislikeText) == 0 { |
||||
return errors.New("failed to parse like buttons") |
||||
} |
||||
|
||||
var err error |
||||
p.v.Likes, err = extractNumber(likeText) |
||||
if err != nil { return err } |
||||
p.v.Dislikes, err = extractNumber(dislikeText) |
||||
if err != nil { return err } |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (p *parseInfo) parseViewCount() error { |
||||
viewCountText := p.doc.Find(viewCountSelector).First().Text() |
||||
viewCount, err := extractNumber(viewCountText) |
||||
if err != nil { return err } |
||||
p.v.Views = viewCount |
||||
return nil |
||||
} |
||||
|
||||
func (p *parseInfo) parseUploader() error { |
||||
userInfo := p.doc.Find(userInfoSelector) |
||||
userLinkNode := userInfo.Find(".yt-uix-sessionlink") |
||||
|
||||
// get link
|
||||
userLink, _ := userLinkNode.Attr("href") |
||||
if userLink == "" { return errors.New("couldn't find channel link") } |
||||
p.v.UploaderURL = "https://www.youtube.com" + userLink |
||||
|
||||
// get name
|
||||
channelName := userInfo.Find(channelNameSelector).Text() |
||||
if channelName == "" { return errors.New("could not find channel name") } |
||||
p.v.Uploader = channelName |
||||
return nil |
||||
} |
||||
|
||||
func (p *parseInfo) parseMetas() { |
||||
metas := p.doc.Find("meta") |
||||
// For each <meta>
|
||||
for _, node := range metas.Nodes { |
||||
// Attributes
|
||||
var content string |
||||
var itemprop string |
||||
var prop string |
||||
|
||||
// Parse attributes
|
||||
for _, attr := range node.Attr { |
||||
switch attr.Key { |
||||
case "property": prop = attr.Val |
||||
case "itemprop": itemprop = attr.Val |
||||
case "content": content = attr.Val |
||||
} |
||||
} |
||||
|
||||
// Content not set
|
||||
if len(content) == 0 { |
||||
continue |
||||
} |
||||
|
||||
// <meta property …
|
||||
if len(prop) != 0 { |
||||
switch prop { |
||||
case "og:title": |
||||
p.v.Title = content |
||||
case "og:video:tag": |
||||
p.v.Tags = append(p.v.Tags, content) |
||||
case "og:url": |
||||
p.v.URL = content |
||||
case "og:image": |
||||
p.v.Thumbnail = content |
||||
} |
||||
continue |
||||
} |
||||
// <meta itemprop …
|
||||
if len(itemprop) != 0 { |
||||
switch itemprop { |
||||
case "datePublished": |
||||
if val, err := time.Parse("2006-01-02", content); |
||||
err == nil { p.v.UploadDate = val } |
||||
case "genre": |
||||
p.v.Genre = content |
||||
case "channelId": |
||||
p.v.UploaderID = content |
||||
case "duration": |
||||
if val, err := parseDuration(content); |
||||
err == nil { p.v.Duration = val } |
||||
case "isFamilyFriendly": |
||||
if val, err := strconv.ParseBool(content); |
||||
err == nil { p.v.FamilyFriendly = val } |
||||
} |
||||
continue |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,103 @@
@@ -0,0 +1,103 @@
|
||||
package classic |
||||
|
||||
import ( |
||||
"errors" |
||||
"golang.org/x/net/html" |
||||
"bytes" |
||||
"github.com/terorie/youtube-mango/common" |
||||
"strings" |
||||
) |
||||
|
||||
const descriptionSelector = "#eow-description" |
||||
|
||||
func (p *parseInfo) parseDescription() error { |
||||
// Find description root
|
||||
descNode := p.doc.Find(descriptionSelector).First() |
||||
if len(descNode.Nodes) == 0 { return errors.New("could not find description") } |
||||
|
||||
// Markdown text
|
||||
var buffer bytes.Buffer |
||||
|
||||
// Enumerate nodes
|
||||
for c := descNode.Nodes[0].FirstChild; c != nil; c = c.NextSibling { |
||||
switch c.Type { |
||||
case html.TextNode: |
||||
// FIXME: "&lt;" gets parsed to => "<"
|
||||
// Write text to buffer, escaping markdown
|
||||
err := common.MarkdownTextEscape.ToBuffer(c.Data, &buffer) |
||||
if err != nil { return err } |
||||
case html.ElementNode: |
||||
switch c.Data { |
||||
// Newline
|
||||
case "br": |
||||
err := buffer.WriteByte(0x0a) |
||||
if err != nil { return err } |
||||
// Link
|
||||
case "a": |
||||
err := parseLink(c, &buffer) |
||||
if err != nil { return err } |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Save description
|
||||
p.v.Description = buffer.String() |
||||
println(p.v.Description) |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func parseLink(c *html.Node, dest *bytes.Buffer) error { |
||||
// Find text
|
||||
if c.FirstChild == nil { return nil } // Empty link
|
||||
if c.FirstChild.Type != html.TextNode { |
||||
return errors.New("unexpected non-text node") |
||||
} |
||||
text := c.FirstChild.Data |
||||
|
||||
// Find href
|
||||
for _, attr := range c.Attr { |
||||
if attr.Key == "href" { |
||||
switch { |
||||
// hashtag
|
||||
case strings.HasPrefix(attr.Val, "/results"): |
||||
dest.WriteString(text) |
||||
|
||||
// real link
|
||||
case strings.HasPrefix(attr.Val, "/redirect"): |
||||
/* |
||||
Not needed: |
||||
// Decode link from href
|
||||
link, err := decodeLink(attr.Val) |
||||
if err != nil { return err } |
||||
// Escape to markdown
|
||||
link, err = common.MarkdownLinkEscape.ToString(link) |
||||
if err != nil { return err } |
||||
// Write to buffer
|
||||
dest.WriteString(fmt.Sprintf("[%s](%s)\n", text, link)) |
||||
*/ |
||||
dest.WriteString(text) |
||||
|
||||
default: |
||||
return errors.New("unknown link") |
||||
} |
||||
break |
||||
} |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
/* Not needed |
||||
|
||||
func decodeLink(href string) (string, error) { |
||||
url, err := url2.Parse(href) |
||||
if err != nil { return "", err } |
||||
|
||||
query := url.Query() |
||||
link := query.Get("q") |
||||
if link == "" { return "", errors.New("empty link") } |
||||
|
||||
return link, nil |
||||
} |
||||
|
||||
*/ |
@ -0,0 +1,44 @@
@@ -0,0 +1,44 @@
|
||||
package classic |
||||
|
||||
import ( |
||||
"time" |
||||
"errors" |
||||
"strings" |
||||
"strconv" |
||||
) |
||||
|
||||
// "PT6M57S" => 6 min 57 s
|
||||
// parseDuration converts a duration code of the form
// "PT<minutes>M<seconds>S" into a time.Duration.
//
// It returns an error (never panics) when d lacks the "PT" prefix, the
// 'M' separator or the trailing 'S', or when the minutes or seconds are
// not unsigned 32-bit decimal numbers.
func parseDuration(d string) (time.Duration, error) {
	if !strings.HasPrefix(d, "PT") || !strings.HasSuffix(d, "S") {
		return 0, errors.New("unknown duration code")
	}
	mIndex := strings.IndexByte(d, 'M')
	if mIndex == -1 {
		return 0, errors.New("unknown duration code")
	}

	// Minutes sit between the "PT" prefix and the 'M'.
	minutes, err := strconv.ParseUint(d[2:mIndex], 10, 32)
	if err != nil {
		return 0, err
	}
	// Seconds start after the 'M' (not on it) and end before the final 'S'.
	seconds, err := strconv.ParseUint(d[mIndex+1:len(d)-1], 10, 32)
	if err != nil {
		return 0, err
	}

	return time.Duration(minutes)*time.Minute + time.Duration(seconds)*time.Second, nil
}
||||
|
||||
// "137,802 views" => 137802
|
||||
// extractNumber keeps only the ASCII digits of s and parses them as one
// unsigned 64-bit decimal number. Separators, units and any other
// characters are dropped. It returns the strconv error when s contains no
// digit or the number does not fit.
func extractNumber(s string) (uint64, error) {
	digits := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		if c := s[i]; '0' <= c && c <= '9' {
			digits = append(digits, c)
		}
	}

	return strconv.ParseUint(string(digits), 10, 64)
}
@ -0,0 +1,45 @@
@@ -0,0 +1,45 @@
|
||||
package common |
||||
|
||||
import "bytes" |
||||
|
||||
// EscapeMap is a bit set over the 128 ASCII code points; a set bit marks a
// character that is escaped with a backslash.
// Inspired by https://github.com/golang-commonmark/markdown/blob/master/escape.go
type EscapeMap [2]uint64

// Get reports whether the bit for index is set. Indexes outside the ASCII
// range (>= 128) are never set.
func (e *EscapeMap) Get(index uint) bool {
	if index >= 128 {
		return false
	}
	return e[index/64]&(1<<(index%64)) != 0
}

// Set sets (x == true) or clears (x == false) the bit for index. Indexes
// outside the ASCII range (>= 128) are ignored.
func (e *EscapeMap) Set(index uint, x bool) {
	if index >= 128 {
		return
	}
	word, mask := index/64, uint64(1)<<(index%64)
	if x {
		e[word] |= mask
	} else {
		// Compound assignment on purpose: "a &^ 1 << n" parses as
		// "(a &^ 1) << n" because &^ and << share one precedence level.
		e[word] &^= mask
	}
}

// ToBuffer appends src to dest, putting a backslash in front of every
// character whose bit is set. Non-ASCII characters are copied unchanged.
// It stops at the first write error and returns it.
func (e EscapeMap) ToBuffer(src string, dest *bytes.Buffer) (err error) {
	for _, char := range src {
		if char < 0x80 && e.Get(uint(char)) {
			// Write backslash + char
			_, err = dest.Write([]byte{'\\', byte(char)})
		} else {
			_, err = dest.WriteRune(char)
		}
		if err != nil {
			return err
		}
	}
	return nil
}

// ToString returns src with every marked character escaped; see ToBuffer.
func (e EscapeMap) ToString(src string) (string, error) {
	var buffer bytes.Buffer
	if err := e.ToBuffer(src, &buffer); err != nil {
		return "", err
	}
	return buffer.String(), nil
}
@ -0,0 +1,5 @@
@@ -0,0 +1,5 @@
|
||||
package common |
||||
|
||||
import "net/http" |
||||
|
||||
// Client is an exported HTTP client that sends its requests through
// http.DefaultTransport.
var Client = http.Client{
	Transport: http.DefaultTransport,
}
@ -0,0 +1,16 @@
@@ -0,0 +1,16 @@
|
||||
package common |
||||
|
||||
var MarkdownTextEscape EscapeMap |
||||
var MarkdownLinkEscape EscapeMap |
||||
|
||||
func init() { |
||||
registerMap := func(eMap EscapeMap, escaped string) { |
||||
for _, c := range escaped { |
||||
eMap.Set(uint(c), true) |
||||
} |
||||
} |
||||
|
||||
registerMap(MarkdownTextEscape, "\\!\"#$%&()*+/;<=>?@[]^_`{|}~-") |
||||
registerMap(MarkdownLinkEscape, "\\!\"#$%&'()*+,;<=>?@[]^_`{|}~-") |
||||
} |
||||
|
@ -0,0 +1,41 @@
@@ -0,0 +1,41 @@
|
||||
package data |
||||
|
||||
import "time" |
||||
|
||||
// Video is the metadata record of a single video and the shape of its
// JSON representation.
type Video struct {
	// Identity and presentation.
	ID          string `json:"id"`
	Title       string `json:"title"`
	Description string `json:"description"`

	// Uploader.
	Uploader    string `json:"uploader"`
	UploaderID  string `json:"uploader_id"`
	UploaderURL string `json:"uploader_url"`

	// Publication details.
	UploadDate time.Time `json:"upload_date"`
	Thumbnail  string    `json:"thumbnail"`
	URL        string    `json:"url"`
	License    string    `json:"license,omitempty"`
	Genre      string    `json:"genre"`
	Tags       []string  `json:"tags"`

	// Subtitles is left out of the JSON output when empty.
	Subtitles      []string      `json:"subtitles,omitempty"`
	Duration       time.Duration `json:"duration"`
	FamilyFriendly bool          `json:"family_friendly"`

	// Counters.
	Views    uint64 `json:"views"`
	Likes    uint64 `json:"likes"`
	Dislikes uint64 `json:"dislikes"`

	// Formats is left out of the JSON output when empty.
	Formats []Format `json:"formats,omitempty"`
}

// Subtitle describes one subtitle file by its URL and file extension.
type Subtitle struct {
	URL       string
	Extension string
}

// Format describes one downloadable format of a video. It has no JSON
// tags, so its fields are encoded under their Go names.
type Format struct {
	FormatID   string
	URL        string
	PlayerURL  string
	Extension  string
	Height     uint32
	FormatNote string
	AudioCodec string
	Abr        float32 // presumably the audio bit rate — TODO confirm
}
@ -0,0 +1,23 @@
@@ -0,0 +1,23 @@
|
||||
/* youtube-ma for MongoDB |
||||
* |
||||
* Based on https://github.com/CorentinB/youtube-ma */
|
||||
|
||||
package main |
||||
|
||||
import ( |
||||
"encoding/json" |
||||
"github.com/terorie/youtube-mango/data" |
||||
"github.com/terorie/youtube-mango/classic" |
||||
) |
||||
|
||||
func main() { |
||||
v := data.Video{ID: "kj9mFK62c6E"} |
||||
|
||||
err := classic.Get(&v) |
||||
if err != nil { panic(err) } |
||||
|
||||
jsn, err := json.MarshalIndent(v, "", "\t") |
||||
if err != nil { panic(err) } |
||||
|
||||
println(string(jsn)) |
||||
} |
Loading…
Reference in new issue