Browse Source

Initial commit

master
terorie 4 years ago
commit
3b5744c589
  1. 5
      .gitignore
  2. 8
      classic/data.go
  3. 21
      classic/get.go
  4. 70
      classic/grab.go
  5. 135
      classic/parse.go
  6. 103
      classic/parsedescription.go
  7. 44
      classic/util.go
  8. 45
      common/escape.go
  9. 5
      common/http.go
  10. 16
      common/markdown.go
  11. 41
      data/video.go
  12. 23
      main.go

5
.gitignore vendored

@ -0,0 +1,5 @@ @@ -0,0 +1,5 @@
# IntelliJ project directory (note the leading dot)
/.idea/
# Apple Finder metadata (the file name is case-sensitive on most systems)
.DS_Store

8
classic/data.go

@ -0,0 +1,8 @@ @@ -0,0 +1,8 @@
package classic
// XMLSubTrackList is the decode target for the XML subtitle track list.
// Every <track> element of the document becomes one entry in Tracks.
type XMLSubTrackList struct {
Tracks []struct {
// LangCode is taken from the "lang_code" attribute of the track.
LangCode string `xml:"lang_code,attr"`
// Lang is taken from the "lang_translated" attribute of the track.
Lang string `xml:"lang_translated,attr"`
} `xml:"track"`
}

21
classic/get.go

@ -0,0 +1,21 @@ @@ -0,0 +1,21 @@
package classic
import (
"github.com/terorie/youtube-mango/data"
"errors"
)
// Get downloads the watch page of the video identified by v.ID and writes
// the values found by the page parsers into v.
//
// It fails with "no video ID" when v.ID is empty, and otherwise returns the
// first error reported by the download or by a parser.
func Get(v *data.Video) error {
	if v.ID == "" {
		return errors.New("no video ID")
	}

	// Fetch the watch page as a document tree.
	doc, err := grab(v)
	if err != nil {
		return err
	}

	// Run the parsers over the tree; they write straight into v.
	info := parseInfo{v, doc}
	return info.parse()
}

70
classic/grab.go

@ -0,0 +1,70 @@ @@ -0,0 +1,70 @@
package classic
import (
"net/http"
"errors"
"encoding/xml"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/terorie/youtube-mango/data"
)
// transport is the HTTP transport shared by all requests of this package.
// It keeps at most 10 idle connections and drops them after 30 seconds.
var transport = http.Transport{
	MaxIdleConns:    10,
	IdleConnTimeout: 30 * time.Second,
}

// client performs every request of this package.
// Timeout bounds a whole exchange (connecting, headers and reading the body)
// so that a stalled server cannot block the caller forever.
var client = http.Client{
	Transport: &transport,
	Timeout:   30 * time.Second,
}

// mainURL is the watch page URL; the video ID is appended to it.
const mainURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="

// subtitleURL is the subtitle track list URL; the video ID is appended to it.
const subtitleURL = "https://video.google.com/timedtext?type=list&v="
// grab downloads the HTML watch page of v and returns its document tree.
//
// It returns an error when the request cannot be built or sent, when the
// server answers with a status other than 200 ("HTTP failure"), or when the
// body cannot be parsed into a document.
func grab(v *data.Video) (doc *goquery.Document, err error) {
	req, err := http.NewRequest("GET", mainURL+v.ID, nil)
	if err != nil {
		return nil, err
	}
	requestHeader(&req.Header)

	res, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	// Register the close before the status check so the body is released
	// on every path, including the non-200 early return.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return nil, errors.New("HTTP failure")
	}

	doc, err = goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
// grabSubtitleList downloads the XML subtitle track list of v and appends
// the language code of every listed track to v.Subtitles.
//
// It returns an error when the request cannot be built or sent, when the
// server answers with a status other than 200 ("HTTP failure"), or when the
// body is not decodable XML.
func grabSubtitleList(v *data.Video) (err error) {
	req, err := http.NewRequest("GET", subtitleURL+v.ID, nil)
	if err != nil {
		return err
	}
	requestHeader(&req.Header)

	res, err := client.Do(req)
	if err != nil {
		return err
	}
	// Register the close before the status check so the body is released
	// on every path, including the non-200 early return.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return errors.New("HTTP failure")
	}

	var tracks XMLSubTrackList
	if err = xml.NewDecoder(res.Body).Decode(&tracks); err != nil {
		return err
	}

	for _, track := range tracks.Tracks {
		v.Subtitles = append(v.Subtitles, track.LangCode)
	}
	return nil
}
// requestHeader adds the headers sent with every request to h:
//   - "Accept-Language: en-US", because the parser might break on a page
//     served in another language;
//   - "User-Agent: youtube-mango/0.1".
//
// The values are added, not replaced, so existing values for these keys
// are kept.
func requestHeader(h *http.Header) {
	pairs := [...][2]string{
		{"Accept-Language", "en-US"},
		{"User-Agent", "youtube-mango/0.1"},
	}
	for _, kv := range pairs {
		h.Add(kv[0], kv[1])
	}
}

135
classic/parse.go

@ -0,0 +1,135 @@ @@ -0,0 +1,135 @@
package classic
import (
"github.com/PuerkitoBio/goquery"
"errors"
"strconv"
"time"
"github.com/terorie/youtube-mango/data"
)
// Selectors used to locate values on the watch page.
const (
	// Like button, whose text is parsed as the like count.
	likeBtnSelector = ".like-button-renderer-like-button-unclicked"
	// Dislike button, whose text is parsed as the dislike count.
	dislikeBtnSelector = ".like-button-renderer-dislike-button-unclicked"
	// Element whose text is parsed as the view count.
	viewCountSelector = "div .watch-view-count"
	// Container of the uploader information.
	userInfoSelector = "div .yt-user-info"
	// Channel link inside the uploader information.
	channelNameSelector = ".yt-uix-sessionlink"
)
// parseInfo bundles the video record being filled in with the document
// tree its values are read from. The parse methods are defined on it.
type parseInfo struct {
// v receives the parsed values.
v *data.Video
// doc is the document tree queried by the parse methods.
doc *goquery.Document
}
// parse runs every parser over the document, in order: like/dislike
// counts, view count, uploader, description and finally the <meta> tags.
// It stops at, and returns, the first error; the <meta> pass cannot fail.
func (p *parseInfo) parse() error {
	steps := []func() error{
		p.parseLikeDislike,
		p.parseViewCount,
		p.parseUploader,
		p.parseDescription,
	}
	for _, step := range steps {
		if err := step(); err != nil {
			return err
		}
	}

	p.parseMetas()
	return nil
}
// parseLikeDislike reads the text of the first like button and the first
// dislike button and stores the numbers they contain in Likes and Dislikes.
// It fails when either text is empty or contains no parsable number.
func (p *parseInfo) parseLikeDislike() error {
	likes := p.doc.Find(likeBtnSelector).First().Text()
	dislikes := p.doc.Find(dislikeBtnSelector).First().Text()
	if likes == "" || dislikes == "" {
		return errors.New("failed to parse like buttons")
	}

	var err error
	if p.v.Likes, err = extractNumber(likes); err != nil {
		return err
	}
	if p.v.Dislikes, err = extractNumber(dislikes); err != nil {
		return err
	}
	return nil
}
// parseViewCount reads the text of the first view counter element and
// stores the number it contains in Views. Views is left untouched when the
// text holds no parsable number.
func (p *parseInfo) parseViewCount() error {
	text := p.doc.Find(viewCountSelector).First().Text()

	views, err := extractNumber(text)
	if err == nil {
		p.v.Views = views
	}
	return err
}
// parseUploader locates the channel link inside the uploader information
// and stores its absolute URL in UploaderURL and its text in Uploader.
// It fails when the link has no href or no text.
func (p *parseInfo) parseUploader() error {
	// The link and the name come from the same selection.
	link := p.doc.Find(userInfoSelector).Find(channelNameSelector)

	// The href is site-relative; a missing attribute reads as "".
	href, _ := link.Attr("href")
	if href == "" {
		return errors.New("couldn't find channel link")
	}
	p.v.UploaderURL = "https://www.youtube.com" + href

	name := link.Text()
	if name == "" {
		return errors.New("could not find channel name")
	}
	p.v.Uploader = name
	return nil
}
// parseMetas walks every <meta> element of the document and copies the
// recognised ones into the video record:
//
//   - property: og:title, og:video:tag (appended), og:url, og:image
//   - itemprop: datePublished, genre, channelId, duration, isFamilyFriendly
//
// Elements without content are skipped, and "property" takes precedence
// over "itemprop" when both are present. Values that fail to parse (date,
// duration, boolean) are ignored silently.
func (p *parseInfo) parseMetas() {
	for _, meta := range p.doc.Find("meta").Nodes {
		// Pick out the three attributes of interest.
		var property, itemProp, value string
		for _, a := range meta.Attr {
			switch a.Key {
			case "property":
				property = a.Val
			case "itemprop":
				itemProp = a.Val
			case "content":
				value = a.Val
			}
		}

		switch {
		case value == "":
			// No content: nothing to store.

		case property != "":
			// <meta property=… content=…>
			switch property {
			case "og:title":
				p.v.Title = value
			case "og:video:tag":
				p.v.Tags = append(p.v.Tags, value)
			case "og:url":
				p.v.URL = value
			case "og:image":
				p.v.Thumbnail = value
			}

		case itemProp != "":
			// <meta itemprop=… content=…>
			switch itemProp {
			case "datePublished":
				if published, err := time.Parse("2006-01-02", value); err == nil {
					p.v.UploadDate = published
				}
			case "genre":
				p.v.Genre = value
			case "channelId":
				p.v.UploaderID = value
			case "duration":
				if length, err := parseDuration(value); err == nil {
					p.v.Duration = length
				}
			case "isFamilyFriendly":
				if friendly, err := strconv.ParseBool(value); err == nil {
					p.v.FamilyFriendly = friendly
				}
			}
		}
	}
}

103
classic/parsedescription.go

@ -0,0 +1,103 @@ @@ -0,0 +1,103 @@
package classic
import (
"errors"
"golang.org/x/net/html"
"bytes"
"github.com/terorie/youtube-mango/common"
"strings"
)
// descriptionSelector locates the root element of the video description.
const descriptionSelector = "#eow-description"

// parseDescription converts the direct children of the description element
// into text and stores the result in Description:
//
//   - text nodes are written with markdown escaping applied,
//   - <br> becomes a newline,
//   - <a> is handled by parseLink,
//   - any other element is skipped.
//
// It fails when the description element is missing or when a child cannot
// be converted.
func (p *parseInfo) parseDescription() error {
	// Find description root
	descNode := p.doc.Find(descriptionSelector).First()
	if len(descNode.Nodes) == 0 {
		return errors.New("could not find description")
	}

	// Markdown text
	var buffer bytes.Buffer

	// Enumerate nodes
	for c := descNode.Nodes[0].FirstChild; c != nil; c = c.NextSibling {
		switch c.Type {
		case html.TextNode:
			// FIXME: "&amp;lt;" gets parsed to => "<"
			// Write text to buffer, escaping markdown
			if err := common.MarkdownTextEscape.ToBuffer(c.Data, &buffer); err != nil {
				return err
			}
		case html.ElementNode:
			switch c.Data {
			case "br":
				// Newline
				if err := buffer.WriteByte('\n'); err != nil {
					return err
				}
			case "a":
				// Link
				if err := parseLink(c, &buffer); err != nil {
					return err
				}
			}
		}
	}

	// Save description. The result is no longer echoed with println: a
	// library parser should not write debug output on every call.
	p.v.Description = buffer.String()
	return nil
}
// parseLink writes the visible text of the <a> element c to dest.
//
// Only the first href attribute is examined. Hashtag links ("/results…")
// and outgoing links ("/redirect…") are both reduced to their text; the
// redirect target is deliberately not decoded. Any other href yields an
// "unknown link" error. A link without children, or without an href,
// writes nothing and succeeds; a first child that is not a text node is
// an error.
func parseLink(c *html.Node, dest *bytes.Buffer) error {
	label := c.FirstChild
	if label == nil {
		return nil // Empty link
	}
	if label.Type != html.TextNode {
		return errors.New("unexpected non-text node")
	}

	for i := range c.Attr {
		if c.Attr[i].Key != "href" {
			continue
		}

		href := c.Attr[i].Val
		isHashtag := strings.HasPrefix(href, "/results")
		isRedirect := strings.HasPrefix(href, "/redirect")
		if !isHashtag && !isRedirect {
			return errors.New("unknown link")
		}

		dest.WriteString(label.Data)
		return nil
	}
	return nil
}
/* Not needed
func decodeLink(href string) (string, error) {
url, err := url2.Parse(href)
if err != nil { return "", err }
query := url.Query()
link := query.Get("q")
if link == "" { return "", errors.New("empty link") }
return link, nil
}
*/

44
classic/util.go

@ -0,0 +1,44 @@ @@ -0,0 +1,44 @@
package classic
import (
"time"
"errors"
"strings"
"strconv"
)
// parseDuration converts a duration code of the form
// "PT<minutes>M<seconds>S" into a time.Duration.
//
//	"PT6M57S" => 6 min 57 s
//
// It returns an "unknown duration code" error when d lacks the "PT" prefix,
// the 'M' separator or the trailing 'S', and the strconv error when the
// minutes or seconds are not unsigned decimal numbers that fit in 32 bits.
func parseDuration(d string) (time.Duration, error) {
	errFormat := errors.New("unknown duration code")

	// Prefix/suffix checks also guard the slicing below against short input.
	if !strings.HasPrefix(d, "PT") || !strings.HasSuffix(d, "S") {
		return 0, errFormat
	}
	mIndex := strings.IndexByte(d, 'M')
	if mIndex == -1 {
		return 0, errFormat
	}

	minutes, err := strconv.ParseUint(d[2:mIndex], 10, 32)
	if err != nil {
		return 0, err
	}
	// The seconds start after the 'M' and end before the trailing 'S'.
	seconds, err := strconv.ParseUint(d[mIndex+1:len(d)-1], 10, 32)
	if err != nil {
		return 0, err
	}

	dur := time.Duration(minutes)*time.Minute + time.Duration(seconds)*time.Second
	return dur, nil
}
// extractNumber drops everything except the ASCII digits of s and parses
// what is left as a base-10 unsigned 64-bit integer.
//
//	"137,802 views" => 137802
//
// A string without any digit, or with too many, yields the strconv error.
func extractNumber(s string) (uint64, error) {
	digits := strings.Map(func(r rune) rune {
		if r < '0' || r > '9' {
			return -1 // drop
		}
		return r
	}, s)
	return strconv.ParseUint(digits, 10, 64)
}

45
common/escape.go

@ -0,0 +1,45 @@ @@ -0,0 +1,45 @@
package common
import "bytes"
// EscapeMap is a markdown escape map (ASCII): a set of 128 bits, one per
// ASCII code point. A set bit means the character is written with a
// leading backslash.
// Inspired by https://github.com/golang-commonmark/markdown/blob/master/escape.go
type EscapeMap [2]uint64

// Get reports whether the bit for index is set.
// Indexes of 128 and above are never set.
func (e *EscapeMap) Get(index uint) bool {
	if index >= 128 {
		return false
	}
	high, low := index/64, index%64
	return e[high]&(1<<low) != 0
}

// Set sets (x == true) or clears (x == false) the bit for index.
// Indexes of 128 and above are ignored.
func (e *EscapeMap) Set(index uint, x bool) {
	if index >= 128 {
		return
	}
	high, low := index/64, index%64
	// Build the mask first: "&^" and "<<" share the same precedence, so
	// "w &^ 1 << low" would shift the whole word instead of clearing a bit.
	mask := uint64(1) << low
	if x {
		e[high] |= mask
	} else {
		e[high] &^= mask
	}
}

// ToBuffer appends src to dest, putting a backslash in front of every ASCII
// character whose bit is set. All other characters are copied unchanged.
// It stops at, and returns, the first write error.
func (e EscapeMap) ToBuffer(src string, dest *bytes.Buffer) (err error) {
	for _, char := range src {
		if char < 0x80 && e.Get(uint(char)) {
			// Write backslash + char
			_, err = dest.Write([]byte{'\\', byte(char)})
		} else {
			_, err = dest.WriteRune(char)
		}
		if err != nil {
			return err
		}
	}
	return nil
}

// ToString returns src escaped the same way ToBuffer escapes it.
func (e EscapeMap) ToString(src string) (string, error) {
	var buffer bytes.Buffer
	if err := e.ToBuffer(src, &buffer); err != nil {
		return "", err
	}
	return buffer.String(), nil
}

5
common/http.go

@ -0,0 +1,5 @@ @@ -0,0 +1,5 @@
package common
import "net/http"
// Client is a package-level HTTP client that sends its requests through
// http.DefaultTransport. No Timeout is configured on it here.
var Client = http.Client{Transport: http.DefaultTransport}

16
common/markdown.go

@ -0,0 +1,16 @@ @@ -0,0 +1,16 @@
package common
// MarkdownTextEscape marks the ASCII characters that are backslash-escaped
// in markdown text. It is filled in by init.
var MarkdownTextEscape EscapeMap

// MarkdownLinkEscape marks the ASCII characters that are backslash-escaped
// in markdown link targets. It is filled in by init.
var MarkdownLinkEscape EscapeMap

// init populates the two escape maps.
func init() {
	// EscapeMap is an array, so it must be passed by pointer: a by-value
	// parameter would only mark bits in a copy and leave the package-level
	// maps empty.
	registerMap := func(eMap *EscapeMap, escaped string) {
		for _, c := range escaped {
			eMap.Set(uint(c), true)
		}
	}
	registerMap(&MarkdownTextEscape, "\\!\"#$%&()*+/;<=>?@[]^_`{|}~-")
	registerMap(&MarkdownLinkEscape, "\\!\"#$%&'()*+,;<=>?@[]^_`{|}~-")
}

41
data/video.go

@ -0,0 +1,41 @@ @@ -0,0 +1,41 @@
package data
import "time"
// Video is the metadata record of one video. The json tags define the key
// each field is marshalled under; fields tagged omitempty are left out
// when empty.
type Video struct {
// ID must be set by the caller; it is appended to the request URLs.
ID string `json:"id"`
Title string `json:"title"`
Description string `json:"description"`
Uploader string `json:"uploader"`
UploaderID string `json:"uploader_id"`
UploaderURL string `json:"uploader_url"`
UploadDate time.Time `json:"upload_date"`
Thumbnail string `json:"thumbnail"`
URL string `json:"url"`
License string `json:"license,omitempty"`
Genre string `json:"genre"`
Tags []string `json:"tags"`
// Subtitles holds subtitle language codes.
Subtitles []string `json:"subtitles,omitempty"`
Duration time.Duration `json:"duration"`
FamilyFriendly bool `json:"family_friendly"`
Views uint64 `json:"views"`
Likes uint64 `json:"likes"`
Dislikes uint64 `json:"dislikes"`
Formats []Format `json:"formats,omitempty"`
}
// Subtitle pairs a URL with a file extension.
// NOTE(review): presumably one downloadable subtitle file; nothing in the
// visible code uses this type yet — confirm the intended use.
type Subtitle struct {
URL string
Extension string
}
// Format is the element type of Video.Formats.
// NOTE(review): presumably one downloadable media format of a video; none
// of the visible parsers fill it in — confirm field meanings and units.
// The fields carry no json tags, so they marshal under their Go names.
type Format struct {
FormatID string
URL string
PlayerURL string
Extension string
Height uint32
FormatNote string
AudioCodec string
Abr float32
}

23
main.go

@ -0,0 +1,23 @@ @@ -0,0 +1,23 @@
/* youtube-ma for MongoDB
*
* Based on https://github.com/CorentinB/youtube-ma */
package main
import (
"encoding/json"
"github.com/terorie/youtube-mango/data"
"github.com/terorie/youtube-mango/classic"
)
// main fetches the metadata of one hard-coded video and prints it as
// tab-indented JSON using the println builtin. Any failure aborts the
// program with a panic.
func main() {
	video := data.Video{ID: "kj9mFK62c6E"}
	if err := classic.Get(&video); err != nil {
		panic(err)
	}

	encoded, err := json.MarshalIndent(video, "", "\t")
	if err != nil {
		panic(err)
	}
	println(string(encoded))
}
Loading…
Cancel
Save