Przeglądaj źródła

Initial commit

master
terorie 2 lat temu
commit
3b5744c589
12 zmienionych plików z 516 dodań i 0 usunięć
  1. 5
    0
      .gitignore
  2. 8
    0
      classic/data.go
  3. 21
    0
      classic/get.go
  4. 70
    0
      classic/grab.go
  5. 135
    0
      classic/parse.go
  6. 103
    0
      classic/parsedescription.go
  7. 44
    0
      classic/util.go
  8. 45
    0
      common/escape.go
  9. 5
    0
      common/http.go
  10. 16
    0
      common/markdown.go
  11. 41
    0
      data/video.go
  12. 23
    0
      main.go

+ 5
- 0
.gitignore Wyświetl plik

@@ -0,0 +1,5 @@
# IntelliJ
/.idea/

# Apple
.DS_Store

+ 8
- 0
classic/data.go Wyświetl plik

@@ -0,0 +1,8 @@
package classic

// XMLSubTrackList is the decoding target for an XML subtitle track list.
// Every <track> element of the document becomes one entry of Tracks.
type XMLSubTrackList struct {
Tracks []struct {
// LangCode is taken from the lang_code attribute of the track.
LangCode string `xml:"lang_code,attr"`
// Lang is taken from the lang_translated attribute of the track.
Lang string `xml:"lang_translated,attr"`
} `xml:"track"`
}

+ 21
- 0
classic/get.go Wyświetl plik

@@ -0,0 +1,21 @@
package classic

import (
"github.com/terorie/youtube-mango/data"
"errors"
)

// Get fills v with the metadata of the video identified by v.ID.
//
// It downloads the watch page as a document tree and runs the parser over
// it. An empty ID, a download failure or a parse failure is returned as an
// error.
func Get(v *data.Video) error {
	if v.ID == "" {
		return errors.New("no video ID")
	}

	// Fetch the page as a document tree
	tree, err := grab(v)
	if err != nil {
		return err
	}

	// Extract the video fields from the tree
	info := parseInfo{v, tree}
	return info.parse()
}

+ 70
- 0
classic/grab.go Wyświetl plik

@@ -0,0 +1,70 @@
package classic

import (
"net/http"
"errors"
"encoding/xml"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/terorie/youtube-mango/data"
)

// transport is the connection pool shared by every request of this package.
var transport = http.Transport{
	MaxIdleConns:    10,
	IdleConnTimeout: 30 * time.Second,
}

// client performs all HTTP requests of this package.
//
// The overall Timeout bounds connecting, sending the request and reading
// the response body; without it a stalled server would block the caller
// indefinitely.
var client = http.Client{
	Transport: &transport,
	Timeout:   30 * time.Second,
}

// Request URL prefixes; the video ID is appended to form the full URL.
const (
	// mainURL is the prefix of the HTML watch page request.
	mainURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="
	// subtitleURL is the prefix of the subtitle track list request.
	subtitleURL = "https://video.google.com/timedtext?type=list&v="
)

// grab downloads the HTML watch page of v and returns its document tree.
//
// The request carries the headers set by requestHeader. A request error, a
// status other than 200 or a document parsing error yields a nil document
// and a non-nil error.
func grab(v *data.Video) (doc *goquery.Document, err error) {
	req, err := http.NewRequest("GET", mainURL+v.ID, nil)
	if err != nil {
		return nil, err
	}
	requestHeader(&req.Header)

	res, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	// Close the body on every path, including the status check below;
	// returning without closing would leak the connection.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return nil, errors.New("HTTP failure")
	}

	doc, err = goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}

// grabSubtitleList downloads the subtitle track list of v and appends the
// language code of every listed track to v.Subtitles.
//
// The request carries the headers set by requestHeader. A request error, a
// status other than 200 or an XML decoding error is returned; in those
// cases v.Subtitles is left untouched.
func grabSubtitleList(v *data.Video) (err error) {
	req, err := http.NewRequest("GET", subtitleURL+v.ID, nil)
	if err != nil {
		return err
	}
	requestHeader(&req.Header)

	res, err := client.Do(req)
	if err != nil {
		return err
	}
	// Close the body on every path, including the status check below;
	// returning without closing would leak the connection.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return errors.New("HTTP failure")
	}

	var tracks XMLSubTrackList
	if err = xml.NewDecoder(res.Body).Decode(&tracks); err != nil {
		return err
	}

	for _, track := range tracks.Tracks {
		v.Subtitles = append(v.Subtitles, track.LangCode)
	}
	return nil
}

// requestHeader sets the headers every outgoing request must carry.
//
// Important:
// - "Accept-Language: en-US" must be set or else the parser might break
// - "User-Agent: youtube-mango/0.1" identifies this client
//
// Set is used instead of Add so that applying the function more than once
// to the same header does not accumulate duplicate values.
func requestHeader(h *http.Header) {
	h.Set("Accept-Language", "en-US")
	h.Set("User-Agent", "youtube-mango/0.1")
}

+ 135
- 0
classic/parse.go Wyświetl plik

@@ -0,0 +1,135 @@
package classic

import (
"github.com/PuerkitoBio/goquery"
"errors"
"strconv"
"time"
"github.com/terorie/youtube-mango/data"
)

// Selectors used to locate elements of the watch page.
const (
	// likeBtnSelector matches the like button.
	likeBtnSelector = ".like-button-renderer-like-button-unclicked"
	// dislikeBtnSelector matches the dislike button.
	dislikeBtnSelector = ".like-button-renderer-dislike-button-unclicked"
	// viewCountSelector matches the view counter.
	viewCountSelector = "div .watch-view-count"
	// userInfoSelector matches the uploader info container.
	userInfoSelector = "div .yt-user-info"
	// channelNameSelector matches the channel name inside the uploader info.
	channelNameSelector = ".yt-uix-sessionlink"
)

// parseInfo bundles the video being filled in with the document tree its
// values are extracted from. The parse* methods read doc and write to v.
type parseInfo struct {
// v receives the parsed values.
v *data.Video
// doc is the document tree that is searched.
doc *goquery.Document
}

// parse runs every extraction step in order.
//
// Like/dislike counts, view count, uploader and description are mandatory:
// the first of these steps that fails stops parsing and its error is
// returned. The <meta> pass runs last and cannot fail.
func (p *parseInfo) parse() error {
	steps := [...]func() error{
		p.parseLikeDislike,
		p.parseViewCount,
		p.parseUploader,
		p.parseDescription,
	}
	for _, step := range steps {
		if err := step(); err != nil {
			return err
		}
	}

	p.parseMetas()
	return nil
}

// parseLikeDislike reads the text of the first like and dislike button and
// stores the numbers they contain in p.v.Likes and p.v.Dislikes.
//
// It fails when either button has no text or its text holds no parsable
// number.
func (p *parseInfo) parseLikeDislike() error {
	likes := p.doc.Find(likeBtnSelector).First().Text()
	dislikes := p.doc.Find(dislikeBtnSelector).First().Text()
	if likes == "" || dislikes == "" {
		return errors.New("failed to parse like buttons")
	}

	// The field is assigned before the error check, so it also receives
	// whatever value extractNumber returns alongside an error.
	likeCount, err := extractNumber(likes)
	p.v.Likes = likeCount
	if err != nil {
		return err
	}

	dislikeCount, err := extractNumber(dislikes)
	p.v.Dislikes = dislikeCount
	return err
}

// parseViewCount reads the text of the first view counter element and
// stores the number it contains in p.v.Views. On a parse error the field
// is left unchanged and the error is returned.
func (p *parseInfo) parseViewCount() error {
	text := p.doc.Find(viewCountSelector).First().Text()

	views, err := extractNumber(text)
	if err == nil {
		p.v.Views = views
	}
	return err
}

// parseUploader extracts the uploader's channel URL and name from the
// uploader info element and stores them in p.v.UploaderURL and
// p.v.Uploader.
//
// The URL is stored as soon as it is found, so it stays set even when the
// name lookup fails afterwards.
func (p *parseInfo) parseUploader() error {
	info := p.doc.Find(userInfoSelector)

	// Channel link: the href is site-relative, so prefix the host
	href, _ := info.Find(".yt-uix-sessionlink").Attr("href")
	if len(href) == 0 {
		return errors.New("couldn't find channel link")
	}
	p.v.UploaderURL = "https://www.youtube.com" + href

	// Channel name
	name := info.Find(channelNameSelector).Text()
	if len(name) == 0 {
		return errors.New("could not find channel name")
	}
	p.v.Uploader = name

	return nil
}

// parseMetas copies values from the document's <meta> elements into the
// video.
//
// For every element the "property", "itemprop" and "content" attributes
// are collected. Elements without content are skipped. An element with a
// property is handled by its property only; otherwise its itemprop is
// used. Values that fail to parse (date, duration, boolean) are ignored
// and leave the corresponding field unchanged.
func (p *parseInfo) parseMetas() {
	for _, meta := range p.doc.Find("meta").Nodes {
		// Collect the attributes of interest
		var property, itemprop, content string
		for _, a := range meta.Attr {
			switch a.Key {
			case "property":
				property = a.Val
			case "itemprop":
				itemprop = a.Val
			case "content":
				content = a.Val
			}
		}

		switch {
		case content == "":
			// Content not set: nothing to store

		case property != "":
			// <meta property …
			switch property {
			case "og:title":
				p.v.Title = content
			case "og:video:tag":
				p.v.Tags = append(p.v.Tags, content)
			case "og:url":
				p.v.URL = content
			case "og:image":
				p.v.Thumbnail = content
			}

		case itemprop != "":
			// <meta itemprop …
			switch itemprop {
			case "datePublished":
				if date, err := time.Parse("2006-01-02", content); err == nil {
					p.v.UploadDate = date
				}
			case "genre":
				p.v.Genre = content
			case "channelId":
				p.v.UploaderID = content
			case "duration":
				if dur, err := parseDuration(content); err == nil {
					p.v.Duration = dur
				}
			case "isFamilyFriendly":
				if friendly, err := strconv.ParseBool(content); err == nil {
					p.v.FamilyFriendly = friendly
				}
			}
		}
	}
}

+ 103
- 0
classic/parsedescription.go Wyświetl plik

@@ -0,0 +1,103 @@
package classic

import (
"errors"
"golang.org/x/net/html"
"bytes"
"github.com/terorie/youtube-mango/common"
"strings"
)

// descriptionSelector locates the root element of the video description.
const descriptionSelector = "#eow-description"

// parseDescription renders the description element into p.v.Description.
//
// Only the direct children of the description root are visited: text nodes
// are written through the Markdown text escaper, <br> becomes a newline and
// <a> is handed to parseLink. Any other element is skipped. An error is
// returned when the description root is missing or a child cannot be
// converted; p.v.Description is only assigned on success.
//
// The result is stored, not printed: a library function must not write
// debug output to the process's standard streams.
func (p *parseInfo) parseDescription() error {
	// Find description root
	descNode := p.doc.Find(descriptionSelector).First()
	if len(descNode.Nodes) == 0 {
		return errors.New("could not find description")
	}

	// Markdown text
	var buffer bytes.Buffer

	// Enumerate nodes
	for c := descNode.Nodes[0].FirstChild; c != nil; c = c.NextSibling {
		switch c.Type {
		case html.TextNode:
			// FIXME: "&amp;lt;" gets parsed to => "<"
			// Write text to buffer, escaping markdown
			if err := common.MarkdownTextEscape.ToBuffer(c.Data, &buffer); err != nil {
				return err
			}
		case html.ElementNode:
			switch c.Data {
			case "br":
				// Newline
				if err := buffer.WriteByte('\n'); err != nil {
					return err
				}
			case "a":
				// Link
				if err := parseLink(c, &buffer); err != nil {
					return err
				}
			}
		}
	}

	// Save description
	p.v.Description = buffer.String()
	return nil
}

// parseLink writes the visible text of the link element c to dest.
//
// A link without children is ignored. The first child must be a text node,
// otherwise an error is returned. Only the first href attribute is
// considered; a link without href writes nothing. Hrefs starting with
// "/results" (hashtags) or "/redirect" (real links) are reduced to their
// text, every other href is rejected as an unknown link.
func parseLink(c *html.Node, dest *bytes.Buffer) error {
	// Find text
	label := c.FirstChild
	if label == nil {
		// Empty link
		return nil
	}
	if label.Type != html.TextNode {
		return errors.New("unexpected non-text node")
	}

	// Find href
	href, found := "", false
	for _, attr := range c.Attr {
		if attr.Key == "href" {
			href, found = attr.Val, true
			break
		}
	}
	if !found {
		return nil
	}

	/*
		Not needed for "/redirect" links:
		// Decode link from href
		link, err := decodeLink(attr.Val)
		if err != nil { return err }
		// Escape to markdown
		link, err = common.MarkdownLinkEscape.ToString(link)
		if err != nil { return err }
		// Write to buffer
		dest.WriteString(fmt.Sprintf("[%s](%s)\n", text, link))
	*/
	isHashtag := strings.HasPrefix(href, "/results")
	isRealLink := strings.HasPrefix(href, "/redirect")
	if !isHashtag && !isRealLink {
		return errors.New("unknown link")
	}

	dest.WriteString(label.Data)
	return nil
}

/* Not needed

func decodeLink(href string) (string, error) {
url, err := url2.Parse(href)
if err != nil { return "", err }

query := url.Query()
link := query.Get("q")
if link == "" { return "", errors.New("empty link") }

return link, nil
}

*/

+ 44
- 0
classic/util.go Wyświetl plik

@@ -0,0 +1,44 @@
package classic

import (
"time"
"errors"
"strings"
"strconv"
)

// "PT6M57S" => 6 min 57 s
//
// parseDuration converts a duration code of the form "PT<minutes>M<seconds>S"
// into a time.Duration.
//
// Both the minutes and the seconds part are required and must be unsigned
// decimal numbers. Input that does not have this shape, including input
// too short to contain the markers, yields an error and never a panic.
func parseDuration(d string) (time.Duration, error) {
	if !strings.HasPrefix(d, "PT") || !strings.HasSuffix(d, "S") {
		return 0, errors.New("unknown duration code")
	}
	// Strip the leading "PT" and the trailing "S"; the prefix and suffix
	// cannot overlap, so the slice bounds are always valid here.
	body := d[2 : len(d)-1]

	mIndex := strings.IndexByte(body, 'M')
	if mIndex == -1 {
		return 0, errors.New("unknown duration code")
	}

	minutes, err := strconv.ParseUint(body[:mIndex], 10, 32)
	if err != nil {
		return 0, err
	}
	// The seconds start after the 'M' marker, not on it.
	seconds, err := strconv.ParseUint(body[mIndex+1:], 10, 32)
	if err != nil {
		return 0, err
	}

	return time.Duration(minutes)*time.Minute + time.Duration(seconds)*time.Second, nil
}

// "137,802 views" => 137802
//
// extractNumber keeps the ASCII digits of s, drops every other byte and
// parses what is left as a base-10 unsigned 64-bit integer. A string
// without any digit yields the parse error for empty input.
func extractNumber(s string) (uint64, error) {
	var digits []byte
	for i := 0; i < len(s); i++ {
		if c := s[i]; '0' <= c && c <= '9' {
			digits = append(digits, c)
		}
	}
	return strconv.ParseUint(string(digits), 10, 64)
}

+ 45
- 0
common/escape.go Wyświetl plik

@@ -0,0 +1,45 @@
package common

import "bytes"

// EscapeMap is a Markdown escape map (ASCII): a 128-bit set holding one bit
// per ASCII code point. A set bit marks a character that is written with a
// preceding backslash.
// Inspired by https://github.com/golang-commonmark/markdown/blob/master/escape.go
type EscapeMap [2]uint64

// Get reports whether index is marked for escaping. Indexes of 128 and
// above are never marked.
func (e *EscapeMap) Get(index uint) bool {
	if index >= 128 {
		return false
	}
	word, bit := index/64, index%64
	return e[word]&(uint64(1)<<bit) != 0
}

// Set marks index for escaping when x is true and unmarks it when x is
// false. Indexes of 128 and above are ignored.
func (e *EscapeMap) Set(index uint, x bool) {
	if index >= 128 {
		return
	}
	// Build the mask first: &^ and << share one precedence level and
	// associate to the left, so "a &^ 1 << n" would shift the whole word
	// instead of clearing a single bit.
	word, mask := index/64, uint64(1)<<(index%64)
	if x {
		e[word] |= mask
	} else {
		e[word] &^= mask
	}
}

// ToBuffer appends src to dest, writing a backslash in front of every
// marked ASCII character. Non-ASCII runes are copied unchanged. Writing
// stops at the first error, which is returned.
func (e EscapeMap) ToBuffer(src string, dest *bytes.Buffer) (err error) {
	for _, char := range src {
		if char < 0x80 && e.Get(uint(char)) {
			// Write backslash + char
			_, err = dest.Write([]byte{'\\', byte(char)})
		} else {
			_, err = dest.WriteRune(char)
		}
		if err != nil {
			return err
		}
	}
	return nil
}

// ToString returns src with every marked ASCII character escaped by a
// backslash. On error the returned string is empty.
func (e EscapeMap) ToString(src string) (string, error) {
	var buffer bytes.Buffer
	if err := e.ToBuffer(src, &buffer); err != nil {
		return "", err
	}
	return buffer.String(), nil
}

+ 5
- 0
common/http.go Wyświetl plik

@@ -0,0 +1,5 @@
package common

import "net/http"

// Client is a shared HTTP client that sends its requests through
// http.DefaultTransport.
// NOTE(review): no Timeout is configured here, so requests made through
// this client are not bounded in time — confirm that this is intended.
var Client = http.Client{Transport: http.DefaultTransport}

+ 16
- 0
common/markdown.go Wyświetl plik

@@ -0,0 +1,16 @@
package common

// MarkdownTextEscape marks the ASCII characters that are backslash-escaped
// in Markdown running text.
var MarkdownTextEscape EscapeMap

// MarkdownLinkEscape marks the ASCII characters that are backslash-escaped
// in Markdown links.
var MarkdownLinkEscape EscapeMap

// init fills the two escape maps with their character sets.
func init() {
	// The map is taken by pointer: EscapeMap is an array type, so a value
	// parameter would be a copy and the package-level maps would stay empty.
	registerMap := func(eMap *EscapeMap, escaped string) {
		for _, c := range escaped {
			eMap.Set(uint(c), true)
		}
	}

	registerMap(&MarkdownTextEscape, "\\!\"#$%&()*+/;<=>?@[]^_`{|}~-")
	registerMap(&MarkdownLinkEscape, "\\!\"#$%&'()*+,;<=>?@[]^_`{|}~-")
}


+ 41
- 0
data/video.go Wyświetl plik

@@ -0,0 +1,41 @@
package data

import "time"

// Video holds the metadata collected for a single video together with the
// JSON field names it is serialized under.
type Video struct {
// ID is the video ID; it is appended to the request URLs.
ID string `json:"id"`
// Title is taken from the og:title meta element.
Title string `json:"title"`
// Description is the text rendered from the description element.
Description string `json:"description"`
// Uploader is the channel name.
Uploader string `json:"uploader"`
// UploaderID is taken from the channelId meta element.
UploaderID string `json:"uploader_id"`
// UploaderURL is the absolute link to the uploader's channel.
UploaderURL string `json:"uploader_url"`
// UploadDate is taken from the datePublished meta element (YYYY-MM-DD).
UploadDate time.Time `json:"upload_date"`
// Thumbnail is taken from the og:image meta element.
Thumbnail string `json:"thumbnail"`
// URL is taken from the og:url meta element.
URL string `json:"url"`
// License is omitted from JSON when empty.
// NOTE(review): no visible code assigns this field.
License string `json:"license,omitempty"`
// Genre is taken from the genre meta element.
Genre string `json:"genre"`
// Tags collects one entry per og:video:tag meta element.
Tags []string `json:"tags"`
// Subtitles holds the language codes of the listed subtitle tracks.
Subtitles []string `json:"subtitles,omitempty"`
// Duration is taken from the duration meta element.
Duration time.Duration `json:"duration"`
// FamilyFriendly is taken from the isFamilyFriendly meta element.
FamilyFriendly bool `json:"family_friendly"`
// Views, Likes and Dislikes are the numbers extracted from the page's
// view counter and like/dislike buttons.
Views uint64 `json:"views"`
Likes uint64 `json:"likes"`
Dislikes uint64 `json:"dislikes"`
// Formats is omitted from JSON when empty.
// NOTE(review): no visible code assigns this field.
Formats []Format `json:"formats,omitempty"`
}

// Subtitle pairs a URL with an extension.
// NOTE(review): this type is not referenced by any visible code; presumably
// it describes one downloadable subtitle file — confirm before relying on it.
type Subtitle struct {
URL string
Extension string
}

// Format is the element type of Video.Formats.
// NOTE(review): no visible code fills these fields, so their exact meaning
// and units (for example of Height and Abr) cannot be confirmed from here;
// presumably each value describes one available media format of a video.
type Format struct {
FormatID string
URL string
PlayerURL string
Extension string
Height uint32
FormatNote string
AudioCodec string
Abr float32
}

+ 23
- 0
main.go Wyświetl plik

@@ -0,0 +1,23 @@
/* youtube-ma for MongoDB
*
* Based on https://github.com/CorentinB/youtube-ma */

package main

import (
"encoding/json"
"github.com/terorie/youtube-mango/data"
"github.com/terorie/youtube-mango/classic"
)

// main fetches the metadata of one hard-coded video and prints it as
// tab-indented JSON using the builtin println. Any failure panics.
func main() {
	video := &data.Video{ID: "kj9mFK62c6E"}

	if err := classic.Get(video); err != nil {
		panic(err)
	}

	out, err := json.MarshalIndent(video, "", "\t")
	if err != nil {
		panic(err)
	}

	println(string(out))
}

Ładowanie…
Anuluj
Zapisz