@@ -2,6 +2,8 @@ | |||
> YT metadata extractor inspired by [`youtube-ma` by _CorentinB_][youtube-ma] | |||
__Warning: Very WIP rn!__ | |||
##### Build | |||
Install and compile the Go project with `go get github.com/terorie/yt-mango`! | |||
@@ -15,14 +17,15 @@ If you don't have a Go toolchain, grab an executable from the Releases tab | |||
- _/apiclassic_: HTML API implementation (parsing using [goquery][goquery]) | |||
- _/apijson_: JSON API implementation (parsing using [fastjson][fastjson]) | |||
- _/net_: HTTP utilities (async HTTP implementation) | |||
- _/cmd_: Cobra CLI | |||
- _/util_: I don't have a better place for these | |||
- _/pretty_: (not yet used) Terminal color utilities | |||
- _/controller_: (not yet implemented) worker management | |||
- _/db_: (not yet implemented) MongoDB connection | |||
- _???_: (not yet implemented) Redis queue | |||
- _/classic_: Extractor calling the HTML `/watch` API | |||
- _/watchapi_: Extractor calling the JSON `/watch` API | |||
[youtube-ma]: https://github.com/CorentinB/youtube-ma | |||
[goquery]: https://github.com/PuerkitoBio/goquery | |||
[fastjson]: https://github.com/valyala/fastjson | |||
[cobra]: https://github.com/spf13/cobra |
@@ -4,6 +4,7 @@ import ( | |||
"github.com/terorie/yt-mango/data" | |||
"net/http" | |||
"github.com/terorie/yt-mango/apijson" | |||
"github.com/terorie/yt-mango/apiclassic" | |||
) | |||
type API struct { | |||
@@ -25,23 +26,25 @@ var Main *API = nil | |||
// TODO: Remove when everything is implemented | |||
var TempAPI = API{ | |||
GrabVideo: apijson.GrabVideo, | |||
ParseVideo: apijson.ParseVideo, | |||
GrabVideo: apiclassic.GrabVideo, | |||
ParseVideo: apiclassic.ParseVideo, | |||
GrabChannel: apiclassic.GrabChannel, | |||
ParseChannel: apiclassic.ParseChannel, | |||
GrabChannelPage: apijson.GrabChannelPage, | |||
ParseChannelVideoURLs: apijson.ParseChannelVideoURLs, | |||
} | |||
/*var ClassicAPI = API{ | |||
GetVideo: apiclassic.GetVideo, | |||
GetVideoSubtitleList: apiclassic.GetVideoSubtitleList, | |||
GetChannel: apiclassic.GetChannel, | |||
GetChannelVideoURLs: apiclassic.GetChannelVideoURLs, | |||
var ClassicAPI = API{ | |||
GrabVideo: apiclassic.GrabVideo, | |||
ParseVideo: apiclassic.ParseVideo, | |||
GrabChannel: apiclassic.GrabChannel, | |||
ParseChannel: apiclassic.ParseChannel, | |||
} | |||
var JsonAPI = API{ | |||
GetVideo: apijson.GetVideo, | |||
GetVideoSubtitleList: apiclassic.GetVideoSubtitleList, | |||
GetChannel: apijson.GetChannel, | |||
GetChannelVideoURLs: apijson.GetChannelVideoURLs, | |||
}*/ | |||
GrabChannelPage: apijson.GrabChannelPage, | |||
ParseChannelVideoURLs: apijson.ParseChannelVideoURLs, | |||
} |
@@ -2,37 +2,41 @@ package api | |||
import ( | |||
"regexp" | |||
"os" | |||
"strings" | |||
"log" | |||
"net/url" | |||
) | |||
// FIXME: API package should be abstract, no utility code in here | |||
var (
	// matchChannelID accepts a bare channel ID: word characters, dashes
	// and URL-encoded "=" signs (%3d / %3D).
	matchChannelID = regexp.MustCompile("^([\\w\\-]|(%3[dD]))+$")

	// matchVideoID accepts a bare video ID: word characters and dashes.
	matchVideoID = regexp.MustCompile("^[\\w\\-]+$")
)
func GetChannelID(chanURL string) (string, error) { | |||
// Input: Channel ID or link to YT channel page | |||
// Output: Channel ID or "" on error | |||
func GetChannelID(chanURL string) string { | |||
if !matchChannelID.MatchString(chanURL) { | |||
// Check if youtube.com domain | |||
_url, err := url.Parse(chanURL) | |||
if err != nil || (_url.Host != "www.youtube.com" && _url.Host != "youtube.com") { | |||
log.Fatal("Not a channel ID:", chanURL) | |||
os.Exit(1) | |||
return "" | |||
} | |||
// Check if old /user/ URL | |||
if strings.HasPrefix(_url.Path, "/user/") { | |||
// TODO Implement extraction of channel ID | |||
log.Fatal("New /channel/ link is required!\n" + | |||
"The old /user/ links do not work.") | |||
os.Exit(1) | |||
log.Print("New /channel/ link is required!\n" + | |||
"The old /user/ links do not work:", chanURL) | |||
return "" | |||
} | |||
// Remove /channel/ path | |||
channelID := strings.TrimPrefix(_url.Path, "/channel/") | |||
if len(channelID) == len(_url.Path) { | |||
// No such prefix to be removed | |||
log.Fatal("Not a channel ID:", channelID) | |||
os.Exit(1) | |||
log.Print("Not a channel ID:", channelID) | |||
return "" | |||
} | |||
// Remove rest of path from channel ID | |||
@@ -41,9 +45,40 @@ func GetChannelID(chanURL string) (string, error) { | |||
channelID = channelID[:slashIndex] | |||
} | |||
return channelID, nil | |||
return channelID | |||
} else { | |||
// It's already a channel ID | |||
return chanURL, nil | |||
return chanURL | |||
} | |||
} | |||
func GetVideoID(vidURL string) string { | |||
if !matchVideoID.MatchString(vidURL) { | |||
// Check if youtube.com domain | |||
_url, err := url.Parse(vidURL) | |||
if err != nil || (_url.Host != "www.youtube.com" && _url.Host != "youtube.com") { | |||
log.Fatal("Not a video ID:", vidURL) | |||
return "" | |||
} | |||
// TODO Support other URLs (/v or /embed) | |||
// Check if watch path | |||
if !strings.HasPrefix(_url.Path, "/watch") { | |||
log.Fatal("Not a watch URL:", vidURL) | |||
return "" | |||
} | |||
// Parse query string | |||
query := _url.Query() | |||
videoID := query.Get("v") | |||
if videoID == "" { | |||
log.Fatal("Invalid watch URL:", vidURL) | |||
return "" | |||
} | |||
return videoID | |||
} else { | |||
return vidURL | |||
} | |||
} |
@@ -1,38 +0,0 @@ | |||
package apiclassic | |||
import ( | |||
"github.com/terorie/yt-mango/data" | |||
"errors" | |||
) | |||
func GetVideo(v *data.Video) error { | |||
if len(v.ID) == 0 { return errors.New("no video ID") } | |||
// Download the doc tree | |||
doc, err := GrabVideo(v.ID) | |||
if err != nil { return err } | |||
// Parse it | |||
p := parseInfo{v, doc} | |||
err = p.parse() | |||
if err != nil { return err } | |||
return nil | |||
} | |||
func GetVideoSubtitleList(v *data.Video) (err error) { | |||
tracks, err := GrabSubtitleList(v.ID) | |||
if err != nil { return } | |||
for _, track := range tracks.Tracks { | |||
v.Subtitles = append(v.Subtitles, track.LangCode) | |||
} | |||
return | |||
} | |||
func GetChannel(c *data.Channel) error { | |||
return errors.New("not implemented") | |||
} | |||
// GetChannelVideoURLs is a placeholder: it always returns a nil slice and a
// "not implemented" error, regardless of channelID and page.
func GetChannelVideoURLs(channelID string, page uint) ([]string, error) {
	err := errors.New("not implemented")
	return nil, err
}
@@ -4,31 +4,22 @@ import ( | |||
"net/http" | |||
"errors" | |||
"encoding/xml" | |||
"github.com/PuerkitoBio/goquery" | |||
"github.com/terorie/yt-mango/net" | |||
"fmt" | |||
) | |||
const mainURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v=" | |||
const videoURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v=" | |||
const subtitleURL = "https://video.google.com/timedtext?type=list&v=" | |||
const channelURL = "https://www.youtube.com/channel/%s/about" | |||
// Grabs a HTML video page and returns the document tree | |||
func GrabVideo(videoID string) (doc *goquery.Document, err error) { | |||
req, err := http.NewRequest("GET", mainURL + videoID, nil) | |||
if err != nil { return } | |||
func GrabVideo(videoID string) *http.Request { | |||
req, err := http.NewRequest("GET", videoURL + videoID, nil) | |||
if err != nil { panic(err) } | |||
setHeaders(&req.Header) | |||
res, err := net.Client.Do(req) | |||
if err != nil { return } | |||
if res.StatusCode != 200 { return nil, errors.New("HTTP failure") } | |||
defer res.Body.Close() | |||
doc, err = goquery.NewDocumentFromReader(res.Body) | |||
if err != nil { return nil, err } | |||
return | |||
return req | |||
} | |||
// Grabs and parses a subtitle list | |||
func GrabSubtitleList(videoID string) (tracks *XMLSubTrackList, err error) { | |||
req, err := http.NewRequest("GET", subtitleURL + videoID, nil) | |||
if err != nil { return } | |||
@@ -46,6 +37,14 @@ func GrabSubtitleList(videoID string) (tracks *XMLSubTrackList, err error) { | |||
return | |||
} | |||
func GrabChannel(channelID string) *http.Request { | |||
req, err := http.NewRequest("GET", fmt.Sprintf(channelURL, channelID), nil) | |||
if err != nil { panic(err) } | |||
setHeaders(&req.Header) | |||
return req | |||
} | |||
func setHeaders(h *http.Header) { | |||
h.Add("Host", "www.youtube.com") | |||
h.Add("User-Agent", "yt-mango/0.1") |
@@ -0,0 +1,61 @@ | |||
package apiclassic | |||
import ( | |||
"github.com/terorie/yt-mango/data" | |||
"net/http" | |||
"errors" | |||
"github.com/PuerkitoBio/goquery" | |||
"strconv" | |||
) | |||
func ParseChannel(c *data.Channel, res *http.Response) (err error) { | |||
if res.StatusCode != 200 { return errors.New("HTTP failure") } | |||
defer res.Body.Close() | |||
doc, err := goquery.NewDocumentFromReader(res.Body) | |||
if err != nil { return } | |||
p := parseChannelInfo{c, doc} | |||
return p.parse() | |||
} | |||
type parseChannelInfo struct { | |||
c *data.Channel | |||
doc *goquery.Document | |||
} | |||
func (p *parseChannelInfo) parse() error { | |||
if err := p.parseMetas(); | |||
err != nil { return err } | |||
return nil | |||
} | |||
func (p *parseChannelInfo) parseMetas() error { | |||
p.doc.Find("head").RemoveFiltered("#watch-container") | |||
enumMetas(p.doc.Find("head").Find("meta"), func(tag metaTag)bool { | |||
content := tag.content | |||
switch tag.typ { | |||
case metaProperty: | |||
switch tag.name { | |||
case "og:title": | |||
p.c.Name = content | |||
} | |||
case metaItemProp: | |||
switch tag.name { | |||
case "paid": | |||
if val, err := strconv.ParseBool(content); | |||
err == nil { p.c.Paid = val } | |||
} | |||
} | |||
return false | |||
}) | |||
return nil | |||
} | |||
func (p *parseChannelInfo) parseAbout() error { | |||
p.doc.Find(".about-stats").Find(".about-stat").Each(func(_ int, s *goquery.Selection) { | |||
text := s.Text() | |||
println(text) | |||
}) | |||
return nil | |||
} |
@@ -4,13 +4,13 @@ import ( | |||
"errors" | |||
"golang.org/x/net/html" | |||
"bytes" | |||
"github.com/terorie/yt-mango/net" | |||
"strings" | |||
"github.com/terorie/yt-mango/util" | |||
) | |||
const descriptionSelector = "#eow-description" | |||
func (p *parseInfo) parseDescription() error { | |||
func (p *parseVideoInfo) parseDescription() error { | |||
// Find description root | |||
descNode := p.doc.Find(descriptionSelector).First() | |||
if len(descNode.Nodes) == 0 { return errors.New("could not find description") } | |||
@@ -24,7 +24,7 @@ func (p *parseInfo) parseDescription() error { | |||
case html.TextNode: | |||
// FIXME: "&lt;" gets parsed to => "<" | |||
// Write text to buffer, escaping markdown | |||
err := net.MarkdownTextEscape.ToBuffer(c.Data, &buffer) | |||
err := util.MarkdownTextEscape.ToBuffer(c.Data, &buffer) | |||
if err != nil { return err } | |||
case html.ElementNode: | |||
switch c.Data { |
@@ -0,0 +1,47 @@ | |||
package apiclassic | |||
import "github.com/PuerkitoBio/goquery" | |||
// metaType records which attribute supplied the name of a <meta> tag.
type metaType uint8

const (
	// metaUnknown: no naming attribute was found.
	metaUnknown metaType = iota
	// metaProperty: the name came from the "property" attribute.
	metaProperty
	// metaItemProp: the name came from the "itemprop" attribute.
	metaItemProp
)

// metaTag is the reduced view of a <meta> element handed to enumMetas callbacks.
type metaTag struct {
	typ     metaType // which attribute named the tag
	name    string   // value of the naming attribute
	content string   // value of the "content" attribute
}
func enumMetas(s *goquery.Selection, next func(metaTag)bool) { | |||
// For each <meta> | |||
s.EachWithBreak(func(i int, s *goquery.Selection) bool { | |||
tag := metaTag{ metaUnknown, "", "" } | |||
listAttrs: for _, attr := range s.Nodes[0].Attr { | |||
switch attr.Key { | |||
case "property": | |||
tag.typ = metaProperty | |||
tag.name = attr.Val | |||
break listAttrs | |||
case "itemprop": | |||
tag.typ = metaItemProp | |||
tag.name = attr.Val | |||
break listAttrs | |||
case "content": | |||
tag.content = attr.Val | |||
break listAttrs | |||
} | |||
if tag.typ == metaUnknown { continue } | |||
if len(tag.content) == 0 { continue } | |||
// Callback tag | |||
if !next(tag) { | |||
return true | |||
} | |||
} | |||
return false | |||
}) | |||
} |
@@ -9,6 +9,7 @@ import ( | |||
"regexp" | |||
"github.com/valyala/fastjson" | |||
"strings" | |||
"net/http" | |||
) | |||
const likeBtnSelector = ".like-button-renderer-like-button-unclicked" | |||
@@ -19,12 +20,23 @@ const channelNameSelector = ".yt-uix-sessionlink" | |||
var playerConfigErr = errors.New("failed to parse player config") | |||
type parseInfo struct { | |||
func ParseVideo(v *data.Video, res *http.Response) (err error) { | |||
if res.StatusCode != 200 { return errors.New("HTTP failure") } | |||
defer res.Body.Close() | |||
doc, err := goquery.NewDocumentFromReader(res.Body) | |||
if err != nil { return } | |||
p := parseVideoInfo{v, doc} | |||
return p.parse() | |||
} | |||
type parseVideoInfo struct { | |||
v *data.Video | |||
doc *goquery.Document | |||
} | |||
func (p *parseInfo) parse() error { | |||
func (p *parseVideoInfo) parse() error { | |||
if err := p.parseLikeDislike(); | |||
err != nil { return err } | |||
if err := p.parseViewCount(); | |||
@@ -40,7 +52,7 @@ func (p *parseInfo) parse() error { | |||
return nil | |||
} | |||
func (p *parseInfo) parseLikeDislike() error { | |||
func (p *parseVideoInfo) parseLikeDislike() error { | |||
likeText := p.doc.Find(likeBtnSelector).First().Text() | |||
dislikeText := p.doc.Find(dislikeBtnSelector).First().Text() | |||
@@ -57,7 +69,7 @@ func (p *parseInfo) parseLikeDislike() error { | |||
return nil | |||
} | |||
func (p *parseInfo) parseViewCount() error { | |||
func (p *parseVideoInfo) parseViewCount() error { | |||
viewCountText := p.doc.Find(viewCountSelector).First().Text() | |||
viewCount, err := extractNumber(viewCountText) | |||
if err != nil { return err } | |||
@@ -65,7 +77,7 @@ func (p *parseInfo) parseViewCount() error { | |||
return nil | |||
} | |||
func (p *parseInfo) parseUploader() error { | |||
func (p *parseVideoInfo) parseUploader() error { | |||
userInfo := p.doc.Find(userInfoSelector) | |||
userLinkNode := userInfo.Find(".yt-uix-sessionlink") | |||
@@ -81,30 +93,12 @@ func (p *parseInfo) parseUploader() error { | |||
return nil | |||
} | |||
func (p *parseInfo) parseMetas() error { | |||
metas := p.doc.Find("meta") | |||
// For each <meta> | |||
for _, node := range metas.Nodes { | |||
// Attributes | |||
var content string | |||
var itemprop string | |||
var prop string | |||
// Parse attributes | |||
for _, attr := range node.Attr { | |||
switch attr.Key { | |||
case "property": prop = attr.Val | |||
case "itemprop": itemprop = attr.Val | |||
case "content": content = attr.Val | |||
} | |||
} | |||
// Content not set | |||
if len(content) == 0 { continue } | |||
// <meta property … | |||
if len(prop) != 0 { | |||
switch prop { | |||
func (p *parseVideoInfo) parseMetas() (err error) { | |||
enumMetas(p.doc.Selection, func(tag metaTag)bool { | |||
content := tag.content | |||
switch tag.typ { | |||
case metaProperty: | |||
switch tag.name { | |||
case "og:title": | |||
p.v.Title = content | |||
case "og:video:tag": | |||
@@ -114,11 +108,8 @@ func (p *parseInfo) parseMetas() error { | |||
case "og:image": | |||
p.v.Thumbnail = content | |||
} | |||
continue | |||
} | |||
// <meta itemprop … | |||
if len(itemprop) != 0 { | |||
switch itemprop { | |||
case metaItemProp: | |||
switch tag.name { | |||
case "datePublished": | |||
if val, err := time.Parse("2006-01-02", content); | |||
err == nil { p.v.UploadDate = val } | |||
@@ -130,19 +121,20 @@ func (p *parseInfo) parseMetas() error { | |||
if val, err := parseDuration(content); err == nil { | |||
p.v.Duration = val | |||
} else { | |||
return err | |||
return false | |||
} | |||
case "isFamilyFriendly": | |||
if val, err := strconv.ParseBool(content); | |||
err == nil { p.v.FamilyFriendly = val } | |||
} | |||
continue | |||
} | |||
} | |||
return nil | |||
return true | |||
}) | |||
return err | |||
} | |||
func (p *parseInfo) parsePlayerConfig() error { | |||
func (p *parseVideoInfo) parsePlayerConfig() error { | |||
var json string | |||
p.doc.Find("script").EachWithBreak(func(_ int, s *goquery.Selection) bool { |
@@ -8,7 +8,6 @@ const videoURL = "https://www.youtube.com/watch?pbj=1&v=" | |||
const channelURL = "https://www.youtube.com/browse_ajax?ctoken=" | |||
func GrabVideo(videoID string) *http.Request { | |||
// Prepare request | |||
req, err := http.NewRequest("GET", videoURL + videoID, nil) | |||
if err != nil { panic(err) } | |||
setHeaders(&req.Header) |
@@ -14,4 +14,5 @@ var Channel = cobra.Command{ | |||
func init() { | |||
channelDumpCmd.Flags().BoolVarP(&force, "force", "f", false, "Overwrite the output file if it already exists") | |||
Channel.AddCommand(&channelDumpCmd) | |||
Channel.AddCommand(&channelDetailCmd) | |||
} |
@@ -0,0 +1,43 @@ | |||
package cmd | |||
import ( | |||
"github.com/spf13/cobra" | |||
"github.com/terorie/yt-mango/api" | |||
"os" | |||
"log" | |||
"github.com/terorie/yt-mango/net" | |||
"github.com/terorie/yt-mango/data" | |||
"fmt" | |||
"encoding/json" | |||
) | |||
var channelDetailCmd = cobra.Command{ | |||
Use: "detail <channel ID>", | |||
Short: "Get detail about a channel", | |||
Args: cobra.ExactArgs(1), | |||
Run: doChannelDetail, | |||
} | |||
func doChannelDetail(_ *cobra.Command, args []string) { | |||
channelID := args[0] | |||
channelID = api.GetChannelID(channelID) | |||
if channelID == "" { | |||
os.Exit(1) | |||
} | |||
channelReq := api.Main.GrabChannel(channelID) | |||
res, err := net.Client.Do(channelReq) | |||
if err != nil { | |||
log.Fatal(err) | |||
os.Exit(1) | |||
} | |||
var c data.Channel | |||
api.Main.ParseChannel(&c, res) | |||
bytes, err := json.MarshalIndent(&c, "", "\t") | |||
if err != nil { panic(err) } | |||
fmt.Println(string(bytes)) | |||
} |
@@ -56,11 +56,8 @@ func doChannelDump(_ *cobra.Command, args []string) { | |||
} | |||
channelDumpContext.printResults = printResults | |||
channelID, err := api.GetChannelID(channelID) | |||
if err != nil { | |||
log.Print(err) | |||
os.Exit(1) | |||
} | |||
channelID = api.GetChannelID(channelID) | |||
if channelID == "" { os.Exit(1) } | |||
log.Printf("Starting work on channel ID \"%s\".", channelID) | |||
channelDumpContext.startTime = time.Now() |
@@ -1,14 +1,41 @@ | |||
package cmd | |||
import "github.com/spf13/cobra" | |||
import ( | |||
"github.com/spf13/cobra" | |||
"github.com/terorie/yt-mango/api" | |||
"os" | |||
"github.com/terorie/yt-mango/net" | |||
"github.com/terorie/yt-mango/data" | |||
"log" | |||
"fmt" | |||
"encoding/json" | |||
) | |||
var videoDetailCmd = cobra.Command{ | |||
Use: "detail <video ID> [file]", | |||
Short: "Get details about a video", | |||
Args: cobra.ExactArgs(1), | |||
Run: func(cmd *cobra.Command, args []string) { | |||
videoID := args[0] | |||
}, | |||
} | |||
videoID = api.GetVideoID(videoID) | |||
if videoID == "" { | |||
os.Exit(1) | |||
} | |||
videoReq := api.Main.GrabVideo(videoID) | |||
func init() { | |||
res, err := net.Client.Do(videoReq) | |||
if err != nil { | |||
log.Fatal(err) | |||
os.Exit(1) | |||
} | |||
var v data.Video | |||
api.Main.ParseVideo(&v, res) | |||
bytes, err := json.MarshalIndent(&v, "", "\t") | |||
if err != nil { panic(err) } | |||
fmt.Println(string(bytes)) | |||
}, | |||
} |
@@ -3,4 +3,6 @@ package data | |||
// Channel is the record emitted for a channel; the struct tags define its
// JSON field names.
type Channel struct {
	// ID is serialized as "id".
	ID string `json:"id"`
	// Name is serialized as "name".
	Name string `json:"name"`
	// Paid is serialized as "paid".
	Paid bool `json:"paid"`
	// Thumbnail is serialized as "thumbnail".
	Thumbnail string `json:"thumbnail"`
}
@@ -39,8 +39,8 @@ func main() { | |||
switch forceAPI { | |||
case "": api.Main = &api.TempAPI | |||
//case "classic": api.Main = &api.ClassicAPI | |||
//case "json": api.Main = &api.JsonAPI | |||
case "classic": api.Main = &api.ClassicAPI | |||
case "json": api.Main = &api.JsonAPI | |||
default: | |||
fmt.Fprintln(os.Stderr, "Invalid API specified.\n" + | |||
"Valid options are: \"classic\" and \"json\"") |
@@ -1,4 +1,4 @@ | |||
package api | |||
package util | |||
import "bytes" | |||
@@ -1,4 +1,4 @@ | |||
package api | |||
package util | |||
var MarkdownTextEscape EscapeMap | |||
var MarkdownLinkEscape EscapeMap |
@@ -1,5 +1,7 @@ | |||
package version | |||
// TODO Refactor: Dedicating a single package is too much | |||
// Get returns the program's version string.
func Get() string {
	const current = "v0.1 -- dev"
	return current
}