Browse Source

Some more CLI tests

master
terorie 1 year ago
parent
commit
0e47e3a0ba

+ 5
- 2
README.md View File

@@ -2,6 +2,8 @@

> YT metadata extractor inspired by [`youtube-ma` by _CorentinB_][youtube-ma]

__Warning: Very WIP rn!__

##### Build

Install and compile the Go project with `go get github.com/terorie/yt-mango`!
@@ -15,14 +17,15 @@ If you don't have a Go toolchain, grab an executable from the Releases tab
- _/apiclassic_: HTML API implementation (parsing using [goquery][goquery])
- _/apijson_: JSON API implementation (parsing using [fastjson][fastjson])
- _/net_: HTTP utilities (async HTTP implementation)
- _/cmd_: Cobra CLI
- _/util_: I don't have a better place for these

- _/pretty_: (not yet used) Terminal color utilities
- _/controller_: (not yet implemented) worker management
- _/db_: (not yet implemented) MongoDB connection
- _???_: (not yet implemented) Redis queue
- _/classic_: Extractor calling the HTML `/watch` API
- _/watchapi_: Extractor calling the JSON `/watch` API

[youtube-ma]: https://github.com/CorentinB/youtube-ma
[goquery]: https://github.com/PuerkitoBio/goquery
[fastjson]: https://github.com/valyala/fastjson
[cobra]: https://github.com/spf13/cobra

+ 15
- 12
api/api.go View File

@@ -4,6 +4,7 @@ import (
"github.com/terorie/yt-mango/data"
"net/http"
"github.com/terorie/yt-mango/apijson"
"github.com/terorie/yt-mango/apiclassic"
)

type API struct {
@@ -25,23 +26,25 @@ var Main *API = nil

// TODO: Remove when everything is implemented
var TempAPI = API{
GrabVideo: apijson.GrabVideo,
ParseVideo: apijson.ParseVideo,
GrabVideo: apiclassic.GrabVideo,
ParseVideo: apiclassic.ParseVideo,

GrabChannel: apiclassic.GrabChannel,
ParseChannel: apiclassic.ParseChannel,

GrabChannelPage: apijson.GrabChannelPage,
ParseChannelVideoURLs: apijson.ParseChannelVideoURLs,
}

/*var ClassicAPI = API{
GetVideo: apiclassic.GetVideo,
GetVideoSubtitleList: apiclassic.GetVideoSubtitleList,
GetChannel: apiclassic.GetChannel,
GetChannelVideoURLs: apiclassic.GetChannelVideoURLs,
var ClassicAPI = API{
GrabVideo: apiclassic.GrabVideo,
ParseVideo: apiclassic.ParseVideo,

GrabChannel: apiclassic.GrabChannel,
ParseChannel: apiclassic.ParseChannel,
}

var JsonAPI = API{
GetVideo: apijson.GetVideo,
GetVideoSubtitleList: apiclassic.GetVideoSubtitleList,
GetChannel: apijson.GetChannel,
GetChannelVideoURLs: apijson.GetChannelVideoURLs,
}*/
GrabChannelPage: apijson.GrabChannelPage,
ParseChannelVideoURLs: apijson.ParseChannelVideoURLs,
}

+ 45
- 10
api/ids.go View File

@@ -2,37 +2,41 @@ package api

import (
"regexp"
"os"
"strings"
"log"
"net/url"
)

// FIXME: API package should be abstract, no utility code in here

var matchChannelID = regexp.MustCompile("^([\\w\\-]|(%3[dD]))+$")
var matchVideoID = regexp.MustCompile("^[\\w\\-]+$")

func GetChannelID(chanURL string) (string, error) {
// Input: Channel ID or link to YT channel page
// Output: Channel ID or "" on error
func GetChannelID(chanURL string) string {
if !matchChannelID.MatchString(chanURL) {
// Check if youtube.com domain
_url, err := url.Parse(chanURL)
if err != nil || (_url.Host != "www.youtube.com" && _url.Host != "youtube.com") {
log.Fatal("Not a channel ID:", chanURL)
os.Exit(1)
return ""
}

// Check if old /user/ URL
if strings.HasPrefix(_url.Path, "/user/") {
// TODO Implement extraction of channel ID
log.Fatal("New /channel/ link is required!\n" +
"The old /user/ links do not work.")
os.Exit(1)
log.Print("New /channel/ link is required!\n" +
"The old /user/ links do not work:", chanURL)
return ""
}

// Remove /channel/ path
channelID := strings.TrimPrefix(_url.Path, "/channel/")
if len(channelID) == len(_url.Path) {
// No such prefix to be removed
log.Fatal("Not a channel ID:", channelID)
os.Exit(1)
log.Print("Not a channel ID:", channelID)
return ""
}

// Remove rest of path from channel ID
@@ -41,9 +45,40 @@ func GetChannelID(chanURL string) (string, error) {
channelID = channelID[:slashIndex]
}

return channelID, nil
return channelID
} else {
// It's already a channel ID
return chanURL, nil
return chanURL
}
}

// GetVideoID extracts a YouTube video ID from its argument.
//
// Input: a bare video ID, or a link to a youtube.com /watch page.
// Output: the video ID, or "" on error (the reason is logged).
//
// Failures are logged with log.Print rather than log.Fatal: log.Fatal
// terminates the process, which would make the "" return unreachable and
// take the decision to exit away from the caller.
func GetVideoID(vidURL string) string {
	// It's already a video ID
	if matchVideoID.MatchString(vidURL) {
		return vidURL
	}

	// Check if youtube.com domain
	_url, err := url.Parse(vidURL)
	if err != nil || (_url.Host != "www.youtube.com" && _url.Host != "youtube.com") {
		log.Print("Not a video ID:", vidURL)
		return ""
	}

	// TODO Support other URLs (/v or /embed)

	// Check if watch path
	if !strings.HasPrefix(_url.Path, "/watch") {
		log.Print("Not a watch URL:", vidURL)
		return ""
	}

	// Parse query string. The "v" value must itself look like a video ID:
	// this rejects a missing/empty parameter as well as values containing
	// characters that must not be spliced into a request URL later on.
	videoID := _url.Query().Get("v")
	if !matchVideoID.MatchString(videoID) {
		log.Print("Invalid watch URL:", vidURL)
		return ""
	}

	return videoID
}

+ 0
- 38
apiclassic/get.go View File

@@ -1,38 +0,0 @@
package apiclassic

import (
"github.com/terorie/yt-mango/data"
"errors"
)

// GetVideo fills v with the details scraped from its HTML watch page.
// v.ID must be set; an empty ID yields a "no video ID" error. Errors from
// downloading or parsing the page are returned unchanged.
func GetVideo(v *data.Video) error {
	if v.ID == "" {
		return errors.New("no video ID")
	}

	// Fetch the document tree of the watch page
	tree, err := GrabVideo(v.ID)
	if err != nil {
		return err
	}

	// Run the parser over the tree, writing results into v
	info := parseInfo{v, tree}
	return info.parse()
}

// GetVideoSubtitleList downloads the subtitle track list for v.ID and
// appends the language code of every track to v.Subtitles.
func GetVideoSubtitleList(v *data.Video) (err error) {
	list, err := GrabSubtitleList(v.ID)
	if err != nil {
		return err
	}

	for _, entry := range list.Tracks {
		v.Subtitles = append(v.Subtitles, entry.LangCode)
	}
	return nil
}

// GetChannel is a stub: it always returns a "not implemented" error
// and does not touch c.
func GetChannel(c *data.Channel) error {
return errors.New("not implemented")
}

// GetChannelVideoURLs is a stub: whatever channelID and page are given, it
// yields a nil slice together with a "not implemented" error.
func GetChannelVideoURLs(channelID string, page uint) ([]string, error) {
	var urls []string
	return urls, errors.New("not implemented")
}

+ 15
- 16
apiclassic/grab.go View File

@@ -4,31 +4,22 @@ import (
"net/http"
"errors"
"encoding/xml"
"github.com/PuerkitoBio/goquery"
"github.com/terorie/yt-mango/net"
"fmt"
)

const mainURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="
const videoURL = "https://www.youtube.com/watch?has_verified=1&bpctr=6969696969&v="
const subtitleURL = "https://video.google.com/timedtext?type=list&v="
const channelURL = "https://www.youtube.com/channel/%s/about"

// Grabs a HTML video page and returns the document tree
func GrabVideo(videoID string) (doc *goquery.Document, err error) {
req, err := http.NewRequest("GET", mainURL + videoID, nil)
if err != nil { return }
func GrabVideo(videoID string) *http.Request {
req, err := http.NewRequest("GET", videoURL + videoID, nil)
if err != nil { panic(err) }
setHeaders(&req.Header)

res, err := net.Client.Do(req)
if err != nil { return }
if res.StatusCode != 200 { return nil, errors.New("HTTP failure") }

defer res.Body.Close()
doc, err = goquery.NewDocumentFromReader(res.Body)
if err != nil { return nil, err }

return
return req
}

// Grabs and parses a subtitle list
func GrabSubtitleList(videoID string) (tracks *XMLSubTrackList, err error) {
req, err := http.NewRequest("GET", subtitleURL + videoID, nil)
if err != nil { return }
@@ -46,6 +37,14 @@ func GrabSubtitleList(videoID string) (tracks *XMLSubTrackList, err error) {
return
}

// GrabChannel builds the GET request for the channel page addressed by
// channelURL with channelID substituted in, and applies the package's
// standard headers. It panics if the request cannot be constructed.
func GrabChannel(channelID string) *http.Request {
	target := fmt.Sprintf(channelURL, channelID)

	req, err := http.NewRequest("GET", target, nil)
	if err != nil {
		panic(err)
	}

	setHeaders(&req.Header)
	return req
}

func setHeaders(h *http.Header) {
h.Add("Host", "www.youtube.com")
h.Add("User-Agent", "yt-mango/0.1")

+ 61
- 0
apiclassic/parsechannel.go View File

@@ -0,0 +1,61 @@
package apiclassic

import (
"github.com/terorie/yt-mango/data"
"net/http"
"errors"
"github.com/PuerkitoBio/goquery"
"strconv"
)

// ParseChannel reads the HTML channel page carried by res and stores the
// extracted details in c.
//
// The response body is always closed before returning. A non-200 status
// yields an "HTTP failure" error; errors from building the document tree
// or from parsing it are returned unchanged.
func ParseChannel(c *data.Channel, res *http.Response) (err error) {
	// Register the close first so the body is released on every path,
	// including the early return for a bad status code.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return errors.New("HTTP failure")
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return err
	}

	p := parseChannelInfo{c, doc}
	return p.parse()
}

// parseChannelInfo bundles the state of one channel page parse:
// the record being filled in and the document it is filled from.
type parseChannelInfo struct {
// c is the channel record the parse methods write to.
c *data.Channel
// doc is the document tree built from the HTTP response body.
doc *goquery.Document
}

// parse runs the channel parsing steps; at present that is only the
// <meta> tag pass, whose error (if any) is returned as-is.
func (p *parseChannelInfo) parse() error {
	return p.parseMetas()
}

// parseMetas walks the <meta> tags inside <head> and copies the ones it
// recognises into the channel record:
//
//	property "og:title" -> Name
//	itemprop "paid"     -> Paid (ignored if not a valid boolean)
//
// It always returns nil.
func (p *parseChannelInfo) parseMetas() error {
	// NOTE(review): kept from the original; presumably meant to drop the
	// watch container before scanning — confirm it has any effect here.
	p.doc.Find("head").RemoveFiltered("#watch-container")

	enumMetas(p.doc.Find("head").Find("meta"), func(tag metaTag) bool {
		content := tag.content
		switch tag.typ {
		case metaProperty:
			switch tag.name {
			case "og:title":
				p.c.Name = content
			}
		case metaItemProp:
			switch tag.name {
			case "paid":
				if val, err := strconv.ParseBool(content); err == nil {
					p.c.Paid = val
				}
			}
		}
		// Return true to keep enumerating, the same convention the video
		// meta callback uses (true at the end, false only to abort).
		// Returning false here would stop after the first tag and leave
		// the remaining fields unset.
		return true
	})
	return nil
}

// parseAbout prints the text of every ".about-stat" element found below
// ".about-stats". Nothing is stored in the channel record yet, and the
// result is always nil.
func (p *parseChannelInfo) parseAbout() error {
	stats := p.doc.Find(".about-stats").Find(".about-stat")
	stats.Each(func(_ int, stat *goquery.Selection) {
		println(stat.Text())
	})
	return nil
}

+ 3
- 3
apiclassic/parsedescription.go View File

@@ -4,13 +4,13 @@ import (
"errors"
"golang.org/x/net/html"
"bytes"
"github.com/terorie/yt-mango/net"
"strings"
"github.com/terorie/yt-mango/util"
)

const descriptionSelector = "#eow-description"

func (p *parseInfo) parseDescription() error {
func (p *parseVideoInfo) parseDescription() error {
// Find description root
descNode := p.doc.Find(descriptionSelector).First()
if len(descNode.Nodes) == 0 { return errors.New("could not find description") }
@@ -24,7 +24,7 @@ func (p *parseInfo) parseDescription() error {
case html.TextNode:
// FIXME: "&amp;lt;" gets parsed to => "<"
// Write text to buffer, escaping markdown
err := net.MarkdownTextEscape.ToBuffer(c.Data, &buffer)
err := util.MarkdownTextEscape.ToBuffer(c.Data, &buffer)
if err != nil { return err }
case html.ElementNode:
switch c.Data {

+ 47
- 0
apiclassic/parsemetas.go View File

@@ -0,0 +1,47 @@
package apiclassic

import "github.com/PuerkitoBio/goquery"

// metaType records which attribute supplied the name of a <meta> tag.
type metaType uint8
const (
// metaUnknown is the zero value: no naming attribute was found.
metaUnknown = metaType(iota)
// metaProperty marks a tag named through its "property" attribute.
metaProperty
// metaItemProp marks a tag named through its "itemprop" attribute.
metaItemProp
)

// metaTag is the summary of one <meta> element that enumMetas passes
// to its callback.
type metaTag struct {
typ metaType // which attribute supplied name
name string // value of the "property" or "itemprop" attribute
content string // value of the "content" attribute
}

// enumMetas calls next once for every element of s that looks like a named
// <meta> tag, i.e. one carrying a "property" or "itemprop" attribute and a
// non-empty "content" attribute. Elements that do not qualify are skipped.
//
// next returns true to continue with the following element and false to
// stop the enumeration early (the same convention as goquery's
// EachWithBreak).
//
// When an element carries both naming attributes, "property" wins.
func enumMetas(s *goquery.Selection, next func(metaTag) bool) {
	// For each <meta>
	s.EachWithBreak(func(_ int, s *goquery.Selection) bool {
		// Collect ALL relevant attributes before deciding anything: the
		// name and the content live in different attributes, so stopping
		// at the first match can never produce a complete tag.
		var tag metaTag
		for _, attr := range s.Nodes[0].Attr {
			switch attr.Key {
			case "property":
				tag.typ = metaProperty
				tag.name = attr.Val
			case "itemprop":
				if tag.typ != metaProperty {
					tag.typ = metaItemProp
					tag.name = attr.Val
				}
			case "content":
				tag.content = attr.Val
			}
		}

		// Not a named tag, or nothing to report: move on to the next one.
		if tag.typ == metaUnknown || len(tag.content) == 0 {
			return true
		}

		// Callback tag; its answer decides whether the walk goes on.
		return next(tag)
	})
}

apiclassic/parse.go → apiclassic/parsevideo.go View File

@@ -9,6 +9,7 @@ import (
"regexp"
"github.com/valyala/fastjson"
"strings"
"net/http"
)

const likeBtnSelector = ".like-button-renderer-like-button-unclicked"
@@ -19,12 +20,23 @@ const channelNameSelector = ".yt-uix-sessionlink"

var playerConfigErr = errors.New("failed to parse player config")

type parseInfo struct {
// ParseVideo reads the HTML watch page carried by res and stores the
// extracted details in v.
//
// The response body is always closed before returning. A non-200 status
// yields an "HTTP failure" error; errors from building the document tree
// or from parsing it are returned unchanged.
func ParseVideo(v *data.Video, res *http.Response) (err error) {
	// Register the close first so the body is released on every path,
	// including the early return for a bad status code.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return errors.New("HTTP failure")
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return err
	}

	p := parseVideoInfo{v, doc}
	return p.parse()
}

// parseVideoInfo bundles the state of one video page parse:
// the record being filled in and the document it is filled from.
type parseVideoInfo struct {
// v is the video record the parse methods write to.
v *data.Video
// doc is the document tree built from the HTTP response body.
doc *goquery.Document
}

func (p *parseInfo) parse() error {
func (p *parseVideoInfo) parse() error {
if err := p.parseLikeDislike();
err != nil { return err }
if err := p.parseViewCount();
@@ -40,7 +52,7 @@ func (p *parseInfo) parse() error {
return nil
}

func (p *parseInfo) parseLikeDislike() error {
func (p *parseVideoInfo) parseLikeDislike() error {
likeText := p.doc.Find(likeBtnSelector).First().Text()
dislikeText := p.doc.Find(dislikeBtnSelector).First().Text()

@@ -57,7 +69,7 @@ func (p *parseInfo) parseLikeDislike() error {
return nil
}

func (p *parseInfo) parseViewCount() error {
func (p *parseVideoInfo) parseViewCount() error {
viewCountText := p.doc.Find(viewCountSelector).First().Text()
viewCount, err := extractNumber(viewCountText)
if err != nil { return err }
@@ -65,7 +77,7 @@ func (p *parseInfo) parseViewCount() error {
return nil
}

func (p *parseInfo) parseUploader() error {
func (p *parseVideoInfo) parseUploader() error {
userInfo := p.doc.Find(userInfoSelector)
userLinkNode := userInfo.Find(".yt-uix-sessionlink")

@@ -81,30 +93,12 @@ func (p *parseInfo) parseUploader() error {
return nil
}

func (p *parseInfo) parseMetas() error {
metas := p.doc.Find("meta")
// For each <meta>
for _, node := range metas.Nodes {
// Attributes
var content string
var itemprop string
var prop string

// Parse attributes
for _, attr := range node.Attr {
switch attr.Key {
case "property": prop = attr.Val
case "itemprop": itemprop = attr.Val
case "content": content = attr.Val
}
}

// Content not set
if len(content) == 0 { continue }

// <meta property …
if len(prop) != 0 {
switch prop {
func (p *parseVideoInfo) parseMetas() (err error) {
enumMetas(p.doc.Selection, func(tag metaTag)bool {
content := tag.content
switch tag.typ {
case metaProperty:
switch tag.name {
case "og:title":
p.v.Title = content
case "og:video:tag":
@@ -114,11 +108,8 @@ func (p *parseInfo) parseMetas() error {
case "og:image":
p.v.Thumbnail = content
}
continue
}
// <meta itemprop …
if len(itemprop) != 0 {
switch itemprop {
case metaItemProp:
switch tag.name {
case "datePublished":
if val, err := time.Parse("2006-01-02", content);
err == nil { p.v.UploadDate = val }
@@ -130,19 +121,20 @@ func (p *parseInfo) parseMetas() error {
if val, err := parseDuration(content); err == nil {
p.v.Duration = val
} else {
return err
return false
}
case "isFamilyFriendly":
if val, err := strconv.ParseBool(content);
err == nil { p.v.FamilyFriendly = val }
}
continue
}
}
return nil
return true
})

return err
}

func (p *parseInfo) parsePlayerConfig() error {
func (p *parseVideoInfo) parsePlayerConfig() error {
var json string

p.doc.Find("script").EachWithBreak(func(_ int, s *goquery.Selection) bool {

+ 0
- 1
apijson/grab.go View File

@@ -8,7 +8,6 @@ const videoURL = "https://www.youtube.com/watch?pbj=1&v="
const channelURL = "https://www.youtube.com/browse_ajax?ctoken="

func GrabVideo(videoID string) *http.Request {
// Prepare request
req, err := http.NewRequest("GET", videoURL + videoID, nil)
if err != nil { panic(err) }
setHeaders(&req.Header)

+ 1
- 0
cmd/channel.go View File

@@ -14,4 +14,5 @@ var Channel = cobra.Command{
// init wires up the channel subcommands: it registers the --force/-f flag
// on the dump command and attaches the dump and detail commands to Channel.
func init() {
channelDumpCmd.Flags().BoolVarP(&force, "force", "f", false, "Overwrite the output file if it already exists")
Channel.AddCommand(&channelDumpCmd)
Channel.AddCommand(&channelDetailCmd)
}

+ 43
- 0
cmd/channeldetail.go View File

@@ -0,0 +1,43 @@
package cmd

import (
"github.com/spf13/cobra"
"github.com/terorie/yt-mango/api"
"os"
"log"
"github.com/terorie/yt-mango/net"
"github.com/terorie/yt-mango/data"
"fmt"
"encoding/json"
)

// channelDetailCmd is the "detail" subcommand. It requires exactly one
// argument (a channel ID or channel link) and runs doChannelDetail.
var channelDetailCmd = cobra.Command{
Use: "detail <channel ID>",
Short: "Get detail about a channel",
Args: cobra.ExactArgs(1),
Run: doChannelDetail,
}

// doChannelDetail implements the "detail" channel subcommand: it resolves
// args[0] to a channel ID, downloads the channel page through the selected
// API, parses it and prints the result to stdout as tab-indented JSON.
//
// Any failure terminates the process with a non-zero exit status.
func doChannelDetail(_ *cobra.Command, args []string) {
	// GetChannelID logs the reason itself and signals failure with "".
	channelID := api.GetChannelID(args[0])
	if channelID == "" {
		os.Exit(1)
	}

	// Not every API fills in the channel functions; calling a nil function
	// value would panic with an unhelpful trace.
	if api.Main.GrabChannel == nil || api.Main.ParseChannel == nil {
		log.Fatal("Channel details are not supported by the selected API")
	}

	channelReq := api.Main.GrabChannel(channelID)

	res, err := net.Client.Do(channelReq)
	if err != nil {
		// log.Fatal already exits with status 1.
		log.Fatal(err)
	}

	// Seed the ID so the printed record identifies the channel even though
	// the page parser only fills in the scraped fields.
	c := data.Channel{ID: channelID}
	if err := api.Main.ParseChannel(&c, res); err != nil {
		// Don't print a half-empty record as if the parse had succeeded.
		log.Fatal(err)
	}

	bytes, err := json.MarshalIndent(&c, "", "\t")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(bytes))
}

+ 2
- 5
cmd/channeldump.go View File

@@ -56,11 +56,8 @@ func doChannelDump(_ *cobra.Command, args []string) {
}
channelDumpContext.printResults = printResults

channelID, err := api.GetChannelID(channelID)
if err != nil {
log.Print(err)
os.Exit(1)
}
channelID = api.GetChannelID(channelID)
if channelID == "" { os.Exit(1) }

log.Printf("Starting work on channel ID \"%s\".", channelID)
channelDumpContext.startTime = time.Now()

+ 31
- 4
cmd/videodetail.go View File

@@ -1,14 +1,41 @@
package cmd

import "github.com/spf13/cobra"
import (
"github.com/spf13/cobra"
"github.com/terorie/yt-mango/api"
"os"
"github.com/terorie/yt-mango/net"
"github.com/terorie/yt-mango/data"
"log"
"fmt"
"encoding/json"
)

var videoDetailCmd = cobra.Command{
Use: "detail <video ID> [file]",
Short: "Get details about a video",
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
videoID := args[0]

},
}
videoID = api.GetVideoID(videoID)
if videoID == "" {
os.Exit(1)
}

videoReq := api.Main.GrabVideo(videoID)

func init() {
res, err := net.Client.Do(videoReq)
if err != nil {
log.Fatal(err)
os.Exit(1)
}

var v data.Video
api.Main.ParseVideo(&v, res)

bytes, err := json.MarshalIndent(&v, "", "\t")
if err != nil { panic(err) }
fmt.Println(string(bytes))
},
}

+ 2
- 0
data/channel.go View File

@@ -3,4 +3,6 @@ package data
// Channel is the record describing one YouTube channel.
// The struct tags give the field names used in JSON output.
type Channel struct {
// ID is the channel ID.
ID string `json:"id"`
// Name is the channel's display name.
Name string `json:"name"`
// Paid holds the boolean from the channel page's "paid" meta tag.
Paid bool `json:"paid"`
// Thumbnail presumably holds the channel image URL — TODO confirm.
Thumbnail string `json:"thumbnail"`
}

+ 2
- 2
main.go View File

@@ -39,8 +39,8 @@ func main() {

switch forceAPI {
case "": api.Main = &api.TempAPI
//case "classic": api.Main = &api.ClassicAPI
//case "json": api.Main = &api.JsonAPI
case "classic": api.Main = &api.ClassicAPI
case "json": api.Main = &api.JsonAPI
default:
fmt.Fprintln(os.Stderr, "Invalid API specified.\n" +
"Valid options are: \"classic\" and \"json\"")

api/escape.go → util/escape.go View File

@@ -1,4 +1,4 @@
package api
package util

import "bytes"


api/markdown.go → util/markdown.go View File

@@ -1,4 +1,4 @@
package api
package util

var MarkdownTextEscape EscapeMap
var MarkdownLinkEscape EscapeMap

+ 2
- 0
version/get.go View File

@@ -1,5 +1,7 @@
package version

// TODO Refactor: Dedicating a single package is too much

// Get reports the version string of this build.
func Get() string {
	const current = "v0.1 -- dev"
	return current
}

Loading…
Cancel
Save