├── .gitignore ├── README.md ├── go.mod ├── go.sum ├── headless_browser.go ├── heuristics.go ├── instance_info.go ├── main.go └── url-extract.code-workspace /.gitignore: -------------------------------------------------------------------------------- 1 | url-extract 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # url-extract 2 | Extract URLs from websites using a headless browser 3 | 4 | ## Getting started 5 | 6 | This project uses a headless chromium instance for navigating websites. 7 | 8 | ```bash 9 | docker run -it --name headless-chromium --rm -p 127.0.0.1:9222:9222 --entrypoint "chromium-browser" zenika/alpine-chrome --headless --disable-gpu --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=9222 --enable-logging --autoplay-policy=no-user-gesture-required --disable-software-rasterizer --disable-dev-shm-usage --disable-sync --disable-background-networking --no-first-run --no-pings --metrics-recording-only --safebrowsing-disable-auto-update --mute-audio 10 | 11 | ``` 12 | 13 | Some of the switches might be deprecated or entirely useless. Sources are 14 | - https://github.com/GoogleChrome/puppeteer/issues/940#issuecomment-336423912 15 | - https://peter.sh/experiments/chromium-command-line-switches/ 16 | - https://github.com/obsproject/obs-browser/issues/105 17 | 18 | 19 | ## Running 20 | 21 | Once the docker container is running, and navigating to `127.0.0.1:9222` works in your local browser, we can try navigating to a website using the headless instance. By default we are trying to extract media files from our target (see `main.go`). 22 | 23 | ```bash 24 | go build && ./url-extract -quiet -heuristics -url https://castr.io/hlsplayer 25 | ``` 26 | 27 | The result should look something like `https://cstr-x.castr.io/castr/live_x/index.m3u8`. 28 | 29 | ### Detection 30 | 31 | The request headers when accessing a website (at the time of writing) are 32 | 33 | ``` 34 | Connection: keep-alive 35 | Upgrade-Insecure-Requests: 1 36 | User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/73.0.3683.103 Safari/537.36 37 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8 38 | Accept-Encoding: gzip, deflate, br 39 | ``` 40 | 41 | ## Limitations 42 | 43 | Generally, only URLs that are automatically loaded via the network are found. URLs that are only loaded after user interaction, e.g. on-click will not be found. The autoplay-policy `no-user-gesture-required` allows websites to play some media files without user-interaction. 44 | 45 | ### Heuristics 46 | 47 | We try to click elements with ids and classes that "look like" media players to start playing media that will not auto play. Check `heuristics.go` for more information. This is always open to improvement or adjustment. Note that using such heuristics can lead to false-negatives by interacting with the website in a way that stops loading a resource that should normally be found. 48 | 49 | ### Todo 50 | 51 | Heuristics are not able to access sites that embed e.g. media players inside iframes. Properly accessing iframes seems to be an open issue in the `chromedp` project. -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/MemeLabs/url-extract 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/chromedp/cdproto v0.0.0-20190511222037-764a6f24cacb 7 | github.com/chromedp/chromedp v0.3.0 8 | github.com/gobwas/ws v1.0.1 // indirect 9 | golang.org/x/sys v0.0.0-20190516110030-61b9204099cb // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI= 2 | github.com/chromedp/cdproto v0.0.0-20190511222037-764a6f24cacb h1:qz5T1ydpxa60px6tPNy45H4M+ZBOuNgnP9zPij8vaOc= 3 | github.com/chromedp/cdproto v0.0.0-20190511222037-764a6f24cacb/go.mod h1:5NWqr1Ri5aJB5uSvUXfVpbBslleS+eMjspUWv2Lcaow= 4 | github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA= 5 | github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI= 6 | github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= 7 | github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= 8 | github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8= 9 | github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= 10 | github.com/gobwas/ws v1.0.0 h1:1WdyfgUcImUfVBvYbsW2krIsnko+1QU2t45soaF8v1M= 11 | github.com/gobwas/ws v1.0.0/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= 12 | github.com/gobwas/ws v1.0.1 h1:iYpM3WoNpsexO6bqCN1MnvVRylnKg6278zivIZDRXUM= 13 | github.com/gobwas/ws v1.0.1/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= 14 | github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls= 15 | github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ= 16 | github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= 17 | github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg= 18 | github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= 19 | golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862 h1:rM0ROo5vb9AdYJi1110yjWGMej9ITfKddS89P3Fkhug= 20 | golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 21 | golang.org/x/sys v0.0.0-20190516110030-61b9204099cb h1:k07iPOt0d6nEnwXF+kHB+iEg+WSuKe/SOQuFM2QoD+E= 22 | golang.org/x/sys v0.0.0-20190516110030-61b9204099cb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 23 | -------------------------------------------------------------------------------- /headless_browser.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "log" 8 | "net/url" 9 | "strings" 10 | "time" 11 | 12 | "github.com/chromedp/cdproto/network" 13 | "github.com/chromedp/chromedp" 14 | ) 15 | 16 | type HeadlessBrowser struct { 17 | Info *InstanceInfo 18 | stopChan chan bool 19 | UseHeuristics bool 20 | Quiet bool 21 | } 22 | 23 | // NewHeadlessBrowser connects to a headless browser at remote. 24 | // If quiet is true, debug output is suppressed. 25 | func NewHeadlessBrowser(remote string, useHeuristics bool, quiet bool) (*HeadlessBrowser, error) { 26 | ii, err := GetInstanceInfo(remote) 27 | if err != nil { 28 | return nil, fmt.Errorf("Unable to connect to instance: %q", err) 29 | } 30 | 31 | log.Printf("Found instace %q with User-Agent %q. Using debuggerURL %q.", 32 | ii.Browser, 33 | ii.UserAgent, 34 | ii.WebSocketDebuggerURL, 35 | ) 36 | 37 | return &HeadlessBrowser{ 38 | Info: ii, 39 | stopChan: make(chan bool, 1), 40 | UseHeuristics: useHeuristics, 41 | Quiet: quiet, 42 | }, nil 43 | } 44 | 45 | // ExtractURL visits the given targetURL until it finds a new url that is accepted by matcherFunc or timeout expires. 46 | func (hb *HeadlessBrowser) ExtractURL(targetURL string, timeout time.Duration, resultChan chan *network.Request, matcherFunc func(url *url.URL) bool) error { 47 | log.Printf("extracting from %q", targetURL) 48 | 49 | timeoutTicker := time.NewTicker(timeout) 50 | 51 | // source: https://github.com/chromedp/chromedp/blob/master/allocate_test.go 52 | allocCtx, allocCancel := chromedp.NewRemoteAllocator(context.Background(), hb.Info.WebSocketDebuggerURL) 53 | defer allocCancel() 54 | ctx, cancel := chromedp.NewContext(allocCtx) 55 | defer cancel() 56 | 57 | chromedp.ListenTarget(ctx, func(ev interface{}) { 58 | switch ev := ev.(type) { 59 | 60 | case *network.EventWebSocketCreated: 61 | if hb.Quiet { 62 | break 63 | } 64 | log.Printf("WEBSOCKET: %q", ev.URL) 65 | 66 | case *network.EventLoadingFailed: 67 | if hb.Quiet { 68 | break 69 | } 70 | log.Printf("FAILED: %q", ev.ErrorText) 71 | 72 | case *network.EventRequestWillBeSent: 73 | if !hb.Quiet { 74 | log.Printf("REQUEST: %q", ev.Request.URL) 75 | } 76 | 77 | url, err := url.Parse(ev.Request.URL) 78 | if err != nil { 79 | log.Printf("request %q error: %q", ev.Request.URL, err) 80 | } 81 | if ev.Request.URL != targetURL && matcherFunc(url) { 82 | // Navigation stalls if channel is blocked... 83 | go func() { resultChan <- ev.Request }() 84 | } 85 | } 86 | }) 87 | 88 | if err := chromedp.Run(ctx, 89 | network.Enable(), // enable network events 90 | chromedp.Navigate(targetURL), // navigate to url 91 | ); err != nil { 92 | return err 93 | } 94 | 95 | log.Println("waiting for page to finish loading...") 96 | err := waitToFinishLoading(ctx, timeoutTicker) 97 | if err != nil { 98 | return err 99 | } 100 | 101 | if hb.UseHeuristics { 102 | clickAll(ctx) 103 | } 104 | 105 | log.Printf("waiting to find matching urls...") 106 | 107 | select { 108 | case <-timeoutTicker.C: 109 | chromedp.Run(ctx, 110 | chromedp.Stop(), 111 | ) 112 | return errors.New("timeout") 113 | case <-hb.stopChan: 114 | chromedp.Run(ctx, 115 | chromedp.Stop(), 116 | ) 117 | log.Println("stopped!") 118 | return nil 119 | } 120 | } 121 | 122 | // Stop trys to abort ExtractURL and shuts down the headless browser instance. 123 | func (hb *HeadlessBrowser) Stop() { 124 | hb.stopChan <- true 125 | } 126 | 127 | // waitToFinishLoading waits for site to finish loading (since clicking buttons mights not work correctly otherwise) 128 | // source: https://github.com/chromedp/chromedp/issues/252 129 | // Only returns with an error on timeout. 130 | func waitToFinishLoading(ctx context.Context, timeoutTicker *time.Ticker) error { 131 | state := "notloaded" 132 | script := `document.readyState` 133 | checkTicker := time.NewTicker(time.Millisecond * 100) 134 | for { 135 | select { 136 | case <-checkTicker.C: 137 | err := chromedp.Run(ctx, chromedp.EvaluateAsDevTools(script, &state)) 138 | if err != nil { 139 | log.Printf("error in eval: %q", err) 140 | } 141 | if strings.Compare(state, "complete") == 0 { 142 | return nil 143 | } 144 | case <-timeoutTicker.C: 145 | return errors.New("timeout while waiting to finish loading") 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /heuristics.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "log" 6 | 7 | "github.com/chromedp/cdproto/cdp" 8 | "github.com/chromedp/chromedp" 9 | ) 10 | 11 | var ( 12 | clickSelectors = []string{ 13 | "[class*='play']", 14 | "[id*='play']", 15 | "[class*='btn-danger']", 16 | } 17 | ) 18 | 19 | // Click everything that "looks like" a play button to catch media that does not auto-play. 20 | // Clicking seems to block until any element is found by the given selector. 21 | // Since we cannot guarantee this to happen, use routines... 22 | func clickAll(ctx context.Context) { 23 | for _, sel := range clickSelectors { 24 | go func(sel string) { 25 | clickAllNodes(ctx, sel) 26 | log.Printf("done clicking by %q", sel) 27 | }(sel) 28 | } 29 | } 30 | 31 | // clickAllNodes clicks all nodes that match the given selector. 32 | // errors are ignored and logged. 33 | func clickAllNodes(ctx context.Context, selector string) { 34 | 35 | var nodes []*cdp.Node 36 | if err := chromedp.Run(ctx, chromedp.Nodes(selector, &nodes)); err != nil { 37 | log.Printf("error getting nodes: %q", err) 38 | return 39 | } 40 | log.Printf("heuristics: found %d nodes for selector %q", len(nodes), selector) 41 | for _, n := range nodes { 42 | if err := chromedp.Run(ctx, chromedp.MouseClickNode(n)); err != nil { 43 | log.Printf("error clicking node: %q", err) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /instance_info.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io/ioutil" 7 | "net/http" 8 | "time" 9 | ) 10 | 11 | // InstanceInfo is the information that is provided by the debugger instance. 12 | // By default, the information is exposed on http://localhost:9222/json/version 13 | // reference: https://chromium.googlesource.com/external/github.com/mafredri/cdp/+/a974e2fd933e19fc0bbde4ea092df45158e782bf 14 | type InstanceInfo struct { 15 | Browser string `json:"Browser"` 16 | ProtocolVersion string `json:"Protocol-Version"` 17 | UserAgent string `json:"User-Agent"` 18 | V8Version string `json:"V8-Version"` 19 | WebKitVersion string `json:"WebKit-Version"` 20 | WebSocketDebuggerURL string `json:"webSocketDebuggerUrl"` 21 | } 22 | 23 | // GetInstanceInfo fetches information about an instance running at the given endpoint. 24 | // E.g. "localhost:9222". 25 | func GetInstanceInfo(endpoint string) (*InstanceInfo, error) { 26 | 27 | client := &http.Client{ 28 | Timeout: time.Second * 3, 29 | } 30 | 31 | resp, err := client.Get(fmt.Sprintf("http://%s/json/version", endpoint)) 32 | if err != nil { 33 | return nil, err 34 | } 35 | defer resp.Body.Close() 36 | 37 | contents, err := ioutil.ReadAll(resp.Body) 38 | if err != nil { 39 | return nil, err 40 | } 41 | 42 | var ii InstanceInfo 43 | err = json.Unmarshal(contents, &ii) 44 | if err != nil { 45 | return nil, err 46 | } 47 | return &ii, nil 48 | } 49 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "net/url" 7 | "strings" 8 | "time" 9 | 10 | "github.com/chromedp/cdproto/network" 11 | ) 12 | 13 | var ( 14 | target string 15 | headlessURL string 16 | heuristics bool 17 | timeout int 18 | quiet bool 19 | 20 | mediaMatcher = func(url *url.URL) bool { 21 | return strings.HasSuffix(url.Path, ".m3u8") || 22 | strings.HasSuffix(url.Path, ".mp4") || 23 | strings.HasSuffix(url.Path, ".mp3") 24 | } 25 | ) 26 | 27 | func init() { 28 | flag.StringVar(&target, "url", "", "the URL to analyze") 29 | flag.StringVar(&headlessURL, "remote", "localhost:9222", "the endpoint of the headless instance") 30 | flag.BoolVar(&heuristics, "heuristics", false, "use heuristics to find media elements") 31 | flag.IntVar(&timeout, "timeout", 5, "time in seconds to wait for the site to load and a result to be detected") 32 | flag.BoolVar(&quiet, "quiet", false, "discard debug output") 33 | flag.Parse() 34 | } 35 | 36 | func main() { 37 | 38 | if target == "" { 39 | log.Fatalf("Please provide a URL with '-url'.") 40 | } 41 | 42 | hb, err := NewHeadlessBrowser(headlessURL, heuristics, quiet) 43 | if err != nil { 44 | log.Fatal(err) 45 | } 46 | 47 | resultChan := make(chan *network.Request, 100) 48 | go func() { 49 | err := hb.ExtractURL(target, time.Second*time.Duration(timeout), resultChan, mediaMatcher) 50 | if err != nil { 51 | log.Fatalf("FATAL: %q", err) 52 | } 53 | }() 54 | 55 | i := 0 56 | maxResults := 1 57 | for i < maxResults { 58 | result := <-resultChan 59 | log.Printf("RESULT: %q", result.URL) 60 | i++ 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /url-extract.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 5 | } 6 | ], 7 | "settings": {} 8 | } --------------------------------------------------------------------------------