├── LICENSE
├── README.md
├── extractor
└── extractor.go
├── go.mod
├── go.sum
├── main.go
└── runner
├── objects.go
└── runner.go
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 003random
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
JavaScript Extraction CLI & Package
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | This is a powerful tool for extracting JavaScript sources from URLs and web pages / HTTP responses. It offers a command-line interface (CLI) for straightforward URL processing and a package interface for custom integrations, making it ideal for pentesters, bug bounty hunters, and developers needing to extract JS sources efficiently.
17 | ## Table of Contents
18 |
19 | - [Installation](#installation)
20 | - [CLI Usage](#cli-usage)
21 | - [Options](#options)
22 | - [Examples](#examples)
23 | - [Package Usage](#package-usage)
24 | - [Importing the Extractor](#importing-the-extractor)
25 | - [Example](#example)
26 | - [Version Information](#version-information)
27 | - [Contributing](#contributing)
28 | - [License](#license)
29 |
30 | ## Installation
31 |
32 | To install `getJS`, use the following command:
33 |
34 | `go install github.com/003random/getJS/v2@latest`
35 |
36 | ## CLI Usage
37 |
38 | ### Options
39 |
40 | `getJS` provides several command-line options to customize its behavior:
41 |
42 | - `-url string`: The URL from which JavaScript sources should be extracted.
43 | - `-input string`: Optional input files containing URLs. Each URL should be on a new line in plain text format. Can be used multiple times.
44 | - `-output string`: Optional output file where results are written to. Can be used multiple times.
45 | - `-complete`: Complete/Autofill relative URLs by adding the current origin.
46 | - `-resolve`: Resolve the JavaScript files. Can only be used in combination with `--complete`.
47 | - `-threads int`: The number of processing threads to spawn (default: 2).
48 | - `-verbose`: Print verbose runtime information and errors.
49 | - `-method string`: The request method used to fetch remote contents (default: "GET").
50 | - `-header string`: Optional request headers to add to the requests. Can be used multiple times.
51 | - `-timeout duration`: The request timeout while fetching remote contents (default: 5s).
52 |
53 | ### Examples
54 |
55 | #### Extracting JavaScript from a Single URL
56 |
57 | `getJS -url https://destroy.ai`
58 |
59 | or
60 |
61 | `curl https://destroy.ai | getJS`
62 |
63 | #### Using Custom Request Options
64 |
65 | `getJS -url "http://example.com" -header "User-Agent: foo bar" -method POST --timeout=15s`
66 |
67 | #### Processing Multiple URLs from a File
68 |
69 | `getJS -input foo.txt -input bar.txt`
70 |
71 | #### Saving Results to an Output File
72 |
73 | `getJS -url "http://example.com" -output results.txt`
74 |
75 | ## Package Usage
76 |
77 | ### Importing the Extractor
78 |
79 | To use `getJS` as a package, you need to import the `extractor` package and utilize its functions directly.
80 |
81 | ### Example
82 |
83 | ```Go
84 | package main
85 |
86 | import (
87 | "fmt"
88 | "log"
89 | "net/http"
90 | "net/url"
91 |
92 | "github.com/003random/getJS/v2/extractor"
93 | )
94 |
95 | func main() {
96 | baseURL, err := url.Parse("https://google.com")
97 | if (err != nil) {
98 | log.Fatalf("Error parsing base URL: %v", err)
99 | }
100 |
101 | resp, err := extractor.FetchResponse(baseURL.String(), "GET", http.Header{})
102 | if (err != nil) {
103 | log.Fatalf("Error fetching response: %v", err)
104 | }
105 | defer resp.Body.Close()
106 |
107 | // Custom extraction points (optional).
108 | extractionPoints := map[string][]string{
109 | "script": {"src", "data-src"},
110 | "a": {"href"},
111 | }
112 |
113 | sources, err := extractor.ExtractSources(resp.Body, extractionPoints)
114 | if (err != nil) {
115 | log.Fatalf("Error extracting sources: %v", err)
116 | }
117 |
118 | // Filtering and extending extracted sources.
119 | filtered, err := extractor.Filter(sources, extractor.WithComplete(baseURL), extractor.WithResolve())
120 | if (err != nil) {
121 | log.Fatalf("Error filtering sources: %v", err)
122 | }
123 |
124 | for source := range filtered {
125 | fmt.Println(source.String())
126 | }
127 | }
128 | ```
129 |
130 | ## Version Information
131 |
132 | This is the v2 version of `getJS`. The original version can be found under the tag [v1](https://github.com/003random/getJS/tree/v1).
133 |
134 | ## Contributing
135 |
136 | Contributions are welcome! Please open an issue or submit a pull request for any bugs, feature requests, or improvements.
137 |
138 | ## License
139 |
140 | This project is licensed under the MIT License. See the [LICENSE](https://github.com/003random/getJS/blob/master/LICENSE) file for details.
141 |
--------------------------------------------------------------------------------
/extractor/extractor.go:
--------------------------------------------------------------------------------
1 | package extractor
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "net/http"
8 | "net/url"
9 |
10 | "github.com/PuerkitoBio/goquery"
11 | )
12 |
// ExtractionPoints maps an HTML tag name to the attributes of that tag that are
// inspected for JavaScript sources. ExtractSources uses this map whenever the
// caller does not pass its own extraction points.
var ExtractionPoints = map[string][]string{
	"script": []string{"src", "data-src"},
}
17 |
// FetchResponse builds a request for the URL u with the given method and
// performs it through http.DefaultClient.
//
// The headers value is attached to the request as-is (it is not copied).
// An error from building the request is returned before anything is sent.
// On success the caller owns the response and must close its body.
func FetchResponse(u string, method string, headers http.Header) (*http.Response, error) {
	request, buildErr := http.NewRequest(method, u, nil)
	if buildErr != nil {
		return nil, buildErr
	}

	request.Header = headers

	client := http.DefaultClient
	return client.Do(request)
}
29 |
30 | // ExtractSources extracts all JavaScript sources found in the provided HTTP response reader.
31 | // The optional extractionPoints can be used to overwrite the default extraction points map
32 | // with a set of HTML tag names, together with a list of what attributes to extract from.
33 | func ExtractSources(input io.Reader, extractionPoints ...map[string][]string) (<-chan url.URL, error) {
34 | doc, err := goquery.NewDocumentFromReader(input)
35 | if err != nil {
36 | return nil, err
37 | }
38 |
39 | var (
40 | urls = make(chan url.URL)
41 | points = ExtractionPoints
42 | )
43 |
44 | if len(extractionPoints) > 0 {
45 | points = extractionPoints[0]
46 | }
47 |
48 | go func() {
49 | defer close(urls)
50 | for tag, attributes := range points {
51 | doc.Find(tag).Each(func(i int, s *goquery.Selection) {
52 | for _, a := range attributes {
53 | if value, exists := s.Attr(a); exists {
54 | u, err := url.Parse(value)
55 | if err != nil {
56 | log.Println(fmt.Errorf("invalid attribute value %s cannot be parsed to a URL: %w", value, err))
57 | continue
58 | }
59 |
60 | urls <- *u
61 | }
62 | }
63 | })
64 | }
65 | }()
66 |
67 | return urls, nil
68 | }
69 |
// Filter collects every URL from input, runs the collected slice through each
// option in the order given, and streams the resulting URLs.
//
// Because options operate on the complete slice, nothing is sent on the
// returned channel until input has been closed. The returned channel is
// unbuffered and is closed after the last URL has been sent. The returned
// error is always nil.
func Filter(input <-chan url.URL, options ...func([]url.URL) []url.URL) (<-chan url.URL, error) {
	results := make(chan url.URL)

	go func() {
		defer close(results)

		// Drain the input completely before applying any option.
		var collected []url.URL
		for {
			item, open := <-input
			if !open {
				break
			}
			collected = append(collected, item)
		}

		for i := range options {
			collected = options[i](collected)
		}

		for i := range collected {
			results <- collected[i]
		}
	}()

	return results, nil
}
90 |
91 | // WithComplete is an option to complete relative URLs.
92 | func WithComplete(base *url.URL) func([]url.URL) []url.URL {
93 | return func(urls []url.URL) []url.URL {
94 | var result []url.URL
95 | for _, u := range urls {
96 | result = append(result, complete(u, base))
97 | }
98 | return result
99 | }
100 | }
101 |
102 | // WithResolve is an option to filter URLs that resolve successfully.
103 | func WithResolve() func([]url.URL) []url.URL {
104 | return func(urls []url.URL) []url.URL {
105 | var result []url.URL
106 | for _, u := range urls {
107 | if resolve(u) {
108 | result = append(result, u)
109 | }
110 | }
111 | return result
112 | }
113 | }
114 |
// complete returns source unchanged when it is absolute (source.IsAbs()),
// and otherwise the result of resolving it as a reference against base.
func complete(source url.URL, base *url.URL) url.URL {
	if !source.IsAbs() {
		resolved := base.ResolveReference(&source)
		return *resolved
	}

	return source
}
122 |
// resolve issues an http.Get for source and reports whether the request
// succeeded, the whole body could be read, and the status code is in the
// 2xx range. The body is drained into io.Discard and closed before returning.
func resolve(source url.URL) bool {
	response, err := http.Get(source.String())
	if err != nil {
		return false
	}
	defer response.Body.Close()

	if _, err := io.Copy(io.Discard, response.Body); err != nil {
		return false
	}

	code := response.StatusCode
	return code >= http.StatusOK && code < http.StatusMultipleChoices
}
134 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/003random/getJS/v2
2 |
3 | go 1.22
4 |
5 | require github.com/PuerkitoBio/goquery v1.8.1
6 |
7 | require (
8 | github.com/andybalholm/cascadia v1.3.1 // indirect
9 | golang.org/x/net v0.7.0 // indirect
10 | )
11 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
2 | github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
3 | github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
4 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
5 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
6 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
7 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
8 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
9 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
10 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
11 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
12 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
13 | golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
14 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
15 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
16 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
17 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
18 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
19 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
20 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
21 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
22 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
23 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
24 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
25 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
26 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
27 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
28 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
29 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
30 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
31 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
32 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
33 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
34 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
35 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
36 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "flag"
6 | "fmt"
7 | "io"
8 | "log"
9 | "net/http"
10 | "os"
11 | "strings"
12 | "time"
13 |
14 | "github.com/003random/getJS/v2/runner"
15 | )
16 |
17 | func main() {
18 | options, err := setup()
19 | if err != nil {
20 | log.Fatal(fmt.Errorf("parsing flags: %w", err))
21 | }
22 |
23 | if err := runner.New(options).Run(); err != nil {
24 | log.Fatal(err)
25 | }
26 | }
27 |
28 | func setup() (options *runner.Options, err error) {
29 | options = &runner.Options{}
30 |
31 | flag.StringVar(&options.Request.Method, "method", "GET", "The request method that should be used to make fetch the remote contents.")
32 | flag.DurationVar(&options.Request.Timeout, "timeout", 5*time.Second, "The request timeout used while fetching the remote contents.")
33 | flag.BoolVar(&options.Complete, "complete", false, "Complete/Autofil relative URLs by adding the current origin.")
34 | flag.BoolVar(&options.Resolve, "resolve", false, "Resolve the JavaScript files. Can only be used in combination with '--resolve'. Unresolvable hosts are not included in the results.")
35 | flag.IntVar(&options.Threads, "threads", 2, "The amount of processing threads to spawn.")
36 | flag.BoolVar(&options.Verbose, "verbose", false, "Print verbose runtime information and errors.")
37 |
38 | var (
39 | url string
40 | input arrayFlags
41 | output arrayFlags
42 | header arrayFlags
43 | )
44 |
45 | flag.Var(&header, "header", "The optional request headers to add to the requests. This flag can be used multiple times with a new header each time.")
46 | flag.StringVar(&url, "url", "", "The URL where the JavaScript sources should be extracted from.")
47 | flag.Var(&input, "input", "The optional URLs input files. Each URL should be on a new line in plain text format. This flag can be used multiple times with different files.")
48 | flag.Var(&output, "output", "The optional output file where the results are written to.")
49 |
50 | flag.Parse()
51 |
52 | options.Request.Headers = headers(header)
53 |
54 | options.Inputs = inputs(input)
55 | options.Outputs = outputs(output)
56 |
57 | // Add an input for the single URL option, if set.
58 | if len(url) > 0 {
59 | options.Inputs = append(options.Inputs, runner.Input{
60 | Type: runner.InputURL,
61 | Data: strings.NewReader(url),
62 | })
63 | }
64 |
65 | stat, err := os.Stdin.Stat()
66 | if err != nil {
67 | log.Fatal(fmt.Errorf("error reading stdin: %v", err))
68 | }
69 |
70 | if (stat.Mode() & os.ModeCharDevice) == 0 {
71 | // Read the first line of stdin to detect its format
72 | reader := bufio.NewReader(os.Stdin)
73 | firstLine, err := reader.ReadString('\n')
74 | if err != nil && err != io.EOF {
75 | log.Fatal(fmt.Errorf("error reading first line of stdin: %v", err))
76 | }
77 |
78 | if isURL(strings.TrimSpace(firstLine)) {
79 | // Treat as URL input.
80 | options.Inputs = append(options.Inputs, runner.Input{
81 | Type: runner.InputURL,
82 | Data: io.MultiReader(strings.NewReader(firstLine), reader),
83 | })
84 | } else {
85 | // Treat as HTTP response body.
86 | options.Inputs = append(options.Inputs, runner.Input{
87 | Type: runner.InputResponse,
88 | Data: io.MultiReader(strings.NewReader(firstLine), reader),
89 | })
90 | }
91 | }
92 |
93 | return
94 | }
95 |
// isURL reports whether str begins with the literal prefix "http://" or
// "https://". The comparison is case-sensitive and no further validation of
// the string is performed.
func isURL(str string) bool {
	for _, scheme := range [...]string{"http://", "https://"} {
		if strings.HasPrefix(str, scheme) {
			return true
		}
	}

	return false
}
99 |
// outputs returns the writers that results are written to: os.Stdout first,
// followed by one file per entry in names. Each file is opened for appending
// and is created with mode 0o644 when it does not exist. A file that cannot
// be opened terminates the program through log.Fatal.
func outputs(names []string) []io.Writer {
	writers := []io.Writer{os.Stdout}

	for i := range names {
		f, err := os.OpenFile(names[i], os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
		if err != nil {
			log.Fatal(fmt.Errorf("error parsing output file flag: %v", err))
		}

		writers = append(writers, f)
	}

	return writers
}
114 |
115 | func inputs(names []string) []runner.Input {
116 | inputs := []runner.Input{}
117 |
118 | for _, n := range names {
119 | file, err := os.Open(n)
120 | if err != nil {
121 | log.Fatal(fmt.Errorf("error reading from file %s: %v", n, err))
122 | }
123 |
124 | inputs = append(inputs, runner.Input{Type: runner.InputURL, Data: file})
125 | }
126 |
127 | return inputs
128 | }
129 |
// headers converts "Name: value" command line arguments into an http.Header.
//
// Each argument is split at its first colon; everything after that colon
// (including further colons) is the value. Name and value are trimmed of
// surrounding whitespace. An argument without a colon terminates the program
// through log.Fatal. When the same name is given more than once, the last
// value wins.
//
// Names are stored through Header.Set, which canonicalises them (for example
// "user-agent" becomes "User-Agent"). net/http looks up well-known headers by
// their canonical key, so a non-canonical key would be sent in addition to
// the client's default instead of replacing it.
func headers(args []string) http.Header {
	parsed := make(http.Header, len(args))

	for _, arg := range args {
		name, value, found := strings.Cut(arg, ":")
		if !found {
			log.Fatal(fmt.Errorf("invalid header %s", arg))
		}

		parsed.Set(strings.TrimSpace(name), strings.TrimSpace(value))
	}

	return parsed
}
143 |
// arrayFlags is a flag value that accumulates every occurrence of a flag,
// so the same flag can be passed multiple times on the command line.
type arrayFlags []string

// Set appends value to the collected values. It never returns an error.
func (a *arrayFlags) Set(value string) error {
	updated := append(*a, value)
	*a = updated

	return nil
}

// String returns the collected values joined by commas.
func (a *arrayFlags) String() string {
	values := []string(*a)

	return strings.Join(values, ",")
}
154 |
--------------------------------------------------------------------------------
/runner/objects.go:
--------------------------------------------------------------------------------
1 | package runner
2 |
3 | import (
4 | "io"
5 | "net/http"
6 | "net/url"
7 | "time"
8 | )
9 |
// Input represents an input source for getJS. The input format is determined by the `Type` property.
type Input struct {
	// Type selects how Run handles Data: InputURL is passed to ProcessURLs,
	// InputResponse to ProcessResponse.
	Type InputType
	// Data is the stream to read from. After processing, Run closes it when
	// it also implements io.Closer.
	Data io.Reader
}
15 |
// InputType defines the type of input source for getJS.
type InputType int

// The supported input formats. InputURL is the zero value, so an Input
// without an explicit Type is treated as a list of URLs.
const (
	// InputURL defines the input format to line separated, plain text, URLs.
	InputURL InputType = iota
	// InputResponse defines the input format to a HTTP response body.
	InputResponse
)
25 |
// runner processes the configured inputs and collects the extracted sources.
// Instances are created with New.
type runner struct {
	// Options is the copy of the configuration taken by New.
	Options Options
	// Results carries every extracted source. Run closes it once all inputs
	// have been processed.
	Results chan url.URL
}
30 |
// Options represents the configuration options for the runner.
type Options struct {
	// Request groups the settings applied to outgoing HTTP requests.
	Request struct {
		// Method is the HTTP method passed to extractor.FetchResponse for every input URL.
		Method string
		// Headers are the request headers passed to extractor.FetchResponse for every input URL.
		Headers http.Header
		// InsecureSkipVerify is copied into the TLS client configuration that
		// New installs on http.DefaultClient.
		InsecureSkipVerify bool
		// Timeout is used by New both as the TLS handshake timeout and as the
		// overall timeout of http.DefaultClient.
		Timeout time.Duration
	}

	// Inputs are processed by Run one after another, in slice order.
	Inputs []Input
	// Outputs each receive every result as one line. Those that implement
	// io.Closer are closed after the last result has been written.
	Outputs []io.Writer

	// Complete enables the extractor.WithComplete filter when a base URL is available.
	Complete bool
	// Resolve enables the extractor.WithResolve filter.
	Resolve bool

	// Threads controls how many URLs ProcessURLs fetches concurrently.
	Threads int

	// Verbose keeps log output enabled; when false, Run redirects the log
	// package to io.Discard.
	Verbose bool
	// Colors is not read anywhere in the visible sources.
	// NOTE(review): presumably reserved for colored output — confirm or remove.
	Colors bool
}
51 |
--------------------------------------------------------------------------------
/runner/runner.go:
--------------------------------------------------------------------------------
1 | package runner
2 |
3 | import (
4 | "bufio"
5 | "crypto/tls"
6 | "errors"
7 | "fmt"
8 | "io"
9 | "log"
10 | "net/http"
11 | "net/url"
12 | "sync"
13 |
14 | "github.com/003random/getJS/v2/extractor"
15 | )
16 |
// ExtractionPoints maps an HTML tag name to the attributes of that tag from
// which JavaScript sources are extracted. It holds the same values as
// extractor.ExtractionPoints.
// NOTE(review): nothing in the visible runner code reads this variable — confirm
// whether external callers depend on it.
var ExtractionPoints = map[string][]string{
	"script": []string{"src", "data-src"},
}
21 |
22 | // New creates a new runner with the provided options.
23 | func New(options *Options) *runner {
24 | http.DefaultClient.Transport = &http.Transport{
25 | TLSHandshakeTimeout: options.Request.Timeout,
26 | TLSClientConfig: &tls.Config{
27 | InsecureSkipVerify: options.Request.InsecureSkipVerify,
28 | },
29 | }
30 | http.DefaultClient.Timeout = options.Request.Timeout
31 |
32 | return &runner{
33 | Options: *options,
34 | Results: make(chan url.URL),
35 | }
36 | }
37 |
38 | // Run starts processing the inputs and extracts JavaScript sources into the runner's Results channel.
39 | func (r *runner) Run() error {
40 | if !r.Options.Verbose {
41 | log.SetOutput(io.Discard)
42 | }
43 |
44 | go func() {
45 | for _, input := range r.Options.Inputs {
46 | switch input.Type {
47 | case InputURL:
48 | r.ProcessURLs(input.Data)
49 | case InputResponse:
50 | r.ProcessResponse(input.Data)
51 | }
52 |
53 | if input, ok := input.Data.(io.Closer); ok {
54 | input.Close()
55 | }
56 | }
57 |
58 | close(r.Results)
59 | }()
60 |
61 | r.listen()
62 |
63 | return nil
64 | }
65 |
66 | func (r *runner) listen() {
67 | for s := range r.Results {
68 | for _, output := range r.Options.Outputs {
69 | _, err := output.Write([]byte(fmt.Sprintf("%s\n", s.String())))
70 | if err != nil {
71 | log.Println(fmt.Errorf("[error] writing result %s to output: %v", s.String(), err))
72 | }
73 | }
74 | }
75 |
76 | for _, output := range r.Options.Outputs {
77 | if o, ok := output.(io.Closer); ok {
78 | o.Close()
79 | }
80 | }
81 | }
82 |
83 | // ProcessURLs will fetch the HTTP response for all URLs in the provided reader
84 | // and stream the extracted sources to the runner's Results channel.
85 | func (r *runner) ProcessURLs(data io.Reader) {
86 | var (
87 | next = Read(data)
88 | wg = sync.WaitGroup{}
89 |
90 | throttle = make(chan struct{}, r.Options.Threads)
91 | )
92 |
93 | for i := 0; i < r.Options.Threads; i++ {
94 | throttle <- struct{}{}
95 | }
96 |
97 | for {
98 | u, err := next()
99 | if errors.Is(err, io.EOF) {
100 | break
101 | }
102 | if err != nil {
103 | log.Println(fmt.Errorf("[error] parsing url %v: %w", u, err))
104 | continue
105 | }
106 |
107 | wg.Add(1)
108 | go func(u *url.URL) {
109 | defer func() {
110 | throttle <- struct{}{}
111 | wg.Done()
112 | }()
113 |
114 | resp, err := extractor.FetchResponse(u.String(), r.Options.Request.Method, r.Options.Request.Headers)
115 | if err != nil {
116 | log.Println(fmt.Errorf("[error] fetching response for url %s: %w", u.String(), err))
117 | return
118 | }
119 | defer resp.Body.Close()
120 |
121 | sources, err := extractor.ExtractSources(resp.Body)
122 | if err != nil {
123 | log.Println(fmt.Errorf("[error] extracting sources from response for url %s: %w", u.String(), err))
124 | return
125 | }
126 |
127 | filtered, err := extractor.Filter(sources, r.filters(u)...)
128 | if err != nil {
129 | log.Println(fmt.Errorf("[error] filtering sources for url %s: %w", u.String(), err))
130 | return
131 | }
132 |
133 | for source := range filtered {
134 | r.Results <- source
135 | }
136 | }(u)
137 |
138 | <-throttle
139 | }
140 |
141 | wg.Wait()
142 | }
143 |
// Read wraps a bufio.Scanner over input and returns a function that yields
// one parsed URL per call.
//
// Each call reads the next non-empty line and parses it with url.Parse; a
// parse failure is returned as that call's error and reading can continue
// with the next call. Empty lines are skipped.
//
// When the input is exhausted the function returns io.EOF, and keeps
// returning io.EOF on every later call. If the scanner stopped because of an
// error (for example a line longer than the scanner's buffer), that error is
// returned exactly once, wrapped, before the io.EOF results begin, so it is
// not mistaken for a clean end of input.
func Read(input io.Reader) func() (*url.URL, error) {
	var (
		scanner  = bufio.NewScanner(input)
		finished bool
	)

	return func() (*url.URL, error) {
		for !finished {
			if scanner.Scan() {
				line := scanner.Text()
				if line == "" {
					continue
				}

				return url.Parse(line)
			}

			// Scan returned false: never call it again, and surface the
			// reason once if it was not a plain end of input.
			finished = true
			if err := scanner.Err(); err != nil {
				return nil, fmt.Errorf("reading input: %w", err)
			}
		}

		return nil, io.EOF
	}
}
157 |
158 | func (r *runner) ProcessResponse(data io.Reader) {
159 | sources, err := extractor.ExtractSources(data)
160 | if err != nil {
161 | log.Println(fmt.Errorf("[error] extracting sources from response file: %w", err))
162 | }
163 |
164 | filtered, err := extractor.Filter(sources, r.filters(nil)...)
165 | if err != nil {
166 | log.Println(fmt.Errorf("[error] filtering sources from response file: %w", err))
167 | return
168 | }
169 |
170 | for source := range filtered {
171 | r.Results <- source
172 | }
173 | }
174 |
175 | func (r *runner) filters(base *url.URL) (options []func([]url.URL) []url.URL) {
176 | if r.Options.Complete && base != nil {
177 | options = append(options, extractor.WithComplete(base))
178 | }
179 |
180 | if r.Options.Resolve {
181 | options = append(options, extractor.WithResolve())
182 | }
183 |
184 | return
185 | }
186 |
--------------------------------------------------------------------------------