├── .github
│   ├── assets
│   │   ├── flyscrape-demo.jpg
│   │   ├── logo-alt.png
│   │   └── logo.png
│   └── workflows
│       ├── release.yaml
│       └── test.yaml
├── .gitignore
├── .goreleaser.yaml
├── LICENSE
├── README.md
├── cmd
│   ├── args.go
│   ├── args_test.go
│   ├── dev.go
│   ├── flyscrape
│   │   └── main.go
│   ├── main.go
│   ├── new.go
│   ├── run.go
│   └── version.go
├── examples
│   ├── browser.js
│   ├── coinmarketcap.js
│   ├── cookies.js
│   ├── custom_headers.js
│   ├── download.js
│   ├── hackernews.js
│   ├── hackernews_manual_follow.js
│   ├── hackernews_with_comments.js
│   ├── multiple_starting_urls.js
│   ├── reddit.js
│   ├── urls.txt
│   ├── urls_from_file.js
│   └── useragents
│       ├── chrome.js
│       ├── edge.js
│       ├── firefox.js
│       └── opera.js
├── flyscrape.go
├── go.mod
├── go.sum
├── install.sh
├── js.go
├── js_lib.go
├── js_lib_test.go
├── js_test.go
├── module.go
├── modules
│   ├── browser
│   │   ├── browser.go
│   │   └── browser_test.go
│   ├── cache
│   │   ├── boltstore.go
│   │   ├── boltstore_test.go
│   │   └── cache.go
│   ├── cookies
│   │   └── cookies.go
│   ├── depth
│   │   ├── depth.go
│   │   └── depth_test.go
│   ├── domainfilter
│   │   ├── domainfilter.go
│   │   └── domainfilter_test.go
│   ├── followlinks
│   │   ├── followlinks.go
│   │   └── followlinks_test.go
│   ├── headers
│   │   ├── headers.go
│   │   ├── headers_test.go
│   │   ├── versions.go
│   │   ├── versions_chrome.txt
│   │   ├── versions_edge.txt
│   │   ├── versions_firefox.txt
│   │   ├── versions_linux.txt
│   │   ├── versions_macos.txt
│   │   ├── versions_opera.txt
│   │   └── versions_windows.txt
│   ├── hook
│   │   └── hook.go
│   ├── output
│   │   ├── json
│   │   │   └── json.go
│   │   └── ndjson
│   │       └── ndjson.go
│   ├── proxy
│   │   ├── proxy.go
│   │   └── proxy_test.go
│   ├── ratelimit
│   │   ├── ratelimit.go
│   │   └── ratelimit_test.go
│   ├── retry
│   │   ├── retry.go
│   │   └── retry_test.go
│   ├── starturl
│   │   ├── starturl.go
│   │   └── starturl_test.go
│   └── urlfilter
│       ├── urlfilter.go
│       └── urlfilter_test.go
├── scrape.go
├── template.js
├── utils.go
├── watch.go
└── watch_test.go
/.github/assets/flyscrape-demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philippta/flyscrape/f1084f3e8370d1d0ceb3a1e82517e7b1761be42a/.github/assets/flyscrape-demo.jpg
--------------------------------------------------------------------------------
/.github/assets/logo-alt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philippta/flyscrape/f1084f3e8370d1d0ceb3a1e82517e7b1761be42a/.github/assets/logo-alt.png
--------------------------------------------------------------------------------
/.github/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philippta/flyscrape/f1084f3e8370d1d0ceb3a1e82517e7b1761be42a/.github/assets/logo.png
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - "*"
7 |
8 | permissions:
9 | contents: write
10 |
11 | jobs:
12 | release:
13 | name: Release
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: actions/checkout@v3
17 | with:
18 | fetch-depth: 0
19 |
20 | - name: Fetch Git tags
21 | run: git fetch --force --tags
22 |
23 | - name: Set up Go
24 | uses: actions/setup-go@v5.1.0
25 | with:
26 | go-version: '1.23.3'
27 |
28 | - name: Run GoReleaser
29 | uses: goreleaser/goreleaser-action@v6.1.0
30 | with:
31 | version: '~> v2'
32 | args: release --clean
33 | env:
34 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
35 |
--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | push:
5 | branches: ["master"]
6 | pull_request:
7 | branches: ["master"]
8 |
9 | jobs:
10 | test:
11 | name: Test
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v3
15 |
16 | - name: Set up Go
17 | uses: actions/setup-go@v4
18 | with:
19 | go-version: "1.21.3"
20 |
21 | - name: Install dependencies
22 | run: go get .
23 |
24 | - name: Test
25 | run: go test -v ./...
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | node_modules
3 | dist/
4 | examples/**/*.cache
5 |
--------------------------------------------------------------------------------
/.goreleaser.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | before:
4 | hooks:
5 | - go mod tidy
6 | - go test ./...
7 |
8 | builds:
9 | - id: flyscrape
10 | main: ./cmd/flyscrape
11 | env:
12 | - CGO_ENABLED=0
13 | ldflags:
14 | - -s -w
15 | - -extldflags "-static"
16 | - -X github.com/philippta/flyscrape.Version={{.Tag}}
17 | flags:
18 | - -mod=readonly
19 | tags:
20 | - osusergo
21 | - netgo
22 |
23 | archives:
24 | - format: tar.gz
25 | format_overrides:
26 | - goos: windows
27 | format: zip
28 |
29 | changelog:
30 | sort: asc
31 | filters:
32 | exclude:
33 | - "^docs:"
34 | - "^test:"
35 | - "^chore:"
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Mozilla Public License Version 2.0
2 | ==================================
3 |
4 | 1. Definitions
5 | --------------
6 |
7 | 1.1. "Contributor"
8 | means each individual or legal entity that creates, contributes to
9 | the creation of, or owns Covered Software.
10 |
11 | 1.2. "Contributor Version"
12 | means the combination of the Contributions of others (if any) used
13 | by a Contributor and that particular Contributor's Contribution.
14 |
15 | 1.3. "Contribution"
16 | means Covered Software of a particular Contributor.
17 |
18 | 1.4. "Covered Software"
19 | means Source Code Form to which the initial Contributor has attached
20 | the notice in Exhibit A, the Executable Form of such Source Code
21 | Form, and Modifications of such Source Code Form, in each case
22 | including portions thereof.
23 |
24 | 1.5. "Incompatible With Secondary Licenses"
25 | means
26 |
27 | (a) that the initial Contributor has attached the notice described
28 | in Exhibit B to the Covered Software; or
29 |
30 | (b) that the Covered Software was made available under the terms of
31 | version 1.1 or earlier of the License, but not also under the
32 | terms of a Secondary License.
33 |
34 | 1.6. "Executable Form"
35 | means any form of the work other than Source Code Form.
36 |
37 | 1.7. "Larger Work"
38 | means a work that combines Covered Software with other material, in
39 | a separate file or files, that is not Covered Software.
40 |
41 | 1.8. "License"
42 | means this document.
43 |
44 | 1.9. "Licensable"
45 | means having the right to grant, to the maximum extent possible,
46 | whether at the time of the initial grant or subsequently, any and
47 | all of the rights conveyed by this License.
48 |
49 | 1.10. "Modifications"
50 | means any of the following:
51 |
52 | (a) any file in Source Code Form that results from an addition to,
53 | deletion from, or modification of the contents of Covered
54 | Software; or
55 |
56 | (b) any new file in Source Code Form that contains any Covered
57 | Software.
58 |
59 | 1.11. "Patent Claims" of a Contributor
60 | means any patent claim(s), including without limitation, method,
61 | process, and apparatus claims, in any patent Licensable by such
62 | Contributor that would be infringed, but for the grant of the
63 | License, by the making, using, selling, offering for sale, having
64 | made, import, or transfer of either its Contributions or its
65 | Contributor Version.
66 |
67 | 1.12. "Secondary License"
68 | means either the GNU General Public License, Version 2.0, the GNU
69 | Lesser General Public License, Version 2.1, the GNU Affero General
70 | Public License, Version 3.0, or any later versions of those
71 | licenses.
72 |
73 | 1.13. "Source Code Form"
74 | means the form of the work preferred for making modifications.
75 |
76 | 1.14. "You" (or "Your")
77 | means an individual or a legal entity exercising rights under this
78 | License. For legal entities, "You" includes any entity that
79 | controls, is controlled by, or is under common control with You. For
80 | purposes of this definition, "control" means (a) the power, direct
81 | or indirect, to cause the direction or management of such entity,
82 | whether by contract or otherwise, or (b) ownership of more than
83 | fifty percent (50%) of the outstanding shares or beneficial
84 | ownership of such entity.
85 |
86 | 2. License Grants and Conditions
87 | --------------------------------
88 |
89 | 2.1. Grants
90 |
91 | Each Contributor hereby grants You a world-wide, royalty-free,
92 | non-exclusive license:
93 |
94 | (a) under intellectual property rights (other than patent or trademark)
95 | Licensable by such Contributor to use, reproduce, make available,
96 | modify, display, perform, distribute, and otherwise exploit its
97 | Contributions, either on an unmodified basis, with Modifications, or
98 | as part of a Larger Work; and
99 |
100 | (b) under Patent Claims of such Contributor to make, use, sell, offer
101 | for sale, have made, import, and otherwise transfer either its
102 | Contributions or its Contributor Version.
103 |
104 | 2.2. Effective Date
105 |
106 | The licenses granted in Section 2.1 with respect to any Contribution
107 | become effective for each Contribution on the date the Contributor first
108 | distributes such Contribution.
109 |
110 | 2.3. Limitations on Grant Scope
111 |
112 | The licenses granted in this Section 2 are the only rights granted under
113 | this License. No additional rights or licenses will be implied from the
114 | distribution or licensing of Covered Software under this License.
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a
116 | Contributor:
117 |
118 | (a) for any code that a Contributor has removed from Covered Software;
119 | or
120 |
121 | (b) for infringements caused by: (i) Your and any other third party's
122 | modifications of Covered Software, or (ii) the combination of its
123 | Contributions with other software (except as part of its Contributor
124 | Version); or
125 |
126 | (c) under Patent Claims infringed by Covered Software in the absence of
127 | its Contributions.
128 |
129 | This License does not grant any rights in the trademarks, service marks,
130 | or logos of any Contributor (except as may be necessary to comply with
131 | the notice requirements in Section 3.4).
132 |
133 | 2.4. Subsequent Licenses
134 |
135 | No Contributor makes additional grants as a result of Your choice to
136 | distribute the Covered Software under a subsequent version of this
137 | License (see Section 10.2) or under the terms of a Secondary License (if
138 | permitted under the terms of Section 3.3).
139 |
140 | 2.5. Representation
141 |
142 | Each Contributor represents that the Contributor believes its
143 | Contributions are its original creation(s) or it has sufficient rights
144 | to grant the rights to its Contributions conveyed by this License.
145 |
146 | 2.6. Fair Use
147 |
148 | This License is not intended to limit any rights You have under
149 | applicable copyright doctrines of fair use, fair dealing, or other
150 | equivalents.
151 |
152 | 2.7. Conditions
153 |
154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
155 | in Section 2.1.
156 |
157 | 3. Responsibilities
158 | -------------------
159 |
160 | 3.1. Distribution of Source Form
161 |
162 | All distribution of Covered Software in Source Code Form, including any
163 | Modifications that You create or to which You contribute, must be under
164 | the terms of this License. You must inform recipients that the Source
165 | Code Form of the Covered Software is governed by the terms of this
166 | License, and how they can obtain a copy of this License. You may not
167 | attempt to alter or restrict the recipients' rights in the Source Code
168 | Form.
169 |
170 | 3.2. Distribution of Executable Form
171 |
172 | If You distribute Covered Software in Executable Form then:
173 |
174 | (a) such Covered Software must also be made available in Source Code
175 | Form, as described in Section 3.1, and You must inform recipients of
176 | the Executable Form how they can obtain a copy of such Source Code
177 | Form by reasonable means in a timely manner, at a charge no more
178 | than the cost of distribution to the recipient; and
179 |
180 | (b) You may distribute such Executable Form under the terms of this
181 | License, or sublicense it under different terms, provided that the
182 | license for the Executable Form does not attempt to limit or alter
183 | the recipients' rights in the Source Code Form under this License.
184 |
185 | 3.3. Distribution of a Larger Work
186 |
187 | You may create and distribute a Larger Work under terms of Your choice,
188 | provided that You also comply with the requirements of this License for
189 | the Covered Software. If the Larger Work is a combination of Covered
190 | Software with a work governed by one or more Secondary Licenses, and the
191 | Covered Software is not Incompatible With Secondary Licenses, this
192 | License permits You to additionally distribute such Covered Software
193 | under the terms of such Secondary License(s), so that the recipient of
194 | the Larger Work may, at their option, further distribute the Covered
195 | Software under the terms of either this License or such Secondary
196 | License(s).
197 |
198 | 3.4. Notices
199 |
200 | You may not remove or alter the substance of any license notices
201 | (including copyright notices, patent notices, disclaimers of warranty,
202 | or limitations of liability) contained within the Source Code Form of
203 | the Covered Software, except that You may alter any license notices to
204 | the extent required to remedy known factual inaccuracies.
205 |
206 | 3.5. Application of Additional Terms
207 |
208 | You may choose to offer, and to charge a fee for, warranty, support,
209 | indemnity or liability obligations to one or more recipients of Covered
210 | Software. However, You may do so only on Your own behalf, and not on
211 | behalf of any Contributor. You must make it absolutely clear that any
212 | such warranty, support, indemnity, or liability obligation is offered by
213 | You alone, and You hereby agree to indemnify every Contributor for any
214 | liability incurred by such Contributor as a result of warranty, support,
215 | indemnity or liability terms You offer. You may include additional
216 | disclaimers of warranty and limitations of liability specific to any
217 | jurisdiction.
218 |
219 | 4. Inability to Comply Due to Statute or Regulation
220 | ---------------------------------------------------
221 |
222 | If it is impossible for You to comply with any of the terms of this
223 | License with respect to some or all of the Covered Software due to
224 | statute, judicial order, or regulation then You must: (a) comply with
225 | the terms of this License to the maximum extent possible; and (b)
226 | describe the limitations and the code they affect. Such description must
227 | be placed in a text file included with all distributions of the Covered
228 | Software under this License. Except to the extent prohibited by statute
229 | or regulation, such description must be sufficiently detailed for a
230 | recipient of ordinary skill to be able to understand it.
231 |
232 | 5. Termination
233 | --------------
234 |
235 | 5.1. The rights granted under this License will terminate automatically
236 | if You fail to comply with any of its terms. However, if You become
237 | compliant, then the rights granted under this License from a particular
238 | Contributor are reinstated (a) provisionally, unless and until such
239 | Contributor explicitly and finally terminates Your grants, and (b) on an
240 | ongoing basis, if such Contributor fails to notify You of the
241 | non-compliance by some reasonable means prior to 60 days after You have
242 | come back into compliance. Moreover, Your grants from a particular
243 | Contributor are reinstated on an ongoing basis if such Contributor
244 | notifies You of the non-compliance by some reasonable means, this is the
245 | first time You have received notice of non-compliance with this License
246 | from such Contributor, and You become compliant prior to 30 days after
247 | Your receipt of the notice.
248 |
249 | 5.2. If You initiate litigation against any entity by asserting a patent
250 | infringement claim (excluding declaratory judgment actions,
251 | counter-claims, and cross-claims) alleging that a Contributor Version
252 | directly or indirectly infringes any patent, then the rights granted to
253 | You by any and all Contributors for the Covered Software under Section
254 | 2.1 of this License shall terminate.
255 |
256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
257 | end user license agreements (excluding distributors and resellers) which
258 | have been validly granted by You or Your distributors under this License
259 | prior to termination shall survive termination.
260 |
261 | ************************************************************************
262 | * *
263 | * 6. Disclaimer of Warranty *
264 | * ------------------------- *
265 | * *
266 | * Covered Software is provided under this License on an "as is" *
267 | * basis, without warranty of any kind, either expressed, implied, or *
268 | * statutory, including, without limitation, warranties that the *
269 | * Covered Software is free of defects, merchantable, fit for a *
270 | * particular purpose or non-infringing. The entire risk as to the *
271 | * quality and performance of the Covered Software is with You. *
272 | * Should any Covered Software prove defective in any respect, You *
273 | * (not any Contributor) assume the cost of any necessary servicing, *
274 | * repair, or correction. This disclaimer of warranty constitutes an *
275 | * essential part of this License. No use of any Covered Software is *
276 | * authorized under this License except under this disclaimer. *
277 | * *
278 | ************************************************************************
279 |
280 | ************************************************************************
281 | * *
282 | * 7. Limitation of Liability *
283 | * -------------------------- *
284 | * *
285 | * Under no circumstances and under no legal theory, whether tort *
286 | * (including negligence), contract, or otherwise, shall any *
287 | * Contributor, or anyone who distributes Covered Software as *
288 | * permitted above, be liable to You for any direct, indirect, *
289 | * special, incidental, or consequential damages of any character *
290 | * including, without limitation, damages for lost profits, loss of *
291 | * goodwill, work stoppage, computer failure or malfunction, or any *
292 | * and all other commercial damages or losses, even if such party *
293 | * shall have been informed of the possibility of such damages. This *
294 | * limitation of liability shall not apply to liability for death or *
295 | * personal injury resulting from such party's negligence to the *
296 | * extent applicable law prohibits such limitation. Some *
297 | * jurisdictions do not allow the exclusion or limitation of *
298 | * incidental or consequential damages, so this exclusion and *
299 | * limitation may not apply to You. *
300 | * *
301 | ************************************************************************
302 |
303 | 8. Litigation
304 | -------------
305 |
306 | Any litigation relating to this License may be brought only in the
307 | courts of a jurisdiction where the defendant maintains its principal
308 | place of business and such litigation shall be governed by laws of that
309 | jurisdiction, without reference to its conflict-of-law provisions.
310 | Nothing in this Section shall prevent a party's ability to bring
311 | cross-claims or counter-claims.
312 |
313 | 9. Miscellaneous
314 | ----------------
315 |
316 | This License represents the complete agreement concerning the subject
317 | matter hereof. If any provision of this License is held to be
318 | unenforceable, such provision shall be reformed only to the extent
319 | necessary to make it enforceable. Any law or regulation which provides
320 | that the language of a contract shall be construed against the drafter
321 | shall not be used to construe this License against a Contributor.
322 |
323 | 10. Versions of the License
324 | ---------------------------
325 |
326 | 10.1. New Versions
327 |
328 | Mozilla Foundation is the license steward. Except as provided in Section
329 | 10.3, no one other than the license steward has the right to modify or
330 | publish new versions of this License. Each version will be given a
331 | distinguishing version number.
332 |
333 | 10.2. Effect of New Versions
334 |
335 | You may distribute the Covered Software under the terms of the version
336 | of the License under which You originally received the Covered Software,
337 | or under the terms of any subsequent version published by the license
338 | steward.
339 |
340 | 10.3. Modified Versions
341 |
342 | If you create software not governed by this License, and you want to
343 | create a new license for such software, you may create and use a
344 | modified version of this License if you rename the license and remove
345 | any references to the name of the license steward (except to note that
346 | such modified license differs from this License).
347 |
348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary
349 | Licenses
350 |
351 | If You choose to distribute Source Code Form that is Incompatible With
352 | Secondary Licenses under the terms of this version of the License, the
353 | notice described in Exhibit B of this License must be attached.
354 |
355 | Exhibit A - Source Code Form License Notice
356 | -------------------------------------------
357 |
358 | This Source Code Form is subject to the terms of the Mozilla Public
359 | License, v. 2.0. If a copy of the MPL was not distributed with this
360 | file, You can obtain one at http://mozilla.org/MPL/2.0/.
361 |
362 | If it is not possible or desirable to put the notice in a particular
363 | file, then You may include the notice in a location (such as a LICENSE
364 | file in a relevant directory) where a recipient would be likely to look
365 | for such a notice.
366 |
367 | You may add additional accurate notices of copyright ownership.
368 |
369 | Exhibit B - "Incompatible With Secondary Licenses" Notice
370 | ---------------------------------------------------------
371 |
372 | This Source Code Form is "Incompatible With Secondary Licenses", as
373 | defined by the Mozilla Public License, v. 2.0.
374 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | Flyscrape is a command-line web scraping tool designed for those without advanced programming skills, enabling precise extraction of website data.
17 |
18 |
19 |
20 |
21 |
22 | Installation · Documentation · Releases
23 |
24 |
25 |
26 | ## Demo
27 |
28 |
29 |
30 | ![flyscrape demo](.github/assets/flyscrape-demo.jpg)
31 |
32 |
33 | ## Features
34 |
35 | - **Standalone:** Flyscrape comes as a single binary executable.
36 | - **jQuery-like:** Extract data from HTML pages with a familiar API.
37 | - **Scriptable:** Use JavaScript to write your data extraction logic.
38 | - **System Cookies:** Give Flyscrape access to your browser's cookie store.
39 | - **Browser Mode:** Render JavaScript-heavy pages using a headless browser.
40 | - **Nested Scraping:** Extract data from linked pages within a single scrape.
41 |
42 | ## Overview
43 |
44 | - [Example](#example)
45 | - [Installation](#installation)
46 | - [Recommended](#recommended)
47 | - [Homebrew](#homebrew)
48 | - [Pre-compiled binary](#pre-compiled-binary)
49 | - [Compile from source](#compile-from-source)
50 | - [Usage](#usage)
51 | - [Configuration](#configuration)
52 | - [Query API](#query-api)
53 | - [Flyscrape API](#flyscrape-api)
54 | - [Document Parsing](#document-parsing)
55 | - [File Downloads](#file-downloads)
56 | - [Issues and suggestions](#issues-and-suggestions)
57 |
58 | ## Example
59 |
60 | This example scrapes the first few pages from Hacker News, specifically the New, Show and Ask sections.
61 |
62 | ```javascript
63 | export const config = {
64 | urls: [
65 | "https://news.ycombinator.com/new",
66 | "https://news.ycombinator.com/show",
67 | "https://news.ycombinator.com/ask",
68 | ],
69 |
70 | // Cache request for later.
71 | cache: "file",
72 |
73 | // Enable JavaScript rendering.
74 | browser: true,
75 | headless: false,
76 |
77 | // Follow pagination 5 times.
78 | depth: 5,
79 | follow: ["a.morelink[href]"],
80 | }
81 |
82 | export default function ({ doc, absoluteURL }) {
83 | const title = doc.find("title");
84 | const posts = doc.find(".athing");
85 |
86 | return {
87 | title: title.text(),
88 | posts: posts.map((post) => {
89 | const link = post.find(".titleline > a");
90 |
91 | return {
92 | title: link.text(),
93 | url: link.attr("href"),
94 | };
95 | }),
96 | }
97 | }
98 | ```
99 |
100 | ```bash
101 | $ flyscrape run hackernews.js
102 | [
103 | {
104 | "url": "https://news.ycombinator.com/new",
105 | "data": {
106 | "title": "New Links | Hacker News",
107 | "posts": [
108 | {
109 | "title": "Show HN: flyscrape - An standalone and scriptable web scraper",
110 | "url": "https://flyscrape.com/"
111 | },
112 | ...
113 | ]
114 | }
115 | }
116 | ]
117 | ```
118 |
119 | Check out the [examples folder](examples) for more detailed examples.
120 |
121 | ## Installation
122 |
123 | ### Recommended
124 |
125 | The easiest way to install `flyscrape` is via its install script.
126 |
127 | ```bash
128 | curl -fsSL https://flyscrape.com/install | bash
129 | ```
130 |
131 | ### Homebrew
132 |
133 | For macOS users, `flyscrape` is also available via Homebrew:
134 |
135 | ```bash
136 | brew install flyscrape
137 | ```
138 |
139 | ### Pre-compiled binary
140 |
141 | `flyscrape` is available for macOS, Linux and Windows as a downloadable binary from the [releases page](https://github.com/philippta/flyscrape/releases).
142 |
143 | ### Compile from source
144 |
145 | To compile flyscrape from source, follow these steps:
146 |
147 | 1. Install Go: Make sure you have Go installed on your system. If not, you can download it from [https://go.dev/](https://go.dev/).
148 |
149 | 2. Install flyscrape: Open a terminal and run the following command:
150 |
151 | ```bash
152 | go install github.com/philippta/flyscrape/cmd/flyscrape@latest
153 | ```
154 |
155 | ## Usage
156 |
157 | ```
158 | Usage:
159 |
160 | flyscrape run SCRIPT [config flags]
161 |
162 | Examples:
163 |
164 | # Run the script.
165 | $ flyscrape run example.js
166 |
167 | # Set the URL as argument.
168 | $ flyscrape run example.js --url "http://other.com"
169 |
170 | # Enable proxy support.
171 | $ flyscrape run example.js --proxies "http://someproxy:8043"
172 |
173 | # Follow paginated links.
174 | $ flyscrape run example.js --depth 5 --follow ".next-button > a"
175 |
176 | # Set the output format to ndjson.
177 | $ flyscrape run example.js --output.format ndjson
178 |
179 | # Write the output to a file.
180 | $ flyscrape run example.js --output.file results.json
181 | ```
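
Config flags given on the command line are merged over the script's exported `config`; they become the overrides passed to `flyscrape.Run` (see `cmd/args.go` and `cmd/run.go`), so a one-off run can tweak settings without editing the script. Repeating a flag that maps to an array field, such as `--urls`, collects the values into a list. The invocations below are illustrative sketches using the `hackernews.js` script from the example above; the file names are placeholders.

```bash
# Override the script's depth and output settings for this run only.
$ flyscrape run hackernews.js --depth 0 --output.format ndjson --output.file results.ndjson

# Repeat an array flag to supply multiple values.
$ flyscrape run hackernews.js --urls "https://news.ycombinator.com/show" --urls "https://news.ycombinator.com/ask"
```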
182 |
183 | ## Configuration
184 |
185 | Below is an example scraping script that showcases the capabilities of flyscrape. For a full documentation of all configuration options, visit the [documentation page](https://flyscrape.com/docs/getting-started/).
186 |
187 | ```javascript
188 | export const config = {
189 | // Specify the URL to start scraping from.
190 | url: "https://example.com/",
191 |
192 | // Specify multiple URLs to start scraping from. (default = [])
193 | urls: [
194 | "https://anothersite.com/",
195 | "https://yetanother.com/",
196 | ],
197 |
198 | // Enable rendering with headless browser. (default = false)
199 | browser: true,
200 |
201 | // Specify if browser should be headless or not. (default = true)
202 | headless: false,
203 |
204 | // Specify how deep links should be followed. (default = 0, no follow)
205 | depth: 5,
206 |
207 | // Specify the css selectors to follow. (default = ["a[href]"])
208 | // Setting follow to [] disables automatic following.
209 | // Can later be used with manual following.
210 | follow: [".next > a", ".related a"],
211 |
212 | // Specify the allowed domains. ['*'] for all. (default = domain from url)
213 | allowedDomains: ["example.com", "anothersite.com"],
214 |
215 | // Specify the blocked domains. (default = none)
216 | blockedDomains: ["somesite.com"],
217 |
218 | // Specify the allowed URLs as regex. (default = all allowed)
219 | allowedURLs: ["/posts", "/articles/\d+"],
220 |
221 | // Specify the blocked URLs as regex. (default = none)
222 | blockedURLs: ["/admin"],
223 |
224 | // Specify the rate in requests per minute. (default = no rate limit)
225 | rate: 60,
226 |
227 | // Specify the number of concurrent requests. (default = no limit)
228 | concurrency: 1,
229 |
230 | // Specify a single HTTP(S) proxy URL. (default = no proxy)
231 | // Note: Not compatible with browser mode.
232 | proxy: "http://someproxy.com:8043",
233 |
234 | // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
235 | // Note: Not compatible with browser mode.
236 | proxies: [
237 | "http://someproxy.com:8043",
238 | "http://someotherproxy.com:8043",
239 | ],
240 |
241 | // Enable file-based request caching. (default = no cache)
242 | cache: "file",
243 |
244 | // Specify the HTTP request header. (default = none)
245 | headers: {
246 | "Authorization": "Bearer ...",
247 | "User-Agent": "Mozilla ...",
248 | },
249 |
250 | // Use the cookie store of your local browser. (default = off)
251 | // Options: "chrome" | "edge" | "firefox"
252 | cookies: "chrome",
253 |
254 | // Specify the output options.
255 | output: {
256 | // Specify the output file. (default = stdout)
257 | file: "results.json",
258 |
259 | // Specify the output format. (default = json)
260 | // Options: "json" | "ndjson"
261 | format: "json",
262 | },
263 | };
264 |
265 | export default function ({ doc, url, absoluteURL, scrape, follow }) {
266 | // doc
267 | // Contains the parsed HTML document.
268 |
269 | // url
270 | // Contains the scraped URL.
271 |
272 | // absoluteURL("/foo")
273 | // Transforms a relative URL into an absolute URL.
274 |
275 | // scrape(url, function({ doc, url, absoluteURL, scrape }) {
276 | // return { ... };
277 | // })
278 | // Scrapes a linked page and returns the scrape result.
279 |
280 | // follow("/foo")
281 | // Follows a link manually.
282 | // Disable automatic following with `follow: []` for best results.
283 | }
284 | ```
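
For a concrete picture of how `scrape` and `follow` work together, here is a minimal sketch modeled on the bundled `hackernews_with_comments.js` and `hackernews_manual_follow.js` examples; the selectors are taken from those scripts and are otherwise illustrative:

```javascript
export const config = {
  url: "https://news.ycombinator.com/",
  depth: 2,
  follow: [], // disable automatic following; links are followed manually below
};

export default function ({ doc, absoluteURL, scrape, follow }) {
  // Queue the next page manually.
  follow(doc.find(".morelink").attr("href"));

  // Scrape the comment page of the first post within the same run.
  const post = doc.find(".athing").first();
  const link = post.find(".titleline > a");
  const commentsLink = post.next().find("a").last().attr("href");

  const comments = scrape(commentsLink, function ({ doc }) {
    return doc.find(".comtr").map((c) => c.find(".commtext").text());
  });

  return {
    title: link.text(),
    url: absoluteURL(link.attr("href")),
    comments,
  };
}
```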
285 |
286 | ## Query API
287 |
288 | ```javascript
289 | // <div class="element" foo="bar">Hey</div>
290 | const el = doc.find(".element")
291 | el.text() // "Hey"
292 | el.html() // `<div class="element" foo="bar">Hey</div>`
293 | el.name() // div
294 | el.attr("foo") // "bar"
295 | el.hasAttr("foo") // true
296 | el.hasClass("element") // true
297 |
298 | // <ul>
299 | //   <li class="a">Item 1</li>
300 | //   <li>Item 2</li>
301 | //   <li>Item 3</li>
302 | // </ul>
303 | const list = doc.find("ul")
304 | list.children() // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]
305 |
306 | const items = list.find("li")
307 | items.length() // 3
308 | items.first() // <li class="a">Item 1</li>
309 | items.last() // <li>Item 3</li>
310 | items.get(1) // <li>Item 2</li>
311 | items.get(1).prev() // <li class="a">Item 1</li>
312 | items.get(1).next() // <li>Item 3</li>
313 | items.get(1).parent() // <ul>...</ul>
314 | items.get(1).siblings() // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]
315 | items.map(item => item.text()) // ["Item 1", "Item 2", "Item 3"]
316 | items.filter(item => item.hasClass("a")) // [<li class="a">Item 1</li>]
317 |
318 | // <div>
319 | //   <h2>Aleph</h2>
320 | //   <p>Aleph</p>
321 | //   <h2>Beta</h2>
322 | //   <p>Beta</p>
323 | //   <h2>Gamma</h2>
324 | //   <p>Gamma</p>
325 | // </div>
326 | const header = doc.find("div h2")
327 |
328 | header.get(1).prev() // <p>Aleph</p>
329 | header.get(1).prevAll() // [<p>Aleph</p>, <h2>Aleph</h2>]
330 | header.get(1).prevUntil('div,h1,h2,h3') // <p>Aleph</p>
331 | header.get(1).next() // <p>Beta</p>
332 | header.get(1).nextAll() // [<p>Beta</p>, <h2>Gamma</h2>, <p>Gamma</p>]
333 | header.get(1).nextUntil('div,h1,h2,h3') // <p>Beta</p>
334 | ```
335 |
336 | ## Flyscrape API
337 |
338 | ### Document Parsing
339 |
340 | ```javascript
341 | import { parse } from "flyscrape";
342 |
343 | const doc = parse(`<div class="foo">bar</div>`);
344 | const text = doc.find(".foo").text();
345 | ```
346 |
347 | ### File Downloads
348 |
349 | ```javascript
350 | import { download } from "flyscrape/http";
351 |
352 | download("http://example.com/image.jpg") // downloads as "image.jpg"
353 | download("http://example.com/image.jpg", "other.jpg") // downloads as "other.jpg"
354 | download("http://example.com/image.jpg", "dir/") // downloads as "dir/image.jpg"
355 |
356 | // If the server offers a filename via the Content-Disposition header and no
357 | // destination filename is provided, Flyscrape will honor the suggested filename.
358 | // E.g. `Content-Disposition: attachment; filename="archive.zip"`
359 | download("http://example.com/generate_archive.php", "dir/") // downloads as "dir/archive.zip"
360 | ```
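
Calling `download` inside the scraping function saves files as part of the run. Below is a small sketch along the lines of the bundled `examples/download.js`; the page and selectors are illustrative:

```javascript
import { download } from "flyscrape/http";

export const config = {
  url: "https://commons.wikimedia.org/wiki/London",
};

export default function ({ doc }) {
  return {
    images: doc.find("img").map((img) => {
      const url = img.attr("src");
      download(url, "images/"); // saved under images/ with its original filename
      return url;
    }),
  };
}
```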
361 |
362 | ## Issues and Suggestions
363 |
364 | If you encounter any issues or have suggestions for improvement, please [submit an issue](https://github.com/philippta/flyscrape/issues).
365 |
--------------------------------------------------------------------------------
/cmd/args.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cmd
6 |
7 | import (
8 | "fmt"
9 | "slices"
10 | "strconv"
11 | "strings"
12 | )
13 |
14 | var arrayFields = []string{
15 | "urls",
16 | "follow",
17 | "allowedDomains",
18 | "blockedDomains",
19 | "allowedURLs",
20 | "blockedURLs",
21 | "proxies",
22 | }
23 |
24 | func parseConfigArgs(args []string) (map[string]any, error) {
25 | updates := map[string]any{}
26 |
27 | flag := ""
28 | for _, arg := range normalizeArgs(args) {
29 | if flag == "" && !isFlag(arg) {
30 | return nil, fmt.Errorf("expected flag, got %q instead", arg)
31 | }
32 |
33 | if flag != "" && isFlag(arg) {
34 | updates[flag[2:]] = true
35 | flag = ""
36 | continue
37 | }
38 |
39 | if flag != "" {
40 | if v, ok := updates[flag[2:]]; ok {
41 | if vv, ok := v.([]any); ok {
42 | updates[flag[2:]] = append(vv, parseArg(arg))
43 | } else {
44 | updates[flag[2:]] = []any{v, parseArg(arg)}
45 | }
46 | } else {
47 | if slices.Contains(arrayFields, flag[2:]) {
48 | updates[flag[2:]] = []any{parseArg(arg)}
49 | } else {
50 | updates[flag[2:]] = parseArg(arg)
51 | }
52 | }
53 | flag = ""
54 | continue
55 | }
56 |
57 | flag = arg
58 | }
59 |
60 | if flag != "" {
61 | updates[flag[2:]] = true
62 | flag = ""
63 | }
64 |
65 | return updates, nil
66 | }
67 |
68 | func normalizeArgs(args []string) []string {
69 | var norm []string
70 |
71 | for _, arg := range args {
72 | if !strings.HasPrefix(arg, "--") {
73 | norm = append(norm, arg)
74 | } else {
75 | norm = append(norm, strings.SplitN(arg, "=", 2)...)
76 | }
77 | }
78 |
79 | return norm
80 | }
81 |
82 | func parseArg(arg string) any {
83 | if arg == "true" {
84 | return true
85 | }
86 | if arg == "false" {
87 | return false
88 | }
89 | if num, err := strconv.Atoi(arg); err == nil {
90 | return num
91 | }
92 | if num, err := strconv.ParseFloat(arg, 64); err == nil {
93 | return num
94 | }
95 | return arg
96 | }
97 |
98 | func isFlag(arg string) bool {
99 | return strings.HasPrefix(arg, "--")
100 | }
101 |
--------------------------------------------------------------------------------
/cmd/args_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cmd
6 |
7 | import (
8 | "strings"
9 | "testing"
10 |
11 | "github.com/stretchr/testify/require"
12 | )
13 |
14 | func TestParseConfigUpdates(t *testing.T) {
15 | tests := []struct {
16 | flags string
17 | err bool
18 | updates map[string]any
19 | }{
20 | {
21 | flags: `--foo bar`,
22 | updates: map[string]any{"foo": "bar"},
23 | },
24 | {
25 | flags: `--foo=bar`,
26 | updates: map[string]any{"foo": "bar"},
27 | },
28 | {
29 | flags: `--foo`,
30 | updates: map[string]any{"foo": true},
31 | },
32 | {
33 | flags: `--foo false`,
34 | updates: map[string]any{"foo": false},
35 | },
36 | {
37 | flags: `--foo a --foo b`,
38 | updates: map[string]any{"foo": []any{"a", "b"}},
39 | },
40 | {
41 | flags: `--foo a --foo=b`,
42 | updates: map[string]any{"foo": []any{"a", "b"}},
43 | },
44 | {
45 | flags: `--foo 69`,
46 | updates: map[string]any{"foo": 69},
47 | },
48 | {
49 | flags: `--foo.bar a`,
50 | updates: map[string]any{"foo.bar": "a"},
51 | },
52 | {
53 | flags: `foo`,
54 | err: true,
55 | },
56 | {
57 | flags: `--foo a b`,
58 | err: true,
59 | },
60 | }
61 | for _, test := range tests {
62 | t.Run(test.flags, func(t *testing.T) {
63 | args, err := parseConfigArgs(strings.Fields(test.flags))
64 |
65 | if test.err {
66 | require.Error(t, err)
67 | require.Empty(t, args)
68 | return
69 | }
70 |
71 | require.NoError(t, err)
72 | require.Equal(t, test.updates, args)
73 | })
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/cmd/dev.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cmd
6 |
7 | import (
8 | "flag"
9 | "fmt"
10 |
11 | "github.com/philippta/flyscrape"
12 | )
13 |
14 | type DevCommand struct{}
15 |
16 | func (c *DevCommand) Run(args []string) error {
17 | fs := flag.NewFlagSet("flyscrape-dev", flag.ContinueOnError)
18 | fs.Usage = c.Usage
19 |
20 | if err := fs.Parse(args); err != nil {
21 | return err
22 | } else if fs.NArg() == 0 || fs.Arg(0) == "" {
23 | c.Usage()
24 | return flag.ErrHelp
25 | }
26 |
27 | cfg, err := parseConfigArgs(fs.Args()[1:])
28 | if err != nil {
29 | return fmt.Errorf("error parsing config flags: %w", err)
30 | }
31 |
32 | return flyscrape.Dev(fs.Arg(0), cfg)
33 | }
34 |
35 | func (c *DevCommand) Usage() {
36 | fmt.Println(`
37 | The dev command watches the scraping script and re-runs it on any change.
38 | Recursive scraping is disabled in this mode; only the initial URL will be scraped.
39 |
40 | Usage:
41 |
42 | flyscrape dev SCRIPT [config flags]
43 |
44 | Examples:
45 |
46 | # Run and watch script.
47 | $ flyscrape dev example.js
48 |
49 | # Set the URL as argument.
50 | $ flyscrape dev example.js --url "http://other.com"
51 |
52 | # Enable proxy support.
53 | $ flyscrape dev example.js --proxies "http://someproxy:8043"
54 | `[1:])
55 | }
56 |
--------------------------------------------------------------------------------
/cmd/flyscrape/main.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package main
6 |
7 | import (
8 | _ "embed"
9 | "flag"
10 | "log"
11 | "os"
12 |
13 | "github.com/philippta/flyscrape/cmd"
14 | _ "github.com/philippta/flyscrape/modules/browser"
15 | _ "github.com/philippta/flyscrape/modules/cache"
16 | _ "github.com/philippta/flyscrape/modules/cookies"
17 | _ "github.com/philippta/flyscrape/modules/depth"
18 | _ "github.com/philippta/flyscrape/modules/domainfilter"
19 | _ "github.com/philippta/flyscrape/modules/followlinks"
20 | _ "github.com/philippta/flyscrape/modules/headers"
21 | _ "github.com/philippta/flyscrape/modules/output/json"
22 | _ "github.com/philippta/flyscrape/modules/output/ndjson"
23 | _ "github.com/philippta/flyscrape/modules/proxy"
24 | _ "github.com/philippta/flyscrape/modules/ratelimit"
25 | _ "github.com/philippta/flyscrape/modules/retry"
26 | _ "github.com/philippta/flyscrape/modules/starturl"
27 | _ "github.com/philippta/flyscrape/modules/urlfilter"
28 | )
29 |
30 | func main() {
31 | log.SetFlags(0)
32 |
33 | if err := (&cmd.Main{}).Run(os.Args[1:]); err != nil {
34 | if err != flag.ErrHelp {
35 | log.Println(err)
36 | }
37 | os.Exit(1)
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/cmd/main.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cmd
6 |
7 | import (
8 | _ "embed"
9 | "flag"
10 | "fmt"
11 | "log"
12 | "os"
13 | "strings"
14 | )
15 |
16 | func main() {
17 | log.SetFlags(0)
18 |
19 | m := &Main{}
20 | if err := m.Run(os.Args[1:]); err == flag.ErrHelp {
21 | os.Exit(1)
22 | } else if err != nil {
23 | log.Println(err)
24 | os.Exit(1)
25 | }
26 | }
27 |
28 | type Main struct{}
29 |
30 | func (m *Main) Run(args []string) error {
31 | var cmd string
32 | if len(args) > 0 {
33 | cmd, args = args[0], args[1:]
34 | }
35 |
36 | switch cmd {
37 | case "new":
38 | return (&NewCommand{}).Run(args)
39 | case "run":
40 | return (&RunCommand{}).Run(args)
41 | case "dev":
42 | return (&DevCommand{}).Run(args)
43 | case "version":
44 | return (&VersionCommand{}).Run(args)
45 | default:
46 | if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") {
47 | m.Usage()
48 | return flag.ErrHelp
49 | }
50 | return fmt.Errorf("flyscrape %s: unknown command", cmd)
51 | }
52 | }
53 |
54 | func (m *Main) Usage() {
55 | fmt.Println(`
56 | flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
57 |
58 | Usage:
59 |
60 | flyscrape <command> [arguments]
61 |
62 | Commands:
63 |
64 | new creates a sample scraping script
65 | run runs a scraping script
66 | dev watches and re-runs a scraping script
67 | version prints the version
68 | `[1:])
69 | }
70 |
--------------------------------------------------------------------------------
/cmd/new.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cmd
6 |
7 | import (
8 | "flag"
9 | "fmt"
10 | "os"
11 |
12 | "github.com/philippta/flyscrape"
13 | )
14 |
15 | type NewCommand struct{}
16 |
17 | func (c *NewCommand) Run(args []string) error {
18 | fs := flag.NewFlagSet("flyscrape-new", flag.ContinueOnError)
19 | fs.Usage = c.Usage
20 |
21 | if err := fs.Parse(args); err != nil {
22 | return err
23 | } else if fs.NArg() == 0 || fs.Arg(0) == "" {
24 | c.Usage()
25 | return flag.ErrHelp
26 | } else if fs.NArg() > 1 {
27 | return fmt.Errorf("too many arguments")
28 | }
29 |
30 | script := fs.Arg(0)
31 | if _, err := os.Stat(script); err == nil {
32 | return fmt.Errorf("script already exists")
33 | }
34 |
35 | if err := os.WriteFile(script, flyscrape.ScriptTemplate, 0o644); err != nil {
36 | return fmt.Errorf("failed to create script %q: %w", script, err)
37 | }
38 |
39 | fmt.Printf("Scraping script %v created.\n", script)
40 | return nil
41 | }
42 |
43 | func (c *NewCommand) Usage() {
44 | fmt.Println(`
45 | The new command creates a new scraping script.
46 |
47 | Usage:
48 |
49 | flyscrape new SCRIPT
50 |
51 | Examples:
52 |
53 | # Create a new scraping script.
54 | $ flyscrape new example.js
55 | `[1:])
56 | }
57 |
--------------------------------------------------------------------------------
/cmd/run.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cmd
6 |
7 | import (
8 | "flag"
9 | "fmt"
10 |
11 | "github.com/philippta/flyscrape"
12 | )
13 |
14 | type RunCommand struct{}
15 |
16 | func (c *RunCommand) Run(args []string) error {
17 | fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
18 | fs.Usage = c.Usage
19 |
20 | if err := fs.Parse(args); err != nil {
21 | return err
22 | } else if fs.NArg() == 0 || fs.Arg(0) == "" {
23 | c.Usage()
24 | return flag.ErrHelp
25 | }
26 |
27 | cfg, err := parseConfigArgs(fs.Args()[1:])
28 | if err != nil {
29 | return fmt.Errorf("error parsing config flags: %w", err)
30 | }
31 |
32 | return flyscrape.Run(fs.Arg(0), cfg)
33 | }
34 |
35 | func (c *RunCommand) Usage() {
36 | fmt.Println(`
37 | The run command runs the scraping script.
38 |
39 | Usage:
40 |
41 | flyscrape run SCRIPT [config flags]
42 |
43 | Examples:
44 |
45 | # Run the script.
46 | $ flyscrape run example.js
47 |
48 | # Set the URL as argument.
49 | $ flyscrape run example.js --url "http://other.com"
50 |
51 | # Enable proxy support.
52 | $ flyscrape run example.js --proxies "http://someproxy:8043"
53 |
54 | # Follow paginated links.
55 | $ flyscrape run example.js --depth 5 --follow ".next-button > a"
56 |
57 | # Set the output format to ndjson.
58 | $ flyscrape run example.js --output.format ndjson
59 |
60 | # Write the output to a file.
61 | $ flyscrape run example.js --output.file results.json
62 | `[1:])
63 | }
64 |
--------------------------------------------------------------------------------
/cmd/version.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cmd
6 |
7 | import (
8 | "fmt"
9 | "runtime/debug"
10 |
11 | "github.com/philippta/flyscrape"
12 | )
13 |
14 | type VersionCommand struct{}
15 |
16 | func (c *VersionCommand) Run(args []string) error {
17 | info, ok := debug.ReadBuildInfo()
18 | if !ok {
19 | return fmt.Errorf("no build info found")
20 | }
21 |
22 | var os, arch, version string
23 | for _, setting := range info.Settings {
24 | switch setting.Key {
25 | case "GOARCH":
26 | arch = setting.Value
27 | case "GOOS":
28 | os = setting.Value
29 | case "vcs.revision":
30 | version = "v0.0.0-" + setting.Value
31 | }
32 | }
33 |
34 | if flyscrape.Version != "" {
35 | version = flyscrape.Version
36 | }
37 |
38 | fmt.Printf("flyscrape %s %s/%s\n", version, os, arch)
39 | return nil
40 | }
41 |
--------------------------------------------------------------------------------
/examples/browser.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://www.airbnb.com/",
3 | browser: true,
4 | // headless: false,
5 | };
6 |
7 | export default function ({ doc, absoluteURL }) {
8 | const rooms = doc.find("[itemprop=itemListElement]");
9 |
10 | return {
11 | listings: rooms.map(room => {
12 | const link = "https://" + room.find("meta[itemprop=url]").attr("content");
13 | const image = room.find("img").attr("src");
14 | const desc = new Set(room.find("[role=group] > div > div > div").map(d => d.text()).filter(Boolean));
15 |
16 | return { link, image, desc }
17 | }),
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/examples/coinmarketcap.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://coinmarketcap.com/",
3 | };
4 |
5 | export default function({ doc }) {
6 | const rows = doc.find(".cmc-table tbody tr");
7 |
8 | return {
9 | currencies: rows
10 | .map((row) => {
11 | const cols = row.find("td");
12 |
13 | return {
14 | position: cols.get(1).text(),
15 | currency: cols.get(2).find("p").get(0).text(),
16 | symbol: cols.get(2).find("p").get(1).text(),
17 | price: cols.get(3).text(),
18 | change: {
19 | "1h": cols.get(4).text(),
20 | "24h": cols.get(5).text(),
21 | "7dh": cols.get(6).text(),
22 | },
23 | marketcap: cols.get(7).find("span").get(1).text(),
24 | volume: cols.get(8).find("p").get(0).text(),
25 | supply: cols.get(9).text(),
26 | };
27 | })
28 | .slice(0, 10),
29 | };
30 | }
31 |
--------------------------------------------------------------------------------
/examples/cookies.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://news.ycombinator.com/",
3 |
4 | // This will use cookies from your Chrome browser.
5 | // Options: "chrome" | "firefox" | "edge"
6 | cookies: "chrome",
7 | };
8 |
9 | export default function({ doc }) {
10 | return {
11 | user: doc.find("#me").text(),
12 | karma: doc.find("#karma").text(),
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/examples/custom_headers.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://news.ycombinator.com/",
3 | headers: {
4 | "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
5 | "User-Agent": "Gecko/1.0",
6 | }
7 | };
8 |
9 | export default function({ doc, absoluteURL }) {
10 | const posts = doc.find(".athing");
11 |
12 | return {
13 | posts: posts.map((post) => {
14 | const link = post.find(".titleline > a");
15 | const meta = post.next();
16 |
17 | return {
18 | url: absoluteURL(link.attr("href")),
19 | user: meta.find(".hnuser").text(),
20 | title: link.text(),
21 | points: meta.find(".score").text().replace(" points", ""),
22 | created: meta.find(".age").attr("title"),
23 | };
24 | }),
25 | };
26 | }
27 |
--------------------------------------------------------------------------------
/examples/download.js:
--------------------------------------------------------------------------------
1 | import { download } from "flyscrape/http";
2 |
3 | export const config = {
4 | url: "https://commons.wikimedia.org/wiki/London",
5 | };
6 |
7 | export default function ({ doc }) {
8 | const symbols = doc.find("#mw-content-text .mw-gallery-traditional:first-of-type li");
9 |
10 | return {
11 | symbols: symbols.map(symbol => {
12 | const name = symbol.text().trim();
13 | const url = symbol.find("img").attr("src");
14 | const file = `symbols/${basename(url)}`;
15 |
16 | download(url, file);
17 |
18 | return { name, url, file };
19 | })
20 | };
21 | }
22 |
23 | function basename(path) {
24 | return path.split("/").slice(-1)[0];
25 | }
26 |
--------------------------------------------------------------------------------
/examples/hackernews.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://news.ycombinator.com/",
3 | depth: 9,
4 | cache: "file",
5 | follow: ["a.morelink[href]"],
6 | };
7 |
8 | export default function({ doc, absoluteURL }) {
9 | const posts = doc.find(".athing");
10 |
11 | return {
12 | posts: posts.map((post) => {
13 | const link = post.find(".titleline > a");
14 | const meta = post.next();
15 |
16 | return {
17 | url: absoluteURL(link.attr("href")),
18 | user: meta.find(".hnuser").text(),
19 | title: link.text(),
20 | points: meta.find(".score").text().replace(" points", ""),
21 | created: meta.find(".age").attr("title"),
22 | };
23 | }),
24 | };
25 | }
26 |
--------------------------------------------------------------------------------
/examples/hackernews_manual_follow.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://news.ycombinator.com/",
3 | depth: 2,
4 | follow: [],
5 | };
6 |
7 | export default function({ url, doc, follow }) {
8 | const next = doc.find(".morelink").attr("href");
9 |
10 | follow(next);
11 |
12 | return { url, next };
13 | }
14 |
--------------------------------------------------------------------------------
/examples/hackernews_with_comments.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://news.ycombinator.com/",
3 | };
4 |
5 | export default function({ doc, scrape }) {
6 | const post = doc.find(".athing.submission").first();
7 | const title = post.find(".titleline > a").text();
8 | const commentsLink = post.next().find("a").last().attr("href");
9 |
10 | const comments = scrape(commentsLink, function({ doc }) {
11 | return doc.find(".comtr").map(comment => {
12 | return {
13 | author: comment.find(".hnuser").text(),
14 | text: comment.find(".commtext").text(),
15 | };
16 | });
17 | });
18 |
19 | return {
20 | title,
21 | comments,
22 | };
23 | }
24 |
--------------------------------------------------------------------------------
/examples/multiple_starting_urls.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | urls: [
3 | "https://news.ycombinator.com/show",
4 | "https://news.ycombinator.com/ask",
5 | ],
6 | };
7 |
8 | export default function({ doc, absoluteURL }) {
9 | const posts = doc.find(".athing");
10 |
11 | return {
12 | posts: posts.map((post) => {
13 | const link = post.find(".titleline > a");
14 | const meta = post.next();
15 |
16 | return {
17 | url: absoluteURL(link.attr("href")),
18 | user: meta.find(".hnuser").text(),
19 | title: link.text(),
20 | points: meta.find(".score").text().replace(" points", ""),
21 | created: meta.find(".age").attr("title"),
22 | };
23 | }),
24 | };
25 | }
26 |
--------------------------------------------------------------------------------
/examples/reddit.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | url: "https://old.reddit.com/",
3 | };
4 |
5 | export default function({ doc, absoluteURL }) {
6 | const posts = doc.find("#siteTable .thing:not(.promoted)");
7 |
8 | return {
9 | posts: posts.map((post) => {
10 | const rank = post.find(".rank");
11 | const user = post.find(".author");
12 | const created = post.find("time");
13 | const title = post.find("a.title");
14 | const comments = post.find(".comments");
15 | const subreddit = post.find(".subreddit");
16 | const upvotes = post.find(".score.unvoted");
17 | const thumbnail = post.find("a.thumbnail img");
18 |
19 | return {
20 | rank: rank.text(),
21 | user: user.text(),
22 | created: created.attr("datetime"),
23 | title: title.text(),
24 | link: absoluteURL(title.attr("href")),
25 | comments: comments.text().replace(" comments", ""),
26 | comments_link: comments.attr("href"),
27 | subreddit: subreddit.text(),
28 | upvotes: upvotes.text(),
29 | thumbnail: absoluteURL(thumbnail.attr("src")),
30 | };
31 | }),
32 | };
33 | }
34 |
--------------------------------------------------------------------------------
/examples/urls.txt:
--------------------------------------------------------------------------------
1 | https://news.ycombinator.com/newest
2 | https://news.ycombinator.com/ask
3 | https://news.ycombinator.com/show
4 |
--------------------------------------------------------------------------------
/examples/urls_from_file.js:
--------------------------------------------------------------------------------
1 | import urls from "./urls.txt"
2 |
3 | export const config = {
4 | urls: urls.split("\n")
5 | };
6 |
7 | export default function({ doc }) {
8 | return {
9 | title: doc.find("title").text().trim(),
10 | };
11 | }
12 |
--------------------------------------------------------------------------------
/examples/useragents/chrome.js:
--------------------------------------------------------------------------------
1 | import { parse } from "flyscrape";
2 |
3 | export const config = {
4 | url: "https://chromereleases.googleblog.com/search/label/Stable%20updates",
5 | follow: [".blog-pager-older-link"],
6 | depth: 30,
7 | cache: "file",
8 | };
9 |
10 | export default function ({ doc, absoluteURL }) {
11 | const posts = doc.find(".post");
12 | return posts.map(post => {
13 | const title = post.find("h2").text().trim();
14 | const body = parse(post.find(".post-content").text()).find("p:nth-child(1)").text().trim();
15 |
16 | const regexes = [
17 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Mac)/,
18 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Windows)/,
19 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Linux)/,
20 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (iOS)/,
21 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Android)/,
22 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (ChromeOS)/,
23 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Mac,Linux)/,
24 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Mac and Linux)/,
25 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)?\s\(Platform version:\s[\d\.]+\)\sfor\smost\s(ChromeOS)/,
26 | ];
27 |
28 | const versions = new Set();
29 | for (const regex of regexes) {
30 | const matches = body.match(regex);
31 | if (!matches) {
32 | continue;
33 | }
34 |
35 | let versionStr = matches[1];
36 |
37 | let vv = versionStr.split("/");
38 | if (vv.length == 2) {
39 | vv[1] = vv[0].substring(0, vv[0].lastIndexOf(".")) + vv[1];
40 | }
41 |
42 | for (const version of vv) {
43 | versions.add(version)
44 | }
45 | }
46 |
47 |
48 | return Array.from(versions)
49 | }).filter(Boolean).flat();
50 | }
51 |
--------------------------------------------------------------------------------
/examples/useragents/edge.js:
--------------------------------------------------------------------------------
1 | import { parse } from "flyscrape";
2 |
3 | export const config = {
4 | url: "https://learn.microsoft.com/en-us/deployedge/microsoft-edge-release-schedule",
5 | };
6 |
7 | export default function ({ doc, absoluteURL }) {
8 | const links = doc.find("table a");
9 | return links
10 | .map(link => link.text())
11 | .filter(Boolean)
12 | }
13 |
--------------------------------------------------------------------------------
/examples/useragents/firefox.js:
--------------------------------------------------------------------------------
1 | import { parse } from "flyscrape";
2 |
3 | export const config = {
4 | url: "https://www.mozilla.org/en-US/firefox/releases/",
5 | };
6 |
7 | export default function ({ doc, absoluteURL }) {
8 | const links = doc.find(".c-release-list a");
9 | return links
10 | .map(link => link.text())
11 | .filter(Boolean)
12 | .filter(version => parseFloat(version) >= 60);
13 | }
14 |
--------------------------------------------------------------------------------
/examples/useragents/opera.js:
--------------------------------------------------------------------------------
1 |
2 | export const config = {
3 | urls: range("https://blogs.opera.com/desktop/changelog-for-{}/", 60, 110),
4 | };
5 |
6 | export default function ({ doc, absoluteURL }) {
7 | const versions = doc.find(".content h4");
8 | return versions.map(version => {
9 | return version.text().split(" ")[0].trim();
10 | }).filter(Boolean);
11 | }
12 |
13 | function range(url, from, to) {
14 | return Array.from({length: to - from + 1}).map((_, i) => url.replace("{}", i + from));
15 | }
16 |
--------------------------------------------------------------------------------
/flyscrape.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape
6 |
7 | import (
8 | "fmt"
9 | "log"
10 | "net/http"
11 | "os"
12 | "os/signal"
13 | "path/filepath"
14 | "syscall"
15 |
16 | "github.com/inancgumus/screen"
17 | "github.com/tidwall/sjson"
18 | )
19 |
20 | var Version string
21 |
22 | func Run(file string, overrides map[string]any) error {
23 | src, err := os.ReadFile(file)
24 | if err != nil {
25 | return fmt.Errorf("failed to read script %q: %w", file, err)
26 | }
27 |
28 | client := &http.Client{}
29 |
30 | imports, wait := NewJSLibrary(client)
31 | defer wait()
32 |
33 | pop, err := pushDir(file)
34 | if err != nil {
35 | return err
36 | }
37 |
38 | exports, err := Compile(string(src), imports)
39 | if err != nil {
40 | return fmt.Errorf("failed to compile script: %w", err)
41 | }
42 |
43 | if err := pop(); err != nil {
44 | return err
45 | }
46 |
47 | cfg := exports.Config()
48 | cfg = updateCfgMultiple(cfg, overrides)
49 |
50 | scraper := NewScraper()
51 | scraper.ScrapeFunc = exports.Scrape
52 | scraper.Script = file
53 | scraper.Client = client
54 | scraper.Modules = LoadModules(cfg)
55 |
56 | scraper.Run()
57 | return nil
58 | }
59 |
60 | func Dev(file string, overrides map[string]any) error {
61 | cachefile, err := newCacheFile()
62 | if err != nil {
63 | return fmt.Errorf("failed to create cache file: %w", err)
64 | }
65 |
66 | trapsignal(func() {
67 | os.RemoveAll(cachefile)
68 | })
69 |
70 | fn := func(s string) error {
71 | client := &http.Client{}
72 |
73 | imports, wait := NewJSLibrary(client)
74 | defer wait()
75 |
76 | pop, err := pushDir(file)
77 | if err != nil {
78 | return err
79 | }
80 |
81 | exports, err := Compile(s, imports)
82 | if err != nil {
83 | printCompileErr(file, err)
84 | return nil
85 | }
86 |
87 | if err := pop(); err != nil {
88 | return err
89 | }
90 |
91 | cfg := exports.Config()
92 | cfg = updateCfgMultiple(cfg, overrides)
93 | cfg = updateCfg(cfg, "depth", 0)
94 | cfg = updateCfg(cfg, "cache", "file:"+cachefile)
95 |
96 | scraper := NewScraper()
97 | scraper.ScrapeFunc = exports.Scrape
98 | scraper.Script = file
99 | scraper.Client = client
100 | scraper.Modules = LoadModules(cfg)
101 |
102 | screen.Clear()
103 | screen.MoveTopLeft()
104 | scraper.Run()
105 |
106 | return nil
107 | }
108 |
109 | if err := Watch(file, fn); err != nil && err != StopWatch {
110 | return fmt.Errorf("failed to watch script %q: %w", file, err)
111 | }
112 | return nil
113 | }
114 |
115 | func printCompileErr(script string, err error) {
116 | screen.Clear()
117 | screen.MoveTopLeft()
118 |
119 | if errs, ok := err.(interface{ Unwrap() []error }); ok {
120 | for _, err := range errs.Unwrap() {
121 | log.Printf("%s:%v\n", script, err)
122 | }
123 | } else {
124 | log.Println(err)
125 | }
126 | }
127 |
128 | func updateCfg(cfg Config, key string, value any) Config {
129 | newcfg, err := sjson.Set(string(cfg), key, value)
130 | if err != nil {
131 | return cfg
132 | }
133 | return Config(newcfg)
134 | }
135 |
136 | func newCacheFile() (string, error) {
137 | cachedir, err := os.MkdirTemp("", "flyscrape-cache")
138 | if err != nil {
139 | return "", err
140 | }
141 | return filepath.Join(cachedir, "dev.cache"), nil
142 | }
143 |
144 | func trapsignal(f func()) {
145 | sig := make(chan os.Signal, 2)
146 | signal.Notify(sig, os.Interrupt, syscall.SIGTERM)
147 |
148 | go func() {
149 | <-sig
150 | f()
151 | os.Exit(0)
152 | }()
153 | }
154 |
155 | func updateCfgMultiple(cfg Config, updates map[string]any) Config {
156 | c := string(cfg)
157 |
158 | for k, v := range updates {
159 | nc, err := sjson.Set(c, k, v)
160 | if err != nil {
161 | continue
162 | }
163 | c = nc
164 | }
165 |
166 | return []byte(c)
167 | }
168 |
169 | func pushDir(file string) (func() error, error) {
170 | cwd, err := os.Getwd()
171 | if err != nil {
172 | return nil, fmt.Errorf("failed to get current working directory: %w", err)
173 | }
174 | if err := os.Chdir(filepath.Dir(file)); err != nil {
175 | return nil, fmt.Errorf("failed to change working directory: %w", err)
176 | }
177 | pop := func() error {
178 | if err := os.Chdir(cwd); err != nil {
179 | return fmt.Errorf("failed to change working directory: %w", err)
180 | }
181 | return nil
182 | }
183 | return pop, nil
184 |
185 | }
186 |
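A minimal sketch of how Run can be driven by a caller; the script path and override values below are placeholders, while "depth" and "cache" are config keys used elsewhere in this repository.

package main

import (
	"log"

	"github.com/philippta/flyscrape"
)

func main() {
	// Overrides are applied on top of the script's exported config
	// via updateCfgMultiple before the modules are loaded.
	overrides := map[string]any{
		"depth": 1,      // limit link following
		"cache": "file", // reuse responses between runs
	}

	// "hackernews.js" is a placeholder script path for this sketch.
	if err := flyscrape.Run("hackernews.js", overrides); err != nil {
		log.Fatal(err)
	}
}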
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/philippta/flyscrape
2 |
3 | go 1.23
4 |
5 | toolchain go1.23.3
6 |
7 | require (
8 | github.com/PuerkitoBio/goquery v1.8.1
9 | github.com/browserutils/kooky v0.2.2
10 | github.com/cornelk/hashmap v1.0.8
11 | github.com/dop251/goja v0.0.0-20230919151941-fc55792775de
12 | github.com/dop251/goja_nodejs v0.0.0-20230914102007-198ba9a8b098
13 | github.com/evanw/esbuild v0.18.14
14 | github.com/fsnotify/fsnotify v1.6.0
15 | github.com/go-rod/rod v0.114.7
16 | github.com/inancgumus/screen v0.0.0-20190314163918-06e984b86ed3
17 | github.com/nlnwa/whatwg-url v0.4.0
18 | github.com/stretchr/testify v1.8.4
19 | github.com/tidwall/sjson v1.2.5
20 | go.etcd.io/bbolt v1.3.11
21 | golang.org/x/sync v0.9.0
22 | )
23 |
24 | require (
25 | github.com/Velocidex/json v0.0.0-20220224052537-92f3c0326e5a // indirect
26 | github.com/Velocidex/ordereddict v0.0.0-20230909174157-2aa49cc5d11d // indirect
27 | github.com/Velocidex/yaml/v2 v2.2.8 // indirect
28 | github.com/andybalholm/cascadia v1.3.1 // indirect
29 | github.com/bits-and-blooms/bitset v1.5.0 // indirect
30 | github.com/davecgh/go-spew v1.1.1 // indirect
31 | github.com/dlclark/regexp2 v1.7.0 // indirect
32 | github.com/go-ini/ini v1.67.0 // indirect
33 | github.com/go-sourcemap/sourcemap v2.1.3+incompatible // indirect
34 | github.com/go-sqlite/sqlite3 v0.0.0-20180313105335-53dd8e640ee7 // indirect
35 | github.com/godbus/dbus/v5 v5.1.0 // indirect
36 | github.com/gonuts/binary v0.2.0 // indirect
37 | github.com/google/pprof v0.0.0-20230207041349-798e818bf904 // indirect
38 | github.com/keybase/go-keychain v0.0.0-20231219164618-57a3676c3af6 // indirect
39 | github.com/kr/pretty v0.3.1 // indirect
40 | github.com/pmezard/go-difflib v1.0.0 // indirect
41 | github.com/rogpeppe/go-internal v1.10.0 // indirect
42 | github.com/tidwall/gjson v1.17.0 // indirect
43 | github.com/tidwall/match v1.1.1 // indirect
44 | github.com/tidwall/pretty v1.2.1 // indirect
45 | github.com/ysmood/fetchup v0.2.3 // indirect
46 | github.com/ysmood/goob v0.4.0 // indirect
47 | github.com/ysmood/got v0.34.1 // indirect
48 | github.com/ysmood/gson v0.7.3 // indirect
49 | github.com/ysmood/leakless v0.8.0 // indirect
50 | github.com/zalando/go-keyring v0.2.5 // indirect
51 | golang.org/x/crypto v0.29.0 // indirect
52 | golang.org/x/net v0.31.0 // indirect
53 | golang.org/x/sys v0.27.0 // indirect
54 | golang.org/x/term v0.26.0 // indirect
55 | golang.org/x/text v0.20.0 // indirect
56 | gopkg.in/yaml.v3 v3.0.1 // indirect
57 | www.velocidex.com/golang/go-ese v0.2.0 // indirect
58 | )
59 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | error() {
6 | echo -e "error:" "$@" >&2
7 | exit 1
8 | }
9 |
10 | if [[ ${OS:-} = Windows_NT ]]; then
11 | error "This installer does not support Windows."
12 | fi
13 |
14 | echo "Installing flyscrape"
15 |
16 | case $(uname -ms) in
17 | 'Darwin x86_64')
18 | target=darwin_amd64
19 | ;;
20 | 'Darwin arm64')
21 | target=darwin_arm64
22 | ;;
23 | 'Linux aarch64' | 'Linux arm64')
24 | target=linux_arm64
25 | ;;
26 | 'Linux x86_64' | *)
27 | target=linux_amd64
28 | ;;
29 | esac
30 |
31 | dir="$HOME/.flyscrape"
32 |
33 | mkdir -p "$dir" ||
34 | error "Failed to create directory: $HOME/.flyscrape"
35 |
36 |
37 | archive="$dir/flyscrape_$target.tar.gz"
38 | url="https://github.com/philippta/flyscrape/releases/latest/download/flyscrape_0.9.0_$target.tar.gz"
39 | curl --fail --location --progress-bar --output "$archive" "$url" ||
40 | error "Failed to download flyscrape from: $url"
41 |
42 | tar -xzf "$archive" -C "$dir" ||
43 | error "Failed to extract downloaded archive."
44 |
45 | chmod +x "$dir/flyscrape" ||
46 | error "Failed to chmod the flyscrape executable."
47 |
48 | rm "$archive" "$dir/README.md" "$dir/LICENSE" ||
49 | error "Failed to clean up the downloaded archive."
50 |
51 | case $(basename "$SHELL") in
52 | zsh)
53 | # Add paths to zsh
54 | if [[ ":$PATH:" != *":$HOME/.flyscrape:"* ]]; then
55 | if [[ -w "$HOME/.zshrc" ]]; then
56 | echo "# flyscrape" >> "$HOME/.zshrc"
57 | echo "export PATH=\"$dir:\$PATH\"" >> "$HOME/.zshrc"
58 | else
59 | echo ""
60 | echo "Manually add the directory to ~/.zshrc (or similar):"
61 | echo " export PATH=\"$dir:\$PATH\""
62 | fi
63 | fi
64 | ;;
65 | bash)
66 | # Add paths to bash
67 | if [[ ":$PATH:" != *":$HOME/.flyscrape:"* ]]; then
68 | if [[ -w "$HOME/.bashrc" ]]; then
69 | echo "# flyscrape" >> "$HOME/.bashrc"
70 | echo "export PATH=$dir:\$PATH" >> "$HOME/.bashrc"
71 | else
72 | echo ""
73 | echo "Manually add the directory to ~/.bashrc (or similar):"
74 | echo " export PATH=$dir:\$PATH"
75 | fi
76 | fi
77 | ;;
78 | *)
79 | echo ""
80 | echo "Manually add the directory to ~/.bashrc (or similar):"
81 | echo " export PATH=$dir:\$PATH"
82 | ;;
83 | esac
84 |
85 | echo ""
86 | echo "The installation was successfull!"
87 | echo ""
88 | echo "Note:"
89 | echo "Please restart your terminal window. This ensures your system correctly detects flyscrape."
90 | echo ""
91 |
--------------------------------------------------------------------------------
/js.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape
6 |
7 | import (
8 | _ "embed"
9 | "encoding/json"
10 | "errors"
11 | "fmt"
12 | "log"
13 | "net/url"
14 | "strings"
15 | "sync"
16 |
17 | "github.com/PuerkitoBio/goquery"
18 | "github.com/dop251/goja"
19 | "github.com/dop251/goja_nodejs/console"
20 | "github.com/dop251/goja_nodejs/require"
21 | "github.com/evanw/esbuild/pkg/api"
22 | )
23 |
24 | //go:embed template.js
25 | var ScriptTemplate []byte
26 |
27 | type Config []byte
28 |
29 | type ScrapeParams struct {
30 | HTML string
31 | URL string
32 | Process func(url string) ([]byte, error)
33 | Follow func(url string)
34 | }
35 |
36 | type ScrapeFunc func(ScrapeParams) (any, error)
37 |
38 | type TransformError struct {
39 | Line int
40 | Column int
41 | Text string
42 | }
43 |
44 | func (err TransformError) Error() string {
45 | return fmt.Sprintf("%d:%d: %s", err.Line, err.Column, err.Text)
46 | }
47 |
48 | type Exports map[string]any
49 |
50 | func (e Exports) Config() []byte {
51 | b, _ := json.Marshal(e["config"])
52 | return b
53 | }
54 |
55 | func (e Exports) Scrape(p ScrapeParams) (any, error) {
56 | fn := e["__scrape"].(ScrapeFunc)
57 | return fn(p)
58 | }
59 |
60 | type Imports map[string]map[string]any
61 |
62 | func Compile(src string, imports Imports) (Exports, error) {
63 | src, err := build(src)
64 | if err != nil {
65 | return nil, err
66 | }
67 | return vm(src, imports)
68 | }
69 |
70 | func build(src string) (string, error) {
71 | res := api.Build(api.BuildOptions{
72 | Loader: map[string]api.Loader{
73 | ".txt": api.LoaderText,
74 | ".json": api.LoaderJSON,
75 | },
76 | Bundle: true,
77 | Stdin: &api.StdinOptions{
78 | Contents: src,
79 | ResolveDir: ".",
80 | },
81 | Platform: api.PlatformNode,
82 | Format: api.FormatCommonJS,
83 | External: []string{"flyscrape"},
84 | })
85 |
86 | var errs []error
87 | for _, msg := range res.Errors {
88 | err := TransformError{Text: msg.Text}
89 | if msg.Location != nil {
90 | err.Line = msg.Location.Line
91 | err.Column = msg.Location.Column
92 | }
93 | errs = append(errs, err)
94 | }
95 | if len(res.Errors) > 0 {
96 | return "", errors.Join(errs...)
97 | }
98 | if len(res.OutputFiles) == 0 {
99 | return "", errors.New("no output generated")
100 | }
101 |
102 | return string(res.OutputFiles[0].Contents), nil
103 | }
104 |
105 | func vm(src string, imports Imports) (Exports, error) {
106 | vm := goja.New()
107 | registry := &require.Registry{}
108 |
109 | registry.Enable(vm)
110 | console.Enable(vm)
111 |
112 | for module, pkg := range imports {
113 | pkg := pkg
114 | registry.RegisterNativeModule(module, func(vm *goja.Runtime, o *goja.Object) {
115 | exports := vm.NewObject()
116 |
117 | for ident, val := range pkg {
118 | exports.Set(ident, val)
119 | }
120 |
121 | o.Set("exports", exports)
122 | })
123 | }
124 |
125 | if _, err := vm.RunString("module = {}"); err != nil {
126 | return nil, fmt.Errorf("defining module: %w", err)
127 | }
128 | if _, err := vm.RunString(src); err != nil {
129 | return nil, fmt.Errorf("running user script: %w", err)
130 | }
131 |
132 | v, err := vm.RunString("module.exports")
133 | if err != nil {
134 | return nil, fmt.Errorf("reading config: %w", err)
135 | }
136 |
137 | exports := Exports{}
138 | if goja.IsUndefined(v) {
139 | return exports, nil
140 | }
141 |
142 | obj := v.ToObject(vm)
143 | for _, key := range obj.Keys() {
144 | exports[key] = obj.Get(key).Export()
145 | }
146 |
147 | exports["__scrape"], err = scrape(vm)
148 | if err != nil {
149 | return nil, err
150 | }
151 |
152 | return exports, nil
153 | }
154 |
155 | func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
156 | var lock sync.Mutex
157 |
158 | if v, err := vm.RunString("module.exports.default"); err != nil || goja.IsUndefined(v) {
159 | return nil, errors.New("default export is not defined")
160 | }
161 |
162 | defaultfn, err := vm.RunString("(o) => JSON.stringify(module.exports.default(o))")
163 | if err != nil {
164 | return nil, fmt.Errorf("failed to create scrape function: %w", err)
165 | }
166 |
167 | scrapefn, ok := defaultfn.Export().(func(goja.FunctionCall) goja.Value)
168 | if !ok {
169 | return nil, errors.New("failed to export scrape function")
170 | }
171 |
172 | var newArg func(p ScrapeParams) (*goja.Object, error)
173 | newArg = func(p ScrapeParams) (*goja.Object, error) {
174 | doc, err := DocumentFromString(p.HTML)
175 | if err != nil {
176 | return nil, err
177 | }
178 |
179 | baseurl, err := url.Parse(p.URL)
180 | if err != nil {
181 | return nil, err
182 | }
183 |
184 | absoluteURL := func(ref string) string {
185 | abs, err := baseurl.Parse(ref)
186 | if err != nil {
187 | return ref
188 | }
189 | return abs.String()
190 | }
191 |
192 | o := vm.NewObject()
193 | o.Set("url", p.URL)
194 | o.Set("doc", doc)
195 | o.Set("absoluteURL", absoluteURL)
196 | o.Set("scrape", func(url string, f func(goja.FunctionCall) goja.Value) goja.Value {
197 | url = absoluteURL(url)
198 |
199 | html, err := p.Process(url)
200 | if err != nil {
201 | return vm.ToValue(map[string]any{"error": err.Error()})
202 | }
203 |
204 | newp := ScrapeParams{
205 | HTML: string(html),
206 | URL: url,
207 | Process: p.Process,
208 | }
209 |
210 | arg, err := newArg(newp)
211 | if err != nil {
212 | return vm.ToValue(map[string]any{"error": err.Error()})
213 | }
214 |
215 | return f(goja.FunctionCall{Arguments: []goja.Value{arg}})
216 | })
217 | o.Set("follow", func(url string) {
218 | p.Follow(absoluteURL(url))
219 | })
220 |
221 | return o, nil
222 | }
223 |
224 | return func(p ScrapeParams) (any, error) {
225 | lock.Lock()
226 | defer lock.Unlock()
227 |
228 | arg, err := newArg(p)
229 | if err != nil {
230 | return nil, err
231 | }
232 |
233 | ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{arg}})
234 | if goja.IsUndefined(ret) {
235 | return nil, nil
236 | }
237 |
238 | var result any
239 | if err := json.Unmarshal([]byte(ret.String()), &result); err != nil {
240 | log.Println(err)
241 | return nil, err
242 | }
243 |
244 | return result, nil
245 | }, nil
246 | }
247 |
248 | func DocumentFromString(s string) (map[string]any, error) {
249 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(s))
250 | if err != nil {
251 | return nil, err
252 | }
253 |
254 | return Document(doc.Selection), nil
255 | }
256 |
257 | func Document(sel *goquery.Selection) map[string]any {
258 | o := map[string]any{}
259 | o["WARNING"] = "Forgot to call text(), html() or attr()?"
260 | o["text"] = sel.Text
261 | o["name"] = func() string { return sel.Get(0).Data }
262 | o["html"] = func() string { h, _ := goquery.OuterHtml(sel); return h }
263 | o["attr"] = func(name string) string { v, _ := sel.Attr(name); return v }
264 | o["hasAttr"] = func(name string) bool { _, ok := sel.Attr(name); return ok }
265 | o["hasClass"] = sel.HasClass
266 | o["length"] = sel.Length()
267 | o["first"] = func() map[string]any { return Document(sel.First()) }
268 | o["last"] = func() map[string]any { return Document(sel.Last()) }
269 | o["get"] = func(index int) map[string]any { return Document(sel.Eq(index)) }
270 | o["find"] = func(s string) map[string]any { return Document(sel.Find(s)) }
271 | o["next"] = func() map[string]any { return Document(sel.Next()) }
272 | o["nextAll"] = func() map[string]any { return Document(sel.NextAll()) }
273 | o["nextUntil"] = func(s string) map[string]any { return Document(sel.NextUntil(s)) }
274 | o["prev"] = func() map[string]any { return Document(sel.Prev()) }
275 | o["prevAll"] = func() map[string]any { return Document(sel.PrevAll()) }
276 | o["prevUntil"] = func(s string) map[string]any { return Document(sel.PrevUntil(s)) }
277 | o["siblings"] = func() map[string]any { return Document(sel.Siblings()) }
278 | o["children"] = func() map[string]any { return Document(sel.Children()) }
279 | o["parent"] = func() map[string]any { return Document(sel.Parent()) }
280 | o["map"] = func(callback func(map[string]any, int) any) []any {
281 | var vals []any
282 | sel.Map(func(i int, s *goquery.Selection) string {
283 | vals = append(vals, callback(Document(s), i))
284 | return ""
285 | })
286 | return vals
287 | }
288 | o["filter"] = func(callback func(map[string]any, int) bool) []any {
289 | var vals []any
290 | sel.Each(func(i int, s *goquery.Selection) {
291 | el := Document(s)
292 | ok := callback(el, i)
293 | if ok {
294 | vals = append(vals, el)
295 | }
296 | })
297 | return vals
298 | }
299 | return o
300 | }
301 |
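Document exposes a goquery selection to the JavaScript runtime as a plain map of closures. A minimal sketch, assuming standalone use from Go outside the VM, of how those closures map onto the JS expression doc.find("h1").text():

package main

import (
	"fmt"
	"log"

	"github.com/philippta/flyscrape"
)

func main() {
	doc, err := flyscrape.DocumentFromString(`<html><body><h1>headline</h1></body></html>`)
	if err != nil {
		log.Fatal(err)
	}

	// doc.find("h1").text() in a script resolves to these two closures.
	find := doc["find"].(func(string) map[string]any)
	text := find("h1")["text"].(func() string)
	fmt.Println(text()) // headline
}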
--------------------------------------------------------------------------------
/js_lib.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape
6 |
7 | import (
8 | "bytes"
9 | "encoding/json"
10 | "fmt"
11 | "io"
12 | "log"
13 | "mime"
14 | "net/http"
15 | gourl "net/url"
16 | "os"
17 | "path/filepath"
18 | "strings"
19 |
20 | "golang.org/x/sync/errgroup"
21 | )
22 |
23 | func NewJSLibrary(client *http.Client) (imports Imports, wait func()) {
24 | downloads := &errgroup.Group{}
25 |
26 | // Allow 5 parallel downloads. Why 5?
27 | // Docker downloads 3 layers in parallel.
28 | // My Chrome downloads up to 6 files in parallel.
29 | // 5 feels like a reasonable number.
30 | downloads.SetLimit(5)
31 |
32 | im := Imports{
33 | "flyscrape": map[string]any{
34 | "parse": jsParse(),
35 | },
36 | "flyscrape/http": map[string]any{
37 | "get": jsHTTPGet(client),
38 | "postForm": jsHTTPPostForm(client),
39 | "postJSON": jsHTTPPostJSON(client),
40 | "download": jsHTTPDownload(client, downloads),
41 | },
42 | }
43 |
44 | return im, func() { downloads.Wait() }
45 | }
46 |
47 | func jsParse() func(html string) map[string]any {
48 | return func(html string) map[string]any {
49 | doc, err := DocumentFromString(html)
50 | if err != nil {
51 | return nil
52 | }
53 | return doc
54 | }
55 | }
56 |
57 | func jsHTTPGet(client *http.Client) func(url string) map[string]any {
58 | return func(url string) map[string]any {
59 | req, err := http.NewRequest("GET", url, nil)
60 | if err != nil {
61 | return map[string]any{"error": err.Error()}
62 | }
63 | return jsFetch(client, req)
64 | }
65 | }
66 |
67 | func jsHTTPPostForm(client *http.Client) func(url string, form map[string]any) map[string]any {
68 | return func(url string, form map[string]any) map[string]any {
69 | vals := gourl.Values{}
70 | for k, v := range form {
71 | switch v := v.(type) {
72 | case []any:
73 | for _, v := range v {
74 | vals.Add(k, fmt.Sprintf("%v", v))
75 | }
76 | default:
77 | vals.Add(k, fmt.Sprintf("%v", v))
78 | }
79 | }
80 |
81 | req, err := http.NewRequest("POST", url, strings.NewReader(vals.Encode()))
82 | if err != nil {
83 | return map[string]any{"error": err.Error()}
84 | }
85 | req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
86 |
87 | return jsFetch(client, req)
88 | }
89 | }
90 |
91 | func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]any {
92 | return func(url string, data any) map[string]any {
93 | b, _ := json.Marshal(data)
94 |
95 | req, err := http.NewRequest("POST", url, bytes.NewReader(b))
96 | if err != nil {
97 | return map[string]any{"error": err.Error()}
98 | }
99 | req.Header.Set("Content-Type", "application/json")
100 |
101 | return jsFetch(client, req)
102 | }
103 | }
104 |
105 | func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) {
106 | fileExists := func(name string) bool {
107 | _, err := os.Stat(name)
108 | return err == nil
109 | }
110 |
111 | isDir := func(path string) bool {
112 | if strings.HasSuffix(path, "/") {
113 | return true
114 | }
115 | if filepath.Ext(path) == "" {
116 | return true
117 | }
118 | s, err := os.Stat(path)
119 | return err == nil && s.IsDir()
120 | }
121 |
122 | suggestedFilename := func(url, contentDisp string) string {
123 | filename := filepath.Base(url)
124 |
125 | if contentDisp == "" {
126 | return filename
127 | }
128 |
129 | _, params, err := mime.ParseMediaType(contentDisp)
130 | if err != nil {
131 | return filename
132 | }
133 |
134 | name, ok := params["filename"]
135 | if !ok || name == "" {
136 | return filename
137 | }
138 |
139 | return filepath.Base(name)
140 | }
141 |
142 | return func(url string, dst string) {
143 | g.Go(func() error {
144 | req, err := http.NewRequest("GET", url, nil)
145 | if err != nil {
146 | log.Printf("error downloading file %q: %v", url, err)
147 | return nil
148 | }
149 | req.Header.Add(HeaderBypassCache, "true")
150 |
151 | resp, err := client.Do(req)
152 | if err != nil {
153 | log.Printf("error downloading file %q: %v", url, err)
154 | return nil
155 | }
156 | defer resp.Body.Close()
157 |
158 | if resp.StatusCode < 200 || resp.StatusCode >= 300 {
159 | log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode)
160 | return nil
161 | }
162 |
163 | dst, err = filepath.Abs(dst)
164 | if err != nil {
165 | log.Printf("error downloading file %q: abs path failed: %v", url, err)
166 | return nil
167 | }
168 |
169 | if isDir(dst) {
170 | name := suggestedFilename(url, resp.Header.Get("Content-Disposition"))
171 | dst = filepath.Join(dst, name)
172 | }
173 |
174 | if fileExists(dst) {
175 | return nil
176 | }
177 |
178 | os.MkdirAll(filepath.Dir(dst), 0o755)
179 | f, err := os.Create(dst)
180 | if err != nil {
181 | log.Printf("error downloading file %q: file save failed: %v", url, err)
182 | return nil
183 | }
184 | defer f.Close()
185 |
186 | io.Copy(f, resp.Body)
187 | return nil
188 | })
189 | }
190 | }
191 |
192 | func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) {
193 | obj = map[string]any{
194 | "body": "",
195 | "status": 0,
196 | "headers": map[string]any{},
197 | "error": "",
198 | }
199 |
200 | resp, err := client.Do(req)
201 | if err != nil {
202 | obj["error"] = err.Error()
203 | return
204 | }
205 | defer resp.Body.Close()
206 |
207 | obj["status"] = resp.StatusCode
208 |
209 | b, err := io.ReadAll(resp.Body)
210 | if err != nil {
211 | obj["error"] = err.Error()
212 | return
213 | }
214 |
215 | obj["body"] = string(b)
216 |
217 | headers := map[string]any{}
218 | for name := range resp.Header {
219 | headers[name] = resp.Header.Get(name)
220 | }
221 | obj["headers"] = headers
222 |
223 | return
224 | }
225 |
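The imports returned by NewJSLibrary back the "flyscrape" and "flyscrape/http" modules that scripts can import, and wait blocks until queued downloads have finished. A minimal wiring sketch; the script contents and URLs are illustrative only.

package main

import (
	"log"
	"net/http"

	"github.com/philippta/flyscrape"
)

func main() {
	// Illustrative script: queues one download via the flyscrape/http module.
	script := `
		import http from "flyscrape/http";
		http.download("https://example.com/logo.png", "downloads/");
		export default function () {}
	`

	client := &http.Client{}
	imports, wait := flyscrape.NewJSLibrary(client)
	defer wait() // block until queued http.download() calls complete

	if _, err := flyscrape.Compile(script, imports); err != nil {
		log.Fatal(err)
	}
}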
--------------------------------------------------------------------------------
/js_lib_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape_test
6 |
7 | import (
8 | "encoding/json"
9 | "net/http"
10 | "os"
11 | "sync/atomic"
12 | "testing"
13 |
14 | "github.com/philippta/flyscrape"
15 | "github.com/stretchr/testify/require"
16 | )
17 |
18 | func TestJSLibParse(t *testing.T) {
19 | script := `
20 | import { parse } from "flyscrape"
21 |
22 | const doc = parse('<div class="foo">Hello world</div>')
23 | export const text = doc.find(".foo").text()
24 |
25 | export default function () {}
26 | `
27 |
28 | client := &http.Client{
29 | Transport: flyscrape.MockTransport(200, html),
30 | }
31 |
32 | imports, _ := flyscrape.NewJSLibrary(client)
33 | exports, err := flyscrape.Compile(script, imports)
34 | require.NoError(t, err)
35 |
36 | h, ok := exports["text"].(string)
37 | require.True(t, ok)
38 | require.Equal(t, "Hello world", h)
39 | }
40 |
41 | func TestJSLibHTTPGet(t *testing.T) {
42 | script := `
43 | import http from "flyscrape/http"
44 |
45 | const res = http.get("https://example.com")
46 |
47 | export const body = res.body;
48 | export const status = res.status;
49 | export const error = res.error;
50 | export const headers = res.headers;
51 |
52 | export default function () {}
53 | `
54 |
55 | client := &http.Client{
56 | Transport: flyscrape.MockTransport(200, html),
57 | }
58 |
59 | imports, _ := flyscrape.NewJSLibrary(client)
60 | exports, err := flyscrape.Compile(script, imports)
61 | require.NoError(t, err)
62 |
63 | body, ok := exports["body"].(string)
64 | require.True(t, ok)
65 | require.Equal(t, html, body)
66 |
67 | status, ok := exports["status"].(int64)
68 | require.True(t, ok)
69 | require.Equal(t, int64(200), status)
70 |
71 | error, ok := exports["error"].(string)
72 | require.True(t, ok)
73 | require.Equal(t, "", error)
74 |
75 | headers, ok := exports["headers"].(map[string]any)
76 | require.True(t, ok)
77 | require.NotEmpty(t, headers)
78 | }
79 |
80 | func TestJSLibHTTPPostForm(t *testing.T) {
81 | script := `
82 | import http from "flyscrape/http"
83 |
84 | const res = http.postForm("https://example.com", {
85 | username: "foo",
86 | password: "bar",
87 | arr: [1,2,3],
88 | })
89 |
90 | export const body = res.body;
91 | export const status = res.status;
92 | export const error = res.error;
93 | export const headers = res.headers;
94 |
95 | export default function () {}
96 | `
97 |
98 | client := &http.Client{
99 | Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
100 | require.Equal(t, "POST", r.Method)
101 | require.Equal(t, "application/x-www-form-urlencoded", r.Header.Get("Content-Type"))
102 | require.Equal(t, "foo", r.FormValue("username"))
103 | require.Equal(t, "bar", r.FormValue("password"))
104 | require.Len(t, r.Form["arr"], 3)
105 |
106 | return flyscrape.MockResponse(400, "Bad Request")
107 | }),
108 | }
109 |
110 | imports, _ := flyscrape.NewJSLibrary(client)
111 | exports, err := flyscrape.Compile(script, imports)
112 | require.NoError(t, err)
113 |
114 | body, ok := exports["body"].(string)
115 | require.True(t, ok)
116 | require.Equal(t, "Bad Request", body)
117 |
118 | status, ok := exports["status"].(int64)
119 | require.True(t, ok)
120 | require.Equal(t, int64(400), status)
121 |
122 | error, ok := exports["error"].(string)
123 | require.True(t, ok)
124 | require.Equal(t, "", error)
125 |
126 | headers, ok := exports["headers"].(map[string]any)
127 | require.True(t, ok)
128 | require.NotEmpty(t, headers)
129 | }
130 |
131 | func TestJSLibHTTPPostJSON(t *testing.T) {
132 | script := `
133 | import http from "flyscrape/http"
134 |
135 | const res = http.postJSON("https://example.com", {
136 | username: "foo",
137 | password: "bar",
138 | })
139 |
140 | export const body = res.body;
141 | export const status = res.status;
142 | export const error = res.error;
143 | export const headers = res.headers;
144 |
145 | export default function () {}
146 | `
147 |
148 | client := &http.Client{
149 | Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
150 | require.Equal(t, "POST", r.Method)
151 | require.Equal(t, "application/json", r.Header.Get("Content-Type"))
152 |
153 | m := map[string]any{}
154 | json.NewDecoder(r.Body).Decode(&m)
155 | require.Equal(t, "foo", m["username"])
156 | require.Equal(t, "bar", m["password"])
157 |
158 | return flyscrape.MockResponse(400, "Bad Request")
159 | }),
160 | }
161 |
162 | imports, _ := flyscrape.NewJSLibrary(client)
163 | exports, err := flyscrape.Compile(script, imports)
164 | require.NoError(t, err)
165 |
166 | body, ok := exports["body"].(string)
167 | require.True(t, ok)
168 | require.Equal(t, "Bad Request", body)
169 |
170 | status, ok := exports["status"].(int64)
171 | require.True(t, ok)
172 | require.Equal(t, int64(400), status)
173 |
174 | error, ok := exports["error"].(string)
175 | require.True(t, ok)
176 | require.Equal(t, "", error)
177 |
178 | headers, ok := exports["headers"].(map[string]any)
179 | require.True(t, ok)
180 | require.NotEmpty(t, headers)
181 | }
182 |
183 | func TestJSLibHTTPDownload(t *testing.T) {
184 | cwd, err := os.Getwd()
185 | require.NoError(t, err)
186 |
187 | tmpdir, err := os.MkdirTemp("", "http-download")
188 | require.NoError(t, err)
189 |
190 | defer os.RemoveAll(tmpdir)
191 | defer os.Chdir(cwd)
192 | os.Chdir(tmpdir)
193 |
194 | script := `
195 | import http from "flyscrape/http";
196 |
197 | http.download("https://example.com/foo.txt", "foo.txt");
198 | http.download("https://example.com/foo.txt", "dir/my-foo.txt");
199 | http.download("https://example.com/bar.txt", "dir/");
200 | http.download("https://example.com/baz.txt", "dir");
201 | http.download("https://example.com/content-disposition", ".");
202 | http.download("https://example.com/hack.txt", ".");
203 | http.download("https://example.com/no-dest.txt");
204 | http.download("https://example.com/404.txt");
205 | `
206 |
207 | var nreqs atomic.Int32
208 | client := &http.Client{
209 | Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
210 | nreqs.Add(1)
211 |
212 | if r.URL.Path == "/content-disposition" {
213 | resp, err := flyscrape.MockResponse(200, "hello world")
214 | resp.Header.Set("Content-Disposition", `attachment; filename="qux.txt"`)
215 | return resp, err
216 | }
217 | if r.URL.Path == "/hack.txt" {
218 | resp, err := flyscrape.MockResponse(200, "hello world")
219 | resp.Header.Set("Content-Disposition", `attachment; filename="../../hack.txt"`)
220 | return resp, err
221 | }
222 | if r.URL.Path == "/404.txt" {
223 | resp, err := flyscrape.MockResponse(404, "hello world")
224 | return resp, err
225 | }
226 |
227 | return flyscrape.MockResponse(200, "hello world")
228 | }),
229 | }
230 |
231 | imports, wait := flyscrape.NewJSLibrary(client)
232 | _, err = flyscrape.Compile(script, imports)
233 | require.NoError(t, err)
234 |
235 | wait()
236 |
237 | require.Equal(t, nreqs.Load(), int32(8))
238 | require.FileExists(t, "foo.txt")
239 | require.FileExists(t, "dir/my-foo.txt")
240 | require.FileExists(t, "dir/bar.txt")
241 | require.FileExists(t, "dir/baz.txt")
242 | require.FileExists(t, "qux.txt")
243 | require.FileExists(t, "hack.txt")
244 | require.FileExists(t, "no-dest.txt")
245 | require.NoFileExists(t, "404.txt")
246 | }
247 |
--------------------------------------------------------------------------------
/js_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape_test
6 |
7 | import (
8 | "encoding/json"
9 | "testing"
10 |
11 | "github.com/dop251/goja"
12 | "github.com/philippta/flyscrape"
13 | "github.com/stretchr/testify/require"
14 | )
15 |
16 | var html = `
17 | <html>
18 | <head></head>
19 | <body>
20 | <h1>headline</h1>
21 | <p>paragraph</p>
22 | </body>
23 | </html>
24 | `
25 |
26 | var script = `
27 | export const config = {
28 | url: "https://localhost/",
29 | }
30 |
31 | export default function({ doc, url }) {
32 | return {
33 | headline: doc.find("h1").text(),
34 | body: doc.find("p").text(),
35 | url: url,
36 | }
37 | }
38 | `
39 |
40 | func TestJSScrape(t *testing.T) {
41 | exports, err := flyscrape.Compile(script, nil)
42 | require.NoError(t, err)
43 | require.NotNil(t, exports)
44 | require.NotEmpty(t, exports.Config())
45 |
46 | result, err := exports.Scrape(flyscrape.ScrapeParams{
47 | HTML: html,
48 | URL: "http://localhost/",
49 | })
50 |
51 | require.NoError(t, err)
52 |
53 | m, ok := result.(map[string]any)
54 | require.True(t, ok)
55 | require.Equal(t, "headline", m["headline"])
56 | require.Equal(t, "paragraph", m["body"])
57 | require.Equal(t, "http://localhost/", m["url"])
58 | }
59 |
60 | func TestJSScrapeObject(t *testing.T) {
61 | js := `
62 | export default function() {
63 | return {foo: "bar"}
64 | }
65 | `
66 | exports, err := flyscrape.Compile(js, nil)
67 | require.NoError(t, err)
68 |
69 | result, err := exports.Scrape(flyscrape.ScrapeParams{
70 | HTML: html,
71 | URL: "http://localhost/",
72 | })
73 | require.NoError(t, err)
74 |
75 | m, ok := result.(map[string]any)
76 | require.True(t, ok)
77 | require.Equal(t, "bar", m["foo"])
78 | }
79 |
80 | func TestJSScrapeNull(t *testing.T) {
81 | js := `
82 | export default function() {
83 | return null
84 | }
85 | `
86 | exports, err := flyscrape.Compile(js, nil)
87 | require.NoError(t, err)
88 |
89 | result, err := exports.Scrape(flyscrape.ScrapeParams{
90 | HTML: html,
91 | URL: "http://localhost/",
92 | })
93 | require.NoError(t, err)
94 | require.Nil(t, result)
95 | }
96 |
97 | func TestJSScrapeUndefined(t *testing.T) {
98 | js := `
99 | export default function() { }
100 | `
101 | exports, err := flyscrape.Compile(js, nil)
102 | require.NoError(t, err)
103 |
104 | result, err := exports.Scrape(flyscrape.ScrapeParams{
105 | HTML: html,
106 | URL: "http://localhost/",
107 | })
108 | require.NoError(t, err)
109 | require.Nil(t, result)
110 | }
111 |
112 | func TestJSScrapeString(t *testing.T) {
113 | js := `
114 | export default function() {
115 | return "foo"
116 | }
117 | `
118 | exports, err := flyscrape.Compile(js, nil)
119 | require.NoError(t, err)
120 |
121 | result, err := exports.Scrape(flyscrape.ScrapeParams{
122 | HTML: html,
123 | URL: "http://localhost/",
124 | })
125 | require.NoError(t, err)
126 |
127 | m, ok := result.(string)
128 | require.True(t, ok)
129 | require.Equal(t, "foo", m)
130 | }
131 |
132 | func TestJSScrapeArray(t *testing.T) {
133 | js := `
134 | export default function() {
135 | return [1,2,3]
136 | }
137 | `
138 | exports, err := flyscrape.Compile(js, nil)
139 | require.NoError(t, err)
140 |
141 | result, err := exports.Scrape(flyscrape.ScrapeParams{
142 | HTML: html,
143 | URL: "http://localhost/",
144 | })
145 | require.NoError(t, err)
146 |
147 | m, ok := result.([]any)
148 | require.True(t, ok)
149 | require.Equal(t, float64(1), m[0])
150 | require.Equal(t, float64(2), m[1])
151 | require.Equal(t, float64(3), m[2])
152 | }
153 |
154 | func TestJSScrapeNaN(t *testing.T) {
155 | js := `
156 | export default function() {
157 | return NaN
158 | }
159 | `
160 | exports, err := flyscrape.Compile(js, nil)
161 | require.NoError(t, err)
162 |
163 | result, err := exports.Scrape(flyscrape.ScrapeParams{
164 | HTML: html,
165 | URL: "http://localhost/",
166 | })
167 | require.NoError(t, err)
168 | require.Nil(t, result)
169 | }
170 |
171 | func TestJSScrapeParamURL(t *testing.T) {
172 | js := `
173 | export default function({ url }) {
174 | return url;
175 | }
176 | `
177 | exports, err := flyscrape.Compile(js, nil)
178 | require.NoError(t, err)
179 |
180 | result, err := exports.Scrape(flyscrape.ScrapeParams{
181 | HTML: html,
182 | URL: "http://localhost/",
183 | })
184 | require.NoError(t, err)
185 | require.Equal(t, "http://localhost/", result)
186 | }
187 |
188 | func TestJSScrapeParamAbsoluteURL(t *testing.T) {
189 | js := `
190 | export default function({ absoluteURL }) {
191 | return absoluteURL("/foo");
192 | }
193 | `
194 | exports, err := flyscrape.Compile(js, nil)
195 | require.NoError(t, err)
196 |
197 | result, err := exports.Scrape(flyscrape.ScrapeParams{
198 | HTML: html,
199 | URL: "http://localhost/",
200 | })
201 | require.NoError(t, err)
202 | require.Equal(t, "http://localhost/foo", result)
203 | }
204 |
205 | func TestJSScrapeParamScrape(t *testing.T) {
206 | js := `
207 | export default function({ scrape }) {
208 | return scrape("/foo", function({ url }) {
209 | return {
210 | url: url,
211 | foo: "bar",
212 | };
213 | });
214 | }
215 | `
216 | exports, err := flyscrape.Compile(js, nil)
217 | require.NoError(t, err)
218 |
219 | result, err := exports.Scrape(flyscrape.ScrapeParams{
220 | HTML: html,
221 | URL: "http://localhost/",
222 | Process: func(url string) ([]byte, error) {
223 | return nil, nil
224 | },
225 | })
226 | require.NoError(t, err)
227 | require.Equal(t, map[string]any{
228 | "url": "http://localhost/foo",
229 | "foo": "bar",
230 | }, result)
231 | }
232 |
233 | func TestJSScrapeParamScrapeDeep(t *testing.T) {
234 | js := `
235 | export default function({ scrape }) {
236 | return scrape("/foo/", function({ url, scrape }) {
237 | return {
238 | url: url,
239 | deep: scrape("bar", function({ url }) {
240 | return url;
241 | }),
242 | };
243 | });
244 | }
245 | `
246 | exports, err := flyscrape.Compile(js, nil)
247 | require.NoError(t, err)
248 |
249 | result, err := exports.Scrape(flyscrape.ScrapeParams{
250 | HTML: html,
251 | URL: "http://localhost/",
252 | Process: func(url string) ([]byte, error) {
253 | return nil, nil
254 | },
255 | })
256 | require.NoError(t, err)
257 | require.Equal(t, map[string]any{
258 | "url": "http://localhost/foo/",
259 | "deep": "http://localhost/foo/bar",
260 | }, result)
261 | }
262 |
263 | func TestJSScrapeParamFollow(t *testing.T) {
264 | js := `
265 | export default function({ follow }) {
266 | follow("/foo")
267 | }
268 | `
269 | exports, err := flyscrape.Compile(js, nil)
270 | require.NoError(t, err)
271 |
272 | var followedURL string
273 | _, err = exports.Scrape(flyscrape.ScrapeParams{
274 | HTML: html,
275 | URL: "http://localhost/",
276 | Follow: func(url string) {
277 | followedURL = url
278 | },
279 | })
280 | require.NoError(t, err)
281 | require.Equal(t, "http://localhost/foo", followedURL)
282 | }
283 |
284 | func TestJSCompileError(t *testing.T) {
285 | exports, err := flyscrape.Compile("import foo;", nil)
286 | require.Error(t, err)
287 | require.Nil(t, exports)
288 |
289 | var terr flyscrape.TransformError
290 | require.ErrorAs(t, err, &terr)
291 |
292 | require.Equal(t, terr, flyscrape.TransformError{
293 | Line: 1,
294 | Column: 10,
295 | Text: `Expected "from" but found ";"`,
296 | })
297 | }
298 |
299 | func TestJSConfig(t *testing.T) {
300 | js := `
301 | export const config = {
302 | url: 'http://localhost/',
303 | depth: 5,
304 | allowedDomains: ['example.com'],
305 | }
306 | export default function() {}
307 | `
308 | exports, err := flyscrape.Compile(js, nil)
309 | require.NoError(t, err)
310 | require.NotNil(t, exports)
311 | require.NotEmpty(t, exports.Config())
312 |
313 | type config struct {
314 | URL string `json:"url"`
315 | Depth int `json:"depth"`
316 | AllowedDomains []string `json:"allowedDomains"`
317 | }
318 |
319 | var cfg config
320 | err = json.Unmarshal(exports.Config(), &cfg)
321 | require.NoError(t, err)
322 |
323 | require.Equal(t, config{
324 | URL: "http://localhost/",
325 | Depth: 5,
326 | AllowedDomains: []string{"example.com"},
327 | }, cfg)
328 | }
329 |
330 | func TestJSImports(t *testing.T) {
331 | js := `
332 | import A from "flyscrape"
333 | import { bar } from "flyscrape/foo"
334 |
335 | export const config = {}
336 | export default function() {}
337 |
338 | export const a = A.foo
339 | export const b = bar()
340 | `
341 | imports := flyscrape.Imports{
342 | "flyscrape": map[string]any{
343 | "foo": 10,
344 | },
345 | "flyscrape/foo": map[string]any{
346 | "bar": func() string {
347 | return "baz"
348 | },
349 | },
350 | }
351 |
352 | exports, err := flyscrape.Compile(js, imports)
353 | require.NoError(t, err)
354 | require.NotNil(t, exports)
355 |
356 | require.Equal(t, int64(10), exports["a"].(int64))
357 | require.Equal(t, "baz", exports["b"].(string))
358 | }
359 |
360 | func TestJSArbitraryFunction(t *testing.T) {
361 | js := `
362 | export const config = {}
363 | export default function() {}
364 | export function foo() {
365 | return "bar";
366 | }
367 | `
368 | exports, err := flyscrape.Compile(js, nil)
369 | require.NoError(t, err)
370 | require.NotNil(t, exports)
371 |
372 | foo := func() string {
373 | fn := exports["foo"].(func(goja.FunctionCall) goja.Value)
374 | return fn(goja.FunctionCall{}).String()
375 | }
376 |
377 | require.Equal(t, "bar", foo())
378 | }
379 |
380 | func TestJSArbitraryConstString(t *testing.T) {
381 | js := `
382 | export const config = {}
383 | export default function() {}
384 | export const foo = "bar"
385 | `
386 | exports, err := flyscrape.Compile(js, nil)
387 | require.NoError(t, err)
388 | require.NotNil(t, exports)
389 |
390 | require.Equal(t, "bar", exports["foo"].(string))
391 | }
392 |
--------------------------------------------------------------------------------
/module.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape
6 |
7 | import (
8 | "encoding/json"
9 | "net/http"
10 | "sync"
11 | )
12 |
13 | type Module interface {
14 | ModuleInfo() ModuleInfo
15 | }
16 |
17 | type ModuleInfo struct {
18 | ID string
19 | New func() Module
20 | }
21 |
22 | type TransportAdapter interface {
23 | AdaptTransport(http.RoundTripper) http.RoundTripper
24 | }
25 |
26 | type RequestValidator interface {
27 | ValidateRequest(*Request) bool
28 | }
29 |
30 | type RequestBuilder interface {
31 | BuildRequest(*Request)
32 | }
33 |
34 | type ResponseReceiver interface {
35 | ReceiveResponse(*Response)
36 | }
37 |
38 | type Provisioner interface {
39 | Provision(Context)
40 | }
41 |
42 | type Finalizer interface {
43 | Finalize()
44 | }
45 |
46 | func RegisterModule(mod Module) {
47 | modulesMu.Lock()
48 | defer modulesMu.Unlock()
49 |
50 | id := mod.ModuleInfo().ID
51 | if _, ok := modules[id]; ok {
52 | panic("module with id: " + id + " already registered")
53 | }
54 | modules[mod.ModuleInfo().ID] = mod
55 | }
56 |
57 | func LoadModules(cfg Config) []Module {
58 | modulesMu.RLock()
59 | defer modulesMu.RUnlock()
60 |
61 | loaded := map[string]struct{}{}
62 | mods := []Module{}
63 |
64 | // load standard modules in order
65 | for _, id := range moduleOrder {
66 | if _, ok := loaded[id]; ok {
67 | continue
68 | }
69 | mod := modules[id].ModuleInfo().New()
70 | if err := json.Unmarshal(cfg, mod); err != nil {
71 | panic("failed to decode config: " + err.Error())
72 | }
73 | mods = append(mods, mod)
74 | loaded[id] = struct{}{}
75 | }
76 |
77 | // load custom modules
78 | for id := range modules {
79 | if _, ok := loaded[id]; ok {
80 | continue
81 | }
82 | mod := modules[id].ModuleInfo().New()
83 | if err := json.Unmarshal(cfg, mod); err != nil {
84 | panic("failed to decode config: " + err.Error())
85 | }
86 | mods = append(mods, mod)
87 | loaded[id] = struct{}{}
88 | }
89 |
90 | return mods
91 | }
92 |
93 | var (
94 | modules = map[string]Module{}
95 | modulesMu sync.RWMutex
96 |
97 | moduleOrder = []string{
98 | // Transport adapters must be loaded in a specific order.
99 | // All other modules can be loaded in any order.
100 | "proxy",
101 | "browser",
102 | "retry",
103 | "ratelimit",
104 | "cache",
105 | "cookies",
106 | "headers",
107 | }
108 | )
109 |
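A hypothetical third-party module sketch, assuming only the interfaces above: it registers itself under a new ID, decodes its options from the script's config JSON, and hooks into request building. The "apitoken" ID and Token field are invented for illustration.

package apitoken

import "github.com/philippta/flyscrape"

func init() {
	flyscrape.RegisterModule(Module{})
}

// Module is decoded from the script's config JSON, e.g. {"token": "..."}.
type Module struct {
	Token string `json:"token"`
}

func (Module) ModuleInfo() flyscrape.ModuleInfo {
	return flyscrape.ModuleInfo{
		ID:  "apitoken",
		New: func() flyscrape.Module { return new(Module) },
	}
}

// BuildRequest implements flyscrape.RequestBuilder and runs before each
// request is sent; mutate r here (see the Request type in this package).
func (m *Module) BuildRequest(r *flyscrape.Request) {
	// e.g. attach m.Token to the outgoing request.
}

var _ flyscrape.RequestBuilder = (*Module)(nil)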
--------------------------------------------------------------------------------
/modules/browser/browser.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package browser
6 |
7 | import (
8 | "fmt"
9 | "io"
10 | "log"
11 | "net/http"
12 | "os"
13 | "strings"
14 | "sync"
15 | "time"
16 |
17 | "github.com/go-rod/rod"
18 | "github.com/go-rod/rod/lib/launcher"
19 | "github.com/go-rod/rod/lib/proto"
20 | "github.com/philippta/flyscrape"
21 | )
22 |
23 | func init() {
24 | flyscrape.RegisterModule(Module{})
25 | }
26 |
27 | type Module struct {
28 | Browser bool `json:"browser"`
29 | Headless *bool `json:"headless"`
30 |
31 | browser *rod.Browser
32 | }
33 |
34 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
35 | return flyscrape.ModuleInfo{
36 | ID: "browser",
37 | New: func() flyscrape.Module { return new(Module) },
38 | }
39 | }
40 |
41 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
42 | if !m.Browser {
43 | return t
44 | }
45 |
46 | headless := true
47 | if m.Headless != nil {
48 | headless = *m.Headless
49 | }
50 |
51 | browser, err := newBrowser(headless)
52 | if err != nil {
53 | log.Println(err)
54 | os.Exit(1)
55 | }
56 |
57 | m.browser = browser
58 |
59 | return chromeTransport(browser)
60 | }
61 |
62 | func (m *Module) Finalize() {
63 | if m.browser != nil {
64 | m.browser.Close()
65 | }
66 | }
67 |
68 | func newBrowser(headless bool) (*rod.Browser, error) {
69 | serviceURL, err := launcher.New().
70 | Headless(headless).
71 | Launch()
72 | if err != nil {
73 | return nil, fmt.Errorf("failed to launch browser: %w", err)
74 | }
75 |
76 | browser := rod.New().ControlURL(serviceURL).NoDefaultDevice()
77 | if err := browser.Connect(); err != nil {
78 | return nil, fmt.Errorf("failed to connect to browser: %w", err)
79 | }
80 |
81 | return browser, nil
82 | }
83 |
84 | func chromeTransport(browser *rod.Browser) flyscrape.RoundTripFunc {
85 | return func(r *http.Request) (*http.Response, error) {
86 | select {
87 | case <-r.Context().Done():
88 | return nil, r.Context().Err()
89 | default:
90 | }
91 |
92 | page := browser.MustPage()
93 | defer page.Close()
94 |
95 | var once sync.Once
96 | var networkResponse *proto.NetworkResponse
97 | go page.EachEvent(func(e *proto.NetworkResponseReceived) {
98 | if e.Type != proto.NetworkResourceTypeDocument {
99 | return
100 | }
101 | once.Do(func() {
102 | networkResponse = e.Response
103 | })
104 | })()
105 |
106 | page = page.Context(r.Context())
107 |
108 | for h := range r.Header {
109 | if h == "Cookie" {
110 | continue
111 | }
112 | if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") {
113 | continue
114 | }
115 | page.MustSetExtraHeaders(h, r.Header.Get(h))
116 | }
117 |
118 | page.SetCookies(parseCookies(r))
119 |
120 | if err := page.Navigate(r.URL.String()); err != nil {
121 | return nil, err
122 | }
123 |
124 | timeout := page.Timeout(10 * time.Second)
125 | timeout.WaitLoad()
126 | timeout.WaitDOMStable(300*time.Millisecond, 0)
127 | timeout.WaitRequestIdle(time.Second, nil, nil, nil)
128 |
129 | html, err := page.HTML()
130 | if err != nil {
131 | return nil, err
132 | }
133 |
134 | resp := &http.Response{
135 | StatusCode: 200,
136 | Status: "200 OK",
137 | Body: io.NopCloser(strings.NewReader(html)),
138 | Header: http.Header{"Content-Type": []string{"text/html"}},
139 | }
140 |
141 | if networkResponse != nil {
142 | resp.StatusCode = networkResponse.Status
143 | resp.Status = networkResponse.StatusText
144 | resp.Header = http.Header{}
145 |
146 | for k, v := range networkResponse.Headers {
147 | resp.Header.Set(k, v.String())
148 | }
149 | }
150 |
151 | return resp, err
152 | }
153 | }
154 |
155 | func parseCookies(r *http.Request) []*proto.NetworkCookieParam {
156 | rawCookie := r.Header.Get("Cookie")
157 | if rawCookie == "" {
158 | return nil
159 | }
160 |
161 | header := http.Header{}
162 | header.Add("Cookie", rawCookie)
163 | request := http.Request{Header: header}
164 |
165 | domainSegs := strings.Split(r.URL.Hostname(), ".")
166 | if len(domainSegs) < 2 {
167 | return nil
168 | }
169 |
170 | domain := "." + strings.Join(domainSegs[len(domainSegs)-2:], ".")
171 |
172 | var cookies []*proto.NetworkCookieParam
173 | for _, cookie := range request.Cookies() {
174 | cookies = append(cookies, &proto.NetworkCookieParam{
175 | Name: cookie.Name,
176 | Value: cookie.Value,
177 | Domain: domain,
178 | Path: "/",
179 | Secure: false,
180 | HTTPOnly: false,
181 | SameSite: "Lax",
182 | Expires: -1,
183 | URL: r.URL.String(),
184 | })
185 | }
186 |
187 | return cookies
188 | }
189 |
190 | var (
191 | _ flyscrape.TransportAdapter = &Module{}
192 | _ flyscrape.Finalizer = &Module{}
193 | )
194 |
--------------------------------------------------------------------------------
/modules/browser/browser_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package browser_test
6 |
7 | import (
8 | "fmt"
9 | "net/http"
10 | "net/http/httptest"
11 | "testing"
12 |
13 | "github.com/philippta/flyscrape"
14 | "github.com/philippta/flyscrape/modules/browser"
15 | "github.com/philippta/flyscrape/modules/headers"
16 | "github.com/philippta/flyscrape/modules/hook"
17 | "github.com/philippta/flyscrape/modules/starturl"
18 | "github.com/stretchr/testify/require"
19 | )
20 |
21 | func TestBrowser(t *testing.T) {
22 | t.SkipNow()
23 |
24 | var called bool
25 |
26 | srv := newServer(func(w http.ResponseWriter, r *http.Request) {
27 | called = true
28 | w.Write([]byte(`<html><body>Hello Browser Foo</body></html>`))
29 | })
30 | defer srv.Close()
31 |
32 | var body string
33 |
34 | mods := []flyscrape.Module{
35 | &starturl.Module{URL: srv.URL},
36 | &browser.Module{Browser: true},
37 | &hook.Module{
38 | ReceiveResponseFn: func(r *flyscrape.Response) {
39 | body = string(r.Body)
40 | },
41 | },
42 | }
43 |
44 | scraper := flyscrape.NewScraper()
45 | scraper.Modules = mods
46 | scraper.Run()
47 |
48 | require.True(t, called)
49 | require.Contains(t, body, "Hello Browser")
50 | }
51 | func TestBrowserStatusCode(t *testing.T) {
52 | t.SkipNow()
53 |
54 | srv := newServer(func(w http.ResponseWriter, r *http.Request) {
55 | w.WriteHeader(404)
56 | })
57 | defer srv.Close()
58 |
59 | var statusCode int
60 |
61 | mods := []flyscrape.Module{
62 | &starturl.Module{URL: srv.URL},
63 | &browser.Module{Browser: true},
64 | &hook.Module{
65 | ReceiveResponseFn: func(r *flyscrape.Response) {
66 | statusCode = r.StatusCode
67 | },
68 | },
69 | }
70 |
71 | scraper := flyscrape.NewScraper()
72 | scraper.Modules = mods
73 | scraper.Run()
74 |
75 | require.Equal(t, 404, statusCode)
76 | }
77 |
78 | func TestBrowserRequestHeader(t *testing.T) {
79 | t.SkipNow()
80 |
81 | srv := newServer(func(w http.ResponseWriter, r *http.Request) {
82 | w.Write([]byte(r.Header.Get("User-Agent")))
83 | })
84 | defer srv.Close()
85 |
86 | var body string
87 |
88 | mods := []flyscrape.Module{
89 | &starturl.Module{URL: srv.URL},
90 | &browser.Module{Browser: true},
91 | &headers.Module{
92 | Headers: map[string]string{
93 | "User-Agent": "custom-headers",
94 | },
95 | },
96 | &hook.Module{
97 | ReceiveResponseFn: func(r *flyscrape.Response) {
98 | body = string(r.Body)
99 | },
100 | },
101 | }
102 |
103 | scraper := flyscrape.NewScraper()
104 | scraper.Modules = mods
105 | scraper.Run()
106 |
107 | require.Contains(t, body, "custom-headers")
108 | }
109 |
110 | func TestBrowserResponseHeader(t *testing.T) {
111 | t.SkipNow()
112 |
113 | srv := newServer(func(w http.ResponseWriter, r *http.Request) {
114 | w.Header().Set("Foo", "bar")
115 | })
116 | defer srv.Close()
117 |
118 | var header string
119 |
120 | mods := []flyscrape.Module{
121 | &starturl.Module{URL: srv.URL},
122 | &browser.Module{Browser: true},
123 | &hook.Module{
124 | ReceiveResponseFn: func(r *flyscrape.Response) {
125 | header = r.Headers.Get("Foo")
126 | },
127 | },
128 | }
129 |
130 | scraper := flyscrape.NewScraper()
131 | scraper.Modules = mods
132 | scraper.Run()
133 |
134 | require.Equal(t, header, "bar")
135 | }
136 |
137 | func TestBrowserUnsetFlyscrapeUserAgent(t *testing.T) {
138 | t.SkipNow()
139 |
140 | srv := newServer(func(w http.ResponseWriter, r *http.Request) {
141 | w.Write([]byte(r.Header.Get("User-Agent")))
142 | })
143 | defer srv.Close()
144 |
145 | var body string
146 |
147 | mods := []flyscrape.Module{
148 | &starturl.Module{URL: srv.URL},
149 | &browser.Module{Browser: true},
150 | &hook.Module{
151 | ReceiveResponseFn: func(r *flyscrape.Response) {
152 | body = string(r.Body)
153 | },
154 | },
155 | }
156 |
157 | scraper := flyscrape.NewScraper()
158 | scraper.Modules = mods
159 | scraper.Run()
160 |
161 | fmt.Println(body)
162 | require.Contains(t, body, "Mozilla/5.0")
163 | require.NotContains(t, body, "flyscrape")
164 | }
165 |
166 | func newServer(f func(http.ResponseWriter, *http.Request)) *httptest.Server {
167 | return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
168 | f(w, r)
169 | }))
170 | }
171 |
--------------------------------------------------------------------------------
/modules/cache/boltstore.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cache
6 |
7 | import (
8 | "errors"
9 | "log"
10 | "os"
11 |
12 | "go.etcd.io/bbolt"
13 | )
14 |
15 | var cache = []byte("cache")
16 |
17 | func NewBoltStore(file string) *BoltStore {
18 | db, err := bbolt.Open(file, 0644, nil)
19 | if err != nil {
20 | log.Printf("cache: failed to create database file %q: %v\n", file, err)
21 | os.Exit(1)
22 | }
23 |
24 | c := &BoltStore{db: db}
25 |
26 | return c
27 | }
28 |
29 | type BoltStore struct {
30 | db *bbolt.DB
31 | }
32 |
33 | func (s *BoltStore) Get(key string) ([]byte, bool) {
34 | var value []byte
35 |
36 | err := s.db.View(func(tx *bbolt.Tx) error {
37 | bucket := tx.Bucket(cache)
38 | if bucket == nil {
39 | return errors.New("bucket not found")
40 | }
41 |
42 | v := bucket.Get([]byte(key))
43 | if v == nil {
44 | return errors.New("key not found")
45 | }
46 |
47 | value = make([]byte, len(v))
48 | copy(value, v)
49 |
50 | return nil
51 | })
52 | if err != nil {
53 | return nil, false
54 | }
55 | return value, true
56 | }
57 |
58 | func (s *BoltStore) Set(key string, value []byte) {
59 | err := s.db.Update(func(tx *bbolt.Tx) error {
60 | bucket, err := tx.CreateBucketIfNotExists(cache)
61 | if err != nil {
62 | return err
63 | }
64 |
65 | return bucket.Put([]byte(key), value)
66 | })
67 | if err != nil {
68 | log.Printf("cache: failed to insert cache key %q: %v\n", key, err)
69 | }
70 | }
71 |
72 | func (s *BoltStore) Close() {
73 | s.db.Close()
74 | }
75 |
--------------------------------------------------------------------------------
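A note on usage: NewBoltStore exits the process if the database file cannot be created, and *BoltStore satisfies the Store interface declared at the bottom of cache.go further below. A minimal sketch of using it directly, assuming a placeholder path and a cache key in the same "METHOD URL" format that cache.go uses:

package main

import (
	"fmt"

	"github.com/philippta/flyscrape/modules/cache"
)

func main() {
	store := cache.NewBoltStore("/tmp/example.cache") // placeholder path
	defer store.Close()

	// *BoltStore satisfies the Store interface declared in cache.go.
	var _ cache.Store = store

	// cache.go keys entries by "<method> <url>" and stores the dumped response.
	store.Set("GET http://www.example.com", []byte("HTTP/1.1 200 OK\r\n\r\n"))
	if v, ok := store.Get("GET http://www.example.com"); ok {
		fmt.Printf("cached: %q\n", v)
	}
}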
/modules/cache/boltstore_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cache_test
6 |
7 | import (
8 | "os"
9 | "testing"
10 |
11 | "github.com/philippta/flyscrape/modules/cache"
12 | "github.com/stretchr/testify/require"
13 | )
14 |
15 | func TestBoltStore(t *testing.T) {
16 | dir, err := os.MkdirTemp("", "boltstore")
17 | require.NoError(t, err)
18 | defer os.RemoveAll(dir)
19 |
20 | store := cache.NewBoltStore(dir + "/test.db")
21 |
22 | v, ok := store.Get("foo")
23 | require.Nil(t, v)
24 | require.False(t, ok)
25 |
26 | store.Set("foo", []byte("bar"))
27 |
28 | v, ok = store.Get("foo")
29 | require.NotNil(t, v)
30 | require.True(t, ok)
31 | require.Equal(t, []byte("bar"), v)
32 | }
33 |
--------------------------------------------------------------------------------
/modules/cache/cache.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cache
6 |
7 | import (
8 | "bufio"
9 | "bytes"
10 | "net/http"
11 | "net/http/httputil"
12 | "path/filepath"
13 | "strings"
14 |
15 | "github.com/philippta/flyscrape"
16 | )
17 |
18 | func init() {
19 | flyscrape.RegisterModule(Module{})
20 | }
21 |
22 | type Module struct {
23 | Cache string `json:"cache"`
24 |
25 | store Store
26 | }
27 |
28 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
29 | return flyscrape.ModuleInfo{
30 | ID: "cache",
31 | New: func() flyscrape.Module { return new(Module) },
32 | }
33 | }
34 |
35 | func (m *Module) Provision(ctx flyscrape.Context) {
36 | switch {
37 | case m.Cache == "file":
38 | file := replaceExt(ctx.ScriptName(), ".cache")
39 | m.store = NewBoltStore(file)
40 |
41 | case strings.HasPrefix(m.Cache, "file:"):
42 | m.store = NewBoltStore(strings.TrimPrefix(m.Cache, "file:"))
43 | }
44 | }
45 |
46 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
47 | if m.store == nil {
48 | return t
49 | }
50 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
51 | if nocache(r) {
52 | return t.RoundTrip(r)
53 | }
54 |
55 | key := r.Method + " " + r.URL.String()
56 | if b, ok := m.store.Get(key); ok {
57 | if resp, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r); err == nil {
58 | return resp, nil
59 | }
60 | }
61 |
62 | resp, err := t.RoundTrip(r)
63 | if err != nil {
64 | return resp, err
65 | }
66 |
67 | // Avoid caching when running into rate limits or
68 | // when the page errored.
69 | if resp.StatusCode < 200 || resp.StatusCode > 299 {
70 | return resp, err
71 | }
72 |
73 | encoded, err := httputil.DumpResponse(resp, true)
74 | if err != nil {
75 | return resp, err
76 | }
77 |
78 | m.store.Set(key, encoded)
79 | return resp, nil
80 | })
81 | }
82 |
83 | func (m *Module) Finalize() {
84 | if v, ok := m.store.(interface{ Close() }); ok {
85 | v.Close()
86 | }
87 | }
88 |
89 | func nocache(r *http.Request) bool {
90 | if r.Header.Get(flyscrape.HeaderBypassCache) != "" {
91 | r.Header.Del(flyscrape.HeaderBypassCache)
92 | return true
93 | }
94 | return false
95 | }
96 |
97 | func replaceExt(filePath string, newExt string) string {
98 | ext := filepath.Ext(filePath)
99 | if ext != "" {
100 | fileNameWithoutExt := filePath[:len(filePath)-len(ext)]
101 | newFilePath := fileNameWithoutExt + newExt
102 | return newFilePath
103 | }
104 | return filePath + newExt
105 | }
106 |
107 | type Store interface {
108 | Get(key string) ([]byte, bool)
109 | Set(key string, value []byte)
110 | }
111 |
--------------------------------------------------------------------------------
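The module is inert unless the cache option is "file" or "file:<path>"; with a store configured it serves repeat requests from the BoltStore, only writes 2xx responses, and skips requests carrying the bypass header. A minimal wiring sketch in the same style as the module tests in this repository (the URL and cache path are placeholders, not values from the source):

package main

import (
	"fmt"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/cache"
	"github.com/philippta/flyscrape/modules/hook"
	"github.com/philippta/flyscrape/modules/starturl"
)

func main() {
	scraper := flyscrape.NewScraper()
	scraper.Modules = []flyscrape.Module{
		&starturl.Module{URL: "http://www.example.com"}, // placeholder URL
		&cache.Module{Cache: "file:example.cache"},      // responses land in example.cache
		&hook.Module{
			ReceiveResponseFn: func(r *flyscrape.Response) {
				fmt.Println(r.Request.URL, r.StatusCode)
			},
		},
	}
	scraper.Run()
}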
/modules/cookies/cookies.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package cookies
6 |
7 | import (
8 | "net/http"
9 | "slices"
10 |
11 | "github.com/browserutils/kooky"
12 | _ "github.com/browserutils/kooky/browser/chrome"
13 | _ "github.com/browserutils/kooky/browser/edge"
14 | _ "github.com/browserutils/kooky/browser/firefox"
15 | "github.com/philippta/flyscrape"
16 | )
17 |
18 | var supportedBrowsers = []string{
19 | "chrome",
20 | "edge",
21 | "firefox",
22 | }
23 |
24 | func init() {
25 | flyscrape.RegisterModule(Module{})
26 | }
27 |
28 | type Module struct {
29 | Cookies string `json:"cookies"`
30 | }
31 |
32 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
33 | return flyscrape.ModuleInfo{
34 | ID: "cookies",
35 | New: func() flyscrape.Module { return new(Module) },
36 | }
37 | }
38 |
39 | func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
40 | if !slices.Contains(supportedBrowsers, m.Cookies) {
41 | return t
42 | }
43 |
44 | var stores []kooky.CookieStore
45 | for _, store := range kooky.FindAllCookieStores() {
46 | if store.Browser() == m.Cookies && store.IsDefaultProfile() {
47 | stores = append(stores, store)
48 | }
49 | }
50 |
51 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
52 | for _, store := range stores {
53 | for _, cookie := range store.Cookies(r.URL) {
54 | r.AddCookie(cookie)
55 | }
56 | }
57 | return t.RoundTrip(r)
58 | })
59 | }
60 |
61 | var _ flyscrape.TransportAdapter = Module{}
62 |
--------------------------------------------------------------------------------
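Because AdaptTransport returns a plain http.RoundTripper, the cookie injection can also be exercised outside a scraper. A minimal sketch wrapping the default transport (the URL is a placeholder; the cookies value must be one of the supported browsers listed above):

package main

import (
	"fmt"
	"net/http"

	"github.com/philippta/flyscrape/modules/cookies"
)

func main() {
	// Requests now carry cookies read from the default Firefox profile.
	m := cookies.Module{Cookies: "firefox"}
	client := &http.Client{Transport: m.AdaptTransport(http.DefaultTransport)}

	resp, err := client.Get("https://example.com/")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}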
/modules/depth/depth.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package depth
6 |
7 | import (
8 | "github.com/philippta/flyscrape"
9 | )
10 |
11 | func init() {
12 | flyscrape.RegisterModule(Module{})
13 | }
14 |
15 | type Module struct {
16 | Depth int `json:"depth"`
17 | }
18 |
19 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
20 | return flyscrape.ModuleInfo{
21 | ID: "depth",
22 | New: func() flyscrape.Module { return new(Module) },
23 | }
24 | }
25 |
26 | func (m *Module) ValidateRequest(r *flyscrape.Request) bool {
27 | return r.Depth <= m.Depth
28 | }
29 |
30 | var _ flyscrape.RequestValidator = (*Module)(nil)
31 |
--------------------------------------------------------------------------------
/modules/depth/depth_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package depth_test
6 |
7 | import (
8 | "net/http"
9 | "sync"
10 | "testing"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/philippta/flyscrape/modules/depth"
14 | "github.com/philippta/flyscrape/modules/followlinks"
15 | "github.com/philippta/flyscrape/modules/hook"
16 | "github.com/philippta/flyscrape/modules/starturl"
17 | "github.com/stretchr/testify/require"
18 | )
19 |
20 | func TestDepth(t *testing.T) {
21 | var urls []string
22 | var mu sync.Mutex
23 |
24 | mods := []flyscrape.Module{
25 | &starturl.Module{URL: "http://www.example.com"},
26 | &followlinks.Module{},
27 | &depth.Module{Depth: 2},
28 | hook.Module{
29 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
30 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
31 | switch r.URL.String() {
32 | case "http://www.example.com":
33 | return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
34 | case "http://www.google.com":
35 | return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
36 | case "http://www.duckduckgo.com":
37 | return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
38 | }
39 | return flyscrape.MockResponse(200, "")
40 | })
41 | },
42 | ReceiveResponseFn: func(r *flyscrape.Response) {
43 | mu.Lock()
44 | urls = append(urls, r.Request.URL)
45 | mu.Unlock()
46 | },
47 | },
48 | }
49 |
50 | scraper := flyscrape.NewScraper()
51 | scraper.Modules = mods
52 | scraper.Run()
53 |
54 | require.Len(t, urls, 3)
55 | require.Contains(t, urls, "http://www.example.com")
56 | require.Contains(t, urls, "http://www.google.com")
57 | require.Contains(t, urls, "http://www.duckduckgo.com")
58 | }
59 |
--------------------------------------------------------------------------------
/modules/domainfilter/domainfilter.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package domainfilter
6 |
7 | import (
8 | "github.com/nlnwa/whatwg-url/url"
9 | "github.com/philippta/flyscrape"
10 | )
11 |
12 | func init() {
13 | flyscrape.RegisterModule(Module{})
14 | }
15 |
16 | type Module struct {
17 | URL string `json:"url"`
18 | URLs []string `json:"urls"`
19 | AllowedDomains []string `json:"allowedDomains"`
20 | BlockedDomains []string `json:"blockedDomains"`
21 |
22 | active bool
23 | }
24 |
25 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
26 | return flyscrape.ModuleInfo{
27 | ID: "domainfilter",
28 | New: func() flyscrape.Module { return new(Module) },
29 | }
30 | }
31 |
32 | func (m *Module) Provision(v flyscrape.Context) {
33 | if m.URL != "" {
34 | if u, err := url.Parse(m.URL); err == nil {
35 | m.AllowedDomains = append(m.AllowedDomains, u.Host())
36 | }
37 | }
38 | for _, u := range m.URLs {
39 | if u, err := url.Parse(u); err == nil {
40 | m.AllowedDomains = append(m.AllowedDomains, u.Host())
41 | }
42 | }
43 | }
44 |
45 | func (m *Module) ValidateRequest(r *flyscrape.Request) bool {
46 | if m.disabled() {
47 | return true
48 | }
49 |
50 | u, err := url.Parse(r.URL)
51 | if err != nil {
52 | return false
53 | }
54 |
55 | host := u.Host()
56 | ok := false
57 |
58 | for _, domain := range m.AllowedDomains {
59 | if domain == "*" || host == domain {
60 | ok = true
61 | break
62 | }
63 | }
64 |
65 | for _, domain := range m.BlockedDomains {
66 | if host == domain {
67 | ok = false
68 | break
69 | }
70 | }
71 |
72 | return ok
73 | }
74 |
75 | func (m *Module) disabled() bool {
76 | return len(m.AllowedDomains) == 0 && len(m.BlockedDomains) == 0
77 | }
78 |
79 | var (
80 | _ flyscrape.RequestValidator = (*Module)(nil)
81 | _ flyscrape.Provisioner = (*Module)(nil)
82 | )
83 |
--------------------------------------------------------------------------------
/modules/domainfilter/domainfilter_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package domainfilter_test
6 |
7 | import (
8 | "net/http"
9 | "sync"
10 | "testing"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/philippta/flyscrape/modules/domainfilter"
14 | "github.com/philippta/flyscrape/modules/followlinks"
15 | "github.com/philippta/flyscrape/modules/hook"
16 | "github.com/philippta/flyscrape/modules/starturl"
17 | "github.com/stretchr/testify/require"
18 | )
19 |
20 | func TestDomainfilterAllowed(t *testing.T) {
21 | var urls []string
22 | var mu sync.Mutex
23 |
24 | mods := []flyscrape.Module{
25 | &starturl.Module{URL: "http://www.example.com"},
26 | &followlinks.Module{},
27 | &domainfilter.Module{
28 | URL: "http://www.example.com",
29 | AllowedDomains: []string{"www.google.com"},
30 | },
31 | hook.Module{
32 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
33 | return flyscrape.MockTransport(200, `
34 | <a href="http://www.google.com">Google</a>
35 | <a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
36 | },
37 | ReceiveResponseFn: func(r *flyscrape.Response) {
38 | mu.Lock()
39 | urls = append(urls, r.Request.URL)
40 | mu.Unlock()
41 | },
42 | },
43 | }
44 |
45 | scraper := flyscrape.NewScraper()
46 | scraper.Modules = mods
47 | scraper.Run()
48 |
49 | require.Len(t, urls, 2)
50 | require.Contains(t, urls, "http://www.example.com")
51 | require.Contains(t, urls, "http://www.google.com")
52 | }
53 |
54 | func TestDomainfilterAllowedAll(t *testing.T) {
55 | var urls []string
56 | var mu sync.Mutex
57 |
58 | mods := []flyscrape.Module{
59 | &starturl.Module{URL: "http://www.example.com"},
60 | &followlinks.Module{},
61 | &domainfilter.Module{
62 | URL: "http://www.example.com",
63 | AllowedDomains: []string{"*"},
64 | },
65 | hook.Module{
66 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
67 | return flyscrape.MockTransport(200, `
68 | <a href="http://www.google.com">Google</a>
69 | <a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
70 | },
71 | ReceiveResponseFn: func(r *flyscrape.Response) {
72 | mu.Lock()
73 | urls = append(urls, r.Request.URL)
74 | mu.Unlock()
75 | },
76 | },
77 | }
78 |
79 | scraper := flyscrape.NewScraper()
80 | scraper.Modules = mods
81 | scraper.Run()
82 |
83 | require.Len(t, urls, 3)
84 | require.Contains(t, urls, "http://www.example.com")
85 | require.Contains(t, urls, "http://www.duckduckgo.com")
86 | require.Contains(t, urls, "http://www.google.com")
87 | }
88 |
89 | func TestDomainfilterBlocked(t *testing.T) {
90 | var urls []string
91 | var mu sync.Mutex
92 |
93 | mods := []flyscrape.Module{
94 | &starturl.Module{URL: "http://www.example.com"},
95 | &followlinks.Module{},
96 | &domainfilter.Module{
97 | URL: "http://www.example.com",
98 | AllowedDomains: []string{"*"},
99 | BlockedDomains: []string{"www.google.com"},
100 | },
101 | hook.Module{
102 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
103 | return flyscrape.MockTransport(200, `
104 | <a href="http://www.google.com">Google</a>
105 | <a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
106 | },
107 | ReceiveResponseFn: func(r *flyscrape.Response) {
108 | mu.Lock()
109 | urls = append(urls, r.Request.URL)
110 | mu.Unlock()
111 | },
112 | },
113 | }
114 |
115 | scraper := flyscrape.NewScraper()
116 | scraper.Modules = mods
117 | scraper.Run()
118 |
119 | require.Len(t, urls, 2)
120 | require.Contains(t, urls, "http://www.example.com")
121 | require.Contains(t, urls, "http://www.duckduckgo.com")
122 | }
123 |
--------------------------------------------------------------------------------
/modules/followlinks/followlinks.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package followlinks
6 |
7 | import (
8 | "net/url"
9 | "regexp"
10 | "strings"
11 |
12 | "github.com/PuerkitoBio/goquery"
13 | "github.com/philippta/flyscrape"
14 | )
15 |
16 | func init() {
17 | flyscrape.RegisterModule(Module{})
18 | }
19 |
20 | type Module struct {
21 | Follow *[]string `json:"follow"`
22 | }
23 |
24 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
25 | return flyscrape.ModuleInfo{
26 | ID: "followlinks",
27 | New: func() flyscrape.Module { return new(Module) },
28 | }
29 | }
30 |
31 | func (m *Module) Provision(ctx flyscrape.Context) {
32 | if m.Follow == nil {
33 | m.Follow = &[]string{"a[href]"}
34 | }
35 | }
36 |
37 | func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
38 | if m.Follow == nil {
39 | return
40 | }
41 |
42 | for _, link := range m.parseLinks(string(resp.Body), resp.Request.URL) {
43 | resp.Visit(link)
44 | }
45 | }
46 |
47 | func (m *Module) parseLinks(html string, origin string) []string {
48 | if m.Follow == nil {
49 | return nil
50 | }
51 |
52 | var links []string
53 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
54 | if err != nil {
55 | return nil
56 | }
57 |
58 | originurl, err := url.Parse(origin)
59 | if err != nil {
60 | return nil
61 | }
62 |
63 | uniqueLinks := make(map[string]bool)
64 |
65 | for _, selector := range *m.Follow {
66 | attr := parseSelectorAttr(selector)
67 | doc.Find(selector).Each(func(i int, s *goquery.Selection) {
68 | link, _ := s.Attr(attr)
69 |
70 | parsedLink, err := originurl.Parse(link)
71 |
72 | if err != nil || !isValidLink(parsedLink) {
73 | return
74 | }
75 |
76 | absLink := parsedLink.String()
77 |
78 | if !uniqueLinks[absLink] {
79 | links = append(links, absLink)
80 | uniqueLinks[absLink] = true
81 | }
82 | })
83 | }
84 |
85 | return links
86 | }
87 |
88 | func isValidLink(link *url.URL) bool {
89 | if link.Scheme != "" && link.Scheme != "http" && link.Scheme != "https" {
90 | return false
91 | }
92 |
93 | return true
94 | }
95 |
96 | func parseSelectorAttr(sel string) string {
97 | matches := selectorExpr.FindAllString(sel, -1)
98 | if len(matches) == 0 {
99 | return "href"
100 | }
101 |
102 | attr := attrExpr.FindString(matches[len(matches)-1])
103 | if attr == "" {
104 | return "href"
105 | }
106 |
107 | return attr
108 | }
109 |
110 | var (
111 | _ flyscrape.Provisioner = (*Module)(nil)
112 | _ flyscrape.ResponseReceiver = (*Module)(nil)
113 | )
114 |
115 | var (
116 | selectorExpr = regexp.MustCompile(`\[(.*?)\]`)
117 | attrExpr = regexp.MustCompile(`[\w-]+`)
118 | )
119 |
--------------------------------------------------------------------------------
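The follow selectors default to the href attribute; a trailing [attr] group overrides that, which is how "[data-url]" and "a[href]" both work. A standalone sketch of that convention, copying the two regular expressions from this file so the behaviour can be checked in isolation (the helper here is a local replica, not the module's unexported function):

package main

import (
	"fmt"
	"regexp"
)

var (
	selectorExpr = regexp.MustCompile(`\[(.*?)\]`)
	attrExpr     = regexp.MustCompile(`[\w-]+`)
)

// parseSelectorAttr mirrors the unexported helper in the followlinks module:
// the last [...] group of the selector names the attribute, otherwise "href".
func parseSelectorAttr(sel string) string {
	matches := selectorExpr.FindAllString(sel, -1)
	if len(matches) == 0 {
		return "href"
	}
	attr := attrExpr.FindString(matches[len(matches)-1])
	if attr == "" {
		return "href"
	}
	return attr
}

func main() {
	fmt.Println(parseSelectorAttr("a[href]"))    // href
	fmt.Println(parseSelectorAttr(".next a"))    // href (no [...] group)
	fmt.Println(parseSelectorAttr("[data-url]")) // data-url
}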
/modules/followlinks/followlinks_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package followlinks_test
6 |
7 | import (
8 | "net/http"
9 | "sync"
10 | "testing"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/philippta/flyscrape/modules/followlinks"
14 | "github.com/philippta/flyscrape/modules/hook"
15 | "github.com/philippta/flyscrape/modules/starturl"
16 | "github.com/stretchr/testify/require"
17 | )
18 |
19 | func TestFollowLinks(t *testing.T) {
20 | var urls []string
21 | var mu sync.Mutex
22 |
23 | mods := []flyscrape.Module{
24 | &starturl.Module{URL: "http://www.example.com/foo/bar"},
25 | &followlinks.Module{},
26 | hook.Module{
27 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
28 | return flyscrape.MockTransport(200, `
29 | <a href="/baz">Baz</a>
30 | <a href="baz">Baz</a>
31 | <a href="http://www.google.com">Google</a>`)
32 | },
33 | ReceiveResponseFn: func(r *flyscrape.Response) {
34 | mu.Lock()
35 | urls = append(urls, r.Request.URL)
36 | mu.Unlock()
37 | },
38 | },
39 | }
40 |
41 | scraper := flyscrape.NewScraper()
42 | scraper.Modules = mods
43 | scraper.Run()
44 |
45 | require.Len(t, urls, 5)
46 | require.Contains(t, urls, "http://www.example.com/baz")
47 | require.Contains(t, urls, "http://www.example.com/foo/bar")
48 | require.Contains(t, urls, "http://www.example.com/foo/baz")
49 | require.Contains(t, urls, "http://www.google.com")
50 | require.Contains(t, urls, "http://www.google.com/baz")
51 | }
52 |
53 | func TestFollowSelector(t *testing.T) {
54 | var urls []string
55 | var mu sync.Mutex
56 |
57 | mods := []flyscrape.Module{
58 | &starturl.Module{URL: "http://www.example.com/foo/bar"},
59 | &followlinks.Module{
60 | Follow: &[]string{".next a[href]"},
61 | },
62 | hook.Module{
63 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
64 | return flyscrape.MockTransport(200, `
65 | <a href="/baz">Baz</a>
66 | <a href="baz">Baz</a>
67 | <span class="next">
68 | <a href="http://www.google.com">Google</a>
69 | </span>`)
70 | },
71 | ReceiveResponseFn: func(r *flyscrape.Response) {
72 | mu.Lock()
73 | urls = append(urls, r.Request.URL)
74 | mu.Unlock()
75 | },
76 | },
77 | }
78 |
79 | scraper := flyscrape.NewScraper()
80 | scraper.Modules = mods
81 | scraper.Run()
82 |
83 | require.Len(t, urls, 2)
84 | require.Contains(t, urls, "http://www.example.com/foo/bar")
85 | require.Contains(t, urls, "http://www.google.com")
86 | }
87 |
88 | func TestFollowDataAttr(t *testing.T) {
89 | var urls []string
90 | var mu sync.Mutex
91 |
92 | mods := []flyscrape.Module{
93 | &starturl.Module{URL: "http://www.example.com/foo/bar"},
94 | &followlinks.Module{
95 | Follow: &[]string{"[data-url]"},
96 | },
97 | hook.Module{
98 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
99 | return flyscrape.MockTransport(200, `
100 | <a href="/baz">Baz</a>
101 | <a href="baz">Baz</a>
102 | <span data-url="http://www.google.com">Google</span>`)
103 | },
104 | ReceiveResponseFn: func(r *flyscrape.Response) {
105 | mu.Lock()
106 | urls = append(urls, r.Request.URL)
107 | mu.Unlock()
108 | },
109 | },
110 | }
111 |
112 | scraper := flyscrape.NewScraper()
113 | scraper.Modules = mods
114 | scraper.Run()
115 |
116 | require.Len(t, urls, 2)
117 | require.Contains(t, urls, "http://www.example.com/foo/bar")
118 | require.Contains(t, urls, "http://www.google.com")
119 | }
120 |
121 | func TestFollowMultiple(t *testing.T) {
122 | var urls []string
123 | var mu sync.Mutex
124 |
125 | mods := []flyscrape.Module{
126 | &starturl.Module{URL: "http://www.example.com/foo/bar"},
127 | &followlinks.Module{
128 | Follow: &[]string{"a.prev", "a.next"},
129 | },
130 | hook.Module{
131 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
132 | return flyscrape.MockTransport(200, `
133 | <a href="/baz">Baz</a>
134 | <a class="prev" href="a">a</a>
135 | <a class="next" href="b">b</a>`)
136 | },
137 | ReceiveResponseFn: func(r *flyscrape.Response) {
138 | mu.Lock()
139 | urls = append(urls, r.Request.URL)
140 | mu.Unlock()
141 | },
142 | },
143 | }
144 |
145 | scraper := flyscrape.NewScraper()
146 | scraper.Modules = mods
147 | scraper.Run()
148 |
149 | require.Len(t, urls, 3)
150 | require.Contains(t, urls, "http://www.example.com/foo/bar")
151 | require.Contains(t, urls, "http://www.example.com/foo/a")
152 | require.Contains(t, urls, "http://www.example.com/foo/b")
153 | }
154 |
155 | func TestFollowNoFollow(t *testing.T) {
156 | var urls []string
157 | var mu sync.Mutex
158 |
159 | mods := []flyscrape.Module{
160 | &starturl.Module{URL: "http://www.example.com/foo/bar"},
161 | &followlinks.Module{
162 | Follow: &[]string{},
163 | },
164 | hook.Module{
165 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
166 | return flyscrape.MockTransport(200, `
167 | <a href="/baz">Baz</a>
168 | <a href="baz">Baz</a>
169 | <span class="next">
170 | <a href="http://www.google.com">Google</a>
171 | </span>`)
172 | },
173 | ReceiveResponseFn: func(r *flyscrape.Response) {
174 | mu.Lock()
175 | urls = append(urls, r.Request.URL)
176 | mu.Unlock()
177 | },
178 | },
179 | }
180 |
181 | scraper := flyscrape.NewScraper()
182 | scraper.Modules = mods
183 | scraper.Run()
184 |
185 | require.Len(t, urls, 1)
186 | require.Contains(t, urls, "http://www.example.com/foo/bar")
187 | }
188 |
--------------------------------------------------------------------------------
/modules/headers/headers.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package headers
6 |
7 | import (
8 | "net/http"
9 |
10 | "github.com/philippta/flyscrape"
11 | )
12 |
13 | func init() {
14 | flyscrape.RegisterModule(Module{})
15 | }
16 |
17 | type Module struct {
18 | Headers map[string]string `json:"headers"`
19 | }
20 |
21 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
22 | return flyscrape.ModuleInfo{
23 | ID: "headers",
24 | New: func() flyscrape.Module { return new(Module) },
25 | }
26 | }
27 |
28 | func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
29 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
30 | for k, v := range m.Headers {
31 | r.Header.Set(k, v)
32 | }
33 |
34 | if r.Header.Get("User-Agent") == "" {
35 | r.Header.Set("User-Agent", randomUserAgent())
36 | }
37 |
38 | return t.RoundTrip(r)
39 | })
40 | }
41 |
42 | var _ flyscrape.TransportAdapter = Module{}
43 |
--------------------------------------------------------------------------------
/modules/headers/headers_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package headers_test
6 |
7 | import (
8 | "net/http"
9 | "strings"
10 | "testing"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/philippta/flyscrape/modules/headers"
14 | "github.com/philippta/flyscrape/modules/hook"
15 | "github.com/philippta/flyscrape/modules/starturl"
16 | "github.com/stretchr/testify/require"
17 | )
18 |
19 | func TestHeaders(t *testing.T) {
20 | gotHeaders := map[string]string{}
21 | sentHeaders := map[string]string{
22 | "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
23 | "User-Agent": "Gecko/1.0",
24 | }
25 |
26 | mods := []flyscrape.Module{
27 | hook.Module{
28 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
29 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
30 | for k := range r.Header {
31 | gotHeaders[k] = r.Header.Get(k)
32 | }
33 | return flyscrape.MockResponse(200, "")
34 | })
35 | },
36 | },
37 | &starturl.Module{URL: "http://www.example.com"},
38 | &headers.Module{
39 | Headers: sentHeaders,
40 | },
41 | }
42 |
43 | scraper := flyscrape.NewScraper()
44 | scraper.Modules = mods
45 | scraper.Run()
46 |
47 | require.Equal(t, sentHeaders, gotHeaders)
48 | }
49 |
50 | func TestHeadersRandomUserAgent(t *testing.T) {
51 | var userAgent string
52 | mods := []flyscrape.Module{
53 | hook.Module{
54 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
55 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
56 | userAgent = r.Header.Get("User-Agent")
57 | return flyscrape.MockResponse(200, "")
58 | })
59 | },
60 | },
61 | &starturl.Module{URL: "http://www.example.com"},
62 | &headers.Module{},
63 | }
64 |
65 | scraper := flyscrape.NewScraper()
66 | scraper.Modules = mods
67 | scraper.Run()
68 |
69 | require.NotEmpty(t, userAgent)
70 | require.True(t, strings.HasPrefix(userAgent, "Mozilla/5.0 ("))
71 | }
72 |
--------------------------------------------------------------------------------
/modules/headers/versions.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package headers
6 |
7 | import (
8 | _ "embed"
9 | "fmt"
10 | "math/rand"
11 | "strings"
12 | )
13 |
14 | //go:generate bash -c "flyscrape run ../../examples/useragents/chrome.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_chrome.txt"
15 | //go:generate bash -c "flyscrape run ../../examples/useragents/firefox.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_firefox.txt"
16 | //go:generate bash -c "flyscrape run ../../examples/useragents/edge.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_edge.txt"
17 | //go:generate bash -c "flyscrape run ../../examples/useragents/opera.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_opera.txt"
18 |
19 | //go:embed versions_chrome.txt
20 | var versionsChromeRaw string
21 | var versionsChrome = strings.Split(strings.TrimSpace(versionsChromeRaw), "\n")
22 |
23 | //go:embed versions_firefox.txt
24 | var versionsFirefoxRaw string
25 | var versionsFirefox = strings.Split(strings.TrimSpace(versionsFirefoxRaw), "\n")
26 |
27 | //go:embed versions_edge.txt
28 | var versionsEdgeRaw string
29 | var versionsEdge = strings.Split(strings.TrimSpace(versionsEdgeRaw), "\n")
30 |
31 | //go:embed versions_opera.txt
32 | var versionsOperaRaw string
33 | var versionsOpera = strings.Split(strings.TrimSpace(versionsOperaRaw), "\n")
34 |
35 | //go:embed versions_macos.txt
36 | var versionsMacOSRaw string
37 | var versionsMacOS = strings.Split(strings.TrimSpace(versionsMacOSRaw), "\n")
38 |
39 | //go:embed versions_windows.txt
40 | var versionsWindowsRaw string
41 | var versionsWindows = strings.Split(strings.TrimSpace(versionsWindowsRaw), "\n")
42 |
43 | //go:embed versions_linux.txt
44 | var versionsLinuxRaw string
45 | var versionsLinux = strings.Split(strings.TrimSpace(versionsLinuxRaw), "\n")
46 |
47 | func randomUAChrome() string {
48 | f := "Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36"
49 | return fmt.Sprintf(f, randomOS(), random(versionsChrome))
50 | }
51 |
52 | func randomUAFirefox() string {
53 | f := "Mozilla/5.0 (%s; rv:%s) Gecko/20100101 Firefox/%s"
54 | ver := random(versionsFirefox)
55 | return fmt.Sprintf(f, randomOS(), ver, ver)
56 | }
57 |
58 | func randomUAEdge() string {
59 | f := "Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/%s"
60 | return fmt.Sprintf(f, randomOS(), random(versionsEdge))
61 | }
62 |
63 | func randomUAOpera() string {
64 | f := "Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/%s"
65 | return fmt.Sprintf(f, randomOS(), random(versionsOpera))
66 | }
67 |
68 | func randomUserAgent() string {
69 | switch rand.Intn(4) {
70 | case 0:
71 | return randomUAChrome()
72 | case 1:
73 | return randomUAFirefox()
74 | case 2:
75 | return randomUAEdge()
76 | case 3:
77 | return randomUAOpera()
78 | }
79 | panic("rand.Intn is broken")
80 | }
81 |
82 | func randomOS() string {
83 | switch rand.Intn(3) {
84 | case 0:
85 | return random(versionsMacOS)
86 | case 1:
87 | return random(versionsWindows)
88 | case 2:
89 | return random(versionsLinux)
90 | }
91 | panic("rand.Intn is broken")
92 | }
93 |
94 | func random(ss []string) string {
95 | return ss[rand.Intn(len(ss))]
96 | }
97 |
--------------------------------------------------------------------------------
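Each generated user agent is an OS token from versions_macos/windows/linux.txt combined with a browser version from the lists below, dropped into one of the format strings above. A minimal sketch of that composition using fixed sample values taken from the embedded files (no randomness, just the Chrome template):

package main

import "fmt"

func main() {
	// Sample values copied from versions_macos.txt and versions_chrome.txt.
	os := "Macintosh; Intel Mac OS X 14_5_2"
	chrome := "123.0.6312.52"

	ua := fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", os, chrome)
	fmt.Println(ua)
	// Output:
	// Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.52 Safari/537.36
}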
/modules/headers/versions_chrome.txt:
--------------------------------------------------------------------------------
1 | 123.0.6312.52
2 | 123.0.6312.40
3 | 122.0.6261.94
4 | 122.0.6261.90
5 | 122.0.6261.89
6 | 122.0.6261.69
7 | 122.0.6261.64
8 | 122.0.6261.62
9 | 122.0.6261.51
10 | 122.0.6261.48
11 | 122.0.6261.129
12 | 122.0.6261.128
13 | 122.0.6261.119
14 | 122.0.6261.112
15 | 122.0.6261.111
16 | 122.0.6261.105
17 | 122.0.6045.214
18 | 121.0.6167.85
19 | 121.0.6167.66
20 | 121.0.6167.184
21 | 121.0.6167.178
22 | 121.0.6167.171
23 | 121.0.6167.164
24 | 121.0.6167.160
25 | 121.0.6167.143
26 | 121.0.6167.139
27 | 121.0.6167.138
28 | 121.0.6167.101
29 | 120.0.6099.71
30 | 120.0.6099.50
31 | 120.0.6099.43
32 | 120.0.6099.230
33 | 120.0.6099.216
34 | 120.0.6099.210
35 | 120.0.6099.199
36 | 120.0.6099.193
37 | 120.0.6099.130
38 | 120.0.6099.129
39 | 120.0.6099.119
40 | 120.0.6099.109
41 | 120.0.6099.101
42 | 119.0.6045.66
43 | 119.0.6045.41
44 | 119.0.6045.214
45 | 119.0.6045.200
46 | 119.0.6045.199
47 | 119.0.6045.193
48 | 119.0.6045.169
49 | 119.0.6045.163
50 | 119.0.6045.134
51 | 119.0.6045.124
52 | 119.0.6045.123
53 | 119.0.6045.109
54 | 118.0.5993.96
55 | 118.0.5993.92
56 | 118.0.5993.89
57 | 118.0.5993.88
58 | 118.0.5993.80
59 | 118.0.5993.71
60 | 118.0.5993.70
61 | 118.0.5993.69
62 | 118.0.5993.65
63 | 118.0.5993.58
64 | 118.0.5993.118
65 | 118.0.5993.117
66 | 118.0.5993.111
67 | 117.0.5938.92
68 | 117.0.5938.89
69 | 117.0.5938.88
70 | 117.0.5938.82
71 | 117.0.5938.60
72 | 117.0.5938.153
73 | 117.0.5938.150
74 | 117.0.5938.149
75 | 117.0.5938.140
76 | 117.0.5938.117
77 | 117.0.5938.108
78 | 117.0.5938.104
79 | 116.0.5845.97
80 | 116.0.5845.96
81 | 116.0.5845.92
82 | 116.0.5845.90
83 | 116.0.5845.188
84 | 116.0.5845.187
85 | 116.0.5845.180
86 | 116.0.5845.179
87 | 116.0.5845.177
88 | 116.0.5845.172
89 | 116.0.5845.163
90 | 116.0.5845.146
91 | 116.0.5845.141
92 | 116.0.5845.140
93 | 116.0.5845.118
94 | 116.0.5845.114
95 | 116.0.5845.111
96 | 116.0.5845.110
97 | 116.0.5845.103
98 | 115.0.5790.84
99 | 115.0.5790.166
100 | 115.0.5790.160
101 | 115.0.5790.138
102 | 115.0.5790.136
103 | 115.0.5790.130
104 | 115.0.5790.114
105 | 115.0.5790.110
106 | 115.0.5790.102
107 | 114.0.5735.99
108 | 114.0.5735.61
109 | 114.0.5735.60
110 | 114.0.5735.58
111 | 114.0.5735.57
112 | 114.0.5735.50
113 | 114.0.5735.198
114 | 114.0.5735.196
115 | 114.0.5735.133
116 | 114.0.5735.131
117 | 114.0.5735.130
118 | 114.0.5735.124
119 | 114.0.5735.106
120 | 113.0.5672.92
121 | 113.0.5672.77
122 | 113.0.5672.76
123 | 113.0.5672.69
124 | 113.0.5672.163
125 | 113.0.5672.162
126 | 113.0.5672.134
127 | 113.0.5672.126
128 | 113.0.5672.121
129 | 113.0.5672.109
130 | 112.0.5615.70
131 | 112.0.5615.69
132 | 112.0.5615.48
133 | 112.0.5615.47
134 | 112.0.5615.46
135 | 112.0.5615.167
136 | 112.0.5615.165
137 | 112.0.5615.137
138 | 112.0.5615.136
139 | 112.0.5615.135
140 | 112.0.5615.101
141 | 112.0.5615.100
142 | 111.0.5563.72
143 | 111.0.5563.58
144 | 111.0.5563.57
145 | 111.0.5563.54
146 | 111.0.5563.147
147 | 111.0.5563.116
148 | 111.0.5563.115
149 | 111.0.5563.110
150 | 111.0.5563.101
151 | 110.0.5481.96
152 | 110.0.5481.83
153 | 110.0.5481.65
154 | 110.0.5481.64
155 | 110.0.5481.63
156 | 110.0.5481.177
157 | 110.0.5481.154
158 | 110.0.5481.153
159 | 110.0.5481.114
160 | 110.0.5481.104
161 | 110.0.5481.100
162 | 109.0.5414.94
163 | 109.0.5414.83
164 | 109.0.5414.165
165 | 109.0.5414.149
166 | 109.0.5414.141
167 | 109.0.5414.125
168 | 109.0.5414.119
169 | 109.0.5414.112
170 | 108.0.5359.98
171 | 108.0.5359.94
172 | 108.0.5359.61
173 | 108.0.5359.52
174 | 108.0.5359.128
175 | 108.0.5359.124
176 | 108.0.5359.112
177 | 107.0.5304.91
178 | 107.0.5304.87
179 | 107.0.5304.66
180 | 107.0.5304.54
181 | 107.0.5304.141
182 | 107.0.5304.121
183 | 107.0.5304.110
184 | 107.0.5304.105
185 | 107.0.5304.101
186 | 106.0.5249.92
187 | 106.0.5249.91
188 | 106.0.5249.75
189 | 106.0.5249.70
190 | 106.0.5249.65
191 | 106.0.5249.60
192 | 106.0.5249.126
193 | 106.0.5249.119
194 | 106.0.5249.118
195 | 106.0.5249.112
196 | 106.0.5249.103
197 | 105.0.5195.98
198 | 105.0.5195.79
199 | 105.0.5195.77
200 | 105.0.5195.69
201 | 105.0.5195.68
202 | 105.0.5195.147
203 | 105.0.5195.136
204 | 105.0.5195.134
205 | 105.0.5195.129
206 | 105.0.5195.125
207 | 105.0.5195.124
208 | 105.0.5195.112
209 | 105.0.5195.102
210 | 105.0.5195.100
211 | 104.0.5112.99
212 | 104.0.5112.97
213 | 104.0.5112.88
214 | 104.0.5112.71
215 | 104.0.5112.69
216 | 104.0.5112.102
217 | 104.0.5112.101
218 | 103.0.5060.71
219 | 103.0.5060.70
220 | 103.0.5060.66
221 | 103.0.5060.64
222 | 103.0.5060.63
223 | 103.0.5060.54
224 | 103.0.5060.53
225 | 103.0.5060.134
226 | 103.0.5060.132
227 | 103.0.5060.129
228 | 103.0.5060.114
229 | 102.0.5005.99
230 | 102.0.5005.78
231 | 102.0.5005.67
232 | 102.0.5005.59
233 | 102.0.5005.115
234 | 101.0.4951.67
235 | 101.0.4951.64
236 | 101.0.4951.61
237 | 101.0.4951.54
238 | 101.0.4951.44
239 | 101.0.4951.41
240 | 100.0.4896.88
241 | 100.0.4896.85
242 | 100.0.4896.79
243 | 100.0.4896.77
244 | 100.0.4896.75
245 | 100.0.4896.58
246 | 100.0.4896.56
247 | 100.0.4896.127
248 | 99.0.4844.88
249 | 99.0.4844.84
250 | 99.0.4844.83
251 | 99.0.4844.82
252 | 99.0.4844.74
253 | 99.0.4844.73
254 | 99.0.4844.59
255 | 99.0.4844.48
256 | 99.0.4844.47
257 | 98.0.4758.97
258 | 98.0.4758.87
259 | 98.0.4758.85
260 | 98.0.4758.109
261 | 98.0.4758.102
262 | 98.0.4758.101
263 | 97.0.4692.99
264 | 97.0.4692.84
265 | 97.0.4692.72
266 | 97.0.4664.98
267 | 97.0.4664.87
268 | 97.0.4664.70
269 | 96.0.4664.94
270 | 96.0.4664.93
271 | 96.0.4664.92
272 | 96.0.4664.53
273 | 96.0.4664.45
274 | 96.0.4664.36
275 | 96.0.4664.116
276 | 96.0.4664.110
277 | 96.0.4664.104
278 | 96.0.4664.101
279 | 95.0.4638.69
280 | 95.0.4638.50
281 | 94.0.4606.85
282 | 94.0.4606.81
283 | 94.0.4606.80
284 | 94.0.4606.76
285 | 94.0.4606.71
286 | 94.0.4606.61
287 | 94.0.4606.52
288 | 94.0.4606.50
289 | 93.0.4577.82
290 | 93.0.4577.78
291 | 93.0.4577.62
292 | 93.0.4577.39
293 | 92.0.4515.90
294 | 92.0.4515.166
295 | 92.0.4515.159
296 | 92.0.4515.131
297 | 92.0.4515.115
298 | 92.0.4515.105
299 | 91.0.4472.88
300 | 91.0.4472.80
301 | 91.0.4472.77
302 | 91.0.4472.164
303 | 91.0.4472.124
304 | 91.0.4472.123
305 | 91.0.4472.120
306 | 91.0.4472.114
307 | 91.0.4472.106
308 | 91.0.4472.101
309 | 90.0.4430.93
310 | 90.0.4430.91
311 | 90.0.4430.85
312 | 90.0.4430.82
313 | 90.0.4430.78
314 | 90.0.4430.66
315 | 90.0.4430.216
316 | 90.0.4430.212
317 | 90.0.4430.210
318 | 89.0.4389.90
319 | 89.0.4389.86
320 | 89.0.4389.82
321 | 89.0.4389.72
322 | 89.0.4389.128
323 | 89.0.4389.114
324 | 89.0.4389.105
325 | 88.0.4324.93
326 | 88.0.4324.192
327 | 88.0.4324.190
328 | 88.0.4324.182
329 | 88.0.4324.181
330 | 88.0.4324.155
331 | 88.0.4324.150
332 | 88.0.4324.146
333 | 88.0.4324.141
334 | 87.0.4280.88
335 | 87.0.4280.86
336 | 87.0.4280.77
337 | 87.0.4280.67
338 | 87.0.4280.66
339 | 87.0.4280.60
340 | 87.0.4280.163
341 | 87.0.4280.141
342 | 87.0.4280.101
343 |
--------------------------------------------------------------------------------
/modules/headers/versions_edge.txt:
--------------------------------------------------------------------------------
1 | 132.0.2957.11
2 | 131.0.2903.9
3 | 131.0.2903.48
4 | 130.0.2849.5
5 | 130.0.2849.46
6 | 129.0.2792.52
7 | 129.0.2792.12
8 | 128.0.2739.5
9 | 128.0.2739.42
10 | 127.0.2651.8
11 | 127.0.2651.74
12 | 126.0.2592.56
13 | 126.0.2592.13
14 | 125.0.2535.51
15 | 125.0.2535.13
16 | 124.0.2478.51
17 | 124.0.2478.10
18 | 123.0.2420.53
19 | 123.0.2420.10
20 | 122.0.2365.8
21 | 122.0.2365.52
22 | 121.0.2277.83
23 | 121.0.2277.4
24 | 120.0.2210.7
25 | 120.0.2210.61
26 | 119.0.2151.44
27 | 119.0.2151.12
28 | 118.0.2088.46
29 | 118.0.2088.11
30 | 117.0.2045.9
31 | 117.0.2045.31
32 | 116.0.1938.54
33 | 116.0.1938.29
34 | 115.0.1901.7
35 | 115.0.1901.183
36 | 114.0.1823.37
37 | 114.0.1823.11
38 | 113.0.1774.9
39 | 113.0.1774.3
40 | 112.0.1722.34
41 | 112.0.1722.11
42 | 111.0.1661.41
43 | 111.0.1661.15
44 | 110.0.1587.41
45 | 110.0.1587.17
46 | 109.0.1518.49
47 | 109.0.1518.14
48 | 108.0.1462.42
49 | 108.0.1462.15
50 | 107.0.1418.8
51 | 107.0.1418.24
52 | 106.0.1370.34
53 | 106.0.1370.15
54 | 105.0.1343.7
55 | 105.0.1343.25
56 | 104.0.1293.47
57 | 104.0.1293.14
58 | 103.0.1264.37
59 | 103.0.1264.13
60 | 102.0.1245.7
61 | 102.0.1245.30
62 | 101.0.1210.32
63 | 101.0.1210.10
64 | 100.0.1185.29
65 | 100.0.1185.10
66 | 99.0.1150.30
67 | 99.0.1150.11
68 | 98.0.1108.43
69 | 98.0.1108.23
70 | 97.0.1072.55
71 | 97.0.1072.21
72 | 96.0.1054.8
73 | 96.0.1054.29
74 | 95.0.1020.9
75 | 95.0.1020.30
76 | 94.0.992.9
77 | 94.0.992.31
78 | 93.0.961.38
79 | 93.0.961.11
80 | 92.0.902.9
81 | 92.0.902.55
82 | 91.0.864.37
83 | 91.0.864.11
84 | 90.0.818.8
85 | 90.0.818.39
86 | 89.0.774.48
87 | 89.0.774.18
88 | 88.0.705.50
89 | 88.0.705.18
90 |
--------------------------------------------------------------------------------
/modules/headers/versions_firefox.txt:
--------------------------------------------------------------------------------
1 | 132.0.2
2 | 132.0.1
3 | 132.0
4 | 131.0.3
5 | 131.0.2
6 | 131.0
7 | 130.0.1
8 | 130.0
9 | 129.0.2
10 | 129.0.1
11 | 129.0
12 | 128.4.0
13 | 128.3.1
14 | 128.3.0
15 | 128.2.0
16 | 128.1.0
17 | 128.0.3
18 | 128.0.2
19 | 128.0
20 | 127.0.2
21 | 127.0.1
22 | 127.0
23 | 126.0.1
24 | 126.0
25 | 125.0.3
26 | 125.0.2
27 | 125.0.1
28 | 124.0.2
29 | 124.0.1
30 | 124.0
31 | 123.0.1
32 | 123.0
33 | 122.0.1
34 | 122.0
35 | 121.0.1
36 | 121.0
37 | 120.0.1
38 | 120.0
39 | 119.0.1
40 | 119.0
41 | 118.0.2
42 | 118.0.1
43 | 118.0
44 | 117.0.1
45 | 117.0
46 | 116.0.3
47 | 116.0.2
48 | 116.0.1
49 | 116.0
50 | 115.9.1
51 | 115.9.0
52 | 115.8.0
53 | 115.7.0
54 | 115.6.0
55 | 115.5.0
56 | 115.4.0
57 | 115.3.1
58 | 115.3.0
59 | 115.2.1
60 | 115.2.0
61 | 115.17.0
62 | 115.16.1
63 | 115.16.0
64 | 115.15.0
65 | 115.14.0
66 | 115.13.0
67 | 115.12.0
68 | 115.11.0
69 | 115.10.0
70 | 115.1.0
71 | 115.0.3
72 | 115.0.2
73 | 115.0.1
74 | 115.0
75 | 114.0.2
76 | 114.0.1
77 | 114.0
78 | 113.0.2
79 | 113.0.1
80 | 113.0
81 | 112.0.2
82 | 112.0.1
83 | 112.0
84 | 111.0.1
85 | 111.0
86 | 110.0.1
87 | 110.0
88 | 109.0.1
89 | 109.0
90 | 108.0.2
91 | 108.0.1
92 | 108.0
93 | 107.0.1
94 | 107.0
95 | 106.0.5
96 | 106.0.4
97 | 106.0.3
98 | 106.0.2
99 | 106.0.1
100 | 106.0
101 | 105.0.3
102 | 105.0.2
103 | 105.0.1
104 | 105.0
105 | 104.0.2
106 | 104.0.1
107 | 104.0
108 | 103.0.2
109 | 103.0.1
110 | 103.0
111 | 102.9.0
112 | 102.8.0
113 | 102.7.0
114 | 102.6.0
115 | 102.5.0
116 | 102.4.0
117 | 102.3.0
118 | 102.2.0
119 | 102.15.1
120 | 102.15.0
121 | 102.14.0
122 | 102.13.0
123 | 102.12.0
124 | 102.11.0
125 | 102.10.0
126 | 102.1.0
127 | 102.0.1
128 | 102.0
129 | 101.0.1
130 | 101.0
131 | 100.0.2
132 | 100.0.1
133 | 100.0
134 | 99.0.1
135 | 99.0
136 | 98.0.2
137 | 98.0.1
138 | 98.0
139 | 97.0.2
140 | 97.0.1
141 | 97.0
142 | 96.0.3
143 | 96.0.2
144 | 96.0.1
145 | 96.0
146 | 95.0.2
147 | 95.0.1
148 | 95.0
149 | 94.0.2
150 | 94.0.1
151 | 94.0
152 | 93.0
153 | 92.0.1
154 | 92.0
155 | 91.9.1
156 | 91.9.0
157 | 91.8.0
158 | 91.7.1
159 | 91.7.0
160 | 91.6.1
161 | 91.6.0
162 | 91.5.1
163 | 91.5.0
164 | 91.4.1
165 | 91.4.0
166 | 91.3.0
167 | 91.2.0
168 | 91.13.0
169 | 91.12.0
170 | 91.11.0
171 | 91.10.0
172 | 91.1.0
173 | 91.0.2
174 | 91.0.1
175 | 91.0
176 | 90.0.2
177 | 90.0.1
178 | 90.0
179 | 89.0.2
180 | 89.0.1
181 | 89.0
182 | 88.0.1
183 | 88.0
184 | 87.0
185 | 86.0.1
186 | 86.0
187 | 85.0.2
188 | 85.0.1
189 | 85.0
190 | 84.0.2
191 | 84.0.1
192 | 84.0
193 | 83.0
194 | 82.0.3
195 | 82.0.2
196 | 82.0.1
197 | 82.0
198 | 81.0.2
199 | 81.0.1
200 | 81.0
201 | 80.0.1
202 | 80.0
203 | 79.0
204 | 78.9.0
205 | 78.8.0
206 | 78.7.1
207 | 78.7.0
208 | 78.6.1
209 | 78.6.0
210 | 78.5.0
211 | 78.4.1
212 | 78.4.0
213 | 78.3.1
214 | 78.3.0
215 | 78.2.0
216 | 78.15.0
217 | 78.14.0
218 | 78.13.0
219 | 78.12.0
220 | 78.11.0
221 | 78.10.1
222 | 78.10.0
223 | 78.1.0
224 | 78.0.2
225 | 78.0.1
226 | 78.0
227 | 77.0.1
228 | 77.0
229 | 76.0.1
230 | 76.0
231 | 75.0
232 | 74.0.1
233 | 74.0
234 | 73.0.1
235 | 73.0
236 | 72.0.2
237 | 72.0.1
238 | 72.0
239 | 71.0
240 | 70.0.1
241 | 70.0
242 | 69.0.3
243 | 69.0.2
244 | 69.0.1
245 | 69.0
246 | 68.9.0
247 | 68.8.0
248 | 68.7.0
249 | 68.6.1
250 | 68.6.0
251 | 68.5.0
252 | 68.4.2
253 | 68.4.1
254 | 68.4.0
255 | 68.3.0
256 | 68.2.0
257 | 68.12.0
258 | 68.11.0
259 | 68.10.0
260 | 68.1.0
261 | 68.0.2
262 | 68.0.1
263 | 68.0
264 | 67.0.4
265 | 67.0.3
266 | 67.0.2
267 | 67.0.1
268 | 67.0
269 | 66.0.5
270 | 66.0.4
271 | 66.0.3
272 | 66.0.2
273 | 66.0.1
274 | 66.0
275 | 65.0.2
276 | 65.0.1
277 | 65.0
278 | 64.0.2
279 | 64.0
280 | 63.0.3
281 | 63.0.1
282 | 63.0
283 | 62.0.3
284 | 62.0.2
285 | 62.0
286 | 61.0.2
287 | 61.0.1
288 | 61.0
289 | 60.9.0
290 | 60.8.0
291 | 60.7.2
292 | 60.7.1
293 | 60.7.0
294 | 60.6.3
295 | 60.6.2
296 | 60.6.1
297 | 60.6.0
298 | 60.5.2
299 | 60.5.1
300 | 60.5.0
301 | 60.4.0
302 | 60.3.0
303 | 60.2.2
304 | 60.2.1
305 | 60.2.0
306 | 60.1.0
307 | 60.0.2
308 | 60.0.1
309 | 60.0
310 |
--------------------------------------------------------------------------------
/modules/headers/versions_linux.txt:
--------------------------------------------------------------------------------
1 | X11; Linux x86_64
2 | X11; Ubuntu; Linux x86_64
3 |
--------------------------------------------------------------------------------
/modules/headers/versions_macos.txt:
--------------------------------------------------------------------------------
1 | Macintosh; Intel Mac OS X 11_2_1
2 | Macintosh; Intel Mac OS X 11_2_2
3 | Macintosh; Intel Mac OS X 11_2_3
4 | Macintosh; Intel Mac OS X 11_3_1
5 | Macintosh; Intel Mac OS X 11_3_5
6 | Macintosh; Intel Mac OS X 11_4_3
7 | Macintosh; Intel Mac OS X 11_5_1
8 | Macintosh; Intel Mac OS X 11_5_2
9 | Macintosh; Intel Mac OS X 11_6_1
10 | Macintosh; Intel Mac OS X 11_6_2
11 | Macintosh; Intel Mac OS X 11_6_3
12 | Macintosh; Intel Mac OS X 11_6_4
13 | Macintosh; Intel Mac OS X 11_6_5
14 | Macintosh; Intel Mac OS X 11_6_6
15 | Macintosh; Intel Mac OS X 11_6_7
16 | Macintosh; Intel Mac OS X 11_6_8
17 | Macintosh; Intel Mac OS X 11_7
18 | Macintosh; Intel Mac OS X 11_7_1
19 | Macintosh; Intel Mac OS X 11_7_10
20 | Macintosh; Intel Mac OS X 11_7_2
21 | Macintosh; Intel Mac OS X 11_7_3
22 | Macintosh; Intel Mac OS X 11_7_4
23 | Macintosh; Intel Mac OS X 11_7_5
24 | Macintosh; Intel Mac OS X 11_7_6
25 | Macintosh; Intel Mac OS X 11_7_7
26 | Macintosh; Intel Mac OS X 11_7_8
27 | Macintosh; Intel Mac OS X 11_7_9
28 | Macintosh; Intel Mac OS X 12_0_1
29 | Macintosh; Intel Mac OS X 12_1_2
30 | Macintosh; Intel Mac OS X 12_2_1
31 | Macintosh; Intel Mac OS X 12_3_1
32 | Macintosh; Intel Mac OS X 12_3_4
33 | Macintosh; Intel Mac OS X 12_4_2
34 | Macintosh; Intel Mac OS X 12_5_1
35 | Macintosh; Intel Mac OS X 12_5_3
36 | Macintosh; Intel Mac OS X 12_6_1
37 | Macintosh; Intel Mac OS X 12_6_2
38 | Macintosh; Intel Mac OS X 12_6_3
39 | Macintosh; Intel Mac OS X 12_6_4
40 | Macintosh; Intel Mac OS X 12_6_5
41 | Macintosh; Intel Mac OS X 12_6_6
42 | Macintosh; Intel Mac OS X 12_6_7
43 | Macintosh; Intel Mac OS X 12_6_8
44 | Macintosh; Intel Mac OS X 12_6_9
45 | Macintosh; Intel Mac OS X 12_7
46 | Macintosh; Intel Mac OS X 12_7_1
47 | Macintosh; Intel Mac OS X 12_7_2
48 | Macintosh; Intel Mac OS X 12_7_3
49 | Macintosh; Intel Mac OS X 12_7_4
50 | Macintosh; Intel Mac OS X 12_7_5
51 | Macintosh; Intel Mac OS X 12_7_6
52 | Macintosh; Intel Mac OS X 13_0_1
53 | Macintosh; Intel Mac OS X 13_0_5
54 | Macintosh; Intel Mac OS X 13_1_1
55 | Macintosh; Intel Mac OS X 13_2_1
56 | Macintosh; Intel Mac OS X 13_3_1
57 | Macintosh; Intel Mac OS X 13_3_3
58 | Macintosh; Intel Mac OS X 13_4_1
59 | Macintosh; Intel Mac OS X 13_5_1
60 | Macintosh; Intel Mac OS X 13_5_2
61 | Macintosh; Intel Mac OS X 13_6
62 | Macintosh; Intel Mac OS X 13_6_1
63 | Macintosh; Intel Mac OS X 13_6_2
64 | Macintosh; Intel Mac OS X 13_6_3
65 | Macintosh; Intel Mac OS X 13_6_4
66 | Macintosh; Intel Mac OS X 13_6_5
67 | Macintosh; Intel Mac OS X 13_6_6
68 | Macintosh; Intel Mac OS X 13_6_7
69 | Macintosh; Intel Mac OS X 13_6_8
70 | Macintosh; Intel Mac OS X 13_6_9
71 | Macintosh; Intel Mac OS X 13_7
72 | Macintosh; Intel Mac OS X 13_7_1
73 | Macintosh; Intel Mac OS X 14_0_6
74 | Macintosh; Intel Mac OS X 14_1_1
75 | Macintosh; Intel Mac OS X 14_1_2
76 | Macintosh; Intel Mac OS X 14_2_1
77 | Macintosh; Intel Mac OS X 14_2_2
78 | Macintosh; Intel Mac OS X 14_3_1
79 | Macintosh; Intel Mac OS X 14_4_1
80 | Macintosh; Intel Mac OS X 14_4_3
81 | Macintosh; Intel Mac OS X 14_5_2
82 | Macintosh; Intel Mac OS X 14_6_1
83 | Macintosh; Intel Mac OS X 14_6_2
84 | Macintosh; Intel Mac OS X 14_7
85 | Macintosh; Intel Mac OS X 14_7_1
86 | Macintosh; Intel Mac OS X 15_0
87 | Macintosh; Intel Mac OS X 15_0_1
88 |
--------------------------------------------------------------------------------
/modules/headers/versions_opera.txt:
--------------------------------------------------------------------------------
1 | 110.0.5130.82
2 | 110.0.5130.8
3 | 110.0.5130.66
4 | 110.0.5130.49
5 | 110.0.5130.4
6 | 110.0.5130.39
7 | 110.0.5130.35
8 | 110.0.5130.23
9 | 110.0.5130.17
10 | 110.0.5130.13
11 | 110.0.5117.0
12 | 110.0.5111.0
13 | 110.0.5104.0
14 | 109.0.5097.80
15 | 109.0.5097.59
16 | 109.0.5097.5
17 | 109.0.5097.45
18 | 109.0.5097.38
19 | 109.0.5097.33
20 | 109.0.5097.24
21 | 109.0.5097.0
22 | 109.0.5089.0
23 | 109.0.5083.0
24 | 109.0.5076.0
25 | 109.0.5069.0
26 | 108.0.5067.40
27 | 108.0.5067.24
28 | 108.0.5067.20
29 | 108.0.5067.14
30 | 108.0.5067.10
31 | 108.0.5063.0
32 | 108.0.5054.0
33 | 108.0.5047.0
34 | 107.0.5045.8
35 | 107.0.5045.4
36 | 107.0.5045.36
37 | 107.0.5045.21
38 | 107.0.5045.15
39 | 107.0.5045.11
40 | 107.0.5041.0
41 | 107.0.5035.0
42 | 107.0.5019.0
43 | 107.0.5012.0
44 | 107.0.5004.0
45 | 106.0.4971.0
46 | 105.0.4970.6
47 | 105.0.4970.48
48 | 105.0.4970.34
49 | 105.0.4970.29
50 | 105.0.4970.21
51 | 105.0.4970.16
52 | 105.0.4970.13
53 | 105.0.4970.10
54 | 105.0.4963.0
55 | 105.0.4957.0
56 | 105.0.4950.0
57 | 104.0.4944.54
58 | 104.0.4944.36
59 | 104.0.4944.33
60 | 104.0.4944.3
61 | 104.0.4944.28
62 | 104.0.4944.23
63 | 104.0.4944.18
64 | 104.0.4944.10
65 | 104.0.4941.0
66 | 104.0.4934.0
67 | 103.0.4928.34
68 | 103.0.4928.3
69 | 103.0.4928.26
70 | 103.0.4928.16
71 | 103.0.4928.0
72 | 103.0.4920.0
73 | 103.0.4906.0
74 | 103.0.4899.0
75 | 103.0.4892.0
76 | 103.0.4885.0
77 | 102.0.4880.78
78 | 102.0.4880.70
79 | 102.0.4880.6
80 | 102.0.4880.56
81 | 102.0.4880.51
82 | 102.0.4880.46
83 | 102.0.4880.40
84 | 102.0.4880.38
85 | 102.0.4880.33
86 | 102.0.4880.28
87 | 102.0.4880.16
88 | 102.0.4880.10
89 | 102.0.4879.0
90 | 102.0.4871.0
91 | 102.0.4864.0
92 | 102.0.4857.0
93 | 102.0.4850.0
94 | 101.0.4843.58
95 | 101.0.4843.5
96 | 101.0.4843.43
97 | 101.0.4843.33
98 | 101.0.4843.25
99 | 101.0.4843.19
100 | 101.0.4843.13
101 | 101.0.4843.10
102 | 101.0.4843.0
103 | 101.0.4836.0
104 | 101.0.4829.0
105 | 101.0.4822.0
106 | 100.0.4815.76
107 | 100.0.4815.54
108 | 100.0.4815.47
109 | 100.0.4815.30
110 | 100.0.4815.2
111 | 100.0.4815.13
112 | 100.0.4815.0
113 | 100.0.4809.0
114 | 100.0.4801.0
115 | 100.0.4796.0
116 | 100.0.4790.0
117 | 99.0.4788.9
118 | 99.0.4788.88
119 | 99.0.4788.77
120 | 99.0.4788.65
121 | 99.0.4788.5
122 | 99.0.4788.47
123 | 99.0.4788.31
124 | 99.0.4788.13
125 | 99.0.4780.0
126 | 99.0.4765.0
127 | 98.0.4759.6
128 | 98.0.4759.39
129 | 98.0.4759.3
130 | 98.0.4759.21
131 | 98.0.4759.15
132 | 98.0.4759.1
133 | 98.0.4756.0
134 | 98.0.4746.0
135 | 98.0.4739.0
136 | 98.0.4732.0
137 | 98.0.4725.0
138 | 97.0.4719.83
139 | 97.0.4719.63
140 | 97.0.4719.43
141 | 97.0.4719.4
142 | 97.0.4719.28
143 | 97.0.4719.26
144 | 97.0.4719.17
145 | 97.0.4719.11
146 | 97.0.4718.0
147 | 97.0.4711.0
148 | 97.0.4704.0
149 | 97.0.4697.0
150 | 96.0.4693.80
151 | 96.0.4693.50
152 | 96.0.4693.31
153 | 96.0.4693.20
154 | 96.0.4693.16
155 | 96.0.4693.12
156 | 96.0.4691.0
157 | 96.0.4674.0
158 | 96.0.4660.0
159 | 96.0.4653.0
160 | 96.0.4640.0
161 | 95.0.4635.84
162 | 95.0.4635.46
163 | 95.0.4635.37
164 | 95.0.4635.28
165 | 95.0.4635.25
166 | 95.0.4635.20
167 | 95.0.4635.15
168 | 95.0.4635.12
169 | 95.0.4635.10
170 | 95.0.4632.0
171 | 95.0.4625.0
172 | 95.0.4618.0
173 | 95.0.4612.0
174 | 94.0.4606.8
175 | 94.0.4606.76
176 | 94.0.4606.65
177 | 94.0.4606.54
178 | 94.0.4606.38
179 | 94.0.4606.26
180 | 94.0.4606.19
181 | 94.0.4606.14
182 | 94.0.4604.0
183 | 94.0.4597.0
184 | 93.0.4585.70
185 | 93.0.4585.7
186 | 93.0.4585.64
187 | 93.0.4585.39
188 | 93.0.4585.37
189 | 93.0.4585.3
190 | 93.0.4585.21
191 | 93.0.4585.11
192 | 93.0.4582.0
193 | 93.0.4575.0
194 | 93.0.4569.0
195 | 92.0.4561.8
196 | 92.0.4561.43
197 | 92.0.4561.33
198 | 92.0.4561.30
199 | 92.0.4561.21
200 | 92.0.4561.11
201 | 92.0.4561.0
202 | 92.0.4555.0
203 | 92.0.4547.0
204 | 92.0.4540.0
205 | 92.0.4526.0
206 | 92.0.4519.0
207 | 91.0.4516.9
208 | 91.0.4516.77
209 | 91.0.4516.65
210 | 91.0.4516.6
211 | 91.0.4516.3
212 | 91.0.4516.20
213 | 91.0.4516.16
214 | 91.0.4514.0
215 | 91.0.4505.0
216 | 91.0.4498.0
217 | 91.0.4491.0
218 | 91.0.4484.0
219 | 90.0.4480.84
220 | 90.0.4480.80
221 | 90.0.4480.54
222 | 90.0.4480.48
223 | 90.0.4480.41
224 | 90.0.4480.37
225 | 90.0.4480.30
226 | 90.0.4480.25
227 | 90.0.4480.107
228 | 90.0.4477.0
229 | 90.0.4470.0
230 | 90.0.4463.0
231 | 90.0.4457.0
232 | 90.0.4450.0
233 | 89.0.4447.83
234 | 89.0.4447.71
235 | 89.0.4447.51
236 | 89.0.4447.48
237 | 89.0.4447.38
238 | 89.0.4447.37
239 | 89.0.4447.33
240 | 89.0.4447.31
241 | 89.0.4447.20
242 | 89.0.4447.12
243 | 89.0.4447.101
244 | 89.0.4443.0
245 | 89.0.4436.0
246 | 89.0.4428.0
247 | 89.0.4422.0
248 | 89.0.4415.0
249 | 88.0.4412.74
250 | 88.0.4412.53
251 | 88.0.4412.40
252 | 88.0.4412.27
253 | 88.0.4412.20
254 | 88.0.4412.18
255 | 88.0.4412.13
256 | 88.0.4401.0
257 | 88.0.4395.0
258 | 87.0.4390.8
259 | 87.0.4390.45
260 | 87.0.4390.36
261 | 87.0.4390.25
262 | 87.0.4390.21
263 | 87.0.4390.17
264 | 87.0.4388.0
265 | 87.0.4382.0
266 | 87.0.4374.0
267 | 87.0.4366.0
268 | 86.0.4363.9
269 | 86.0.4363.59
270 | 86.0.4363.50
271 | 86.0.4363.32
272 | 86.0.4363.22
273 | 86.0.4363.17
274 | 86.0.4363.15
275 | 86.0.4363.12
276 | 86.0.4359.0
277 | 86.0.4351.0
278 | 86.0.4344.0
279 | 85.0.4341.75
280 | 85.0.4341.60
281 | 85.0.4341.6
282 | 85.0.4341.47
283 | 85.0.4341.28
284 | 85.0.4341.18
285 | 85.0.4341.13
286 | 85.0.4341.10
287 | 85.0.4338.0
288 | 85.0.4331.0
289 | 85.0.4323.0
290 | 84.0.4316.9
291 | 84.0.4316.42
292 | 84.0.4316.31
293 | 84.0.4316.21
294 | 84.0.4316.14
295 | 84.0.4316.0
296 | 84.0.4309.0
297 | 84.0.4302.0
298 | 84.0.4295.0
299 | 84.0.4284.0
300 | 84.0.4274.0
301 | 84.0.4267.0
302 | 84.0.4260.0
303 | 83.0.4254.9
304 | 83.0.4254.62
305 | 83.0.4254.54
306 | 83.0.4254.5
307 | 83.0.4254.27
308 | 83.0.4254.19
309 | 83.0.4254.16
310 | 83.0.4254.14
311 | 83.0.4253.0
312 | 83.0.4246.0
313 | 83.0.4239.0
314 | 83.0.4232.0
315 | 82.0.4227.7
316 | 82.0.4227.58
317 | 82.0.4227.43
318 | 82.0.4227.4
319 | 82.0.4227.33
320 | 82.0.4227.23
321 | 82.0.4227.13
322 | 82.0.4226.0
323 | 82.0.4218.0
324 | 82.0.4210.0
325 | 82.0.4203.0
326 | 81.0.4196.60
327 | 81.0.4196.54
328 | 81.0.4196.37
329 | 81.0.4196.31
330 | 81.0.4196.27
331 | 81.0.4196.14
332 | 81.0.4196.11
333 | 81.0.4196.0
334 | 81.0.4189.0
335 | 81.0.4183.0
336 | 81.0.4175.0
337 | 80.0.4170.72
338 | 80.0.4170.7
339 | 80.0.4170.63
340 | 80.0.4170.40
341 | 80.0.4170.4
342 | 80.0.4170.16
343 | 80.0.4170.11
344 | 80.0.4170.0
345 | 80.0.4162.0
346 | 80.0.4157.0
347 | 80.0.4150.0
348 | 79.0.4143.72
349 | 79.0.4143.50
350 | 79.0.4143.3
351 | 79.0.4143.22
352 | 79.0.4143.15
353 | 79.0.4142.0
354 | 79.0.4135.0
355 | 79.0.4128.0
356 | 79.0.4114.0
357 | 79.0.4105.0
358 | 79.0.4100.0
359 | 78.0.4093.79
360 | 78.0.4093.68
361 | 78.0.4093.46
362 | 78.0.4093.34
363 | 78.0.4093.184
364 | 78.0.4093.147
365 | 78.0.4093.112
366 | 78.0.4093.103
367 | 78.0.4093.0
368 | 78.0.4086.0
369 | 78.0.4079.0
370 | 78.0.4072.0
371 | 78.0.4066.0
372 | 78.0.4058.0
373 | 77.0.4054.91
374 | 77.0.4054.90
375 | 77.0.4054.80
376 | 77.0.4054.64
377 | 77.0.4054.38
378 | 77.0.4054.277
379 | 77.0.4054.254
380 | 77.0.4054.203
381 | 77.0.4054.19
382 | 77.0.4054.172
383 | 77.0.4054.146
384 | 77.0.4054.14
385 | 77.0.4051.0
386 | 77.0.4046.0
387 | 77.0.4039.0
388 | 77.0.4032.0
389 | 77.0.4028.0
390 | 77.0.4023.0
391 | 76.0.4017.94
392 | 76.0.4017.88
393 | 76.0.4017.59
394 | 76.0.4017.5
395 | 76.0.4017.40
396 | 76.0.4017.177
397 | 76.0.4017.154
398 | 76.0.4017.137
399 | 76.0.4017.123
400 | 76.0.4017.107
401 | 76.0.4009.0
402 | 76.0.3995.0
403 | 76.0.3989.0
404 | 76.0.3981.0
405 | 76.0.3974.0
406 | 75.0.3969.93
407 | 75.0.3969.60
408 | 75.0.3969.50
409 | 75.0.3969.35
410 | 75.0.3969.218
411 | 75.0.3969.171
412 | 75.0.3969.141
413 | 75.0.3969.14
414 | 75.0.3967.0
415 | 75.0.3960.0
416 | 75.0.3953.0
417 | 75.0.3946.0
418 | 75.0.3939.0
419 | 75.0.3932.0
420 | 75.0.3925.0
421 | 74.0.3911.75
422 | 74.0.3911.63
423 | 74.0.3911.42
424 | 74.0.3911.232
425 | 74.0.3911.22
426 | 74.0.3911.218
427 | 74.0.3911.203
428 | 74.0.3911.160
429 | 74.0.3911.107
430 | 74.0.3904.0
431 | 74.0.3897.0
432 | 74.0.3890.0
433 | 74.0.3883.0
434 | 74.0.3876.0
435 | 74.0.3870.0
436 | 74.0.3862.0
437 | 73.0.3856.344
438 | 73.0.3856.329
439 | 73.0.3856.31
440 | 73.0.3856.284
441 | 73.0.3856.257
442 | 73.0.3856.235
443 | 73.0.3856.208
444 | 73.0.3856.184
445 | 73.0.3856.156
446 | 73.0.3856.0
447 | 73.0.3847.0
448 | 73.0.3841.0
449 | 73.0.3834.0
450 | 73.0.3827.0
451 | 73.0.3820.0
452 | 72.0.3815.86
453 | 72.0.3815.49
454 | 72.0.3815.400
455 | 72.0.3815.378
456 | 72.0.3815.320
457 | 72.0.3815.200
458 | 72.0.3815.186
459 | 72.0.3815.148
460 | 72.0.3815.133
461 | 72.0.3814.0
462 | 72.0.3807.0
463 | 72.0.3798.0
464 | 72.0.3791.0
465 | 72.0.3784.0
466 | 72.0.3779.0
467 | 71.0.3770.97
468 | 71.0.3770.81
469 | 71.0.3770.50
470 | 71.0.3770.271
471 | 71.0.3770.228
472 | 71.0.3770.198
473 | 71.0.3770.148
474 | 71.0.3770.126
475 | 71.0.3770.0
476 | 71.0.3763.0
477 | 71.0.3756.0
478 | 71.0.3749.0
479 | 71.0.3742.0
480 | 71.0.3735.0
481 | 70.0.3728.95
482 | 70.0.3728.8
483 | 70.0.3728.71
484 | 70.0.3728.59
485 | 70.0.3728.46
486 | 70.0.3728.21
487 | 70.0.3728.189
488 | 70.0.3728.144
489 | 70.0.3728.133
490 | 70.0.3728.119
491 | 70.0.3728.106
492 | 70.0.3728.0
493 | 70.0.3721.0
494 | 70.0.3714.0
495 | 70.0.3707.0
496 | 70.0.3701.0
497 | 70.0.3693.0
498 | 69.0.3686.95
499 | 69.0.3686.77
500 | 69.0.3686.7
501 | 69.0.3686.57
502 | 69.0.3686.49
503 | 69.0.3686.36
504 | 69.0.3686.30
505 | 69.0.3686.21
506 | 69.0.3686.2
507 | 69.0.3686.12
508 | 69.0.3686.0
509 | 69.0.3679.0
510 | 69.0.3673.0
511 | 69.0.3665.0
512 | 69.0.3660.0
513 | 69.0.3653.0
514 | 69.0.3651.0
515 | 69.0.3645.0
516 | 69.0.3638.0
517 | 69.0.3630.0
518 | 69.0.3623.0
519 | 68.0.3618.91
520 | 68.0.3618.63
521 | 68.0.3618.56
522 | 68.0.3618.5
523 | 68.0.3618.45/68.0.3618.46
524 | 68.0.3618.41
525 | 68.0.3618.36
526 | 68.0.3618.31
527 | 68.0.3618.3
528 | 68.0.3618.24
529 | 68.0.3618.18
530 | 68.0.3618.173
531 | 68.0.3618.165
532 | 68.0.3618.104
533 | 68.0.3616.0
534 | 68.0.3609.0
535 | 68.0.3602.0
536 | 68.0.3590.0
537 | 68.0.3581.0
538 | 67.0.3575.97
539 | 67.0.3575.8
540 | 67.0.3575.79
541 | 67.0.3575.53
542 | 67.0.3575.31
543 | 67.0.3575.28
544 | 67.0.3575.23
545 | 67.0.3575.2
546 | 67.0.3575.137
547 | 67.0.3575.13
548 | 67.0.3575.115
549 | 67.0.3574.0
550 | 67.0.3564.0
551 | 67.0.3554.0
552 | 67.0.3541.0
553 | 67.0.3536.0
554 | 67.0.3523.0
555 | 66.0.3515.72
556 | 66.0.3515.7
557 | 66.0.3515.44
558 | 66.0.3515.36
559 | 66.0.3515.3
560 | 66.0.3515.27
561 | 66.0.3515.21
562 | 66.0.3515.2
563 | 66.0.3515.14
564 | 66.0.3515.103
565 | 66.0.3511.0
566 | 66.0.3508.0
567 | 66.0.3502.0
568 | 66.0.3494.0
569 | 66.0.3487.0
570 | 66.0.3480.0
571 | 66.0.3475.0
572 | 66.0.3472.0
573 | 65.0.3467.78
574 | 65.0.3467.72
575 | 65.0.3467.7
576 | 65.0.3467.69
577 | 65.0.3467.62
578 | 65.0.3467.48
579 | 65.0.3467.38
580 | 65.0.3467.32
581 | 65.0.3467.24
582 | 65.0.3467.16
583 | 65.0.3466.0
584 | 65.0.3459.0
585 | 65.0.3454.0
586 | 65.0.3450.0
587 | 65.0.3445.0
588 | 65.0.3437.0
589 | 65.0.3430.0
590 | 65.0.3425.0
591 | 64.0.3417.92
592 | 64.0.3417.83
593 | 64.0.3417.8
594 | 64.0.3417.73
595 | 64.0.3417.61
596 | 64.0.3417.54
597 | 64.0.3417.47
598 | 64.0.3417.41
599 | 64.0.3417.32
600 | 64.0.3417.19
601 | 64.0.3417.119
602 | 64.0.3417.11
603 | 64.0.3416.0
604 | 64.0.3409.0
605 | 64.0.3407.0
606 | 64.0.3401.0
607 | 64.0.3396.0
608 | 64.0.3394.0
609 | 64.0.3388.0
610 | 64.0.3380.0
611 | 64.0.3372.0
612 | 63.0.3368.94
613 | 63.0.3368.88
614 | 63.0.3368.8
615 | 63.0.3368.66
616 | 63.0.3368.53
617 | 63.0.3368.51
618 | 63.0.3368.43
619 | 63.0.3368.35
620 | 63.0.3368.33
621 | 63.0.3368.29
622 | 63.0.3368.22
623 | 63.0.3368.17
624 | 63.0.3368.14
625 | 63.0.3367.0
626 | 63.0.3359.0
627 | 63.0.3353.0
628 | 63.0.3349.0
629 | 63.0.3347.0
630 | 62.0.3331.99
631 | 62.0.3331.8
632 | 62.0.3331.66
633 | 62.0.3331.55
634 | 62.0.3331.5
635 | 62.0.3331.43
636 | 62.0.3331.2
637 | 62.0.3331.18
638 | 62.0.3331.14
639 | 62.0.3331.119
640 | 62.0.3331.116
641 | 62.0.3331.10
642 | 62.0.3323.0
643 | 62.0.3319.0
644 | 61.0.3298.6
645 | 61.0.3298.3
646 | 61.0.3296.0
647 | 61.0.3290.0
648 | 61.0.3282.0
649 | 61.0.3275.0
650 | 61.0.3271.0
651 | 61.0.3268.0
652 | 60.0.3255.95
653 | 60.0.3255.84
654 | 60.0.3255.83
655 | 60.0.3255.8
656 | 60.0.3255.79
657 | 60.0.3255.70
658 | 60.0.3255.60
659 | 60.0.3255.59
660 | 60.0.3255.57
661 | 60.0.3255.56
662 | 60.0.3255.4
663 | 60.0.3255.37
664 | 60.0.3255.27
665 | 60.0.3255.20
666 | 60.0.3255.170
667 | 60.0.3255.151
668 | 60.0.3255.15
669 | 60.0.3255.116
670 | 60.0.3255.109
671 | 60.0.3255.103
672 | 60.0.3254.0
673 | 60.0.3248.0
674 | 60.0.3242.0
675 | 60.0.3236.0
676 |
--------------------------------------------------------------------------------
/modules/headers/versions_windows.txt:
--------------------------------------------------------------------------------
1 | Windows NT 10.0; Win64; x64
2 | Windows NT 10.0; WOW64
3 |
--------------------------------------------------------------------------------
/modules/hook/hook.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package hook
6 |
7 | import (
8 | "net/http"
9 |
10 | "github.com/philippta/flyscrape"
11 | )
12 |
13 | type Module struct {
14 | AdaptTransportFn func(http.RoundTripper) http.RoundTripper
15 | ValidateRequestFn func(*flyscrape.Request) bool
16 | BuildRequestFn func(*flyscrape.Request)
17 | ReceiveResponseFn func(*flyscrape.Response)
18 | ProvisionFn func(flyscrape.Context)
19 | FinalizeFn func()
20 | }
21 |
22 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
23 | return flyscrape.ModuleInfo{
24 | ID: "hook",
25 | New: func() flyscrape.Module { return new(Module) },
26 | }
27 | }
28 |
29 | func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
30 | if m.AdaptTransportFn == nil {
31 | return t
32 | }
33 | return m.AdaptTransportFn(t)
34 | }
35 |
36 | func (m Module) ValidateRequest(r *flyscrape.Request) bool {
37 | if m.ValidateRequestFn == nil {
38 | return true
39 | }
40 | return m.ValidateRequestFn(r)
41 | }
42 |
43 | func (m Module) BuildRequest(r *flyscrape.Request) {
44 | if m.BuildRequestFn == nil {
45 | return
46 | }
47 | m.BuildRequestFn(r)
48 | }
49 |
50 | func (m Module) ReceiveResponse(r *flyscrape.Response) {
51 | if m.ReceiveResponseFn == nil {
52 | return
53 | }
54 | m.ReceiveResponseFn(r)
55 | }
56 |
57 | func (m Module) Provision(ctx flyscrape.Context) {
58 | if m.ProvisionFn == nil {
59 | return
60 | }
61 | m.ProvisionFn(ctx)
62 | }
63 |
64 | func (m Module) Finalize() {
65 | if m.FinalizeFn == nil {
66 | return
67 | }
68 | m.FinalizeFn()
69 | }
70 |
71 | var (
72 | _ flyscrape.TransportAdapter = Module{}
73 | _ flyscrape.RequestValidator = Module{}
74 | _ flyscrape.RequestBuilder = Module{}
75 | _ flyscrape.ResponseReceiver = Module{}
76 | _ flyscrape.Provisioner = Module{}
77 | _ flyscrape.Finalizer = Module{}
78 | )
79 |
--------------------------------------------------------------------------------
/modules/output/json/json.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package json
6 |
7 | import (
8 | "bytes"
9 | "encoding/json"
10 | "fmt"
11 | "io"
12 | "log"
13 | "os"
14 | "sync"
15 | "time"
16 |
17 | "github.com/philippta/flyscrape"
18 | )
19 |
20 | func init() {
21 | flyscrape.RegisterModule(Module{})
22 | }
23 |
24 | type Module struct {
25 | Output struct {
26 | Format string `json:"format"`
27 | File string `json:"file"`
28 | } `json:"output"`
29 |
30 | once bool
31 | w io.WriteCloser
32 | mu *sync.Mutex
33 | }
34 |
35 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
36 | return flyscrape.ModuleInfo{
37 | ID: "output.json",
38 | New: func() flyscrape.Module { return new(Module) },
39 | }
40 | }
41 |
42 | func (m *Module) Provision(ctx flyscrape.Context) {
43 | if m.disabled() {
44 | return
45 | }
46 |
47 | m.mu = &sync.Mutex{}
48 |
49 | if m.Output.File == "" {
50 | m.w = nopCloser{os.Stdout}
51 | return
52 | }
53 |
54 | f, err := os.Create(m.Output.File)
55 | if err != nil {
56 | log.Printf("failed to create file %q: %v", m.Output.File, err)
57 | os.Exit(1)
58 | }
59 | m.w = f
60 | }
61 |
62 | func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
63 | if m.disabled() {
64 | return
65 | }
66 |
67 | if resp.Error == nil && resp.Data == nil {
68 | return
69 | }
70 |
71 | o := output{
72 | URL: resp.Request.URL,
73 | Data: resp.Data,
74 | Timestamp: time.Now(),
75 | }
76 | if resp.Error != nil {
77 | o.Error = resp.Error.Error()
78 | }
79 |
80 | m.mu.Lock()
81 | defer m.mu.Unlock()
82 |
83 | if !m.once {
84 | fmt.Fprintln(m.w, "[")
85 | m.once = true
86 | } else {
87 | fmt.Fprintln(m.w, ",")
88 | }
89 |
90 | var buf bytes.Buffer
91 | enc := json.NewEncoder(&buf)
92 | enc.SetEscapeHTML(false)
93 | enc.SetIndent(" ", " ")
94 | enc.Encode(o)
95 |
96 | fmt.Fprint(m.w, " ")
97 | fmt.Fprint(m.w, buf.String()[:buf.Len()-1])
98 | }
99 |
100 | func (m *Module) Finalize() {
101 | if m.disabled() {
102 | return
103 | }
104 | if m.once {
105 | fmt.Fprintln(m.w, "\n]")
106 | }
107 | m.w.Close()
108 | }
109 |
110 | func (m *Module) disabled() bool {
111 | return m.Output.Format != "json" && m.Output.Format != ""
112 | }
113 |
114 | type output struct {
115 | URL string `json:"url,omitempty"`
116 | Data any `json:"data,omitempty"`
117 | Error string `json:"error,omitempty"`
118 | Timestamp time.Time `json:"timestamp,omitempty"`
119 | }
120 |
121 | type nopCloser struct {
122 | io.Writer
123 | }
124 |
125 | func (c nopCloser) Write(p []byte) (n int, err error) {
126 | return c.Writer.Write(p)
127 | }
128 |
129 | func (c nopCloser) Close() error {
130 | return nil
131 | }
132 |
133 | var (
134 | _ flyscrape.Provisioner = (*Module)(nil)
135 | _ flyscrape.ResponseReceiver = (*Module)(nil)
136 | _ flyscrape.Finalizer = (*Module)(nil)
137 | )
138 |
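The module streams results as a single JSON array: the first record is preceded by "[", every later record by ",", and Finalize writes the closing "]". A small sketch that drives the module by hand to show the shape it writes to stdout; the URL and data are illustrative:

package main

import (
	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/output/json"
)

func main() {
	mod := &json.Module{}
	mod.Output.Format = "json" // leaving it empty also selects JSON, the default format
	mod.Provision(nil)         // the context argument is unused by this module

	// Feed one fake response; the module prints "[" followed by the record.
	mod.ReceiveResponse(&flyscrape.Response{
		Request: &flyscrape.Request{URL: "https://example.com/"}, // illustrative
		Data:    map[string]any{"title": "Example Domain"},
	})

	// Prints the closing "]".
	mod.Finalize()
}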
--------------------------------------------------------------------------------
/modules/output/ndjson/ndjson.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package ndjson
6 |
7 | import (
8 | "encoding/json"
9 | "io"
10 | "log"
11 | "os"
12 | "sync"
13 | "time"
14 |
15 | "github.com/philippta/flyscrape"
16 | )
17 |
18 | func init() {
19 | flyscrape.RegisterModule(Module{})
20 | }
21 |
22 | type Module struct {
23 | Output struct {
24 | Format string `json:"format"`
25 | File string `json:"file"`
26 | } `json:"output"`
27 |
28 | w io.WriteCloser
29 | mu *sync.Mutex
30 | }
31 |
32 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
33 | return flyscrape.ModuleInfo{
34 | ID: "output.ndjson",
35 | New: func() flyscrape.Module { return new(Module) },
36 | }
37 | }
38 |
39 | func (m *Module) Provision(ctx flyscrape.Context) {
40 | if m.disabled() {
41 | return
42 | }
43 |
44 | m.mu = &sync.Mutex{}
45 |
46 | if m.Output.File == "" {
47 | m.w = nopCloser{os.Stdout}
48 | return
49 | }
50 |
51 | f, err := os.Create(m.Output.File)
52 | if err != nil {
53 | log.Printf("failed to create file %q: %v", m.Output.File, err)
54 | os.Exit(1)
55 | }
56 | m.w = f
57 | }
58 |
59 | func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
60 | if m.disabled() {
61 | return
62 | }
63 |
64 | if resp.Error == nil && resp.Data == nil {
65 | return
66 | }
67 |
68 | o := output{
69 | URL: resp.Request.URL,
70 | Data: resp.Data,
71 | Timestamp: time.Now(),
72 | }
73 | if resp.Error != nil {
74 | o.Error = resp.Error.Error()
75 | }
76 |
77 | m.mu.Lock()
78 | defer m.mu.Unlock()
79 |
80 | enc := json.NewEncoder(m.w)
81 | enc.SetEscapeHTML(false)
82 | enc.Encode(o)
83 | }
84 |
85 | func (m *Module) Finalize() {
86 | if m.disabled() {
87 | return
88 | }
89 | m.w.Close()
90 | }
91 |
92 | func (m *Module) disabled() bool {
93 | return m.Output.Format != "ndjson"
94 | }
95 |
96 | type output struct {
97 | URL string `json:"url,omitempty"`
98 | Data any `json:"data,omitempty"`
99 | Error string `json:"error,omitempty"`
100 | Timestamp time.Time `json:"timestamp,omitempty"`
101 | }
102 |
103 | type nopCloser struct {
104 | io.Writer
105 | }
106 |
107 | func (c nopCloser) Write(p []byte) (n int, err error) {
108 | return c.Writer.Write(p)
109 | }
110 |
111 | func (c nopCloser) Close() error {
112 | return nil
113 | }
114 |
115 | var (
116 | _ flyscrape.Provisioner = (*Module)(nil)
117 | _ flyscrape.ResponseReceiver = (*Module)(nil)
118 | _ flyscrape.Finalizer = (*Module)(nil)
119 | )
120 |
--------------------------------------------------------------------------------
/modules/proxy/proxy.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package proxy
6 |
7 | import (
8 | "crypto/tls"
9 | "math/rand"
10 | "net/http"
11 | "net/url"
12 |
13 | "github.com/philippta/flyscrape"
14 | )
15 |
16 | func init() {
17 | flyscrape.RegisterModule(Module{})
18 | }
19 |
20 | type Module struct {
21 | Proxies []string `json:"proxies"`
22 | Proxy string `json:"proxy"`
23 |
24 | transports []*http.Transport
25 | }
26 |
27 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
28 | return flyscrape.ModuleInfo{
29 | ID: "proxy",
30 | New: func() flyscrape.Module { return new(Module) },
31 | }
32 | }
33 |
34 | func (m *Module) Provision(ctx flyscrape.Context) {
35 | if m.disabled() {
36 | return
37 | }
38 |
39 | for _, purl := range append(m.Proxies, m.Proxy) {
40 | if purl == "" {
41 | continue
42 | }
43 | if parsed, err := url.Parse(purl); err == nil {
44 | m.transports = append(m.transports, &http.Transport{
45 | Proxy: http.ProxyURL(parsed),
46 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
47 | })
48 | }
49 |
50 | }
51 | }
52 |
53 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
54 | if m.disabled() {
55 | return t
56 | }
57 |
58 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
59 | transport := m.transports[rand.Intn(len(m.transports))]
60 | return transport.RoundTrip(r)
61 | })
62 | }
63 |
64 | func (m *Module) disabled() bool {
65 | return len(m.Proxies) == 0 && m.Proxy == ""
66 | }
67 |
--------------------------------------------------------------------------------
/modules/proxy/proxy_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package proxy_test
6 |
7 | import (
8 | "net/http"
9 | "net/http/httptest"
10 | "testing"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/philippta/flyscrape/modules/proxy"
14 | "github.com/philippta/flyscrape/modules/starturl"
15 | "github.com/stretchr/testify/require"
16 | )
17 |
18 | func TestProxy(t *testing.T) {
19 | var called bool
20 | p := newProxy(func() { called = true })
21 | defer p.Close()
22 |
23 | mods := []flyscrape.Module{
24 | &starturl.Module{URL: "http://www.example.com"},
25 | &proxy.Module{
26 | Proxies: []string{p.URL},
27 | },
28 | }
29 |
30 | scraper := flyscrape.NewScraper()
31 | scraper.Modules = mods
32 | scraper.Run()
33 |
34 | require.True(t, called)
35 | }
36 |
37 | func TestProxyMultiple(t *testing.T) {
38 | calls := []int{0, 0, 0}
39 | p0 := newProxy(func() { calls[0]++ })
40 | p1 := newProxy(func() { calls[1]++ })
41 | p2 := newProxy(func() { calls[2]++ })
42 | defer p0.Close()
43 | defer p1.Close()
44 | defer p2.Close()
45 |
46 | mod := &proxy.Module{Proxies: []string{p0.URL, p1.URL}, Proxy: p2.URL}
47 | mod.Provision(nil)
48 | trans := mod.AdaptTransport(nil)
49 |
50 | req := httptest.NewRequest("GET", "http://www.example.com/", nil)
51 |
52 | for i := 0; i < 50; i++ {
53 | resp, err := trans.RoundTrip(req)
54 | require.NoError(t, err)
55 | require.Equal(t, http.StatusOK, resp.StatusCode)
56 | }
57 |
58 | require.Greater(t, calls[0], 1)
59 | require.Greater(t, calls[1], 1)
60 | require.Greater(t, calls[2], 1)
61 | require.Equal(t, 50, calls[0]+calls[1]+calls[2])
62 | }
63 |
64 | func newProxy(f func()) *httptest.Server {
65 | return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
66 | f()
67 | w.Write([]byte("response from proxy"))
68 | }))
69 | }
70 |
--------------------------------------------------------------------------------
/modules/ratelimit/ratelimit.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package ratelimit
6 |
7 | import (
8 | "math"
9 | "net/http"
10 | "time"
11 |
12 | "github.com/philippta/flyscrape"
13 | )
14 |
15 | func init() {
16 | flyscrape.RegisterModule(Module{})
17 | }
18 |
19 | type Module struct {
20 | Rate int `json:"rate"`
21 | Concurrency int `json:"concurrency"`
22 | Browser bool `json:"browser"`
23 |
24 | ticker *time.Ticker
25 | ratelimit chan struct{}
26 | concurrency chan struct{}
27 | }
28 |
29 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
30 | return flyscrape.ModuleInfo{
31 | ID: "ratelimit",
32 | New: func() flyscrape.Module { return new(Module) },
33 | }
34 | }
35 |
36 | func (m *Module) Provision(v flyscrape.Context) {
37 | if m.rateLimitEnabled() {
38 | rate := time.Duration(float64(time.Minute) / float64(m.Rate))
39 | m.ticker = time.NewTicker(rate)
40 | m.ratelimit = make(chan struct{}, int(math.Max(float64(m.Rate)/10, 1)))
41 |
42 | go func() {
43 | m.ratelimit <- struct{}{}
44 | for range m.ticker.C {
45 | m.ratelimit <- struct{}{}
46 | }
47 | }()
48 | }
49 |
50 | if m.browserEnabled() && !m.concurrencyEnabled() {
51 | m.Concurrency = 1
52 | }
53 |
54 | if m.concurrencyEnabled() {
55 | m.concurrency = make(chan struct{}, m.Concurrency)
56 | for i := 0; i < m.Concurrency; i++ {
57 | m.concurrency <- struct{}{}
58 | }
59 | }
60 | }
61 |
62 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
63 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
64 | if m.rateLimitEnabled() {
65 | <-m.ratelimit
66 | }
67 |
68 | if m.concurrencyEnabled() {
69 | <-m.concurrency
70 | defer func() { m.concurrency <- struct{}{} }()
71 | }
72 |
73 | return t.RoundTrip(r)
74 | })
75 | }
76 |
77 | func (m *Module) Finalize() {
78 | if m.rateLimitEnabled() {
79 | m.ticker.Stop()
80 | }
81 | }
82 |
83 | func (m *Module) rateLimitEnabled() bool {
84 | return m.Rate != 0
85 | }
86 |
87 | func (m *Module) concurrencyEnabled() bool {
88 | return m.Concurrency > 0
89 | }
90 |
91 | func (m *Module) browserEnabled() bool {
92 | return m.Browser
93 | }
94 |
95 | var (
96 | _ flyscrape.TransportAdapter = (*Module)(nil)
97 | _ flyscrape.Provisioner = (*Module)(nil)
98 | _ flyscrape.Finalizer = (*Module)(nil)
99 | )
100 |
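Provision turns the configured requests-per-minute into a ticker that fires every time.Minute/Rate and a token buffer of max(Rate/10, 1) for short bursts, while Concurrency caps in-flight requests with a semaphore. A sketch with assumed example values, annotated with the numbers that arithmetic yields:

package main

import (
	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/ratelimit"
	"github.com/philippta/flyscrape/modules/starturl"
)

func main() {
	scraper := flyscrape.NewScraper()
	scraper.Modules = []flyscrape.Module{
		&starturl.Module{URL: "https://example.com/"}, // illustrative start URL
		&ratelimit.Module{
			// Rate is requests per minute. 120 req/min means the ticker
			// fires every time.Minute/120 = 500ms and the buffer holds
			// up to max(120/10, 1) = 12 tokens for short bursts.
			Rate: 120,
			// At most 4 requests in flight at once.
			Concurrency: 4,
		},
	}
	scraper.Run()
}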
--------------------------------------------------------------------------------
/modules/ratelimit/ratelimit_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package ratelimit_test
6 |
7 | import (
8 | "net/http"
9 | "sync"
10 | "testing"
11 | "time"
12 |
13 | "github.com/philippta/flyscrape"
14 | "github.com/philippta/flyscrape/modules/followlinks"
15 | "github.com/philippta/flyscrape/modules/hook"
16 | "github.com/philippta/flyscrape/modules/ratelimit"
17 | "github.com/philippta/flyscrape/modules/starturl"
18 | "github.com/stretchr/testify/require"
19 | )
20 |
21 | func TestRatelimit(t *testing.T) {
22 | var times []time.Time
23 | var mu sync.Mutex
24 |
25 | mods := []flyscrape.Module{
26 | &starturl.Module{URL: "http://www.example.com"},
27 | &followlinks.Module{},
28 | hook.Module{
29 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
30 | return flyscrape.MockTransport(200, `<a href="foo">foo</a>`)
31 | },
32 | ReceiveResponseFn: func(r *flyscrape.Response) {
33 | mu.Lock()
34 | times = append(times, time.Now())
35 | mu.Unlock()
36 | },
37 | },
38 | &ratelimit.Module{
39 | Rate: 240,
40 | },
41 | }
42 |
43 | start := time.Now()
44 | scraper := flyscrape.NewScraper()
45 | scraper.Modules = mods
46 | scraper.Run()
47 |
48 | first := times[0].Add(-250 * time.Millisecond)
49 | second := times[1].Add(-500 * time.Millisecond)
50 |
51 | require.Less(t, first.Sub(start), 250*time.Millisecond)
52 | require.Less(t, second.Sub(start), 250*time.Millisecond)
53 |
54 | require.Less(t, start.Sub(first), 250*time.Millisecond)
55 | require.Less(t, start.Sub(second), 250*time.Millisecond)
56 | }
57 |
58 | func TestRatelimitConcurrency(t *testing.T) {
59 | var times []time.Time
60 | var mu sync.Mutex
61 |
62 | mods := []flyscrape.Module{
63 | &starturl.Module{URL: "http://www.example.com"},
64 | &followlinks.Module{},
65 | hook.Module{
66 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
67 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
68 | mu.Lock()
69 | times = append(times, time.Now())
70 | mu.Unlock()
71 |
72 | time.Sleep(10 * time.Millisecond)
73 | return flyscrape.MockResponse(200, `
74 | <a href="/1">1</a>
75 | <a href="/2">2</a>
76 | <a href="/3">3</a>
77 | <a href="/4">4</a>
78 | `)
79 | })
80 | },
81 | },
82 | &ratelimit.Module{
83 | Concurrency: 2,
84 | },
85 | }
86 |
87 | scraper := flyscrape.NewScraper()
88 | scraper.Modules = mods
89 | scraper.Run()
90 |
91 | require.Len(t, times, 5)
92 | require.Less(t, times[2].Sub(times[1]), time.Millisecond)
93 | require.Less(t, times[4].Sub(times[3]), time.Millisecond)
94 | }
95 |
--------------------------------------------------------------------------------
/modules/retry/retry.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package retry
6 |
7 | import (
8 | "errors"
9 | "io"
10 | "net"
11 | "net/http"
12 | "slices"
13 | "strconv"
14 | "time"
15 |
16 | "github.com/philippta/flyscrape"
17 | )
18 |
19 | func init() {
20 | flyscrape.RegisterModule(Module{})
21 | }
22 |
23 | type Module struct {
24 | ticker *time.Ticker
25 | semaphore chan struct{}
26 |
27 | RetryDelays []time.Duration
28 | }
29 |
30 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
31 | return flyscrape.ModuleInfo{
32 | ID: "retry",
33 | New: func() flyscrape.Module { return new(Module) },
34 | }
35 | }
36 |
37 | func (m *Module) Provision(flyscrape.Context) {
38 | if m.RetryDelays == nil {
39 | m.RetryDelays = defaultRetryDelays
40 | }
41 | }
42 |
43 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
44 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
45 | resp, err := t.RoundTrip(r)
46 | if !shouldRetry(resp, err) {
47 | return resp, err
48 | }
49 |
50 | for _, delay := range m.RetryDelays {
51 | drainBody(resp, err)
52 |
53 | time.Sleep(retryAfter(resp, delay))
54 |
55 | resp, err = t.RoundTrip(r)
56 | if !shouldRetry(resp, err) {
57 | break
58 | }
59 | }
60 |
61 | return resp, err
62 | })
63 | }
64 |
65 | func shouldRetry(resp *http.Response, err error) bool {
66 | statusCodes := []int{
67 | http.StatusForbidden,
68 | http.StatusRequestTimeout,
69 | http.StatusTooEarly,
70 | http.StatusTooManyRequests,
71 | http.StatusInternalServerError,
72 | http.StatusBadGateway,
73 | http.StatusServiceUnavailable,
74 | http.StatusGatewayTimeout,
75 | }
76 |
77 | if resp != nil {
78 | if slices.Contains(statusCodes, resp.StatusCode) {
79 | return true
80 | }
81 | }
82 | if err == nil {
83 | return false
84 | }
85 | if _, ok := err.(net.Error); ok {
86 | return true
87 | }
88 | if errors.Is(err, io.ErrUnexpectedEOF) {
89 | return true
90 | }
91 |
92 | return false
93 | }
94 |
95 | func drainBody(resp *http.Response, err error) {
96 | if err == nil && resp != nil && resp.Body != nil {
97 | io.Copy(io.Discard, resp.Body)
98 | resp.Body.Close()
99 | }
100 | }
101 |
102 | func retryAfter(resp *http.Response, fallback time.Duration) time.Duration {
103 | if resp == nil {
104 | return fallback
105 | }
106 |
107 | timeexp := resp.Header.Get("Retry-After")
108 | if timeexp == "" {
109 | return fallback
110 | }
111 |
112 | if seconds, err := strconv.Atoi(timeexp); err == nil {
113 | return time.Duration(seconds) * time.Second
114 | }
115 |
116 | formats := []string{
117 | time.RFC1123, // HTTP Spec
118 | time.RFC1123Z,
119 | time.ANSIC,
120 | time.UnixDate,
121 | time.RubyDate,
122 | time.RFC822,
123 | time.RFC822Z,
124 | time.RFC850,
125 | time.RFC3339,
126 | }
127 | for _, format := range formats {
128 | if t, err := time.Parse(format, timeexp); err == nil {
129 | return time.Until(t)
130 | }
131 | }
132 |
133 | return fallback
134 | }
135 |
136 | var defaultRetryDelays = []time.Duration{
137 | 1 * time.Second,
138 | 2 * time.Second,
139 | 5 * time.Second,
140 | 10 * time.Second,
141 | }
142 |
143 | var (
144 | _ flyscrape.TransportAdapter = (*Module)(nil)
145 | _ flyscrape.Provisioner = (*Module)(nil)
146 | )
147 |
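A request is retried after each configured delay when the response carries one of the retryable status codes or a network error, and a Retry-After header from the server takes precedence over the configured delay. A sketch with illustrative custom delays; the defaults are 1s, 2s, 5s and 10s:

package main

import (
	"time"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/retry"
	"github.com/philippta/flyscrape/modules/starturl"
)

func main() {
	scraper := flyscrape.NewScraper()
	scraper.Modules = []flyscrape.Module{
		&starturl.Module{URL: "https://example.com/"}, // illustrative start URL
		&retry.Module{
			// Up to three retries: after 500ms, 2s and 8s, unless the
			// server sends a Retry-After header, which wins.
			RetryDelays: []time.Duration{
				500 * time.Millisecond,
				2 * time.Second,
				8 * time.Second,
			},
		},
	}
	scraper.Run()
}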
--------------------------------------------------------------------------------
/modules/retry/retry_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package retry_test
6 |
7 | import (
8 | "fmt"
9 | "io"
10 | "net"
11 | "net/http"
12 | "testing"
13 | "time"
14 |
15 | "github.com/philippta/flyscrape"
16 | "github.com/philippta/flyscrape/modules/followlinks"
17 | "github.com/philippta/flyscrape/modules/hook"
18 | "github.com/philippta/flyscrape/modules/retry"
19 | "github.com/philippta/flyscrape/modules/starturl"
20 | "github.com/stretchr/testify/require"
21 | )
22 |
23 | func TestRetry(t *testing.T) {
24 | t.Parallel()
25 | var count int
26 |
27 | mods := []flyscrape.Module{
28 | &starturl.Module{URL: "http://www.example.com"},
29 | &followlinks.Module{},
30 | hook.Module{
31 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
32 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
33 | count++
34 | return flyscrape.MockResponse(http.StatusServiceUnavailable, "service unavailable")
35 | })
36 | },
37 | },
38 | &retry.Module{
39 | RetryDelays: []time.Duration{
40 | 100 * time.Millisecond,
41 | 200 * time.Millisecond,
42 | },
43 | },
44 | }
45 |
46 | scraper := flyscrape.NewScraper()
47 | scraper.Modules = mods
48 | scraper.Run()
49 |
50 | require.Equal(t, 3, count)
51 | }
52 |
53 | func TestRetryStatusCodes(t *testing.T) {
54 | t.Parallel()
55 |
56 | tests := []struct {
57 | statusCode int
58 | retry bool
59 | }{
60 | {statusCode: http.StatusBadGateway, retry: true},
61 | {statusCode: http.StatusTooManyRequests, retry: true},
62 | {statusCode: http.StatusBadRequest, retry: false},
63 | {statusCode: http.StatusOK, retry: false},
64 | }
65 |
66 | for _, test := range tests {
67 | t.Run(fmt.Sprintf("%s_%t", http.StatusText(test.statusCode), test.retry), func(t *testing.T) {
68 | t.Parallel()
69 | var count int
70 | mods := []flyscrape.Module{
71 | &starturl.Module{URL: "http://www.example.com"},
72 | &followlinks.Module{},
73 | hook.Module{
74 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
75 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
76 | count++
77 | return flyscrape.MockResponse(test.statusCode, http.StatusText(test.statusCode))
78 | })
79 | },
80 | },
81 | &retry.Module{
82 | RetryDelays: []time.Duration{
83 | 100 * time.Millisecond,
84 | 200 * time.Millisecond,
85 | },
86 | },
87 | }
88 |
89 | scraper := flyscrape.NewScraper()
90 | scraper.Modules = mods
91 | scraper.Run()
92 |
93 | if test.retry {
94 | require.NotEqual(t, 1, count)
95 | } else {
96 | require.Equal(t, 1, count)
97 | }
98 | })
99 | }
100 | }
101 |
102 | func TestRetryErrors(t *testing.T) {
103 | t.Parallel()
104 |
105 | tests := []struct {
106 | error error
107 | }{
108 | {error: &net.OpError{}},
109 | {error: io.ErrUnexpectedEOF},
110 | }
111 |
112 | for _, test := range tests {
113 | t.Run(fmt.Sprintf("%T", test.error), func(t *testing.T) {
114 | t.Parallel()
115 | var count int
116 | mods := []flyscrape.Module{
117 | &starturl.Module{URL: "http://www.example.com"},
118 | &followlinks.Module{},
119 | hook.Module{
120 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
121 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
122 | return nil, test.error
123 | })
124 | },
125 | },
126 | &retry.Module{
127 | RetryDelays: []time.Duration{
128 | 100 * time.Millisecond,
129 | 200 * time.Millisecond,
130 | },
131 | },
132 | }
133 |
134 | scraper := flyscrape.NewScraper()
135 | scraper.Modules = mods
136 | scraper.Run()
137 |
138 | require.NotEqual(t, 1, count)
139 | })
140 | }
141 | }
142 |
--------------------------------------------------------------------------------
/modules/starturl/starturl.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package starturl
6 |
7 | import (
8 | "github.com/philippta/flyscrape"
9 | )
10 |
11 | func init() {
12 | flyscrape.RegisterModule(Module{})
13 | }
14 |
15 | type Module struct {
16 | URL string `json:"url"`
17 | URLs []string `json:"urls"`
18 | }
19 |
20 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
21 | return flyscrape.ModuleInfo{
22 | ID: "starturl",
23 | New: func() flyscrape.Module { return new(Module) },
24 | }
25 | }
26 |
27 | func (m *Module) Provision(ctx flyscrape.Context) {
28 | if m.URL != "" {
29 | ctx.Visit(m.URL)
30 | }
31 |
32 | for _, url := range m.URLs {
33 | ctx.Visit(url)
34 | }
35 | }
36 |
37 | var _ flyscrape.Provisioner = (*Module)(nil)
38 |
--------------------------------------------------------------------------------
/modules/starturl/starturl_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package starturl_test
6 |
7 | import (
8 | "net/http"
9 | "sync"
10 | "testing"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/philippta/flyscrape/modules/hook"
14 | "github.com/philippta/flyscrape/modules/starturl"
15 | "github.com/stretchr/testify/require"
16 | )
17 |
18 | func TestStartURL(t *testing.T) {
19 | var url string
20 | var depth int
21 |
22 | mods := []flyscrape.Module{
23 | &starturl.Module{URL: "http://www.example.com/foo/bar"},
24 | hook.Module{
25 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
26 | return flyscrape.MockTransport(200, "")
27 | },
28 | BuildRequestFn: func(r *flyscrape.Request) {
29 | url = r.URL
30 | depth = r.Depth
31 | },
32 | },
33 | }
34 |
35 | scraper := flyscrape.NewScraper()
36 | scraper.Modules = mods
37 | scraper.Run()
38 |
39 | require.Equal(t, "http://www.example.com/foo/bar", url)
40 | require.Equal(t, 0, depth)
41 | }
42 |
43 | func TestStartURL_MultipleStartingURLs(t *testing.T) {
44 | testCases := []struct {
45 | name string
46 | startURLModFn func() *starturl.Module
47 | urls []string
48 | }{
49 | {
50 | name: ".URL and .URLs",
51 | startURLModFn: func() *starturl.Module {
52 | return &starturl.Module{
53 | URL: "http://www.example.com/foo",
54 | URLs: []string{
55 | "http://www.example.com/bar",
56 | "http://www.example.com/baz",
57 | },
58 | }
59 | },
60 | urls: []string{
61 | "http://www.example.com/foo",
62 | "http://www.example.com/bar",
63 | "http://www.example.com/baz",
64 | },
65 | },
66 | {
67 | name: "only .URL",
68 | startURLModFn: func() *starturl.Module {
69 | return &starturl.Module{
70 | URL: "http://www.example.com/foo",
71 | }
72 | },
73 | urls: []string{
74 | "http://www.example.com/foo",
75 | },
76 | },
77 | {
78 | name: "only .URLs",
79 | startURLModFn: func() *starturl.Module {
80 | return &starturl.Module{
81 | URLs: []string{
82 | "http://www.example.com/bar",
83 | "http://www.example.com/baz",
84 | },
85 | }
86 | },
87 | urls: []string{
88 | "http://www.example.com/bar",
89 | "http://www.example.com/baz",
90 | },
91 | },
92 | {
93 | name: "empty",
94 | startURLModFn: func() *starturl.Module {
95 | return &starturl.Module{}
96 | },
97 | urls: []string{},
98 | },
99 | }
100 |
101 | for _, tc := range testCases {
102 | t.Run(tc.name, func(t *testing.T) {
103 | urls := []string{}
104 | mu := sync.Mutex{}
105 |
106 | mods := []flyscrape.Module{
107 | tc.startURLModFn(),
108 | hook.Module{
109 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
110 | return flyscrape.MockTransport(http.StatusOK, "")
111 | },
112 | BuildRequestFn: func(r *flyscrape.Request) {
113 | mu.Lock()
114 | urls = append(urls, r.URL)
115 | mu.Unlock()
116 | },
117 | },
118 | }
119 |
120 | scraper := flyscrape.NewScraper()
121 | scraper.Modules = mods
122 | scraper.Run()
123 |
124 | require.ElementsMatch(t, tc.urls, urls)
125 | })
126 | }
127 | }
128 |
--------------------------------------------------------------------------------
/modules/urlfilter/urlfilter.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package urlfilter
6 |
7 | import (
8 | "regexp"
9 |
10 | "github.com/philippta/flyscrape"
11 | )
12 |
13 | func init() {
14 | flyscrape.RegisterModule(Module{})
15 | }
16 |
17 | type Module struct {
18 | URL string `json:"url"`
19 | URLs []string `json:"urls"`
20 | AllowedURLs []string `json:"allowedURLs"`
21 | BlockedURLs []string `json:"blockedURLs"`
22 |
23 | allowedURLsRE []*regexp.Regexp
24 | blockedURLsRE []*regexp.Regexp
25 | }
26 |
27 | func (Module) ModuleInfo() flyscrape.ModuleInfo {
28 | return flyscrape.ModuleInfo{
29 | ID: "urlfilter",
30 | New: func() flyscrape.Module { return new(Module) },
31 | }
32 | }
33 |
34 | func (m *Module) Provision(v flyscrape.Context) {
35 | if m.disabled() {
36 | return
37 | }
38 |
39 | for _, pat := range m.AllowedURLs {
40 | re, err := regexp.Compile(pat)
41 | if err != nil {
42 | continue
43 | }
44 | m.allowedURLsRE = append(m.allowedURLsRE, re)
45 | }
46 |
47 | for _, pat := range m.BlockedURLs {
48 | re, err := regexp.Compile(pat)
49 | if err != nil {
50 | continue
51 | }
52 | m.blockedURLsRE = append(m.blockedURLsRE, re)
53 | }
54 | }
55 |
56 | func (m *Module) ValidateRequest(r *flyscrape.Request) bool {
57 | if m.disabled() {
58 | return true
59 | }
60 |
61 | // allow root url
62 | if r.URL == m.URL {
63 | return true
64 | }
65 | for _, u := range m.URLs {
66 | if r.URL == u {
67 | return true
68 | }
69 | }
70 |
71 | // allow if no filter is set
72 | if len(m.allowedURLsRE) == 0 && len(m.blockedURLsRE) == 0 {
73 | return true
74 | }
75 |
76 | ok := false
77 | if len(m.allowedURLsRE) == 0 {
78 | ok = true
79 | }
80 |
81 | for _, re := range m.allowedURLsRE {
82 | if re.MatchString(r.URL) {
83 | ok = true
84 | break
85 | }
86 | }
87 |
88 | for _, re := range m.blockedURLsRE {
89 | if re.MatchString(r.URL) {
90 | ok = false
91 | break
92 | }
93 | }
94 |
95 | return ok
96 | }
97 |
98 | func (m *Module) disabled() bool {
99 | return len(m.AllowedURLs) == 0 && len(m.BlockedURLs) == 0
100 | }
101 |
102 | var (
103 | _ flyscrape.RequestValidator = (*Module)(nil)
104 | _ flyscrape.Provisioner = (*Module)(nil)
105 | )
106 |
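Provision compiles the allow and block patterns, and ValidateRequest decides per URL: the start URL always passes, an empty allow list admits everything, and a block match always wins over an allow match. A sketch that exercises the module directly; patterns and URLs are illustrative:

package main

import (
	"fmt"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/urlfilter"
)

func main() {
	mod := &urlfilter.Module{
		URL:         "https://example.com/",   // the start URL is always allowed
		AllowedURLs: []string{`/articles/\d+`}, // illustrative patterns
		BlockedURLs: []string{`/articles/13$`},
	}
	mod.Provision(nil) // compiles the patterns; the context is unused here

	for _, u := range []string{
		"https://example.com/",            // allowed: start URL
		"https://example.com/articles/42", // allowed: matches the allow pattern
		"https://example.com/articles/13", // blocked: the block pattern wins
		"https://example.com/about",       // blocked: matches no allow pattern
	} {
		fmt.Println(u, "allowed:", mod.ValidateRequest(&flyscrape.Request{URL: u}))
	}
}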
--------------------------------------------------------------------------------
/modules/urlfilter/urlfilter_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package urlfilter_test
6 |
7 | import (
8 | "net/http"
9 | "sync"
10 | "testing"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/philippta/flyscrape/modules/followlinks"
14 | "github.com/philippta/flyscrape/modules/hook"
15 | "github.com/philippta/flyscrape/modules/starturl"
16 | "github.com/philippta/flyscrape/modules/urlfilter"
17 | "github.com/stretchr/testify/require"
18 | )
19 |
20 | func TestURLFilterAllowed(t *testing.T) {
21 | var urls []string
22 | var mu sync.Mutex
23 |
24 | mods := []flyscrape.Module{
25 | &starturl.Module{URL: "http://www.example.com/"},
26 | &followlinks.Module{},
27 | &urlfilter.Module{
28 | URL: "http://www.example.com/",
29 | AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`},
30 | },
31 | hook.Module{
32 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
33 | return flyscrape.MockTransport(200, `
34 | <a href="/foo?id=123">123</a>
35 | <a href="/foo?id=ABC">ABC</a>
36 | <a href="/bar">bar</a>
37 | <a href="/barz">barz</a>`)
38 | },
39 | ReceiveResponseFn: func(r *flyscrape.Response) {
40 | mu.Lock()
41 | urls = append(urls, r.Request.URL)
42 | mu.Unlock()
43 | },
44 | },
45 | }
46 |
47 | scraper := flyscrape.NewScraper()
48 | scraper.Modules = mods
49 | scraper.Run()
50 |
51 | require.Len(t, urls, 3)
52 | require.Contains(t, urls, "http://www.example.com/")
53 | require.Contains(t, urls, "http://www.example.com/foo?id=123")
54 | require.Contains(t, urls, "http://www.example.com/bar")
55 | }
56 |
57 | func TestURLFilterBlocked(t *testing.T) {
58 | var urls []string
59 | var mu sync.Mutex
60 |
61 | mods := []flyscrape.Module{
62 | &starturl.Module{URL: "http://www.example.com/"},
63 | &followlinks.Module{},
64 | &urlfilter.Module{
65 | URL: "http://www.example.com/",
66 | BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`},
67 | },
68 | hook.Module{
69 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
70 | return flyscrape.MockTransport(200, `
71 | <a href="/foo?id=123">123</a>
72 | <a href="/foo?id=ABC">ABC</a>
73 | <a href="/bar">bar</a>
74 | <a href="/barz">barz</a>`)
75 | },
76 | ReceiveResponseFn: func(r *flyscrape.Response) {
77 | mu.Lock()
78 | urls = append(urls, r.Request.URL)
79 | mu.Unlock()
80 | },
81 | },
82 | }
83 |
84 | scraper := flyscrape.NewScraper()
85 | scraper.Modules = mods
86 | scraper.Run()
87 |
88 | require.Len(t, urls, 3)
89 | require.Contains(t, urls, "http://www.example.com/")
90 | require.Contains(t, urls, "http://www.example.com/foo?id=ABC")
91 | require.Contains(t, urls, "http://www.example.com/barz")
92 | }
93 |
--------------------------------------------------------------------------------
/scrape.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape
6 |
7 | import (
8 | "fmt"
9 | "io"
10 | "log"
11 | "net/http"
12 | "net/http/cookiejar"
13 | "strings"
14 | "sync"
15 |
16 | "github.com/cornelk/hashmap"
17 | )
18 |
19 | type Context interface {
20 | ScriptName() string
21 | Visit(url string)
22 | MarkVisited(url string)
23 | MarkUnvisited(url string)
24 | }
25 |
26 | type Request struct {
27 | Method string
28 | URL string
29 | Headers http.Header
30 | Cookies http.CookieJar
31 | Depth int
32 | }
33 |
34 | type Response struct {
35 | StatusCode int
36 | Headers http.Header
37 | Body []byte
38 | Data any
39 | Error error
40 | Request *Request
41 |
42 | Visit func(url string)
43 | }
44 |
45 | type target struct {
46 | url string
47 | depth int
48 | }
49 |
50 | func NewScraper() *Scraper {
51 | return &Scraper{}
52 | }
53 |
54 | type Scraper struct {
55 | ScrapeFunc ScrapeFunc
56 | Script string
57 | Modules []Module
58 | Client *http.Client
59 |
60 | wg sync.WaitGroup
61 | jobs chan target
62 | visited *hashmap.Map[string, struct{}]
63 | }
64 |
65 | func (s *Scraper) Visit(url string) {
66 | s.enqueueJob(url, 0)
67 | }
68 |
69 | func (s *Scraper) MarkVisited(url string) {
70 | s.visited.Insert(url, struct{}{})
71 | }
72 |
73 | func (s *Scraper) MarkUnvisited(url string) {
74 | s.visited.Del(url)
75 | }
76 |
77 | func (s *Scraper) ScriptName() string {
78 | return s.Script
79 | }
80 |
81 | func (s *Scraper) Run() {
82 | s.jobs = make(chan target, 1<<20)
83 | s.visited = hashmap.New[string, struct{}]()
84 |
85 | s.initClient()
86 |
87 | for _, mod := range s.Modules {
88 | if v, ok := mod.(Provisioner); ok {
89 | v.Provision(s)
90 | }
91 | }
92 |
93 | for _, mod := range s.Modules {
94 | if v, ok := mod.(TransportAdapter); ok {
95 | s.Client.Transport = v.AdaptTransport(s.Client.Transport)
96 | }
97 | }
98 |
99 | s.scrape()
100 | s.wg.Wait()
101 | close(s.jobs)
102 |
103 | for _, mod := range s.Modules {
104 | if v, ok := mod.(Finalizer); ok {
105 | v.Finalize()
106 | }
107 | }
108 | }
109 |
110 | func (s *Scraper) initClient() {
111 | if s.Client == nil {
112 | s.Client = &http.Client{}
113 | }
114 | if s.Client.Jar == nil {
115 | s.Client.Jar, _ = cookiejar.New(nil)
116 | }
117 | if s.Client.Transport == nil {
118 | s.Client.Transport = http.DefaultTransport
119 | }
120 | }
121 |
122 | func (s *Scraper) scrape() {
123 | for i := 0; i < 500; i++ {
124 | go func() {
125 | for job := range s.jobs {
126 | s.process(job.url, job.depth)
127 | s.wg.Done()
128 | }
129 | }()
130 | }
131 | }
132 |
133 | func (s *Scraper) process(url string, depth int) {
134 | request := &Request{
135 | Method: http.MethodGet,
136 | URL: url,
137 | Headers: http.Header{},
138 | Cookies: s.Client.Jar,
139 | Depth: depth,
140 | }
141 |
142 | response := &Response{
143 | Request: request,
144 | Visit: func(url string) {
145 | s.enqueueJob(url, depth+1)
146 | },
147 | }
148 |
149 | for _, mod := range s.Modules {
150 | if v, ok := mod.(RequestBuilder); ok {
151 | v.BuildRequest(request)
152 | }
153 | }
154 |
155 | req, err := http.NewRequest(request.Method, request.URL, nil)
156 | if err != nil {
157 | response.Error = err
158 | return
159 | }
160 | req.Header = request.Headers
161 |
162 | for _, mod := range s.Modules {
163 | if v, ok := mod.(RequestValidator); ok {
164 | if !v.ValidateRequest(request) {
165 | return
166 | }
167 | }
168 | }
169 |
170 | defer func() {
171 | for _, mod := range s.Modules {
172 | if v, ok := mod.(ResponseReceiver); ok {
173 | v.ReceiveResponse(response)
174 | }
175 | }
176 | }()
177 |
178 | resp, err := s.Client.Do(req)
179 | if err != nil {
180 | response.Error = err
181 | return
182 | }
183 | defer resp.Body.Close()
184 |
185 | response.StatusCode = resp.StatusCode
186 | response.Headers = resp.Header
187 |
188 | if response.StatusCode < 200 || response.StatusCode >= 300 {
189 | response.Error = fmt.Errorf("%d %s", response.StatusCode, http.StatusText(response.StatusCode))
190 | }
191 |
192 | response.Body, err = io.ReadAll(resp.Body)
193 | if err != nil {
194 | response.Error = err
195 | return
196 | }
197 |
198 | if s.ScrapeFunc != nil {
199 | func() {
200 | defer func() {
201 | if r := recover(); r != nil {
202 | log.Println(r)
203 | }
204 | }()
205 |
206 | p := ScrapeParams{
207 | HTML: string(response.Body),
208 | URL: request.URL,
209 | Process: s.processImmediate,
210 | Follow: func(url string) {
211 | s.enqueueJob(url, depth+1)
212 | },
213 | }
214 |
215 | response.Data, err = s.ScrapeFunc(p)
216 | if err != nil {
217 | response.Error = err
218 | return
219 | }
220 | }()
221 | }
222 | }
223 |
224 | func (s *Scraper) processImmediate(url string) ([]byte, error) {
225 | request := &Request{
226 | Method: http.MethodGet,
227 | URL: url,
228 | Headers: http.Header{},
229 | Cookies: s.Client.Jar,
230 | }
231 |
232 | for _, mod := range s.Modules {
233 | if v, ok := mod.(RequestBuilder); ok {
234 | v.BuildRequest(request)
235 | }
236 | }
237 |
238 | req, err := http.NewRequest(request.Method, request.URL, nil)
239 | if err != nil {
240 | return nil, err
241 | }
242 | req.Header = request.Headers
243 |
244 | for _, mod := range s.Modules {
245 | if v, ok := mod.(RequestValidator); ok {
246 | if !v.ValidateRequest(request) {
247 | return nil, nil
248 | }
249 | }
250 | }
251 |
252 | resp, err := s.Client.Do(req)
253 | if err != nil {
254 | return nil, err
255 | }
256 | defer resp.Body.Close()
257 |
258 | if resp.StatusCode < 200 || resp.StatusCode >= 300 {
259 | return nil, fmt.Errorf("%d %s", resp.StatusCode, http.StatusText(resp.StatusCode))
260 | }
261 |
262 | body, err := io.ReadAll(resp.Body)
263 | if err != nil {
264 | return nil, err
265 | }
266 |
267 | return body, nil
268 | }
269 |
270 | func (s *Scraper) enqueueJob(url string, depth int) {
271 | url = strings.TrimSpace(url)
272 | if url == "" {
273 | return
274 | }
275 |
276 | if _, ok := s.visited.Get(url); ok {
277 | return
278 | }
279 |
280 | s.wg.Add(1)
281 | select {
282 | case s.jobs <- target{url: url, depth: depth}:
283 | s.MarkVisited(url)
284 | default:
285 | log.Println("queue is full, can't add url:", url)
286 | s.wg.Done()
287 | }
288 | }
289 |
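Run provisions the modules, lets transport adapters wrap the HTTP client, and drains a bounded job queue with a pool of 500 workers, invoking the request and response hooks around every fetch. A minimal offline sketch in the same style as the tests in this repository, using MockTransport so no network access is required; the URL and markup are illustrative:

package main

import (
	"fmt"
	"net/http"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/followlinks"
	"github.com/philippta/flyscrape/modules/hook"
	"github.com/philippta/flyscrape/modules/starturl"
)

func main() {
	scraper := flyscrape.NewScraper()
	scraper.Modules = []flyscrape.Module{
		&starturl.Module{URL: "http://www.example.com/"}, // illustrative URL
		&followlinks.Module{},
		hook.Module{
			// Serve a canned page instead of hitting the network.
			AdaptTransportFn: func(t http.RoundTripper) http.RoundTripper {
				return flyscrape.MockTransport(200, `<a href="/next">next</a>`)
			},
			// Print every visited URL and its status code.
			ReceiveResponseFn: func(r *flyscrape.Response) {
				fmt.Println(r.Request.URL, r.StatusCode)
			},
		},
	}
	scraper.Run() // blocks until the queue is drained, then finalizes the modules
}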
--------------------------------------------------------------------------------
/template.js:
--------------------------------------------------------------------------------
1 | export const config = {
2 | // Specify the URL to start scraping from.
3 | url: "https://example.com/",
4 |
5 | // Enable rendering with a headless browser. (default = false)
6 | // browser: true,
7 |
8 | // Specify whether the browser should run headless. (default = true)
9 | // headless: false,
10 |
11 | // Specify multiple URLs to start scraping from. (default = [])
12 | // urls: [
13 | // "https://anothersite.com/",
14 | // "https://yetanother.com/",
15 | // ],
16 |
17 | // Specify how deep links should be followed. (default = 0, no follow)
18 | // depth: 5,
19 |
20 | // Specify the CSS selectors to follow. (default = ["a[href]"])
21 | // follow: [".next > a", ".related a"],
22 |
23 | // Specify the allowed domains. ['*'] for all. (default = domain from url)
24 | // allowedDomains: ["example.com", "anothersite.com"],
25 |
26 | // Specify the blocked domains. (default = none)
27 | // blockedDomains: ["somesite.com"],
28 |
29 | // Specify the allowed URLs as regex. (default = all allowed)
30 | // allowedURLs: ["/posts", "/articles/\d+"],
31 |
32 | // Specify the blocked URLs as regex. (default = none)
33 | // blockedURLs: ["/admin"],
34 |
35 | // Specify the rate in requests per minute. (default = no rate limit)
36 | // rate: 60,
37 |
38 | // Specify the number of concurrent requests. (default = no limit)
39 | // concurrency: 1,
40 |
41 | // Specify a single HTTP(S) proxy URL. (default = no proxy)
42 | // Note: Not compatible with browser mode.
43 | // proxy: "http://someproxy.com:8043",
44 |
45 | // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
46 | // Note: Not compatible with browser mode.
47 | // proxies: [
48 | // "http://someproxy.com:8043",
49 | // "http://someotherproxy.com:8043",
50 | // ],
51 |
52 | // Enable file-based request caching. (default = no cache)
53 | // cache: "file",
54 |
55 | // Specify the HTTP request headers. (default = none)
56 | // headers: {
57 | // "Authorization": "Bearer ...",
58 | // "User-Agent": "Mozilla ...",
59 | // },
60 |
61 | // Use the cookie store of your local browser. (default = off)
62 | // Options: "chrome" | "edge" | "firefox"
63 | // cookies: "chrome",
64 |
65 | // Specify the output options.
66 | // output: {
67 | // // Specify the output file. (default = stdout)
68 | // file: "results.json",
69 | //
70 | // // Specify the output format. (default = json)
71 | // // Options: "json" | "ndjson"
72 | // format: "json",
73 | // },
74 | };
75 |
76 | export default function({ doc, absoluteURL }) {
77 | const title = doc.find("h1");
78 | const link = doc.find("a");
79 |
80 | return {
81 | title: title.text(),
82 | link: {
83 | text: link.text(),
84 | url: absoluteURL(link.attr("href")),
85 | },
86 | };
87 | }
88 |
--------------------------------------------------------------------------------
/utils.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape
6 |
7 | import (
8 | "fmt"
9 | "io"
10 | "net/http"
11 | "strings"
12 | )
13 |
14 | const HeaderBypassCache = "X-Flyscrape-Bypass-Cache"
15 |
16 | type RoundTripFunc func(*http.Request) (*http.Response, error)
17 |
18 | func (f RoundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
19 | return f(r)
20 | }
21 |
22 | func MockTransport(statusCode int, html string) RoundTripFunc {
23 | return func(*http.Request) (*http.Response, error) {
24 | return MockResponse(statusCode, html)
25 | }
26 | }
27 |
28 | func MockResponse(statusCode int, html string) (*http.Response, error) {
29 | return &http.Response{
30 | StatusCode: statusCode,
31 | Status: fmt.Sprintf("%d %s", statusCode, http.StatusText(statusCode)),
32 | Body: io.NopCloser(strings.NewReader(html)),
33 | Header: http.Header{"Content-Type": []string{"text/html"}},
34 | }, nil
35 | }
36 |
--------------------------------------------------------------------------------
/watch.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape
6 |
7 | import (
8 | "errors"
9 | "fmt"
10 | "os"
11 | "time"
12 |
13 | "github.com/fsnotify/fsnotify"
14 | )
15 |
16 | var StopWatch = errors.New("stop watch")
17 |
18 | func Watch(path string, fn func(string) error) error {
19 | watcher, err := fsnotify.NewWatcher()
20 | if err != nil {
21 | return fmt.Errorf("creating file watcher: %w", err)
22 | }
23 | defer watcher.Close()
24 |
25 | if err := watcher.Add(path); err != nil {
26 | return fmt.Errorf("watching file %q: %w", path, err)
27 | }
28 |
29 | update := func() error {
30 | data, err := os.ReadFile(path)
31 | if err != nil {
32 | return err
33 | }
34 | return fn(string(data))
35 | }
36 |
37 | if err := update(); errors.Is(err, StopWatch) {
38 | return nil
39 | }
40 |
41 | for {
42 | select {
43 | case e, ok := <-watcher.Events:
44 | if !ok {
45 | return nil
46 | }
47 | if e.Has(fsnotify.Rename) {
48 | time.Sleep(10 * time.Millisecond)
49 | watcher.Remove(path)
50 | watcher.Add(path)
51 | }
52 | if e.Has(fsnotify.Write) || e.Has(fsnotify.Rename) {
53 | if err := update(); errors.Is(err, StopWatch) {
54 | return nil
55 | }
56 | }
57 | case err, ok := <-watcher.Errors:
58 | if !ok {
59 | return nil
60 | }
61 | if err != nil {
62 | return fmt.Errorf("watcher: %w", err)
63 | }
64 | }
65 | }
66 | }
67 |
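Watch reads the file once up front and then re-runs the callback on every write or rename event until the callback returns StopWatch, at which point Watch returns nil. A small usage sketch; the file path is illustrative:

package main

import (
	"fmt"
	"log"

	"github.com/philippta/flyscrape"
)

func main() {
	// Re-run some work every time script.js changes.
	err := flyscrape.Watch("script.js", func(src string) error { // illustrative path
		fmt.Println("script changed, new length:", len(src))
		return nil // return flyscrape.StopWatch to stop watching
	})
	if err != nil {
		log.Fatal(err)
	}
}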
--------------------------------------------------------------------------------
/watch_test.go:
--------------------------------------------------------------------------------
1 | // This Source Code Form is subject to the terms of the Mozilla Public
2 | // License, v. 2.0. If a copy of the MPL was not distributed with this
3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 |
5 | package flyscrape_test
6 |
7 | import (
8 | "os"
9 | "testing"
10 | "time"
11 |
12 | "github.com/philippta/flyscrape"
13 | "github.com/stretchr/testify/require"
14 | )
15 |
16 | func TestWatch(t *testing.T) {
17 | f := tmpfile(t)
18 | defer os.Remove(f.Name())
19 | write(f, "test 1")
20 |
21 | calls := 0
22 | done := make(chan struct{})
23 |
24 | go func() {
25 | err := flyscrape.Watch(f.Name(), func(s string) error {
26 | calls++
27 | if calls == 1 {
28 | require.Equal(t, "test 1", s)
29 | return nil
30 | }
31 | if calls == 2 {
32 | require.Equal(t, "test 2", s)
33 | return flyscrape.StopWatch
34 | }
35 | return nil
36 | })
37 | require.NoError(t, err)
38 | close(done)
39 | }()
40 |
41 | write(f, "test 2")
42 | <-done
43 | }
44 |
45 | func tmpfile(t *testing.T) *os.File {
46 | f, err := os.CreateTemp("", "scrape.js")
47 | require.NoError(t, err)
48 | return f
49 | }
50 |
51 | func write(f *os.File, s string) {
52 | time.Sleep(10 * time.Millisecond)
53 | f.Seek(0, 0)
54 | f.Truncate(0)
55 | f.WriteString(s)
56 | f.Sync()
57 | }
58 |
--------------------------------------------------------------------------------