├── .github
│   ├── assets
│   │   ├── flyscrape-demo.jpg
│   │   ├── logo-alt.png
│   │   └── logo.png
│   └── workflows
│       ├── release.yaml
│       └── test.yaml
├── .gitignore
├── .goreleaser.yaml
├── LICENSE
├── README.md
├── cmd
│   ├── args.go
│   ├── args_test.go
│   ├── dev.go
│   ├── flyscrape
│   │   └── main.go
│   ├── main.go
│   ├── new.go
│   ├── run.go
│   └── version.go
├── examples
│   ├── browser.js
│   ├── coinmarketcap.js
│   ├── cookies.js
│   ├── custom_headers.js
│   ├── download.js
│   ├── hackernews.js
│   ├── hackernews_manual_follow.js
│   ├── hackernews_with_comments.js
│   ├── multiple_starting_urls.js
│   ├── reddit.js
│   ├── urls.txt
│   ├── urls_from_file.js
│   └── useragents
│       ├── chrome.js
│       ├── edge.js
│       ├── firefox.js
│       └── opera.js
├── flyscrape.go
├── go.mod
├── go.sum
├── install.sh
├── js.go
├── js_lib.go
├── js_lib_test.go
├── js_test.go
├── module.go
├── modules
│   ├── browser
│   │   ├── browser.go
│   │   └── browser_test.go
│   ├── cache
│   │   ├── boltstore.go
│   │   ├── boltstore_test.go
│   │   └── cache.go
│   ├── cookies
│   │   └── cookies.go
│   ├── depth
│   │   ├── depth.go
│   │   └── depth_test.go
│   ├── domainfilter
│   │   ├── domainfilter.go
│   │   └── domainfilter_test.go
│   ├── followlinks
│   │   ├── followlinks.go
│   │   └── followlinks_test.go
│   ├── headers
│   │   ├── headers.go
│   │   ├── headers_test.go
│   │   ├── versions.go
│   │   ├── versions_chrome.txt
│   │   ├── versions_edge.txt
│   │   ├── versions_firefox.txt
│   │   ├── versions_linux.txt
│   │   ├── versions_macos.txt
│   │   ├── versions_opera.txt
│   │   └── versions_windows.txt
│   ├── hook
│   │   └── hook.go
│   ├── output
│   │   ├── json
│   │   │   └── json.go
│   │   └── ndjson
│   │       └── ndjson.go
│   ├── proxy
│   │   ├── proxy.go
│   │   └── proxy_test.go
│   ├── ratelimit
│   │   ├── ratelimit.go
│   │   └── ratelimit_test.go
│   ├── retry
│   │   ├── retry.go
│   │   └── retry_test.go
│   ├── starturl
│   │   ├── starturl.go
│   │   └── starturl_test.go
│   └── urlfilter
│       ├── urlfilter.go
│       └── urlfilter_test.go
├── scrape.go
├── template.js
├── utils.go
├── watch.go
└── watch_test.go
/.github/assets/flyscrape-demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philippta/flyscrape/f1084f3e8370d1d0ceb3a1e82517e7b1761be42a/.github/assets/flyscrape-demo.jpg -------------------------------------------------------------------------------- /.github/assets/logo-alt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philippta/flyscrape/f1084f3e8370d1d0ceb3a1e82517e7b1761be42a/.github/assets/logo-alt.png -------------------------------------------------------------------------------- /.github/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philippta/flyscrape/f1084f3e8370d1d0ceb3a1e82517e7b1761be42a/.github/assets/logo.png -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | release: 13 | name: Release 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Fetch Git tags 21 | run: git fetch --force --tags 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@v5.1.0 25 | with: 26 | go-version: '1.23.3' 27 | 28 | - name: Run GoReleaser 29 | uses: goreleaser/goreleaser-action@v6.1.0 30 | with: 31 | version: '~> v2' 32 | args: release --clean 33 | env: 34 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 35 |
-------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: ["master"] 6 | pull_request: 7 | branches: ["master"] 8 | 9 | jobs: 10 | test: 11 | name: Test 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | - name: Set up Go 17 | uses: actions/setup-go@v4 18 | with: 19 | go-version: "1.21.3" 20 | 21 | - name: Install dependencies 22 | run: go get . 23 | 24 | - name: Test 25 | run: go test -v ./... 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | dist/ 4 | examples/**/*.cache 5 | -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | before: 4 | hooks: 5 | - go mod tidy 6 | - go test ./... 7 | 8 | builds: 9 | - id: flyscrape 10 | main: ./cmd/flyscrape 11 | env: 12 | - CGO_ENABLED=0 13 | ldflags: 14 | - -s -w 15 | - -extldflags "-static" 16 | - -X github.com/philippta/flyscrape.Version={{.Tag}} 17 | flags: 18 | - -mod=readonly 19 | tags: 20 | - osusergo 21 | - netgo 22 | 23 | archives: 24 | - format: tar.gz 25 | format_overrides: 26 | - goos: windows 27 | format: zip 28 | 29 | changelog: 30 | sort: asc 31 | filters: 32 | exclude: 33 | - "^docs:" 34 | - "^test:" 35 | - "^chore:" 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. 
"Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 
128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. 
Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | [logo and badges] 16 | Flyscrape is a command-line web scraping tool designed for those without 17 | advanced programming skills, enabling precise extraction of website data. 18 | 19 | 20 | 21 | 22 | Installation · Documentation · Releases 23 |
24 | 25 | 26 | ## Demo 27 | 28 | 29 | 30 | 31 | 32 | 33 | ## Features 34 | 35 | - **Standalone:** Flyscrape comes as a single binary executable. 36 | - **jQuery-like:** Extract data from HTML pages with a familiar API. 37 | - **Scriptable:** Use JavaScript to write your data extraction logic. 38 | - **System Cookies:** Give Flyscrape access to your browsers cookie store. 39 | - **Browser Mode:** Render JavaScript heavy pages using a headless Browser. 40 | - **Nested Scraping:** Extract data from linked pages within a single scrape. 41 | 42 | ## Overview 43 | 44 | - [Example](#example) 45 | - [Installation](#installation) 46 | - [Recommended](#recommended) 47 | - [Homebrew](#homebrew) 48 | - [Pre-compiled binary](#pre-compiled-binary) 49 | - [Compile from source](#compile-from-source) 50 | - [Usage](#usage) 51 | - [Configuration](#configuration) 52 | - [Query API](#query-api) 53 | - [Flyscrape API](#flyscrape-api) 54 | - [Document Parsing](#document-parsing) 55 | - [File Downloads](#file-downloads) 56 | - [Issues and suggestions](#issues-and-suggestions) 57 | 58 | ## Example 59 | 60 | This example scrapes the first few pages form Hacker News, specifically the New, Show and Ask sections. 61 | 62 | ```javascript 63 | export const config = { 64 | urls: [ 65 | "https://news.ycombinator.com/new", 66 | "https://news.ycombinator.com/show", 67 | "https://news.ycombinator.com/ask", 68 | ], 69 | 70 | // Cache request for later. 71 | cache: "file", 72 | 73 | // Enable JavaScript rendering. 74 | browser: true, 75 | headless: false, 76 | 77 | // Follow pagination 5 times. 78 | depth: 5, 79 | follow: ["a.morelink[href]"], 80 | } 81 | 82 | export default function ({ doc, absoluteURL }) { 83 | const title = doc.find("title"); 84 | const posts = doc.find(".athing"); 85 | 86 | return { 87 | title: title.text(), 88 | posts: posts.map((post) => { 89 | const link = post.find(".titleline > a"); 90 | 91 | return { 92 | title: link.text(), 93 | url: link.attr("href"), 94 | }; 95 | }), 96 | } 97 | } 98 | ``` 99 | 100 | ```bash 101 | $ flyscrape run hackernews.js 102 | [ 103 | { 104 | "url": "https://news.ycombinator.com/new", 105 | "data": { 106 | "title": "New Links | Hacker News", 107 | "posts": [ 108 | { 109 | "title": "Show HN: flyscrape - An standalone and scriptable web scraper", 110 | "url": "https://flyscrape.com/" 111 | }, 112 | ... 113 | ] 114 | } 115 | } 116 | ] 117 | ``` 118 | 119 | Check out the [examples folder](examples) for more detailed examples. 120 | 121 | ## Installation 122 | 123 | ### Recommended 124 | 125 | The easiest way to install `flyscrape` is via its install script. 126 | 127 | ```bash 128 | curl -fsSL https://flyscrape.com/install | bash 129 | ``` 130 | 131 | ### Homebrew 132 | 133 | For macOS users `flyscrape` is also available via homebrew: 134 | 135 | ```bash 136 | brew install flyscrape 137 | ``` 138 | 139 | ### Pre-compiled binary 140 | 141 | `flyscrape` is available for MacOS, Linux and Windows as a downloadable binary from the [releases page](https://github.com/philippta/flyscrape/releases). 142 | 143 | ### Compile from source 144 | 145 | To compile flyscrape from source, follow these steps: 146 | 147 | 1. Install Go: Make sure you have Go installed on your system. If not, you can download it from [https://go.dev/](https://go.dev/). 148 | 149 | 2. 
Install flyscrape: Open a terminal and run the following command: 150 | 151 | ```bash 152 | go install github.com/philippta/flyscrape/cmd/flyscrape@latest 153 | ``` 154 | 155 | ## Usage 156 | 157 | ``` 158 | Usage: 159 | 160 | flyscrape run SCRIPT [config flags] 161 | 162 | Examples: 163 | 164 | # Run the script. 165 | $ flyscrape run example.js 166 | 167 | # Set the URL as argument. 168 | $ flyscrape run example.js --url "http://other.com" 169 | 170 | # Enable proxy support. 171 | $ flyscrape run example.js --proxies "http://someproxy:8043" 172 | 173 | # Follow paginated links. 174 | $ flyscrape run example.js --depth 5 --follow ".next-button > a" 175 | 176 | # Set the output format to ndjson. 177 | $ flyscrape run example.js --output.format ndjson 178 | 179 | # Write the output to a file. 180 | $ flyscrape run example.js --output.file results.json 181 | ``` 182 | 183 | ## Configuration 184 | 185 | Below is an example scraping script that showcases the capabilities of flyscrape. For a full documentation of all configuration options, visit the [documentation page](https://flyscrape.com/docs/getting-started/). 186 | 187 | ```javascript 188 | export const config = { 189 | // Specify the URL to start scraping from. 190 | url: "https://example.com/", 191 | 192 | // Specify the multiple URLs to start scraping from. (default = []) 193 | urls: [ 194 | "https://anothersite.com/", 195 | "https://yetanother.com/", 196 | ], 197 | 198 | // Enable rendering with headless browser. (default = false) 199 | browser: true, 200 | 201 | // Specify if browser should be headless or not. (default = true) 202 | headless: false, 203 | 204 | // Specify how deep links should be followed. (default = 0, no follow) 205 | depth: 5, 206 | 207 | // Specify the css selectors to follow. (default = ["a[href]"]) 208 | // Setting follow to [] disables automatic following. 209 | // Can later be used with manual following. 210 | follow: [".next > a", ".related a"], 211 | 212 | // Specify the allowed domains. ['*'] for all. (default = domain from url) 213 | allowedDomains: ["example.com", "anothersite.com"], 214 | 215 | // Specify the blocked domains. (default = none) 216 | blockedDomains: ["somesite.com"], 217 | 218 | // Specify the allowed URLs as regex. (default = all allowed) 219 | allowedURLs: ["/posts", "/articles/\d+"], 220 | 221 | // Specify the blocked URLs as regex. (default = none) 222 | blockedURLs: ["/admin"], 223 | 224 | // Specify the rate in requests per minute. (default = no rate limit) 225 | rate: 60, 226 | 227 | // Specify the number of concurrent requests. (default = no limit) 228 | concurrency: 1, 229 | 230 | // Specify a single HTTP(S) proxy URL. (default = no proxy) 231 | // Note: Not compatible with browser mode. 232 | proxy: "http://someproxy.com:8043", 233 | 234 | // Specify multiple HTTP(S) proxy URLs. (default = no proxy) 235 | // Note: Not compatible with browser mode. 236 | proxies: [ 237 | "http://someproxy.com:8043", 238 | "http://someotherproxy.com:8043", 239 | ], 240 | 241 | // Enable file-based request caching. (default = no cache) 242 | cache: "file", 243 | 244 | // Specify the HTTP request header. (default = none) 245 | headers: { 246 | "Authorization": "Bearer ...", 247 | "User-Agent": "Mozilla ...", 248 | }, 249 | 250 | // Use the cookie store of your local browser. (default = off) 251 | // Options: "chrome" | "edge" | "firefox" 252 | cookies: "chrome", 253 | 254 | // Specify the output options. 255 | output: { 256 | // Specify the output file. 
(default = stdout) 257 | file: "results.json", 258 | 259 | // Specify the output format. (default = json) 260 | // Options: "json" | "ndjson" 261 | format: "json", 262 | }, 263 | }; 264 | 265 | export default function ({ doc, url, absoluteURL, scrape, follow }) { 266 | // doc 267 | // Contains the parsed HTML document. 268 | 269 | // url 270 | // Contains the scraped URL. 271 | 272 | // absoluteURL("/foo") 273 | // Transforms a relative URL into absolute URL. 274 | 275 | // scrape(url, function({ doc, url, absoluteURL, scrape }) { 276 | // return { ... }; 277 | // }) 278 | // Scrapes a linked page and returns the scrape result. 279 | 280 | // follow("/foo") 281 | // Follows a link manually. 282 | // Disable automatic following with `follow: []` for best results. 283 | } 284 | ``` 285 | 286 | ## Query API 287 | 288 | ```javascript 289 | //
<div class="element" foo="bar">Hey</div> 290 | const el = doc.find(".element") 291 | el.text() // "Hey" 292 | el.html() // `<div class="element" foo="bar">Hey</div>` 293 | el.name() // div 294 | el.attr("foo") // "bar" 295 | el.hasAttr("foo") // true 296 | el.hasClass("element") // true 297 | 298 | // <ul> 299 | //   <li class="a">Item 1</li> 300 | //   <li>Item 2</li> 301 | //   <li>Item 3</li> 302 | // </ul> 303 | const list = doc.find("ul") 304 | list.children() // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>] 305 | 306 | const items = list.find("li") 307 | items.length() // 3 308 | items.first() // <li class="a">Item 1</li> 309 | items.last() // <li>Item 3</li> 310 | items.get(1) // <li>Item 2</li> 311 | items.get(1).prev() // <li class="a">Item 1</li> 312 | items.get(1).next() // <li>Item 3</li> 313 | items.get(1).parent() // <ul>...</ul> 314 | items.get(1).siblings() // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>] 315 | items.map(item => item.text()) // ["Item 1", "Item 2", "Item 3"] 316 | items.filter(item => item.hasClass("a")) // [<li class="a">Item 1</li>] 317 | 318 | // <div> 319 | //   <h2>Aleph</h2> 320 | //   <p>Aleph</p> 321 | //   <h2>Beta</h2> 322 | //   <p>Beta</p> 323 | //   <h2>Gamma</h2> 324 | //   <p>Gamma</p> 325 | // </div> 326 | const header = doc.find("div h2") 327 | 328 | header.get(1).prev() // <p>Aleph</p> 329 | header.get(1).prevAll() // [<p>Aleph</p>, <h2>Aleph</h2>] 330 | header.get(1).prevUntil('div,h1,h2,h3') // <p>Aleph</p> 331 | header.get(1).next() // <p>Beta</p> 332 | header.get(1).nextAll() // [<p>Beta</p>, <h2>Gamma</h2>, <p>Gamma</p>] 333 | header.get(1).nextUntil('div,h1,h2,h3') // <p>Beta</p>
    334 | ``` 335 | 336 | ## Flyscrape API 337 | 338 | ### Document Parsing 339 | 340 | ```javascript 341 | import { parse } from "flyscrape"; 342 | 343 | const doc = parse(`
<div class="foo">bar</div>
    `); 344 | const text = doc.find(".foo").text(); 345 | ``` 346 | 347 | ### File Downloads 348 | 349 | ```javascript 350 | import { download } from "flyscrape/http"; 351 | 352 | download("http://example.com/image.jpg") // downloads as "image.jpg" 353 | download("http://example.com/image.jpg", "other.jpg") // downloads as "other.jpg" 354 | download("http://example.com/image.jpg", "dir/") // downloads as "dir/image.jpg" 355 | 356 | // If the server offers a filename via the Content-Disposition header and no 357 | // destination filename is provided, Flyscrape will honor the suggested filename. 358 | // E.g. `Content-Disposition: attachment; filename="archive.zip"` 359 | download("http://example.com/generate_archive.php", "dir/") // downloads as "dir/archive.zip" 360 | ``` 361 | 362 | ## Issues and Suggestions 363 | 364 | If you encounter any issues or have suggestions for improvement, please [submit an issue](https://github.com/philippta/flyscrape/issues). 365 | -------------------------------------------------------------------------------- /cmd/args.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cmd 6 | 7 | import ( 8 | "fmt" 9 | "slices" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | var arrayFields = []string{ 15 | "urls", 16 | "follow", 17 | "allowedDomains", 18 | "blockedDomains", 19 | "allowedURLs", 20 | "blockedURLs", 21 | "proxies", 22 | } 23 | 24 | func parseConfigArgs(args []string) (map[string]any, error) { 25 | updates := map[string]any{} 26 | 27 | flag := "" 28 | for _, arg := range normalizeArgs(args) { 29 | if flag == "" && !isFlag(arg) { 30 | return nil, fmt.Errorf("expected flag, got %q instead", arg) 31 | } 32 | 33 | if flag != "" && isFlag(arg) { 34 | updates[flag[2:]] = true 35 | flag = "" 36 | continue 37 | } 38 | 39 | if flag != "" { 40 | if v, ok := updates[flag[2:]]; ok { 41 | if vv, ok := v.([]any); ok { 42 | updates[flag[2:]] = append(vv, parseArg(arg)) 43 | } else { 44 | updates[flag[2:]] = []any{v, parseArg(arg)} 45 | } 46 | } else { 47 | if slices.Contains(arrayFields, flag[2:]) { 48 | updates[flag[2:]] = []any{parseArg(arg)} 49 | } else { 50 | updates[flag[2:]] = parseArg(arg) 51 | } 52 | } 53 | flag = "" 54 | continue 55 | } 56 | 57 | flag = arg 58 | } 59 | 60 | if flag != "" { 61 | updates[flag[2:]] = true 62 | flag = "" 63 | } 64 | 65 | return updates, nil 66 | } 67 | 68 | func normalizeArgs(args []string) []string { 69 | var norm []string 70 | 71 | for _, arg := range args { 72 | if !strings.HasPrefix(arg, "--") { 73 | norm = append(norm, arg) 74 | } else { 75 | norm = append(norm, strings.SplitN(arg, "=", 2)...) 
76 | } 77 | } 78 | 79 | return norm 80 | } 81 | 82 | func parseArg(arg string) any { 83 | if arg == "true" { 84 | return true 85 | } 86 | if arg == "false" { 87 | return false 88 | } 89 | if num, err := strconv.Atoi(arg); err == nil { 90 | return num 91 | } 92 | if num, err := strconv.ParseFloat(arg, 64); err == nil { 93 | return num 94 | } 95 | return arg 96 | } 97 | 98 | func isFlag(arg string) bool { 99 | return strings.HasPrefix(arg, "--") 100 | } 101 | -------------------------------------------------------------------------------- /cmd/args_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cmd 6 | 7 | import ( 8 | "strings" 9 | "testing" 10 | 11 | "github.com/stretchr/testify/require" 12 | ) 13 | 14 | func TestParseConfigUpdates(t *testing.T) { 15 | tests := []struct { 16 | flags string 17 | err bool 18 | updates map[string]any 19 | }{ 20 | { 21 | flags: `--foo bar`, 22 | updates: map[string]any{"foo": "bar"}, 23 | }, 24 | { 25 | flags: `--foo=bar`, 26 | updates: map[string]any{"foo": "bar"}, 27 | }, 28 | { 29 | flags: `--foo`, 30 | updates: map[string]any{"foo": true}, 31 | }, 32 | { 33 | flags: `--foo false`, 34 | updates: map[string]any{"foo": false}, 35 | }, 36 | { 37 | flags: `--foo a --foo b`, 38 | updates: map[string]any{"foo": []any{"a", "b"}}, 39 | }, 40 | { 41 | flags: `--foo a --foo=b`, 42 | updates: map[string]any{"foo": []any{"a", "b"}}, 43 | }, 44 | { 45 | flags: `--foo 69`, 46 | updates: map[string]any{"foo": 69}, 47 | }, 48 | { 49 | flags: `--foo.bar a`, 50 | updates: map[string]any{"foo.bar": "a"}, 51 | }, 52 | { 53 | flags: `foo`, 54 | err: true, 55 | }, 56 | { 57 | flags: `--foo a b`, 58 | err: true, 59 | }, 60 | } 61 | for _, test := range tests { 62 | t.Run(test.flags, func(t *testing.T) { 63 | args, err := parseConfigArgs(strings.Fields(test.flags)) 64 | 65 | if test.err { 66 | require.Error(t, err) 67 | require.Empty(t, args) 68 | return 69 | } 70 | 71 | require.NoError(t, err) 72 | require.Equal(t, test.updates, args) 73 | }) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /cmd/dev.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cmd 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | 11 | "github.com/philippta/flyscrape" 12 | ) 13 | 14 | type DevCommand struct{} 15 | 16 | func (c *DevCommand) Run(args []string) error { 17 | fs := flag.NewFlagSet("flyscrape-dev", flag.ContinueOnError) 18 | fs.Usage = c.Usage 19 | 20 | if err := fs.Parse(args); err != nil { 21 | return err 22 | } else if fs.NArg() == 0 || fs.Arg(0) == "" { 23 | c.Usage() 24 | return flag.ErrHelp 25 | } 26 | 27 | cfg, err := parseConfigArgs(fs.Args()[1:]) 28 | if err != nil { 29 | return fmt.Errorf("error parsing config flags: %w", err) 30 | } 31 | 32 | return flyscrape.Dev(fs.Arg(0), cfg) 33 | } 34 | 35 | func (c *DevCommand) Usage() { 36 | fmt.Println(` 37 | The dev command watches the scraping script and re-runs it on any change. 38 | Recursive scraping is disabled in this mode, only the initial URL will be scraped. 
39 | 40 | Usage: 41 | 42 | flyscrape dev SCRIPT [config flags] 43 | 44 | Examples: 45 | 46 | # Run and watch script. 47 | $ flyscrape dev example.js 48 | 49 | # Set the URL as argument. 50 | $ flyscrape dev example.js --url "http://other.com" 51 | 52 | # Enable proxy support. 53 | $ flyscrape dev example.js --proxies "http://someproxy:8043" 54 | `[1:]) 55 | } 56 | -------------------------------------------------------------------------------- /cmd/flyscrape/main.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package main 6 | 7 | import ( 8 | _ "embed" 9 | "flag" 10 | "log" 11 | "os" 12 | 13 | "github.com/philippta/flyscrape/cmd" 14 | _ "github.com/philippta/flyscrape/modules/browser" 15 | _ "github.com/philippta/flyscrape/modules/cache" 16 | _ "github.com/philippta/flyscrape/modules/cookies" 17 | _ "github.com/philippta/flyscrape/modules/depth" 18 | _ "github.com/philippta/flyscrape/modules/domainfilter" 19 | _ "github.com/philippta/flyscrape/modules/followlinks" 20 | _ "github.com/philippta/flyscrape/modules/headers" 21 | _ "github.com/philippta/flyscrape/modules/output/json" 22 | _ "github.com/philippta/flyscrape/modules/output/ndjson" 23 | _ "github.com/philippta/flyscrape/modules/proxy" 24 | _ "github.com/philippta/flyscrape/modules/ratelimit" 25 | _ "github.com/philippta/flyscrape/modules/retry" 26 | _ "github.com/philippta/flyscrape/modules/starturl" 27 | _ "github.com/philippta/flyscrape/modules/urlfilter" 28 | ) 29 | 30 | func main() { 31 | log.SetFlags(0) 32 | 33 | if err := (&cmd.Main{}).Run(os.Args[1:]); err != nil { 34 | if err != flag.ErrHelp { 35 | log.Println(err) 36 | } 37 | os.Exit(1) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cmd 6 | 7 | import ( 8 | _ "embed" 9 | "flag" 10 | "fmt" 11 | "log" 12 | "os" 13 | "strings" 14 | ) 15 | 16 | func main() { 17 | log.SetFlags(0) 18 | 19 | m := &Main{} 20 | if err := m.Run(os.Args[1:]); err == flag.ErrHelp { 21 | os.Exit(1) 22 | } else if err != nil { 23 | log.Println(err) 24 | os.Exit(1) 25 | } 26 | } 27 | 28 | type Main struct{} 29 | 30 | func (m *Main) Run(args []string) error { 31 | var cmd string 32 | if len(args) > 0 { 33 | cmd, args = args[0], args[1:] 34 | } 35 | 36 | switch cmd { 37 | case "new": 38 | return (&NewCommand{}).Run(args) 39 | case "run": 40 | return (&RunCommand{}).Run(args) 41 | case "dev": 42 | return (&DevCommand{}).Run(args) 43 | case "version": 44 | return (&VersionCommand{}).Run(args) 45 | default: 46 | if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { 47 | m.Usage() 48 | return flag.ErrHelp 49 | } 50 | return fmt.Errorf("flyscrape %s: unknown command", cmd) 51 | } 52 | } 53 | 54 | func (m *Main) Usage() { 55 | fmt.Println(` 56 | flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. 
57 | 58 | Usage: 59 | 60 | flyscrape [arguments] 61 | 62 | Commands: 63 | 64 | new creates a sample scraping script 65 | run runs a scraping script 66 | dev watches and re-runs a scraping script 67 | version prints the version 68 | `[1:]) 69 | } 70 | -------------------------------------------------------------------------------- /cmd/new.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cmd 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "os" 11 | 12 | "github.com/philippta/flyscrape" 13 | ) 14 | 15 | type NewCommand struct{} 16 | 17 | func (c *NewCommand) Run(args []string) error { 18 | fs := flag.NewFlagSet("flyscrape-new", flag.ContinueOnError) 19 | fs.Usage = c.Usage 20 | 21 | if err := fs.Parse(args); err != nil { 22 | return err 23 | } else if fs.NArg() == 0 || fs.Arg(0) == "" { 24 | c.Usage() 25 | return flag.ErrHelp 26 | } else if fs.NArg() > 1 { 27 | return fmt.Errorf("too many arguments") 28 | } 29 | 30 | script := fs.Arg(0) 31 | if _, err := os.Stat(script); err == nil { 32 | return fmt.Errorf("script already exists") 33 | } 34 | 35 | if err := os.WriteFile(script, flyscrape.ScriptTemplate, 0o644); err != nil { 36 | return fmt.Errorf("failed to create script %q: %w", script, err) 37 | } 38 | 39 | fmt.Printf("Scraping script %v created.\n", script) 40 | return nil 41 | } 42 | 43 | func (c *NewCommand) Usage() { 44 | fmt.Println(` 45 | The new command creates a new scraping script. 46 | 47 | Usage: 48 | 49 | flyscrape new SCRIPT 50 | 51 | Examples: 52 | 53 | # Create a new scraping script. 54 | $ flyscrape new example.js 55 | `[1:]) 56 | } 57 | -------------------------------------------------------------------------------- /cmd/run.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cmd 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | 11 | "github.com/philippta/flyscrape" 12 | ) 13 | 14 | type RunCommand struct{} 15 | 16 | func (c *RunCommand) Run(args []string) error { 17 | fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError) 18 | fs.Usage = c.Usage 19 | 20 | if err := fs.Parse(args); err != nil { 21 | return err 22 | } else if fs.NArg() == 0 || fs.Arg(0) == "" { 23 | c.Usage() 24 | return flag.ErrHelp 25 | } 26 | 27 | cfg, err := parseConfigArgs(fs.Args()[1:]) 28 | if err != nil { 29 | return fmt.Errorf("error parsing config flags: %w", err) 30 | } 31 | 32 | return flyscrape.Run(fs.Arg(0), cfg) 33 | } 34 | 35 | func (c *RunCommand) Usage() { 36 | fmt.Println(` 37 | The run command runs the scraping script. 38 | 39 | Usage: 40 | 41 | flyscrape run SCRIPT [config flags] 42 | 43 | Examples: 44 | 45 | # Run the script. 46 | $ flyscrape run example.js 47 | 48 | # Set the URL as argument. 49 | $ flyscrape run example.js --url "http://other.com" 50 | 51 | # Enable proxy support. 52 | $ flyscrape run example.js --proxies "http://someproxy:8043" 53 | 54 | # Follow paginated links. 55 | $ flyscrape run example.js --depth 5 --follow ".next-button > a" 56 | 57 | # Set the output format to ndjson. 
58 | $ flyscrape run example.js --output.format ndjson 59 | 60 | # Write the output to a file. 61 | $ flyscrape run example.js --output.file results.json 62 | `[1:]) 63 | } 64 | -------------------------------------------------------------------------------- /cmd/version.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cmd 6 | 7 | import ( 8 | "fmt" 9 | "runtime/debug" 10 | 11 | "github.com/philippta/flyscrape" 12 | ) 13 | 14 | type VersionCommand struct{} 15 | 16 | func (c *VersionCommand) Run(args []string) error { 17 | info, ok := debug.ReadBuildInfo() 18 | if !ok { 19 | return fmt.Errorf("no build info found") 20 | } 21 | 22 | var os, arch, version string 23 | for _, setting := range info.Settings { 24 | switch setting.Key { 25 | case "GOARCH": 26 | arch = setting.Value 27 | case "GOOS": 28 | os = setting.Value 29 | case "vcs.revision": 30 | version = "v0.0.0-" + setting.Value 31 | } 32 | } 33 | 34 | if flyscrape.Version != "" { 35 | version = flyscrape.Version 36 | } 37 | 38 | fmt.Printf("flyscrape %s %s/%s\n", version, os, arch) 39 | return nil 40 | } 41 | -------------------------------------------------------------------------------- /examples/browser.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://www.airbnb.com/", 3 | browser: true, 4 | // headless: false, 5 | }; 6 | 7 | export default function ({ doc, absoluteURL }) { 8 | const rooms = doc.find("[itemprop=itemListElement]"); 9 | 10 | return { 11 | listings: rooms.map(room => { 12 | const link = "https://" + room.find("meta[itemprop=url]").attr("content"); 13 | const image = room.find("img").attr("src"); 14 | const desc = new Set(room.find("[role=group] > div > div > div").map(d => d.text()).filter(Boolean)); 15 | 16 | return { link, image, desc } 17 | }), 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/coinmarketcap.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://coinmarketcap.com/", 3 | }; 4 | 5 | export default function({ doc }) { 6 | const rows = doc.find(".cmc-table tbody tr"); 7 | 8 | return { 9 | currencies: rows 10 | .map((row) => { 11 | const cols = row.find("td"); 12 | 13 | return { 14 | position: cols.get(1).text(), 15 | currency: cols.get(2).find("p").get(0).text(), 16 | symbol: cols.get(2).find("p").get(1).text(), 17 | price: cols.get(3).text(), 18 | change: { 19 | "1h": cols.get(4).text(), 20 | "24h": cols.get(5).text(), 21 | "7dh": cols.get(6).text(), 22 | }, 23 | marketcap: cols.get(7).find("span").get(1).text(), 24 | volume: cols.get(8).find("p").get(0).text(), 25 | supply: cols.get(9).text(), 26 | }; 27 | }) 28 | .slice(0, 10), 29 | }; 30 | } 31 | -------------------------------------------------------------------------------- /examples/cookies.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://news.ycombinator.com/", 3 | 4 | // This will use cookies from your Chrome browser. 
5 | // Options: "chrome" | "firefox" | "edge" 6 | cookies: "chrome", 7 | }; 8 | 9 | export default function({ doc }) { 10 | return { 11 | user: doc.find("#me").text(), 12 | karma: doc.find("#karma").text(), 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /examples/custom_headers.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://news.ycombinator.com/", 3 | headers: { 4 | "Authorization": "Basic ZGVtbzpwQDU1dzByZA==", 5 | "User-Agent": "Gecko/1.0", 6 | } 7 | }; 8 | 9 | export default function({ doc, absoluteURL }) { 10 | const posts = doc.find(".athing"); 11 | 12 | return { 13 | posts: posts.map((post) => { 14 | const link = post.find(".titleline > a"); 15 | const meta = post.next(); 16 | 17 | return { 18 | url: absoluteURL(link.attr("href")), 19 | user: meta.find(".hnuser").text(), 20 | title: link.text(), 21 | points: meta.find(".score").text().replace(" points", ""), 22 | created: meta.find(".age").attr("title"), 23 | }; 24 | }), 25 | }; 26 | } 27 | -------------------------------------------------------------------------------- /examples/download.js: -------------------------------------------------------------------------------- 1 | import { download } from "flyscrape/http"; 2 | 3 | export const config = { 4 | url: "https://commons.wikimedia.org/wiki/London", 5 | }; 6 | 7 | export default function ({ doc }) { 8 | const symbols = doc.find("#mw-content-text .mw-gallery-traditional:first-of-type li"); 9 | 10 | return { 11 | symbols: symbols.map(symbol => { 12 | const name = symbol.text().trim(); 13 | const url = symbol.find("img").attr("src"); 14 | const file = `symbols/${basename(url)}`; 15 | 16 | download(url, file); 17 | 18 | return { name, url, file }; 19 | }) 20 | }; 21 | } 22 | 23 | function basename(path) { 24 | return path.split("/").slice(-1)[0]; 25 | } 26 | -------------------------------------------------------------------------------- /examples/hackernews.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://news.ycombinator.com/", 3 | depth: 9, 4 | cache: "file", 5 | follow: ["a.morelink[href]"], 6 | }; 7 | 8 | export default function({ doc, absoluteURL }) { 9 | const posts = doc.find(".athing"); 10 | 11 | return { 12 | posts: posts.map((post) => { 13 | const link = post.find(".titleline > a"); 14 | const meta = post.next(); 15 | 16 | return { 17 | url: absoluteURL(link.attr("href")), 18 | user: meta.find(".hnuser").text(), 19 | title: link.text(), 20 | points: meta.find(".score").text().replace(" points", ""), 21 | created: meta.find(".age").attr("title"), 22 | }; 23 | }), 24 | }; 25 | } 26 | -------------------------------------------------------------------------------- /examples/hackernews_manual_follow.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://news.ycombinator.com/", 3 | depth: 2, 4 | follow: [], 5 | }; 6 | 7 | export default function({ url, doc, follow }) { 8 | const next = doc.find(".morelink").attr("href"); 9 | 10 | follow(next); 11 | 12 | return { url, next }; 13 | } 14 | -------------------------------------------------------------------------------- /examples/hackernews_with_comments.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://news.ycombinator.com/", 3 | }; 4 | 5 | export default 
function({ doc, scrape }) { 6 | const post = doc.find(".athing.submission").first(); 7 | const title = post.find(".titleline > a").text(); 8 | const commentsLink = post.next().find("a").last().attr("href"); 9 | 10 | const comments = scrape(commentsLink, function({ doc }) { 11 | return doc.find(".comtr").map(comment => { 12 | return { 13 | author: comment.find(".hnuser").text(), 14 | text: comment.find(".commtext").text(), 15 | }; 16 | }); 17 | }); 18 | 19 | return { 20 | title, 21 | comments, 22 | }; 23 | } 24 | -------------------------------------------------------------------------------- /examples/multiple_starting_urls.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | urls: [ 3 | "https://news.ycombinator.com/show", 4 | "https://news.ycombinator.com/ask", 5 | ], 6 | }; 7 | 8 | export default function({ doc, absoluteURL }) { 9 | const posts = doc.find(".athing"); 10 | 11 | return { 12 | posts: posts.map((post) => { 13 | const link = post.find(".titleline > a"); 14 | const meta = post.next(); 15 | 16 | return { 17 | url: absoluteURL(link.attr("href")), 18 | user: meta.find(".hnuser").text(), 19 | title: link.text(), 20 | points: meta.find(".score").text().replace(" points", ""), 21 | created: meta.find(".age").attr("title"), 22 | }; 23 | }), 24 | }; 25 | } 26 | -------------------------------------------------------------------------------- /examples/reddit.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | url: "https://old.reddit.com/", 3 | }; 4 | 5 | export default function({ doc, absoluteURL }) { 6 | const posts = doc.find("#siteTable .thing:not(.promoted)"); 7 | 8 | return { 9 | posts: posts.map((post) => { 10 | const rank = post.find(".rank"); 11 | const user = post.find(".author"); 12 | const created = post.find("time"); 13 | const title = post.find("a.title"); 14 | const comments = post.find(".comments"); 15 | const subreddit = post.find(".subreddit"); 16 | const upvotes = post.find(".score.unvoted"); 17 | const thumbnail = post.find("a.thumbnail img"); 18 | 19 | return { 20 | rank: rank.text(), 21 | user: user.text(), 22 | created: created.attr("datetime"), 23 | title: title.text(), 24 | link: absoluteURL(title.attr("href")), 25 | comments: comments.text().replace(" comments", ""), 26 | comments_link: comments.attr("href"), 27 | subreddit: subreddit.text(), 28 | upvotes: upvotes.text(), 29 | thumbnail: absoluteURL(thumbnail.attr("src")), 30 | }; 31 | }), 32 | }; 33 | } 34 | -------------------------------------------------------------------------------- /examples/urls.txt: -------------------------------------------------------------------------------- 1 | https://news.ycombinator.com/newest 2 | https://news.ycombinator.com/ask 3 | https://news.ycombinator.com/show 4 | -------------------------------------------------------------------------------- /examples/urls_from_file.js: -------------------------------------------------------------------------------- 1 | import urls from "./urls.txt" 2 | 3 | export const config = { 4 | urls: urls.split("\n") 5 | }; 6 | 7 | export default function({ doc }) { 8 | return { 9 | title: doc.find("title").text().trim(), 10 | }; 11 | } 12 | -------------------------------------------------------------------------------- /examples/useragents/chrome.js: -------------------------------------------------------------------------------- 1 | import { parse } from "flyscrape"; 2 | 3 | export const config = { 4 | url: 
"https://chromereleases.googleblog.com/search/label/Stable%20updates", 5 | follow: [".blog-pager-older-link"], 6 | depth: 30, 7 | cache: "file", 8 | }; 9 | 10 | export default function ({ doc, absoluteURL }) { 11 | const posts = doc.find(".post"); 12 | return posts.map(post => { 13 | const title = post.find("h2").text().trim(); 14 | const body = parse(post.find(".post-content").text()).find("p:nth-child(1)").text().trim(); 15 | 16 | const regexes = [ 17 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Mac)/, 18 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Windows)/, 19 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Linux)/, 20 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (iOS)/, 21 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Android)/, 22 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (ChromeOS)/, 23 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Mac,Linux)/, 24 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)? for (Mac and Linux)/, 25 | /(\d+\.\d+\.\d+\.\d+(\/\.\d+)?)\)?\s\(Platform version:\s[\d\.]+\)\sfor\smost\s(ChromeOS)/, 26 | ]; 27 | 28 | const versions = new Set(); 29 | for (const regex of regexes) { 30 | const matches = body.match(regex); 31 | if (!matches) { 32 | continue; 33 | } 34 | 35 | let versionStr = matches[1]; 36 | 37 | let vv = versionStr.split("/"); 38 | if (vv.length == 2) { 39 | vv[1] = vv[0].substring(0, vv[0].lastIndexOf(".")) + vv[1]; 40 | } 41 | 42 | for (const version of vv) { 43 | versions.add(version) 44 | } 45 | } 46 | 47 | 48 | return versions 49 | }).filter(Boolean).flat(); 50 | } 51 | -------------------------------------------------------------------------------- /examples/useragents/edge.js: -------------------------------------------------------------------------------- 1 | import { parse } from "flyscrape"; 2 | 3 | export const config = { 4 | url: "https://learn.microsoft.com/en-us/deployedge/microsoft-edge-release-schedule", 5 | }; 6 | 7 | export default function ({ doc, absoluteURL }) { 8 | const links = doc.find("table a"); 9 | return links 10 | .map(link => link.text()) 11 | .filter(Boolean) 12 | } 13 | -------------------------------------------------------------------------------- /examples/useragents/firefox.js: -------------------------------------------------------------------------------- 1 | import { parse } from "flyscrape"; 2 | 3 | export const config = { 4 | url: "https://www.mozilla.org/en-US/firefox/releases/", 5 | }; 6 | 7 | export default function ({ doc, absoluteURL }) { 8 | const links = doc.find(".c-release-list a"); 9 | return links 10 | .map(link => link.text()) 11 | .filter(Boolean) 12 | .filter(version => parseFloat(version) >= 60); 13 | } 14 | -------------------------------------------------------------------------------- /examples/useragents/opera.js: -------------------------------------------------------------------------------- 1 | 2 | export const config = { 3 | urls: range("https://blogs.opera.com/desktop/changelog-for-{}/", 60, 110), 4 | }; 5 | 6 | export default function ({ doc, absoluteURL }) { 7 | const versions = doc.find(".content h4"); 8 | return versions.map(versions => { 9 | return versions.text().split(" ")[0].trim(); 10 | }).filter(Boolean); 11 | } 12 | 13 | function range(url, from, to) { 14 | return Array.from({length: to - from + 1}).map((_, i) => url.replace("{}", i + from)); 15 | } 16 | -------------------------------------------------------------------------------- /flyscrape.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, 
v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package flyscrape 6 | 7 | import ( 8 | "fmt" 9 | "log" 10 | "net/http" 11 | "os" 12 | "os/signal" 13 | "path/filepath" 14 | "syscall" 15 | 16 | "github.com/inancgumus/screen" 17 | "github.com/tidwall/sjson" 18 | ) 19 | 20 | var Version string 21 | 22 | func Run(file string, overrides map[string]any) error { 23 | src, err := os.ReadFile(file) 24 | if err != nil { 25 | return fmt.Errorf("failed to read script %q: %w", file, err) 26 | } 27 | 28 | client := &http.Client{} 29 | 30 | imports, wait := NewJSLibrary(client) 31 | defer wait() 32 | 33 | pop, err := pushDir(file) 34 | if err != nil { 35 | return err 36 | } 37 | 38 | exports, err := Compile(string(src), imports) 39 | if err != nil { 40 | return fmt.Errorf("failed to compile script: %w", err) 41 | } 42 | 43 | if err := pop(); err != nil { 44 | return err 45 | } 46 | 47 | cfg := exports.Config() 48 | cfg = updateCfgMultiple(cfg, overrides) 49 | 50 | scraper := NewScraper() 51 | scraper.ScrapeFunc = exports.Scrape 52 | scraper.Script = file 53 | scraper.Client = client 54 | scraper.Modules = LoadModules(cfg) 55 | 56 | scraper.Run() 57 | return nil 58 | } 59 | 60 | func Dev(file string, overrides map[string]any) error { 61 | cachefile, err := newCacheFile() 62 | if err != nil { 63 | return fmt.Errorf("failed to create cache file: %w", err) 64 | } 65 | 66 | trapsignal(func() { 67 | os.RemoveAll(cachefile) 68 | }) 69 | 70 | fn := func(s string) error { 71 | client := &http.Client{} 72 | 73 | imports, wait := NewJSLibrary(client) 74 | defer wait() 75 | 76 | pop, err := pushDir(file) 77 | if err != nil { 78 | return err 79 | } 80 | 81 | exports, err := Compile(s, imports) 82 | if err != nil { 83 | printCompileErr(file, err) 84 | return nil 85 | } 86 | 87 | if err := pop(); err != nil { 88 | return err 89 | } 90 | 91 | cfg := exports.Config() 92 | cfg = updateCfgMultiple(cfg, overrides) 93 | cfg = updateCfg(cfg, "depth", 0) 94 | cfg = updateCfg(cfg, "cache", "file:"+cachefile) 95 | 96 | scraper := NewScraper() 97 | scraper.ScrapeFunc = exports.Scrape 98 | scraper.Script = file 99 | scraper.Client = client 100 | scraper.Modules = LoadModules(cfg) 101 | 102 | screen.Clear() 103 | screen.MoveTopLeft() 104 | scraper.Run() 105 | 106 | return nil 107 | } 108 | 109 | if err := Watch(file, fn); err != nil && err != StopWatch { 110 | return fmt.Errorf("failed to watch script %q: %w", file, err) 111 | } 112 | return nil 113 | } 114 | 115 | func printCompileErr(script string, err error) { 116 | screen.Clear() 117 | screen.MoveTopLeft() 118 | 119 | if errs, ok := err.(interface{ Unwrap() []error }); ok { 120 | for _, err := range errs.Unwrap() { 121 | log.Printf("%s:%v\n", script, err) 122 | } 123 | } else { 124 | log.Println(err) 125 | } 126 | } 127 | 128 | func updateCfg(cfg Config, key string, value any) Config { 129 | newcfg, err := sjson.Set(string(cfg), key, value) 130 | if err != nil { 131 | return cfg 132 | } 133 | return Config(newcfg) 134 | } 135 | 136 | func newCacheFile() (string, error) { 137 | cachedir, err := os.MkdirTemp("", "flyscrape-cache") 138 | if err != nil { 139 | return "", err 140 | } 141 | return filepath.Join(cachedir, "dev.cache"), nil 142 | } 143 | 144 | func trapsignal(f func()) { 145 | sig := make(chan os.Signal, 2) 146 | signal.Notify(sig, os.Interrupt, syscall.SIGTERM) 147 | 148 | go func() { 149 | <-sig 150 | f() 151 | os.Exit(0) 152 | }() 153 | } 154 | 155 | func updateCfgMultiple(cfg Config, 
updates map[string]any) Config { 156 | c := string(cfg) 157 | 158 | for k, v := range updates { 159 | nc, err := sjson.Set(c, k, v) 160 | if err != nil { 161 | continue 162 | } 163 | c = nc 164 | } 165 | 166 | return []byte(c) 167 | } 168 | 169 | func pushDir(file string) (func() error, error) { 170 | cwd, err := os.Getwd() 171 | if err != nil { 172 | return nil, fmt.Errorf("failed to get current working directory: %w", err) 173 | } 174 | if err := os.Chdir(filepath.Dir(file)); err != nil { 175 | return nil, fmt.Errorf("failed to change working directory: %w", err) 176 | } 177 | pop := func() error { 178 | if err := os.Chdir(cwd); err != nil { 179 | return fmt.Errorf("failed to change working directory: %w", err) 180 | } 181 | return nil 182 | } 183 | return pop, nil 184 | 185 | } 186 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/philippta/flyscrape 2 | 3 | go 1.23 4 | 5 | toolchain go1.23.3 6 | 7 | require ( 8 | github.com/PuerkitoBio/goquery v1.8.1 9 | github.com/browserutils/kooky v0.2.2 10 | github.com/cornelk/hashmap v1.0.8 11 | github.com/dop251/goja v0.0.0-20230919151941-fc55792775de 12 | github.com/dop251/goja_nodejs v0.0.0-20230914102007-198ba9a8b098 13 | github.com/evanw/esbuild v0.18.14 14 | github.com/fsnotify/fsnotify v1.6.0 15 | github.com/go-rod/rod v0.114.7 16 | github.com/inancgumus/screen v0.0.0-20190314163918-06e984b86ed3 17 | github.com/nlnwa/whatwg-url v0.4.0 18 | github.com/stretchr/testify v1.8.4 19 | github.com/tidwall/sjson v1.2.5 20 | go.etcd.io/bbolt v1.3.11 21 | golang.org/x/sync v0.9.0 22 | ) 23 | 24 | require ( 25 | github.com/Velocidex/json v0.0.0-20220224052537-92f3c0326e5a // indirect 26 | github.com/Velocidex/ordereddict v0.0.0-20230909174157-2aa49cc5d11d // indirect 27 | github.com/Velocidex/yaml/v2 v2.2.8 // indirect 28 | github.com/andybalholm/cascadia v1.3.1 // indirect 29 | github.com/bits-and-blooms/bitset v1.5.0 // indirect 30 | github.com/davecgh/go-spew v1.1.1 // indirect 31 | github.com/dlclark/regexp2 v1.7.0 // indirect 32 | github.com/go-ini/ini v1.67.0 // indirect 33 | github.com/go-sourcemap/sourcemap v2.1.3+incompatible // indirect 34 | github.com/go-sqlite/sqlite3 v0.0.0-20180313105335-53dd8e640ee7 // indirect 35 | github.com/godbus/dbus/v5 v5.1.0 // indirect 36 | github.com/gonuts/binary v0.2.0 // indirect 37 | github.com/google/pprof v0.0.0-20230207041349-798e818bf904 // indirect 38 | github.com/keybase/go-keychain v0.0.0-20231219164618-57a3676c3af6 // indirect 39 | github.com/kr/pretty v0.3.1 // indirect 40 | github.com/pmezard/go-difflib v1.0.0 // indirect 41 | github.com/rogpeppe/go-internal v1.10.0 // indirect 42 | github.com/tidwall/gjson v1.17.0 // indirect 43 | github.com/tidwall/match v1.1.1 // indirect 44 | github.com/tidwall/pretty v1.2.1 // indirect 45 | github.com/ysmood/fetchup v0.2.3 // indirect 46 | github.com/ysmood/goob v0.4.0 // indirect 47 | github.com/ysmood/got v0.34.1 // indirect 48 | github.com/ysmood/gson v0.7.3 // indirect 49 | github.com/ysmood/leakless v0.8.0 // indirect 50 | github.com/zalando/go-keyring v0.2.5 // indirect 51 | golang.org/x/crypto v0.29.0 // indirect 52 | golang.org/x/net v0.31.0 // indirect 53 | golang.org/x/sys v0.27.0 // indirect 54 | golang.org/x/term v0.26.0 // indirect 55 | golang.org/x/text v0.20.0 // indirect 56 | gopkg.in/yaml.v3 v3.0.1 // indirect 57 | www.velocidex.com/golang/go-ese v0.2.0 // indirect 58 | ) 59 | 
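Run and Dev above splice command-line overrides into the script's JSON config with sjson before any modules are loaded; Dev additionally forces depth to 0 and points the cache at a throwaway file. The following standalone sketch shows just that override step, with a made-up config and override keys:

package main

import (
	"fmt"

	"github.com/tidwall/sjson"
)

func main() {
	// Illustrative config JSON; in flyscrape it comes from the script's exported config.
	cfg := `{"url":"https://example.com","depth":2}`

	// Overrides are applied key by key; a key that fails to set is skipped,
	// matching the behaviour of updateCfgMultiple.
	overrides := map[string]any{"depth": 0, "cache": "file:dev.cache"}
	for k, v := range overrides {
		if next, err := sjson.Set(cfg, k, v); err == nil {
			cfg = next
		}
	}

	fmt.Println(cfg) // {"url":"https://example.com","depth":0,"cache":"file:dev.cache"}
}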
-------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | error() { 6 | echo -e "error:" "$@" >&2 7 | exit 1 8 | } 9 | 10 | if [[ ${OS:-} = Windows_NT ]]; then 11 | error "This installer does not support Windows." 12 | fi 13 | 14 | echo "Installing flyscrape" 15 | 16 | case $(uname -ms) in 17 | 'Darwin x86_64') 18 | target=darwin_amd64 19 | ;; 20 | 'Darwin arm64') 21 | target=darwin_arm64 22 | ;; 23 | 'Linux aarch64' | 'Linux arm64') 24 | target=linux_arm64 25 | ;; 26 | 'Linux x86_64' | *) 27 | target=linux_amd64 28 | ;; 29 | esac 30 | 31 | dir="$HOME/.flyscrape" 32 | 33 | mkdir -p "$dir" || 34 | error "Failed to create directory: $HOME/.flyscrape" 35 | 36 | 37 | archive="$dir/flyscrape_$target.tar.gz" 38 | url="https://github.com/philippta/flyscrape/releases/latest/download/flyscrape_0.9.0_$target.tar.gz" 39 | curl --fail --location --progress-bar --output "$archive" "$url" || 40 | error "Failed to download flyscrape from: $url" 41 | 42 | tar -xzf "$archive" -C "$dir" || 43 | error "Failed to extract downloaded archive." 44 | 45 | chmod +x "$dir/flyscrape" || 46 | error "Failed to chmod the flyscrape executable." 47 | 48 | rm "$archive" "$dir/README.md" "$dir/LICENSE" || 49 | error "Failed to clean up the downloaded archive." 50 | 51 | case $(basename "$SHELL") in 52 | zsh) 53 | # Add paths to zsh 54 | if [[ ":$PATH:" != *":$HOME/.flyscrape:"* ]]; then 55 | if [[ -w "$HOME/.zshrc" ]]; then 56 | echo "# flyscrape" >> "$HOME/.zshrc" 57 | echo "export PATH=\"$dir:\$PATH\"" >> "$HOME/.zshrc" 58 | else 59 | echo "" 60 | echo "Manually add the directory to ~/.zshrc (or similar):" 61 | echo " export PATH=\"$dir:\$PATH\"" 62 | fi 63 | fi 64 | ;; 65 | bash) 66 | # Add paths to bbash 67 | if [[ ":$PATH:" != *":$HOME/.flyscrape:"* ]]; then 68 | if [[ -w "$HOME/.bashrc" ]]; then 69 | echo "# flyscrape" >> "$HOME/.bashrc" 70 | echo "export PATH=$dir:\$PATH" >> "$HOME/.bashrc" 71 | else 72 | echo "" 73 | echo "Manually add the directory to ~/.bashrc (or similar):" 74 | echo " export PATH=$dir:\$PATH" 75 | fi 76 | fi 77 | ;; 78 | *) 79 | echo "" 80 | echo "Manually add the directory to ~/.bashrc (or similar):" 81 | echo " export PATH=$dir:\$PATH" 82 | ;; 83 | esac 84 | 85 | echo "" 86 | echo "The installation was successfull!" 87 | echo "" 88 | echo "Note:" 89 | echo "Please restart your terminal window. This ensures your system correctly detects flyscrape." 90 | echo "" 91 | -------------------------------------------------------------------------------- /js.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
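js.go below turns a user script into something Go can call: esbuild bundles the source, goja evaluates it, and the exported config plus default function surface as Exports. A standalone sketch of driving that API directly, with an invented script and HTML document:

package main

import (
	"fmt"

	"github.com/philippta/flyscrape"
)

func main() {
	script := `
export const config = { url: "https://example.com/" }

export default function ({ doc, url }) {
  return { title: doc.find("h1").text(), url: url }
}
`

	exports, err := flyscrape.Compile(script, nil)
	if err != nil {
		panic(err)
	}

	result, err := exports.Scrape(flyscrape.ScrapeParams{
		HTML: "<html><body><h1>Hello</h1></body></html>",
		URL:  "https://example.com/",
	})
	if err != nil {
		panic(err)
	}

	fmt.Println(result) // map[title:Hello url:https://example.com/]
}

The same pattern, with mock transports instead of literal HTML, is what the tests in js_test.go and js_lib_test.go rely on.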
4 | 5 | package flyscrape 6 | 7 | import ( 8 | _ "embed" 9 | "encoding/json" 10 | "errors" 11 | "fmt" 12 | "log" 13 | "net/url" 14 | "strings" 15 | "sync" 16 | 17 | "github.com/PuerkitoBio/goquery" 18 | "github.com/dop251/goja" 19 | "github.com/dop251/goja_nodejs/console" 20 | "github.com/dop251/goja_nodejs/require" 21 | "github.com/evanw/esbuild/pkg/api" 22 | ) 23 | 24 | //go:embed template.js 25 | var ScriptTemplate []byte 26 | 27 | type Config []byte 28 | 29 | type ScrapeParams struct { 30 | HTML string 31 | URL string 32 | Process func(url string) ([]byte, error) 33 | Follow func(url string) 34 | } 35 | 36 | type ScrapeFunc func(ScrapeParams) (any, error) 37 | 38 | type TransformError struct { 39 | Line int 40 | Column int 41 | Text string 42 | } 43 | 44 | func (err TransformError) Error() string { 45 | return fmt.Sprintf("%d:%d: %s", err.Line, err.Column, err.Text) 46 | } 47 | 48 | type Exports map[string]any 49 | 50 | func (e Exports) Config() []byte { 51 | b, _ := json.Marshal(e["config"]) 52 | return b 53 | } 54 | 55 | func (e Exports) Scrape(p ScrapeParams) (any, error) { 56 | fn := e["__scrape"].(ScrapeFunc) 57 | return fn(p) 58 | } 59 | 60 | type Imports map[string]map[string]any 61 | 62 | func Compile(src string, imports Imports) (Exports, error) { 63 | src, err := build(src) 64 | if err != nil { 65 | return nil, err 66 | } 67 | return vm(src, imports) 68 | } 69 | 70 | func build(src string) (string, error) { 71 | res := api.Build(api.BuildOptions{ 72 | Loader: map[string]api.Loader{ 73 | ".txt": api.LoaderText, 74 | ".json": api.LoaderJSON, 75 | }, 76 | Bundle: true, 77 | Stdin: &api.StdinOptions{ 78 | Contents: src, 79 | ResolveDir: ".", 80 | }, 81 | Platform: api.PlatformNode, 82 | Format: api.FormatCommonJS, 83 | External: []string{"flyscrape"}, 84 | }) 85 | 86 | var errs []error 87 | for _, msg := range res.Errors { 88 | err := TransformError{Text: msg.Text} 89 | if msg.Location != nil { 90 | err.Line = msg.Location.Line 91 | err.Column = msg.Location.Column 92 | } 93 | errs = append(errs, err) 94 | } 95 | if len(res.Errors) > 0 { 96 | return "", errors.Join(errs...) 
97 | } 98 | if len(res.OutputFiles) == 0 { 99 | return "", errors.New("no output generated") 100 | } 101 | 102 | return string(res.OutputFiles[0].Contents), nil 103 | } 104 | 105 | func vm(src string, imports Imports) (Exports, error) { 106 | vm := goja.New() 107 | registry := &require.Registry{} 108 | 109 | registry.Enable(vm) 110 | console.Enable(vm) 111 | 112 | for module, pkg := range imports { 113 | pkg := pkg 114 | registry.RegisterNativeModule(module, func(vm *goja.Runtime, o *goja.Object) { 115 | exports := vm.NewObject() 116 | 117 | for ident, val := range pkg { 118 | exports.Set(ident, val) 119 | } 120 | 121 | o.Set("exports", exports) 122 | }) 123 | } 124 | 125 | if _, err := vm.RunString("module = {}"); err != nil { 126 | return nil, fmt.Errorf("running defining module: %w", err) 127 | } 128 | if _, err := vm.RunString(src); err != nil { 129 | return nil, fmt.Errorf("running user script: %w", err) 130 | } 131 | 132 | v, err := vm.RunString("module.exports") 133 | if err != nil { 134 | return nil, fmt.Errorf("reading config: %w", err) 135 | } 136 | 137 | exports := Exports{} 138 | if goja.IsUndefined(v) { 139 | return exports, nil 140 | } 141 | 142 | obj := v.ToObject(vm) 143 | for _, key := range obj.Keys() { 144 | exports[key] = obj.Get(key).Export() 145 | } 146 | 147 | exports["__scrape"], err = scrape(vm) 148 | if err != nil { 149 | return nil, err 150 | } 151 | 152 | return exports, nil 153 | } 154 | 155 | func scrape(vm *goja.Runtime) (ScrapeFunc, error) { 156 | var lock sync.Mutex 157 | 158 | if v, err := vm.RunString("module.exports.default"); err != nil || goja.IsUndefined(v) { 159 | return nil, errors.New("default export is not defined") 160 | } 161 | 162 | defaultfn, err := vm.RunString("(o) => JSON.stringify(module.exports.default(o))") 163 | if err != nil { 164 | return nil, fmt.Errorf("failed to create scrape function: %w", err) 165 | } 166 | 167 | scrapefn, ok := defaultfn.Export().(func(goja.FunctionCall) goja.Value) 168 | if !ok { 169 | return nil, errors.New("failed to export scrape function") 170 | } 171 | 172 | var newArg func(p ScrapeParams) (*goja.Object, error) 173 | newArg = func(p ScrapeParams) (*goja.Object, error) { 174 | doc, err := DocumentFromString(p.HTML) 175 | if err != nil { 176 | return nil, err 177 | } 178 | 179 | baseurl, err := url.Parse(p.URL) 180 | if err != nil { 181 | return nil, err 182 | } 183 | 184 | absoluteURL := func(ref string) string { 185 | abs, err := baseurl.Parse(ref) 186 | if err != nil { 187 | return ref 188 | } 189 | return abs.String() 190 | } 191 | 192 | o := vm.NewObject() 193 | o.Set("url", p.URL) 194 | o.Set("doc", doc) 195 | o.Set("absoluteURL", absoluteURL) 196 | o.Set("scrape", func(url string, f func(goja.FunctionCall) goja.Value) goja.Value { 197 | url = absoluteURL(url) 198 | 199 | html, err := p.Process(url) 200 | if err != nil { 201 | return vm.ToValue(map[string]any{"error": err.Error()}) 202 | } 203 | 204 | newp := ScrapeParams{ 205 | HTML: string(html), 206 | URL: url, 207 | Process: p.Process, 208 | } 209 | 210 | arg, err := newArg(newp) 211 | if err != nil { 212 | return vm.ToValue(map[string]any{"error": err.Error()}) 213 | } 214 | 215 | return f(goja.FunctionCall{Arguments: []goja.Value{arg}}) 216 | }) 217 | o.Set("follow", func(url string) { 218 | p.Follow(absoluteURL(url)) 219 | }) 220 | 221 | return o, nil 222 | } 223 | 224 | return func(p ScrapeParams) (any, error) { 225 | lock.Lock() 226 | defer lock.Unlock() 227 | 228 | arg, err := newArg(p) 229 | if err != nil { 230 | return nil, err 231 | } 232 
| 233 | ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{arg}}) 234 | if goja.IsUndefined(ret) { 235 | return nil, nil 236 | } 237 | 238 | var result any 239 | if err := json.Unmarshal([]byte(ret.String()), &result); err != nil { 240 | log.Println(err) 241 | return nil, err 242 | } 243 | 244 | return result, nil 245 | }, nil 246 | } 247 | 248 | func DocumentFromString(s string) (map[string]any, error) { 249 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(s)) 250 | if err != nil { 251 | return nil, err 252 | } 253 | 254 | return Document(doc.Selection), nil 255 | } 256 | 257 | func Document(sel *goquery.Selection) map[string]any { 258 | o := map[string]any{} 259 | o["WARNING"] = "Forgot to call text(), html() or attr()?" 260 | o["text"] = sel.Text 261 | o["name"] = func() string { return sel.Get(0).Data } 262 | o["html"] = func() string { h, _ := goquery.OuterHtml(sel); return h } 263 | o["attr"] = func(name string) string { v, _ := sel.Attr(name); return v } 264 | o["hasAttr"] = func(name string) bool { _, ok := sel.Attr(name); return ok } 265 | o["hasClass"] = sel.HasClass 266 | o["length"] = sel.Length() 267 | o["first"] = func() map[string]any { return Document(sel.First()) } 268 | o["last"] = func() map[string]any { return Document(sel.Last()) } 269 | o["get"] = func(index int) map[string]any { return Document(sel.Eq(index)) } 270 | o["find"] = func(s string) map[string]any { return Document(sel.Find(s)) } 271 | o["next"] = func() map[string]any { return Document(sel.Next()) } 272 | o["nextAll"] = func() map[string]any { return Document(sel.NextAll()) } 273 | o["nextUntil"] = func(s string) map[string]any { return Document(sel.NextUntil(s)) } 274 | o["prev"] = func() map[string]any { return Document(sel.Prev()) } 275 | o["prevAll"] = func() map[string]any { return Document(sel.PrevAll()) } 276 | o["prevUntil"] = func(s string) map[string]any { return Document(sel.PrevUntil(s)) } 277 | o["siblings"] = func() map[string]any { return Document(sel.Siblings()) } 278 | o["children"] = func() map[string]any { return Document(sel.Children()) } 279 | o["parent"] = func() map[string]any { return Document(sel.Parent()) } 280 | o["map"] = func(callback func(map[string]any, int) any) []any { 281 | var vals []any 282 | sel.Map(func(i int, s *goquery.Selection) string { 283 | vals = append(vals, callback(Document(s), i)) 284 | return "" 285 | }) 286 | return vals 287 | } 288 | o["filter"] = func(callback func(map[string]any, int) bool) []any { 289 | var vals []any 290 | sel.Each(func(i int, s *goquery.Selection) { 291 | el := Document(s) 292 | ok := callback(el, i) 293 | if ok { 294 | vals = append(vals, el) 295 | } 296 | }) 297 | return vals 298 | } 299 | return o 300 | } 301 | -------------------------------------------------------------------------------- /js_lib.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package flyscrape 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "fmt" 11 | "io" 12 | "log" 13 | "mime" 14 | "net/http" 15 | gourl "net/url" 16 | "os" 17 | "path/filepath" 18 | "strings" 19 | 20 | "golang.org/x/sync/errgroup" 21 | ) 22 | 23 | func NewJSLibrary(client *http.Client) (imports Imports, wait func()) { 24 | downloads := &errgroup.Group{} 25 | 26 | // Allow 5 parallel downloads. Why 5? 
27 | // Docker downloads 3 layers in parallel. 28 | // My Chrome downloads up to 6 files in parallel. 29 | // 5 feels like a reasonable number. 30 | downloads.SetLimit(5) 31 | 32 | im := Imports{ 33 | "flyscrape": map[string]any{ 34 | "parse": jsParse(), 35 | }, 36 | "flyscrape/http": map[string]any{ 37 | "get": jsHTTPGet(client), 38 | "postForm": jsHTTPPostForm(client), 39 | "postJSON": jsHTTPPostJSON(client), 40 | "download": jsHTTPDownload(client, downloads), 41 | }, 42 | } 43 | 44 | return im, func() { downloads.Wait() } 45 | } 46 | 47 | func jsParse() func(html string) map[string]any { 48 | return func(html string) map[string]any { 49 | doc, err := DocumentFromString(html) 50 | if err != nil { 51 | return nil 52 | } 53 | return doc 54 | } 55 | } 56 | 57 | func jsHTTPGet(client *http.Client) func(url string) map[string]any { 58 | return func(url string) map[string]any { 59 | req, err := http.NewRequest("GET", url, nil) 60 | if err != nil { 61 | return map[string]any{"error": err.Error()} 62 | } 63 | return jsFetch(client, req) 64 | } 65 | } 66 | 67 | func jsHTTPPostForm(client *http.Client) func(url string, form map[string]any) map[string]any { 68 | return func(url string, form map[string]any) map[string]any { 69 | vals := gourl.Values{} 70 | for k, v := range form { 71 | switch v := v.(type) { 72 | case []any: 73 | for _, v := range v { 74 | vals.Add(k, fmt.Sprintf("%v", v)) 75 | } 76 | default: 77 | vals.Add(k, fmt.Sprintf("%v", v)) 78 | } 79 | } 80 | 81 | req, err := http.NewRequest("POST", url, strings.NewReader(vals.Encode())) 82 | if err != nil { 83 | return map[string]any{"error": err.Error()} 84 | } 85 | req.Header.Set("Content-Type", "application/x-www-form-urlencoded") 86 | 87 | return jsFetch(client, req) 88 | } 89 | } 90 | 91 | func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]any { 92 | return func(url string, data any) map[string]any { 93 | b, _ := json.Marshal(data) 94 | 95 | req, err := http.NewRequest("POST", url, bytes.NewReader(b)) 96 | if err != nil { 97 | return map[string]any{"error": err.Error()} 98 | } 99 | req.Header.Set("Content-Type", "application/json") 100 | 101 | return jsFetch(client, req) 102 | } 103 | } 104 | 105 | func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) { 106 | fileExists := func(name string) bool { 107 | _, err := os.Stat(name) 108 | return err == nil 109 | } 110 | 111 | isDir := func(path string) bool { 112 | if strings.HasSuffix(path, "/") { 113 | return true 114 | } 115 | if filepath.Ext(path) == "" { 116 | return true 117 | } 118 | s, err := os.Stat(path) 119 | return err == nil && s.IsDir() 120 | } 121 | 122 | suggestedFilename := func(url, contentDisp string) string { 123 | filename := filepath.Base(url) 124 | 125 | if contentDisp == "" { 126 | return filename 127 | } 128 | 129 | _, params, err := mime.ParseMediaType(contentDisp) 130 | if err != nil { 131 | return filename 132 | } 133 | 134 | name, ok := params["filename"] 135 | if !ok || name == "" { 136 | return filename 137 | } 138 | 139 | return filepath.Base(name) 140 | } 141 | 142 | return func(url string, dst string) { 143 | g.Go(func() error { 144 | req, err := http.NewRequest("GET", url, nil) 145 | if err != nil { 146 | log.Printf("error downloading file %q: %v", url, err) 147 | return nil 148 | } 149 | req.Header.Add(HeaderBypassCache, "true") 150 | 151 | resp, err := client.Do(req) 152 | if err != nil { 153 | log.Printf("error downloading file %q: %v", url, err) 154 | return nil 155 | } 156 | defer 
resp.Body.Close() 157 | 158 | if resp.StatusCode < 200 || resp.StatusCode >= 300 { 159 | log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode) 160 | return nil 161 | } 162 | 163 | dst, err = filepath.Abs(dst) 164 | if err != nil { 165 | log.Printf("error downloading file %q: abs path failed: %v", url, err) 166 | return nil 167 | } 168 | 169 | if isDir(dst) { 170 | name := suggestedFilename(url, resp.Header.Get("Content-Disposition")) 171 | dst = filepath.Join(dst, name) 172 | } 173 | 174 | if fileExists(dst) { 175 | return nil 176 | } 177 | 178 | os.MkdirAll(filepath.Dir(dst), 0o755) 179 | f, err := os.Create(dst) 180 | if err != nil { 181 | log.Printf("error downloading file %q: file save failed: %v", url, err) 182 | return nil 183 | } 184 | defer f.Close() 185 | 186 | io.Copy(f, resp.Body) 187 | return nil 188 | }) 189 | } 190 | } 191 | 192 | func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) { 193 | obj = map[string]any{ 194 | "body": "", 195 | "status": 0, 196 | "headers": map[string]any{}, 197 | "error": "", 198 | } 199 | 200 | resp, err := client.Do(req) 201 | if err != nil { 202 | obj["error"] = err.Error() 203 | return 204 | } 205 | defer resp.Body.Close() 206 | 207 | obj["status"] = resp.StatusCode 208 | 209 | b, err := io.ReadAll(resp.Body) 210 | if err != nil { 211 | obj["error"] = err.Error() 212 | return 213 | } 214 | 215 | obj["body"] = string(b) 216 | 217 | headers := map[string]any{} 218 | for name := range resp.Header { 219 | headers[name] = resp.Header.Get(name) 220 | } 221 | obj["headers"] = headers 222 | 223 | return 224 | } 225 | -------------------------------------------------------------------------------- /js_lib_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package flyscrape_test 6 | 7 | import ( 8 | "encoding/json" 9 | "net/http" 10 | "os" 11 | "sync/atomic" 12 | "testing" 13 | 14 | "github.com/philippta/flyscrape" 15 | "github.com/stretchr/testify/require" 16 | ) 17 | 18 | func TestJSLibParse(t *testing.T) { 19 | script := ` 20 | import { parse } from "flyscrape" 21 | 22 | const doc = parse('
<div class="foo">Hello world</div>
    ') 23 | export const text = doc.find(".foo").text() 24 | 25 | export default function () {} 26 | ` 27 | 28 | client := &http.Client{ 29 | Transport: flyscrape.MockTransport(200, html), 30 | } 31 | 32 | imports, _ := flyscrape.NewJSLibrary(client) 33 | exports, err := flyscrape.Compile(script, imports) 34 | require.NoError(t, err) 35 | 36 | h, ok := exports["text"].(string) 37 | require.True(t, ok) 38 | require.Equal(t, "Hello world", h) 39 | } 40 | 41 | func TestJSLibHTTPGet(t *testing.T) { 42 | script := ` 43 | import http from "flyscrape/http" 44 | 45 | const res = http.get("https://example.com") 46 | 47 | export const body = res.body; 48 | export const status = res.status; 49 | export const error = res.error; 50 | export const headers = res.headers; 51 | 52 | export default function () {} 53 | ` 54 | 55 | client := &http.Client{ 56 | Transport: flyscrape.MockTransport(200, html), 57 | } 58 | 59 | imports, _ := flyscrape.NewJSLibrary(client) 60 | exports, err := flyscrape.Compile(script, imports) 61 | require.NoError(t, err) 62 | 63 | body, ok := exports["body"].(string) 64 | require.True(t, ok) 65 | require.Equal(t, html, body) 66 | 67 | status, ok := exports["status"].(int64) 68 | require.True(t, ok) 69 | require.Equal(t, int64(200), status) 70 | 71 | error, ok := exports["error"].(string) 72 | require.True(t, ok) 73 | require.Equal(t, "", error) 74 | 75 | headers, ok := exports["headers"].(map[string]any) 76 | require.True(t, ok) 77 | require.NotEmpty(t, headers) 78 | } 79 | 80 | func TestJSLibHTTPPostForm(t *testing.T) { 81 | script := ` 82 | import http from "flyscrape/http" 83 | 84 | const res = http.postForm("https://example.com", { 85 | username: "foo", 86 | password: "bar", 87 | arr: [1,2,3], 88 | }) 89 | 90 | export const body = res.body; 91 | export const status = res.status; 92 | export const error = res.error; 93 | export const headers = res.headers; 94 | 95 | export default function () {} 96 | ` 97 | 98 | client := &http.Client{ 99 | Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 100 | require.Equal(t, "POST", r.Method) 101 | require.Equal(t, "application/x-www-form-urlencoded", r.Header.Get("Content-Type")) 102 | require.Equal(t, "foo", r.FormValue("username")) 103 | require.Equal(t, "bar", r.FormValue("password")) 104 | require.Len(t, r.Form["arr"], 3) 105 | 106 | return flyscrape.MockResponse(400, "Bad Request") 107 | }), 108 | } 109 | 110 | imports, _ := flyscrape.NewJSLibrary(client) 111 | exports, err := flyscrape.Compile(script, imports) 112 | require.NoError(t, err) 113 | 114 | body, ok := exports["body"].(string) 115 | require.True(t, ok) 116 | require.Equal(t, "Bad Request", body) 117 | 118 | status, ok := exports["status"].(int64) 119 | require.True(t, ok) 120 | require.Equal(t, int64(400), status) 121 | 122 | error, ok := exports["error"].(string) 123 | require.True(t, ok) 124 | require.Equal(t, "", error) 125 | 126 | headers, ok := exports["headers"].(map[string]any) 127 | require.True(t, ok) 128 | require.NotEmpty(t, headers) 129 | } 130 | 131 | func TestJSLibHTTPPostJSON(t *testing.T) { 132 | script := ` 133 | import http from "flyscrape/http" 134 | 135 | const res = http.postJSON("https://example.com", { 136 | username: "foo", 137 | password: "bar", 138 | }) 139 | 140 | export const body = res.body; 141 | export const status = res.status; 142 | export const error = res.error; 143 | export const headers = res.headers; 144 | 145 | export default function () {} 146 | ` 147 | 148 | client := &http.Client{ 149 | Transport: 
flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 150 | require.Equal(t, "POST", r.Method) 151 | require.Equal(t, "application/json", r.Header.Get("Content-Type")) 152 | 153 | m := map[string]any{} 154 | json.NewDecoder(r.Body).Decode(&m) 155 | require.Equal(t, "foo", m["username"]) 156 | require.Equal(t, "bar", m["password"]) 157 | 158 | return flyscrape.MockResponse(400, "Bad Request") 159 | }), 160 | } 161 | 162 | imports, _ := flyscrape.NewJSLibrary(client) 163 | exports, err := flyscrape.Compile(script, imports) 164 | require.NoError(t, err) 165 | 166 | body, ok := exports["body"].(string) 167 | require.True(t, ok) 168 | require.Equal(t, "Bad Request", body) 169 | 170 | status, ok := exports["status"].(int64) 171 | require.True(t, ok) 172 | require.Equal(t, int64(400), status) 173 | 174 | error, ok := exports["error"].(string) 175 | require.True(t, ok) 176 | require.Equal(t, "", error) 177 | 178 | headers, ok := exports["headers"].(map[string]any) 179 | require.True(t, ok) 180 | require.NotEmpty(t, headers) 181 | } 182 | 183 | func TestJSLibHTTPDownload(t *testing.T) { 184 | cwd, err := os.Getwd() 185 | require.NoError(t, err) 186 | 187 | tmpdir, err := os.MkdirTemp("", "http-download") 188 | require.NoError(t, err) 189 | 190 | defer os.RemoveAll(tmpdir) 191 | defer os.Chdir(cwd) 192 | os.Chdir(tmpdir) 193 | 194 | script := ` 195 | import http from "flyscrape/http"; 196 | 197 | http.download("https://example.com/foo.txt", "foo.txt"); 198 | http.download("https://example.com/foo.txt", "dir/my-foo.txt"); 199 | http.download("https://example.com/bar.txt", "dir/"); 200 | http.download("https://example.com/baz.txt", "dir"); 201 | http.download("https://example.com/content-disposition", "."); 202 | http.download("https://example.com/hack.txt", "."); 203 | http.download("https://example.com/no-dest.txt"); 204 | http.download("https://example.com/404.txt"); 205 | ` 206 | 207 | var nreqs atomic.Int32 208 | client := &http.Client{ 209 | Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 210 | nreqs.Add(1) 211 | 212 | if r.URL.Path == "/content-disposition" { 213 | resp, err := flyscrape.MockResponse(200, "hello world") 214 | resp.Header.Set("Content-Disposition", `attachment; filename="qux.txt"`) 215 | return resp, err 216 | } 217 | if r.URL.Path == "/hack.txt" { 218 | resp, err := flyscrape.MockResponse(200, "hello world") 219 | resp.Header.Set("Content-Disposition", `attachment; filename="../../hack.txt"`) 220 | return resp, err 221 | } 222 | if r.URL.Path == "/404.txt" { 223 | resp, err := flyscrape.MockResponse(404, "hello world") 224 | return resp, err 225 | } 226 | 227 | return flyscrape.MockResponse(200, "hello world") 228 | }), 229 | } 230 | 231 | imports, wait := flyscrape.NewJSLibrary(client) 232 | _, err = flyscrape.Compile(script, imports) 233 | require.NoError(t, err) 234 | 235 | wait() 236 | 237 | require.Equal(t, nreqs.Load(), int32(8)) 238 | require.FileExists(t, "foo.txt") 239 | require.FileExists(t, "dir/my-foo.txt") 240 | require.FileExists(t, "dir/bar.txt") 241 | require.FileExists(t, "dir/baz.txt") 242 | require.FileExists(t, "qux.txt") 243 | require.FileExists(t, "hack.txt") 244 | require.FileExists(t, "no-dest.txt") 245 | require.NoFileExists(t, "404.txt") 246 | } 247 | -------------------------------------------------------------------------------- /js_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | 
// License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package flyscrape_test 6 | 7 | import ( 8 | "encoding/json" 9 | "testing" 10 | 11 | "github.com/dop251/goja" 12 | "github.com/philippta/flyscrape" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | var html = ` 17 | 18 | 19 |
    20 |     <h1>headline</h1> 21 |     <p>paragraph</p> 22 |
    23 | 24 | ` 25 | 26 | var script = ` 27 | export const config = { 28 | url: "https://localhost/", 29 | } 30 | 31 | export default function({ doc, url }) { 32 | return { 33 | headline: doc.find("h1").text(), 34 | body: doc.find("p").text(), 35 | url: url, 36 | } 37 | } 38 | ` 39 | 40 | func TestJSScrape(t *testing.T) { 41 | exports, err := flyscrape.Compile(script, nil) 42 | require.NoError(t, err) 43 | require.NotNil(t, exports) 44 | require.NotEmpty(t, exports.Config) 45 | 46 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 47 | HTML: html, 48 | URL: "http://localhost/", 49 | }) 50 | 51 | require.NoError(t, err) 52 | 53 | m, ok := result.(map[string]any) 54 | require.True(t, ok) 55 | require.Equal(t, "headline", m["headline"]) 56 | require.Equal(t, "paragraph", m["body"]) 57 | require.Equal(t, "http://localhost/", m["url"]) 58 | } 59 | 60 | func TestJSScrapeObject(t *testing.T) { 61 | js := ` 62 | export default function() { 63 | return {foo: "bar"} 64 | } 65 | ` 66 | exports, err := flyscrape.Compile(js, nil) 67 | require.NoError(t, err) 68 | 69 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 70 | HTML: html, 71 | URL: "http://localhost/", 72 | }) 73 | require.NoError(t, err) 74 | 75 | m, ok := result.(map[string]any) 76 | require.True(t, ok) 77 | require.Equal(t, "bar", m["foo"]) 78 | } 79 | 80 | func TestJSScrapeNull(t *testing.T) { 81 | js := ` 82 | export default function() { 83 | return null 84 | } 85 | ` 86 | exports, err := flyscrape.Compile(js, nil) 87 | require.NoError(t, err) 88 | 89 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 90 | HTML: html, 91 | URL: "http://localhost/", 92 | }) 93 | require.NoError(t, err) 94 | require.Nil(t, result) 95 | } 96 | 97 | func TestJSScrapeUndefined(t *testing.T) { 98 | js := ` 99 | export default function() { } 100 | ` 101 | exports, err := flyscrape.Compile(js, nil) 102 | require.NoError(t, err) 103 | 104 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 105 | HTML: html, 106 | URL: "http://localhost/", 107 | }) 108 | require.NoError(t, err) 109 | require.Nil(t, result) 110 | } 111 | 112 | func TestJSScrapeString(t *testing.T) { 113 | js := ` 114 | export default function() { 115 | return "foo" 116 | } 117 | ` 118 | exports, err := flyscrape.Compile(js, nil) 119 | require.NoError(t, err) 120 | 121 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 122 | HTML: html, 123 | URL: "http://localhost/", 124 | }) 125 | require.NoError(t, err) 126 | 127 | m, ok := result.(string) 128 | require.True(t, ok) 129 | require.Equal(t, "foo", m) 130 | } 131 | 132 | func TestJSScrapeArray(t *testing.T) { 133 | js := ` 134 | export default function() { 135 | return [1,2,3] 136 | } 137 | ` 138 | exports, err := flyscrape.Compile(js, nil) 139 | require.NoError(t, err) 140 | 141 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 142 | HTML: html, 143 | URL: "http://localhost/", 144 | }) 145 | require.NoError(t, err) 146 | 147 | m, ok := result.([]any) 148 | require.True(t, ok) 149 | require.Equal(t, float64(1), m[0]) 150 | require.Equal(t, float64(2), m[1]) 151 | require.Equal(t, float64(3), m[2]) 152 | } 153 | 154 | func TestJSScrapeNaN(t *testing.T) { 155 | js := ` 156 | export default function() { 157 | return NaN 158 | } 159 | ` 160 | exports, err := flyscrape.Compile(js, nil) 161 | require.NoError(t, err) 162 | 163 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 164 | HTML: html, 165 | URL: "http://localhost/", 166 | }) 167 | require.NoError(t, err) 168 | require.Nil(t, result) 169 | } 170 | 171 | func 
TestJSScrapeParamURL(t *testing.T) { 172 | js := ` 173 | export default function({ url }) { 174 | return url; 175 | } 176 | ` 177 | exports, err := flyscrape.Compile(js, nil) 178 | require.NoError(t, err) 179 | 180 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 181 | HTML: html, 182 | URL: "http://localhost/", 183 | }) 184 | require.NoError(t, err) 185 | require.Equal(t, "http://localhost/", result) 186 | } 187 | 188 | func TestJSScrapeParamAbsoluteURL(t *testing.T) { 189 | js := ` 190 | export default function({ absoluteURL }) { 191 | return absoluteURL("/foo"); 192 | } 193 | ` 194 | exports, err := flyscrape.Compile(js, nil) 195 | require.NoError(t, err) 196 | 197 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 198 | HTML: html, 199 | URL: "http://localhost/", 200 | }) 201 | require.NoError(t, err) 202 | require.Equal(t, "http://localhost/foo", result) 203 | } 204 | 205 | func TestJSScrapeParamScrape(t *testing.T) { 206 | js := ` 207 | export default function({ scrape }) { 208 | return scrape("/foo", function({ url }) { 209 | return { 210 | url: url, 211 | foo: "bar", 212 | }; 213 | }); 214 | } 215 | ` 216 | exports, err := flyscrape.Compile(js, nil) 217 | require.NoError(t, err) 218 | 219 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 220 | HTML: html, 221 | URL: "http://localhost/", 222 | Process: func(url string) ([]byte, error) { 223 | return nil, nil 224 | }, 225 | }) 226 | require.NoError(t, err) 227 | require.Equal(t, map[string]any{ 228 | "url": "http://localhost/foo", 229 | "foo": "bar", 230 | }, result) 231 | } 232 | 233 | func TestJSScrapeParamScrapeDeep(t *testing.T) { 234 | js := ` 235 | export default function({ scrape }) { 236 | return scrape("/foo/", function({ url, scrape }) { 237 | return { 238 | url: url, 239 | deep: scrape("bar", function({ url }) { 240 | return url; 241 | }), 242 | }; 243 | }); 244 | } 245 | ` 246 | exports, err := flyscrape.Compile(js, nil) 247 | require.NoError(t, err) 248 | 249 | result, err := exports.Scrape(flyscrape.ScrapeParams{ 250 | HTML: html, 251 | URL: "http://localhost/", 252 | Process: func(url string) ([]byte, error) { 253 | return nil, nil 254 | }, 255 | }) 256 | require.NoError(t, err) 257 | require.Equal(t, map[string]any{ 258 | "url": "http://localhost/foo/", 259 | "deep": "http://localhost/foo/bar", 260 | }, result) 261 | } 262 | 263 | func TestJSScrapeParamFollow(t *testing.T) { 264 | js := ` 265 | export default function({ follow }) { 266 | follow("/foo") 267 | } 268 | ` 269 | exports, err := flyscrape.Compile(js, nil) 270 | require.NoError(t, err) 271 | 272 | var followedURL string 273 | _, err = exports.Scrape(flyscrape.ScrapeParams{ 274 | HTML: html, 275 | URL: "http://localhost/", 276 | Follow: func(url string) { 277 | followedURL = url 278 | }, 279 | }) 280 | require.NoError(t, err) 281 | require.Equal(t, "http://localhost/foo", followedURL) 282 | } 283 | 284 | func TestJSCompileError(t *testing.T) { 285 | exports, err := flyscrape.Compile("import foo;", nil) 286 | require.Error(t, err) 287 | require.Nil(t, exports) 288 | 289 | var terr flyscrape.TransformError 290 | require.ErrorAs(t, err, &terr) 291 | 292 | require.Equal(t, terr, flyscrape.TransformError{ 293 | Line: 1, 294 | Column: 10, 295 | Text: `Expected "from" but found ";"`, 296 | }) 297 | } 298 | 299 | func TestJSConfig(t *testing.T) { 300 | js := ` 301 | export const config = { 302 | url: 'http://localhost/', 303 | depth: 5, 304 | allowedDomains: ['example.com'], 305 | } 306 | export default function() {} 307 | ` 308 | exports, err := 
flyscrape.Compile(js, nil) 309 | require.NoError(t, err) 310 | require.NotNil(t, exports) 311 | require.NotEmpty(t, exports.Config()) 312 | 313 | type config struct { 314 | URL string `json:"url"` 315 | Depth int `json:"depth"` 316 | AllowedDomains []string `json:"allowedDomains"` 317 | } 318 | 319 | var cfg config 320 | err = json.Unmarshal(exports.Config(), &cfg) 321 | require.NoError(t, err) 322 | 323 | require.Equal(t, config{ 324 | URL: "http://localhost/", 325 | Depth: 5, 326 | AllowedDomains: []string{"example.com"}, 327 | }, cfg) 328 | } 329 | 330 | func TestJSImports(t *testing.T) { 331 | js := ` 332 | import A from "flyscrape" 333 | import { bar } from "flyscrape/foo" 334 | 335 | export const config = {} 336 | export default function() {} 337 | 338 | export const a = A.foo 339 | export const b = bar() 340 | ` 341 | imports := flyscrape.Imports{ 342 | "flyscrape": map[string]any{ 343 | "foo": 10, 344 | }, 345 | "flyscrape/foo": map[string]any{ 346 | "bar": func() string { 347 | return "baz" 348 | }, 349 | }, 350 | } 351 | 352 | exports, err := flyscrape.Compile(js, imports) 353 | require.NoError(t, err) 354 | require.NotNil(t, exports) 355 | 356 | require.Equal(t, int64(10), exports["a"].(int64)) 357 | require.Equal(t, "baz", exports["b"].(string)) 358 | } 359 | 360 | func TestJSArbitraryFunction(t *testing.T) { 361 | js := ` 362 | export const config = {} 363 | export default function() {} 364 | export function foo() { 365 | return "bar"; 366 | } 367 | ` 368 | exports, err := flyscrape.Compile(js, nil) 369 | require.NoError(t, err) 370 | require.NotNil(t, exports) 371 | 372 | foo := func() string { 373 | fn := exports["foo"].(func(goja.FunctionCall) goja.Value) 374 | return fn(goja.FunctionCall{}).String() 375 | } 376 | 377 | require.Equal(t, "bar", foo()) 378 | } 379 | 380 | func TestJSArbitraryConstString(t *testing.T) { 381 | js := ` 382 | export const config = {} 383 | export default function() {} 384 | export const foo = "bar" 385 | ` 386 | exports, err := flyscrape.Compile(js, nil) 387 | require.NoError(t, err) 388 | require.NotNil(t, exports) 389 | 390 | require.Equal(t, "bar", exports["foo"].(string)) 391 | } 392 | -------------------------------------------------------------------------------- /module.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
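module.go below defines the plugin surface: a module registers itself, gets the scrape config unmarshalled into it, and opts into hooks such as RequestValidator, TransportAdapter or Finalizer. Here is a hypothetical module modelled on the depth module further below; the maxURLs option does not exist in flyscrape:

package maxurls

import "github.com/philippta/flyscrape"

func init() {
	flyscrape.RegisterModule(Module{})
}

// Module drops requests once a configurable number of URLs has been seen.
// The "maxURLs" config key is invented for this sketch.
type Module struct {
	MaxURLs int `json:"maxURLs"`
	seen    int // a real module should guard this against concurrent use
}

func (Module) ModuleInfo() flyscrape.ModuleInfo {
	return flyscrape.ModuleInfo{
		ID:  "maxurls",
		New: func() flyscrape.Module { return new(Module) },
	}
}

func (m *Module) ValidateRequest(r *flyscrape.Request) bool {
	if m.MaxURLs <= 0 {
		return true // unset means unlimited
	}
	m.seen++
	return m.seen <= m.MaxURLs
}

var _ flyscrape.RequestValidator = (*Module)(nil)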
4 | 5 | package flyscrape 6 | 7 | import ( 8 | "encoding/json" 9 | "net/http" 10 | "sync" 11 | ) 12 | 13 | type Module interface { 14 | ModuleInfo() ModuleInfo 15 | } 16 | 17 | type ModuleInfo struct { 18 | ID string 19 | New func() Module 20 | } 21 | 22 | type TransportAdapter interface { 23 | AdaptTransport(http.RoundTripper) http.RoundTripper 24 | } 25 | 26 | type RequestValidator interface { 27 | ValidateRequest(*Request) bool 28 | } 29 | 30 | type RequestBuilder interface { 31 | BuildRequest(*Request) 32 | } 33 | 34 | type ResponseReceiver interface { 35 | ReceiveResponse(*Response) 36 | } 37 | 38 | type Provisioner interface { 39 | Provision(Context) 40 | } 41 | 42 | type Finalizer interface { 43 | Finalize() 44 | } 45 | 46 | func RegisterModule(mod Module) { 47 | modulesMu.Lock() 48 | defer modulesMu.Unlock() 49 | 50 | id := mod.ModuleInfo().ID 51 | if _, ok := modules[id]; ok { 52 | panic("module with id: " + id + " already registered") 53 | } 54 | modules[mod.ModuleInfo().ID] = mod 55 | } 56 | 57 | func LoadModules(cfg Config) []Module { 58 | modulesMu.RLock() 59 | defer modulesMu.RUnlock() 60 | 61 | loaded := map[string]struct{}{} 62 | mods := []Module{} 63 | 64 | // load standard modules in order 65 | for _, id := range moduleOrder { 66 | if _, ok := loaded[id]; ok { 67 | continue 68 | } 69 | mod := modules[id].ModuleInfo().New() 70 | if err := json.Unmarshal(cfg, mod); err != nil { 71 | panic("failed to decode config: " + err.Error()) 72 | } 73 | mods = append(mods, mod) 74 | loaded[id] = struct{}{} 75 | } 76 | 77 | // load custom modules 78 | for id := range modules { 79 | if _, ok := loaded[id]; ok { 80 | continue 81 | } 82 | mod := modules[id].ModuleInfo().New() 83 | if err := json.Unmarshal(cfg, mod); err != nil { 84 | panic("failed to decode config: " + err.Error()) 85 | } 86 | mods = append(mods, mod) 87 | loaded[id] = struct{}{} 88 | } 89 | 90 | return mods 91 | } 92 | 93 | var ( 94 | modules = map[string]Module{} 95 | modulesMu sync.RWMutex 96 | 97 | moduleOrder = []string{ 98 | // Transport adapters must be loaded in a specific order. 99 | // All other modules can be loaded in any order. 100 | "proxy", 101 | "browser", 102 | "retry", 103 | "ratelimit", 104 | "cache", 105 | "cookies", 106 | "headers", 107 | } 108 | ) 109 | -------------------------------------------------------------------------------- /modules/browser/browser.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
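The browser module below swaps the plain HTTP transport for a Chrome page driven by rod whenever the browser option is set. Its tests further below assemble a scraper directly from modules; a minimal sketch of that wiring (the URL is illustrative, and running it requires a Chrome/Chromium that the rod launcher can start):

package main

import (
	"fmt"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/browser"
	"github.com/philippta/flyscrape/modules/hook"
	"github.com/philippta/flyscrape/modules/starturl"
)

func main() {
	mods := []flyscrape.Module{
		&starturl.Module{URL: "https://example.com/"}, // illustrative start URL
		&browser.Module{Browser: true},                // fetch pages via headless Chrome
		&hook.Module{
			ReceiveResponseFn: func(r *flyscrape.Response) {
				fmt.Println(r.StatusCode, len(r.Body))
			},
		},
	}

	scraper := flyscrape.NewScraper()
	scraper.Modules = mods
	scraper.Run()
}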
4 | 5 | package browser 6 | 7 | import ( 8 | "fmt" 9 | "io" 10 | "log" 11 | "net/http" 12 | "os" 13 | "strings" 14 | "sync" 15 | "time" 16 | 17 | "github.com/go-rod/rod" 18 | "github.com/go-rod/rod/lib/launcher" 19 | "github.com/go-rod/rod/lib/proto" 20 | "github.com/philippta/flyscrape" 21 | ) 22 | 23 | func init() { 24 | flyscrape.RegisterModule(Module{}) 25 | } 26 | 27 | type Module struct { 28 | Browser bool `json:"browser"` 29 | Headless *bool `json:"headless"` 30 | 31 | browser *rod.Browser 32 | } 33 | 34 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 35 | return flyscrape.ModuleInfo{ 36 | ID: "browser", 37 | New: func() flyscrape.Module { return new(Module) }, 38 | } 39 | } 40 | 41 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 42 | if !m.Browser { 43 | return t 44 | } 45 | 46 | headless := true 47 | if m.Headless != nil { 48 | headless = *m.Headless 49 | } 50 | 51 | browser, err := newBrowser(headless) 52 | if err != nil { 53 | log.Println(err) 54 | os.Exit(1) 55 | } 56 | 57 | m.browser = browser 58 | 59 | return chromeTransport(browser) 60 | } 61 | 62 | func (m *Module) Finalize() { 63 | if m.browser != nil { 64 | m.browser.Close() 65 | } 66 | } 67 | 68 | func newBrowser(headless bool) (*rod.Browser, error) { 69 | serviceURL, err := launcher.New(). 70 | Headless(headless). 71 | Launch() 72 | if err != nil { 73 | return nil, fmt.Errorf("failed to launch browser: %w", err) 74 | } 75 | 76 | browser := rod.New().ControlURL(serviceURL).NoDefaultDevice() 77 | if err := browser.Connect(); err != nil { 78 | return nil, fmt.Errorf("failed to connect to browser: %w", err) 79 | } 80 | 81 | return browser, nil 82 | } 83 | 84 | func chromeTransport(browser *rod.Browser) flyscrape.RoundTripFunc { 85 | return func(r *http.Request) (*http.Response, error) { 86 | select { 87 | case <-r.Context().Done(): 88 | return nil, r.Context().Err() 89 | default: 90 | } 91 | 92 | page := browser.MustPage() 93 | defer page.Close() 94 | 95 | var once sync.Once 96 | var networkResponse *proto.NetworkResponse 97 | go page.EachEvent(func(e *proto.NetworkResponseReceived) { 98 | if e.Type != proto.NetworkResourceTypeDocument { 99 | return 100 | } 101 | once.Do(func() { 102 | networkResponse = e.Response 103 | }) 104 | })() 105 | 106 | page = page.Context(r.Context()) 107 | 108 | for h := range r.Header { 109 | if h == "Cookie" { 110 | continue 111 | } 112 | if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") { 113 | continue 114 | } 115 | page.MustSetExtraHeaders(h, r.Header.Get(h)) 116 | } 117 | 118 | page.SetCookies(parseCookies(r)) 119 | 120 | if err := page.Navigate(r.URL.String()); err != nil { 121 | return nil, err 122 | } 123 | 124 | timeout := page.Timeout(10 * time.Second) 125 | timeout.WaitLoad() 126 | timeout.WaitDOMStable(300*time.Millisecond, 0) 127 | timeout.WaitRequestIdle(time.Second, nil, nil, nil) 128 | 129 | html, err := page.HTML() 130 | if err != nil { 131 | return nil, err 132 | } 133 | 134 | resp := &http.Response{ 135 | StatusCode: 200, 136 | Status: "200 OK", 137 | Body: io.NopCloser(strings.NewReader(html)), 138 | Header: http.Header{"Content-Type": []string{"text/html"}}, 139 | } 140 | 141 | if networkResponse != nil { 142 | resp.StatusCode = networkResponse.Status 143 | resp.Status = networkResponse.StatusText 144 | resp.Header = http.Header{} 145 | 146 | for k, v := range networkResponse.Headers { 147 | resp.Header.Set(k, v.String()) 148 | } 149 | } 150 | 151 | return resp, err 152 | } 153 | } 154 | 155 | func parseCookies(r 
*http.Request) []*proto.NetworkCookieParam { 156 | rawCookie := r.Header.Get("Cookie") 157 | if rawCookie == "" { 158 | return nil 159 | } 160 | 161 | header := http.Header{} 162 | header.Add("Cookie", rawCookie) 163 | request := http.Request{Header: header} 164 | 165 | domainSegs := strings.Split(r.URL.Hostname(), ".") 166 | if len(domainSegs) < 2 { 167 | return nil 168 | } 169 | 170 | domain := "." + strings.Join(domainSegs[len(domainSegs)-2:], ".") 171 | 172 | var cookies []*proto.NetworkCookieParam 173 | for _, cookie := range request.Cookies() { 174 | cookies = append(cookies, &proto.NetworkCookieParam{ 175 | Name: cookie.Name, 176 | Value: cookie.Value, 177 | Domain: domain, 178 | Path: "/", 179 | Secure: false, 180 | HTTPOnly: false, 181 | SameSite: "Lax", 182 | Expires: -1, 183 | URL: r.URL.String(), 184 | }) 185 | } 186 | 187 | return cookies 188 | } 189 | 190 | var ( 191 | _ flyscrape.TransportAdapter = &Module{} 192 | _ flyscrape.Finalizer = &Module{} 193 | ) 194 | -------------------------------------------------------------------------------- /modules/browser/browser_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package browser_test 6 | 7 | import ( 8 | "fmt" 9 | "net/http" 10 | "net/http/httptest" 11 | "testing" 12 | 13 | "github.com/philippta/flyscrape" 14 | "github.com/philippta/flyscrape/modules/browser" 15 | "github.com/philippta/flyscrape/modules/headers" 16 | "github.com/philippta/flyscrape/modules/hook" 17 | "github.com/philippta/flyscrape/modules/starturl" 18 | "github.com/stretchr/testify/require" 19 | ) 20 | 21 | func TestBrowser(t *testing.T) { 22 | t.SkipNow() 23 | 24 | var called bool 25 | 26 | srv := newServer(func(w http.ResponseWriter, r *http.Request) { 27 | called = true 28 | w.Write([]byte(`

    Hello Browser

    Foo`)) 29 | }) 30 | defer srv.Close() 31 | 32 | var body string 33 | 34 | mods := []flyscrape.Module{ 35 | &starturl.Module{URL: srv.URL}, 36 | &browser.Module{Browser: true}, 37 | &hook.Module{ 38 | ReceiveResponseFn: func(r *flyscrape.Response) { 39 | body = string(r.Body) 40 | }, 41 | }, 42 | } 43 | 44 | scraper := flyscrape.NewScraper() 45 | scraper.Modules = mods 46 | scraper.Run() 47 | 48 | require.True(t, called) 49 | require.Contains(t, body, "Hello Browser") 50 | } 51 | func TestBrowserStatusCode(t *testing.T) { 52 | t.SkipNow() 53 | 54 | srv := newServer(func(w http.ResponseWriter, r *http.Request) { 55 | w.WriteHeader(404) 56 | }) 57 | defer srv.Close() 58 | 59 | var statusCode int 60 | 61 | mods := []flyscrape.Module{ 62 | &starturl.Module{URL: srv.URL}, 63 | &browser.Module{Browser: true}, 64 | &hook.Module{ 65 | ReceiveResponseFn: func(r *flyscrape.Response) { 66 | statusCode = r.StatusCode 67 | }, 68 | }, 69 | } 70 | 71 | scraper := flyscrape.NewScraper() 72 | scraper.Modules = mods 73 | scraper.Run() 74 | 75 | require.Equal(t, 404, statusCode) 76 | } 77 | 78 | func TestBrowserRequestHeader(t *testing.T) { 79 | t.SkipNow() 80 | 81 | srv := newServer(func(w http.ResponseWriter, r *http.Request) { 82 | w.Write([]byte(r.Header.Get("User-Agent"))) 83 | }) 84 | defer srv.Close() 85 | 86 | var body string 87 | 88 | mods := []flyscrape.Module{ 89 | &starturl.Module{URL: srv.URL}, 90 | &browser.Module{Browser: true}, 91 | &headers.Module{ 92 | Headers: map[string]string{ 93 | "User-Agent": "custom-headers", 94 | }, 95 | }, 96 | &hook.Module{ 97 | ReceiveResponseFn: func(r *flyscrape.Response) { 98 | body = string(r.Body) 99 | }, 100 | }, 101 | } 102 | 103 | scraper := flyscrape.NewScraper() 104 | scraper.Modules = mods 105 | scraper.Run() 106 | 107 | require.Contains(t, body, "custom-headers") 108 | } 109 | 110 | func TestBrowserResponseHeader(t *testing.T) { 111 | t.SkipNow() 112 | 113 | srv := newServer(func(w http.ResponseWriter, r *http.Request) { 114 | w.Header().Set("Foo", "bar") 115 | }) 116 | defer srv.Close() 117 | 118 | var header string 119 | 120 | mods := []flyscrape.Module{ 121 | &starturl.Module{URL: srv.URL}, 122 | &browser.Module{Browser: true}, 123 | &hook.Module{ 124 | ReceiveResponseFn: func(r *flyscrape.Response) { 125 | header = r.Headers.Get("Foo") 126 | }, 127 | }, 128 | } 129 | 130 | scraper := flyscrape.NewScraper() 131 | scraper.Modules = mods 132 | scraper.Run() 133 | 134 | require.Equal(t, header, "bar") 135 | } 136 | 137 | func TestBrowserUnsetFlyscrapeUserAgent(t *testing.T) { 138 | t.SkipNow() 139 | 140 | srv := newServer(func(w http.ResponseWriter, r *http.Request) { 141 | w.Write([]byte(r.Header.Get("User-Agent"))) 142 | }) 143 | defer srv.Close() 144 | 145 | var body string 146 | 147 | mods := []flyscrape.Module{ 148 | &starturl.Module{URL: srv.URL}, 149 | &browser.Module{Browser: true}, 150 | &hook.Module{ 151 | ReceiveResponseFn: func(r *flyscrape.Response) { 152 | body = string(r.Body) 153 | }, 154 | }, 155 | } 156 | 157 | scraper := flyscrape.NewScraper() 158 | scraper.Modules = mods 159 | scraper.Run() 160 | 161 | fmt.Println(body) 162 | require.Contains(t, body, "Mozilla/5.0") 163 | require.NotContains(t, body, "flyscrape") 164 | } 165 | 166 | func newServer(f func(http.ResponseWriter, *http.Request)) *httptest.Server { 167 | return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 168 | f(w, r) 169 | })) 170 | } 171 | -------------------------------------------------------------------------------- 
/modules/cache/boltstore.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cache 6 | 7 | import ( 8 | "errors" 9 | "log" 10 | "os" 11 | 12 | "go.etcd.io/bbolt" 13 | ) 14 | 15 | var cache = []byte("cache") 16 | 17 | func NewBoltStore(file string) *BoltStore { 18 | db, err := bbolt.Open(file, 0644, nil) 19 | if err != nil { 20 | log.Printf("cache: failed to create database file %q: %v\n", file, err) 21 | os.Exit(1) 22 | } 23 | 24 | c := &BoltStore{db: db} 25 | 26 | return c 27 | } 28 | 29 | type BoltStore struct { 30 | db *bbolt.DB 31 | } 32 | 33 | func (s *BoltStore) Get(key string) ([]byte, bool) { 34 | var value []byte 35 | 36 | err := s.db.View(func(tx *bbolt.Tx) error { 37 | bucket := tx.Bucket(cache) 38 | if bucket == nil { 39 | return errors.New("bucket not found") 40 | } 41 | 42 | v := bucket.Get([]byte(key)) 43 | if v == nil { 44 | return errors.New("key not found") 45 | } 46 | 47 | value = make([]byte, len(v)) 48 | copy(value, v) 49 | 50 | return nil 51 | }) 52 | if err != nil { 53 | return nil, false 54 | } 55 | return value, true 56 | } 57 | 58 | func (s *BoltStore) Set(key string, value []byte) { 59 | err := s.db.Update(func(tx *bbolt.Tx) error { 60 | bucket, err := tx.CreateBucketIfNotExists(cache) 61 | if err != nil { 62 | return err 63 | } 64 | 65 | return bucket.Put([]byte(key), value) 66 | }) 67 | if err != nil { 68 | log.Printf("cache: failed to insert cache key %q: %v\n", key, err) 69 | } 70 | } 71 | 72 | func (s *BoltStore) Close() { 73 | s.db.Close() 74 | } 75 | -------------------------------------------------------------------------------- /modules/cache/boltstore_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package cache_test 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | 11 | "github.com/philippta/flyscrape/modules/cache" 12 | "github.com/stretchr/testify/require" 13 | ) 14 | 15 | func TestBoltStore(t *testing.T) { 16 | dir, err := os.MkdirTemp("", "boltstore") 17 | require.NoError(t, err) 18 | defer os.RemoveAll(dir) 19 | 20 | store := cache.NewBoltStore(dir + "/test.db") 21 | 22 | v, ok := store.Get("foo") 23 | require.Nil(t, v) 24 | require.False(t, ok) 25 | 26 | store.Set("foo", []byte("bar")) 27 | 28 | v, ok = store.Get("foo") 29 | require.NotNil(t, v) 30 | require.True(t, ok) 31 | require.Equal(t, []byte("bar"), v) 32 | } 33 | -------------------------------------------------------------------------------- /modules/cache/cache.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
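cache.go below persists successful responses keyed by "METHOD URL" and replays them on later runs; the stored value is simply the wire form of the response. A standalone sketch of that dump-and-restore round trip, using httptest so no network is involved:

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"net/http/httputil"
)

func main() {
	// Fabricate a response; the cache module receives one from the wrapped transport.
	rec := httptest.NewRecorder()
	rec.Header().Set("Content-Type", "text/html")
	rec.WriteString("<h1>cached</h1>")
	resp := rec.Result()

	// This byte slice is what ends up in bolt under a key such as "GET https://example.com/".
	dump, err := httputil.DumpResponse(resp, true)
	if err != nil {
		panic(err)
	}

	// On a cache hit the bytes are parsed back into an *http.Response.
	// (cache.go passes the original request rather than nil.)
	restored, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(dump)), nil)
	if err != nil {
		panic(err)
	}

	body, _ := io.ReadAll(restored.Body)
	fmt.Println(restored.StatusCode, string(body)) // 200 <h1>cached</h1>
}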
4 | 5 | package cache 6 | 7 | import ( 8 | "bufio" 9 | "bytes" 10 | "net/http" 11 | "net/http/httputil" 12 | "path/filepath" 13 | "strings" 14 | 15 | "github.com/philippta/flyscrape" 16 | ) 17 | 18 | func init() { 19 | flyscrape.RegisterModule(Module{}) 20 | } 21 | 22 | type Module struct { 23 | Cache string `json:"cache"` 24 | 25 | store Store 26 | } 27 | 28 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 29 | return flyscrape.ModuleInfo{ 30 | ID: "cache", 31 | New: func() flyscrape.Module { return new(Module) }, 32 | } 33 | } 34 | 35 | func (m *Module) Provision(ctx flyscrape.Context) { 36 | switch { 37 | case m.Cache == "file": 38 | file := replaceExt(ctx.ScriptName(), ".cache") 39 | m.store = NewBoltStore(file) 40 | 41 | case strings.HasPrefix(m.Cache, "file:"): 42 | m.store = NewBoltStore(strings.TrimPrefix(m.Cache, "file:")) 43 | } 44 | } 45 | 46 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 47 | if m.store == nil { 48 | return t 49 | } 50 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 51 | if nocache(r) { 52 | return t.RoundTrip(r) 53 | } 54 | 55 | key := r.Method + " " + r.URL.String() 56 | if b, ok := m.store.Get(key); ok { 57 | if resp, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r); err == nil { 58 | return resp, nil 59 | } 60 | } 61 | 62 | resp, err := t.RoundTrip(r) 63 | if err != nil { 64 | return resp, err 65 | } 66 | 67 | // Avoid caching when running into rate limits or 68 | // when the page errored. 69 | if resp.StatusCode < 200 || resp.StatusCode > 299 { 70 | return resp, err 71 | } 72 | 73 | encoded, err := httputil.DumpResponse(resp, true) 74 | if err != nil { 75 | return resp, err 76 | } 77 | 78 | m.store.Set(key, encoded) 79 | return resp, nil 80 | }) 81 | } 82 | 83 | func (m *Module) Finalize() { 84 | if v, ok := m.store.(interface{ Close() }); ok { 85 | v.Close() 86 | } 87 | } 88 | 89 | func nocache(r *http.Request) bool { 90 | if r.Header.Get(flyscrape.HeaderBypassCache) != "" { 91 | r.Header.Del(flyscrape.HeaderBypassCache) 92 | return true 93 | } 94 | return false 95 | } 96 | 97 | func replaceExt(filePath string, newExt string) string { 98 | ext := filepath.Ext(filePath) 99 | if ext != "" { 100 | fileNameWithoutExt := filePath[:len(filePath)-len(ext)] 101 | newFilePath := fileNameWithoutExt + newExt 102 | return newFilePath 103 | } 104 | return filePath + newExt 105 | } 106 | 107 | type Store interface { 108 | Get(key string) ([]byte, bool) 109 | Set(key string, value []byte) 110 | } 111 | -------------------------------------------------------------------------------- /modules/cookies/cookies.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package cookies 6 | 7 | import ( 8 | "net/http" 9 | "slices" 10 | 11 | "github.com/browserutils/kooky" 12 | _ "github.com/browserutils/kooky/browser/chrome" 13 | _ "github.com/browserutils/kooky/browser/edge" 14 | _ "github.com/browserutils/kooky/browser/firefox" 15 | "github.com/philippta/flyscrape" 16 | ) 17 | 18 | var supportedBrowsers = []string{ 19 | "chrome", 20 | "edge", 21 | "firefox", 22 | } 23 | 24 | func init() { 25 | flyscrape.RegisterModule(Module{}) 26 | } 27 | 28 | type Module struct { 29 | Cookies string `json:"cookies"` 30 | } 31 | 32 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 33 | return flyscrape.ModuleInfo{ 34 | ID: "cookies", 35 | New: func() flyscrape.Module { return new(Module) }, 36 | } 37 | } 38 | 39 | func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 40 | if !slices.Contains(supportedBrowsers, m.Cookies) { 41 | return t 42 | } 43 | 44 | var stores []kooky.CookieStore 45 | for _, store := range kooky.FindAllCookieStores() { 46 | if store.Browser() == m.Cookies && store.IsDefaultProfile() { 47 | stores = append(stores, store) 48 | } 49 | } 50 | 51 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 52 | for _, store := range stores { 53 | for _, cookie := range store.Cookies(r.URL) { 54 | r.AddCookie(cookie) 55 | } 56 | } 57 | return t.RoundTrip(r) 58 | }) 59 | } 60 | 61 | var _ flyscrape.TransportAdapter = Module{} 62 | -------------------------------------------------------------------------------- /modules/depth/depth.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package depth 6 | 7 | import ( 8 | "github.com/philippta/flyscrape" 9 | ) 10 | 11 | func init() { 12 | flyscrape.RegisterModule(Module{}) 13 | } 14 | 15 | type Module struct { 16 | Depth int `json:"depth"` 17 | } 18 | 19 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 20 | return flyscrape.ModuleInfo{ 21 | ID: "depth", 22 | New: func() flyscrape.Module { return new(Module) }, 23 | } 24 | } 25 | 26 | func (m *Module) ValidateRequest(r *flyscrape.Request) bool { 27 | return r.Depth <= m.Depth 28 | } 29 | 30 | var _ flyscrape.RequestValidator = (*Module)(nil) 31 | -------------------------------------------------------------------------------- /modules/depth/depth_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package depth_test 6 | 7 | import ( 8 | "net/http" 9 | "sync" 10 | "testing" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/philippta/flyscrape/modules/depth" 14 | "github.com/philippta/flyscrape/modules/followlinks" 15 | "github.com/philippta/flyscrape/modules/hook" 16 | "github.com/philippta/flyscrape/modules/starturl" 17 | "github.com/stretchr/testify/require" 18 | ) 19 | 20 | func TestDepth(t *testing.T) { 21 | var urls []string 22 | var mu sync.Mutex 23 | 24 | mods := []flyscrape.Module{ 25 | &starturl.Module{URL: "http://www.example.com"}, 26 | &followlinks.Module{}, 27 | &depth.Module{Depth: 2}, 28 | hook.Module{ 29 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 30 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 31 | switch r.URL.String() { 32 | case "http://www.example.com": 33 | return flyscrape.MockResponse(200, `Google`) 34 | case "http://www.google.com": 35 | return flyscrape.MockResponse(200, `DuckDuckGo`) 36 | case "http://www.duckduckgo.com": 37 | return flyscrape.MockResponse(200, `Example`) 38 | } 39 | return flyscrape.MockResponse(200, "") 40 | }) 41 | }, 42 | ReceiveResponseFn: func(r *flyscrape.Response) { 43 | mu.Lock() 44 | urls = append(urls, r.Request.URL) 45 | mu.Unlock() 46 | }, 47 | }, 48 | } 49 | 50 | scraper := flyscrape.NewScraper() 51 | scraper.Modules = mods 52 | scraper.Run() 53 | 54 | require.Len(t, urls, 3) 55 | require.Contains(t, urls, "http://www.example.com") 56 | require.Contains(t, urls, "http://www.google.com") 57 | require.Contains(t, urls, "http://www.duckduckgo.com") 58 | } 59 | -------------------------------------------------------------------------------- /modules/domainfilter/domainfilter.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package domainfilter 6 | 7 | import ( 8 | "github.com/nlnwa/whatwg-url/url" 9 | "github.com/philippta/flyscrape" 10 | ) 11 | 12 | func init() { 13 | flyscrape.RegisterModule(Module{}) 14 | } 15 | 16 | type Module struct { 17 | URL string `json:"url"` 18 | URLs []string `json:"urls"` 19 | AllowedDomains []string `json:"allowedDomains"` 20 | BlockedDomains []string `json:"blockedDomains"` 21 | 22 | active bool 23 | } 24 | 25 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 26 | return flyscrape.ModuleInfo{ 27 | ID: "domainfilter", 28 | New: func() flyscrape.Module { return new(Module) }, 29 | } 30 | } 31 | 32 | func (m *Module) Provision(v flyscrape.Context) { 33 | if m.URL != "" { 34 | if u, err := url.Parse(m.URL); err == nil { 35 | m.AllowedDomains = append(m.AllowedDomains, u.Host()) 36 | } 37 | } 38 | for _, u := range m.URLs { 39 | if u, err := url.Parse(u); err == nil { 40 | m.AllowedDomains = append(m.AllowedDomains, u.Host()) 41 | } 42 | } 43 | } 44 | 45 | func (m *Module) ValidateRequest(r *flyscrape.Request) bool { 46 | if m.disabled() { 47 | return true 48 | } 49 | 50 | u, err := url.Parse(r.URL) 51 | if err != nil { 52 | return false 53 | } 54 | 55 | host := u.Host() 56 | ok := false 57 | 58 | for _, domain := range m.AllowedDomains { 59 | if domain == "*" || host == domain { 60 | ok = true 61 | break 62 | } 63 | } 64 | 65 | for _, domain := range m.BlockedDomains { 66 | if host == domain { 67 | ok = false 68 | break 69 | } 70 | } 71 | 72 | return ok 73 | } 74 | 75 | func (m *Module) disabled() bool { 76 | return len(m.AllowedDomains) == 0 && len(m.BlockedDomains) == 0 77 | } 78 | 79 | var ( 80 | _ flyscrape.RequestValidator = (*Module)(nil) 81 | _ flyscrape.Provisioner = (*Module)(nil) 82 | ) 83 | -------------------------------------------------------------------------------- /modules/domainfilter/domainfilter_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package domainfilter_test 6 | 7 | import ( 8 | "net/http" 9 | "sync" 10 | "testing" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/philippta/flyscrape/modules/domainfilter" 14 | "github.com/philippta/flyscrape/modules/followlinks" 15 | "github.com/philippta/flyscrape/modules/hook" 16 | "github.com/philippta/flyscrape/modules/starturl" 17 | "github.com/stretchr/testify/require" 18 | ) 19 | 20 | func TestDomainfilterAllowed(t *testing.T) { 21 | var urls []string 22 | var mu sync.Mutex 23 | 24 | mods := []flyscrape.Module{ 25 | &starturl.Module{URL: "http://www.example.com"}, 26 | &followlinks.Module{}, 27 | &domainfilter.Module{ 28 | URL: "http://www.example.com", 29 | AllowedDomains: []string{"www.google.com"}, 30 | }, 31 | hook.Module{ 32 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 33 | return flyscrape.MockTransport(200, ` 34 | Google 35 | DuckDuckGo`) 36 | }, 37 | ReceiveResponseFn: func(r *flyscrape.Response) { 38 | mu.Lock() 39 | urls = append(urls, r.Request.URL) 40 | mu.Unlock() 41 | }, 42 | }, 43 | } 44 | 45 | scraper := flyscrape.NewScraper() 46 | scraper.Modules = mods 47 | scraper.Run() 48 | 49 | require.Len(t, urls, 2) 50 | require.Contains(t, urls, "http://www.example.com") 51 | require.Contains(t, urls, "http://www.google.com") 52 | } 53 | 54 | func TestDomainfilterAllowedAll(t *testing.T) { 55 | var urls []string 56 | var mu sync.Mutex 57 | 58 | mods := []flyscrape.Module{ 59 | &starturl.Module{URL: "http://www.example.com"}, 60 | &followlinks.Module{}, 61 | &domainfilter.Module{ 62 | URL: "http://www.example.com", 63 | AllowedDomains: []string{"*"}, 64 | }, 65 | hook.Module{ 66 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 67 | return flyscrape.MockTransport(200, ` 68 | Google 69 | DuckDuckGo`) 70 | }, 71 | ReceiveResponseFn: func(r *flyscrape.Response) { 72 | mu.Lock() 73 | urls = append(urls, r.Request.URL) 74 | mu.Unlock() 75 | }, 76 | }, 77 | } 78 | 79 | scraper := flyscrape.NewScraper() 80 | scraper.Modules = mods 81 | scraper.Run() 82 | 83 | require.Len(t, urls, 3) 84 | require.Contains(t, urls, "http://www.example.com") 85 | require.Contains(t, urls, "http://www.duckduckgo.com") 86 | require.Contains(t, urls, "http://www.google.com") 87 | } 88 | 89 | func TestDomainfilterBlocked(t *testing.T) { 90 | var urls []string 91 | var mu sync.Mutex 92 | 93 | mods := []flyscrape.Module{ 94 | &starturl.Module{URL: "http://www.example.com"}, 95 | &followlinks.Module{}, 96 | &domainfilter.Module{ 97 | URL: "http://www.example.com", 98 | AllowedDomains: []string{"*"}, 99 | BlockedDomains: []string{"www.google.com"}, 100 | }, 101 | hook.Module{ 102 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 103 | return flyscrape.MockTransport(200, ` 104 | Google 105 | DuckDuckGo`) 106 | }, 107 | ReceiveResponseFn: func(r *flyscrape.Response) { 108 | mu.Lock() 109 | urls = append(urls, r.Request.URL) 110 | mu.Unlock() 111 | }, 112 | }, 113 | } 114 | 115 | scraper := flyscrape.NewScraper() 116 | scraper.Modules = mods 117 | scraper.Run() 118 | 119 | require.Len(t, urls, 2) 120 | require.Contains(t, urls, "http://www.example.com") 121 | require.Contains(t, urls, "http://www.duckduckgo.com") 122 | } 123 | -------------------------------------------------------------------------------- /modules/followlinks/followlinks.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package followlinks 6 | 7 | import ( 8 | "net/url" 9 | "regexp" 10 | "strings" 11 | 12 | "github.com/PuerkitoBio/goquery" 13 | "github.com/philippta/flyscrape" 14 | ) 15 | 16 | func init() { 17 | flyscrape.RegisterModule(Module{}) 18 | } 19 | 20 | type Module struct { 21 | Follow *[]string `json:"follow"` 22 | } 23 | 24 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 25 | return flyscrape.ModuleInfo{ 26 | ID: "followlinks", 27 | New: func() flyscrape.Module { return new(Module) }, 28 | } 29 | } 30 | 31 | func (m *Module) Provision(ctx flyscrape.Context) { 32 | if m.Follow == nil { 33 | m.Follow = &[]string{"a[href]"} 34 | } 35 | } 36 | 37 | func (m *Module) ReceiveResponse(resp *flyscrape.Response) { 38 | if m.Follow == nil { 39 | return 40 | } 41 | 42 | for _, link := range m.parseLinks(string(resp.Body), resp.Request.URL) { 43 | resp.Visit(link) 44 | } 45 | } 46 | 47 | func (m *Module) parseLinks(html string, origin string) []string { 48 | if m.Follow == nil { 49 | return nil 50 | } 51 | 52 | var links []string 53 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) 54 | if err != nil { 55 | return nil 56 | } 57 | 58 | originurl, err := url.Parse(origin) 59 | if err != nil { 60 | return nil 61 | } 62 | 63 | uniqueLinks := make(map[string]bool) 64 | 65 | for _, selector := range *m.Follow { 66 | attr := parseSelectorAttr(selector) 67 | doc.Find(selector).Each(func(i int, s *goquery.Selection) { 68 | link, _ := s.Attr(attr) 69 | 70 | parsedLink, err := originurl.Parse(link) 71 | 72 | if err != nil || !isValidLink(parsedLink) { 73 | return 74 | } 75 | 76 | absLink := parsedLink.String() 77 | 78 | if !uniqueLinks[absLink] { 79 | links = append(links, absLink) 80 | uniqueLinks[absLink] = true 81 | } 82 | }) 83 | } 84 | 85 | return links 86 | } 87 | 88 | func isValidLink(link *url.URL) bool { 89 | if link.Scheme != "" && link.Scheme != "http" && link.Scheme != "https" { 90 | return false 91 | } 92 | 93 | return true 94 | } 95 | 96 | func parseSelectorAttr(sel string) string { 97 | matches := selectorExpr.FindAllString(sel, -1) 98 | if len(matches) == 0 { 99 | return "href" 100 | } 101 | 102 | attr := attrExpr.FindString(matches[len(matches)-1]) 103 | if attr == "" { 104 | return "href" 105 | } 106 | 107 | return attr 108 | } 109 | 110 | var ( 111 | _ flyscrape.Provisioner = (*Module)(nil) 112 | _ flyscrape.ResponseReceiver = (*Module)(nil) 113 | ) 114 | 115 | var ( 116 | selectorExpr = regexp.MustCompile(`\[(.*?)\]`) 117 | attrExpr = regexp.MustCompile(`[\w-]+`) 118 | ) 119 | -------------------------------------------------------------------------------- /modules/followlinks/followlinks_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package followlinks_test 6 | 7 | import ( 8 | "net/http" 9 | "sync" 10 | "testing" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/philippta/flyscrape/modules/followlinks" 14 | "github.com/philippta/flyscrape/modules/hook" 15 | "github.com/philippta/flyscrape/modules/starturl" 16 | "github.com/stretchr/testify/require" 17 | ) 18 | 19 | func TestFollowLinks(t *testing.T) { 20 | var urls []string 21 | var mu sync.Mutex 22 | 23 | mods := []flyscrape.Module{ 24 | &starturl.Module{URL: "http://www.example.com/foo/bar"}, 25 | &followlinks.Module{}, 26 | hook.Module{ 27 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 28 | return flyscrape.MockTransport(200, ` 29 | Baz 30 | Baz 31 | Google`) 32 | }, 33 | ReceiveResponseFn: func(r *flyscrape.Response) { 34 | mu.Lock() 35 | urls = append(urls, r.Request.URL) 36 | mu.Unlock() 37 | }, 38 | }, 39 | } 40 | 41 | scraper := flyscrape.NewScraper() 42 | scraper.Modules = mods 43 | scraper.Run() 44 | 45 | require.Len(t, urls, 5) 46 | require.Contains(t, urls, "http://www.example.com/baz") 47 | require.Contains(t, urls, "http://www.example.com/foo/bar") 48 | require.Contains(t, urls, "http://www.example.com/foo/baz") 49 | require.Contains(t, urls, "http://www.google.com") 50 | require.Contains(t, urls, "http://www.google.com/baz") 51 | } 52 | 53 | func TestFollowSelector(t *testing.T) { 54 | var urls []string 55 | var mu sync.Mutex 56 | 57 | mods := []flyscrape.Module{ 58 | &starturl.Module{URL: "http://www.example.com/foo/bar"}, 59 | &followlinks.Module{ 60 | Follow: &[]string{".next a[href]"}, 61 | }, 62 | hook.Module{ 63 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 64 | return flyscrape.MockTransport(200, ` 65 | Baz 66 | Baz 67 | `) 70 | }, 71 | ReceiveResponseFn: func(r *flyscrape.Response) { 72 | mu.Lock() 73 | urls = append(urls, r.Request.URL) 74 | mu.Unlock() 75 | }, 76 | }, 77 | } 78 | 79 | scraper := flyscrape.NewScraper() 80 | scraper.Modules = mods 81 | scraper.Run() 82 | 83 | require.Len(t, urls, 2) 84 | require.Contains(t, urls, "http://www.example.com/foo/bar") 85 | require.Contains(t, urls, "http://www.google.com") 86 | } 87 | 88 | func TestFollowDataAttr(t *testing.T) { 89 | var urls []string 90 | var mu sync.Mutex 91 | 92 | mods := []flyscrape.Module{ 93 | &starturl.Module{URL: "http://www.example.com/foo/bar"}, 94 | &followlinks.Module{ 95 | Follow: &[]string{"[data-url]"}, 96 | }, 97 | hook.Module{ 98 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 99 | return flyscrape.MockTransport(200, ` 100 | Baz 101 | Baz 102 |
    <div data-url="http://www.google.com">Google</div>
    `) 103 | }, 104 | ReceiveResponseFn: func(r *flyscrape.Response) { 105 | mu.Lock() 106 | urls = append(urls, r.Request.URL) 107 | mu.Unlock() 108 | }, 109 | }, 110 | } 111 | 112 | scraper := flyscrape.NewScraper() 113 | scraper.Modules = mods 114 | scraper.Run() 115 | 116 | require.Len(t, urls, 2) 117 | require.Contains(t, urls, "http://www.example.com/foo/bar") 118 | require.Contains(t, urls, "http://www.google.com") 119 | } 120 | 121 | func TestFollowMultiple(t *testing.T) { 122 | var urls []string 123 | var mu sync.Mutex 124 | 125 | mods := []flyscrape.Module{ 126 | &starturl.Module{URL: "http://www.example.com/foo/bar"}, 127 | &followlinks.Module{ 128 | Follow: &[]string{"a.prev", "a.next"}, 129 | }, 130 | hook.Module{ 131 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 132 | return flyscrape.MockTransport(200, ` 133 | Baz 134 | 135 | `) 136 | }, 137 | ReceiveResponseFn: func(r *flyscrape.Response) { 138 | mu.Lock() 139 | urls = append(urls, r.Request.URL) 140 | mu.Unlock() 141 | }, 142 | }, 143 | } 144 | 145 | scraper := flyscrape.NewScraper() 146 | scraper.Modules = mods 147 | scraper.Run() 148 | 149 | require.Len(t, urls, 3) 150 | require.Contains(t, urls, "http://www.example.com/foo/bar") 151 | require.Contains(t, urls, "http://www.example.com/foo/a") 152 | require.Contains(t, urls, "http://www.example.com/foo/b") 153 | } 154 | 155 | func TestFollowNoFollow(t *testing.T) { 156 | var urls []string 157 | var mu sync.Mutex 158 | 159 | mods := []flyscrape.Module{ 160 | &starturl.Module{URL: "http://www.example.com/foo/bar"}, 161 | &followlinks.Module{ 162 | Follow: &[]string{}, 163 | }, 164 | hook.Module{ 165 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 166 | return flyscrape.MockTransport(200, ` 167 | Baz 168 | Baz 169 | `) 172 | }, 173 | ReceiveResponseFn: func(r *flyscrape.Response) { 174 | mu.Lock() 175 | urls = append(urls, r.Request.URL) 176 | mu.Unlock() 177 | }, 178 | }, 179 | } 180 | 181 | scraper := flyscrape.NewScraper() 182 | scraper.Modules = mods 183 | scraper.Run() 184 | 185 | require.Len(t, urls, 1) 186 | require.Contains(t, urls, "http://www.example.com/foo/bar") 187 | } 188 | -------------------------------------------------------------------------------- /modules/headers/headers.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package headers 6 | 7 | import ( 8 | "net/http" 9 | 10 | "github.com/philippta/flyscrape" 11 | ) 12 | 13 | func init() { 14 | flyscrape.RegisterModule(Module{}) 15 | } 16 | 17 | type Module struct { 18 | Headers map[string]string `json:"headers"` 19 | } 20 | 21 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 22 | return flyscrape.ModuleInfo{ 23 | ID: "headers", 24 | New: func() flyscrape.Module { return new(Module) }, 25 | } 26 | } 27 | 28 | func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 29 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 30 | for k, v := range m.Headers { 31 | r.Header.Set(k, v) 32 | } 33 | 34 | if r.Header.Get("User-Agent") == "" { 35 | r.Header.Set("User-Agent", randomUserAgent()) 36 | } 37 | 38 | return t.RoundTrip(r) 39 | }) 40 | } 41 | 42 | var _ flyscrape.TransportAdapter = Module{} 43 | -------------------------------------------------------------------------------- /modules/headers/headers_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package headers_test 6 | 7 | import ( 8 | "net/http" 9 | "strings" 10 | "testing" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/philippta/flyscrape/modules/headers" 14 | "github.com/philippta/flyscrape/modules/hook" 15 | "github.com/philippta/flyscrape/modules/starturl" 16 | "github.com/stretchr/testify/require" 17 | ) 18 | 19 | func TestHeaders(t *testing.T) { 20 | gotHeaders := map[string]string{} 21 | sentHeaders := map[string]string{ 22 | "Authorization": "Basic ZGVtbzpwQDU1dzByZA==", 23 | "User-Agent": "Gecko/1.0", 24 | } 25 | 26 | mods := []flyscrape.Module{ 27 | hook.Module{ 28 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 29 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 30 | for k := range r.Header { 31 | gotHeaders[k] = r.Header.Get(k) 32 | } 33 | return flyscrape.MockResponse(200, "") 34 | }) 35 | }, 36 | }, 37 | &starturl.Module{URL: "http://www.example.com"}, 38 | &headers.Module{ 39 | Headers: sentHeaders, 40 | }, 41 | } 42 | 43 | scraper := flyscrape.NewScraper() 44 | scraper.Modules = mods 45 | scraper.Run() 46 | 47 | require.Equal(t, sentHeaders, gotHeaders) 48 | } 49 | 50 | func TestHeadersRandomUserAgent(t *testing.T) { 51 | var userAgent string 52 | mods := []flyscrape.Module{ 53 | hook.Module{ 54 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 55 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 56 | userAgent = r.Header.Get("User-Agent") 57 | return flyscrape.MockResponse(200, "") 58 | }) 59 | }, 60 | }, 61 | &starturl.Module{URL: "http://www.example.com"}, 62 | &headers.Module{}, 63 | } 64 | 65 | scraper := flyscrape.NewScraper() 66 | scraper.Modules = mods 67 | scraper.Run() 68 | 69 | require.NotEmpty(t, userAgent) 70 | require.True(t, strings.HasPrefix(userAgent, "Mozilla/5.0 (")) 71 | } 72 | -------------------------------------------------------------------------------- /modules/headers/versions.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package headers 6 | 7 | import ( 8 | _ "embed" 9 | "fmt" 10 | "math/rand" 11 | "strings" 12 | ) 13 | 14 | //go:generate bash -c "flyscrape run ../../examples/useragents/chrome.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_chrome.txt" 15 | //go:generate bash -c "flyscrape run ../../examples/useragents/firefox.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_firefox.txt" 16 | //go:generate bash -c "flyscrape run ../../examples/useragents/edge.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_edge.txt" 17 | //go:generate bash -c "flyscrape run ../../examples/useragents/opera.js | jq -r '[.[].data] | flatten | .[]' | sort -nr | uniq > versions_opera.txt" 18 | 19 | //go:embed versions_chrome.txt 20 | var versionsChromeRaw string 21 | var versionsChrome = strings.Split(strings.TrimSpace(versionsChromeRaw), "\n") 22 | 23 | //go:embed versions_firefox.txt 24 | var versionsFirefoxRaw string 25 | var versionsFirefox = strings.Split(strings.TrimSpace(versionsFirefoxRaw), "\n") 26 | 27 | //go:embed versions_edge.txt 28 | var versionsEdgeRaw string 29 | var versionsEdge = strings.Split(strings.TrimSpace(versionsEdgeRaw), "\n") 30 | 31 | //go:embed versions_opera.txt 32 | var versionsOperaRaw string 33 | var versionsOpera = strings.Split(strings.TrimSpace(versionsOperaRaw), "\n") 34 | 35 | //go:embed versions_macos.txt 36 | var versionsMacOSRaw string 37 | var versionsMacOS = strings.Split(strings.TrimSpace(versionsMacOSRaw), "\n") 38 | 39 | //go:embed versions_windows.txt 40 | var versionsWindowsRaw string 41 | var versionsWindows = strings.Split(strings.TrimSpace(versionsWindowsRaw), "\n") 42 | 43 | //go:embed versions_linux.txt 44 | var versionsLinuxRaw string 45 | var versionsLinux = strings.Split(strings.TrimSpace(versionsLinuxRaw), "\n") 46 | 47 | func randomUAChrome() string { 48 | f := "Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36" 49 | return fmt.Sprintf(f, randomOS(), random(versionsChrome)) 50 | } 51 | 52 | func randomUAFirefox() string { 53 | f := "Mozilla/5.0 (%s; rv:%s) Gecko/20100101 Firefox/%s" 54 | ver := random(versionsFirefox) 55 | return fmt.Sprintf(f, randomOS(), ver, ver) 56 | } 57 | 58 | func randomUAEdge() string { 59 | f := "Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/%s" 60 | return fmt.Sprintf(f, randomOS(), random(versionsEdge)) 61 | } 62 | 63 | func randomUAOpera() string { 64 | f := "Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/%s" 65 | return fmt.Sprintf(f, randomOS(), random(versionsOpera)) 66 | } 67 | 68 | func randomUserAgent() string { 69 | switch rand.Intn(4) { 70 | case 0: 71 | return randomUAChrome() 72 | case 1: 73 | return randomUAFirefox() 74 | case 2: 75 | return randomUAEdge() 76 | case 3: 77 | return randomUAOpera() 78 | } 79 | panic("rand.Intn is broken") 80 | } 81 | 82 | func randomOS() string { 83 | switch rand.Intn(3) { 84 | case 0: 85 | return random(versionsMacOS) 86 | case 1: 87 | return random(versionsWindows) 88 | case 2: 89 | return random(versionsLinux) 90 | } 91 | panic("rand.Intn is broken") 92 | } 93 | 94 | func random(ss []string) string { 95 | return ss[rand.Intn(len(ss))] 96 | } 97 | -------------------------------------------------------------------------------- /modules/headers/versions_chrome.txt: 
-------------------------------------------------------------------------------- 1 | 123.0.6312.52 2 | 123.0.6312.40 3 | 122.0.6261.94 4 | 122.0.6261.90 5 | 122.0.6261.89 6 | 122.0.6261.69 7 | 122.0.6261.64 8 | 122.0.6261.62 9 | 122.0.6261.51 10 | 122.0.6261.48 11 | 122.0.6261.129 12 | 122.0.6261.128 13 | 122.0.6261.119 14 | 122.0.6261.112 15 | 122.0.6261.111 16 | 122.0.6261.105 17 | 122.0.6045.214 18 | 121.0.6167.85 19 | 121.0.6167.66 20 | 121.0.6167.184 21 | 121.0.6167.178 22 | 121.0.6167.171 23 | 121.0.6167.164 24 | 121.0.6167.160 25 | 121.0.6167.143 26 | 121.0.6167.139 27 | 121.0.6167.138 28 | 121.0.6167.101 29 | 120.0.6099.71 30 | 120.0.6099.50 31 | 120.0.6099.43 32 | 120.0.6099.230 33 | 120.0.6099.216 34 | 120.0.6099.210 35 | 120.0.6099.199 36 | 120.0.6099.193 37 | 120.0.6099.130 38 | 120.0.6099.129 39 | 120.0.6099.119 40 | 120.0.6099.109 41 | 120.0.6099.101 42 | 119.0.6045.66 43 | 119.0.6045.41 44 | 119.0.6045.214 45 | 119.0.6045.200 46 | 119.0.6045.199 47 | 119.0.6045.193 48 | 119.0.6045.169 49 | 119.0.6045.163 50 | 119.0.6045.134 51 | 119.0.6045.124 52 | 119.0.6045.123 53 | 119.0.6045.109 54 | 118.0.5993.96 55 | 118.0.5993.92 56 | 118.0.5993.89 57 | 118.0.5993.88 58 | 118.0.5993.80 59 | 118.0.5993.71 60 | 118.0.5993.70 61 | 118.0.5993.69 62 | 118.0.5993.65 63 | 118.0.5993.58 64 | 118.0.5993.118 65 | 118.0.5993.117 66 | 118.0.5993.111 67 | 117.0.5938.92 68 | 117.0.5938.89 69 | 117.0.5938.88 70 | 117.0.5938.82 71 | 117.0.5938.60 72 | 117.0.5938.153 73 | 117.0.5938.150 74 | 117.0.5938.149 75 | 117.0.5938.140 76 | 117.0.5938.117 77 | 117.0.5938.108 78 | 117.0.5938.104 79 | 116.0.5845.97 80 | 116.0.5845.96 81 | 116.0.5845.92 82 | 116.0.5845.90 83 | 116.0.5845.188 84 | 116.0.5845.187 85 | 116.0.5845.180 86 | 116.0.5845.179 87 | 116.0.5845.177 88 | 116.0.5845.172 89 | 116.0.5845.163 90 | 116.0.5845.146 91 | 116.0.5845.141 92 | 116.0.5845.140 93 | 116.0.5845.118 94 | 116.0.5845.114 95 | 116.0.5845.111 96 | 116.0.5845.110 97 | 116.0.5845.103 98 | 115.0.5790.84 99 | 115.0.5790.166 100 | 115.0.5790.160 101 | 115.0.5790.138 102 | 115.0.5790.136 103 | 115.0.5790.130 104 | 115.0.5790.114 105 | 115.0.5790.110 106 | 115.0.5790.102 107 | 114.0.5735.99 108 | 114.0.5735.61 109 | 114.0.5735.60 110 | 114.0.5735.58 111 | 114.0.5735.57 112 | 114.0.5735.50 113 | 114.0.5735.198 114 | 114.0.5735.196 115 | 114.0.5735.133 116 | 114.0.5735.131 117 | 114.0.5735.130 118 | 114.0.5735.124 119 | 114.0.5735.106 120 | 113.0.5672.92 121 | 113.0.5672.77 122 | 113.0.5672.76 123 | 113.0.5672.69 124 | 113.0.5672.163 125 | 113.0.5672.162 126 | 113.0.5672.134 127 | 113.0.5672.126 128 | 113.0.5672.121 129 | 113.0.5672.109 130 | 112.0.5615.70 131 | 112.0.5615.69 132 | 112.0.5615.48 133 | 112.0.5615.47 134 | 112.0.5615.46 135 | 112.0.5615.167 136 | 112.0.5615.165 137 | 112.0.5615.137 138 | 112.0.5615.136 139 | 112.0.5615.135 140 | 112.0.5615.101 141 | 112.0.5615.100 142 | 111.0.5563.72 143 | 111.0.5563.58 144 | 111.0.5563.57 145 | 111.0.5563.54 146 | 111.0.5563.147 147 | 111.0.5563.116 148 | 111.0.5563.115 149 | 111.0.5563.110 150 | 111.0.5563.101 151 | 110.0.5481.96 152 | 110.0.5481.83 153 | 110.0.5481.65 154 | 110.0.5481.64 155 | 110.0.5481.63 156 | 110.0.5481.177 157 | 110.0.5481.154 158 | 110.0.5481.153 159 | 110.0.5481.114 160 | 110.0.5481.104 161 | 110.0.5481.100 162 | 109.0.5414.94 163 | 109.0.5414.83 164 | 109.0.5414.165 165 | 109.0.5414.149 166 | 109.0.5414.141 167 | 109.0.5414.125 168 | 109.0.5414.119 169 | 109.0.5414.112 170 | 108.0.5359.98 171 | 108.0.5359.94 172 | 108.0.5359.61 173 | 108.0.5359.52 174 | 
108.0.5359.128 175 | 108.0.5359.124 176 | 108.0.5359.112 177 | 107.0.5304.91 178 | 107.0.5304.87 179 | 107.0.5304.66 180 | 107.0.5304.54 181 | 107.0.5304.141 182 | 107.0.5304.121 183 | 107.0.5304.110 184 | 107.0.5304.105 185 | 107.0.5304.101 186 | 106.0.5249.92 187 | 106.0.5249.91 188 | 106.0.5249.75 189 | 106.0.5249.70 190 | 106.0.5249.65 191 | 106.0.5249.60 192 | 106.0.5249.126 193 | 106.0.5249.119 194 | 106.0.5249.118 195 | 106.0.5249.112 196 | 106.0.5249.103 197 | 105.0.5195.98 198 | 105.0.5195.79 199 | 105.0.5195.77 200 | 105.0.5195.69 201 | 105.0.5195.68 202 | 105.0.5195.147 203 | 105.0.5195.136 204 | 105.0.5195.134 205 | 105.0.5195.129 206 | 105.0.5195.125 207 | 105.0.5195.124 208 | 105.0.5195.112 209 | 105.0.5195.102 210 | 105.0.5195.100 211 | 104.0.5112.99 212 | 104.0.5112.97 213 | 104.0.5112.88 214 | 104.0.5112.71 215 | 104.0.5112.69 216 | 104.0.5112.102 217 | 104.0.5112.101 218 | 103.0.5060.71 219 | 103.0.5060.70 220 | 103.0.5060.66 221 | 103.0.5060.64 222 | 103.0.5060.63 223 | 103.0.5060.54 224 | 103.0.5060.53 225 | 103.0.5060.134 226 | 103.0.5060.132 227 | 103.0.5060.129 228 | 103.0.5060.114 229 | 102.0.5005.99 230 | 102.0.5005.78 231 | 102.0.5005.67 232 | 102.0.5005.59 233 | 102.0.5005.115 234 | 101.0.4951.67 235 | 101.0.4951.64 236 | 101.0.4951.61 237 | 101.0.4951.54 238 | 101.0.4951.44 239 | 101.0.4951.41 240 | 100.0.4896.88 241 | 100.0.4896.85 242 | 100.0.4896.79 243 | 100.0.4896.77 244 | 100.0.4896.75 245 | 100.0.4896.58 246 | 100.0.4896.56 247 | 100.0.4896.127 248 | 99.0.4844.88 249 | 99.0.4844.84 250 | 99.0.4844.83 251 | 99.0.4844.82 252 | 99.0.4844.74 253 | 99.0.4844.73 254 | 99.0.4844.59 255 | 99.0.4844.48 256 | 99.0.4844.47 257 | 98.0.4758.97 258 | 98.0.4758.87 259 | 98.0.4758.85 260 | 98.0.4758.109 261 | 98.0.4758.102 262 | 98.0.4758.101 263 | 97.0.4692.99 264 | 97.0.4692.84 265 | 97.0.4692.72 266 | 97.0.4664.98 267 | 97.0.4664.87 268 | 97.0.4664.70 269 | 96.0.4664.94 270 | 96.0.4664.93 271 | 96.0.4664.92 272 | 96.0.4664.53 273 | 96.0.4664.45 274 | 96.0.4664.36 275 | 96.0.4664.116 276 | 96.0.4664.110 277 | 96.0.4664.104 278 | 96.0.4664.101 279 | 95.0.4638.69 280 | 95.0.4638.50 281 | 94.0.4606.85 282 | 94.0.4606.81 283 | 94.0.4606.80 284 | 94.0.4606.76 285 | 94.0.4606.71 286 | 94.0.4606.61 287 | 94.0.4606.52 288 | 94.0.4606.50 289 | 93.0.4577.82 290 | 93.0.4577.78 291 | 93.0.4577.62 292 | 93.0.4577.39 293 | 92.0.4515.90 294 | 92.0.4515.166 295 | 92.0.4515.159 296 | 92.0.4515.131 297 | 92.0.4515.115 298 | 92.0.4515.105 299 | 91.0.4472.88 300 | 91.0.4472.80 301 | 91.0.4472.77 302 | 91.0.4472.164 303 | 91.0.4472.124 304 | 91.0.4472.123 305 | 91.0.4472.120 306 | 91.0.4472.114 307 | 91.0.4472.106 308 | 91.0.4472.101 309 | 90.0.4430.93 310 | 90.0.4430.91 311 | 90.0.4430.85 312 | 90.0.4430.82 313 | 90.0.4430.78 314 | 90.0.4430.66 315 | 90.0.4430.216 316 | 90.0.4430.212 317 | 90.0.4430.210 318 | 89.0.4389.90 319 | 89.0.4389.86 320 | 89.0.4389.82 321 | 89.0.4389.72 322 | 89.0.4389.128 323 | 89.0.4389.114 324 | 89.0.4389.105 325 | 88.0.4324.93 326 | 88.0.4324.192 327 | 88.0.4324.190 328 | 88.0.4324.182 329 | 88.0.4324.181 330 | 88.0.4324.155 331 | 88.0.4324.150 332 | 88.0.4324.146 333 | 88.0.4324.141 334 | 87.0.4280.88 335 | 87.0.4280.86 336 | 87.0.4280.77 337 | 87.0.4280.67 338 | 87.0.4280.66 339 | 87.0.4280.60 340 | 87.0.4280.163 341 | 87.0.4280.141 342 | 87.0.4280.101 343 | -------------------------------------------------------------------------------- /modules/headers/versions_edge.txt: -------------------------------------------------------------------------------- 1 | 
132.0.2957.11 2 | 131.0.2903.9 3 | 131.0.2903.48 4 | 130.0.2849.5 5 | 130.0.2849.46 6 | 129.0.2792.52 7 | 129.0.2792.12 8 | 128.0.2739.5 9 | 128.0.2739.42 10 | 127.0.2651.8 11 | 127.0.2651.74 12 | 126.0.2592.56 13 | 126.0.2592.13 14 | 125.0.2535.51 15 | 125.0.2535.13 16 | 124.0.2478.51 17 | 124.0.2478.10 18 | 123.0.2420.53 19 | 123.0.2420.10 20 | 122.0.2365.8 21 | 122.0.2365.52 22 | 121.0.2277.83 23 | 121.0.2277.4 24 | 120.0.2210.7 25 | 120.0.2210.61 26 | 119.0.2151.44 27 | 119.0.2151.12 28 | 118.0.2088.46 29 | 118.0.2088.11 30 | 117.0.2045.9 31 | 117.0.2045.31 32 | 116.0.1938.54 33 | 116.0.1938.29 34 | 115.0.1901.7 35 | 115.0.1901.183 36 | 114.0.1823.37 37 | 114.0.1823.11 38 | 113.0.1774.9 39 | 113.0.1774.3 40 | 112.0.1722.34 41 | 112.0.1722.11 42 | 111.0.1661.41 43 | 111.0.1661.15 44 | 110.0.1587.41 45 | 110.0.1587.17 46 | 109.0.1518.49 47 | 109.0.1518.14 48 | 108.0.1462.42 49 | 108.0.1462.15 50 | 107.0.1418.8 51 | 107.0.1418.24 52 | 106.0.1370.34 53 | 106.0.1370.15 54 | 105.0.1343.7 55 | 105.0.1343.25 56 | 104.0.1293.47 57 | 104.0.1293.14 58 | 103.0.1264.37 59 | 103.0.1264.13 60 | 102.0.1245.7 61 | 102.0.1245.30 62 | 101.0.1210.32 63 | 101.0.1210.10 64 | 100.0.1185.29 65 | 100.0.1185.10 66 | 99.0.1150.30 67 | 99.0.1150.11 68 | 98.0.1108.43 69 | 98.0.1108.23 70 | 97.0.1072.55 71 | 97.0.1072.21 72 | 96.0.1054.8 73 | 96.0.1054.29 74 | 95.0.1020.9 75 | 95.0.1020.30 76 | 94.0.992.9 77 | 94.0.992.31 78 | 93.0.961.38 79 | 93.0.961.11 80 | 92.0.902.9 81 | 92.0.902.55 82 | 91.0.864.37 83 | 91.0.864.11 84 | 90.0.818.8 85 | 90.0.818.39 86 | 89.0.774.48 87 | 89.0.774.18 88 | 88.0.705.50 89 | 88.0.705.18 90 | -------------------------------------------------------------------------------- /modules/headers/versions_firefox.txt: -------------------------------------------------------------------------------- 1 | 132.0.2 2 | 132.0.1 3 | 132.0 4 | 131.0.3 5 | 131.0.2 6 | 131.0 7 | 130.0.1 8 | 130.0 9 | 129.0.2 10 | 129.0.1 11 | 129.0 12 | 128.4.0 13 | 128.3.1 14 | 128.3.0 15 | 128.2.0 16 | 128.1.0 17 | 128.0.3 18 | 128.0.2 19 | 128.0 20 | 127.0.2 21 | 127.0.1 22 | 127.0 23 | 126.0.1 24 | 126.0 25 | 125.0.3 26 | 125.0.2 27 | 125.0.1 28 | 124.0.2 29 | 124.0.1 30 | 124.0 31 | 123.0.1 32 | 123.0 33 | 122.0.1 34 | 122.0 35 | 121.0.1 36 | 121.0 37 | 120.0.1 38 | 120.0 39 | 119.0.1 40 | 119.0 41 | 118.0.2 42 | 118.0.1 43 | 118.0 44 | 117.0.1 45 | 117.0 46 | 116.0.3 47 | 116.0.2 48 | 116.0.1 49 | 116.0 50 | 115.9.1 51 | 115.9.0 52 | 115.8.0 53 | 115.7.0 54 | 115.6.0 55 | 115.5.0 56 | 115.4.0 57 | 115.3.1 58 | 115.3.0 59 | 115.2.1 60 | 115.2.0 61 | 115.17.0 62 | 115.16.1 63 | 115.16.0 64 | 115.15.0 65 | 115.14.0 66 | 115.13.0 67 | 115.12.0 68 | 115.11.0 69 | 115.10.0 70 | 115.1.0 71 | 115.0.3 72 | 115.0.2 73 | 115.0.1 74 | 115.0 75 | 114.0.2 76 | 114.0.1 77 | 114.0 78 | 113.0.2 79 | 113.0.1 80 | 113.0 81 | 112.0.2 82 | 112.0.1 83 | 112.0 84 | 111.0.1 85 | 111.0 86 | 110.0.1 87 | 110.0 88 | 109.0.1 89 | 109.0 90 | 108.0.2 91 | 108.0.1 92 | 108.0 93 | 107.0.1 94 | 107.0 95 | 106.0.5 96 | 106.0.4 97 | 106.0.3 98 | 106.0.2 99 | 106.0.1 100 | 106.0 101 | 105.0.3 102 | 105.0.2 103 | 105.0.1 104 | 105.0 105 | 104.0.2 106 | 104.0.1 107 | 104.0 108 | 103.0.2 109 | 103.0.1 110 | 103.0 111 | 102.9.0 112 | 102.8.0 113 | 102.7.0 114 | 102.6.0 115 | 102.5.0 116 | 102.4.0 117 | 102.3.0 118 | 102.2.0 119 | 102.15.1 120 | 102.15.0 121 | 102.14.0 122 | 102.13.0 123 | 102.12.0 124 | 102.11.0 125 | 102.10.0 126 | 102.1.0 127 | 102.0.1 128 | 102.0 129 | 101.0.1 130 | 101.0 131 | 100.0.2 132 | 100.0.1 133 | 100.0 134 | 99.0.1 135 | 
99.0 136 | 98.0.2 137 | 98.0.1 138 | 98.0 139 | 97.0.2 140 | 97.0.1 141 | 97.0 142 | 96.0.3 143 | 96.0.2 144 | 96.0.1 145 | 96.0 146 | 95.0.2 147 | 95.0.1 148 | 95.0 149 | 94.0.2 150 | 94.0.1 151 | 94.0 152 | 93.0 153 | 92.0.1 154 | 92.0 155 | 91.9.1 156 | 91.9.0 157 | 91.8.0 158 | 91.7.1 159 | 91.7.0 160 | 91.6.1 161 | 91.6.0 162 | 91.5.1 163 | 91.5.0 164 | 91.4.1 165 | 91.4.0 166 | 91.3.0 167 | 91.2.0 168 | 91.13.0 169 | 91.12.0 170 | 91.11.0 171 | 91.10.0 172 | 91.1.0 173 | 91.0.2 174 | 91.0.1 175 | 91.0 176 | 90.0.2 177 | 90.0.1 178 | 90.0 179 | 89.0.2 180 | 89.0.1 181 | 89.0 182 | 88.0.1 183 | 88.0 184 | 87.0 185 | 86.0.1 186 | 86.0 187 | 85.0.2 188 | 85.0.1 189 | 85.0 190 | 84.0.2 191 | 84.0.1 192 | 84.0 193 | 83.0 194 | 82.0.3 195 | 82.0.2 196 | 82.0.1 197 | 82.0 198 | 81.0.2 199 | 81.0.1 200 | 81.0 201 | 80.0.1 202 | 80.0 203 | 79.0 204 | 78.9.0 205 | 78.8.0 206 | 78.7.1 207 | 78.7.0 208 | 78.6.1 209 | 78.6.0 210 | 78.5.0 211 | 78.4.1 212 | 78.4.0 213 | 78.3.1 214 | 78.3.0 215 | 78.2.0 216 | 78.15.0 217 | 78.14.0 218 | 78.13.0 219 | 78.12.0 220 | 78.11.0 221 | 78.10.1 222 | 78.10.0 223 | 78.1.0 224 | 78.0.2 225 | 78.0.1 226 | 78.0 227 | 77.0.1 228 | 77.0 229 | 76.0.1 230 | 76.0 231 | 75.0 232 | 74.0.1 233 | 74.0 234 | 73.0.1 235 | 73.0 236 | 72.0.2 237 | 72.0.1 238 | 72.0 239 | 71.0 240 | 70.0.1 241 | 70.0 242 | 69.0.3 243 | 69.0.2 244 | 69.0.1 245 | 69.0 246 | 68.9.0 247 | 68.8.0 248 | 68.7.0 249 | 68.6.1 250 | 68.6.0 251 | 68.5.0 252 | 68.4.2 253 | 68.4.1 254 | 68.4.0 255 | 68.3.0 256 | 68.2.0 257 | 68.12.0 258 | 68.11.0 259 | 68.10.0 260 | 68.1.0 261 | 68.0.2 262 | 68.0.1 263 | 68.0 264 | 67.0.4 265 | 67.0.3 266 | 67.0.2 267 | 67.0.1 268 | 67.0 269 | 66.0.5 270 | 66.0.4 271 | 66.0.3 272 | 66.0.2 273 | 66.0.1 274 | 66.0 275 | 65.0.2 276 | 65.0.1 277 | 65.0 278 | 64.0.2 279 | 64.0 280 | 63.0.3 281 | 63.0.1 282 | 63.0 283 | 62.0.3 284 | 62.0.2 285 | 62.0 286 | 61.0.2 287 | 61.0.1 288 | 61.0 289 | 60.9.0 290 | 60.8.0 291 | 60.7.2 292 | 60.7.1 293 | 60.7.0 294 | 60.6.3 295 | 60.6.2 296 | 60.6.1 297 | 60.6.0 298 | 60.5.2 299 | 60.5.1 300 | 60.5.0 301 | 60.4.0 302 | 60.3.0 303 | 60.2.2 304 | 60.2.1 305 | 60.2.0 306 | 60.1.0 307 | 60.0.2 308 | 60.0.1 309 | 60.0 310 | -------------------------------------------------------------------------------- /modules/headers/versions_linux.txt: -------------------------------------------------------------------------------- 1 | X11; Linux x86_64 2 | X11; Ubuntu; Linux x86_64 3 | -------------------------------------------------------------------------------- /modules/headers/versions_macos.txt: -------------------------------------------------------------------------------- 1 | Macintosh; Intel Mac OS X 11_2_1 2 | Macintosh; Intel Mac OS X 11_2_2 3 | Macintosh; Intel Mac OS X 11_2_3 4 | Macintosh; Intel Mac OS X 11_3_1 5 | Macintosh; Intel Mac OS X 11_3_5 6 | Macintosh; Intel Mac OS X 11_4_3 7 | Macintosh; Intel Mac OS X 11_5_1 8 | Macintosh; Intel Mac OS X 11_5_2 9 | Macintosh; Intel Mac OS X 11_6_1 10 | Macintosh; Intel Mac OS X 11_6_2 11 | Macintosh; Intel Mac OS X 11_6_3 12 | Macintosh; Intel Mac OS X 11_6_4 13 | Macintosh; Intel Mac OS X 11_6_5 14 | Macintosh; Intel Mac OS X 11_6_6 15 | Macintosh; Intel Mac OS X 11_6_7 16 | Macintosh; Intel Mac OS X 11_6_8 17 | Macintosh; Intel Mac OS X 11_7 18 | Macintosh; Intel Mac OS X 11_7_1 19 | Macintosh; Intel Mac OS X 11_7_10 20 | Macintosh; Intel Mac OS X 11_7_2 21 | Macintosh; Intel Mac OS X 11_7_3 22 | Macintosh; Intel Mac OS X 11_7_4 23 | Macintosh; Intel Mac OS X 11_7_5 24 | Macintosh; Intel Mac 
OS X 11_7_6 25 | Macintosh; Intel Mac OS X 11_7_7 26 | Macintosh; Intel Mac OS X 11_7_8 27 | Macintosh; Intel Mac OS X 11_7_9 28 | Macintosh; Intel Mac OS X 12_0_1 29 | Macintosh; Intel Mac OS X 12_1_2 30 | Macintosh; Intel Mac OS X 12_2_1 31 | Macintosh; Intel Mac OS X 12_3_1 32 | Macintosh; Intel Mac OS X 12_3_4 33 | Macintosh; Intel Mac OS X 12_4_2 34 | Macintosh; Intel Mac OS X 12_5_1 35 | Macintosh; Intel Mac OS X 12_5_3 36 | Macintosh; Intel Mac OS X 12_6_1 37 | Macintosh; Intel Mac OS X 12_6_2 38 | Macintosh; Intel Mac OS X 12_6_3 39 | Macintosh; Intel Mac OS X 12_6_4 40 | Macintosh; Intel Mac OS X 12_6_5 41 | Macintosh; Intel Mac OS X 12_6_6 42 | Macintosh; Intel Mac OS X 12_6_7 43 | Macintosh; Intel Mac OS X 12_6_8 44 | Macintosh; Intel Mac OS X 12_6_9 45 | Macintosh; Intel Mac OS X 12_7 46 | Macintosh; Intel Mac OS X 12_7_1 47 | Macintosh; Intel Mac OS X 12_7_2 48 | Macintosh; Intel Mac OS X 12_7_3 49 | Macintosh; Intel Mac OS X 12_7_4 50 | Macintosh; Intel Mac OS X 12_7_5 51 | Macintosh; Intel Mac OS X 12_7_6 52 | Macintosh; Intel Mac OS X 13_0_1 53 | Macintosh; Intel Mac OS X 13_0_5 54 | Macintosh; Intel Mac OS X 13_1_1 55 | Macintosh; Intel Mac OS X 13_2_1 56 | Macintosh; Intel Mac OS X 13_3_1 57 | Macintosh; Intel Mac OS X 13_3_3 58 | Macintosh; Intel Mac OS X 13_4_1 59 | Macintosh; Intel Mac OS X 13_5_1 60 | Macintosh; Intel Mac OS X 13_5_2 61 | Macintosh; Intel Mac OS X 13_6 62 | Macintosh; Intel Mac OS X 13_6_1 63 | Macintosh; Intel Mac OS X 13_6_2 64 | Macintosh; Intel Mac OS X 13_6_3 65 | Macintosh; Intel Mac OS X 13_6_4 66 | Macintosh; Intel Mac OS X 13_6_5 67 | Macintosh; Intel Mac OS X 13_6_6 68 | Macintosh; Intel Mac OS X 13_6_7 69 | Macintosh; Intel Mac OS X 13_6_8 70 | Macintosh; Intel Mac OS X 13_6_9 71 | Macintosh; Intel Mac OS X 13_7 72 | Macintosh; Intel Mac OS X 13_7_1 73 | Macintosh; Intel Mac OS X 14_0_6 74 | Macintosh; Intel Mac OS X 14_1_1 75 | Macintosh; Intel Mac OS X 14_1_2 76 | Macintosh; Intel Mac OS X 14_2_1 77 | Macintosh; Intel Mac OS X 14_2_2 78 | Macintosh; Intel Mac OS X 14_3_1 79 | Macintosh; Intel Mac OS X 14_4_1 80 | Macintosh; Intel Mac OS X 14_4_3 81 | Macintosh; Intel Mac OS X 14_5_2 82 | Macintosh; Intel Mac OS X 14_6_1 83 | Macintosh; Intel Mac OS X 14_6_2 84 | Macintosh; Intel Mac OS X 14_7 85 | Macintosh; Intel Mac OS X 14_7_1 86 | Macintosh; Intel Mac OS X 15_0 87 | Macintosh; Intel Mac OS X 15_0_1 88 | -------------------------------------------------------------------------------- /modules/headers/versions_opera.txt: -------------------------------------------------------------------------------- 1 | 110.0.5130.82 2 | 110.0.5130.8 3 | 110.0.5130.66 4 | 110.0.5130.49 5 | 110.0.5130.4 6 | 110.0.5130.39 7 | 110.0.5130.35 8 | 110.0.5130.23 9 | 110.0.5130.17 10 | 110.0.5130.13 11 | 110.0.5117.0 12 | 110.0.5111.0 13 | 110.0.5104.0 14 | 109.0.5097.80 15 | 109.0.5097.59 16 | 109.0.5097.5 17 | 109.0.5097.45 18 | 109.0.5097.38 19 | 109.0.5097.33 20 | 109.0.5097.24 21 | 109.0.5097.0 22 | 109.0.5089.0 23 | 109.0.5083.0 24 | 109.0.5076.0 25 | 109.0.5069.0 26 | 108.0.5067.40 27 | 108.0.5067.24 28 | 108.0.5067.20 29 | 108.0.5067.14 30 | 108.0.5067.10 31 | 108.0.5063.0 32 | 108.0.5054.0 33 | 108.0.5047.0 34 | 107.0.5045.8 35 | 107.0.5045.4 36 | 107.0.5045.36 37 | 107.0.5045.21 38 | 107.0.5045.15 39 | 107.0.5045.11 40 | 107.0.5041.0 41 | 107.0.5035.0 42 | 107.0.5019.0 43 | 107.0.5012.0 44 | 107.0.5004.0 45 | 106.0.4971.0 46 | 105.0.4970.6 47 | 105.0.4970.48 48 | 105.0.4970.34 49 | 105.0.4970.29 50 | 105.0.4970.21 51 | 105.0.4970.16 52 | 
105.0.4970.13 53 | 105.0.4970.10 54 | 105.0.4963.0 55 | 105.0.4957.0 56 | 105.0.4950.0 57 | 104.0.4944.54 58 | 104.0.4944.36 59 | 104.0.4944.33 60 | 104.0.4944.3 61 | 104.0.4944.28 62 | 104.0.4944.23 63 | 104.0.4944.18 64 | 104.0.4944.10 65 | 104.0.4941.0 66 | 104.0.4934.0 67 | 103.0.4928.34 68 | 103.0.4928.3 69 | 103.0.4928.26 70 | 103.0.4928.16 71 | 103.0.4928.0 72 | 103.0.4920.0 73 | 103.0.4906.0 74 | 103.0.4899.0 75 | 103.0.4892.0 76 | 103.0.4885.0 77 | 102.0.4880.78 78 | 102.0.4880.70 79 | 102.0.4880.6 80 | 102.0.4880.56 81 | 102.0.4880.51 82 | 102.0.4880.46 83 | 102.0.4880.40 84 | 102.0.4880.38 85 | 102.0.4880.33 86 | 102.0.4880.28 87 | 102.0.4880.16 88 | 102.0.4880.10 89 | 102.0.4879.0 90 | 102.0.4871.0 91 | 102.0.4864.0 92 | 102.0.4857.0 93 | 102.0.4850.0 94 | 101.0.4843.58 95 | 101.0.4843.5 96 | 101.0.4843.43 97 | 101.0.4843.33 98 | 101.0.4843.25 99 | 101.0.4843.19 – 100 | 101.0.4843.13 101 | 101.0.4843.10 102 | 101.0.4843.0 103 | 101.0.4836.0 104 | 101.0.4829.0 105 | 101.0.4822.0 106 | 100.0.4815.76 107 | 100.0.4815.54 108 | 100.0.4815.47 109 | 100.0.4815.30 110 | 100.0.4815.2 111 | 100.0.4815.13 112 | 100.0.4815.0 113 | 100.0.4809.0 114 | 100.0.4801.0 115 | 100.0.4796.0 116 | 100.0.4790.0 117 | 99.0.4788.9 118 | 99.0.4788.88 119 | 99.0.4788.77 120 | 99.0.4788.65 121 | 99.0.4788.5 122 | 99.0.4788.47 123 | 99.0.4788.31 124 | 99.0.4788.13 125 | 99.0.4780.0 126 | 99.0.4765.0 127 | 98.0.4759.6 128 | 98.0.4759.39 129 | 98.0.4759.3 130 | 98.0.4759.21 131 | 98.0.4759.15 132 | 98.0.4759.1 133 | 98.0.4756.0 134 | 98.0.4746.0 135 | 98.0.4739.0 136 | 98.0.4732.0 137 | 98.0.4725.0 138 | 97.0.4719.83 139 | 97.0.4719.63 140 | 97.0.4719.43 141 | 97.0.4719.4 142 | 97.0.4719.28 143 | 97.0.4719.26 144 | 97.0.4719.17 145 | 97.0.4719.11 146 | 97.0.4718.0 147 | 97.0.4711.0 148 | 97.0.4704.0 149 | 97.0.4697.0 150 | 96.0.4693.80 151 | 96.0.4693.50 152 | 96.0.4693.31 153 | 96.0.4693.20 154 | 96.0.4693.16 155 | 96.0.4693.12 156 | 96.0.4691.0 157 | 96.0.4674.0 158 | 96.0.4660.0 159 | 96.0.4653.0 160 | 96.0.4640.0 161 | 95.0.4635.84 162 | 95.0.4635.46 163 | 95.0.4635.37 164 | 95.0.4635.28 165 | 95.0.4635.25 166 | 95.0.4635.20 167 | 95.0.4635.15 168 | 95.0.4635.12 169 | 95.0.4635.10 170 | 95.0.4632.0 171 | 95.0.4625.0 172 | 95.0.4618.0 173 | 95.0.4612.0 174 | 94.0.4606.8 175 | 94.0.4606.76 176 | 94.0.4606.65 177 | 94.0.4606.54 178 | 94.0.4606.38 179 | 94.0.4606.26 180 | 94.0.4606.19 181 | 94.0.4606.14 182 | 94.0.4604.0 183 | 94.0.4597.0 184 | 93.0.4585.70 185 | 93.0.4585.7 186 | 93.0.4585.64 187 | 93.0.4585.39 188 | 93.0.4585.37 189 | 93.0.4585.3 190 | 93.0.4585.21 191 | 93.0.4585.11 192 | 93.0.4582.0 193 | 93.0.4575.0 194 | 93.0.4569.0 195 | 92.0.4561.8 196 | 92.0.4561.43 197 | 92.0.4561.33 198 | 92.0.4561.30 199 | 92.0.4561.21 200 | 92.0.4561.11 201 | 92.0.4561.0 202 | 92.0.4555.0 203 | 92.0.4547.0 204 | 92.0.4540.0 205 | 92.0.4526.0 206 | 92.0.4519.0 207 | 91.0.4516.9 208 | 91.0.4516.77 209 | 91.0.4516.65 210 | 91.0.4516.6 211 | 91.0.4516.3 212 | 91.0.4516.20 213 | 91.0.4516.16 214 | 91.0.4514.0 215 | 91.0.4505.0 216 | 91.0.4498.0 217 | 91.0.4491.0 218 | 91.0.4484.0 219 | 90.0.4480.84 220 | 90.0.4480.80 221 | 90.0.4480.54 222 | 90.0.4480.48 223 | 90.0.4480.41 224 | 90.0.4480.37 225 | 90.0.4480.30 226 | 90.0.4480.25 227 | 90.0.4480.107 228 | 90.0.4477.0 229 | 90.0.4470.0 230 | 90.0.4463.0 231 | 90.0.4457.0 232 | 90.0.4450.0 233 | 89.0.4447.83 234 | 89.0.4447.71 235 | 89.0.4447.51 236 | 89.0.4447.48 237 | 89.0.4447.38 238 | 89.0.4447.37 239 | 89.0.4447.33 240 | 89.0.4447.31 241 | 89.0.4447.20 242 | 
89.0.4447.12 243 | 89.0.4447.101 244 | 89.0.4443.0 245 | 89.0.4436.0 246 | 89.0.4428.0 247 | 89.0.4422.0 248 | 89.0.4415.0 249 | 88.0.4412.74 250 | 88.0.4412.53 251 | 88.0.4412.40 252 | 88.0.4412.27 253 | 88.0.4412.20 254 | 88.0.4412.18 255 | 88.0.4412.13 256 | 88.0.4401.0 257 | 88.0.4395.0 258 | 87.0.4390.8 259 | 87.0.4390.45 260 | 87.0.4390.36 261 | 87.0.4390.25 262 | 87.0.4390.21 263 | 87.0.4390.17 264 | 87.0.4388.0 265 | 87.0.4382.0 266 | 87.0.4374.0 267 | 87.0.4366.0 268 | 86.0.4363.9 269 | 86.0.4363.59 270 | 86.0.4363.50 271 | 86.0.4363.32 272 | 86.0.4363.22 273 | 86.0.4363.17 274 | 86.0.4363.15 275 | 86.0.4363.12 276 | 86.0.4359.0 277 | 86.0.4351.0 278 | 86.0.4344.0 279 | 85.0.4341.75 280 | 85.0.4341.60 281 | 85.0.4341.6 282 | 85.0.4341.47 283 | 85.0.4341.28 284 | 85.0.4341.18 285 | 85.0.4341.13 286 | 85.0.4341.10 287 | 85.0.4338.0 288 | 85.0.4331.0 289 | 85.0.4323.0 290 | 84.0.4316.9 291 | 84.0.4316.42 292 | 84.0.4316.31 293 | 84.0.4316.21 294 | 84.0.4316.14 295 | 84.0.4316.0 296 | 84.0.4309.0 297 | 84.0.4302.0 298 | 84.0.4295.0 299 | 84.0.4284.0 300 | 84.0.4274.0 301 | 84.0.4267.0 302 | 84.0.4260.0 303 | 83.0.4254.9 304 | 83.0.4254.62 305 | 83.0.4254.54 306 | 83.0.4254.5 307 | 83.0.4254.27 308 | 83.0.4254.19 309 | 83.0.4254.16 310 | 83.0.4254.14 311 | 83.0.4253.0 312 | 83.0.4246.0 313 | 83.0.4239.0 314 | 83.0.4232.0 315 | 82.0.4227.7 316 | 82.0.4227.58 317 | 82.0.4227.43 318 | 82.0.4227.4 319 | 82.0.4227.33 320 | 82.0.4227.23 321 | 82.0.4227.13 322 | 82.0.4226.0 323 | 82.0.4218.0 324 | 82.0.4210.0 325 | 82.0.4203.0 326 | 81.0.4196.60 327 | 81.0.4196.54 328 | 81.0.4196.37 329 | 81.0.4196.31 330 | 81.0.4196.27 331 | 81.0.4196.14 332 | 81.0.4196.11 333 | 81.0.4196.0 334 | 81.0.4189.0 335 | 81.0.4183.0 336 | 81.0.4175.0 337 | 80.0.4170.72 338 | 80.0.4170.7 339 | 80.0.4170.63 340 | 80.0.4170.40 341 | 80.0.4170.4 342 | 80.0.4170.16 343 | 80.0.4170.11 344 | 80.0.4170.0 345 | 80.0.4162.0 346 | 80.0.4157.0 347 | 80.0.4150.0 348 | 79.0.4143.72 349 | 79.0.4143.50 350 | 79.0.4143.3 351 | 79.0.4143.22 352 | 79.0.4143.15 353 | 79.0.4142.0 354 | 79.0.4135.0 355 | 79.0.4128.0 356 | 79.0.4114.0 357 | 79.0.4105.0 358 | 79.0.4100.0 359 | 78.0.4093.79 360 | 78.0.4093.68 361 | 78.0.4093.46 362 | 78.0.4093.34 363 | 78.0.4093.184 364 | 78.0.4093.147 365 | 78.0.4093.112 366 | 78.0.4093.103 367 | 78.0.4093.0 368 | 78.0.4086.0 369 | 78.0.4079.0 370 | 78.0.4072.0 371 | 78.0.4066.0 372 | 78.0.4058.0 373 | 77.0.4054.91 374 | 77.0.4054.90 375 | 77.0.4054.80 376 | 77.0.4054.64 377 | 77.0.4054.38 378 | 77.0.4054.277 – 379 | 77.0.4054.254 – 380 | 77.0.4054.203 – 381 | 77.0.4054.19 382 | 77.0.4054.172 – 383 | 77.0.4054.146 – 384 | 77.0.4054.14 385 | 77.0.4051.0 386 | 77.0.4046.0 387 | 77.0.4039.0 388 | 77.0.4032.0 389 | 77.0.4028.0 390 | 77.0.4023.0 391 | 76.0.4017.94 392 | 76.0.4017.88 393 | 76.0.4017.59 394 | 76.0.4017.5 395 | 76.0.4017.40 396 | 76.0.4017.177 397 | 76.0.4017.154 398 | 76.0.4017.137 399 | 76.0.4017.123 400 | 76.0.4017.107 401 | 76.0.4009.0 402 | 76.0.3995.0 403 | 76.0.3989.0 404 | 76.0.3981.0 405 | 76.0.3974.0 406 | 75.0.3969.93 407 | 75.0.3969.60 408 | 75.0.3969.50 409 | 75.0.3969.35 410 | 75.0.3969.218 411 | 75.0.3969.171 412 | 75.0.3969.141 413 | 75.0.3969.14 414 | 75.0.3967.0 415 | 75.0.3960.0 416 | 75.0.3953.0 417 | 75.0.3946.0 418 | 75.0.3939.0 419 | 75.0.3932.0 420 | 75.0.3925.0 421 | 74.0.3911.75 422 | 74.0.3911.63 423 | 74.0.3911.42 424 | 74.0.3911.232 425 | 74.0.3911.22 426 | 74.0.3911.218 427 | 74.0.3911.203 428 | 74.0.3911.160 429 | 74.0.3911.107 430 | 74.0.3904.0 431 | 74.0.3897.0 
432 | 74.0.3890.0 433 | 74.0.3883.0 434 | 74.0.3876.0 435 | 74.0.3870.0 436 | 74.0.3862.0 437 | 73.0.3856.344 438 | 73.0.3856.329 439 | 73.0.3856.31 440 | 73.0.3856.284 441 | 73.0.3856.257 442 | 73.0.3856.235 443 | 73.0.3856.208 444 | 73.0.3856.184 445 | 73.0.3856.156 446 | 73.0.3856.0 447 | 73.0.3847.0 448 | 73.0.3841.0 449 | 73.0.3834.0 450 | 73.0.3827.0 451 | 73.0.3820.0 452 | 72.0.3815.86 453 | 72.0.3815.49 454 | 72.0.3815.400 455 | 72.0.3815.378 456 | 72.0.3815.320 457 | 72.0.3815.200 458 | 72.0.3815.186 459 | 72.0.3815.148 460 | 72.0.3815.133 461 | 72.0.3814.0 462 | 72.0.3807.0 463 | 72.0.3798.0 464 | 72.0.3791.0 465 | 72.0.3784.0 466 | 72.0.3779.0 467 | 71.0.3770.97 468 | 71.0.3770.81 469 | 71.0.3770.50 470 | 71.0.3770.271 471 | 71.0.3770.228 472 | 71.0.3770.198 473 | 71.0.3770.148 474 | 71.0.3770.126 475 | 71.0.3770.0 476 | 71.0.3763.0 477 | 71.0.3756.0 478 | 71.0.3749.0 479 | 71.0.3742.0 480 | 71.0.3735.0 – 481 | 70.0.3728.95 482 | 70.0.3728.8 483 | 70.0.3728.71 484 | 70.0.3728.59 485 | 70.0.3728.46 486 | 70.0.3728.21 487 | 70.0.3728.189 488 | 70.0.3728.144 489 | 70.0.3728.133 490 | 70.0.3728.119 491 | 70.0.3728.106 492 | 70.0.3728.0 493 | 70.0.3721.0 494 | 70.0.3714.0 495 | 70.0.3707.0 496 | 70.0.3701.0 497 | 70.0.3693.0 498 | 69.0.3686.95 499 | 69.0.3686.77 500 | 69.0.3686.7 501 | 69.0.3686.57 502 | 69.0.3686.49 503 | 69.0.3686.36 504 | 69.0.3686.30 505 | 69.0.3686.21 506 | 69.0.3686.2 507 | 69.0.3686.12 508 | 69.0.3686.0 509 | 69.0.3679.0 510 | 69.0.3673.0 511 | 69.0.3665.0 512 | 69.0.3660.0 513 | 69.0.3653.0 514 | 69.0.3651.0 515 | 69.0.3645.0 516 | 69.0.3638.0 517 | 69.0.3630.0 518 | 69.0.3623.0 519 | 68.0.3618.91 520 | 68.0.3618.63 521 | 68.0.3618.56 522 | 68.0.3618.5 523 | 68.0.3618.45/68.0.3618.46 524 | 68.0.3618.41 525 | 68.0.3618.36 526 | 68.0.3618.31 527 | 68.0.3618.3 528 | 68.0.3618.24 529 | 68.0.3618.18 530 | 68.0.3618.173 531 | 68.0.3618.165 532 | 68.0.3618.104 533 | 68.0.3616.0 534 | 68.0.3609.0 535 | 68.0.3602.0 536 | 68.0.3590.0 537 | 68.0.3581.0 538 | 67.0.3575.97 539 | 67.0.3575.8 540 | 67.0.3575.79 541 | 67.0.3575.53 542 | 67.0.3575.31 543 | 67.0.3575.28 544 | 67.0.3575.23 545 | 67.0.3575.2 546 | 67.0.3575.137 547 | 67.0.3575.13 548 | 67.0.3575.115 549 | 67.0.3574.0 550 | 67.0.3564.0 551 | 67.0.3554.0 552 | 67.0.3541.0 553 | 67.0.3536.0 554 | 67.0.3523.0 555 | 66.0.3515.72 556 | 66.0.3515.7 557 | 66.0.3515.44 558 | 66.0.3515.36 559 | 66.0.3515.3 560 | 66.0.3515.27 561 | 66.0.3515.21 562 | 66.0.3515.2 563 | 66.0.3515.14 564 | 66.0.3515.103 565 | 66.0.3511.0 566 | 66.0.3508.0 567 | 66.0.3502.0 568 | 66.0.3494.0 569 | 66.0.3487.0 570 | 66.0.3480.0 571 | 66.0.3475.0 572 | 66.0.3472.0 573 | 65.0.3467.78 574 | 65.0.3467.72 575 | 65.0.3467.7 576 | 65.0.3467.69 577 | 65.0.3467.62 578 | 65.0.3467.48 579 | 65.0.3467.38 580 | 65.0.3467.32 581 | 65.0.3467.24 582 | 65.0.3467.16 583 | 65.0.3466.0 584 | 65.0.3459.0 585 | 65.0.3454.0 586 | 65.0.3450.0 587 | 65.0.3445.0 588 | 65.0.3437.0 589 | 65.0.3430.0 590 | 65.0.3425.0 591 | 64.0.3417.92 592 | 64.0.3417.83 593 | 64.0.3417.8 594 | 64.0.3417.73 595 | 64.0.3417.61 596 | 64.0.3417.54 597 | 64.0.3417.47 598 | 64.0.3417.41 599 | 64.0.3417.32 600 | 64.0.3417.19 601 | 64.0.3417.119 602 | 64.0.3417.11 603 | 64.0.3416.0 604 | 64.0.3409.0 605 | 64.0.3407.0 606 | 64.0.3401.0 607 | 64.0.3396.0 608 | 64.0.3394.0 609 | 64.0.3388.0 610 | 64.0.3380.0 611 | 64.0.3372.0 612 | 63.0.3368.94 613 | 63.0.3368.88 614 | 63.0.3368.8 615 | 63.0.3368.66 616 | 63.0.3368.53 617 | 63.0.3368.51 618 | 63.0.3368.43 619 | 63.0.3368.35 620 | 63.0.3368.33 621 | 
63.0.3368.29 622 | 63.0.3368.22 623 | 63.0.3368.17 624 | 63.0.3368.14 625 | 63.0.3367.0 626 | 63.0.3359.0 627 | 63.0.3353.0 628 | 63.0.3349.0 629 | 63.0.3347.0 630 | 62.0.3331.99 631 | 62.0.3331.8 632 | 62.0.3331.66 633 | 62.0.3331.55 634 | 62.0.3331.5 635 | 62.0.3331.43 636 | 62.0.3331.2 637 | 62.0.3331.18 638 | 62.0.3331.14 639 | 62.0.3331.119 640 | 62.0.3331.116 641 | 62.0.3331.10 642 | 62.0.3323.0 643 | 62.0.3319.0 644 | 61.0.3298.6 645 | 61.0.3298.3 646 | 61.0.3296.0 647 | 61.0.3290.0 648 | 61.0.3282.0 649 | 61.0.3275.0 650 | 61.0.3271.0 651 | 61.0.3268.0 652 | 60.0.3255.95 653 | 60.0.3255.84 654 | 60.0.3255.83 655 | 60.0.3255.8 656 | 60.0.3255.79 657 | 60.0.3255.70 658 | 60.0.3255.60 659 | 60.0.3255.59 660 | 60.0.3255.57 661 | 60.0.3255.56 662 | 60.0.3255.4 663 | 60.0.3255.37 664 | 60.0.3255.27 665 | 60.0.3255.20 666 | 60.0.3255.170 667 | 60.0.3255.151 668 | 60.0.3255.15 669 | 60.0.3255.116 670 | 60.0.3255.109 671 | 60.0.3255.103 672 | 60.0.3254.0 673 | 60.0.3248.0 674 | 60.0.3242.0 675 | 60.0.3236.0 676 | -------------------------------------------------------------------------------- /modules/headers/versions_windows.txt: -------------------------------------------------------------------------------- 1 | Windows NT 10.0; Win64; x64 2 | Windows NT 10.0; WOW64 3 | -------------------------------------------------------------------------------- /modules/hook/hook.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package hook 6 | 7 | import ( 8 | "net/http" 9 | 10 | "github.com/philippta/flyscrape" 11 | ) 12 | 13 | type Module struct { 14 | AdaptTransportFn func(http.RoundTripper) http.RoundTripper 15 | ValidateRequestFn func(*flyscrape.Request) bool 16 | BuildRequestFn func(*flyscrape.Request) 17 | ReceiveResponseFn func(*flyscrape.Response) 18 | ProvisionFn func(flyscrape.Context) 19 | FinalizeFn func() 20 | } 21 | 22 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 23 | return flyscrape.ModuleInfo{ 24 | ID: "hook", 25 | New: func() flyscrape.Module { return new(Module) }, 26 | } 27 | } 28 | 29 | func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 30 | if m.AdaptTransportFn == nil { 31 | return t 32 | } 33 | return m.AdaptTransportFn(t) 34 | } 35 | 36 | func (m Module) ValidateRequest(r *flyscrape.Request) bool { 37 | if m.ValidateRequestFn == nil { 38 | return true 39 | } 40 | return m.ValidateRequestFn(r) 41 | } 42 | 43 | func (m Module) BuildRequest(r *flyscrape.Request) { 44 | if m.BuildRequestFn == nil { 45 | return 46 | } 47 | m.BuildRequestFn(r) 48 | } 49 | 50 | func (m Module) ReceiveResponse(r *flyscrape.Response) { 51 | if m.ReceiveResponseFn == nil { 52 | return 53 | } 54 | m.ReceiveResponseFn(r) 55 | } 56 | 57 | func (m Module) Provision(ctx flyscrape.Context) { 58 | if m.ProvisionFn == nil { 59 | return 60 | } 61 | m.ProvisionFn(ctx) 62 | } 63 | 64 | func (m Module) Finalize() { 65 | if m.FinalizeFn == nil { 66 | return 67 | } 68 | m.FinalizeFn() 69 | } 70 | 71 | var ( 72 | _ flyscrape.TransportAdapter = Module{} 73 | _ flyscrape.RequestValidator = Module{} 74 | _ flyscrape.RequestBuilder = Module{} 75 | _ flyscrape.ResponseReceiver = Module{} 76 | _ flyscrape.Provisioner = Module{} 77 | _ flyscrape.Finalizer = Module{} 78 | ) 79 | 
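Note (not a repository file): hook.Module above implements every flyscrape extension point through optional function fields, which is how the test files later in this snapshot (for example modules/starturl/starturl_test.go and modules/ratelimit/ratelimit_test.go) stub the HTTP transport and observe responses. The following is a minimal usage sketch assembled from those test patterns; the URL and the printed output are made up for illustration.

package main

import (
	"fmt"
	"net/http"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/hook"
	"github.com/philippta/flyscrape/modules/starturl"
)

func main() {
	scraper := flyscrape.NewScraper()
	scraper.Modules = []flyscrape.Module{
		// Seed the queue with a single start URL.
		&starturl.Module{URL: "http://www.example.com/"},
		// Stub the transport and observe each response as it arrives.
		hook.Module{
			AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
				return flyscrape.MockTransport(http.StatusOK, "<h1>hello</h1>")
			},
			ReceiveResponseFn: func(r *flyscrape.Response) {
				fmt.Println(r.Request.URL, r.StatusCode)
			},
		},
	}
	scraper.Run()
}

Because every field of hook.Module is optional, a test only fills in the callbacks it cares about; unset hooks fall back to the no-op behaviour defined above.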
-------------------------------------------------------------------------------- /modules/output/json/json.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package json 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "fmt" 11 | "io" 12 | "log" 13 | "os" 14 | "sync" 15 | "time" 16 | 17 | "github.com/philippta/flyscrape" 18 | ) 19 | 20 | func init() { 21 | flyscrape.RegisterModule(Module{}) 22 | } 23 | 24 | type Module struct { 25 | Output struct { 26 | Format string `json:"format"` 27 | File string `json:"file"` 28 | } `json:"output"` 29 | 30 | once bool 31 | w io.WriteCloser 32 | mu *sync.Mutex 33 | } 34 | 35 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 36 | return flyscrape.ModuleInfo{ 37 | ID: "output.json", 38 | New: func() flyscrape.Module { return new(Module) }, 39 | } 40 | } 41 | 42 | func (m *Module) Provision(ctx flyscrape.Context) { 43 | if m.disabled() { 44 | return 45 | } 46 | 47 | m.mu = &sync.Mutex{} 48 | 49 | if m.Output.File == "" { 50 | m.w = nopCloser{os.Stdout} 51 | return 52 | } 53 | 54 | f, err := os.Create(m.Output.File) 55 | if err != nil { 56 | log.Printf("failed to create file %q: %v", m.Output.File, err) 57 | os.Exit(1) 58 | } 59 | m.w = f 60 | } 61 | 62 | func (m *Module) ReceiveResponse(resp *flyscrape.Response) { 63 | if m.disabled() { 64 | return 65 | } 66 | 67 | if resp.Error == nil && resp.Data == nil { 68 | return 69 | } 70 | 71 | o := output{ 72 | URL: resp.Request.URL, 73 | Data: resp.Data, 74 | Timestamp: time.Now(), 75 | } 76 | if resp.Error != nil { 77 | o.Error = resp.Error.Error() 78 | } 79 | 80 | m.mu.Lock() 81 | defer m.mu.Unlock() 82 | 83 | if !m.once { 84 | fmt.Fprintln(m.w, "[") 85 | m.once = true 86 | } else { 87 | fmt.Fprintln(m.w, ",") 88 | } 89 | 90 | var buf bytes.Buffer 91 | enc := json.NewEncoder(&buf) 92 | enc.SetEscapeHTML(false) 93 | enc.SetIndent(" ", " ") 94 | enc.Encode(o) 95 | 96 | fmt.Fprint(m.w, " ") 97 | fmt.Fprint(m.w, buf.String()[:buf.Len()-1]) 98 | } 99 | 100 | func (m *Module) Finalize() { 101 | if m.disabled() { 102 | return 103 | } 104 | if m.once { 105 | fmt.Fprintln(m.w, "\n]") 106 | } 107 | m.w.Close() 108 | } 109 | 110 | func (m *Module) disabled() bool { 111 | return m.Output.Format != "json" && m.Output.Format != "" 112 | } 113 | 114 | type output struct { 115 | URL string `json:"url,omitempty"` 116 | Data any `json:"data,omitempty"` 117 | Error string `json:"error,omitempty"` 118 | Timestamp time.Time `json:"timestamp,omitempty"` 119 | } 120 | 121 | type nopCloser struct { 122 | io.Writer 123 | } 124 | 125 | func (c nopCloser) Write(p []byte) (n int, err error) { 126 | return c.Writer.Write(p) 127 | } 128 | 129 | func (c nopCloser) Close() error { 130 | return nil 131 | } 132 | 133 | var ( 134 | _ flyscrape.Provisioner = (*Module)(nil) 135 | _ flyscrape.ResponseReceiver = (*Module)(nil) 136 | _ flyscrape.Finalizer = (*Module)(nil) 137 | ) 138 | -------------------------------------------------------------------------------- /modules/output/ndjson/ndjson.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package ndjson 6 | 7 | import ( 8 | "encoding/json" 9 | "io" 10 | "log" 11 | "os" 12 | "sync" 13 | "time" 14 | 15 | "github.com/philippta/flyscrape" 16 | ) 17 | 18 | func init() { 19 | flyscrape.RegisterModule(Module{}) 20 | } 21 | 22 | type Module struct { 23 | Output struct { 24 | Format string `json:"format"` 25 | File string `json:"file"` 26 | } `json:"output"` 27 | 28 | w io.WriteCloser 29 | mu *sync.Mutex 30 | } 31 | 32 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 33 | return flyscrape.ModuleInfo{ 34 | ID: "output.ndjson", 35 | New: func() flyscrape.Module { return new(Module) }, 36 | } 37 | } 38 | 39 | func (m *Module) Provision(ctx flyscrape.Context) { 40 | if m.disabled() { 41 | return 42 | } 43 | 44 | m.mu = &sync.Mutex{} 45 | 46 | if m.Output.File == "" { 47 | m.w = nopCloser{os.Stdout} 48 | return 49 | } 50 | 51 | f, err := os.Create(m.Output.File) 52 | if err != nil { 53 | log.Printf("failed to create file %q: %v", m.Output.File, err) 54 | os.Exit(1) 55 | } 56 | m.w = f 57 | } 58 | 59 | func (m *Module) ReceiveResponse(resp *flyscrape.Response) { 60 | if m.disabled() { 61 | return 62 | } 63 | 64 | if resp.Error == nil && resp.Data == nil { 65 | return 66 | } 67 | 68 | o := output{ 69 | URL: resp.Request.URL, 70 | Data: resp.Data, 71 | Timestamp: time.Now(), 72 | } 73 | if resp.Error != nil { 74 | o.Error = resp.Error.Error() 75 | } 76 | 77 | m.mu.Lock() 78 | defer m.mu.Unlock() 79 | 80 | enc := json.NewEncoder(m.w) 81 | enc.SetEscapeHTML(false) 82 | enc.Encode(o) 83 | } 84 | 85 | func (m *Module) Finalize() { 86 | if m.disabled() { 87 | return 88 | } 89 | m.w.Close() 90 | } 91 | 92 | func (m *Module) disabled() bool { 93 | return m.Output.Format != "ndjson" 94 | } 95 | 96 | type output struct { 97 | URL string `json:"url,omitempty"` 98 | Data any `json:"data,omitempty"` 99 | Error string `json:"error,omitempty"` 100 | Timestamp time.Time `json:"timestamp,omitempty"` 101 | } 102 | 103 | type nopCloser struct { 104 | io.Writer 105 | } 106 | 107 | func (c nopCloser) Write(p []byte) (n int, err error) { 108 | return c.Writer.Write(p) 109 | } 110 | 111 | func (c nopCloser) Close() error { 112 | return nil 113 | } 114 | 115 | var ( 116 | _ flyscrape.Provisioner = (*Module)(nil) 117 | _ flyscrape.ResponseReceiver = (*Module)(nil) 118 | _ flyscrape.Finalizer = (*Module)(nil) 119 | ) 120 | -------------------------------------------------------------------------------- /modules/proxy/proxy.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package proxy 6 | 7 | import ( 8 | "crypto/tls" 9 | "math/rand" 10 | "net/http" 11 | "net/url" 12 | 13 | "github.com/philippta/flyscrape" 14 | ) 15 | 16 | func init() { 17 | flyscrape.RegisterModule(Module{}) 18 | } 19 | 20 | type Module struct { 21 | Proxies []string `json:"proxies"` 22 | Proxy string `json:"proxy"` 23 | 24 | transports []*http.Transport 25 | } 26 | 27 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 28 | return flyscrape.ModuleInfo{ 29 | ID: "proxy", 30 | New: func() flyscrape.Module { return new(Module) }, 31 | } 32 | } 33 | 34 | func (m *Module) Provision(ctx flyscrape.Context) { 35 | if m.disabled() { 36 | return 37 | } 38 | 39 | for _, purl := range append(m.Proxies, m.Proxy) { 40 | if purl == "" { 41 | continue 42 | } 43 | if parsed, err := url.Parse(purl); err == nil { 44 | m.transports = append(m.transports, &http.Transport{ 45 | Proxy: http.ProxyURL(parsed), 46 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 47 | }) 48 | } 49 | 50 | } 51 | } 52 | 53 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 54 | if m.disabled() { 55 | return t 56 | } 57 | 58 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 59 | transport := m.transports[rand.Intn(len(m.transports))] 60 | return transport.RoundTrip(r) 61 | }) 62 | } 63 | 64 | func (m *Module) disabled() bool { 65 | return len(m.Proxies) == 0 && m.Proxy == "" 66 | } 67 | -------------------------------------------------------------------------------- /modules/proxy/proxy_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package proxy_test 6 | 7 | import ( 8 | "net/http" 9 | "net/http/httptest" 10 | "testing" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/philippta/flyscrape/modules/proxy" 14 | "github.com/philippta/flyscrape/modules/starturl" 15 | "github.com/stretchr/testify/require" 16 | ) 17 | 18 | func TestProxy(t *testing.T) { 19 | var called bool 20 | p := newProxy(func() { called = true }) 21 | defer p.Close() 22 | 23 | mods := []flyscrape.Module{ 24 | &starturl.Module{URL: "http://www.example.com"}, 25 | &proxy.Module{ 26 | Proxies: []string{p.URL}, 27 | }, 28 | } 29 | 30 | scraper := flyscrape.NewScraper() 31 | scraper.Modules = mods 32 | scraper.Run() 33 | 34 | require.True(t, called) 35 | } 36 | 37 | func TestProxyMultiple(t *testing.T) { 38 | calls := []int{0, 0, 0} 39 | p0 := newProxy(func() { calls[0]++ }) 40 | p1 := newProxy(func() { calls[1]++ }) 41 | p2 := newProxy(func() { calls[2]++ }) 42 | defer p0.Close() 43 | defer p1.Close() 44 | defer p2.Close() 45 | 46 | mod := &proxy.Module{Proxies: []string{p0.URL, p1.URL}, Proxy: p2.URL} 47 | mod.Provision(nil) 48 | trans := mod.AdaptTransport(nil) 49 | 50 | req := httptest.NewRequest("GET", "http://www.example.com/", nil) 51 | 52 | for i := 0; i < 50; i++ { 53 | resp, err := trans.RoundTrip(req) 54 | require.NoError(t, err) 55 | require.Equal(t, http.StatusOK, resp.StatusCode) 56 | } 57 | 58 | require.Greater(t, calls[0], 1) 59 | require.Greater(t, calls[1], 1) 60 | require.Greater(t, calls[2], 1) 61 | require.Equal(t, 50, calls[0]+calls[1]+calls[2]) 62 | } 63 | 64 | func newProxy(f func()) *httptest.Server { 65 | return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 66 | f() 67 | w.Write([]byte("response from proxy")) 68 | })) 69 | } 70 | -------------------------------------------------------------------------------- /modules/ratelimit/ratelimit.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package ratelimit 6 | 7 | import ( 8 | "math" 9 | "net/http" 10 | "time" 11 | 12 | "github.com/philippta/flyscrape" 13 | ) 14 | 15 | func init() { 16 | flyscrape.RegisterModule(Module{}) 17 | } 18 | 19 | type Module struct { 20 | Rate int `json:"rate"` 21 | Concurrency int `json:"concurrency"` 22 | Browser bool `json:"browser"` 23 | 24 | ticker *time.Ticker 25 | ratelimit chan struct{} 26 | concurrency chan struct{} 27 | } 28 | 29 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 30 | return flyscrape.ModuleInfo{ 31 | ID: "ratelimit", 32 | New: func() flyscrape.Module { return new(Module) }, 33 | } 34 | } 35 | 36 | func (m *Module) Provision(v flyscrape.Context) { 37 | if m.rateLimitEnabled() { 38 | rate := time.Duration(float64(time.Minute) / float64(m.Rate)) 39 | m.ticker = time.NewTicker(rate) 40 | m.ratelimit = make(chan struct{}, int(math.Max(float64(m.Rate)/10, 1))) 41 | 42 | go func() { 43 | m.ratelimit <- struct{}{} 44 | for range m.ticker.C { 45 | m.ratelimit <- struct{}{} 46 | } 47 | }() 48 | } 49 | 50 | if m.browserEnabled() && !m.concurrencyEnabled() { 51 | m.Concurrency = 1 52 | } 53 | 54 | if m.concurrencyEnabled() { 55 | m.concurrency = make(chan struct{}, m.Concurrency) 56 | for i := 0; i < m.Concurrency; i++ { 57 | m.concurrency <- struct{}{} 58 | } 59 | } 60 | } 61 | 62 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 63 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 64 | if m.rateLimitEnabled() { 65 | <-m.ratelimit 66 | } 67 | 68 | if m.concurrencyEnabled() { 69 | <-m.concurrency 70 | defer func() { m.concurrency <- struct{}{} }() 71 | } 72 | 73 | return t.RoundTrip(r) 74 | }) 75 | } 76 | 77 | func (m *Module) Finalize() { 78 | if m.rateLimitEnabled() { 79 | m.ticker.Stop() 80 | } 81 | } 82 | 83 | func (m *Module) rateLimitEnabled() bool { 84 | return m.Rate != 0 85 | } 86 | 87 | func (m *Module) concurrencyEnabled() bool { 88 | return m.Concurrency > 0 89 | } 90 | 91 | func (m *Module) browserEnabled() bool { 92 | return m.Browser 93 | } 94 | 95 | var ( 96 | _ flyscrape.TransportAdapter = (*Module)(nil) 97 | _ flyscrape.Provisioner = (*Module)(nil) 98 | _ flyscrape.Finalizer = (*Module)(nil) 99 | ) 100 | -------------------------------------------------------------------------------- /modules/ratelimit/ratelimit_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package ratelimit_test 6 | 7 | import ( 8 | "net/http" 9 | "sync" 10 | "testing" 11 | "time" 12 | 13 | "github.com/philippta/flyscrape" 14 | "github.com/philippta/flyscrape/modules/followlinks" 15 | "github.com/philippta/flyscrape/modules/hook" 16 | "github.com/philippta/flyscrape/modules/ratelimit" 17 | "github.com/philippta/flyscrape/modules/starturl" 18 | "github.com/stretchr/testify/require" 19 | ) 20 | 21 | func TestRatelimit(t *testing.T) { 22 | var times []time.Time 23 | var mu sync.Mutex 24 | 25 | mods := []flyscrape.Module{ 26 | &starturl.Module{URL: "http://www.example.com"}, 27 | &followlinks.Module{}, 28 | hook.Module{ 29 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 30 | return flyscrape.MockTransport(200, `foo`) 31 | }, 32 | ReceiveResponseFn: func(r *flyscrape.Response) { 33 | mu.Lock() 34 | times = append(times, time.Now()) 35 | mu.Unlock() 36 | }, 37 | }, 38 | &ratelimit.Module{ 39 | Rate: 240, 40 | }, 41 | } 42 | 43 | start := time.Now() 44 | scraper := flyscrape.NewScraper() 45 | scraper.Modules = mods 46 | scraper.Run() 47 | 48 | first := times[0].Add(-250 * time.Millisecond) 49 | second := times[1].Add(-500 * time.Millisecond) 50 | 51 | require.Less(t, first.Sub(start), 250*time.Millisecond) 52 | require.Less(t, second.Sub(start), 250*time.Millisecond) 53 | 54 | require.Less(t, start.Sub(first), 250*time.Millisecond) 55 | require.Less(t, start.Sub(second), 250*time.Millisecond) 56 | } 57 | 58 | func TestRatelimitConcurrency(t *testing.T) { 59 | var times []time.Time 60 | var mu sync.Mutex 61 | 62 | mods := []flyscrape.Module{ 63 | &starturl.Module{URL: "http://www.example.com"}, 64 | &followlinks.Module{}, 65 | hook.Module{ 66 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 67 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 68 | mu.Lock() 69 | times = append(times, time.Now()) 70 | mu.Unlock() 71 | 72 | time.Sleep(10 * time.Millisecond) 73 | return flyscrape.MockResponse(200, ` 74 | 75 | 76 | 77 | 78 | `) 79 | }) 80 | }, 81 | }, 82 | &ratelimit.Module{ 83 | Concurrency: 2, 84 | }, 85 | } 86 | 87 | scraper := flyscrape.NewScraper() 88 | scraper.Modules = mods 89 | scraper.Run() 90 | 91 | require.Len(t, times, 5) 92 | require.Less(t, times[2].Sub(times[1]), time.Millisecond) 93 | require.Less(t, times[4].Sub(times[3]), time.Millisecond) 94 | } 95 | -------------------------------------------------------------------------------- /modules/retry/retry.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package retry 6 | 7 | import ( 8 | "errors" 9 | "io" 10 | "net" 11 | "net/http" 12 | "slices" 13 | "strconv" 14 | "time" 15 | 16 | "github.com/philippta/flyscrape" 17 | ) 18 | 19 | func init() { 20 | flyscrape.RegisterModule(Module{}) 21 | } 22 | 23 | type Module struct { 24 | ticker *time.Ticker 25 | semaphore chan struct{} 26 | 27 | RetryDelays []time.Duration 28 | } 29 | 30 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 31 | return flyscrape.ModuleInfo{ 32 | ID: "retry", 33 | New: func() flyscrape.Module { return new(Module) }, 34 | } 35 | } 36 | 37 | func (m *Module) Provision(flyscrape.Context) { 38 | if m.RetryDelays == nil { 39 | m.RetryDelays = defaultRetryDelays 40 | } 41 | } 42 | 43 | func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { 44 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 45 | resp, err := t.RoundTrip(r) 46 | if !shouldRetry(resp, err) { 47 | return resp, err 48 | } 49 | 50 | for _, delay := range m.RetryDelays { 51 | drainBody(resp, err) 52 | 53 | time.Sleep(retryAfter(resp, delay)) 54 | 55 | resp, err = t.RoundTrip(r) 56 | if !shouldRetry(resp, err) { 57 | break 58 | } 59 | } 60 | 61 | return resp, err 62 | }) 63 | } 64 | 65 | func shouldRetry(resp *http.Response, err error) bool { 66 | statusCodes := []int{ 67 | http.StatusForbidden, 68 | http.StatusRequestTimeout, 69 | http.StatusTooEarly, 70 | http.StatusTooManyRequests, 71 | http.StatusInternalServerError, 72 | http.StatusBadGateway, 73 | http.StatusServiceUnavailable, 74 | http.StatusGatewayTimeout, 75 | } 76 | 77 | if resp != nil { 78 | if slices.Contains(statusCodes, resp.StatusCode) { 79 | return true 80 | } 81 | } 82 | if err == nil { 83 | return false 84 | } 85 | if _, ok := err.(net.Error); ok { 86 | return true 87 | } 88 | if errors.Is(err, io.ErrUnexpectedEOF) { 89 | return true 90 | } 91 | 92 | return false 93 | } 94 | 95 | func drainBody(resp *http.Response, err error) { 96 | if err == nil && resp != nil && resp.Body != nil { 97 | io.Copy(io.Discard, resp.Body) 98 | resp.Body.Close() 99 | } 100 | } 101 | 102 | func retryAfter(resp *http.Response, fallback time.Duration) time.Duration { 103 | if resp == nil { 104 | return fallback 105 | } 106 | 107 | timeexp := resp.Header.Get("Retry-After") 108 | if timeexp == "" { 109 | return fallback 110 | } 111 | 112 | if seconds, err := strconv.Atoi(timeexp); err == nil { 113 | return time.Duration(seconds) * time.Second 114 | } 115 | 116 | formats := []string{ 117 | time.RFC1123, // HTTP Spec 118 | time.RFC1123Z, 119 | time.ANSIC, 120 | time.UnixDate, 121 | time.RubyDate, 122 | time.RFC822, 123 | time.RFC822Z, 124 | time.RFC850, 125 | time.RFC3339, 126 | } 127 | for _, format := range formats { 128 | if t, err := time.Parse(format, timeexp); err == nil { 129 | return t.Sub(time.Now()) 130 | } 131 | } 132 | 133 | return fallback 134 | } 135 | 136 | var defaultRetryDelays = []time.Duration{ 137 | 1 * time.Second, 138 | 2 * time.Second, 139 | 5 * time.Second, 140 | 10 * time.Second, 141 | } 142 | 143 | var ( 144 | _ flyscrape.TransportAdapter = (*Module)(nil) 145 | _ flyscrape.Provisioner = (*Module)(nil) 146 | ) 147 | -------------------------------------------------------------------------------- /modules/retry/retry_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package retry_test 6 | 7 | import ( 8 | "fmt" 9 | "io" 10 | "net" 11 | "net/http" 12 | "testing" 13 | "time" 14 | 15 | "github.com/philippta/flyscrape" 16 | "github.com/philippta/flyscrape/modules/followlinks" 17 | "github.com/philippta/flyscrape/modules/hook" 18 | "github.com/philippta/flyscrape/modules/retry" 19 | "github.com/philippta/flyscrape/modules/starturl" 20 | "github.com/stretchr/testify/require" 21 | ) 22 | 23 | func TestRetry(t *testing.T) { 24 | t.Parallel() 25 | var count int 26 | 27 | mods := []flyscrape.Module{ 28 | &starturl.Module{URL: "http://www.example.com"}, 29 | &followlinks.Module{}, 30 | hook.Module{ 31 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 32 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 33 | count++ 34 | return flyscrape.MockResponse(http.StatusServiceUnavailable, "service unavailable") 35 | }) 36 | }, 37 | }, 38 | &retry.Module{ 39 | RetryDelays: []time.Duration{ 40 | 100 * time.Millisecond, 41 | 200 * time.Millisecond, 42 | }, 43 | }, 44 | } 45 | 46 | scraper := flyscrape.NewScraper() 47 | scraper.Modules = mods 48 | scraper.Run() 49 | 50 | require.Equal(t, 3, count) 51 | } 52 | 53 | func TestRetryStatusCodes(t *testing.T) { 54 | t.Parallel() 55 | 56 | tests := []struct { 57 | statusCode int 58 | retry bool 59 | }{ 60 | {statusCode: http.StatusBadGateway, retry: true}, 61 | {statusCode: http.StatusTooManyRequests, retry: true}, 62 | {statusCode: http.StatusBadRequest, retry: false}, 63 | {statusCode: http.StatusOK, retry: false}, 64 | } 65 | 66 | for _, test := range tests { 67 | t.Run(fmt.Sprintf("%s_%t", http.StatusText(test.statusCode), test.retry), func(t *testing.T) { 68 | t.Parallel() 69 | var count int 70 | mods := []flyscrape.Module{ 71 | &starturl.Module{URL: "http://www.example.com"}, 72 | &followlinks.Module{}, 73 | hook.Module{ 74 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 75 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 76 | count++ 77 | return flyscrape.MockResponse(test.statusCode, http.StatusText(test.statusCode)) 78 | }) 79 | }, 80 | }, 81 | &retry.Module{ 82 | RetryDelays: []time.Duration{ 83 | 100 * time.Millisecond, 84 | 200 * time.Millisecond, 85 | }, 86 | }, 87 | } 88 | 89 | scraper := flyscrape.NewScraper() 90 | scraper.Modules = mods 91 | scraper.Run() 92 | 93 | if test.retry { 94 | require.NotEqual(t, 1, count) 95 | } else { 96 | require.Equal(t, 1, count) 97 | } 98 | }) 99 | } 100 | } 101 | 102 | func TestRetryErrors(t *testing.T) { 103 | t.Parallel() 104 | 105 | tests := []struct { 106 | error error 107 | }{ 108 | {error: &net.OpError{}}, 109 | {error: io.ErrUnexpectedEOF}, 110 | } 111 | 112 | for _, test := range tests { 113 | t.Run(fmt.Sprintf("%T", test.error), func(t *testing.T) { 114 | t.Parallel() 115 | var count int 116 | mods := []flyscrape.Module{ 117 | &starturl.Module{URL: "http://www.example.com"}, 118 | &followlinks.Module{}, 119 | hook.Module{ 120 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 121 | return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { 122 | return nil, test.error 123 | }) 124 | }, 125 | }, 126 | &retry.Module{ 127 | RetryDelays: []time.Duration{ 128 | 100 * time.Millisecond, 129 | 200 * time.Millisecond, 130 | }, 131 | }, 132 | } 133 | 134 | scraper := flyscrape.NewScraper() 135 | scraper.Modules = mods 136 | 
scraper.Run() 137 | 138 | require.NotEqual(t, 1, count) 139 | }) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /modules/starturl/starturl.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package starturl 6 | 7 | import ( 8 | "github.com/philippta/flyscrape" 9 | ) 10 | 11 | func init() { 12 | flyscrape.RegisterModule(Module{}) 13 | } 14 | 15 | type Module struct { 16 | URL string `json:"url"` 17 | URLs []string `json:"urls"` 18 | } 19 | 20 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 21 | return flyscrape.ModuleInfo{ 22 | ID: "starturl", 23 | New: func() flyscrape.Module { return new(Module) }, 24 | } 25 | } 26 | 27 | func (m *Module) Provision(ctx flyscrape.Context) { 28 | if m.URL != "" { 29 | ctx.Visit(m.URL) 30 | } 31 | 32 | for _, url := range m.URLs { 33 | ctx.Visit(url) 34 | } 35 | } 36 | 37 | var _ flyscrape.Provisioner = (*Module)(nil) 38 | -------------------------------------------------------------------------------- /modules/starturl/starturl_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package starturl_test 6 | 7 | import ( 8 | "net/http" 9 | "sync" 10 | "testing" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/philippta/flyscrape/modules/hook" 14 | "github.com/philippta/flyscrape/modules/starturl" 15 | "github.com/stretchr/testify/require" 16 | ) 17 | 18 | func TestStartURL(t *testing.T) { 19 | var url string 20 | var depth int 21 | 22 | mods := []flyscrape.Module{ 23 | &starturl.Module{URL: "http://www.example.com/foo/bar"}, 24 | hook.Module{ 25 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 26 | return flyscrape.MockTransport(200, "") 27 | }, 28 | BuildRequestFn: func(r *flyscrape.Request) { 29 | url = r.URL 30 | depth = r.Depth 31 | }, 32 | }, 33 | } 34 | 35 | scraper := flyscrape.NewScraper() 36 | scraper.Modules = mods 37 | scraper.Run() 38 | 39 | require.Equal(t, "http://www.example.com/foo/bar", url) 40 | require.Equal(t, 0, depth) 41 | } 42 | 43 | func TestStartURL_MultipleStartingURLs(t *testing.T) { 44 | testCases := []struct { 45 | name string 46 | startURLModFn func() *starturl.Module 47 | urls []string 48 | }{ 49 | { 50 | name: ".URL and .URLs", 51 | startURLModFn: func() *starturl.Module { 52 | return &starturl.Module{ 53 | URL: "http://www.example.com/foo", 54 | URLs: []string{ 55 | "http://www.example.com/bar", 56 | "http://www.example.com/baz", 57 | }, 58 | } 59 | }, 60 | urls: []string{ 61 | "http://www.example.com/foo", 62 | "http://www.example.com/bar", 63 | "http://www.example.com/baz", 64 | }, 65 | }, 66 | { 67 | name: "only .URL", 68 | startURLModFn: func() *starturl.Module { 69 | return &starturl.Module{ 70 | URL: "http://www.example.com/foo", 71 | } 72 | }, 73 | urls: []string{ 74 | "http://www.example.com/foo", 75 | }, 76 | }, 77 | { 78 | name: "only .URLs", 79 | startURLModFn: func() *starturl.Module { 80 | return &starturl.Module{ 81 | URLs: []string{ 82 | "http://www.example.com/bar", 83 | "http://www.example.com/baz", 84 | }, 85 | } 86 | 
}, 87 | urls: []string{ 88 | "http://www.example.com/bar", 89 | "http://www.example.com/baz", 90 | }, 91 | }, 92 | { 93 | name: "empty", 94 | startURLModFn: func() *starturl.Module { 95 | return &starturl.Module{} 96 | }, 97 | urls: []string{}, 98 | }, 99 | } 100 | 101 | for _, tc := range testCases { 102 | t.Run(tc.name, func(t *testing.T) { 103 | urls := []string{} 104 | mu := sync.Mutex{} 105 | 106 | mods := []flyscrape.Module{ 107 | tc.startURLModFn(), 108 | hook.Module{ 109 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 110 | return flyscrape.MockTransport(http.StatusOK, "") 111 | }, 112 | BuildRequestFn: func(r *flyscrape.Request) { 113 | mu.Lock() 114 | urls = append(urls, r.URL) 115 | mu.Unlock() 116 | }, 117 | }, 118 | } 119 | 120 | scraper := flyscrape.NewScraper() 121 | scraper.Modules = mods 122 | scraper.Run() 123 | 124 | require.ElementsMatch(t, tc.urls, urls) 125 | }) 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /modules/urlfilter/urlfilter.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package urlfilter 6 | 7 | import ( 8 | "regexp" 9 | 10 | "github.com/philippta/flyscrape" 11 | ) 12 | 13 | func init() { 14 | flyscrape.RegisterModule(Module{}) 15 | } 16 | 17 | type Module struct { 18 | URL string `json:"url"` 19 | URLs []string `json:"urls"` 20 | AllowedURLs []string `json:"allowedURLs"` 21 | BlockedURLs []string `json:"blockedURLs"` 22 | 23 | allowedURLsRE []*regexp.Regexp 24 | blockedURLsRE []*regexp.Regexp 25 | } 26 | 27 | func (Module) ModuleInfo() flyscrape.ModuleInfo { 28 | return flyscrape.ModuleInfo{ 29 | ID: "urlfilter", 30 | New: func() flyscrape.Module { return new(Module) }, 31 | } 32 | } 33 | 34 | func (m *Module) Provision(v flyscrape.Context) { 35 | if m.disabled() { 36 | return 37 | } 38 | 39 | for _, pat := range m.AllowedURLs { 40 | re, err := regexp.Compile(pat) 41 | if err != nil { 42 | continue 43 | } 44 | m.allowedURLsRE = append(m.allowedURLsRE, re) 45 | } 46 | 47 | for _, pat := range m.BlockedURLs { 48 | re, err := regexp.Compile(pat) 49 | if err != nil { 50 | continue 51 | } 52 | m.blockedURLsRE = append(m.blockedURLsRE, re) 53 | } 54 | } 55 | 56 | func (m *Module) ValidateRequest(r *flyscrape.Request) bool { 57 | if m.disabled() { 58 | return true 59 | } 60 | 61 | // allow root url 62 | if r.URL == m.URL { 63 | return true 64 | } 65 | for _, u := range m.URLs { 66 | if r.URL == u { 67 | return true 68 | } 69 | } 70 | 71 | // allow if no filter is set 72 | if len(m.allowedURLsRE) == 0 && len(m.blockedURLsRE) == 0 { 73 | return true 74 | } 75 | 76 | ok := false 77 | if len(m.allowedURLsRE) == 0 { 78 | ok = true 79 | } 80 | 81 | for _, re := range m.allowedURLsRE { 82 | if re.MatchString(r.URL) { 83 | ok = true 84 | break 85 | } 86 | } 87 | 88 | for _, re := range m.blockedURLsRE { 89 | if re.MatchString(r.URL) { 90 | ok = false 91 | break 92 | } 93 | } 94 | 95 | return ok 96 | } 97 | 98 | func (m *Module) disabled() bool { 99 | return len(m.AllowedURLs) == 0 && len(m.BlockedURLs) == 0 100 | } 101 | 102 | var ( 103 | _ flyscrape.RequestValidator = (*Module)(nil) 104 | _ flyscrape.Provisioner = (*Module)(nil) 105 | ) 106 | -------------------------------------------------------------------------------- 
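Note (not a repository file): a small sketch, distilled from the urlfilter tests that follow, of how the allowed/blocked regex patterns interact. The URLs and patterns here are invented for illustration, and Provision is called with a nil context in the same way the proxy tests do, since this module ignores it.

package main

import (
	"fmt"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/urlfilter"
)

func main() {
	m := &urlfilter.Module{
		URL:         "http://www.example.com/",
		AllowedURLs: []string{`/posts/\d+`},
		BlockedURLs: []string{`/posts/13$`},
	}
	m.Provision(nil) // compiles the regex patterns; the context is unused by this module

	for _, u := range []string{
		"http://www.example.com/",         // start URL is always allowed
		"http://www.example.com/posts/42", // matches an allowed pattern
		"http://www.example.com/posts/13", // matches an allowed pattern, but a blocked pattern wins
		"http://www.example.com/about",    // matches no allowed pattern
	} {
		fmt.Println(u, m.ValidateRequest(&flyscrape.Request{URL: u}))
	}
}

Blocked patterns are evaluated last, so a URL that matches both an allowed and a blocked pattern is rejected.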
/modules/urlfilter/urlfilter_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package urlfilter_test 6 | 7 | import ( 8 | "net/http" 9 | "sync" 10 | "testing" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/philippta/flyscrape/modules/followlinks" 14 | "github.com/philippta/flyscrape/modules/hook" 15 | "github.com/philippta/flyscrape/modules/starturl" 16 | "github.com/philippta/flyscrape/modules/urlfilter" 17 | "github.com/stretchr/testify/require" 18 | ) 19 | 20 | func TestURLFilterAllowed(t *testing.T) { 21 | var urls []string 22 | var mu sync.Mutex 23 | 24 | mods := []flyscrape.Module{ 25 | &starturl.Module{URL: "http://www.example.com/"}, 26 | &followlinks.Module{}, 27 | &urlfilter.Module{ 28 | URL: "http://www.example.com/", 29 | AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`}, 30 | }, 31 | hook.Module{ 32 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 33 | return flyscrape.MockTransport(200, ` 34 | 123 35 | ABC 36 | bar 37 | barz`) 38 | }, 39 | ReceiveResponseFn: func(r *flyscrape.Response) { 40 | mu.Lock() 41 | urls = append(urls, r.Request.URL) 42 | mu.Unlock() 43 | }, 44 | }, 45 | } 46 | 47 | scraper := flyscrape.NewScraper() 48 | scraper.Modules = mods 49 | scraper.Run() 50 | 51 | require.Len(t, urls, 3) 52 | require.Contains(t, urls, "http://www.example.com/") 53 | require.Contains(t, urls, "http://www.example.com/foo?id=123") 54 | require.Contains(t, urls, "http://www.example.com/bar") 55 | } 56 | 57 | func TestURLFilterBlocked(t *testing.T) { 58 | var urls []string 59 | var mu sync.Mutex 60 | 61 | mods := []flyscrape.Module{ 62 | &starturl.Module{URL: "http://www.example.com/"}, 63 | &followlinks.Module{}, 64 | &urlfilter.Module{ 65 | URL: "http://www.example.com/", 66 | BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`}, 67 | }, 68 | hook.Module{ 69 | AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { 70 | return flyscrape.MockTransport(200, ` 71 | 123 72 | ABC 73 | bar 74 | barz`) 75 | }, 76 | ReceiveResponseFn: func(r *flyscrape.Response) { 77 | mu.Lock() 78 | urls = append(urls, r.Request.URL) 79 | mu.Unlock() 80 | }, 81 | }, 82 | } 83 | 84 | scraper := flyscrape.NewScraper() 85 | scraper.Modules = mods 86 | scraper.Run() 87 | 88 | require.Len(t, urls, 3) 89 | require.Contains(t, urls, "http://www.example.com/") 90 | require.Contains(t, urls, "http://www.example.com/foo?id=ABC") 91 | require.Contains(t, urls, "http://www.example.com/barz") 92 | } 93 | -------------------------------------------------------------------------------- /scrape.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | package flyscrape 6 | 7 | import ( 8 | "fmt" 9 | "io" 10 | "log" 11 | "net/http" 12 | "net/http/cookiejar" 13 | "strings" 14 | "sync" 15 | 16 | "github.com/cornelk/hashmap" 17 | ) 18 | 19 | type Context interface { 20 | ScriptName() string 21 | Visit(url string) 22 | MarkVisited(url string) 23 | MarkUnvisited(url string) 24 | } 25 | 26 | type Request struct { 27 | Method string 28 | URL string 29 | Headers http.Header 30 | Cookies http.CookieJar 31 | Depth int 32 | } 33 | 34 | type Response struct { 35 | StatusCode int 36 | Headers http.Header 37 | Body []byte 38 | Data any 39 | Error error 40 | Request *Request 41 | 42 | Visit func(url string) 43 | } 44 | 45 | type target struct { 46 | url string 47 | depth int 48 | } 49 | 50 | func NewScraper() *Scraper { 51 | return &Scraper{} 52 | } 53 | 54 | type Scraper struct { 55 | ScrapeFunc ScrapeFunc 56 | Script string 57 | Modules []Module 58 | Client *http.Client 59 | 60 | wg sync.WaitGroup 61 | jobs chan target 62 | visited *hashmap.Map[string, struct{}] 63 | } 64 | 65 | func (s *Scraper) Visit(url string) { 66 | s.enqueueJob(url, 0) 67 | } 68 | 69 | func (s *Scraper) MarkVisited(url string) { 70 | s.visited.Insert(url, struct{}{}) 71 | } 72 | 73 | func (s *Scraper) MarkUnvisited(url string) { 74 | s.visited.Del(url) 75 | } 76 | 77 | func (s *Scraper) ScriptName() string { 78 | return s.Script 79 | } 80 | 81 | func (s *Scraper) Run() { 82 | s.jobs = make(chan target, 1<<20) 83 | s.visited = hashmap.New[string, struct{}]() 84 | 85 | s.initClient() 86 | 87 | for _, mod := range s.Modules { 88 | if v, ok := mod.(Provisioner); ok { 89 | v.Provision(s) 90 | } 91 | } 92 | 93 | for _, mod := range s.Modules { 94 | if v, ok := mod.(TransportAdapter); ok { 95 | s.Client.Transport = v.AdaptTransport(s.Client.Transport) 96 | } 97 | } 98 | 99 | s.scrape() 100 | s.wg.Wait() 101 | close(s.jobs) 102 | 103 | for _, mod := range s.Modules { 104 | if v, ok := mod.(Finalizer); ok { 105 | v.Finalize() 106 | } 107 | } 108 | } 109 | 110 | func (s *Scraper) initClient() { 111 | if s.Client == nil { 112 | s.Client = &http.Client{} 113 | } 114 | if s.Client.Jar == nil { 115 | s.Client.Jar, _ = cookiejar.New(nil) 116 | } 117 | if s.Client.Transport == nil { 118 | s.Client.Transport = http.DefaultTransport 119 | } 120 | } 121 | 122 | func (s *Scraper) scrape() { 123 | for i := 0; i < 500; i++ { 124 | go func() { 125 | for job := range s.jobs { 126 | s.process(job.url, job.depth) 127 | s.wg.Done() 128 | } 129 | }() 130 | } 131 | } 132 | 133 | func (s *Scraper) process(url string, depth int) { 134 | request := &Request{ 135 | Method: http.MethodGet, 136 | URL: url, 137 | Headers: http.Header{}, 138 | Cookies: s.Client.Jar, 139 | Depth: depth, 140 | } 141 | 142 | response := &Response{ 143 | Request: request, 144 | Visit: func(url string) { 145 | s.enqueueJob(url, depth+1) 146 | }, 147 | } 148 | 149 | for _, mod := range s.Modules { 150 | if v, ok := mod.(RequestBuilder); ok { 151 | v.BuildRequest(request) 152 | } 153 | } 154 | 155 | req, err := http.NewRequest(request.Method, request.URL, nil) 156 | if err != nil { 157 | response.Error = err 158 | return 159 | } 160 | req.Header = request.Headers 161 | 162 | for _, mod := range s.Modules { 163 | if v, ok := mod.(RequestValidator); ok { 164 | if !v.ValidateRequest(request) { 165 | return 166 | } 167 | } 168 | } 169 | 170 | defer func() { 171 | for _, mod := range s.Modules { 172 | if v, ok := mod.(ResponseReceiver); ok { 173 | v.ReceiveResponse(response) 174 | } 175 | } 176 | }() 177 | 178 | resp, err := 
s.Client.Do(req) 179 | if err != nil { 180 | response.Error = err 181 | return 182 | } 183 | defer resp.Body.Close() 184 | 185 | response.StatusCode = resp.StatusCode 186 | response.Headers = resp.Header 187 | 188 | if response.StatusCode < 200 || response.StatusCode >= 300 { 189 | response.Error = fmt.Errorf("%d %s", response.StatusCode, http.StatusText(response.StatusCode)) 190 | } 191 | 192 | response.Body, err = io.ReadAll(resp.Body) 193 | if err != nil { 194 | response.Error = err 195 | return 196 | } 197 | 198 | if s.ScrapeFunc != nil { 199 | func() { 200 | defer func() { 201 | if r := recover(); r != nil { 202 | log.Println(r) 203 | } 204 | }() 205 | 206 | p := ScrapeParams{ 207 | HTML: string(response.Body), 208 | URL: request.URL, 209 | Process: s.processImmediate, 210 | Follow: func(url string) { 211 | s.enqueueJob(url, depth+1) 212 | }, 213 | } 214 | 215 | response.Data, err = s.ScrapeFunc(p) 216 | if err != nil { 217 | response.Error = err 218 | return 219 | } 220 | }() 221 | } 222 | } 223 | 224 | func (s *Scraper) processImmediate(url string) ([]byte, error) { 225 | request := &Request{ 226 | Method: http.MethodGet, 227 | URL: url, 228 | Headers: http.Header{}, 229 | Cookies: s.Client.Jar, 230 | } 231 | 232 | for _, mod := range s.Modules { 233 | if v, ok := mod.(RequestBuilder); ok { 234 | v.BuildRequest(request) 235 | } 236 | } 237 | 238 | req, err := http.NewRequest(request.Method, request.URL, nil) 239 | if err != nil { 240 | return nil, err 241 | } 242 | req.Header = request.Headers 243 | 244 | for _, mod := range s.Modules { 245 | if v, ok := mod.(RequestValidator); ok { 246 | if !v.ValidateRequest(request) { 247 | return nil, nil 248 | } 249 | } 250 | } 251 | 252 | resp, err := s.Client.Do(req) 253 | if err != nil { 254 | return nil, err 255 | } 256 | defer resp.Body.Close() 257 | 258 | if resp.StatusCode < 200 || resp.StatusCode >= 300 { 259 | return nil, fmt.Errorf("%d %s", resp.StatusCode, http.StatusText(resp.StatusCode)) 260 | } 261 | 262 | body, err := io.ReadAll(resp.Body) 263 | if err != nil { 264 | return nil, err 265 | } 266 | 267 | return body, nil 268 | } 269 | 270 | func (s *Scraper) enqueueJob(url string, depth int) { 271 | url = strings.TrimSpace(url) 272 | if url == "" { 273 | return 274 | } 275 | 276 | if _, ok := s.visited.Get(url); ok { 277 | return 278 | } 279 | 280 | s.wg.Add(1) 281 | select { 282 | case s.jobs <- target{url: url, depth: depth}: 283 | s.MarkVisited(url) 284 | default: 285 | log.Println("queue is full, can't add url:", url) 286 | s.wg.Done() 287 | } 288 | } 289 | -------------------------------------------------------------------------------- /template.js: -------------------------------------------------------------------------------- 1 | export const config = { 2 | // Specify the URL to start scraping from. 3 | url: "https://example.com/", 4 | 5 | // Enable rendering with headless browser. (default = false) 6 | // browser: true, 7 | 8 | // Specify if browser should be headless or not. (default = true) 9 | // headless: false, 10 | 11 | // Specify the multiple URLs to start scraping from. (default = []) 12 | // urls: [ 13 | // "https://anothersite.com/", 14 | // "https://yetanother.com/", 15 | // ], 16 | 17 | // Specify how deep links should be followed. (default = 0, no follow) 18 | // depth: 5, 19 | 20 | // Speficy the css selectors to follow. (default = ["a[href]"]) 21 | // follow: [".next > a", ".related a"], 22 | 23 | // Specify the allowed domains. ['*'] for all. 
(default = domain from url) 24 | // allowedDomains: ["example.com", "anothersite.com"], 25 | 26 | // Specify the blocked domains. (default = none) 27 | // blockedDomains: ["somesite.com"], 28 | 29 | // Specify the allowed URLs as regex. (default = all allowed) 30 | // allowedURLs: ["/posts", "/articles/\d+"], 31 | 32 | // Specify the blocked URLs as regex. (default = none) 33 | // blockedURLs: ["/admin"], 34 | 35 | // Specify the rate in requests per minute. (default = no rate limit) 36 | // rate: 60, 37 | 38 | // Specify the number of concurrent requests. (default = no limit) 39 | // concurrency: 1, 40 | 41 | // Specify a single HTTP(S) proxy URL. (default = no proxy) 42 | // Note: Not compatible with browser mode. 43 | // proxy: "http://someproxy.com:8043", 44 | 45 | // Specify multiple HTTP(S) proxy URLs. (default = no proxy) 46 | // Note: Not compatible with browser mode. 47 | // proxies: [ 48 | // "http://someproxy.com:8043", 49 | // "http://someotherproxy.com:8043", 50 | // ], 51 | 52 | // Enable file-based request caching. (default = no cache) 53 | // cache: "file", 54 | 55 | // Specify the HTTP request header. (default = none) 56 | // headers: { 57 | // "Authorization": "Bearer ...", 58 | // "User-Agent": "Mozilla ...", 59 | // }, 60 | 61 | // Use the cookie store of your local browser. (default = off) 62 | // Options: "chrome" | "edge" | "firefox" 63 | // cookies: "chrome", 64 | 65 | // Specify the output options. 66 | // output: { 67 | // // Specify the output file. (default = stdout) 68 | // file: "results.json", 69 | // 70 | // // Specify the output format. (default = json) 71 | // // Options: "json" | "ndjson" 72 | // format: "json", 73 | // }, 74 | }; 75 | 76 | export default function({ doc, absoluteURL }) { 77 | const title = doc.find("h1"); 78 | const link = doc.find("a"); 79 | 80 | return { 81 | title: title.text(), 82 | link: { 83 | text: link.text(), 84 | url: absoluteURL(link.attr("href")), 85 | }, 86 | }; 87 | } 88 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package flyscrape 6 | 7 | import ( 8 | "fmt" 9 | "io" 10 | "net/http" 11 | "strings" 12 | ) 13 | 14 | const HeaderBypassCache = "X-Flyscrape-Bypass-Cache" 15 | 16 | type RoundTripFunc func(*http.Request) (*http.Response, error) 17 | 18 | func (f RoundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) { 19 | return f(r) 20 | } 21 | 22 | func MockTransport(statusCode int, html string) RoundTripFunc { 23 | return func(*http.Request) (*http.Response, error) { 24 | return MockResponse(statusCode, html) 25 | } 26 | } 27 | 28 | func MockResponse(statusCode int, html string) (*http.Response, error) { 29 | return &http.Response{ 30 | StatusCode: statusCode, 31 | Status: fmt.Sprintf("%d %s", statusCode, http.StatusText(statusCode)), 32 | Body: io.NopCloser(strings.NewReader(html)), 33 | Header: http.Header{"Content-Type": []string{"text/html"}}, 34 | }, nil 35 | } 36 | -------------------------------------------------------------------------------- /watch.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package flyscrape 6 | 7 | import ( 8 | "errors" 9 | "fmt" 10 | "os" 11 | "time" 12 | 13 | "github.com/fsnotify/fsnotify" 14 | ) 15 | 16 | var StopWatch = errors.New("stop watch") 17 | 18 | func Watch(path string, fn func(string) error) error { 19 | watcher, err := fsnotify.NewWatcher() 20 | if err != nil { 21 | return fmt.Errorf("creating file watcher: %w", err) 22 | } 23 | defer watcher.Close() 24 | 25 | if err := watcher.Add(path); err != nil { 26 | return fmt.Errorf("watching file %q: %w", path, err) 27 | } 28 | 29 | update := func() error { 30 | data, err := os.ReadFile(path) 31 | if err != nil { 32 | return err 33 | } 34 | return fn(string(data)) 35 | } 36 | 37 | if err := update(); errors.Is(err, StopWatch) { 38 | return nil 39 | } 40 | 41 | for { 42 | select { 43 | case e, ok := <-watcher.Events: 44 | if !ok { 45 | return nil 46 | } 47 | if e.Has(fsnotify.Rename) { 48 | time.Sleep(10 * time.Millisecond) 49 | watcher.Remove(path) 50 | watcher.Add(path) 51 | } 52 | if e.Has(fsnotify.Write) || e.Has(fsnotify.Rename) { 53 | if err := update(); errors.Is(err, StopWatch) { 54 | return nil 55 | } 56 | } 57 | case err, ok := <-watcher.Errors: 58 | if !ok { 59 | return nil 60 | } 61 | if err != nil { 62 | return fmt.Errorf("watcher: %w", err) 63 | } 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /watch_test.go: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | package flyscrape_test 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | "time" 11 | 12 | "github.com/philippta/flyscrape" 13 | "github.com/stretchr/testify/require" 14 | ) 15 | 16 | func TestWatch(t *testing.T) { 17 | f := tmpfile(t) 18 | defer os.Remove(f.Name()) 19 | write(f, "test 1") 20 | 21 | calls := 0 22 | done := make(chan struct{}) 23 | 24 | go func() { 25 | err := flyscrape.Watch(f.Name(), func(s string) error { 26 | calls++ 27 | if calls == 1 { 28 | require.Equal(t, "test 1", s) 29 | return nil 30 | } 31 | if calls == 2 { 32 | require.Equal(t, "test 2", s) 33 | return flyscrape.StopWatch 34 | } 35 | return nil 36 | }) 37 | require.NoError(t, err) 38 | close(done) 39 | }() 40 | 41 | write(f, "test 2") 42 | <-done 43 | } 44 | 45 | func tmpfile(t *testing.T) *os.File { 46 | f, err := os.CreateTemp("", "scrape.js") 47 | require.NoError(t, err) 48 | return f 49 | } 50 | 51 | func write(f *os.File, s string) { 52 | time.Sleep(10 * time.Millisecond) 53 | f.Seek(0, 0) 54 | f.Truncate(0) 55 | f.WriteString(s) 56 | f.Sync() 57 | } 58 | --------------------------------------------------------------------------------
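Note (not a repository file): flyscrape.Watch, defined in watch.go above, invokes the callback once immediately and again on every write or rename of the watched file, stopping when the callback returns the StopWatch sentinel. The sketch below is a hypothetical standalone use of that API outside the test suite; the file name scrape.js is assumed.

package main

import (
	"fmt"
	"log"

	"github.com/philippta/flyscrape"
)

func main() {
	// The callback receives the current file contents on start and after each change.
	err := flyscrape.Watch("scrape.js", func(script string) error {
		fmt.Printf("script changed, %d bytes\n", len(script))
		return nil // keep watching; return flyscrape.StopWatch to end the loop
	})
	if err != nil {
		log.Fatal(err)
	}
}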