├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── TODO.md ├── examples ├── bing_de.json ├── bing_multiple_browser_multiple_pages.js ├── cleaned_html.js ├── custom_scraper.js ├── delete_comments.js ├── detection_checker.js ├── for_the_lulz.js ├── gimage.js ├── gnold.js ├── google_maps.js ├── headless-test-result.png ├── keywords.txt ├── minimal.js ├── multiple_browsers.js ├── multiple_search_engines.js ├── multiple_tabs.js ├── per_page_proxy.js ├── pluggable.js ├── pluggable_example.js ├── proxies.js ├── quickstart.js ├── reusing.js ├── test_cluster.js ├── test_promise.js └── test_proxyflag.js ├── index.js ├── jformat.py ├── package-lock.json ├── package.json ├── run.js ├── se-scraper.iml ├── src ├── captcha_solver.js ├── concurrency-implementation.js ├── modules │ ├── bing.js │ ├── duckduckgo.js │ ├── google.js │ ├── infospace.js │ ├── metadata.js │ ├── se_scraper.js │ └── yandex.js └── node_scraper.js └── test ├── html_output.js ├── mocks ├── bing │ ├── index.html │ ├── test keyword_page1.html │ ├── test keyword_page2.html │ └── test keyword_page3.html ├── duckduckgo │ ├── index.html │ ├── test keyword_page1.html │ ├── test keyword_page2.html │ └── test keyword_page3.html └── google │ ├── index.html │ ├── test keyword_page1.html │ ├── test keyword_page2.html │ └── test keyword_page3.html ├── modules ├── bing.js ├── duckduckgo.js └── google.js ├── proxy.js └── user_agent.js /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore static tests 2 | 3 | test/static_tests/html/ 4 | test/static_tests/html/* 5 | 6 | .idea 7 | 8 | # ignore data 9 | 10 | examples/data/ 11 | examples/data/* 12 | 13 | examples/results/ 14 | examples/results/* 15 | 16 | 17 | # Logs 18 | logs 19 | *.log 20 | npm-debug.log* 21 | yarn-debug.log* 22 | yarn-error.log* 23 | 24 | # Runtime data 25 | pids 26 | *.pid 27 | *.seed 28 | *.pid.lock 29 | 30 | # Directory for instrumented libs generated by jscoverage/JSCover 31 | lib-cov 32 | 33 | # Coverage directory used by tools like istanbul 34 | coverage 35 | 36 | # nyc test coverage 37 | .nyc_output 38 | 39 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 40 | .grunt 41 | 42 | # Bower dependency directory (https://bower.io/) 43 | bower_components 44 | 45 | # node-waf configuration 46 | .lock-wscript 47 | 48 | # Compiled binary addons (https://nodejs.org/api/addons.html) 49 | build/Release 50 | 51 | # Dependency directories 52 | node_modules/ 53 | jspm_packages/ 54 | 55 | # TypeScript v1 declaration files 56 | typings/ 57 | 58 | # Optional npm cache directory 59 | .npm 60 | 61 | # Optional eslint cache 62 | .eslintcache 63 | 64 | # Optional REPL history 65 | .node_repl_history 66 | 67 | # Output of 'npm pack' 68 | *.tgz 69 | 70 | # Yarn Integrity file 71 | .yarn-integrity 72 | 73 | # dotenv environment variables file 74 | .env 75 | 76 | # next.js build output 77 | .next 78 | 79 | 80 | .idea/ 81 | GoogleScraperPup.iml 82 | 83 | .http-mitm-proxy 84 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NikolaiT/se-scraper/5a0eea201dbeac7c9db4163eaa485bf4cd64f47d/.gitmodules -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of 
Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at contact@scrapeulous.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:10-slim 2 | 3 | # Application parameters and variables 4 | # ENV NODE_ENV=production 5 | ENV HOST=0.0.0.0 6 | ENV PORT=3000 7 | ENV application_directory=/se-scraper 8 | ENV puppeteer_cluster_directory=/se-scraper/src/puppeteer-cluster 9 | 10 | # Create app directory 11 | WORKDIR $application_directory 12 | 13 | RUN apt-get update && \ 14 | apt-get install -y \ 15 | gconf-service \ 16 | libasound2 \ 17 | libatk1.0-0 \ 18 | libc6 \ 19 | libcairo2 \ 20 | libcups2 \ 21 | libdbus-1-3 \ 22 | libexpat1 \ 23 | libfontconfig1 \ 24 | libgcc1 \ 25 | libgconf-2-4 \ 26 | libgdk-pixbuf2.0-0 \ 27 | libglib2.0-0 \ 28 | libgtk-3-0 \ 29 | libnspr4 \ 30 | libpango-1.0-0 \ 31 | libpangocairo-1.0-0 \ 32 | libstdc++6 \ 33 | libx11-6 \ 34 | libx11-xcb1 \ 35 | libxcb1 \ 36 | libxcomposite1 \ 37 | libxcursor1 \ 38 | libxdamage1 \ 39 | libxext6 \ 40 | libxfixes3 \ 41 | libxi6 \ 42 | libxrandr2 \ 43 | libxrender1 \ 44 | libxss1 \ 45 | libxtst6 \ 46 | ca-certificates \ 47 | fonts-liberation \ 48 | libappindicator1 \ 49 | libnss3 \ 50 | lsb-release \ 51 | xdg-utils \ 52 | wget 53 | 54 | # Bundle app source 55 | COPY . . 56 | WORKDIR $puppeteer_cluster_directory 57 | RUN npm install \ 58 | && npm run build 59 | 60 | WORKDIR $application_directory 61 | # skip installing scripts for puppeteer dependencies 62 | # we've already installed puppeteer above. 63 | RUN npm install --ignore-scripts 64 | 65 | # Cleanup 66 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 67 | 68 | ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.2/dumb-init_1.2.2_amd64 /usr/local/bin/dumb-init 69 | RUN chmod +x /usr/local/bin/dumb-init 70 | 71 | EXPOSE $PORT 72 | 73 | CMD ["dumb-init", "node", "server/server.js"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 Nikolai Tschacher 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [The maintained successor of se-scraper is the general purpose crawling infrastructure](https://github.com/NikolaiT/Crawling-Infrastructure) 2 | 3 | ## Search Engine Scraper - se-scraper 4 | 5 | [![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper) 6 | [![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas) 7 | [![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper) 8 | 9 | This node module allows you to scrape search engines concurrently with different proxies. 10 | 11 | If you don't have extensive technical experience or don't want to purchase proxies, you can use [my scraping service](https://scrapeulous.com/). 12 | 13 | #### Table of Contents 14 | - [Installation](#installation) 15 | - [Docker](#docker-support) 16 | - [Minimal Example](#minimal-example) 17 | - [Quickstart](#quickstart) 18 | - [Contribute](#contribute) 19 | - [Using Proxies](#proxies) 20 | - [Custom Scrapers](#custom-scrapers) 21 | - [Examples](#examples) 22 | - [Scraping Model](#scraping-model) 23 | - [Technical Notes](#technical-notes) 24 | - [Advanced Usage](#advanced-usage) 25 | - [Special Query String Parameters for Search Engines](#query-string-parameters) 26 | 27 | 28 | Se-scraper supports the following search engines: 29 | * Google 30 | * Google News 31 | * Google News App version (https://news.google.com) 32 | * Google Image 33 | * Bing 34 | * Bing News 35 | * Infospace 36 | * Duckduckgo 37 | * Yandex 38 | * Webcrawler 39 | 40 | This module uses puppeteer and a modified version of [puppeteer-cluster](https://github.com/thomasdondorf/puppeteer-cluster/). It was created by the Developer of [GoogleScraper](https://github.com/NikolaiT/GoogleScraper), a module with 1800 Stars on Github. 41 | 42 | ## Installation 43 | 44 | You need a working installation of **node** and the **npm** package manager. 45 | 46 | 47 | For example, if you are using Ubuntu 18.04, you can install node and npm with the following commands: 48 | 49 | ```bash 50 | sudo apt update; 51 | 52 | sudo apt install nodejs; 53 | 54 | # recent version of npm 55 | curl -sL https://deb.nodesource.com/setup_10.x -o nodesource_setup.sh; 56 | sudo bash nodesource_setup.sh; 57 | sudo apt install npm; 58 | ``` 59 | 60 | Chrome and puppeteer [need some additional libraries to run on ubuntu](https://techoverflow.net/2018/06/05/how-to-fix-puppetteer-error-). 61 | 62 | This command will install dependencies: 63 | 64 | ```bash 65 | # install all that is needed by chromium browser. 
Maybe not everything needed 66 | sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget; 67 | ``` 68 | 69 | Install **se-scraper** by entering the following command in your terminal 70 | 71 | ```bash 72 | npm install se-scraper 73 | ``` 74 | 75 | If you **don't** want puppeteer to download a complete chromium browser, add this variable to your environment. Then this module is not guaranteed to run out of the box. 76 | 77 | ```bash 78 | export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 79 | ``` 80 | 81 | ### Docker Support 82 | 83 | I will maintain a public docker image of se-scraper. Pull the docker image with the command: 84 | 85 | ```bash 86 | docker pull tschachn/se-scraper 87 | ``` 88 | 89 | Confirm that the docker image was correctly pulled: 90 | 91 | ```bash 92 | docker image ls 93 | ``` 94 | 95 | Should show something like that: 96 | 97 | ``` 98 | tschachn/se-scraper latest 897e1aeeba78 21 minutes ago 1.29GB 99 | ``` 100 | 101 | You can check the [latest tag here](https://hub.docker.com/r/tschachn/se-scraper/tags). In the example below, the latest tag is **latest**. This will most likely remain **latest** in the future. 102 | 103 | Run the docker image and map the internal port 3000 to the external 104 | port 3000: 105 | 106 | ```bash 107 | $ docker run -p 3000:3000 tschachn/se-scraper:latest 108 | 109 | Running on http://0.0.0.0:3000 110 | ``` 111 | 112 | When the image is running, you may start scrape jobs via HTTP API: 113 | 114 | ```bash 115 | curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \ 116 | -d '{ 117 | "browser_config": { 118 | "random_user_agent": true 119 | }, 120 | "scrape_config": { 121 | "search_engine": "google", 122 | "keywords": ["test"], 123 | "num_pages": 1 124 | } 125 | }' 126 | ``` 127 | 128 | Many thanks goes to [slotix](https://github.com/NikolaiT/se-scraper/pull/21) for his tremendous help in setting up a docker image. 129 | 130 | 131 | ## Minimal Example 132 | 133 | Create a file named `minimal.js` with the following contents 134 | 135 | ```js 136 | const se_scraper = require('se-scraper'); 137 | 138 | (async () => { 139 | let scrape_job = { 140 | search_engine: 'google', 141 | keywords: ['lets go boys'], 142 | num_pages: 1, 143 | }; 144 | 145 | var results = await se_scraper.scrape({}, scrape_job); 146 | 147 | console.dir(results, {depth: null, colors: true}); 148 | })(); 149 | ``` 150 | 151 | Start scraping by firing up the command `node minimal.js` 152 | 153 | ## Quickstart 154 | 155 | Create a file named `run.js` with the following contents 156 | 157 | ```js 158 | const se_scraper = require('se-scraper'); 159 | 160 | (async () => { 161 | let browser_config = { 162 | debug_level: 1, 163 | output_file: 'examples/results/data.json', 164 | }; 165 | 166 | let scrape_job = { 167 | search_engine: 'google', 168 | keywords: ['news', 'se-scraper'], 169 | num_pages: 1, 170 | // add some cool google search settings 171 | google_settings: { 172 | gl: 'us', // The gl parameter determines the Google country to use for the query. 173 | hl: 'en', // The hl parameter determines the Google UI language to return results. 
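        // Note: gl expects a two-letter country code (e.g. 'us', 'de') and hl a language
        // code (e.g. 'en', 'fr'); both end up as query-string parameters on the Google URL
        // (see the "Query String Parameters" section below).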
174 | start: 0, // Determines the results offset to use, defaults to 0. 175 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 176 | }, 177 | }; 178 | 179 | var scraper = new se_scraper.ScrapeManager(browser_config); 180 | 181 | await scraper.start(); 182 | 183 | var results = await scraper.scrape(scrape_job); 184 | 185 | console.dir(results, {depth: null, colors: true}); 186 | 187 | await scraper.quit(); 188 | })(); 189 | ``` 190 | 191 | Start scraping by firing up the command `node run.js` 192 | 193 | ## Contribute 194 | 195 | I really need and love your help! However, scraping is a dirty business and it often takes me a lot of time to find failing selectors or missing JS logic. So if any search engine does not yield results to your liking, please create a **static test case** similar to [this static test of google](test/static_tests/google.js) that fails. I will try to correct se-scraper then. 196 | 197 | That's how you would proceed: 198 | 199 | 1. Copy the [static google test case](test/static_tests/google.js) 200 | 2. Remove all unnecessary testing code 201 | 3. Save a search to a file where se-scraper does not work correctly. 202 | 4. Implement the static test case using the saved search html where se-scraper currently fails. 203 | 5. Submit a new issue with the failing test case as a pull request 204 | 6. I will fix it! (or better: you submit a pull request directly) 205 | 206 | ## Proxies 207 | 208 | **se-scraper** will create one browser instance per proxy. So the maximum concurrency is the number of proxies plus one (your own IP). 209 | 210 | ```js 211 | const se_scraper = require('se-scraper'); 212 | 213 | (async () => { 214 | let browser_config = { 215 | debug_level: 1, 216 | output_file: 'examples/results/proxyresults.json', 217 | proxy_file: '/home/nikolai/.proxies', // one proxy per line 218 | log_ip_address: true, 219 | }; 220 | 221 | let scrape_job = { 222 | search_engine: 'google', 223 | keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'], 224 | num_pages: 1, 225 | }; 226 | 227 | var scraper = new se_scraper.ScrapeManager(browser_config); 228 | await scraper.start(); 229 | 230 | var results = await scraper.scrape(scrape_job); 231 | console.dir(results, {depth: null, colors: true}); 232 | await scraper.quit(); 233 | })(); 234 | ``` 235 | 236 | With a proxy file such as 237 | 238 | ```text 239 | socks5://53.34.23.55:55523 240 | socks4://51.11.23.22:22222 241 | ``` 242 | 243 | This will scrape with **three** browser instances, each with its own IP address. Unfortunately, it is currently not possible to scrape with different proxies per tab. Chromium does not support that. 244 | 245 | 246 | ## Custom Scrapers 247 | 248 | You can define your own scraper class and use it within se-scraper. 249 | 250 | [Check out this example](examples/custom_scraper.js), which defines a custom scraper for Ecosia. A minimal skeleton is sketched below.
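The sketch below shows the shape such a class takes, modeled on the Ecosia scraper in `examples/custom_scraper.js`: extend `se_scraper.Scraper`, implement the page-handling and parsing hooks, and pass the class as `search_engine` in the scrape job. The start URL and CSS selectors are placeholders for whatever site you target, so treat this as a starting point rather than a working scraper.

```js
const se_scraper = require('se-scraper');

class MyEngineScraper extends se_scraper.Scraper {

    // Navigate to the start page and confirm the search box is present.
    async load_start_page() {
        await this.page.goto('https://www.example-search.com/'); // placeholder URL
        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }
        return true;
    }

    // Type the keyword into the search box and submit the form.
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value('input[name="q"]', keyword);
        await input.focus();
        await this.page.keyboard.press('Enter');
    }

    // Wait until the result list has rendered.
    async wait_for_results() {
        await this.page.waitForSelector('.result', { timeout: this.STANDARD_TIMEOUT }); // placeholder selector
    }

    // Extract the interesting data from the rendered SERP.
    async parse_async(html) {
        return await this.page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.result').forEach((el) => { // placeholder selector
                results.push({ title: el.innerText });
            });
            return { results: results, no_results: results.length === 0 };
        });
    }

    // Return false when there is no further result page.
    async next_page() {
        const next = await this.page.$('.pagination-next'); // placeholder selector
        if (!next) {
            return false;
        }
        await next.click();
        return true;
    }

    // Optionally check whether the target site detected the scraper.
    async detected() {
    }
}

(async () => {
    let results = await se_scraper.scrape({ headless: true }, {
        search_engine: MyEngineScraper,
        keywords: ['se-scraper'],
        num_pages: 1,
    });
    console.dir(results, {depth: null, colors: true});
})();
```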
251 | 252 | 253 | ## Examples 254 | 255 | * [Reuse existing browser](examples/multiple_search_engines.js) yields [these results](examples/results/multiple_search_engines.json) 256 | * [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json) 257 | * [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json) 258 | * [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json) 259 | * [Inject your own scraping logic](examples/pluggable.js) 260 | * [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js) 261 | * [Scrape google maps/locations](examples/google_maps.js) yields [these results](examples/results/maps.json) 262 | 263 | 264 | ## Scraping Model 265 | 266 | **se-scraper** scrapes search engines only. In order to introduce concurrency into this library, it is necessary to define the scraping model. Then we can decide how we divide and conquer. 267 | 268 | #### Scraping Resources 269 | 270 | What are common scraping resources? 271 | 272 | 1. **Memory and CPU**. Necessary to launch multiple browser instances. 273 | 2. **Network Bandwidth**. Usually not the bottleneck. 274 | 3. **IP Addresses**. Websites often block IP addresses after a certain number of requests from the same IP address. This can be circumvented by using proxies. 275 | 4. Spoofable identifiers such as browser fingerprints or user agents. Those will be handled by **se-scraper**. 276 | 277 | #### Concurrency Model 278 | 279 | **se-scraper** should be able to run without any concurrency at all. This is the default case. No concurrency means only one browser/tab is searching at a time. 280 | 281 | For concurrent use, we will make use of a modified [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster). 282 | 283 | One scrape job is properly defined by 284 | 285 | * 1 search engine such as `google` 286 | * `M` pages 287 | * `N` keywords/queries 288 | * `K` proxies and `K+1` browser instances (because when we have no proxies available, we will scrape with our dedicated IP) 289 | 290 | Then **se-scraper** will create `K+1` dedicated browser instances, each with a unique IP address. Each browser will get `N/(K+1)` keywords and will issue `N/(K+1) * M` total requests to the search engine. 291 | 292 | The problem is that the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) only allows identical options for subsequent new browser instances. Therefore, it is not trivial to launch a cluster of browsers with distinct proxy settings. Right now, every browser has the same options. It's not possible to set options on a per-browser basis. 293 | 294 | Solution: 295 | 296 | 1. Create an [upstream proxy router](https://github.com/GoogleChrome/puppeteer/issues/678). 297 | 2. Modify the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) to accept a list of proxy strings and then pop() from this list at every new call to `workerInstance()` in [Cluster.ts](https://github.com/thomasdondorf/puppeteer-cluster/blob/master/src/Cluster.ts). I wrote an [issue here](https://github.com/thomasdondorf/puppeteer-cluster/issues/107). **I ended up doing this**. 298 | 299 | 300 | ## Technical Notes 301 | 302 | Scraping is done with a headless chromium browser using the automation library puppeteer.
Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol. 303 | 304 | If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at **hire@incolumitas.com**. 305 | 306 | The chromium browser is started with the following flags to prevent 307 | scraping detection. 308 | 309 | ```js 310 | var ADDITIONAL_CHROME_FLAGS = [ 311 | '--disable-infobars', 312 | '--window-position=0,0', 313 | '--ignore-certifcate-errors', 314 | '--ignore-certifcate-errors-spki-list', 315 | '--no-sandbox', 316 | '--disable-setuid-sandbox', 317 | '--disable-dev-shm-usage', 318 | '--disable-accelerated-2d-canvas', 319 | '--disable-gpu', 320 | '--window-size=1920x1080', 321 | '--hide-scrollbars', 322 | '--disable-notifications', 323 | ]; 324 | ``` 325 | 326 | Furthermore, to avoid loading unnecessary resources and to speed up 327 | scraping a great deal, we instruct chrome not to load images, css and media: 328 | 329 | ```js 330 | await page.setRequestInterception(true); 331 | page.on('request', (req) => { 332 | let type = req.resourceType(); 333 | const block = ['stylesheet', 'font', 'image', 'media']; 334 | if (block.includes(type)) { 335 | req.abort(); 336 | } else { 337 | req.continue(); 338 | } 339 | }); 340 | ``` 341 | 342 | #### Making puppeteer and headless chrome undetectable 343 | 344 | Consider the following resources: 345 | 346 | * https://antoinevastel.com/bot%20detection/2019/07/19/detecting-chrome-headless-v3.html 347 | * https://intoli.com/blog/making-chrome-headless-undetectable/ 348 | * https://intoli.com/blog/not-possible-to-block-chrome-headless/ 349 | * https://news.ycombinator.com/item?id=16179602 350 | 351 | **se-scraper** implements the countermeasures against headless chrome detection proposed on those sites. 352 | 353 | The most recent detection countermeasures can be found here: 354 | 355 | * https://github.com/paulirish/headless-cat-n-mouse/blob/master/apply-evasions.js 356 | 357 | **se-scraper** makes use of those anti-detection techniques. 358 | 359 | To check whether evasion works, you can test it by passing the `test_evasion` flag to the config: 360 | 361 | ```js 362 | let config = { 363 | // check if headless chrome escapes common detection techniques 364 | test_evasion: true 365 | }; 366 | ``` 367 | 368 | It will create a screenshot named `headless-test-result.png` in the directory where the scraper was started that shows whether all tests have passed. 369 | 370 | ## Advanced Usage 371 | 372 | Use **se-scraper** by calling it with a script such as the one below. 373 | 374 | ```js 375 | const se_scraper = require('se-scraper'); 376 | 377 | // those options need to be provided on startup 378 | // and cannot be given to se-scraper on scrape() calls 379 | let browser_config = { 380 | // the user agent to scrape with 381 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', 382 | // if random_user_agent is set to True, a random user agent is chosen 383 | random_user_agent: false, 384 | // whether to select manual settings in visible mode 385 | set_manual_settings: false, 386 | // log ip address data 387 | log_ip_address: false, 388 | // log http headers 389 | log_http_headers: false, 390 | // how long to sleep between requests. a random sleep interval within the range [a,b] 391 | // is drawn before every request. empty string for no sleeping.
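    // e.g. sleep_range: '[1,5]' should sleep between 1 and 5 seconds before each request
    // (the exact string format is an assumption inferred from the [a,b] notation above)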
392 | sleep_range: '', 393 | // which search engine to scrape 394 | search_engine: 'google', 395 | compress: false, // compress 396 | // whether debug information should be printed 397 | // level 0: print nothing 398 | // level 1: print most important info 399 | // ... 400 | // level 4: print all shit nobody wants to know 401 | debug_level: 1, 402 | keywords: ['nodejs rocks',], 403 | // whether to start the browser in headless mode 404 | headless: true, 405 | // specify flags passed to chrome here 406 | chrome_flags: [], 407 | // the number of pages to scrape for each keyword 408 | num_pages: 1, 409 | // path to output file, data will be stored in JSON 410 | output_file: '', 411 | // whether to also passthru all the html output of the serp pages 412 | html_output: false, 413 | // whether to return a screenshot of serp pages as b64 data 414 | screen_output: false, 415 | // whether to prevent images, css, fonts and media from being loaded 416 | // will speed up scraping a great deal 417 | block_assets: true, 418 | // path to js module that extends functionality 419 | // this module should export the functions: 420 | // get_browser, handle_metadata, close_browser 421 | //custom_func: resolve('examples/pluggable.js'), 422 | custom_func: '', 423 | throw_on_detection: false, 424 | // use a proxy for all connections 425 | // example: 'socks5://78.94.172.42:1080' 426 | // example: 'http://118.174.233.10:48400' 427 | proxy: '', 428 | // a file with one proxy per line. Example: 429 | // socks5://78.94.172.42:1080 430 | // http://118.174.233.10:48400 431 | proxy_file: '', 432 | // whether to use proxies only 433 | // when this is set to true, se-scraper will not use 434 | // your default IP address 435 | use_proxies_only: false, 436 | // check if headless chrome escapes common detection techniques 437 | // this is a quick test and should be used for debugging 438 | test_evasion: false, 439 | apply_evasion_techniques: true, 440 | // settings for puppeteer-cluster 441 | puppeteer_cluster_config: { 442 | timeout: 30 * 60 * 1000, // max timeout set to 30 minutes 443 | monitor: false, 444 | concurrency: Cluster.CONCURRENCY_BROWSER, 445 | maxConcurrency: 1, 446 | } 447 | }; 448 | 449 | (async () => { 450 | // scrape config can change on each scrape() call 451 | let scrape_config = { 452 | // which search engine to scrape 453 | search_engine: 'google', 454 | // an array of keywords to scrape 455 | keywords: ['cat', 'mouse'], 456 | // the number of pages to scrape for each keyword 457 | num_pages: 2, 458 | 459 | // OPTIONAL PARAMS BELOW: 460 | google_settings: { 461 | gl: 'us', // The gl parameter determines the Google country to use for the query. 462 | hl: 'fr', // The hl parameter determines the Google UI language to return results. 463 | start: 0, // Determines the results offset to use, defaults to 0. 464 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 465 | }, 466 | // instead of keywords you can specify a keyword_file. this overwrites the keywords array 467 | keyword_file: '', 468 | // how long to sleep between requests. a random sleep interval within the range [a,b] 469 | // is drawn before every request. empty string for no sleeping. 
470 | sleep_range: '', 471 | // path to output file, data will be stored in JSON 472 | output_file: 'output.json', 473 | // whether to prevent images, css, fonts from being loaded 474 | // will speed up scraping a great deal 475 | block_assets: false, 476 | // check if headless chrome escapes common detection techniques 477 | // this is a quick test and should be used for debugging 478 | test_evasion: false, 479 | apply_evasion_techniques: true, 480 | // log ip address data 481 | log_ip_address: false, 482 | // log http headers 483 | log_http_headers: false, 484 | }; 485 | 486 | let results = await se_scraper.scrape(browser_config, scrape_config); 487 | console.dir(results, {depth: null, colors: true}); 488 | })(); 489 | ``` 490 | 491 | [Output for the above script on my machine.](examples/results/advanced.json) 492 | 493 | ### Query String Parameters 494 | 495 | You can add your custom query string parameters to the configuration object by specifying a `google_settings` key. In general: `{{search engine}}_settings`. 496 | 497 | For example you can customize your google search with the following config: 498 | 499 | ```js 500 | let scrape_config = { 501 | search_engine: 'google', 502 | // use specific search engine parameters for various search engines 503 | google_settings: { 504 | google_domain: 'google.com', 505 | gl: 'us', // The gl parameter determines the Google country to use for the query. 506 | hl: 'us', // The hl parameter determines the Google UI language to return results. 507 | start: 0, // Determines the results offset to use, defaults to 0. 508 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 509 | }, 510 | } 511 | ``` 512 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ### 24.12.2018 2 | - fix interface to scrape() [DONE] 3 | - add to Github 4 | 5 | 6 | ### 24.1.2018 7 | - fix issue #3: add functionality to add keyword file 8 | 9 | ### 27.1.2019 10 | - Add functionality to block images and CSS from loading as described here: 11 | https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/ 12 | https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/ 13 | 14 | ### 29.1.2019 15 | - implement proxy support functionality 16 | - implement proxy check 17 | 18 | - implement scraping more than 1 page 19 | - do it for google 20 | - and bing 21 | - implement duckduckgo scraping 22 | 23 | 24 | ### 30.1.2019 25 | - modify all scrapers to use the generic class where it makes sense 26 | - Bing, Baidu, Google, Duckduckgo 27 | 28 | ### 7.2.2019 29 | - add num_requests to test cases [done] 30 | 31 | ### 25.2.2019 32 | - https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html 33 | - add support for browsing with multiple browsers, use this neat library: 34 | - https://github.com/thomasdondorf/puppeteer-cluster [done] 35 | 36 | 37 | ### 28.2.2019 38 | - write test case for multiple browsers/proxies 39 | - write test case and example for multiple tabs with bing 40 | - make README.md nicer. 
https://github.com/thomasdondorf/puppeteer-cluster/blob/master/README.md as template 41 | 42 | 43 | ### 11.6.2019 44 | - TODO: fix amazon scraping 45 | - change api of remaining test cases [done] 46 | - TODO: implement custom search engine parameters on scrape() 47 | 48 | ### 12.6.2019 49 | - remove unnecessary sleep() calls and replace with waitFor selectors 50 | 51 | 52 | ### 16.7.2019 53 | 54 | - resolve issues 55 | - fix this https://github.com/NikolaiT/se-scraper/issues/37 [done] 56 | 57 | - use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth 58 | 59 | - we will need to load at the concurrency impl of puppeteer-cluster [no typescript support :(), I will not support this right now] 60 | 61 | - user random user agents plugin: https://github.com/intoli/user-agents [done] 62 | 63 | - add screenshot capability (make the screen after parsing) 64 | - store as b64 [done] 65 | 66 | 67 | 68 | ### 12.8.2019 69 | 70 | - add static test case for bing [done] 71 | - add options that minimize `html_output` flag: 72 | `clean_html_output` will remove all JS and CSS from the html 73 | `clean_data_images` removes all data images from the html 74 | [done] 75 | 76 | 77 | ### 13.8.2019 78 | - Write test case for clean html output [done] 79 | - Consider better compression algorithm. [done] There is the brotli algorithm, but this is only supported 80 | in very recent versions of nodejs 81 | - what else can we remove from the dom [done] Removing comment nodes now! They are large in BING. 82 | - remove all whitespace and \n and \t from html 83 | 84 | ### TODO: 85 | 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done] 86 | 2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions 87 | 88 | 3. dont create a new tab when opening a new scraper 89 | -------------------------------------------------------------------------------- /examples/bing_multiple_browser_multiple_pages.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | var path = require('path'); 3 | var os = require("os"); 4 | 5 | const se_scraper = require('./../index.js'); 6 | var filepath_de = path.join(__dirname, '/data/keywords_de.txt'); 7 | 8 | function read_keywords_from_file(fpath) { 9 | let kws = fs.readFileSync(fpath).toString().split(os.EOL); 10 | // clean keywords 11 | kws = kws.filter((kw) => { 12 | return kw.trim().length > 0; 13 | }); 14 | return kws; 15 | } 16 | 17 | let keywords_de = read_keywords_from_file(filepath_de); 18 | 19 | const Cluster = { 20 | CONCURRENCY_PAGE: 1, // shares cookies, etc. 
21 | CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts) 22 | CONCURRENCY_BROWSER: 3, // no cookie sharing and individual processes (uses contexts) 23 | }; 24 | 25 | // those options need to be provided on startup 26 | // and cannot give to se-scraper on scrape() calls 27 | let browser_config = { 28 | // the user agent to scrape with 29 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', 30 | // if random_user_agent is set to True, a random user agent is chosen 31 | random_user_agent: true, 32 | verbose: true, 33 | // whether to start the browser in headless mode 34 | headless: true, 35 | is_local: false, 36 | throw_on_detection: false, 37 | puppeteer_cluster_config: { 38 | headless: true, 39 | timeout: 2 * 60 * 1000, // max timeout set to 2 minutes 40 | monitor: false, 41 | concurrency: 3, // one scraper per tab 42 | maxConcurrency: 3, // scrape with 5 tabs 43 | } 44 | }; 45 | 46 | (async () => { 47 | // scrape config can change on each scrape() call 48 | let scrape_config_bing_de = { 49 | // which search engine to scrape 50 | search_engine: 'bing', 51 | // an array of keywords to scrape 52 | keywords: keywords_de, 53 | // the number of pages to scrape for each keyword 54 | num_pages: 10, 55 | 56 | // OPTIONAL PARAMS BELOW: 57 | // https://docs.microsoft.com/en-us/rest/api/cognitiveservices-bingsearch/bing-web-api-v5-reference#query-parameters 58 | bing_settings: { 59 | cc: 'DE', // The cc parameter determines the country to use for the query. 60 | mkt: 'de-DE', // The mkt parameter determines the UI language to return results. 61 | offset: 0, // Determines the results offset to use, defaults to 0. 62 | count: 20, // Determines the number of results to show, defaults to 10. Maximum is 100. 63 | }, 64 | // how long to sleep between requests. a random sleep interval within the range [a,b] 65 | // is drawn before every request. empty string for no sleeping. 
66 | sleep_range: '', 67 | // path to output file, data will be stored in JSON 68 | output_file: 'examples/bing_de.json', 69 | // whether to prevent images, css, fonts from being loaded 70 | // will speed up scraping a great deal 71 | block_assets: true, 72 | // check if headless chrome escapes common detection techniques 73 | // this is a quick test and should be used for debugging 74 | test_evasion: false, 75 | apply_evasion_techniques: true, 76 | // log ip address data 77 | log_ip_address: false, 78 | // log http headers 79 | log_http_headers: false, 80 | }; 81 | 82 | let results = await se_scraper.scrape(browser_config, scrape_config_bing_de); 83 | console.dir(results.metadata, {depth: null, colors: true}); 84 | 85 | })(); -------------------------------------------------------------------------------- /examples/cleaned_html.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | const fs = require('fs'); 3 | 4 | (async () => { 5 | 6 | let kw = 'news iran' 7 | 8 | let scrape_job = { 9 | search_engine: 'baidu', 10 | keywords: [kw], 11 | num_pages: 1, 12 | html_output: true, 13 | // whether to strip JS and CSS from the html_output 14 | // has only an effect if `html_output` is true 15 | clean_html_output: true, 16 | // remove all data images from the html 17 | clean_data_images: true, 18 | }; 19 | 20 | var response = await se_scraper.scrape({}, scrape_job); 21 | 22 | console.dir(response, {depth: null, colors: true}); 23 | 24 | fs.writeFileSync('example_cleaned.html', response.results[kw]['1']['html']); 25 | })(); 26 | -------------------------------------------------------------------------------- /examples/custom_scraper.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | 3 | /* 4 | * This example shows how you can define your custom scraper class and use it 5 | * within se-scraper. 6 | */ 7 | class EcosiaScraper extends se_scraper.Scraper { 8 | 9 | constructor(...args) { 10 | super(...args); 11 | } 12 | 13 | async parse_async(html) { 14 | // In this example we use vanilla javascript to parse out the 15 | // interesting information from the search engine 16 | 17 | // you may also use a external library such as cheerio. 
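        // For example, a server-side parse with cheerio might look like this
        // (a sketch, assuming cheerio were installed; it is not used in this example):
        //   const cheerio = require('cheerio');
        //   const $ = cheerio.load(html);
        //   const titles = $('.results .result .result-title')
        //       .map((i, el) => $(el).text()).get();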
18 | 19 | return await this.page.evaluate(() => { 20 | var results = { 21 | num_results: '', 22 | no_results: false, 23 | effective_query: '', 24 | results: [], 25 | }; 26 | 27 | document.querySelectorAll('.results .result').forEach((result) => { 28 | var serp = {}; 29 | var title = result.querySelector('.result-title'); 30 | if (title) { 31 | serp.title = title.innerText; 32 | serp.link = title.getAttribute('href'); 33 | } 34 | 35 | var green = result.querySelector('.result-url'); 36 | if (green) { 37 | serp.green = green.getAttribute('href'); 38 | } 39 | 40 | var snippet = result.querySelector('.result-snippet'); 41 | 42 | if (snippet) { 43 | serp.snippet = snippet.innerText; 44 | } 45 | 46 | results.results.push(serp); 47 | }); 48 | 49 | var num_res = document.querySelector('.card-title-result-count'); 50 | if (num_res) { 51 | results.num_results = num_res.innerText; 52 | } 53 | 54 | results.no_results = document.querySelector('.empty-result') != null; 55 | 56 | var effective = document.querySelector('.query-context-text .result-title'); 57 | 58 | if (effective) { 59 | results.effective_query = effective.innerText; 60 | } 61 | 62 | return results; 63 | }); 64 | } 65 | 66 | async load_start_page() { 67 | let startUrl = 'https://www.ecosia.org/'; 68 | 69 | await this.page.goto(startUrl); 70 | 71 | try { 72 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 73 | } catch (e) { 74 | return false; 75 | } 76 | 77 | return true; 78 | } 79 | 80 | async search_keyword(keyword) { 81 | const input = await this.page.$('input[name="q"]'); 82 | await this.set_input_value(`input[name="q"]`, keyword); 83 | await this.sleep(50); 84 | await input.focus(); 85 | await this.page.keyboard.press("Enter"); 86 | } 87 | 88 | async next_page() { 89 | let next_page_link = await this.page.$('.pagination-next', {timeout: 1000}); 90 | if (!next_page_link) { 91 | return false; 92 | } 93 | await next_page_link.click(); 94 | 95 | return true; 96 | } 97 | 98 | async wait_for_results() { 99 | await this.page.waitForSelector('.results .result', { timeout: this.STANDARD_TIMEOUT }); 100 | } 101 | 102 | async detected() { 103 | // check whether scraping was detected. 104 | } 105 | } 106 | 107 | (async () => { 108 | 109 | let scrape_job = { 110 | search_engine: EcosiaScraper, 111 | keywords: ['lets go boys'], 112 | num_pages: 2, 113 | }; 114 | 115 | var results = await se_scraper.scrape({headless: true}, scrape_job); 116 | 117 | console.dir(results, {depth: null, colors: true}); 118 | 119 | })(); 120 | -------------------------------------------------------------------------------- /examples/delete_comments.js: -------------------------------------------------------------------------------- 1 | var nodeIterator = document.createNodeIterator( 2 | document.body, 3 | NodeFilter.SHOW_COMMENT, 4 | { acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } } 5 | ); 6 | 7 | // Remove all comment nodes 8 | while(nodeIterator.nextNode()){ 9 | var commentNode = nodeIterator.referenceNode; 10 | commentNode.remove(); 11 | } -------------------------------------------------------------------------------- /examples/detection_checker.js: -------------------------------------------------------------------------------- 1 | /* 2 | * See here for most recent detection avoidance: https://github.com/paulirish/headless-cat-n-mouse/blob/master/apply-evasions.js 3 | */ 4 | 5 | // We'll use Puppeteer is our browser automation framework. 
6 | const puppeteer = require('puppeteer'); 7 | 8 | // This is where we'll put the code to get around the tests. 9 | const preparePageForTests = async (page) => { 10 | // Pass the User-Agent Test. 11 | const userAgent = 'Mozilla/5.0 (X11; Linux x86_64)' + 12 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36'; 13 | await page.setUserAgent(userAgent); 14 | 15 | // Pass the Webdriver Test. 16 | await page.evaluateOnNewDocument(() => { 17 | const newProto = navigator.__proto__; 18 | delete newProto.webdriver; 19 | navigator.__proto__ = newProto; 20 | }); 21 | 22 | // Pass the Chrome Test. 23 | await page.evaluateOnNewDocument(() => { 24 | // We can mock this in as much depth as we need for the test. 25 | const mockObj = { 26 | app: { 27 | isInstalled: false, 28 | }, 29 | webstore: { 30 | onInstallStageChanged: {}, 31 | onDownloadProgress: {}, 32 | }, 33 | runtime: { 34 | PlatformOs: { 35 | MAC: 'mac', 36 | WIN: 'win', 37 | ANDROID: 'android', 38 | CROS: 'cros', 39 | LINUX: 'linux', 40 | OPENBSD: 'openbsd', 41 | }, 42 | PlatformArch: { 43 | ARM: 'arm', 44 | X86_32: 'x86-32', 45 | X86_64: 'x86-64', 46 | }, 47 | PlatformNaclArch: { 48 | ARM: 'arm', 49 | X86_32: 'x86-32', 50 | X86_64: 'x86-64', 51 | }, 52 | RequestUpdateCheckStatus: { 53 | THROTTLED: 'throttled', 54 | NO_UPDATE: 'no_update', 55 | UPDATE_AVAILABLE: 'update_available', 56 | }, 57 | OnInstalledReason: { 58 | INSTALL: 'install', 59 | UPDATE: 'update', 60 | CHROME_UPDATE: 'chrome_update', 61 | SHARED_MODULE_UPDATE: 'shared_module_update', 62 | }, 63 | OnRestartRequiredReason: { 64 | APP_UPDATE: 'app_update', 65 | OS_UPDATE: 'os_update', 66 | PERIODIC: 'periodic', 67 | }, 68 | }, 69 | }; 70 | 71 | window.navigator.chrome = mockObj; 72 | window.chrome = mockObj; 73 | }); 74 | 75 | // Pass the Permissions Test. 76 | await page.evaluateOnNewDocument(() => { 77 | const originalQuery = window.navigator.permissions.query; 78 | window.navigator.permissions.__proto__.query = parameters => 79 | parameters.name === 'notifications' 80 | ? Promise.resolve({state: Notification.permission}) 81 | : originalQuery(parameters); 82 | 83 | // Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js 84 | const oldCall = Function.prototype.call; 85 | function call() { 86 | return oldCall.apply(this, arguments); 87 | } 88 | Function.prototype.call = call; 89 | 90 | const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString"); 91 | const oldToString = Function.prototype.toString; 92 | 93 | function functionToString() { 94 | if (this === window.navigator.permissions.query) { 95 | return "function query() { [native code] }"; 96 | } 97 | if (this === functionToString) { 98 | return nativeToStringFunctionString; 99 | } 100 | return oldCall.call(oldToString, this); 101 | } 102 | Function.prototype.toString = functionToString; 103 | }); 104 | 105 | // Pass the Plugins Length Test. 106 | await page.evaluateOnNewDocument(() => { 107 | // Overwrite the `plugins` property to use a custom getter. 108 | Object.defineProperty(navigator, 'plugins', { 109 | // This just needs to have `length > 0` for the current test, 110 | // but we could mock the plugins too if necessary. 111 | get: () => [1, 2, 3, 4, 5] 112 | }); 113 | }); 114 | 115 | // Pass the Languages Test. 116 | await page.evaluateOnNewDocument(() => { 117 | // Overwrite the `plugins` property to use a custom getter. 
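        // It is the `languages` property that gets overridden below,
        // so navigator.languages reports a plausible non-empty list.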
118 | Object.defineProperty(navigator, 'languages', { 119 | get: () => ['en-US', 'en'] 120 | }); 121 | }); 122 | 123 | // Pass the iframe Test 124 | await page.evaluateOnNewDocument(() => { 125 | Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { 126 | get: function() { 127 | return window; 128 | } 129 | }); 130 | }); 131 | 132 | // Pass toString test, though it breaks console.debug() from working 133 | await page.evaluateOnNewDocument(() => { 134 | window.console.debug = () => { 135 | return null; 136 | }; 137 | }); 138 | }; 139 | 140 | (async () => { 141 | // Launch the browser in headless mode and set up a page. 142 | const browser = await puppeteer.launch({ 143 | args: ['--no-sandbox'], 144 | headless: true, 145 | }); 146 | const page = await browser.newPage(); 147 | 148 | // Prepare for the tests (not yet implemented). 149 | await preparePageForTests(page); 150 | 151 | // Navigate to the page that will perform the tests. 152 | const testUrl = 'https://intoli.com/blog/' + 153 | 'not-possible-to-block-chrome-headless/chrome-headless-test.html'; 154 | await page.goto(testUrl); 155 | 156 | // Save a screenshot of the results. 157 | await page.screenshot({path: 'headless-test-result.png'}); 158 | 159 | // Clean up. 160 | await browser.close() 161 | })(); -------------------------------------------------------------------------------- /examples/for_the_lulz.js: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Do not run this, this is probably illegal in your country ;) 4 | */ 5 | 6 | const se_scraper = require('./../index.js'); 7 | 8 | 9 | // generate some google dorks 10 | 11 | function genGoogleDorks(iter=4) { 12 | let lulz_keywords = []; 13 | ['seite', 'inicio', 'index'].forEach((x) => { 14 | for (var i = 0; i < iter; i++) { 15 | lulz_keywords.push( 16 | 'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"' 17 | ) 18 | } 19 | }); 20 | return lulz_keywords; 21 | } 22 | 23 | const lulz_keywords = genGoogleDorks(); 24 | console.log(lulz_keywords); 25 | 26 | 27 | // those options need to be provided on startup 28 | // and cannot give to se-scraper on scrape() calls 29 | let browser_config = { 30 | // if random_user_agent is set to True, a random user agent is chosen 31 | random_user_agent: true, 32 | headless: true, 33 | is_local: false, 34 | throw_on_detection: false, 35 | puppeteer_cluster_config: { 36 | headless: true, 37 | timeout: 2 * 60 * 1000, // max timeout set to 2 minutes 38 | monitor: false, 39 | concurrency: 3, // one scraper per tab 40 | maxConcurrency: 4, // scrape with 4 tabs 41 | } 42 | }; 43 | 44 | (async () => { 45 | // scrape config can change on each scrape() call 46 | let lulz_config = { 47 | // which search engine to scrape 48 | search_engine: 'google', 49 | // an array of keywords to scrape 50 | keywords: lulz_keywords, 51 | // the number of pages to scrape for each keyword 52 | num_pages: 3, 53 | // how long to sleep between requests. a random sleep interval within the range [a,b] 54 | // is drawn before every request. empty string for no sleeping. 
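// e.g. sleep_range: '[1,1]', as used in examples/multiple_search_engines.js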
55 | sleep_range: '', 56 | // path to output file, data will be stored in JSON 57 | output_file: 'goodboys.json', 58 | // whether to prevent images, css, fonts from being loaded 59 | // will speed up scraping a great deal 60 | block_assets: true, 61 | // check if headless chrome escapes common detection techniques 62 | // this is a quick test and should be used for debugging 63 | test_evasion: false, 64 | apply_evasion_techniques: true, 65 | // log ip address data 66 | log_ip_address: false, 67 | // log http headers 68 | log_http_headers: false, 69 | }; 70 | 71 | let results = await se_scraper.scrape(browser_config, lulz_config); 72 | 73 | const all_links = []; 74 | 75 | for (var kw in results) { 76 | for (var page in results[kw]) { 77 | for (var res of results[kw][page]['results']) { 78 | all_links.push(res.link); 79 | } 80 | } 81 | } 82 | 83 | console.log(all_links); 84 | const got = require('got'); // http client used for the requests below (a dependency in package.json) 85 | for (var link of all_links) { 86 | try { 87 | const response = await got(link.replace(/(id=\d+)/g, "$1'")); 88 | let html = response.body; 89 | if (html.includes('error') || html.includes('mysql')) { 90 | console.log('Got a mysql injection in ' + link); 91 | } 92 | } catch (error) { 93 | console.log(error.response.statusCode); 94 | } 95 | } 96 | 97 | })(); -------------------------------------------------------------------------------- /examples/gimage.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: '', 6 | }; 7 | 8 | let scrape_job = { 9 | search_engine: 'google_image', 10 | keywords: ['manaslu', 'everest', 'pitcairn'], 11 | num_pages: 1, 12 | }; 13 | 14 | var scraper = new se_scraper.ScrapeManager(browser_config); 15 | 16 | await scraper.start(); 17 | 18 | var results = await scraper.scrape(scrape_job); 19 | 20 | console.dir(results, {depth: null, colors: true}); 21 | 22 | await scraper.quit(); 23 | })(); 24 | -------------------------------------------------------------------------------- /examples/gnold.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/gnold.json', 6 | google_news_old_settings: { 7 | gl: 'us', // The gl parameter determines the Google country to use for the query. 8 | hl: 'fr', // The hl parameter determines the Google UI language to return results. 9 | start: 0, // Determines the results offset to use, defaults to 0. 10 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
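// These settings are picked up through the generic `<search_engine>_settings`
// mechanism (see build_start_url() in src/modules/se_scraper.js) and appended
// to the start URL as query parameters.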
11 | }, 12 | }; 13 | 14 | let scrape_job = { 15 | search_engine: 'google_news_old', 16 | keywords: ['news world'], 17 | num_pages: 1, 18 | }; 19 | 20 | var scraper = new se_scraper.ScrapeManager(browser_config); 21 | await scraper.start(); 22 | 23 | var results = await scraper.scrape(scrape_job); 24 | console.dir(results, {depth: null, colors: true}); 25 | await scraper.quit(); 26 | })(); 27 | -------------------------------------------------------------------------------- /examples/google_maps.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/maps.json', 6 | test_evasion: false, 7 | block_assets: false, 8 | headless: false, 9 | 10 | google_maps_settings: { 11 | scrape_in_detail: false, 12 | } 13 | }; 14 | 15 | let scrape_job = { 16 | search_engine: 'google_maps', 17 | keywords: ['Berlin Zahnarzt'], 18 | num_pages: 1, 19 | }; 20 | 21 | var scraper = new se_scraper.ScrapeManager(browser_config); 22 | 23 | await scraper.start(); 24 | 25 | var results = await scraper.scrape(scrape_job); 26 | 27 | console.dir(results, {depth: null, colors: true}); 28 | 29 | await scraper.quit(); 30 | })(); 31 | -------------------------------------------------------------------------------- /examples/headless-test-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NikolaiT/se-scraper/5a0eea201dbeac7c9db4163eaa485bf4cd64f47d/examples/headless-test-result.png -------------------------------------------------------------------------------- /examples/keywords.txt: -------------------------------------------------------------------------------- 1 | test 2 | water is blue -------------------------------------------------------------------------------- /examples/minimal.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | 3 | (async () => { 4 | 5 | let kws = [ 6 | 'https://www.linkedin.com/in/aakanksha-majhi-b24a8449', 7 | 'https://www.linkedin.com/in/aakash-srivastava-7374a830', 8 | 'https://www.linkedin.com/in/aakash-tiwari-019b8569', 9 | ]; 10 | 11 | let scrape_job = { 12 | search_engine: 'google', 13 | keywords: kws, 14 | num_pages: 1, 15 | }; 16 | 17 | var results = await se_scraper.scrape({}, scrape_job); 18 | 19 | console.dir(results, {depth: null, colors: true}); 20 | 21 | })(); 22 | -------------------------------------------------------------------------------- /examples/multiple_browsers.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | search_engine: 'google', 6 | random_user_agent: true, 7 | is_local: false, 8 | html_output: false, 9 | throw_on_detection: false, 10 | headless: true, 11 | puppeteer_cluster_config: { 12 | headless: true, 13 | timeout: 30 * 60 * 1000, // max timeout set to 30 minutes 14 | monitor: false, 15 | concurrency: 3, // 3 == CONCURRENCY_BROWSER 16 | maxConcurrency: 3, // 3 browsers will scrape 17 | }, 18 | }; 19 | 20 | let scrape_job = { 21 | search_engine: 'google', 22 | keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'], 23 | num_pages: 1, 24 | }; 25 | 26 | var scraper = new se_scraper.ScrapeManager(browser_config); 27 | 28 | await scraper.start(); 29 | 30 | var results = 
await scraper.scrape(scrape_job); 31 | 32 | console.dir(results, {depth: null, colors: true}); 33 | 34 | await scraper.quit(); 35 | })(); 36 | -------------------------------------------------------------------------------- /examples/multiple_search_engines.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | random_user_agent: true, 6 | write_meta_data: true, 7 | sleep_range: '[1,1]', 8 | headless: true, 9 | output_file: `examples/results/multiple_search_engines.json` 10 | }; 11 | 12 | let scrape_job = { 13 | search_engine: 'google', 14 | keywords: ['news', 'se-scraper'], 15 | num_pages: 1, 16 | }; 17 | 18 | var scraper = new se_scraper.ScrapeManager(browser_config); 19 | await scraper.start(); 20 | 21 | for (var se of ['google', 'bing']) { 22 | scrape_job.search_engine = se; 23 | var results = await scraper.scrape(scrape_job); 24 | console.dir(results, {depth: null, colors: true}); 25 | } 26 | 27 | await scraper.quit(); 28 | })(); 29 | 30 | -------------------------------------------------------------------------------- /examples/multiple_tabs.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | 3 | const Cluster = { 4 | CONCURRENCY_PAGE: 1, // shares cookies, etc. 5 | CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts) 6 | CONCURRENCY_BROWSER: 3, // no cookie sharing and individual processes (uses contexts) 7 | }; 8 | 9 | let keywords = ['New York', 10 | 'Los Angeles', 11 | 'Chicago', 12 | 'Houston', 13 | 'Philadelphia', 14 | 'Phoenix', 15 | 'San Antonio', 16 | 'San Diego', 17 | 'Dallas', 18 | 'San Jose', 19 | 'Austin', 20 | 'Indianapolis', 21 | 'Jacksonville', 22 | 'San Francisco', 23 | 'Columbus', 24 | 'Charlotte', 25 | 'Fort Worth', 26 | 'Detroit', 27 | 'El Paso', 28 | 'Memphis', 29 | 'Seattle', 30 | 'Denver', 31 | 'Washington', 32 | 'Boston', 33 | 'Nashville-Davidson', 34 | 'Baltimore', 35 | 'Oklahoma City', 36 | 'Louisville/Jefferson County', 37 | 'Portland', 38 | 'Las Vegas', 39 | 'Milwaukee', 40 | 'Albuquerque', 41 | 'Tucson', 42 | 'Fresno', 43 | 'Sacramento', 44 | 'Long Beach', 45 | 'Kansas City', 46 | 'Mesa', 47 | 'Virginia Beach', 48 | 'Atlanta', 49 | 'Colorado Springs', 50 | 'Omaha', 51 | 'Raleigh', 52 | 'Miami', 53 | 'Oakland', 54 | 'Minneapolis', 55 | 'Tulsa', 56 | 'Cleveland', 57 | 'Wichita', 58 | 'Arlington', 59 | 'New Orleans', 60 | 'Bakersfield', 61 | 'Tampa', 62 | 'Honolulu', 63 | 'Aurora', 64 | 'Anaheim', 65 | 'Santa Ana', 66 | 'St. Louis', 67 | 'Riverside', 68 | 'Corpus Christi', 69 | 'Lexington-Fayette', 70 | 'Pittsburgh', 71 | 'Anchorage', 72 | 'Stockton', 73 | 'Cincinnati', 74 | 'St. Paul', 75 | 'Toledo', 76 | 'Greensboro', 77 | 'Newark', 78 | 'Plano', 79 | 'Henderson', 80 | 'Lincoln', 81 | 'Buffalo', 82 | 'Jersey City', 83 | 'Chula Vista', 84 | 'Fort Wayne', 85 | 'Orlando', 86 | 'St. 
Petersburg', 87 | 'Chandler', 88 | 'Laredo', 89 | 'Norfolk', 90 | 'Durham', 91 | 'Madison', 92 | 'Lubbock', 93 | 'Irvine', 94 | 'Winston-Salem', 95 | 'Glendale', 96 | 'Garland', 97 | 'Hialeah', 98 | 'Reno', 99 | 'Chesapeake', 100 | 'Gilbert', 101 | 'Baton Rouge', 102 | 'Irving', 103 | 'Scottsdale', 104 | 'North Las Vegas', 105 | 'Fremont', 106 | 'Boise City', 107 | 'Richmond', 108 | 'San Bernardino']; 109 | 110 | let config = { 111 | search_engine: 'bing', 112 | debug: false, 113 | verbose: true, 114 | keywords: keywords, 115 | num_pages: 1, // how many pages per keyword 116 | output_file: 'examples/results/bing.json', 117 | log_ip_address: false, 118 | headless: true, 119 | puppeteer_cluster_config: { 120 | timeout: 10 * 60 * 1000, // max timeout set to 10 minutes 121 | monitor: false, 122 | concurrency: Cluster.CONCURRENCY_PAGE, // one scraper per tab 123 | maxConcurrency: 7, // scrape with 7 tabs 124 | } 125 | }; 126 | 127 | function callback(err, response) { 128 | if (err) { 129 | console.error(err) 130 | } 131 | console.dir(response, {depth: null, colors: true}); 132 | } 133 | 134 | se_scraper.scrape(config, callback); -------------------------------------------------------------------------------- /examples/per_page_proxy.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const ProxyChain = require('proxy-chain'); 3 | 4 | const ROUTER_PROXY = 'http://127.0.0.1:8000'; 5 | 6 | // SEE: https://github.com/GoogleChrome/puppeteer/issues/678 7 | // Idea: set up a local router proxy that assigns a distinct upstream proxy to each request, 8 | // identified by its unique user-agent string. This way one proxy can be used per Chromium tab. 9 | // downside: not fast and efficient 10 | 11 | const uas = [ 12 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36', 13 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', 14 | ]; 15 | 16 | const proxies = ['http://142.93.57.147:3128', 'http://85.132.31.115:8181']; 17 | 18 | (async () => { 19 | const browser = await puppeteer.launch({ 20 | headless: false, 21 | args: [`--proxy-server=${ROUTER_PROXY}`], 22 | }); 23 | const page1 = await browser.newPage(); 24 | const page2 = await browser.newPage(); 25 | 26 | try { 27 | await page1.setUserAgent(uas[0]); 28 | await page1.goto('https://www.whatsmyip.org/'); 29 | } catch (e) { 30 | console.log(e); 31 | } 32 | 33 | try { 34 | await page2.setUserAgent(uas[1]); 35 | await page2.goto('https://www.whatsmyip.org/'); 36 | } catch (e) { 37 | console.log(e); 38 | } 39 | 40 | //await browser.close(); 41 | })(); 42 | 43 | const server = new ProxyChain.Server({ 44 | // Port where the server will listen. By default 8000.
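// The per-tab routing happens in prepareRequestFunction below: it reads the
// User-Agent header of each incoming request and selects the matching upstream
// proxy from the `proxies` array.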
45 | port: 8000, 46 | 47 | // Enables verbose logging 48 | verbose: true, 49 | 50 | prepareRequestFunction: ({ 51 | request, 52 | username, 53 | password, 54 | hostname, 55 | port, 56 | isHttp, 57 | }) => { 58 | var upstreamProxyUrl; 59 | 60 | if (request.headers['user-agent'] === uas[0]) { 61 | upstreamProxyUrl = proxies[0]; 62 | } 63 | 64 | if (request.headers['user-agent'] === uas[1]) { 65 | upstreamProxyUrl = proxies[1]; 66 | } 67 | 68 | console.log('Using proxy: ' + upstreamProxyUrl); 69 | 70 | return { upstreamProxyUrl }; 71 | }, 72 | }); 73 | 74 | server.listen(() => { 75 | console.log(`Router Proxy server is listening on port ${8000}`); 76 | }); -------------------------------------------------------------------------------- /examples/pluggable.js: -------------------------------------------------------------------------------- 1 | module.exports = class Pluggable { 2 | constructor(options = {}) { 3 | const { 4 | chromeFlags = [ 5 | '--no-sandbox', 6 | '--disable-setuid-sandbox', 7 | '--disable-dev-shm-usage', 8 | '--disable-accelerated-2d-canvas', 9 | '--disable-gpu', 10 | '--window-size=1920x1080', 11 | '--hide-scrollbars', 12 | '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', 13 | ], 14 | headless = true, 15 | } = options; 16 | 17 | this.chromeFlags = chromeFlags; 18 | this.headless = headless; 19 | } 20 | 21 | async close_browser() { 22 | await this.browser.close(); 23 | } 24 | 25 | // Callback invoked after metadata has been gathered 26 | async handle_metadata(args) { 27 | // store scraping metadata somewhere 28 | } 29 | 30 | // Callback invoked after all keywords have been scraped 31 | async handle_results(args) { 32 | // store the results somewhere 33 | } 34 | 35 | // Callback invoked before a keyword is scraped. 36 | async before_keyword_scraped(args) { 37 | console.log('before keyword scraped.'); 38 | } 39 | 40 | // Callback invoked after a keyword has been scraped. 
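// se_scraper.js currently passes { results, num_keywords, num_requests, keyword }
// to before_keyword_scraped(); presumably the same argument shape is intended
// here once the TODO below is implemented.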
41 | // TODO: implement this 42 | async after_keyword_scraped(args) { 43 | console.log('after keyword scraped.') 44 | } 45 | 46 | async start_browser(args={}) { 47 | const puppeteer = require('puppeteer'); 48 | 49 | let launch_args = { 50 | args: args.chromeFlags || this.chromeFlags, 51 | headless: args.headless, 52 | }; 53 | 54 | if (launch_args.headless === undefined) { 55 | launch_args.headless = this.headless; 56 | } 57 | 58 | this.browser = await puppeteer.launch(launch_args); 59 | console.log('Loaded custom function get_browser()'); 60 | console.log(launch_args); 61 | 62 | return this.browser; 63 | } 64 | 65 | async do_work(page) { 66 | // do some scraping work and return results and num_requests 67 | 68 | } 69 | }; -------------------------------------------------------------------------------- /examples/pluggable_example.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | const resolve = require('path').resolve; 3 | 4 | (async () => { 5 | let browser_config = { 6 | test_evasion: false, 7 | log_http_headers: true, 8 | log_ip_address: true, 9 | random_user_agent: false, 10 | apply_evasion_techniques: false, 11 | screen_output: false, 12 | custom_func: resolve('./examples/pluggable.js'), 13 | headless: false, 14 | }; 15 | 16 | let scrape_job = { 17 | search_engine: 'google', 18 | keywords: ['news usa'], 19 | num_pages: 1, 20 | }; 21 | 22 | var scraper = new se_scraper.ScrapeManager(browser_config); 23 | 24 | await scraper.start(); 25 | 26 | var results = await scraper.scrape(scrape_job); 27 | 28 | console.dir(results, {depth: null, colors: true}); 29 | 30 | await scraper.quit(); 31 | })(); 32 | -------------------------------------------------------------------------------- /examples/proxies.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/proxyresults.json', 6 | log_ip_address: true, 7 | // a file with one proxy per line. 
Example: 8 | // socks5://78.94.172.42:1080 9 | // http://118.174.233.10:48400 10 | proxy_file: '/home/nikolai/.proxies', // one proxy per line 11 | // whether to use proxies only 12 | // when this is set to true, se-scraper will not use 13 | // your default IP address in a browser 14 | use_proxies_only: true, 15 | }; 16 | 17 | let scrape_job = { 18 | search_engine: 'google', 19 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 20 | num_pages: 1, 21 | }; 22 | 23 | var scraper = new se_scraper.ScrapeManager(browser_config); 24 | await scraper.start(); 25 | 26 | var results = await scraper.scrape(scrape_job); 27 | console.dir(results, {depth: null, colors: true}); 28 | await scraper.quit(); 29 | })(); 30 | -------------------------------------------------------------------------------- /examples/quickstart.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | test_evasion: false, 6 | log_http_headers: false, 7 | log_ip_address: false, 8 | random_user_agent: false, 9 | apply_evasion_techniques: true, 10 | screen_output: false, 11 | html_output: false, 12 | clean_html_output: true, 13 | }; 14 | 15 | let scrape_job = { 16 | search_engine: 'google', 17 | keywords: ['buy a nice car'], 18 | num_pages: 1, 19 | google_settings: { 20 | "gl": "us", 21 | "hl": "en", 22 | "start": 0, 23 | "num": 10 24 | } 25 | }; 26 | 27 | var scraper = new se_scraper.ScrapeManager(browser_config); 28 | 29 | await scraper.start(); 30 | 31 | var results = await scraper.scrape(scrape_job); 32 | 33 | console.dir(results, {depth: null, colors: true}); 34 | 35 | await scraper.quit(); 36 | })(); 37 | -------------------------------------------------------------------------------- /examples/reusing.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/data.json', 6 | }; 7 | 8 | let scrape_job = { 9 | search_engine: 'google', 10 | keywords: ['news', 'se-scraper'], 11 | num_pages: 1, 12 | }; 13 | 14 | let scrape_job2 = { 15 | search_engine: 'bing', 16 | keywords: ['test', 'what a wonderful world'], 17 | num_pages: 1, 18 | }; 19 | 20 | var scraper = new se_scraper.ScrapeManager(browser_config); 21 | await scraper.start(); 22 | 23 | var results = await scraper.scrape(scrape_job); 24 | console.dir(results, {depth: null, colors: true}); 25 | 26 | var results2 = await scraper.scrape(scrape_job2); 27 | console.dir(results2, {depth: null, colors: true}); 28 | 29 | await scraper.quit(); 30 | })(); 31 | -------------------------------------------------------------------------------- /examples/test_cluster.js: -------------------------------------------------------------------------------- 1 | const { Cluster } = require('../../puppeteer-cluster/dist/index.js'); 2 | var fs = require('fs'); 3 | var os = require("os"); 4 | 5 | const PROXY_FILE = '/home/nikolai/.proxies'; 6 | 7 | function read_items_from_file(fname) { 8 | let kws = fs.readFileSync(fname).toString().split(os.EOL); 9 | // clean keywords 10 | kws = kws.filter((kw) => { 11 | return kw.trim().length > 0; 12 | }); 13 | return kws; 14 | } 15 | 16 | (async () => { 17 | 18 | let browserArgs = [ 19 | '--disable-infobars', 20 | '--window-position=0,0', 21 | '--ignore-certifcate-errors', 22 | '--ignore-certifcate-errors-spki-list', 
23 | '--no-sandbox', 24 | '--disable-setuid-sandbox', 25 | '--disable-dev-shm-usage', 26 | '--disable-accelerated-2d-canvas', 27 | '--disable-gpu', 28 | '--window-size=1920x1080', 29 | '--hide-scrollbars', 30 | ]; 31 | 32 | let proxies = read_items_from_file(PROXY_FILE); 33 | 34 | console.dir(proxies); 35 | 36 | // each new call to workerInstance() will 37 | // left pop() one element from this list 38 | // maxConcurrency should be equal to perBrowserOptions.length 39 | 40 | // the first browser config with home IP 41 | let perBrowserOptions = [{ 42 | headless: false, 43 | ignoreHTTPSErrors: true, 44 | args: browserArgs 45 | }]; 46 | 47 | for (var proxy of proxies) { 48 | perBrowserOptions.push({ 49 | headless: false, 50 | ignoreHTTPSErrors: true, 51 | args: browserArgs.concat(`--proxy-server=${proxy}`) 52 | }) 53 | } 54 | 55 | const cluster = await Cluster.launch({ 56 | monitor: true, 57 | timeout: 12 * 60 * 60 * 1000, // 12 hours in ms 58 | concurrency: Cluster.CONCURRENCY_BROWSER, 59 | maxConcurrency: perBrowserOptions.length, 60 | puppeteerOptions: { 61 | headless: false, 62 | args: browserArgs, 63 | ignoreHTTPSErrors: true, 64 | }, 65 | perBrowserOptions: perBrowserOptions 66 | }); 67 | 68 | // Event handler to be called in case of problems 69 | cluster.on('taskerror', (err, data) => { 70 | console.log(`Error crawling ${data}: ${err.message}`); 71 | }); 72 | 73 | 74 | await cluster.task(async ({ page, data: url }) => { 75 | await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 20000}); 76 | const pageTitle = await page.evaluate(() => document.title); 77 | console.log(`Page title of ${url} is ${pageTitle}`); 78 | console.log(await page.content()); 79 | }); 80 | 81 | for(var i = 0; i < perBrowserOptions.length; i++) { 82 | await cluster.queue('http://ipinfo.io/json'); 83 | } 84 | 85 | await cluster.idle(); 86 | await cluster.close(); 87 | })(); 88 | -------------------------------------------------------------------------------- /examples/test_promise.js: -------------------------------------------------------------------------------- 1 | class Test { 2 | constructor(options = {}) { 3 | const { 4 | config = {}, 5 | } = options; 6 | 7 | this.config = config; 8 | } 9 | 10 | run(vars) { 11 | 12 | console.log(this.config) 13 | } 14 | } 15 | 16 | let o1 = new Test({config: {a: Math.random()}}); 17 | let o2 = new Test({config: {a: Math.random()}}); 18 | 19 | o1.run() 20 | o2.run() 21 | 22 | // (async () => { 23 | // 24 | // let prom = []; 25 | // 26 | // for (var i = 0; i < 3; i++) { 27 | // var obj = new Test({ 28 | // config: {a: Math.random()}, 29 | // }); 30 | // prom.push(new Promise(resolve => { 31 | // setTimeout(() => { new Test({ 32 | // config: {a: Math.random()}, 33 | // }).run(); resolve() }, 1000); 34 | // })); 35 | // } 36 | // 37 | // let res = await Promise.all(prom); 38 | // console.log(res); 39 | // 40 | // })(); -------------------------------------------------------------------------------- /examples/test_proxyflag.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | 3 | (async () => { 4 | const browser = await puppeteer.launch({ 5 | args: [ 6 | // SET PROXY HERE 7 | '--proxy-server=socks5://IP:PORT', 8 | '--disable-infobars', 9 | '--window-position=0,0', 10 | '--ignore-certifcate-errors', 11 | '--ignore-certifcate-errors-spki-list', 12 | '--disable-setuid-sandbox', 13 | '--disable-dev-shm-usage', 14 | '--disable-accelerated-2d-canvas', 15 | '--disable-gpu', 16 | 
'--window-size=1920x1080', 17 | '--hide-scrollbars', 18 | '--disable-notifications', 19 | '--no-sandbox', 20 | '--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36' 21 | ], 22 | headless: true 23 | }); 24 | var page = await browser.newPage(); 25 | await page.setViewport({width: 1920, height: 926}); 26 | await page.goto('http://ipinfo.io/json'); 27 | console.log(await page.content()); 28 | await browser.close(); 29 | })(); -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./src/node_scraper.js'); 2 | var Scraper = require('./src/modules/se_scraper'); 3 | 4 | async function scrape(browser_config, scrape_config) { 5 | // scrape config overwrites the browser_config 6 | Object.assign(browser_config, scrape_config); 7 | 8 | var scraper = new se_scraper.ScrapeManager(browser_config); 9 | 10 | await scraper.start(); 11 | 12 | var results = await scraper.scrape(scrape_config); 13 | 14 | await scraper.quit(); 15 | 16 | return results; 17 | } 18 | 19 | module.exports = { 20 | scrape: scrape, 21 | ScrapeManager: se_scraper.ScrapeManager, 22 | Scraper: Scraper, 23 | }; 24 | -------------------------------------------------------------------------------- /jformat.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import sys 3 | import json 4 | 5 | if len(sys.argv) == 2: 6 | print(pprint.pformat(json.load(open(sys.argv[1])))) -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "se-scraper", 3 | "version": "1.5.7", 4 | "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo", 5 | "homepage": "https://scrapeulous.com/", 6 | "main": "index.js", 7 | "scripts": { 8 | "test": "mocha test test/modules" 9 | }, 10 | "keywords": [ 11 | "scraping", 12 | "search-engines", 13 | "google", 14 | "bing", 15 | "web-scraping" 16 | ], 17 | "author": "Nikolai Tschacher (https://incolumitas.com/)", 18 | "repository": { 19 | "type": "git", 20 | "url": "https://github.com/NikolaiT/se-scraper" 21 | }, 22 | "license": "ISC", 23 | "dependencies": { 24 | "cheerio": "^1.0.0-rc.3", 25 | "debug": "^4.1.1", 26 | "got": "^9.6.0", 27 | "lodash": "^4.17.14", 28 | "puppeteer": "^2.0.0", 29 | "puppeteer-cluster": "^0.18.0", 30 | "puppeteer-extra": "^2.1.3", 31 | "puppeteer-extra-plugin-stealth": "^2.2.2", 32 | "user-agents": "^1.0.378", 33 | "winston": "^3.2.1" 34 | }, 35 | "devDependencies": { 36 | "bluebird": "^3.7.2", 37 | "chai": "^4.2.0", 38 | "chai-string": "^1.5.0", 39 | "express": "^4.17.1", 40 | "http-mitm-proxy": "^0.8.2", 41 | "key-cert": "^1.0.1", 42 | "mocha": "^6.1.4", 43 | "ua-parser-js": "^0.7.21" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /run.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./index.js'); 2 | 3 | // those options need to be provided on startup 4 | // and cannot give to se-scraper on scrape() calls 5 | let browser_config = { 6 | // the user agent to scrape with 7 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 
Safari/537.36', 8 | // if random_user_agent is set to True, a random user agent is chosen 9 | random_user_agent: false, 10 | // whether to start the browser in headless mode 11 | headless: false, 12 | // whether debug information should be printed 13 | // level 0: print nothing 14 | // level 1: print most important info 15 | // ... 16 | // level 4: print all shit nobody wants to know 17 | debug_level: 1, 18 | // specify flags passed to chrome here 19 | chrome_flags: [], 20 | // path to js module that extends functionality 21 | // this module should export the functions: 22 | // get_browser, handle_metadata, close_browser 23 | // must be an absolute path to the module 24 | //custom_func: resolve('examples/pluggable.js'), 25 | custom_func: '', 26 | // use a proxy for all connections 27 | // example: 'socks5://78.94.172.42:1080' 28 | // example: 'http://118.174.233.10:48400' 29 | proxy: '', 30 | // a file with one proxy per line. Example: 31 | // socks5://78.94.172.42:1080 32 | // http://118.174.233.10:48400 33 | proxy_file: '', 34 | puppeteer_cluster_config: { 35 | timeout: 10 * 60 * 1000, // max timeout set to 10 minutes 36 | monitor: false, 37 | concurrency: 1, // one scraper per tab 38 | maxConcurrency: 1, // scrape with 1 tab 39 | } 40 | }; 41 | 42 | (async () => { 43 | // scrape config can change on each scrape() call 44 | let scrape_config = { 45 | // which search engine to scrape 46 | search_engine: 'duckduckgo', 47 | // an array of keywords to scrape 48 | keywords: ['cloud service'], 49 | // the number of pages to scrape for each keyword 50 | num_pages: 1, 51 | 52 | // OPTIONAL PARAMS BELOW: 53 | // google_settings: { 54 | // gl: 'us', // The gl parameter determines the Google country to use for the query. 55 | // hl: 'fr', // The hl parameter determines the Google UI language to return results. 56 | // start: 0, // Determines the results offset to use, defaults to 0. 57 | // num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 58 | // }, 59 | // instead of keywords you can specify a keyword_file. this overwrites the keywords array 60 | keyword_file: '', 61 | // how long to sleep between requests. a random sleep interval within the range [a,b] 62 | // is drawn before every request. empty string for no sleeping. 63 | sleep_range: '', 64 | // path to output file, data will be stored in JSON 65 | output_file: '', 66 | // whether to prevent images, css, fonts from being loaded 67 | // will speed up scraping a great deal 68 | block_assets: false, 69 | // check if headless chrome escapes common detection techniques 70 | // this is a quick test and should be used for debugging 71 | test_evasion: false, 72 | apply_evasion_techniques: true, 73 | // log ip address data 74 | log_ip_address: false, 75 | // log http headers 76 | log_http_headers: false, 77 | }; 78 | 79 | let results = await se_scraper.scrape(browser_config, scrape_config); 80 | console.dir(results, {depth: null, colors: true}); 81 | })(); 82 | 83 | -------------------------------------------------------------------------------- /se-scraper.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/captcha_solver.js: -------------------------------------------------------------------------------- 1 | /* 2 | There are essentially two strategies to handle a search engine showing you a captcha: 3 | 4 | 1. 
Solve the captcha 5 | https://github.com/ecthros/uncaptcha2 6 | or use a captcha solving service such as https://anti-captcha.com/mainpage 7 | 8 | 2. Switch your IP address with rotating proxies 9 | 10 | */ 11 | 12 | /** 13 | * @name download recaptcha2 audio captcha 14 | * 15 | * There are several issues: 16 | * 17 | * Google sees that we are using an automated browser. 18 | * 19 | * In the worst case we have to completely control the browser ourselves without puppeteer. 20 | * 21 | * https://github.com/ecthros/uncaptcha2 22 | * 23 | * See here: 24 | * 25 | * https://gist.github.com/tegansnyder/c3aeae4d57768c58247ae6c4e5acd3d1 26 | * 27 | * https://github.com/GoogleChrome/puppeteer/issues/3039 28 | * 29 | * https://intoli.com/blog/making-chrome-headless-undetectable/ 30 | * 31 | * @desc Go to the https://www.google.com/recaptcha/api2/demo demo page and download the captcha 32 | */ 33 | 34 | const puppeteer = require('puppeteer'); 35 | const fs = require('fs'); 36 | const got = require('got'); 37 | 38 | try { 39 | (async () => { 40 | const browser = await puppeteer.launch({ 41 | args: [ 42 | '--proxy-server=socks5://78.94.172.42:1080', 43 | '--no-sandbox', 44 | '--disable-setuid-sandbox', 45 | '--disable-dev-shm-usage', 46 | '--disable-accelerated-2d-canvas', 47 | '--disable-gpu', 48 | '--window-size=1920x1080', 49 | '--hide-scrollbars', 50 | '--user-agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0"', 51 | ], 52 | headless: false, 53 | }); 54 | const page = await browser.newPage() 55 | await page.goto('https://www.google.com/recaptcha/api2/demo') 56 | 57 | await page.waitFor(1000); 58 | 59 | const frames = page.frames(); 60 | 61 | console.info('Available frames', frames.map(frame => frame.name())); 62 | console.info('Available frame urls', frames.map(frame => frame.url())); 63 | 64 | const frame = frames.find(frame => frame.url().includes('/recaptcha/api2/anchor?')); 65 | const content_frame = frames.find(frame => frame.url().includes('/recaptcha/api2/bframe?')); 66 | 67 | await frame.waitForSelector('#recaptcha-anchor', { timeout: 10000 }); 68 | await page.waitFor(1000); 69 | const button = await frame.$('#recaptcha-anchor'); 70 | await button.click(); 71 | 72 | await content_frame.waitForSelector('#recaptcha-audio-button'); 73 | 74 | const audio_button = await content_frame.$('#recaptcha-audio-button'); 75 | await audio_button.click(); 76 | await page.waitFor(1000); 77 | 78 | await content_frame.waitForSelector('.rc-audiochallenge-tdownload-link'); 79 | 80 | let download_link = await content_frame.evaluate(() => { 81 | return document.querySelectorAll('.rc-audiochallenge-tdownload-link').getAttribute('href'); 82 | }); 83 | console.log('Got audio download link: ', download_link); 84 | got.stream(download_link).pipe(fs.createWriteStream('audio.mp3')); 85 | 86 | await browser.close(); 87 | })() 88 | } catch (err) { 89 | console.error(err) 90 | } 91 | 92 | /* 93 | translate this shit into js: https://github.com/ecthros/uncaptcha2/blob/master/queryAPI.py 94 | */ 95 | async function translate_audio_file() { 96 | } -------------------------------------------------------------------------------- /src/concurrency-implementation.js: -------------------------------------------------------------------------------- 1 | const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency'); 2 | const debug = require('debug')('se-scraper:CustomConcurrency'); 3 | const { timeoutExecute } = require('puppeteer-cluster/dist/util'); 4 | 5 | const 
BROWSER_TIMEOUT = 5000; 6 | 7 | class CustomConcurrency extends Browser { 8 | 9 | async init() {} 10 | async close() {} 11 | 12 | async workerInstance() { 13 | const options = this.options.perBrowserOptions.shift(); 14 | debug('Launch puppeteer instance with options=%o', options); 15 | let chrome = await this.puppeteer.launch(options); 16 | let page; 17 | let context; 18 | 19 | return { 20 | jobInstance: async () => { 21 | await timeoutExecute(BROWSER_TIMEOUT, (async () => { 22 | context = await chrome.createIncognitoBrowserContext(); 23 | page = await context.newPage(); 24 | })()); 25 | 26 | return { 27 | resources: { 28 | page, 29 | }, 30 | 31 | close: async () => { 32 | await timeoutExecute(BROWSER_TIMEOUT, context.close()); 33 | }, 34 | }; 35 | }, 36 | 37 | close: async () => { 38 | await chrome.close(); 39 | }, 40 | 41 | repair: async () => { 42 | debug('Starting repair'); 43 | try { 44 | // will probably fail, but just in case the repair was not necessary 45 | await chrome.close(); 46 | } catch (e) {} 47 | 48 | // just relaunch as there is only one page per browser 49 | chrome = await this.puppeteer.launch(options); 50 | }, 51 | }; 52 | } 53 | }; 54 | 55 | module.exports = CustomConcurrency; -------------------------------------------------------------------------------- /src/modules/bing.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | const Scraper = require('./se_scraper'); 3 | 4 | class BingScraper extends Scraper { 5 | 6 | async parse_async(html) { 7 | 8 | let results = await this.page.evaluate(() => { 9 | 10 | let _text = (el, s) => { 11 | let n = el.querySelector(s); 12 | 13 | if (n) { 14 | return n.innerText; 15 | } else { 16 | return ''; 17 | } 18 | }; 19 | 20 | let _attr = (el, s, attr) => { 21 | let n = el.querySelector(s); 22 | 23 | if (n) { 24 | return n.getAttribute(attr); 25 | } else { 26 | return null; 27 | } 28 | }; 29 | 30 | let results = { 31 | num_results: '', 32 | no_results: false, 33 | effective_query: '', 34 | results: [], 35 | ads: [], 36 | right_side_ads: [], 37 | }; 38 | 39 | let num_results_el = document.querySelector('#b_content .sb_count'); 40 | 41 | if (num_results_el) { 42 | results.num_results = num_results_el.innerText; 43 | } 44 | 45 | let organic_results = document.querySelectorAll('#b_content #b_results .b_algo'); 46 | 47 | organic_results.forEach((el) => { 48 | 49 | let serp_obj = { 50 | link: _attr(el, 'h2 a', 'href'), 51 | title: _text(el, 'h2'), 52 | snippet: _text(el, '.b_caption p'), 53 | visible_link: _text(el, 'cite'), 54 | }; 55 | 56 | results.results.push(serp_obj); 57 | }); 58 | 59 | // check if no results 60 | results.no_results = (results.results.length === 0); 61 | 62 | // parse bing ads 63 | let ads = document.querySelectorAll('#b_results .b_ad .sb_add'); 64 | 65 | ads.forEach((el) => { 66 | 67 | let ad_obj = { 68 | title: _text(el, 'h2 a'), 69 | snippet: _text(el, '.b_caption p'), 70 | visible_link: _text(el, '.b_adurl cite'), 71 | tracking_link: _attr(el, 'h2 a', 'href'), 72 | }; 73 | 74 | results.ads.push(ad_obj); 75 | }); 76 | 77 | // right side ads 78 | let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add'); 79 | 80 | right_side_ads.forEach((el) => { 81 | 82 | let ad_obj = { 83 | title: _text(el, 'h2 a'), 84 | snippet: _text(el, '.b_caption p'), 85 | visible_link: _text(el, '.b_adurl cite'), 86 | tracking_link: _attr(el, 'h2 a', 'href'), 87 | }; 88 | 89 | results.right_side_ads.push(ad_obj); 90 | }); 91 | 92 | 93 | let 
effective_query_el = document.querySelector('#sp_requery a'); 94 | 95 | if (effective_query_el) { 96 | results.effective_query = effective_query_el.innerText; 97 | } 98 | 99 | return results; 100 | }); 101 | 102 | results.results = this.clean_results(results.results, ['title', 'link']); 103 | results.ads = this.clean_results(results.ads, ['title', 'visible_link', 'tracking_link']); 104 | results.time = (new Date()).toUTCString(); 105 | return results; 106 | } 107 | 108 | async load_start_page() { 109 | let startUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/'; 110 | 111 | if (this.config.bing_settings) { 112 | startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`; 113 | if (this.config.bing_settings.bing_domain) { 114 | startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`; 115 | } else { 116 | startUrl = `https://www.bing.com/search?`; 117 | } 118 | 119 | for (var key in this.config.bing_settings) { 120 | if (key !== 'bing_domain') { 121 | startUrl += `${key}=${this.config.bing_settings[key]}&` 122 | } 123 | } 124 | } 125 | 126 | await this.page.goto(startUrl); 127 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 128 | 129 | return true; 130 | } 131 | 132 | async search_keyword(keyword) { 133 | const input = await this.page.$('input[name="q"]'); 134 | await this.set_input_value(`input[name="q"]`, keyword); 135 | await this.sleep(50); 136 | await input.focus(); 137 | await this.page.keyboard.press("Enter"); 138 | } 139 | 140 | async next_page() { 141 | let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); 142 | if (!next_page_link) { 143 | return false; 144 | } 145 | 146 | this.last_response = await Promise.all([ 147 | next_page_link.click(), // The promise resolves after navigation has finished 148 | this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation 149 | ]); 150 | 151 | return true; 152 | } 153 | 154 | async wait_for_results() { 155 | await this.page.waitForSelector('#b_content', { timeout: this.STANDARD_TIMEOUT }); 156 | } 157 | 158 | async detected() { 159 | // TODO: I was actually never detected by bing. those are good boys. 160 | } 161 | } 162 | 163 | 164 | class BingNewsScraper extends Scraper { 165 | 166 | parse(html) { 167 | // load the page source into cheerio 168 | const $ = cheerio.load(html); 169 | 170 | // perform queries 171 | const results = []; 172 | $('#algocore .newsitem').each((i, link) => { 173 | results.push({ 174 | link: $(link).attr('url'), 175 | title: $(link).find('a.title').text(), 176 | snippet: $(link).find('.snippet').text(), 177 | date: $(link).find('.source span').last().text(), 178 | }) 179 | }); 180 | 181 | const cleaned = this.clean_results(results, ['title', 'link']); 182 | 183 | return { 184 | time: (new Date()).toUTCString(), 185 | results: cleaned, 186 | } 187 | } 188 | 189 | async load_start_page() { 190 | let startUrl = 'https://www.bing.com/news/search?'; 191 | 192 | try { 193 | await this.page.goto(startUrl); 194 | if (this.config.set_manual_settings === true) { 195 | console.log('Sleeping 30 seconds. 
Set your settings now.'); 196 | await this.sleep(30000); 197 | } 198 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 199 | } catch (e) { 200 | return false; 201 | } 202 | 203 | return true; 204 | } 205 | 206 | async search_keyword(keyword) { 207 | const input = await this.page.$('input[name="q"]'); 208 | await this.set_input_value(`input[name="q"]`, keyword); 209 | await this.sleep(50); 210 | await input.focus(); 211 | await this.page.keyboard.press("Enter"); 212 | } 213 | 214 | async next_page() { 215 | let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); 216 | if (!next_page_link) { 217 | return false; 218 | } 219 | 220 | this.last_response = await Promise.all([ 221 | next_page_link.click(), // The promise resolves after navigation has finished 222 | this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation 223 | ]); 224 | 225 | return true; 226 | } 227 | 228 | async wait_for_results() { 229 | await this.page.waitForSelector('#news', { timeout: this.STANDARD_TIMEOUT }); 230 | } 231 | 232 | async detected() { 233 | // TODO: I was actually never detected by bing news. 234 | } 235 | } 236 | 237 | module.exports = { 238 | BingNewsScraper: BingNewsScraper, 239 | BingScraper: BingScraper, 240 | }; 241 | -------------------------------------------------------------------------------- /src/modules/duckduckgo.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | const Scraper = require('./se_scraper'); 3 | const debug = require('debug')('se-scraper:DuckduckgoScraper'); 4 | 5 | class DuckduckgoScraper extends Scraper { 6 | 7 | parse(html) { 8 | debug('parse'); 9 | // load the page source into cheerio 10 | const $ = cheerio.load(html); 11 | 12 | // perform queries 13 | const results = []; 14 | const organicSelector = ($('#links .result--sep').length > 0) ? 
`#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body'; 15 | $(organicSelector).each((i, link) => { 16 | results.push({ 17 | link: $(link).find('.result__title .result__a').attr('href'), 18 | title: $(link).find('.result__title .result__a').text(), 19 | date: $(link).find('.result__timestamp').text(), 20 | snippet: $(link).find('.result__snippet').text(), 21 | visible_link: $(link).find('.result__url').attr('href'), 22 | }); 23 | }); 24 | 25 | const ads = []; 26 | $('.results--ads .result').each((i, element) => { 27 | ads.push({ 28 | visible_link: $(element).find('.result__url').text(), 29 | tracking_link: $(element).find('.result__title .result__a').attr('href'), 30 | title: $(element).find('.result__title .result__a').text(), 31 | snippet: $(element).find('.result__snippet').text(), 32 | }) 33 | }); 34 | 35 | let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || ''; 36 | 37 | const cleaned = this.clean_results(results, ['title', 'link']); 38 | 39 | return { 40 | time: (new Date()).toUTCString(), 41 | effective_query: effective_query, 42 | results: cleaned, 43 | ads: ads, 44 | } 45 | } 46 | 47 | async load_start_page() { 48 | debug('load_start_page'); 49 | let startUrl = 'https://duckduckgo.com/'; 50 | 51 | this.last_response = await this.page.goto(startUrl); 52 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 53 | 54 | return true; 55 | } 56 | 57 | async search_keyword(keyword) { 58 | debug('search_keyword'); 59 | const input = await this.page.$('input[name="q"]'); 60 | await this.set_input_value(`input[name="q"]`, keyword); 61 | await this.sleep(50); 62 | await input.focus(); 63 | await this.page.keyboard.press("Enter"); 64 | } 65 | 66 | async next_page() { 67 | debug('next_page'); 68 | let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT}); 69 | if (!next_page_link) { 70 | return false; 71 | } 72 | await next_page_link.click(); 73 | await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT }); 74 | 75 | return true; 76 | } 77 | 78 | async wait_for_results() { 79 | debug('wait_for_results'); 80 | await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT }); 81 | } 82 | 83 | async detected() { 84 | } 85 | } 86 | 87 | module.exports = { 88 | DuckduckgoScraper: DuckduckgoScraper, 89 | }; -------------------------------------------------------------------------------- /src/modules/infospace.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | const Scraper = require('./se_scraper'); 3 | 4 | class InfospaceScraper extends Scraper { 5 | 6 | parse(html) { 7 | // load the page source into cheerio 8 | const $ = cheerio.load(html); 9 | 10 | // perform queries 11 | const results = []; 12 | $('.result').each((i, link) => { 13 | results.push({ 14 | link: $(link).find('a.title').attr('href'), 15 | title: $(link).find('a.title').text(), 16 | snippet: $(link).find('.description').text(), 17 | visible_link: $(link).find('.url').text(), 18 | }) 19 | }); 20 | 21 | const cleaned = []; 22 | for (var i=0; i < results.length; i++) { 23 | let res = results[i]; 24 | if (res.link && res.link.trim()) { 25 | res.rank = this.result_rank++; 26 | cleaned.push(res); 27 | } 28 | } 29 | 30 | let no_results = this.no_results( 31 | ['No search results were found for'], 32 | $('.layout__mainline').text() 33 | ); 34 | 35 | return { 36 | time: (new Date()).toUTCString(), 37 | 
no_results: no_results, 38 | num_results: '', 39 | results: cleaned, 40 | } 41 | } 42 | 43 | async load_start_page() { 44 | 45 | let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html'; 46 | 47 | try { 48 | this.last_response = await this.page.goto(startUrl); 49 | await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); 50 | } catch (e) { 51 | return false; 52 | } 53 | return true; 54 | } 55 | 56 | async search_keyword(keyword) { 57 | const input = await this.page.$('input[id="q"]'); 58 | await this.set_input_value('input[id="q"]', keyword); 59 | await this.sleep(50); 60 | await input.focus(); 61 | await this.page.keyboard.press("Enter"); 62 | } 63 | 64 | async next_page() { 65 | let next_page_link = await this.page.$('a.next', {timeout: 1000}); 66 | if (!next_page_link) { 67 | return false; 68 | } 69 | await next_page_link.click(); 70 | this.last_response = await this.page.waitForNavigation(); 71 | 72 | return true; 73 | } 74 | 75 | async wait_for_results() { 76 | await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector. 77 | } 78 | 79 | async detected() { 80 | } 81 | } 82 | 83 | class WebcrawlerNewsScraper extends Scraper { 84 | 85 | parse(html) { 86 | // load the page source into cheerio 87 | const $ = cheerio.load(html); 88 | 89 | // perform queries 90 | const results = []; 91 | $('.article').each((i, link) => { 92 | let source = $(link).find('.source').text(); 93 | let date = source.split(',')[1] || ''; 94 | results.push({ 95 | link: $(link).find('a').attr('href'), 96 | title: $(link).find('.title').text(), 97 | publisher: $(link).find('.source').text(), 98 | date: date, 99 | snippet: $(link).find('.description').text(), 100 | }); 101 | }); 102 | 103 | const cleaned = this.clean_results(results, ['title', 'link']); 104 | 105 | return { 106 | time: (new Date()).toUTCString(), 107 | results: cleaned 108 | } 109 | } 110 | 111 | async load_start_page() { 112 | try { 113 | this.last_response = await this.page.goto('https://www.webcrawler.com/?qc=news'); 114 | await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); 115 | } catch (e) { 116 | return false; 117 | } 118 | return true; 119 | } 120 | 121 | async search_keyword(keyword) { 122 | const input = await this.page.$('input[name="q"]'); 123 | await this.set_input_value('input[name="q"]', keyword); 124 | await this.sleep(50); 125 | await input.focus(); 126 | await this.page.keyboard.press("Enter"); 127 | } 128 | 129 | async next_page() { 130 | let next_page_link = await this.page.$('.pagination__num--next', {timeout: 1000}); 131 | if (!next_page_link) { 132 | return false; 133 | } 134 | await next_page_link.click(); 135 | await this.page.waitForNavigation(); 136 | 137 | return true; 138 | } 139 | 140 | async wait_for_results() { 141 | await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); 142 | } 143 | 144 | async detected() { 145 | } 146 | } 147 | 148 | module.exports = { 149 | InfospaceScraper: InfospaceScraper, 150 | WebcrawlerNewsScraper: WebcrawlerNewsScraper, 151 | }; -------------------------------------------------------------------------------- /src/modules/metadata.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | 3 | module.exports = { 4 | get_ip_data: get_ip_data, 5 | get_http_headers: get_http_headers, 6 | }; 7 | 8 | async function get_ip_data(page) { 9 | await 
page.goto('https://ipinfo.io/json', { 10 | waitLoad: true, 11 | waitNetworkIdle: true 12 | }); 13 | let json = await page.content({ 14 | timeout: 20000 15 | }); 16 | const $ = cheerio.load(json); 17 | let ipinfo_text = $('pre').text(); 18 | return JSON.parse(ipinfo_text); 19 | } 20 | 21 | async function get_http_headers(page) { 22 | await page.goto('https://httpbin.org/get', { 23 | waitLoad: true, 24 | waitNetworkIdle: true 25 | }); 26 | let headers = await page.content(); 27 | 28 | const $ = cheerio.load(headers); 29 | let headers_text = $('pre').text(); 30 | return JSON.parse(headers_text); 31 | } -------------------------------------------------------------------------------- /src/modules/se_scraper.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const meta = require('./metadata.js'); 3 | const debug = require('debug')('se-scraper:Scraper'); 4 | /* 5 | Get useful JS knowledge and get awesome... 6 | 7 | Read this shit: https://javascript.info/class-inheritance 8 | And this: https://medium.freecodecamp.org/here-are-examples-of-everything-new-in-ecmascript-2016-2017-and-2018-d52fa3b5a70e 9 | */ 10 | 11 | module.exports = class Scraper { 12 | constructor(options = {}) { 13 | debug('constructor'); 14 | const { 15 | config = {}, 16 | context = {}, 17 | pluggable = null, 18 | page = null, 19 | } = options; 20 | 21 | this.page = page; 22 | this.last_response = null; // the last response object 23 | this.metadata = { 24 | scraping_detected: false, 25 | }; 26 | this.pluggable = pluggable; 27 | this.config = config; 28 | this.logger = this.config.logger; 29 | this.context = context; 30 | 31 | this.proxy = config.proxy; 32 | this.keywords = config.keywords; 33 | 34 | this.STANDARD_TIMEOUT = 10000; 35 | this.SOLVE_CAPTCHA_TIME = 45000; 36 | 37 | this.results = {}; 38 | this.result_rank = 1; 39 | // keep track of the requests done 40 | this.num_requests = 0; 41 | // keep track of the keywords searched 42 | this.num_keywords = 0; 43 | 44 | let settings = this.config[`${this.config.search_engine}_settings`]; 45 | if (settings) { 46 | if (typeof settings === 'string') { 47 | settings = JSON.parse(settings); 48 | this.config[`${this.config.search_engine}_settings`] = settings; 49 | } 50 | } 51 | } 52 | 53 | async run({page, data, worker}) { 54 | 55 | debug('worker=%o', worker, this.config.keywords); 56 | 57 | if (page) { 58 | this.page = page; 59 | } 60 | 61 | await this.page.setViewport({ width: 1920, height: 1040 }); 62 | let do_continue = true; 63 | 64 | if (this.config.scrape_from_file.length <= 0) { 65 | do_continue = await this.load_search_engine(); 66 | } 67 | 68 | if (!do_continue) { 69 | console.error('Failed to load the search engine: load_search_engine()'); 70 | } else { 71 | await this.scraping_loop(); 72 | } 73 | 74 | return { 75 | results: this.results, 76 | metadata: this.metadata, 77 | num_requests: this.num_requests, 78 | } 79 | } 80 | 81 | /** 82 | * Action that runs only once in the beginning of the 83 | * scraping procedure. 84 | * 85 | * @returns {Promise} true if everything is correct. 
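* Applies the headless-detection evasions, optional asset blocking, the optional
* test_evasion screenshot and IP/header logging, then hands off to load_start_page().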
86 | */ 87 | async load_search_engine() { 88 | 89 | if (this.config.apply_evasion_techniques === true) { 90 | // prevent detection by evading common detection techniques 91 | await evadeChromeHeadlessDetection(this.page); 92 | } 93 | 94 | // block some assets to speed up scraping 95 | if (this.config.block_assets === true) { 96 | await this.page.setRequestInterception(true); 97 | this.page.on('request', (req) => { 98 | let type = req.resourceType(); 99 | const block = ['stylesheet', 'font', 'image', 'media']; 100 | if (block.includes(type)) { 101 | req.abort(); 102 | } else { 103 | req.continue(); 104 | } 105 | }); 106 | } 107 | 108 | if (this.config.test_evasion === true) { 109 | // Navigate to the page that will perform the tests. 110 | const testUrl = 'https://bot.sannysoft.com'; 111 | await this.page.goto(testUrl); 112 | // Save a screenshot of the results. 113 | await this.page.screenshot({path: 'headless-evasion-result.png'}); 114 | } 115 | 116 | if (this.config.log_http_headers === true) { 117 | this.metadata.http_headers = await meta.get_http_headers(this.page); 118 | debug('this.metadata.http_headers=%O', this.metadata.http_headers); 119 | } 120 | 121 | if (this.config.log_ip_address === true) { 122 | let ipinfo = await meta.get_ip_data(this.page); 123 | this.metadata.ipinfo = ipinfo; 124 | debug('this.metadata.ipinfo', this.metadata.ipinfo); 125 | } 126 | 127 | // check that our proxy is working by confirming 128 | // that ipinfo.io sees the proxy IP address 129 | if (this.proxy && this.config.log_ip_address === true) { 130 | debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`); 131 | 132 | // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here 133 | if (!this.proxy.includes(this.metadata.ipinfo.ip)) { 134 | throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`); 135 | } else { 136 | this.logger.info(`Using valid Proxy: ${this.proxy}`); 137 | } 138 | 139 | } 140 | 141 | return await this.load_start_page(); 142 | } 143 | 144 | /** 145 | * Each scraper basically iterates over a list of 146 | * keywords and a list of pages. This is the generic 147 | * method for that. 148 | * 149 | * @returns {Promise} 150 | */ 151 | async scraping_loop() { 152 | for (var keyword of this.keywords) { 153 | this.num_keywords++; 154 | this.keyword = keyword; 155 | this.results[keyword] = {}; 156 | this.result_rank = 1; 157 | 158 | try { 159 | 160 | if (this.pluggable && this.pluggable.before_keyword_scraped) { 161 | await this.pluggable.before_keyword_scraped({ 162 | results: this.results, 163 | num_keywords: this.num_keywords, 164 | num_requests: this.num_requests, 165 | keyword: keyword, 166 | }); 167 | } 168 | 169 | this.page_num = 1; 170 | 171 | // load scraped page from file if `scrape_from_file` is given 172 | if (this.config.scrape_from_file.length <= 0) { 173 | await this.search_keyword(keyword); 174 | } else { 175 | this.last_response = await this.page.goto(this.config.scrape_from_file); 176 | } 177 | 178 | // when searching the keyword fails, num_requests will not 179 | // be incremented. 180 | this.num_requests++; 181 | 182 | do { 183 | 184 | this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`); 185 | 186 | await this.wait_for_results(); 187 | 188 | if (this.config.sleep_range) { 189 | await this.random_sleep(); 190 | } 191 | 192 | let html = await this.page.content(); 193 | let parsed = this.parse(html); 194 | this.results[keyword][this.page_num] = parsed ? 
parsed : await this.parse_async(html); 195 | 196 | if (this.config.screen_output) { 197 | this.results[keyword][this.page_num].screenshot = await this.page.screenshot({ 198 | encoding: 'base64', 199 | fullPage: false, 200 | }); 201 | } 202 | 203 | if (this.config.html_output) { 204 | 205 | if (this.config.clean_html_output) { 206 | await this.page.evaluate(() => { 207 | // remove script and style tags 208 | Array.prototype.slice.call(document.getElementsByTagName('script')).forEach( 209 | function(item) { 210 | item.remove(); 211 | }); 212 | Array.prototype.slice.call(document.getElementsByTagName('style')).forEach( 213 | function(item) { 214 | item.remove(); 215 | }); 216 | 217 | // remove all comment nodes 218 | var nodeIterator = document.createNodeIterator( 219 | document.body, 220 | NodeFilter.SHOW_COMMENT, 221 | { acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } } 222 | ); 223 | while(nodeIterator.nextNode()){ 224 | var commentNode = nodeIterator.referenceNode; 225 | commentNode.remove(); 226 | } 227 | }); 228 | } 229 | 230 | if (this.config.clean_data_images) { 231 | await this.page.evaluate(() => { 232 | Array.prototype.slice.call(document.getElementsByTagName('img')).forEach( 233 | function(item) { 234 | let src = item.getAttribute('src'); 235 | if (src && src.startsWith('data:')) { 236 | item.setAttribute('src', ''); 237 | } 238 | }); 239 | }); 240 | } 241 | 242 | let html_contents = await this.page.content(); 243 | // https://stackoverflow.com/questions/27841112/how-to-remove-white-space-between-html-tags-using-javascript 244 | // TODO: not sure if this is safe! 245 | html_contents = html_contents.replace(/>\s+</g, '><'); 246 | this.results[keyword][this.page_num].html = html_contents; 247 | } 248 | 249 | this.page_num += 1; 250 | 251 | // only load the next page when the while loop 252 | // will run another iteration 253 | if (this.page_num <= this.config.num_pages) { 254 | 255 | let next_page_loaded = await this.next_page(); 256 | 257 | if (next_page_loaded === false) { 258 | break; 259 | } else { 260 | this.num_requests++; 261 | } 262 | } 263 | 264 | } while (this.page_num <= this.config.num_pages); 265 | 266 | } catch (e) { 267 | 268 | this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`); 269 | debug('this.last_response=%O', this.last_response); 270 | 271 | if (this.config.take_screenshot_on_error) { 272 | await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` }); 273 | } 274 | 275 | this.metadata.scraping_detected = await this.detected(); 276 | 277 | if (this.metadata.scraping_detected === true) { 278 | this.logger.warn(`${this.config.search_engine_name} detected the scraping!`); 279 | 280 | if (this.config.is_local === true) { 281 | this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`); 282 | await this.sleep(this.SOLVE_CAPTCHA_TIME); 283 | // expect that user filled out necessary captcha 284 | } else { 285 | if (this.config.throw_on_detection === true) { 286 | throw( e ); 287 | } else { 288 | return; 289 | } 290 | } 291 | } else { 292 | // some other error, quit scraping process if stuff is broken 293 | if (this.config.throw_on_detection === true) { 294 | throw( e ); 295 | } else { 296 | return; 297 | } 298 | } 299 | } 300 | } 301 | } 302 | 303 | /** 304 | * Generic function to append queryArgs to a search engine url. 305 | * 306 | * @param baseUrl The base url to use for the build process.
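 *
 * Rough illustration with hypothetical values (not taken from the source):
 * given `search_engine: 'google'` and `google_settings: { hl: 'en', num: 10 }`,
 * the loop below turns a baseUrl of 'https://www.google.com/search?' into
 * 'https://www.google.com/search?hl=en&num=10&' (values are appended verbatim,
 * with no URL encoding, and a trailing '&' is left in place).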
307 | */ 308 | build_start_url(baseUrl) { 309 | let settings = this.config[`${this.config.search_engine}_settings`]; 310 | 311 | if (settings) { 312 | for (var key in settings) { 313 | baseUrl += `${key}=${settings[key]}&` 314 | } 315 | 316 | this.logger.info('Using startUrl: ' + baseUrl); 317 | 318 | return baseUrl; 319 | } 320 | 321 | return false; 322 | } 323 | 324 | sleep(ms) { 325 | return new Promise(resolve => { 326 | setTimeout(resolve, ms) 327 | }) 328 | } 329 | 330 | async random_sleep() { 331 | const [min, max] = this.config.sleep_range; 332 | let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number 333 | this.logger.info(`Sleeping for ${rand}s`); 334 | await this.sleep(rand * 1000); 335 | } 336 | 337 | async set_input_value(selector, value) { 338 | await this.page.waitFor(selector); 339 | await this.page.evaluate((value, selector) => { 340 | return document.querySelector(selector).value = value; 341 | }, value, selector); 342 | } 343 | 344 | no_results(needles, html) { 345 | for (let needle of needles) { 346 | if (html.includes(needle)) { 347 | this.logger.warn(`HTML contains needle ${needle}. no_results=true`); 348 | return true; 349 | } 350 | } 351 | return false; 352 | } 353 | 354 | /* 355 | Throw away all elements that do not have data in the 356 | specified attributes. Most be of value string. 357 | */ 358 | clean_results(results, attributes) { 359 | const cleaned = []; 360 | for (var res of results) { 361 | let goodboy = true; 362 | for (var attr of attributes) { 363 | if (!res[attr] || !res[attr].trim()) { 364 | goodboy = false; 365 | break; 366 | } 367 | } 368 | if (goodboy) { 369 | res.rank = this.result_rank++; 370 | cleaned.push(res); 371 | } 372 | } 373 | return cleaned; 374 | } 375 | 376 | parse(html) { 377 | 378 | } 379 | 380 | async parse_async(html) { 381 | 382 | } 383 | 384 | /** 385 | * 386 | * @returns true if startpage was loaded correctly. 387 | */ 388 | async load_start_page() { 389 | 390 | } 391 | 392 | /** 393 | * Searches the keyword by inputting it into the form and hitting enter 394 | * or something similar. 395 | * 396 | * @param keyword 397 | * @returns {Promise} 398 | */ 399 | async search_keyword(keyword) { 400 | 401 | } 402 | 403 | /** 404 | * 405 | * @returns true if the next page was loaded correctely 406 | */ 407 | async next_page() { 408 | 409 | } 410 | 411 | async wait_for_results() { 412 | 413 | } 414 | 415 | async detected() { 416 | 417 | } 418 | }; 419 | 420 | // This is where we'll put the code to get around the tests. 421 | async function evadeChromeHeadlessDetection(page) { 422 | 423 | // Pass the Webdriver Test. 424 | await page.evaluateOnNewDocument(() => { 425 | const newProto = navigator.__proto__; 426 | delete newProto.webdriver; 427 | navigator.__proto__ = newProto; 428 | }); 429 | 430 | // Pass the Chrome Test. 431 | await page.evaluateOnNewDocument(() => { 432 | // We can mock this in as much depth as we need for the test. 
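        // For instance (hypothetical check, not taken from any particular detection
        // library), a page might only probe for the presence and rough shape of this
        // object, e.g. `const looksHeadless = !window.chrome || !window.chrome.runtime;`,
        // so a shallow mock of the documented keys is usually enough to pass.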
433 | const mockObj = { 434 | app: { 435 | isInstalled: false, 436 | }, 437 | webstore: { 438 | onInstallStageChanged: {}, 439 | onDownloadProgress: {}, 440 | }, 441 | runtime: { 442 | PlatformOs: { 443 | MAC: 'mac', 444 | WIN: 'win', 445 | ANDROID: 'android', 446 | CROS: 'cros', 447 | LINUX: 'linux', 448 | OPENBSD: 'openbsd', 449 | }, 450 | PlatformArch: { 451 | ARM: 'arm', 452 | X86_32: 'x86-32', 453 | X86_64: 'x86-64', 454 | }, 455 | PlatformNaclArch: { 456 | ARM: 'arm', 457 | X86_32: 'x86-32', 458 | X86_64: 'x86-64', 459 | }, 460 | RequestUpdateCheckStatus: { 461 | THROTTLED: 'throttled', 462 | NO_UPDATE: 'no_update', 463 | UPDATE_AVAILABLE: 'update_available', 464 | }, 465 | OnInstalledReason: { 466 | INSTALL: 'install', 467 | UPDATE: 'update', 468 | CHROME_UPDATE: 'chrome_update', 469 | SHARED_MODULE_UPDATE: 'shared_module_update', 470 | }, 471 | OnRestartRequiredReason: { 472 | APP_UPDATE: 'app_update', 473 | OS_UPDATE: 'os_update', 474 | PERIODIC: 'periodic', 475 | }, 476 | }, 477 | }; 478 | 479 | window.navigator.chrome = mockObj; 480 | window.chrome = mockObj; 481 | }); 482 | 483 | // Pass the Permissions Test. 484 | await page.evaluateOnNewDocument(() => { 485 | const originalQuery = window.navigator.permissions.query; 486 | window.navigator.permissions.__proto__.query = parameters => 487 | parameters.name === 'notifications' 488 | ? Promise.resolve({state: Notification.permission}) 489 | : originalQuery(parameters); 490 | 491 | // Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js 492 | const oldCall = Function.prototype.call; 493 | 494 | function call() { 495 | return oldCall.apply(this, arguments); 496 | } 497 | 498 | Function.prototype.call = call; 499 | 500 | const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString"); 501 | const oldToString = Function.prototype.toString; 502 | 503 | function functionToString() { 504 | if (this === window.navigator.permissions.query) { 505 | return "function query() { [native code] }"; 506 | } 507 | if (this === functionToString) { 508 | return nativeToStringFunctionString; 509 | } 510 | return oldCall.call(oldToString, this); 511 | } 512 | 513 | Function.prototype.toString = functionToString; 514 | }); 515 | 516 | // Pass the Plugins Length Test. 517 | await page.evaluateOnNewDocument(() => { 518 | // Overwrite the `plugins` property to use a custom getter. 519 | Object.defineProperty(navigator, 'plugins', { 520 | // This just needs to have `length > 0` for the current test, 521 | // but we could mock the plugins too if necessary. 522 | get: () => [1, 2, 3, 4, 5] 523 | }); 524 | }); 525 | 526 | // Pass the Languages Test. 527 | await page.evaluateOnNewDocument(() => { 528 | // Overwrite the `plugins` property to use a custom getter. 
529 | Object.defineProperty(navigator, 'languages', { 530 | get: () => ['en-US', 'en'] 531 | }); 532 | }); 533 | 534 | // Pass the iframe Test 535 | await page.evaluateOnNewDocument(() => { 536 | Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { 537 | get: function () { 538 | return window; 539 | } 540 | }); 541 | }); 542 | 543 | // Pass toString test, though it breaks console.debug() from working 544 | await page.evaluateOnNewDocument(() => { 545 | window.console.debug = () => { 546 | return null; 547 | }; 548 | }); 549 | } 550 | -------------------------------------------------------------------------------- /src/modules/yandex.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const Scraper = require('./se_scraper'); 4 | 5 | class YandexScraper extends Scraper { 6 | 7 | constructor(...args) { 8 | super(...args); 9 | } 10 | 11 | async parse_async(html) { 12 | 13 | let results = await this.page.evaluate(() => { 14 | let serp_items = document.querySelectorAll('.serp-item'); 15 | const data = []; 16 | serp_items.forEach((item) => { 17 | let obj = { 18 | is_ad: false, 19 | }; 20 | try { 21 | if (item) { 22 | 23 | let linkElement = item.querySelector('h2 a.link'); 24 | 25 | if (linkElement) { 26 | obj.link = linkElement.getAttribute('href'); 27 | obj.title = linkElement.innerText; 28 | } 29 | 30 | 31 | let label = item.querySelector('.organic__subtitle .label'); 32 | 33 | if (label) { 34 | let labelText = label.innerText; 35 | 36 | if (labelText) { 37 | labelText = labelText.trim().toLowerCase(); 38 | console.log(labelText); 39 | let ad_labels = ['ad', 'werbung', 'реклама', 'anuncio']; 40 | obj.is_ad = ad_labels.includes(labelText); 41 | } 42 | } 43 | 44 | obj.snippet = item.querySelector('.text-container.typo').innerText; 45 | obj.visible_link = item.querySelector('.typo_type_greenurl').innerText; 46 | 47 | if (obj.title) { 48 | data.push(obj); 49 | } 50 | } 51 | } catch (e) { 52 | } 53 | }); 54 | return data; 55 | }); 56 | 57 | let num_results = await this.page.evaluate(() => { 58 | let num_results = document.querySelector('.serp-adv__found'); 59 | if (num_results) { 60 | return num_results.innerText; 61 | } 62 | }); 63 | 64 | const cleaned = this.clean_results(results, ['title', 'link' , 'snippet']); 65 | 66 | return { 67 | time: (new Date()).toUTCString(), 68 | num_results: num_results, 69 | results: cleaned, 70 | }; 71 | } 72 | 73 | async load_start_page() { 74 | let startUrl = 'https://yandex.com'; 75 | 76 | this.logger.info('Using startUrl: ' + startUrl); 77 | 78 | this.last_response = await this.page.goto(startUrl); 79 | 80 | await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT }); 81 | 82 | return true; 83 | } 84 | 85 | async search_keyword(keyword) { 86 | const input = await this.page.$('input[name="text"]'); 87 | await this.set_input_value(`input[name="text"]`, keyword); 88 | await this.sleep(50); 89 | await input.focus(); 90 | await this.page.keyboard.press("Enter"); 91 | } 92 | 93 | async next_page() { 94 | let next_page_link = await this.page.$('.pager .pager__item_kind_next', {timeout: 1000}); 95 | if (!next_page_link) { 96 | return false; 97 | } 98 | await next_page_link.click(); 99 | 100 | return true; 101 | } 102 | 103 | async wait_for_results() { 104 | await this.page.waitForSelector('.main__content', { timeout: this.STANDARD_TIMEOUT }); 105 | } 106 | 107 | async detected() { 108 | 109 | } 110 | } 111 | 112 | module.exports = { 113 | YandexScraper: 
YandexScraper, 114 | }; -------------------------------------------------------------------------------- /src/node_scraper.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const fs = require('fs'); 4 | const os = require('os'); 5 | const _ = require('lodash'); 6 | const { createLogger, format, transports } = require('winston'); 7 | const { combine, timestamp, printf } = format; 8 | const debug = require('debug')('se-scraper:ScrapeManager'); 9 | const { Cluster } = require('puppeteer-cluster'); 10 | 11 | const UserAgent = require('user-agents'); 12 | const google = require('./modules/google.js'); 13 | const bing = require('./modules/bing.js'); 14 | const yandex = require('./modules/yandex.js'); 15 | const infospace = require('./modules/infospace.js'); 16 | const duckduckgo = require('./modules/duckduckgo.js'); 17 | const CustomConcurrencyImpl = require('./concurrency-implementation'); 18 | 19 | const MAX_ALLOWED_BROWSERS = 6; 20 | 21 | function write_results(fname, data) { 22 | fs.writeFileSync(fname, data, (err) => { 23 | if (err) throw err; 24 | console.log(`Results written to file ${fname}`); 25 | }); 26 | } 27 | 28 | function read_keywords_from_file(fname) { 29 | let kws = fs.readFileSync(fname).toString().split(os.EOL); 30 | // clean keywords 31 | kws = kws.filter((kw) => { 32 | return kw.trim().length > 0; 33 | }); 34 | return kws; 35 | } 36 | 37 | 38 | function getScraper(search_engine, args) { 39 | if (typeof search_engine === 'string') { 40 | return new { 41 | google: google.GoogleScraper, 42 | google_news_old: google.GoogleNewsOldScraper, 43 | google_news: google.GoogleNewsScraper, 44 | google_image: google.GoogleImageScraper, 45 | bing: bing.BingScraper, 46 | yandex: yandex.YandexScraper, 47 | bing_news: bing.BingNewsScraper, 48 | duckduckgo: duckduckgo.DuckduckgoScraper, 49 | infospace: infospace.InfospaceScraper, 50 | webcrawler: infospace.WebcrawlerNewsScraper, 51 | }[search_engine](args); 52 | } else if (typeof search_engine === 'function') { 53 | return new search_engine(args); 54 | } else { 55 | throw new Error(`search_engine must either be a string of class (function)`); 56 | } 57 | } 58 | 59 | 60 | class ScrapeManager { 61 | 62 | constructor(config, context={}) { 63 | 64 | this.cluster = null; 65 | this.pluggable = null; 66 | this.scraper = null; 67 | this.context = context; 68 | 69 | this.config = _.defaults(config, { 70 | // the user agent to scrape with 71 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', 72 | // if random_user_agent is set to True, a random user agent is chosen 73 | random_user_agent: false, 74 | // whether to select manual settings in visible mode 75 | set_manual_settings: false, 76 | // log ip address data 77 | log_ip_address: false, 78 | // log http headers 79 | log_http_headers: false, 80 | // how long to sleep between requests. a random sleep interval within the range [a,b] 81 | // is drawn before every request. empty string for no sleeping. 
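        // e.g. sleep_range: [5, 10] (hypothetical values) sleeps a random 5-10 seconds
        // before each request; the default of null disables the extra sleep.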
82 | sleep_range: null, 83 | // which search engine to scrape 84 | search_engine: 'google', 85 | search_engine_name: 'google', 86 | logger: createLogger({ 87 | level: 'info', 88 | format: combine( 89 | timestamp(), 90 | printf(({ level, message, timestamp }) => { 91 | return `${timestamp} [${level}] ${message}`; 92 | }) 93 | ), 94 | transports: [ 95 | new transports.Console() 96 | ] 97 | }), 98 | keywords: ['nodejs rocks',], 99 | // whether to start the browser in headless mode 100 | headless: true, 101 | // specify flags passed to chrome here 102 | // About our defaults values https://peter.sh/experiments/chromium-command-line-switches/ 103 | chrome_flags: [ 104 | '--disable-infobars', 105 | '--window-position=0,0', 106 | '--ignore-certifcate-errors', 107 | '--ignore-certifcate-errors-spki-list', 108 | '--no-sandbox', 109 | '--disable-setuid-sandbox', 110 | '--disable-dev-shm-usage', 111 | '--disable-accelerated-2d-canvas', 112 | '--disable-gpu', 113 | '--window-size=1920,1040', 114 | '--start-fullscreen', 115 | '--hide-scrollbars', 116 | '--disable-notifications', 117 | ], 118 | // the number of pages to scrape for each keyword 119 | num_pages: 1, 120 | // path to output file, data will be stored in JSON 121 | output_file: '', 122 | // whether to also passthru all the html output of the serp pages 123 | html_output: false, 124 | // whether to strip JS and CSS from the html_output 125 | // has only an effect if `html_output` is true 126 | clean_html_output: true, 127 | // remove all data images from the html 128 | clean_data_images: true, 129 | // whether to return a screenshot of serp pages as b64 data 130 | screen_output: false, 131 | // Scrape url from local file. Mainly used for testing. 132 | scrape_from_file: '', 133 | // whether to prevent images, css, fonts and media from being loaded 134 | // will speed up scraping a great deal 135 | block_assets: true, 136 | // path to js module that extends functionality 137 | // this module should export the functions: 138 | // get_browser, handle_metadata, close_browser 139 | //custom_func: resolve('examples/pluggable.js'), 140 | custom_func: null, 141 | throw_on_detection: false, 142 | // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080'] 143 | proxies: null, 144 | // a file with one proxy per line. 
Example: 145 | // socks5://78.94.172.42:1080 146 | // http://118.174.233.10:48400 147 | proxy_file: '', 148 | // whether to use proxies only 149 | // when this is set to true, se-scraper will not use 150 | // your default IP address 151 | use_proxies_only: false, 152 | // check if headless chrome escapes common detection techniques 153 | // this is a quick test and should be used for debugging 154 | test_evasion: false, 155 | apply_evasion_techniques: true, 156 | // settings for puppeteer-cluster 157 | puppeteer_cluster_config: { 158 | timeout: 30 * 60 * 1000, // max timeout set to 30 minutes 159 | monitor: false, 160 | concurrency: Cluster.CONCURRENCY_BROWSER, 161 | maxConcurrency: 1, 162 | } 163 | }); 164 | 165 | this.logger = this.config.logger; 166 | 167 | if (config.sleep_range) { 168 | // parse an array 169 | config.sleep_range = eval(config.sleep_range); 170 | 171 | if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') { 172 | throw "sleep_range is not a valid array of two integers."; 173 | } 174 | } 175 | 176 | if (fs.existsSync(this.config.keyword_file)) { 177 | this.config.keywords = read_keywords_from_file(this.config.keyword_file); 178 | } 179 | 180 | if (this.config.proxies && this.config.proxy_file) { 181 | throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.'); 182 | } 183 | 184 | if (this.config.proxy_file) { 185 | this.config.proxies = read_keywords_from_file(this.config.proxy_file); 186 | this.logger.info(`${this.config.proxies.length} proxies read from file.`); 187 | } 188 | 189 | if (!this.config.proxies && this.config.use_proxies_only) { 190 | throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only'); 191 | } 192 | 193 | debug('this.config=%O', this.config); 194 | } 195 | 196 | /* 197 | * Launches the puppeteer cluster or browser. 198 | * 199 | * Returns true if the browser was successfully launched. Otherwise will return false. 200 | */ 201 | async start() { 202 | 203 | if (this.config.custom_func) { 204 | if (fs.existsSync(this.config.custom_func)) { 205 | try { 206 | const PluggableClass = require(this.config.custom_func); 207 | this.pluggable = new PluggableClass({ 208 | config: this.config, 209 | context: this.context 210 | }); 211 | } catch (exception) { 212 | console.error(exception); 213 | return false; 214 | } 215 | } else { 216 | console.error(`File "${this.config.custom_func}" does not exist!`); 217 | return false; 218 | } 219 | } 220 | 221 | const chrome_flags = _.clone(this.config.chrome_flags); 222 | 223 | if (this.pluggable && this.pluggable.start_browser) { 224 | launch_args.config = this.config; 225 | this.browser = await this.pluggable.start_browser({ 226 | config: this.config, 227 | }); 228 | this.page = await this.browser.newPage(); 229 | } else { 230 | // if no custom start_browser functionality was given 231 | // use puppeteer-cluster for scraping 232 | 233 | let proxies; 234 | // if we have at least one proxy, always use CONCURRENCY_BROWSER 235 | // and set maxConcurrency to this.config.proxies.length + 1 236 | // else use whatever this.configuration was passed 237 | if (this.config.proxies && this.config.proxies.length > 0) { 238 | 239 | // because we use real browsers, we ran out of memory on normal laptops 240 | // when using more than maybe 5 or 6 browsers. 
241 | // therefore hardcode a limit here 242 | // TODO not sure this what we want 243 | this.numClusters = Math.min( 244 | this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1), 245 | MAX_ALLOWED_BROWSERS 246 | ); 247 | proxies = _.clone(this.config.proxies); 248 | 249 | // Insert a first config without proxy if use_proxy_only is false 250 | if (this.config.use_proxies_only === false) { 251 | proxies.unshift(null); 252 | } 253 | 254 | } else { 255 | this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency; 256 | proxies = _.times(this.numClusters, null); 257 | } 258 | 259 | this.logger.info(`Using ${this.numClusters} clusters.`); 260 | 261 | // Give the per browser options 262 | const perBrowserOptions = _.map(proxies, (proxy) => { 263 | const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent; 264 | let args = chrome_flags.concat([`--user-agent=${userAgent}`]); 265 | 266 | if (proxy) { 267 | args = args.concat([`--proxy-server=${proxy}`]); 268 | } 269 | 270 | return { 271 | headless: this.config.headless, 272 | ignoreHTTPSErrors: true, 273 | args 274 | }; 275 | }); 276 | 277 | debug('perBrowserOptions=%O', perBrowserOptions) 278 | 279 | this.cluster = await Cluster.launch({ 280 | monitor: this.config.puppeteer_cluster_config.monitor, 281 | timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes 282 | concurrency: CustomConcurrencyImpl, 283 | maxConcurrency: this.numClusters, 284 | puppeteerOptions: { 285 | perBrowserOptions: perBrowserOptions 286 | } 287 | }); 288 | } 289 | } 290 | 291 | /* 292 | * Scrapes the keywords specified by the config. 293 | */ 294 | async scrape(scrape_config = {}) { 295 | 296 | if (!scrape_config.keywords && !scrape_config.keyword_file) { 297 | throw new Error('Either keywords or keyword_file must be supplied to scrape()'); 298 | } 299 | 300 | Object.assign(this.config, scrape_config); 301 | 302 | var results = {}; 303 | var num_requests = 0; 304 | var metadata = {}; 305 | var startTime = Date.now(); 306 | 307 | this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine; 308 | 309 | this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`); 310 | 311 | if (this.pluggable && this.pluggable.start_browser) { 312 | 313 | this.scraper = getScraper(this.config.search_engine, { 314 | config: this.config, 315 | context: this.context, 316 | pluggable: this.pluggable, 317 | page: this.page, 318 | }); 319 | 320 | var {results, metadata, num_requests} = await this.scraper.run(this.page); 321 | 322 | } else { 323 | // Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine. 324 | // https://github.com/GoogleChrome/puppeteer/issues/678 325 | // The question is: Is it possible to set proxies per Page? Per Browser? 326 | // as far as I can see, puppeteer cluster uses the same puppeteerOptions 327 | // for every browser instance. We will use our custom puppeteer-cluster version. 
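                // Worked example with hypothetical numbers: N = 10 keywords, K = 4 proxies and
                // use_proxies_only = false give numClusters = K + 1 = 5 browsers (one per proxy
                // plus one over the direct connection, capped at MAX_ALLOWED_BROWSERS = 6), so
                // each browser gets 10 / 5 = 2 keywords and, with num_pages M = 3, issues
                // roughly 2 * 3 = 6 requests to the search engine.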
328 | // https://www.npmjs.com/package/proxy-chain 329 | // this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077 330 | let chunks = []; 331 | for (var n = 0; n < this.numClusters; n++) { 332 | chunks.push([]); 333 | } 334 | for (var k = 0; k < this.config.keywords.length; k++) { 335 | chunks[k % this.numClusters].push(this.config.keywords[k]); 336 | } 337 | 338 | debug('chunks=%o', chunks); 339 | 340 | let execPromises = []; 341 | for (var c = 0; c < chunks.length; c++) { 342 | const config = _.clone(this.config); 343 | config.keywords = chunks[c]; 344 | 345 | var obj = getScraper(this.config.search_engine, { 346 | config: config, 347 | context: {}, 348 | pluggable: this.pluggable, 349 | }); 350 | 351 | var boundMethod = obj.run.bind(obj); 352 | execPromises.push(this.cluster.execute({}, boundMethod)); 353 | } 354 | 355 | let promiseReturns = await Promise.all(execPromises); 356 | 357 | // Merge results and metadata per keyword 358 | for (let promiseReturn of promiseReturns) { 359 | Object.assign(results, promiseReturn.results); 360 | Object.assign(metadata, promiseReturn.metadata); 361 | num_requests += promiseReturn.num_requests; 362 | } 363 | } 364 | 365 | let timeDelta = Date.now() - startTime; 366 | let ms_per_request = timeDelta/num_requests; 367 | 368 | this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); 369 | this.logger.info(`On average ms/request: ${ms_per_request}ms/request`); 370 | 371 | if (this.pluggable && this.pluggable.handle_results) { 372 | await this.pluggable.handle_results(results); 373 | } 374 | 375 | metadata.elapsed_time = timeDelta.toString(); 376 | metadata.ms_per_keyword = ms_per_request.toString(); 377 | metadata.num_requests = num_requests; 378 | 379 | debug('metadata=%O', metadata); 380 | 381 | if (this.pluggable && this.pluggable.handle_metadata) { 382 | await this.pluggable.handle_metadata(metadata); 383 | } 384 | 385 | if (this.config.output_file) { 386 | this.logger.info(`Writing results to ${this.config.output_file}`); 387 | write_results(this.config.output_file, JSON.stringify(results, null, 4)); 388 | } 389 | 390 | return { 391 | results: results, 392 | metadata: metadata || {}, 393 | }; 394 | } 395 | 396 | /* 397 | * Quit the puppeteer cluster/browser. 
398 | */ 399 | async quit() { 400 | if (this.pluggable && this.pluggable.close_browser) { 401 | await this.pluggable.close_browser(); 402 | } else { 403 | await this.cluster.idle(); 404 | await this.cluster.close(); 405 | } 406 | } 407 | } 408 | 409 | module.exports = { 410 | ScrapeManager: ScrapeManager, 411 | }; 412 | -------------------------------------------------------------------------------- /test/html_output.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const { createLogger, transports } = require('winston'); 4 | const http = require('http'); 5 | const https = require('https'); 6 | const assert = require('assert'); 7 | const path = require('path'); 8 | const keyCert = require('key-cert'); 9 | const Promise = require('bluebird'); 10 | const Proxy = require('http-mitm-proxy'); 11 | 12 | const debug = require('debug')('se-scraper:test'); 13 | const se_scraper = require('../'); 14 | 15 | const httpPort = 3012; 16 | const httpsPort = httpPort + 1; 17 | const proxyPort = httpPort + 2; 18 | 19 | const fakeSearchEngine = express(); 20 | fakeSearchEngine.get('/search', (req, res) => { 21 | debug('q=%s', req.query.q); 22 | const pageNumber = ((req.query.start/10) || 0) + 1; 23 | res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); 24 | }); 25 | fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); 26 | 27 | describe('Config', function(){ 28 | 29 | let httpServer, httpsServer, proxy; 30 | before(async function(){ 31 | // Here mount our fake engine in both http and https listen server 32 | httpServer = http.createServer(fakeSearchEngine); 33 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 34 | 35 | proxy = Proxy(); 36 | proxy.onRequest((ctx, callback) => { 37 | ctx.proxyToServerRequestOptions.host = 'localhost'; 38 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 39 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 40 | debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); 41 | return callback(); 42 | }); 43 | 44 | await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); 45 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 46 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 47 | debug('Fake http search engine servers started'); 48 | }); 49 | 50 | after(function(){ 51 | httpsServer.close(); 52 | httpServer.close(); 53 | proxy.close(); 54 | }); 55 | 56 | describe('html_output', function(){ 57 | 58 | const testLogger = createLogger({ 59 | transports: [ 60 | new transports.Console({ 61 | level: 'error' 62 | }) 63 | ] 64 | }); 65 | 66 | /** 67 | * Test html_output option 68 | */ 69 | it('html_output single page single keyword', async function () { 70 | 71 | const scrape_job = { 72 | search_engine: 'google', 73 | /* TODO refactor start_url 74 | google_settings: { 75 | start_url: 'http://localhost:' + httpPort 76 | }, 77 | */ 78 | keywords: ['test keyword'], 79 | }; 80 | 81 | var scraper = new se_scraper.ScrapeManager({ 82 | throw_on_detection: true, 83 | logger: testLogger, 84 | html_output: true, 85 | //clean_html_output: false, 86 | //clean_data_images: false, 87 | // TODO refactor start_url so we can use-it instead of depending of the proxy for this test 88 | proxies: ['http://localhost:' + proxyPort], 89 | use_proxies_only: true, 90 | }); 91 | await scraper.start(); 92 | const { results } = await scraper.scrape(scrape_job); 93 | await scraper.quit(); 94 | 95 | assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided'); 96 | 97 | }); 98 | 99 | }); 100 | 101 | }); -------------------------------------------------------------------------------- /test/mocks/duckduckgo/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | DuckDuckGo — Privacy, simplified. 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 63 | 64 | 65 | 66 |
147 | 148 | 149 | -------------------------------------------------------------------------------- /test/modules/bing.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const puppeteer = require('puppeteer'); 4 | const { createLogger, transports } = require('winston'); 5 | const http = require('http'); 6 | const https = require('https'); 7 | const assert = require('assert'); 8 | const path = require('path'); 9 | const keyCert = require('key-cert'); 10 | const Promise = require('bluebird'); 11 | const Proxy = require('http-mitm-proxy'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const { BingScraper } = require('../../src/modules/bing'); 15 | 16 | const httpPort = 3012; 17 | const httpsPort = httpPort + 1; 18 | const proxyPort = httpPort + 2; 19 | 20 | const fakeSearchEngine = express(); 21 | fakeSearchEngine.get('/search', (req, res, next) => { 22 | debug('q=%s', req.query.q); 23 | const pageNumber = Math.round((req.query.first || 0) /10) + 1; 24 | res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html')); 25 | }); 26 | fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']})); 27 | 28 | describe('Module Bing', function(){ 29 | 30 | let httpServer, httpsServer, proxy; 31 | before(async function(){ 32 | // Here mount our fake engine in both http and https listen server 33 | httpServer = http.createServer(fakeSearchEngine); 34 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 35 | 36 | proxy = Proxy(); 37 | proxy.onRequest((ctx, callback) => { 38 | ctx.proxyToServerRequestOptions.host = 'localhost'; 39 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 40 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 41 | debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port); 42 | return callback(); 43 | }); 44 | 45 | await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); 46 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 47 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 48 | debug('Fake http search engine servers started'); 49 | }); 50 | 51 | after(function(){ 52 | proxy.close(); 53 | httpsServer.close(); 54 | httpServer.close(); 55 | }); 56 | 57 | let browser; 58 | let page; 59 | beforeEach(async function(){ 60 | debug('Start a new browser'); 61 | browser = await puppeteer.launch({ 62 | //dumpio: true, 63 | //headless: false, 64 | ignoreHTTPSErrors: true, 65 | args: [ '--proxy-server=http://localhost:' + proxyPort ] 66 | }); 67 | debug('Open a fresh page'); 68 | page = await browser.newPage(); 69 | }); 70 | 71 | afterEach(async function(){ 72 | await browser.close(); 73 | }); 74 | 75 | const testLogger = createLogger({ 76 | transports: [ 77 | new transports.Console({ 78 | level: 'error' 79 | }) 80 | ] 81 | }); 82 | 83 | it('one keyword one page', function(){ 84 | const bingScraper = new BingScraper({ 85 | config: { 86 | search_engine_name: 'bing', 87 | throw_on_detection: true, 88 | keywords: ['test keyword'], 89 | logger: testLogger, 90 | scrape_from_file: '', 91 | } 92 | }); 93 | bingScraper.STANDARD_TIMEOUT = 500; 94 | return bingScraper.run({page}).then(({results, metadata, num_requests}) => { 95 | assert.strictEqual(num_requests, 1, 'Must do one request'); 96 | assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed'); 97 | }); 98 | }); 99 | 100 | it('one keyword 3 pages', function () { 101 | const bingScraper = new BingScraper({ 102 | config: { 103 | search_engine_name: 'bing', 104 | throw_on_detection: true, 105 | keywords: ['test keyword'], 106 | logger: testLogger, 107 | scrape_from_file: '', 108 | num_pages: 3, 109 | } 110 | }); 111 | bingScraper.STANDARD_TIMEOUT = 500; 112 | return bingScraper.run({page}).then(({results, metadata, num_requests}) => { 113 | assert.strictEqual(num_requests, 3, 'Must three requests'); 114 | assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1'); 115 | assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1'); 116 | assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); 117 | assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2'); 118 | assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); 119 | assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3'); 120 | }); 121 | }); 122 | 123 | }); -------------------------------------------------------------------------------- /test/modules/duckduckgo.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const puppeteer = 
require('puppeteer'); 4 | const { createLogger, transports } = require('winston'); 5 | const http = require('http'); 6 | const https = require('https'); 7 | const assert = require('assert'); 8 | const path = require('path'); 9 | const keyCert = require('key-cert'); 10 | const Promise = require('bluebird'); 11 | const Proxy = require('http-mitm-proxy'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const { DuckduckgoScraper } = require('../../src/modules/duckduckgo'); 15 | 16 | const httpPort = 3012; 17 | const httpsPort = httpPort + 1; 18 | const proxyPort = httpPort + 2; 19 | 20 | const fakeSearchEngine = express(); 21 | fakeSearchEngine.use(express.urlencoded({ extended: true })) 22 | fakeSearchEngine.get('/', (req, res, next) => { 23 | if(!req.query.q){ 24 | return next(); 25 | } 26 | debug('q=%s page=%d', req.query.q, req.query.page); 27 | const pageNumber = req.query.page; 28 | res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html')); 29 | }); 30 | fakeSearchEngine.post('/html', (req, res) => { 31 | debug('body=%o', req.body); 32 | const pageNumber = 1; 33 | res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html')); 34 | }); 35 | fakeSearchEngine.use(express.static('test/mocks/duckduckgo', {extensions: ['html']})); 36 | 37 | describe('Module DuckDuckGo', function(){ 38 | 39 | let httpServer, httpsServer, proxy; 40 | before(async function(){ 41 | // Here mount our fake engine in both http and https listen server 42 | httpServer = http.createServer(fakeSearchEngine); 43 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 44 | 45 | proxy = Proxy(); 46 | proxy.onRequest((ctx, callback) => { 47 | ctx.proxyToServerRequestOptions.host = 'localhost'; 48 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 49 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 50 | debug('proxy askedHost=%s method=%s url=%s toPort=%s', 51 | ctx.clientToProxyRequest.headers.host, 52 | ctx.clientToProxyRequest.method, 53 | ctx.clientToProxyRequest.url, 54 | ctx.proxyToServerRequestOptions.port 55 | ); 56 | return callback(); 57 | }); 58 | 59 | await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); 60 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 61 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 62 | debug('Fake http search engine servers started'); 63 | }); 64 | 65 | after(function(){ 66 | proxy.close(); 67 | httpsServer.close(); 68 | httpServer.close(); 69 | }); 70 | 71 | let browser; 72 | let page; 73 | beforeEach(async function(){ 74 | debug('Start a new browser'); 75 | browser = await puppeteer.launch({ 76 | //dumpio: true, 77 | //headless: false, 78 | ignoreHTTPSErrors: true, 79 | args: [ '--proxy-server=http://localhost:' + proxyPort ] 80 | }); 81 | debug('Open a fresh page'); 82 | page = await browser.newPage(); 83 | }); 84 | 85 | afterEach(async function(){ 86 | await browser.close(); 87 | }); 88 | 89 | const testLogger = createLogger({ 90 | transports: [ 91 | new transports.Console({ 92 | level: 'error' 93 | }) 94 | ] 95 | }); 96 | 97 | it('one keyword one page', function(){ 98 | const duckduckgoScraper = new DuckduckgoScraper({ 99 | config: { 100 | search_engine_name: 'duckduckgo', 101 | throw_on_detection: true, 102 | keywords: ['test keyword'], 103 | logger: testLogger, 104 | scrape_from_file: '', 105 | } 106 | }); 107 | duckduckgoScraper.STANDARD_TIMEOUT = 1000; 108 | return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => { 109 | assert.strictEqual(num_requests, 1, 'Must do one request'); 110 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); 111 | }); 112 | }); 113 | 114 | it('one keyword 3 pages', function () { 115 | this.timeout(4000); 116 | const duckduckgoScraper = new DuckduckgoScraper({ 117 | config: { 118 | search_engine_name: 'google', 119 | throw_on_detection: true, 120 | keywords: ['test keyword'], 121 | logger: testLogger, 122 | scrape_from_file: '', 123 | num_pages: 3, 124 | } 125 | }); 126 | duckduckgoScraper.STANDARD_TIMEOUT = 1000; 127 | return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => { 128 | assert.strictEqual(num_requests, 3, 'Must three requests'); 129 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); 130 | assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1'); 131 | debug('results page 1 %O',results['test keyword']['1'].results); 132 | debug('results page 2 %O', results['test keyword']['2'].results); 133 | assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2'); 134 | assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 1'); 135 | assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3'); 136 | assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on 
first organic result page 1'); 137 | }); 138 | }); 139 | 140 | }); -------------------------------------------------------------------------------- /test/modules/google.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const puppeteer = require('puppeteer'); 4 | const { createLogger, transports } = require('winston'); 5 | const http = require('http'); 6 | const https = require('https'); 7 | const assert = require('assert'); 8 | const path = require('path'); 9 | const keyCert = require('key-cert'); 10 | const Promise = require('bluebird'); 11 | const Proxy = require('http-mitm-proxy'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const { GoogleScraper } = require('../../src/modules/google'); 15 | 16 | const httpPort = 3012; 17 | const httpsPort = httpPort + 1; 18 | const proxyPort = httpPort + 2; 19 | 20 | const fakeSearchEngine = express(); 21 | fakeSearchEngine.get('/search', (req, res) => { 22 | debug('q=%s', req.query.q); 23 | const pageNumber = ((req.query.start/10) || 0) + 1; 24 | res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); 25 | }); 26 | fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); 27 | 28 | describe('Module Google', function(){ 29 | 30 | let httpServer, httpsServer, proxy; 31 | before(async function(){ 32 | // Here mount our fake engine in both http and https listen server 33 | httpServer = http.createServer(fakeSearchEngine); 34 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 35 | 36 | proxy = Proxy(); 37 | proxy.onRequest((ctx, callback) => { 38 | ctx.proxyToServerRequestOptions.host = 'localhost'; 39 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 40 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 41 | debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port); 42 | return callback(); 43 | }); 44 | 45 | await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); 46 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 47 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 48 | debug('Fake http search engine servers started'); 49 | }); 50 | 51 | after(function(){ 52 | proxy.close(); 53 | httpsServer.close(); 54 | httpServer.close(); 55 | }); 56 | 57 | let browser; 58 | let page; 59 | beforeEach(async function(){ 60 | debug('Start a new browser'); 61 | browser = await puppeteer.launch({ 62 | //dumpio: true, 63 | //headless: false, 64 | ignoreHTTPSErrors: true, 65 | args: [ '--proxy-server=http://localhost:' + proxyPort ] 66 | }); 67 | debug('Open a fresh page'); 68 | page = await browser.newPage(); 69 | }); 70 | 71 | afterEach(async function(){ 72 | await browser.close(); 73 | }); 74 | 75 | const testLogger = createLogger({ 76 | transports: [ 77 | new transports.Console({ 78 | level: 'error' 79 | }) 80 | ] 81 | }); 82 | 83 | it('one keyword one page', function(){ 84 | const googleScraper = new GoogleScraper({ 85 | config: { 86 | search_engine_name: 'google', 87 | throw_on_detection: true, 88 | keywords: ['test keyword'], 89 | logger: testLogger, 90 | scrape_from_file: '', 91 | } 92 | }); 93 | googleScraper.STANDARD_TIMEOUT = 500; 94 | return googleScraper.run({page}).then(({results, metadata, num_requests}) => { 95 | assert.strictEqual(num_requests, 1, 'Must do one request'); 96 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); 97 | }); 98 | }); 99 | 100 | it('one keyword 3 pages', function () { 101 | const googleScraper = new GoogleScraper({ 102 | config: { 103 | search_engine_name: 'google', 104 | throw_on_detection: true, 105 | keywords: ['test keyword'], 106 | logger: testLogger, 107 | scrape_from_file: '', 108 | num_pages: 3, 109 | } 110 | }); 111 | googleScraper.STANDARD_TIMEOUT = 500; 112 | return googleScraper.run({page}).then(({results, metadata, num_requests}) => { 113 | assert.strictEqual(num_requests, 3, 'Must three requests'); 114 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); 115 | assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1'); 116 | assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); 117 | assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1'); 118 | assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); 119 | assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1'); 120 | }); 121 | }); 122 | 123 | }); -------------------------------------------------------------------------------- /test/proxy.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 
const express = require('express'); 3 | const { createLogger, transports } = require('winston'); 4 | const http = require('http'); 5 | const https = require('https'); 6 | const assert = require('assert'); 7 | const keyCert = require('key-cert'); 8 | const Promise = require('bluebird'); 9 | const Proxy = require('http-mitm-proxy'); 10 | 11 | const debug = require('debug')('se-scraper:test'); 12 | const se_scraper = require('../'); 13 | const Scraper = require('../src/modules/se_scraper'); 14 | 15 | const httpPort = 3012; 16 | const httpsPort = httpPort + 1; 17 | const proxyPort = httpPort + 2; 18 | 19 | const fakeSearchEngine = express(); 20 | fakeSearchEngine.set('trust proxy', 'loopback'); 21 | fakeSearchEngine.get('/test-proxy', (req, res) => { 22 | debug('fake-search-engine req.hostname=%s', req.hostname); 23 | //debug('req to', req.socket.localAddress, req.socket.localPort); 24 | res.send(req.hostname); 25 | }); 26 | 27 | describe('Config', function(){ 28 | 29 | let httpServer, httpsServer, proxy; 30 | before(async function(){ 31 | // Here mount our fake engine in both http and https listen server 32 | httpServer = http.createServer(fakeSearchEngine); 33 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 34 | 35 | proxy = Proxy(); 36 | proxy.onRequest((ctx, callback) => { 37 | ctx.proxyToServerRequestOptions.host = 'localhost'; 38 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort; 39 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 40 | debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); 41 | return callback(); 42 | }); 43 | 44 | await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); 45 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 46 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 47 | debug('Fake http search engine servers started'); 48 | }); 49 | 50 | after(function(){ 51 | httpsServer.close(); 52 | httpServer.close(); 53 | proxy.close(); 54 | }); 55 | 56 | describe('proxies', function(){ 57 | 58 | class MockScraperTestProxy extends Scraper { 59 | 60 | async load_start_page(){ 61 | return true; 62 | } 63 | 64 | async search_keyword(){ 65 | await this.page.goto('http://test.local:' + httpPort + '/test-proxy'); 66 | } 67 | 68 | async parse_async(){ 69 | const bodyHandle = await this.page.$('body'); 70 | return await this.page.evaluate(body => body.innerHTML, bodyHandle); 71 | } 72 | } 73 | 74 | const testLogger = createLogger({ 75 | transports: [ 76 | new transports.Console({ 77 | level: 'error' 78 | }) 79 | ] 80 | }); 81 | 82 | /** 83 | * Jobs will be executed 2 by 2 through the proxy and direct connection 84 | * THIS TEST NEED TO HAVE test.local 127.0.0.1 in /etc/hosts because chrome bypass localhost even with proxy set 85 | */ 86 | it('one proxy given, use_proxies_only=false', async function () { 87 | 88 | const scrape_job = { 89 | search_engine: MockScraperTestProxy, 90 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 91 | }; 92 | 93 | var scraper = new se_scraper.ScrapeManager({ 94 | throw_on_detection: true, 95 | proxies: ['http://localhost:' + proxyPort], 96 | // default is use_proxies_only: false, 97 | logger: testLogger, 98 | }); 99 | await scraper.start(); 100 | 101 | const { results } = await scraper.scrape(scrape_job); 102 | assert.strictEqual(results['news']['1'], 'test.local'); 103 | assert.strictEqual(results['some stuff']['1'], 
'ProxiedThroughFakeEngine'); 104 | assert.strictEqual(results['i work too much']['1'], 'test.local'); 105 | assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine'); 106 | assert.strictEqual(results['javascript is hard']['1'], 'test.local'); 107 | 108 | await scraper.quit(); 109 | }); 110 | 111 | /** 112 | * Jobs will be executed 1 by 1 through the proxy 113 | */ 114 | it('one proxy given, use_proxies_only=true', async function () { 115 | 116 | const scrape_job = { 117 | search_engine: MockScraperTestProxy, 118 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 119 | }; 120 | 121 | var scraper = new se_scraper.ScrapeManager({ 122 | throw_on_detection: true, 123 | proxies: ['http://localhost:' + proxyPort], 124 | use_proxies_only: true, 125 | logger: testLogger, 126 | }); 127 | await scraper.start(); 128 | 129 | const { results } = await scraper.scrape(scrape_job); 130 | assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine'); 131 | assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine'); 132 | assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine'); 133 | assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine'); 134 | assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine'); 135 | 136 | await scraper.quit(); 137 | }); 138 | 139 | it('zero proxy given, use_proxies_only=true', async function () { 140 | 141 | const scrape_job = { 142 | search_engine: MockScraperTestProxy, 143 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 144 | }; 145 | 146 | await assert.rejects(async () => { 147 | var scraper = new se_scraper.ScrapeManager({ 148 | throw_on_detection: true, 149 | use_proxies_only: true, 150 | logger: testLogger, 151 | }); 152 | await scraper.start(); 153 | const { results } = await scraper.scrape(scrape_job); 154 | await scraper.quit(); 155 | }, /Must provide at least one proxy in proxies if you enable use_proxies_only/); 156 | 157 | }); 158 | 159 | }); 160 | 161 | }); -------------------------------------------------------------------------------- /test/user_agent.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const { createLogger, transports } = require('winston'); 4 | const http = require('http'); 5 | const https = require('https'); 6 | const assert = require('assert'); 7 | const keyCert = require('key-cert'); 8 | const Promise = require('bluebird'); 9 | const Proxy = require('http-mitm-proxy'); 10 | const UAParser = require('ua-parser-js'); 11 | const _ = require('lodash'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const se_scraper = require('../'); 15 | const Scraper = require('../src/modules/se_scraper'); 16 | 17 | const httpPort = 3012; 18 | const httpsPort = httpPort + 1; 19 | const proxyPort = httpPort + 2; 20 | 21 | const fakeSearchEngine = express(); 22 | fakeSearchEngine.set('trust proxy', 'loopback'); 23 | fakeSearchEngine.get('/test-user_agent', (req, res) => { 24 | debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']); 25 | res.send(req.headers['user-agent']); 26 | }); 27 | 28 | describe('Config', function(){ 29 | 30 | let httpServer, httpsServer, proxy; 31 | before(async function(){ 32 | // Here mount our fake engine in both http and https listen server 33 | httpServer = http.createServer(fakeSearchEngine); 34 | httpsServer = 
https.createServer(await keyCert(), fakeSearchEngine); 35 | 36 | proxy = Proxy(); 37 | proxy.onRequest((ctx, callback) => { 38 | ctx.proxyToServerRequestOptions.host = 'localhost'; 39 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort; 40 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 41 | debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); 42 | return callback(); 43 | }); 44 | 45 | await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); 46 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 47 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 48 | debug('Fake http search engine servers started'); 49 | }); 50 | 51 | after(function(){ 52 | httpsServer.close(); 53 | httpServer.close(); 54 | proxy.close(); 55 | }); 56 | 57 | describe('user_agent', function(){ 58 | 59 | class MockScraperTestUserAgent extends Scraper { 60 | 61 | async load_start_page(){ 62 | return true; 63 | } 64 | 65 | async search_keyword(){ 66 | await this.page.goto('http://localhost:' + httpPort + '/test-user_agent'); 67 | } 68 | 69 | async parse_async(){ 70 | const bodyHandle = await this.page.$('body'); 71 | return await this.page.evaluate(body => body.innerHTML, bodyHandle); 72 | } 73 | } 74 | 75 | const testLogger = createLogger({ 76 | transports: [ 77 | new transports.Console({ 78 | level: 'error' 79 | }) 80 | ] 81 | }); 82 | 83 | /** 84 | * Test user_agent option 85 | */ 86 | it('fixed user_agent', async function () { 87 | 88 | const scrape_job = { 89 | search_engine: MockScraperTestUserAgent, 90 | keywords: ['javascript is hard'], 91 | }; 92 | 93 | var scraper = new se_scraper.ScrapeManager({ 94 | throw_on_detection: true, 95 | logger: testLogger, 96 | user_agent: 'THIS IS A USERAGENT 42.0' 97 | }); 98 | await scraper.start(); 99 | 100 | const { results } = await scraper.scrape(scrape_job); 101 | assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0'); 102 | 103 | await scraper.quit(); 104 | }); 105 | 106 | /** 107 | * Test random_user_agent option 108 | * TODO generated user_agent should be different for each keyword 109 | * TODO this test will sometimes fail because user_agent not very random :-( 110 | */ 111 | it('random_user_agent', async function () { 112 | 113 | const scrape_job = { 114 | search_engine: MockScraperTestUserAgent, 115 | keywords: ['news'], 116 | }; 117 | 118 | const NUMBER_OF_EXEC = 10; 119 | 120 | const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => { 121 | const scraper = new se_scraper.ScrapeManager({ 122 | throw_on_detection: true, 123 | logger: testLogger, 124 | random_user_agent: true, 125 | }); 126 | await scraper.start(); 127 | const { results: { news } } = await scraper.scrape(scrape_job); 128 | await scraper.quit(); 129 | return news['1']; 130 | }); 131 | 132 | uaList.forEach((userAgent) => { 133 | const uaParsed = UAParser(userAgent); 134 | assert(uaParsed.browser.name, 'UserAgent should have a browser name detected'); 135 | assert(uaParsed.os.name, 'UserAgent should have a os name detected'); 136 | }); 137 | 138 | assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' ); 139 | 140 | }); 141 | 142 | }); 143 | 144 | }); --------------------------------------------------------------------------------
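// A minimal end-to-end sketch of the public API, assembled from the calls the
// test files above exercise (ScrapeManager, start(), scrape(), quit()). The
// option values are illustrative only; see the defaults in src/node_scraper.js
// for the full set of configuration keys.
'use strict';
const se_scraper = require('se-scraper');

(async () => {
    const scraper = new se_scraper.ScrapeManager({
        search_engine: 'google',
        num_pages: 1,
        headless: true,
        output_file: 'results.json', // optional: also write the results to disk as JSON
    });

    await scraper.start();

    // scrape() requires either `keywords` or `keyword_file`
    const { results, metadata } = await scraper.scrape({
        keywords: ['test keyword'],
    });
    console.log(JSON.stringify(results, null, 2));
    console.log('requests made:', metadata.num_requests);

    await scraper.quit();
})();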