├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── TODO.md ├── examples ├── bing_de.json ├── bing_multiple_browser_multiple_pages.js ├── cleaned_html.js ├── custom_scraper.js ├── delete_comments.js ├── detection_checker.js ├── for_the_lulz.js ├── gimage.js ├── gnold.js ├── google_maps.js ├── headless-test-result.png ├── keywords.txt ├── minimal.js ├── multiple_browsers.js ├── multiple_search_engines.js ├── multiple_tabs.js ├── per_page_proxy.js ├── pluggable.js ├── pluggable_example.js ├── proxies.js ├── quickstart.js ├── reusing.js ├── test_cluster.js ├── test_promise.js └── test_proxyflag.js ├── index.js ├── jformat.py ├── package-lock.json ├── package.json ├── run.js ├── se-scraper.iml ├── src ├── captcha_solver.js ├── concurrency-implementation.js ├── modules │ ├── bing.js │ ├── duckduckgo.js │ ├── google.js │ ├── infospace.js │ ├── metadata.js │ ├── se_scraper.js │ └── yandex.js └── node_scraper.js └── test ├── html_output.js ├── mocks ├── bing │ ├── index.html │ ├── test keyword_page1.html │ ├── test keyword_page2.html │ └── test keyword_page3.html ├── duckduckgo │ ├── index.html │ ├── test keyword_page1.html │ ├── test keyword_page2.html │ └── test keyword_page3.html └── google │ ├── index.html │ ├── test keyword_page1.html │ ├── test keyword_page2.html │ └── test keyword_page3.html ├── modules ├── bing.js ├── duckduckgo.js └── google.js ├── proxy.js └── user_agent.js /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore static tests 2 | 3 | test/static_tests/html/ 4 | test/static_tests/html/* 5 | 6 | .idea 7 | 8 | # ignore data 9 | 10 | examples/data/ 11 | examples/data/* 12 | 13 | examples/results/ 14 | examples/results/* 15 | 16 | 17 | # Logs 18 | logs 19 | *.log 20 | npm-debug.log* 21 | yarn-debug.log* 22 | yarn-error.log* 23 | 24 | # Runtime data 25 | pids 26 | *.pid 27 | *.seed 28 | *.pid.lock 29 | 30 | # Directory for instrumented libs generated by jscoverage/JSCover 31 | lib-cov 32 | 33 | # Coverage directory used by tools like istanbul 34 | coverage 35 | 36 | # nyc test coverage 37 | .nyc_output 38 | 39 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 40 | .grunt 41 | 42 | # Bower dependency directory (https://bower.io/) 43 | bower_components 44 | 45 | # node-waf configuration 46 | .lock-wscript 47 | 48 | # Compiled binary addons (https://nodejs.org/api/addons.html) 49 | build/Release 50 | 51 | # Dependency directories 52 | node_modules/ 53 | jspm_packages/ 54 | 55 | # TypeScript v1 declaration files 56 | typings/ 57 | 58 | # Optional npm cache directory 59 | .npm 60 | 61 | # Optional eslint cache 62 | .eslintcache 63 | 64 | # Optional REPL history 65 | .node_repl_history 66 | 67 | # Output of 'npm pack' 68 | *.tgz 69 | 70 | # Yarn Integrity file 71 | .yarn-integrity 72 | 73 | # dotenv environment variables file 74 | .env 75 | 76 | # next.js build output 77 | .next 78 | 79 | 80 | .idea/ 81 | GoogleScraperPup.iml 82 | 83 | .http-mitm-proxy 84 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NikolaiT/se-scraper/5a0eea201dbeac7c9db4163eaa485bf4cd64f47d/.gitmodules -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of 
Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at contact@scrapeulous.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:10-slim 2 | 3 | # Application parameters and variables 4 | # ENV NODE_ENV=production 5 | ENV HOST=0.0.0.0 6 | ENV PORT=3000 7 | ENV application_directory=/se-scraper 8 | ENV puppeteer_cluster_directory=/se-scraper/src/puppeteer-cluster 9 | 10 | # Create app directory 11 | WORKDIR $application_directory 12 | 13 | RUN apt-get update && \ 14 | apt-get install -y \ 15 | gconf-service \ 16 | libasound2 \ 17 | libatk1.0-0 \ 18 | libc6 \ 19 | libcairo2 \ 20 | libcups2 \ 21 | libdbus-1-3 \ 22 | libexpat1 \ 23 | libfontconfig1 \ 24 | libgcc1 \ 25 | libgconf-2-4 \ 26 | libgdk-pixbuf2.0-0 \ 27 | libglib2.0-0 \ 28 | libgtk-3-0 \ 29 | libnspr4 \ 30 | libpango-1.0-0 \ 31 | libpangocairo-1.0-0 \ 32 | libstdc++6 \ 33 | libx11-6 \ 34 | libx11-xcb1 \ 35 | libxcb1 \ 36 | libxcomposite1 \ 37 | libxcursor1 \ 38 | libxdamage1 \ 39 | libxext6 \ 40 | libxfixes3 \ 41 | libxi6 \ 42 | libxrandr2 \ 43 | libxrender1 \ 44 | libxss1 \ 45 | libxtst6 \ 46 | ca-certificates \ 47 | fonts-liberation \ 48 | libappindicator1 \ 49 | libnss3 \ 50 | lsb-release \ 51 | xdg-utils \ 52 | wget 53 | 54 | # Bundle app source 55 | COPY . . 56 | WORKDIR $puppeteer_cluster_directory 57 | RUN npm install \ 58 | && npm run build 59 | 60 | WORKDIR $application_directory 61 | # skip installing scripts for puppeteer dependencies 62 | # we've already installed puppeteer above. 63 | RUN npm install --ignore-scripts 64 | 65 | # Cleanup 66 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 67 | 68 | ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.2/dumb-init_1.2.2_amd64 /usr/local/bin/dumb-init 69 | RUN chmod +x /usr/local/bin/dumb-init 70 | 71 | EXPOSE $PORT 72 | 73 | CMD ["dumb-init", "node", "server/server.js"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 Nikolai Tschacher 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [The maintained successor of se-scraper is the general purpose crawling infrastructure](https://github.com/NikolaiT/Crawling-Infrastructure) 2 | 3 | ## Search Engine Scraper - se-scraper 4 | 5 | [![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper) 6 | [![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas) 7 | [![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper) 8 | 9 | This node module allows you to scrape search engines concurrently with different proxies. 10 | 11 | If you don't have extensive technical experience or don't want to purchase proxies, you can use [my scraping service](https://scrapeulous.com/). 12 | 13 | #### Table of Contents 14 | - [Installation](#installation) 15 | - [Docker](#docker-support) 16 | - [Minimal Example](#minimal-example) 17 | - [Quickstart](#quickstart) 18 | - [Contribute](#contribute) 19 | - [Using Proxies](#proxies) 20 | - [Custom Scrapers](#custom-scrapers) 21 | - [Examples](#examples) 22 | - [Scraping Model](#scraping-model) 23 | - [Technical Notes](#technical-notes) 24 | - [Advanced Usage](#advanced-usage) 25 | - [Special Query String Parameters for Search Engines](#query-string-parameters) 26 | 27 | 28 | Se-scraper supports the following search engines: 29 | * Google 30 | * Google News 31 | * Google News App version (https://news.google.com) 32 | * Google Image 33 | * Bing 34 | * Bing News 35 | * Infospace 36 | * Duckduckgo 37 | * Yandex 38 | * Webcrawler 39 | 40 | This module uses puppeteer and a modified version of [puppeteer-cluster](https://github.com/thomasdondorf/puppeteer-cluster/). It was created by the Developer of [GoogleScraper](https://github.com/NikolaiT/GoogleScraper), a module with 1800 Stars on Github. 41 | 42 | ## Installation 43 | 44 | You need a working installation of **node** and the **npm** package manager. 45 | 46 | 47 | For example, if you are using Ubuntu 18.04, you can install node and npm with the following commands: 48 | 49 | ```bash 50 | sudo apt update; 51 | 52 | sudo apt install nodejs; 53 | 54 | # recent version of npm 55 | curl -sL https://deb.nodesource.com/setup_10.x -o nodesource_setup.sh; 56 | sudo bash nodesource_setup.sh; 57 | sudo apt install npm; 58 | ``` 59 | 60 | Chrome and puppeteer [need some additional libraries to run on ubuntu](https://techoverflow.net/2018/06/05/how-to-fix-puppetteer-error-). 61 | 62 | This command will install dependencies: 63 | 64 | ```bash 65 | # install all that is needed by chromium browser. 
Maybe not everything needed 66 | sudo apt-get install gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget; 67 | ``` 68 | 69 | Install **se-scraper** by entering the following command in your terminal 70 | 71 | ```bash 72 | npm install se-scraper 73 | ``` 74 | 75 | If you **don't** want puppeteer to download a complete chromium browser, add this variable to your environment. Then this module is not guaranteed to run out of the box. 76 | 77 | ```bash 78 | export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 79 | ``` 80 | 81 | ### Docker Support 82 | 83 | I will maintain a public docker image of se-scraper. Pull the docker image with the command: 84 | 85 | ```bash 86 | docker pull tschachn/se-scraper 87 | ``` 88 | 89 | Confirm that the docker image was correctly pulled: 90 | 91 | ```bash 92 | docker image ls 93 | ``` 94 | 95 | Should show something like that: 96 | 97 | ``` 98 | tschachn/se-scraper latest 897e1aeeba78 21 minutes ago 1.29GB 99 | ``` 100 | 101 | You can check the [latest tag here](https://hub.docker.com/r/tschachn/se-scraper/tags). In the example below, the latest tag is **latest**. This will most likely remain **latest** in the future. 102 | 103 | Run the docker image and map the internal port 3000 to the external 104 | port 3000: 105 | 106 | ```bash 107 | $ docker run -p 3000:3000 tschachn/se-scraper:latest 108 | 109 | Running on http://0.0.0.0:3000 110 | ``` 111 | 112 | When the image is running, you may start scrape jobs via HTTP API: 113 | 114 | ```bash 115 | curl -XPOST http://0.0.0.0:3000 -H 'Content-Type: application/json' \ 116 | -d '{ 117 | "browser_config": { 118 | "random_user_agent": true 119 | }, 120 | "scrape_config": { 121 | "search_engine": "google", 122 | "keywords": ["test"], 123 | "num_pages": 1 124 | } 125 | }' 126 | ``` 127 | 128 | Many thanks goes to [slotix](https://github.com/NikolaiT/se-scraper/pull/21) for his tremendous help in setting up a docker image. 129 | 130 | 131 | ## Minimal Example 132 | 133 | Create a file named `minimal.js` with the following contents 134 | 135 | ```js 136 | const se_scraper = require('se-scraper'); 137 | 138 | (async () => { 139 | let scrape_job = { 140 | search_engine: 'google', 141 | keywords: ['lets go boys'], 142 | num_pages: 1, 143 | }; 144 | 145 | var results = await se_scraper.scrape({}, scrape_job); 146 | 147 | console.dir(results, {depth: null, colors: true}); 148 | })(); 149 | ``` 150 | 151 | Start scraping by firing up the command `node minimal.js` 152 | 153 | ## Quickstart 154 | 155 | Create a file named `run.js` with the following contents 156 | 157 | ```js 158 | const se_scraper = require('se-scraper'); 159 | 160 | (async () => { 161 | let browser_config = { 162 | debug_level: 1, 163 | output_file: 'examples/results/data.json', 164 | }; 165 | 166 | let scrape_job = { 167 | search_engine: 'google', 168 | keywords: ['news', 'se-scraper'], 169 | num_pages: 1, 170 | // add some cool google search settings 171 | google_settings: { 172 | gl: 'us', // The gl parameter determines the Google country to use for the query. 173 | hl: 'en', // The hl parameter determines the Google UI language to return results. 
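        // Note: gl expects a two-letter country code (e.g. 'us', 'de') and hl a language
        // code (e.g. 'en', 'fr'); both end up as query-string parameters on the Google URL
        // (see the "Query String Parameters" section below).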
174 | start: 0, // Determines the results offset to use, defaults to 0. 175 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 176 | }, 177 | }; 178 | 179 | var scraper = new se_scraper.ScrapeManager(browser_config); 180 | 181 | await scraper.start(); 182 | 183 | var results = await scraper.scrape(scrape_job); 184 | 185 | console.dir(results, {depth: null, colors: true}); 186 | 187 | await scraper.quit(); 188 | })(); 189 | ``` 190 | 191 | Start scraping by firing up the command `node run.js` 192 | 193 | ## Contribute 194 | 195 | I really need and love your help! However, scraping is a dirty business and it often takes me a lot of time to find failing selectors or missing JS logic. So if any search engine does not yield results to your liking, please create a **static test case** similar to [this static test of google](test/static_tests/google.js) that fails. I will try to correct se-scraper then. 196 | 197 | That's how you would proceed: 198 | 199 | 1. Copy the [static google test case](test/static_tests/google.js) 200 | 2. Remove all unnecessary testing code 201 | 3. Save a search to a file where se-scraper does not work correctly. 202 | 4. Implement the static test case using the saved search html where se-scraper currently fails. 203 | 5. Submit a new issue with the failing test case as a pull request 204 | 6. I will fix it! (or better: you submit a pull request directly) 205 | 206 | ## Proxies 207 | 208 | **se-scraper** will create one browser instance per proxy. So the maximum concurrency is the number of proxies plus one (your own IP). 209 | 210 | ```js 211 | const se_scraper = require('se-scraper'); 212 | 213 | (async () => { 214 | let browser_config = { 215 | debug_level: 1, 216 | output_file: 'examples/results/proxyresults.json', 217 | proxy_file: '/home/nikolai/.proxies', // one proxy per line 218 | log_ip_address: true, 219 | }; 220 | 221 | let scrape_job = { 222 | search_engine: 'google', 223 | keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'], 224 | num_pages: 1, 225 | }; 226 | 227 | var scraper = new se_scraper.ScrapeManager(browser_config); 228 | await scraper.start(); 229 | 230 | var results = await scraper.scrape(scrape_job); 231 | console.dir(results, {depth: null, colors: true}); 232 | await scraper.quit(); 233 | })(); 234 | ``` 235 | 236 | With a proxy file such as 237 | 238 | ```text 239 | socks5://53.34.23.55:55523 240 | socks4://51.11.23.22:22222 241 | ``` 242 | 243 | This will scrape with **three** browser instances, each with its own IP address. Unfortunately, it is currently not possible to scrape with different proxies per tab. Chromium does not support that. 244 | 245 | 246 | ## Custom Scrapers 247 | 248 | You can define your own scraper class and use it within se-scraper. 249 | 250 | [Check out this example](examples/custom_scraper.js), which defines a custom scraper for Ecosia. A minimal skeleton is sketched below.
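The sketch below shows the shape such a class takes, modeled on the Ecosia scraper in `examples/custom_scraper.js`: extend `se_scraper.Scraper`, implement the page-handling and parsing hooks, and pass the class as `search_engine` in the scrape job. The start URL and CSS selectors are placeholders for whatever site you target, so treat this as a starting point rather than a working scraper.

```js
const se_scraper = require('se-scraper');

class MyEngineScraper extends se_scraper.Scraper {

    // Navigate to the start page and confirm the search box is present.
    async load_start_page() {
        await this.page.goto('https://www.example-search.com/'); // placeholder URL
        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }
        return true;
    }

    // Type the keyword into the search box and submit the form.
    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value('input[name="q"]', keyword);
        await input.focus();
        await this.page.keyboard.press('Enter');
    }

    // Wait until the result list has rendered.
    async wait_for_results() {
        await this.page.waitForSelector('.result', { timeout: this.STANDARD_TIMEOUT }); // placeholder selector
    }

    // Extract the interesting data from the rendered SERP.
    async parse_async(html) {
        return await this.page.evaluate(() => {
            const results = [];
            document.querySelectorAll('.result').forEach((el) => { // placeholder selector
                results.push({ title: el.innerText });
            });
            return { results: results, no_results: results.length === 0 };
        });
    }

    // Return false when there is no further result page.
    async next_page() {
        const next = await this.page.$('.pagination-next'); // placeholder selector
        if (!next) {
            return false;
        }
        await next.click();
        return true;
    }

    // Optionally check whether the target site detected the scraper.
    async detected() {
    }
}

(async () => {
    let results = await se_scraper.scrape({ headless: true }, {
        search_engine: MyEngineScraper,
        keywords: ['se-scraper'],
        num_pages: 1,
    });
    console.dir(results, {depth: null, colors: true});
})();
```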
251 | 252 | 253 | ## Examples 254 | 255 | * [Reuse existing browser](examples/multiple_search_engines.js) yields [these results](examples/results/multiple_search_engines.json) 256 | * [Simple example scraping google](examples/quickstart.js) yields [these results](examples/results/data.json) 257 | * [Scrape with one proxy per browser](examples/proxies.js) yields [these results](examples/results/proxyresults.json) 258 | * [Scrape 100 keywords on Bing with multiple tabs in one browser](examples/multiple_tabs.js) produces [this](examples/results/bing.json) 259 | * [Inject your own scraping logic](examples/pluggable.js) 260 | * [For the Lulz: Scraping google dorks for SQL injection vulnerabilities and confirming them.](examples/for_the_lulz.js) 261 | * [Scrape google maps/locations](examples/google_maps.js) yields [these results](examples/results/maps.json) 262 | 263 | 264 | ## Scraping Model 265 | 266 | **se-scraper** scrapes search engines only. In order to introduce concurrency into this library, it is necessary to define the scraping model. Then we can decide how we divide and conquer. 267 | 268 | #### Scraping Resources 269 | 270 | What are common scraping resources? 271 | 272 | 1. **Memory and CPU**. Necessary to launch multiple browser instances. 273 | 2. **Network Bandwidth**. Usually not the bottleneck. 274 | 3. **IP Addresses**. Websites often block IP addresses after a certain number of requests from the same IP address. This can be circumvented by using proxies. 275 | 4. Spoofable identifiers such as browser fingerprints or user agents. Those will be handled by **se-scraper**. 276 | 277 | #### Concurrency Model 278 | 279 | **se-scraper** should be able to run without any concurrency at all. This is the default case. No concurrency means only one browser/tab is searching at a time. 280 | 281 | For concurrent use, we will make use of a modified [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster). 282 | 283 | One scrape job is properly defined by 284 | 285 | * 1 search engine such as `google` 286 | * `M` pages 287 | * `N` keywords/queries 288 | * `K` proxies and `K+1` browser instances (because when we have no proxies available, we will scrape with our dedicated IP) 289 | 290 | Then **se-scraper** will create `K+1` dedicated browser instances, each with a unique IP address. Each browser will get `N/(K+1)` keywords and will issue `N/(K+1) * M` total requests to the search engine. 291 | 292 | The problem is that the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) only allows identical options for subsequent new browser instances. Therefore, it is not trivial to launch a cluster of browsers with distinct proxy settings. Right now, every browser has the same options. It's not possible to set options on a per-browser basis. 293 | 294 | Solution: 295 | 296 | 1. Create an [upstream proxy router](https://github.com/GoogleChrome/puppeteer/issues/678). 297 | 2. Modify the [puppeteer-cluster library](https://github.com/thomasdondorf/puppeteer-cluster) to accept a list of proxy strings and then pop() from this list at every new call to `workerInstance()` in [Cluster.ts](https://github.com/thomasdondorf/puppeteer-cluster/blob/master/src/Cluster.ts). I wrote an [issue here](https://github.com/thomasdondorf/puppeteer-cluster/issues/107). **I ended up doing this**. 298 | 299 | 300 | ## Technical Notes 301 | 302 | Scraping is done with a headless chromium browser using the automation library puppeteer.
Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol. 303 | 304 | If you need to deploy scraping to the cloud (AWS or Azure), you can contact me at **hire@incolumitas.com**. 305 | 306 | The chromium browser is started with the following flags to prevent 307 | scraping detection. 308 | 309 | ```js 310 | var ADDITIONAL_CHROME_FLAGS = [ 311 | '--disable-infobars', 312 | '--window-position=0,0', 313 | '--ignore-certifcate-errors', 314 | '--ignore-certifcate-errors-spki-list', 315 | '--no-sandbox', 316 | '--disable-setuid-sandbox', 317 | '--disable-dev-shm-usage', 318 | '--disable-accelerated-2d-canvas', 319 | '--disable-gpu', 320 | '--window-size=1920x1080', 321 | '--hide-scrollbars', 322 | '--disable-notifications', 323 | ]; 324 | ``` 325 | 326 | Furthermore, to avoid loading unnecessary resources and to speed up 327 | scraping a great deal, we instruct chrome not to load images, css and media: 328 | 329 | ```js 330 | await page.setRequestInterception(true); 331 | page.on('request', (req) => { 332 | let type = req.resourceType(); 333 | const block = ['stylesheet', 'font', 'image', 'media']; 334 | if (block.includes(type)) { 335 | req.abort(); 336 | } else { 337 | req.continue(); 338 | } 339 | }); 340 | ``` 341 | 342 | #### Making puppeteer and headless chrome undetectable 343 | 344 | Consider the following resources: 345 | 346 | * https://antoinevastel.com/bot%20detection/2019/07/19/detecting-chrome-headless-v3.html 347 | * https://intoli.com/blog/making-chrome-headless-undetectable/ 348 | * https://intoli.com/blog/not-possible-to-block-chrome-headless/ 349 | * https://news.ycombinator.com/item?id=16179602 350 | 351 | **se-scraper** implements the countermeasures against headless chrome detection proposed on those sites. 352 | 353 | The most recent detection countermeasures can be found here: 354 | 355 | * https://github.com/paulirish/headless-cat-n-mouse/blob/master/apply-evasions.js 356 | 357 | **se-scraper** makes use of those anti-detection techniques. 358 | 359 | To check whether evasion works, you can test it by passing the `test_evasion` flag to the config: 360 | 361 | ```js 362 | let config = { 363 | // check if headless chrome escapes common detection techniques 364 | test_evasion: true 365 | }; 366 | ``` 367 | 368 | It will create a screenshot named `headless-test-result.png` in the directory where the scraper was started that shows whether all tests have passed. 369 | 370 | ## Advanced Usage 371 | 372 | Use **se-scraper** by calling it with a script such as the one below. 373 | 374 | ```js 375 | const se_scraper = require('se-scraper'); 376 | 377 | // those options need to be provided on startup 378 | // and cannot be given to se-scraper on scrape() calls 379 | let browser_config = { 380 | // the user agent to scrape with 381 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', 382 | // if random_user_agent is set to True, a random user agent is chosen 383 | random_user_agent: false, 384 | // whether to select manual settings in visible mode 385 | set_manual_settings: false, 386 | // log ip address data 387 | log_ip_address: false, 388 | // log http headers 389 | log_http_headers: false, 390 | // how long to sleep between requests. a random sleep interval within the range [a,b] 391 | // is drawn before every request. empty string for no sleeping.
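    // e.g. sleep_range: '[1,5]' should sleep between 1 and 5 seconds before each request
    // (the exact string format is an assumption inferred from the [a,b] notation above)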
392 | sleep_range: '', 393 | // which search engine to scrape 394 | search_engine: 'google', 395 | compress: false, // compress 396 | // whether debug information should be printed 397 | // level 0: print nothing 398 | // level 1: print most important info 399 | // ... 400 | // level 4: print all shit nobody wants to know 401 | debug_level: 1, 402 | keywords: ['nodejs rocks',], 403 | // whether to start the browser in headless mode 404 | headless: true, 405 | // specify flags passed to chrome here 406 | chrome_flags: [], 407 | // the number of pages to scrape for each keyword 408 | num_pages: 1, 409 | // path to output file, data will be stored in JSON 410 | output_file: '', 411 | // whether to also passthru all the html output of the serp pages 412 | html_output: false, 413 | // whether to return a screenshot of serp pages as b64 data 414 | screen_output: false, 415 | // whether to prevent images, css, fonts and media from being loaded 416 | // will speed up scraping a great deal 417 | block_assets: true, 418 | // path to js module that extends functionality 419 | // this module should export the functions: 420 | // get_browser, handle_metadata, close_browser 421 | //custom_func: resolve('examples/pluggable.js'), 422 | custom_func: '', 423 | throw_on_detection: false, 424 | // use a proxy for all connections 425 | // example: 'socks5://78.94.172.42:1080' 426 | // example: 'http://118.174.233.10:48400' 427 | proxy: '', 428 | // a file with one proxy per line. Example: 429 | // socks5://78.94.172.42:1080 430 | // http://118.174.233.10:48400 431 | proxy_file: '', 432 | // whether to use proxies only 433 | // when this is set to true, se-scraper will not use 434 | // your default IP address 435 | use_proxies_only: false, 436 | // check if headless chrome escapes common detection techniques 437 | // this is a quick test and should be used for debugging 438 | test_evasion: false, 439 | apply_evasion_techniques: true, 440 | // settings for puppeteer-cluster 441 | puppeteer_cluster_config: { 442 | timeout: 30 * 60 * 1000, // max timeout set to 30 minutes 443 | monitor: false, 444 | concurrency: Cluster.CONCURRENCY_BROWSER, 445 | maxConcurrency: 1, 446 | } 447 | }; 448 | 449 | (async () => { 450 | // scrape config can change on each scrape() call 451 | let scrape_config = { 452 | // which search engine to scrape 453 | search_engine: 'google', 454 | // an array of keywords to scrape 455 | keywords: ['cat', 'mouse'], 456 | // the number of pages to scrape for each keyword 457 | num_pages: 2, 458 | 459 | // OPTIONAL PARAMS BELOW: 460 | google_settings: { 461 | gl: 'us', // The gl parameter determines the Google country to use for the query. 462 | hl: 'fr', // The hl parameter determines the Google UI language to return results. 463 | start: 0, // Determines the results offset to use, defaults to 0. 464 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 465 | }, 466 | // instead of keywords you can specify a keyword_file. this overwrites the keywords array 467 | keyword_file: '', 468 | // how long to sleep between requests. a random sleep interval within the range [a,b] 469 | // is drawn before every request. empty string for no sleeping. 
470 | sleep_range: '', 471 | // path to output file, data will be stored in JSON 472 | output_file: 'output.json', 473 | // whether to prevent images, css, fonts from being loaded 474 | // will speed up scraping a great deal 475 | block_assets: false, 476 | // check if headless chrome escapes common detection techniques 477 | // this is a quick test and should be used for debugging 478 | test_evasion: false, 479 | apply_evasion_techniques: true, 480 | // log ip address data 481 | log_ip_address: false, 482 | // log http headers 483 | log_http_headers: false, 484 | }; 485 | 486 | let results = await se_scraper.scrape(browser_config, scrape_config); 487 | console.dir(results, {depth: null, colors: true}); 488 | })(); 489 | ``` 490 | 491 | [Output for the above script on my machine.](examples/results/advanced.json) 492 | 493 | ### Query String Parameters 494 | 495 | You can add your custom query string parameters to the configuration object by specifying a `google_settings` key. In general: `{{search engine}}_settings`. 496 | 497 | For example you can customize your google search with the following config: 498 | 499 | ```js 500 | let scrape_config = { 501 | search_engine: 'google', 502 | // use specific search engine parameters for various search engines 503 | google_settings: { 504 | google_domain: 'google.com', 505 | gl: 'us', // The gl parameter determines the Google country to use for the query. 506 | hl: 'us', // The hl parameter determines the Google UI language to return results. 507 | start: 0, // Determines the results offset to use, defaults to 0. 508 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 509 | }, 510 | } 511 | ``` 512 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ### 24.12.2018 2 | - fix interface to scrape() [DONE] 3 | - add to Github 4 | 5 | 6 | ### 24.1.2018 7 | - fix issue #3: add functionality to add keyword file 8 | 9 | ### 27.1.2019 10 | - Add functionality to block images and CSS from loading as described here: 11 | https://www.scrapehero.com/how-to-increase-web-scraping-speed-using-puppeteer/ 12 | https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/ 13 | 14 | ### 29.1.2019 15 | - implement proxy support functionality 16 | - implement proxy check 17 | 18 | - implement scraping more than 1 page 19 | - do it for google 20 | - and bing 21 | - implement duckduckgo scraping 22 | 23 | 24 | ### 30.1.2019 25 | - modify all scrapers to use the generic class where it makes sense 26 | - Bing, Baidu, Google, Duckduckgo 27 | 28 | ### 7.2.2019 29 | - add num_requests to test cases [done] 30 | 31 | ### 25.2.2019 32 | - https://antoinevastel.com/crawler/2018/09/20/parallel-crawler-puppeteer.html 33 | - add support for browsing with multiple browsers, use this neat library: 34 | - https://github.com/thomasdondorf/puppeteer-cluster [done] 35 | 36 | 37 | ### 28.2.2019 38 | - write test case for multiple browsers/proxies 39 | - write test case and example for multiple tabs with bing 40 | - make README.md nicer. 
https://github.com/thomasdondorf/puppeteer-cluster/blob/master/README.md as template 41 | 42 | 43 | ### 11.6.2019 44 | - TODO: fix amazon scraping 45 | - change api of remaining test cases [done] 46 | - TODO: implement custom search engine parameters on scrape() 47 | 48 | ### 12.6.2019 49 | - remove unnecessary sleep() calls and replace with waitFor selectors 50 | 51 | 52 | ### 16.7.2019 53 | 54 | - resolve issues 55 | - fix this https://github.com/NikolaiT/se-scraper/issues/37 [done] 56 | 57 | - use puppeteer stealth plugin: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth 58 | 59 | - we will need to load at the concurrency impl of puppeteer-cluster [no typescript support :(), I will not support this right now] 60 | 61 | - user random user agents plugin: https://github.com/intoli/user-agents [done] 62 | 63 | - add screenshot capability (make the screen after parsing) 64 | - store as b64 [done] 65 | 66 | 67 | 68 | ### 12.8.2019 69 | 70 | - add static test case for bing [done] 71 | - add options that minimize `html_output` flag: 72 | `clean_html_output` will remove all JS and CSS from the html 73 | `clean_data_images` removes all data images from the html 74 | [done] 75 | 76 | 77 | ### 13.8.2019 78 | - Write test case for clean html output [done] 79 | - Consider better compression algorithm. [done] There is the brotli algorithm, but this is only supported 80 | in very recent versions of nodejs 81 | - what else can we remove from the dom [done] Removing comment nodes now! They are large in BING. 82 | - remove all whitespace and \n and \t from html 83 | 84 | ### TODO: 85 | 1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done] 86 | 2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions 87 | 88 | 3. dont create a new tab when opening a new scraper 89 | -------------------------------------------------------------------------------- /examples/bing_multiple_browser_multiple_pages.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | var path = require('path'); 3 | var os = require("os"); 4 | 5 | const se_scraper = require('./../index.js'); 6 | var filepath_de = path.join(__dirname, '/data/keywords_de.txt'); 7 | 8 | function read_keywords_from_file(fpath) { 9 | let kws = fs.readFileSync(fpath).toString().split(os.EOL); 10 | // clean keywords 11 | kws = kws.filter((kw) => { 12 | return kw.trim().length > 0; 13 | }); 14 | return kws; 15 | } 16 | 17 | let keywords_de = read_keywords_from_file(filepath_de); 18 | 19 | const Cluster = { 20 | CONCURRENCY_PAGE: 1, // shares cookies, etc. 
21 | CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts) 22 | CONCURRENCY_BROWSER: 3, // no cookie sharing and individual processes (uses contexts) 23 | }; 24 | 25 | // those options need to be provided on startup 26 | // and cannot give to se-scraper on scrape() calls 27 | let browser_config = { 28 | // the user agent to scrape with 29 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', 30 | // if random_user_agent is set to True, a random user agent is chosen 31 | random_user_agent: true, 32 | verbose: true, 33 | // whether to start the browser in headless mode 34 | headless: true, 35 | is_local: false, 36 | throw_on_detection: false, 37 | puppeteer_cluster_config: { 38 | headless: true, 39 | timeout: 2 * 60 * 1000, // max timeout set to 2 minutes 40 | monitor: false, 41 | concurrency: 3, // one scraper per tab 42 | maxConcurrency: 3, // scrape with 5 tabs 43 | } 44 | }; 45 | 46 | (async () => { 47 | // scrape config can change on each scrape() call 48 | let scrape_config_bing_de = { 49 | // which search engine to scrape 50 | search_engine: 'bing', 51 | // an array of keywords to scrape 52 | keywords: keywords_de, 53 | // the number of pages to scrape for each keyword 54 | num_pages: 10, 55 | 56 | // OPTIONAL PARAMS BELOW: 57 | // https://docs.microsoft.com/en-us/rest/api/cognitiveservices-bingsearch/bing-web-api-v5-reference#query-parameters 58 | bing_settings: { 59 | cc: 'DE', // The cc parameter determines the country to use for the query. 60 | mkt: 'de-DE', // The mkt parameter determines the UI language to return results. 61 | offset: 0, // Determines the results offset to use, defaults to 0. 62 | count: 20, // Determines the number of results to show, defaults to 10. Maximum is 100. 63 | }, 64 | // how long to sleep between requests. a random sleep interval within the range [a,b] 65 | // is drawn before every request. empty string for no sleeping. 
66 | sleep_range: '', 67 | // path to output file, data will be stored in JSON 68 | output_file: 'examples/bing_de.json', 69 | // whether to prevent images, css, fonts from being loaded 70 | // will speed up scraping a great deal 71 | block_assets: true, 72 | // check if headless chrome escapes common detection techniques 73 | // this is a quick test and should be used for debugging 74 | test_evasion: false, 75 | apply_evasion_techniques: true, 76 | // log ip address data 77 | log_ip_address: false, 78 | // log http headers 79 | log_http_headers: false, 80 | }; 81 | 82 | let results = await se_scraper.scrape(browser_config, scrape_config_bing_de); 83 | console.dir(results.metadata, {depth: null, colors: true}); 84 | 85 | })(); -------------------------------------------------------------------------------- /examples/cleaned_html.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | const fs = require('fs'); 3 | 4 | (async () => { 5 | 6 | let kw = 'news iran' 7 | 8 | let scrape_job = { 9 | search_engine: 'baidu', 10 | keywords: [kw], 11 | num_pages: 1, 12 | html_output: true, 13 | // whether to strip JS and CSS from the html_output 14 | // has only an effect if `html_output` is true 15 | clean_html_output: true, 16 | // remove all data images from the html 17 | clean_data_images: true, 18 | }; 19 | 20 | var response = await se_scraper.scrape({}, scrape_job); 21 | 22 | console.dir(response, {depth: null, colors: true}); 23 | 24 | fs.writeFileSync('example_cleaned.html', response.results[kw]['1']['html']); 25 | })(); 26 | -------------------------------------------------------------------------------- /examples/custom_scraper.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | 3 | /* 4 | * This example shows how you can define your custom scraper class and use it 5 | * within se-scraper. 6 | */ 7 | class EcosiaScraper extends se_scraper.Scraper { 8 | 9 | constructor(...args) { 10 | super(...args); 11 | } 12 | 13 | async parse_async(html) { 14 | // In this example we use vanilla javascript to parse out the 15 | // interesting information from the search engine 16 | 17 | // you may also use a external library such as cheerio. 
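        // For example, a server-side parse with cheerio might look like this
        // (a sketch, assuming cheerio were installed; it is not used in this example):
        //   const cheerio = require('cheerio');
        //   const $ = cheerio.load(html);
        //   const titles = $('.results .result .result-title')
        //       .map((i, el) => $(el).text()).get();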
18 | 19 | return await this.page.evaluate(() => { 20 | var results = { 21 | num_results: '', 22 | no_results: false, 23 | effective_query: '', 24 | results: [], 25 | }; 26 | 27 | document.querySelectorAll('.results .result').forEach((result) => { 28 | var serp = {}; 29 | var title = result.querySelector('.result-title'); 30 | if (title) { 31 | serp.title = title.innerText; 32 | serp.link = title.getAttribute('href'); 33 | } 34 | 35 | var green = result.querySelector('.result-url'); 36 | if (green) { 37 | serp.green = green.getAttribute('href'); 38 | } 39 | 40 | var snippet = result.querySelector('.result-snippet'); 41 | 42 | if (snippet) { 43 | serp.snippet = snippet.innerText; 44 | } 45 | 46 | results.results.push(serp); 47 | }); 48 | 49 | var num_res = document.querySelector('.card-title-result-count'); 50 | if (num_res) { 51 | results.num_results = num_res.innerText; 52 | } 53 | 54 | results.no_results = document.querySelector('.empty-result') != null; 55 | 56 | var effective = document.querySelector('.query-context-text .result-title'); 57 | 58 | if (effective) { 59 | results.effective_query = effective.innerText; 60 | } 61 | 62 | return results; 63 | }); 64 | } 65 | 66 | async load_start_page() { 67 | let startUrl = 'https://www.ecosia.org/'; 68 | 69 | await this.page.goto(startUrl); 70 | 71 | try { 72 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 73 | } catch (e) { 74 | return false; 75 | } 76 | 77 | return true; 78 | } 79 | 80 | async search_keyword(keyword) { 81 | const input = await this.page.$('input[name="q"]'); 82 | await this.set_input_value(`input[name="q"]`, keyword); 83 | await this.sleep(50); 84 | await input.focus(); 85 | await this.page.keyboard.press("Enter"); 86 | } 87 | 88 | async next_page() { 89 | let next_page_link = await this.page.$('.pagination-next', {timeout: 1000}); 90 | if (!next_page_link) { 91 | return false; 92 | } 93 | await next_page_link.click(); 94 | 95 | return true; 96 | } 97 | 98 | async wait_for_results() { 99 | await this.page.waitForSelector('.results .result', { timeout: this.STANDARD_TIMEOUT }); 100 | } 101 | 102 | async detected() { 103 | // check whether scraping was detected. 104 | } 105 | } 106 | 107 | (async () => { 108 | 109 | let scrape_job = { 110 | search_engine: EcosiaScraper, 111 | keywords: ['lets go boys'], 112 | num_pages: 2, 113 | }; 114 | 115 | var results = await se_scraper.scrape({headless: true}, scrape_job); 116 | 117 | console.dir(results, {depth: null, colors: true}); 118 | 119 | })(); 120 | -------------------------------------------------------------------------------- /examples/delete_comments.js: -------------------------------------------------------------------------------- 1 | var nodeIterator = document.createNodeIterator( 2 | document.body, 3 | NodeFilter.SHOW_COMMENT, 4 | { acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } } 5 | ); 6 | 7 | // Remove all comment nodes 8 | while(nodeIterator.nextNode()){ 9 | var commentNode = nodeIterator.referenceNode; 10 | commentNode.remove(); 11 | } -------------------------------------------------------------------------------- /examples/detection_checker.js: -------------------------------------------------------------------------------- 1 | /* 2 | * See here for most recent detection avoidance: https://github.com/paulirish/headless-cat-n-mouse/blob/master/apply-evasions.js 3 | */ 4 | 5 | // We'll use Puppeteer is our browser automation framework. 
6 | const puppeteer = require('puppeteer'); 7 | 8 | // This is where we'll put the code to get around the tests. 9 | const preparePageForTests = async (page) => { 10 | // Pass the User-Agent Test. 11 | const userAgent = 'Mozilla/5.0 (X11; Linux x86_64)' + 12 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36'; 13 | await page.setUserAgent(userAgent); 14 | 15 | // Pass the Webdriver Test. 16 | await page.evaluateOnNewDocument(() => { 17 | const newProto = navigator.__proto__; 18 | delete newProto.webdriver; 19 | navigator.__proto__ = newProto; 20 | }); 21 | 22 | // Pass the Chrome Test. 23 | await page.evaluateOnNewDocument(() => { 24 | // We can mock this in as much depth as we need for the test. 25 | const mockObj = { 26 | app: { 27 | isInstalled: false, 28 | }, 29 | webstore: { 30 | onInstallStageChanged: {}, 31 | onDownloadProgress: {}, 32 | }, 33 | runtime: { 34 | PlatformOs: { 35 | MAC: 'mac', 36 | WIN: 'win', 37 | ANDROID: 'android', 38 | CROS: 'cros', 39 | LINUX: 'linux', 40 | OPENBSD: 'openbsd', 41 | }, 42 | PlatformArch: { 43 | ARM: 'arm', 44 | X86_32: 'x86-32', 45 | X86_64: 'x86-64', 46 | }, 47 | PlatformNaclArch: { 48 | ARM: 'arm', 49 | X86_32: 'x86-32', 50 | X86_64: 'x86-64', 51 | }, 52 | RequestUpdateCheckStatus: { 53 | THROTTLED: 'throttled', 54 | NO_UPDATE: 'no_update', 55 | UPDATE_AVAILABLE: 'update_available', 56 | }, 57 | OnInstalledReason: { 58 | INSTALL: 'install', 59 | UPDATE: 'update', 60 | CHROME_UPDATE: 'chrome_update', 61 | SHARED_MODULE_UPDATE: 'shared_module_update', 62 | }, 63 | OnRestartRequiredReason: { 64 | APP_UPDATE: 'app_update', 65 | OS_UPDATE: 'os_update', 66 | PERIODIC: 'periodic', 67 | }, 68 | }, 69 | }; 70 | 71 | window.navigator.chrome = mockObj; 72 | window.chrome = mockObj; 73 | }); 74 | 75 | // Pass the Permissions Test. 76 | await page.evaluateOnNewDocument(() => { 77 | const originalQuery = window.navigator.permissions.query; 78 | window.navigator.permissions.__proto__.query = parameters => 79 | parameters.name === 'notifications' 80 | ? Promise.resolve({state: Notification.permission}) 81 | : originalQuery(parameters); 82 | 83 | // Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js 84 | const oldCall = Function.prototype.call; 85 | function call() { 86 | return oldCall.apply(this, arguments); 87 | } 88 | Function.prototype.call = call; 89 | 90 | const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString"); 91 | const oldToString = Function.prototype.toString; 92 | 93 | function functionToString() { 94 | if (this === window.navigator.permissions.query) { 95 | return "function query() { [native code] }"; 96 | } 97 | if (this === functionToString) { 98 | return nativeToStringFunctionString; 99 | } 100 | return oldCall.call(oldToString, this); 101 | } 102 | Function.prototype.toString = functionToString; 103 | }); 104 | 105 | // Pass the Plugins Length Test. 106 | await page.evaluateOnNewDocument(() => { 107 | // Overwrite the `plugins` property to use a custom getter. 108 | Object.defineProperty(navigator, 'plugins', { 109 | // This just needs to have `length > 0` for the current test, 110 | // but we could mock the plugins too if necessary. 111 | get: () => [1, 2, 3, 4, 5] 112 | }); 113 | }); 114 | 115 | // Pass the Languages Test. 116 | await page.evaluateOnNewDocument(() => { 117 | // Overwrite the `plugins` property to use a custom getter. 
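        // It is the `languages` property that gets overridden below,
        // so navigator.languages reports a plausible non-empty list.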
118 | Object.defineProperty(navigator, 'languages', { 119 | get: () => ['en-US', 'en'] 120 | }); 121 | }); 122 | 123 | // Pass the iframe Test 124 | await page.evaluateOnNewDocument(() => { 125 | Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { 126 | get: function() { 127 | return window; 128 | } 129 | }); 130 | }); 131 | 132 | // Pass toString test, though it breaks console.debug() from working 133 | await page.evaluateOnNewDocument(() => { 134 | window.console.debug = () => { 135 | return null; 136 | }; 137 | }); 138 | }; 139 | 140 | (async () => { 141 | // Launch the browser in headless mode and set up a page. 142 | const browser = await puppeteer.launch({ 143 | args: ['--no-sandbox'], 144 | headless: true, 145 | }); 146 | const page = await browser.newPage(); 147 | 148 | // Prepare for the tests (not yet implemented). 149 | await preparePageForTests(page); 150 | 151 | // Navigate to the page that will perform the tests. 152 | const testUrl = 'https://intoli.com/blog/' + 153 | 'not-possible-to-block-chrome-headless/chrome-headless-test.html'; 154 | await page.goto(testUrl); 155 | 156 | // Save a screenshot of the results. 157 | await page.screenshot({path: 'headless-test-result.png'}); 158 | 159 | // Clean up. 160 | await browser.close() 161 | })(); -------------------------------------------------------------------------------- /examples/for_the_lulz.js: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Do not run this, this is probably illegal in your country ;) 4 | */ 5 | 6 | const se_scraper = require('./../index.js'); 7 | 8 | 9 | // generate some google dorks 10 | 11 | function genGoogleDorks(iter=4) { 12 | let lulz_keywords = []; 13 | ['seite', 'inicio', 'index'].forEach((x) => { 14 | for (var i = 0; i < iter; i++) { 15 | lulz_keywords.push( 16 | 'inurl:"' + x + '.php?id=' + Math.floor(Math.random() * 100) + '"' 17 | ) 18 | } 19 | }); 20 | return lulz_keywords; 21 | } 22 | 23 | const lulz_keywords = genGoogleDorks(); 24 | console.log(lulz_keywords); 25 | 26 | 27 | // those options need to be provided on startup 28 | // and cannot give to se-scraper on scrape() calls 29 | let browser_config = { 30 | // if random_user_agent is set to True, a random user agent is chosen 31 | random_user_agent: true, 32 | headless: true, 33 | is_local: false, 34 | throw_on_detection: false, 35 | puppeteer_cluster_config: { 36 | headless: true, 37 | timeout: 2 * 60 * 1000, // max timeout set to 2 minutes 38 | monitor: false, 39 | concurrency: 3, // one scraper per tab 40 | maxConcurrency: 4, // scrape with 4 tabs 41 | } 42 | }; 43 | 44 | (async () => { 45 | // scrape config can change on each scrape() call 46 | let lulz_config = { 47 | // which search engine to scrape 48 | search_engine: 'google', 49 | // an array of keywords to scrape 50 | keywords: lulz_keywords, 51 | // the number of pages to scrape for each keyword 52 | num_pages: 3, 53 | // how long to sleep between requests. a random sleep interval within the range [a,b] 54 | // is drawn before every request. empty string for no sleeping. 
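// e.g. sleep_range: '[1,1]', as used in examples/multiple_search_engines.js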
55 | sleep_range: '', 56 | // path to output file, data will be stored in JSON 57 | output_file: 'goodboys.json', 58 | // whether to prevent images, css, fonts from being loaded 59 | // will speed up scraping a great deal 60 | block_assets: true, 61 | // check if headless chrome escapes common detection techniques 62 | // this is a quick test and should be used for debugging 63 | test_evasion: false, 64 | apply_evasion_techniques: true, 65 | // log ip address data 66 | log_ip_address: false, 67 | // log http headers 68 | log_http_headers: false, 69 | }; 70 | 71 | let results = await se_scraper.scrape(browser_config, lulz_config); 72 | 73 | const all_links = []; 74 | 75 | for (var kw in results) { 76 | for (var page in results[kw]) { 77 | for (var res of results[kw][page]['results']) { 78 | all_links.push(res.link); 79 | } 80 | } 81 | } 82 | 83 | console.log(all_links); 84 | const got = require('got'); // http client used for the requests below (a dependency in package.json) 85 | for (var link of all_links) { 86 | try { 87 | const response = await got(link.replace(/(id=\d+)/g, "$1'")); 88 | let html = response.body; 89 | if (html.includes('error') || html.includes('mysql')) { 90 | console.log('Got a mysql injection in ' + link); 91 | } 92 | } catch (error) { 93 | console.log(error.response.statusCode); 94 | } 95 | } 96 | 97 | })(); -------------------------------------------------------------------------------- /examples/gimage.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: '', 6 | }; 7 | 8 | let scrape_job = { 9 | search_engine: 'google_image', 10 | keywords: ['manaslu', 'everest', 'pitcairn'], 11 | num_pages: 1, 12 | }; 13 | 14 | var scraper = new se_scraper.ScrapeManager(browser_config); 15 | 16 | await scraper.start(); 17 | 18 | var results = await scraper.scrape(scrape_job); 19 | 20 | console.dir(results, {depth: null, colors: true}); 21 | 22 | await scraper.quit(); 23 | })(); 24 | -------------------------------------------------------------------------------- /examples/gnold.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/gnold.json', 6 | google_news_old_settings: { 7 | gl: 'us', // The gl parameter determines the Google country to use for the query. 8 | hl: 'fr', // The hl parameter determines the Google UI language to return results. 9 | start: 0, // Determines the results offset to use, defaults to 0. 10 | num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
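// These settings are picked up through the generic `<search_engine>_settings`
// mechanism (see build_start_url() in src/modules/se_scraper.js) and appended
// to the start URL as query parameters.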
11 | }, 12 | }; 13 | 14 | let scrape_job = { 15 | search_engine: 'google_news_old', 16 | keywords: ['news world'], 17 | num_pages: 1, 18 | }; 19 | 20 | var scraper = new se_scraper.ScrapeManager(browser_config); 21 | await scraper.start(); 22 | 23 | var results = await scraper.scrape(scrape_job); 24 | console.dir(results, {depth: null, colors: true}); 25 | await scraper.quit(); 26 | })(); 27 | -------------------------------------------------------------------------------- /examples/google_maps.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/maps.json', 6 | test_evasion: false, 7 | block_assets: false, 8 | headless: false, 9 | 10 | google_maps_settings: { 11 | scrape_in_detail: false, 12 | } 13 | }; 14 | 15 | let scrape_job = { 16 | search_engine: 'google_maps', 17 | keywords: ['Berlin Zahnarzt'], 18 | num_pages: 1, 19 | }; 20 | 21 | var scraper = new se_scraper.ScrapeManager(browser_config); 22 | 23 | await scraper.start(); 24 | 25 | var results = await scraper.scrape(scrape_job); 26 | 27 | console.dir(results, {depth: null, colors: true}); 28 | 29 | await scraper.quit(); 30 | })(); 31 | -------------------------------------------------------------------------------- /examples/headless-test-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NikolaiT/se-scraper/5a0eea201dbeac7c9db4163eaa485bf4cd64f47d/examples/headless-test-result.png -------------------------------------------------------------------------------- /examples/keywords.txt: -------------------------------------------------------------------------------- 1 | test 2 | water is blue -------------------------------------------------------------------------------- /examples/minimal.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | 3 | (async () => { 4 | 5 | let kws = [ 6 | 'https://www.linkedin.com/in/aakanksha-majhi-b24a8449', 7 | 'https://www.linkedin.com/in/aakash-srivastava-7374a830', 8 | 'https://www.linkedin.com/in/aakash-tiwari-019b8569', 9 | ]; 10 | 11 | let scrape_job = { 12 | search_engine: 'google', 13 | keywords: kws, 14 | num_pages: 1, 15 | }; 16 | 17 | var results = await se_scraper.scrape({}, scrape_job); 18 | 19 | console.dir(results, {depth: null, colors: true}); 20 | 21 | })(); 22 | -------------------------------------------------------------------------------- /examples/multiple_browsers.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | search_engine: 'google', 6 | random_user_agent: true, 7 | is_local: false, 8 | html_output: false, 9 | throw_on_detection: false, 10 | headless: true, 11 | puppeteer_cluster_config: { 12 | headless: true, 13 | timeout: 30 * 60 * 1000, // max timeout set to 30 minutes 14 | monitor: false, 15 | concurrency: 3, // 3 == CONCURRENCY_BROWSER 16 | maxConcurrency: 3, // 3 browsers will scrape 17 | }, 18 | }; 19 | 20 | let scrape_job = { 21 | search_engine: 'google', 22 | keywords: ['news', 'mountain', 'what', 'are good', 'keyword', 'who', 'nice'], 23 | num_pages: 1, 24 | }; 25 | 26 | var scraper = new se_scraper.ScrapeManager(browser_config); 27 | 28 | await scraper.start(); 29 | 30 | var results = 
await scraper.scrape(scrape_job); 31 | 32 | console.dir(results, {depth: null, colors: true}); 33 | 34 | await scraper.quit(); 35 | })(); 36 | -------------------------------------------------------------------------------- /examples/multiple_search_engines.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | random_user_agent: true, 6 | write_meta_data: true, 7 | sleep_range: '[1,1]', 8 | headless: true, 9 | output_file: `examples/results/multiple_search_engines.json` 10 | }; 11 | 12 | let scrape_job = { 13 | search_engine: 'google', 14 | keywords: ['news', 'se-scraper'], 15 | num_pages: 1, 16 | }; 17 | 18 | var scraper = new se_scraper.ScrapeManager(browser_config); 19 | await scraper.start(); 20 | 21 | for (var se of ['google', 'bing']) { 22 | scrape_job.search_engine = se; 23 | var results = await scraper.scrape(scrape_job); 24 | console.dir(results, {depth: null, colors: true}); 25 | } 26 | 27 | await scraper.quit(); 28 | })(); 29 | 30 | -------------------------------------------------------------------------------- /examples/multiple_tabs.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../index.js'); 2 | 3 | const Cluster = { 4 | CONCURRENCY_PAGE: 1, // shares cookies, etc. 5 | CONCURRENCY_CONTEXT: 2, // no cookie sharing (uses contexts) 6 | CONCURRENCY_BROWSER: 3, // no cookie sharing and individual processes (uses contexts) 7 | }; 8 | 9 | let keywords = ['New York', 10 | 'Los Angeles', 11 | 'Chicago', 12 | 'Houston', 13 | 'Philadelphia', 14 | 'Phoenix', 15 | 'San Antonio', 16 | 'San Diego', 17 | 'Dallas', 18 | 'San Jose', 19 | 'Austin', 20 | 'Indianapolis', 21 | 'Jacksonville', 22 | 'San Francisco', 23 | 'Columbus', 24 | 'Charlotte', 25 | 'Fort Worth', 26 | 'Detroit', 27 | 'El Paso', 28 | 'Memphis', 29 | 'Seattle', 30 | 'Denver', 31 | 'Washington', 32 | 'Boston', 33 | 'Nashville-Davidson', 34 | 'Baltimore', 35 | 'Oklahoma City', 36 | 'Louisville/Jefferson County', 37 | 'Portland', 38 | 'Las Vegas', 39 | 'Milwaukee', 40 | 'Albuquerque', 41 | 'Tucson', 42 | 'Fresno', 43 | 'Sacramento', 44 | 'Long Beach', 45 | 'Kansas City', 46 | 'Mesa', 47 | 'Virginia Beach', 48 | 'Atlanta', 49 | 'Colorado Springs', 50 | 'Omaha', 51 | 'Raleigh', 52 | 'Miami', 53 | 'Oakland', 54 | 'Minneapolis', 55 | 'Tulsa', 56 | 'Cleveland', 57 | 'Wichita', 58 | 'Arlington', 59 | 'New Orleans', 60 | 'Bakersfield', 61 | 'Tampa', 62 | 'Honolulu', 63 | 'Aurora', 64 | 'Anaheim', 65 | 'Santa Ana', 66 | 'St. Louis', 67 | 'Riverside', 68 | 'Corpus Christi', 69 | 'Lexington-Fayette', 70 | 'Pittsburgh', 71 | 'Anchorage', 72 | 'Stockton', 73 | 'Cincinnati', 74 | 'St. Paul', 75 | 'Toledo', 76 | 'Greensboro', 77 | 'Newark', 78 | 'Plano', 79 | 'Henderson', 80 | 'Lincoln', 81 | 'Buffalo', 82 | 'Jersey City', 83 | 'Chula Vista', 84 | 'Fort Wayne', 85 | 'Orlando', 86 | 'St. 
Petersburg', 87 | 'Chandler', 88 | 'Laredo', 89 | 'Norfolk', 90 | 'Durham', 91 | 'Madison', 92 | 'Lubbock', 93 | 'Irvine', 94 | 'Winston-Salem', 95 | 'Glendale', 96 | 'Garland', 97 | 'Hialeah', 98 | 'Reno', 99 | 'Chesapeake', 100 | 'Gilbert', 101 | 'Baton Rouge', 102 | 'Irving', 103 | 'Scottsdale', 104 | 'North Las Vegas', 105 | 'Fremont', 106 | 'Boise City', 107 | 'Richmond', 108 | 'San Bernardino']; 109 | 110 | let config = { 111 | search_engine: 'bing', 112 | debug: false, 113 | verbose: true, 114 | keywords: keywords, 115 | num_pages: 1, // how many pages per keyword 116 | output_file: 'examples/results/bing.json', 117 | log_ip_address: false, 118 | headless: true, 119 | puppeteer_cluster_config: { 120 | timeout: 10 * 60 * 1000, // max timeout set to 10 minutes 121 | monitor: false, 122 | concurrency: Cluster.CONCURRENCY_PAGE, // one scraper per tab 123 | maxConcurrency: 7, // scrape with 7 tabs 124 | } 125 | }; 126 | 127 | function callback(err, response) { 128 | if (err) { 129 | console.error(err) 130 | } 131 | console.dir(response, {depth: null, colors: true}); 132 | } 133 | 134 | se_scraper.scrape(config, callback); -------------------------------------------------------------------------------- /examples/per_page_proxy.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const ProxyChain = require('proxy-chain'); 3 | 4 | const ROUTER_PROXY = 'http://127.0.0.1:8000'; 5 | 6 | // SEE: https://github.com/GoogleChrome/puppeteer/issues/678 7 | // Idea: set up a local router proxy that assigns a distinct upstream proxy to each request, 8 | // identified by its unique user-agent string. This way one proxy can be used per Chromium tab. 9 | // downside: not fast and efficient 10 | 11 | const uas = [ 12 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36', 13 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', 14 | ]; 15 | 16 | const proxies = ['http://142.93.57.147:3128', 'http://85.132.31.115:8181']; 17 | 18 | (async () => { 19 | const browser = await puppeteer.launch({ 20 | headless: false, 21 | args: [`--proxy-server=${ROUTER_PROXY}`], 22 | }); 23 | const page1 = await browser.newPage(); 24 | const page2 = await browser.newPage(); 25 | 26 | try { 27 | await page1.setUserAgent(uas[0]); 28 | await page1.goto('https://www.whatsmyip.org/'); 29 | } catch (e) { 30 | console.log(e); 31 | } 32 | 33 | try { 34 | await page2.setUserAgent(uas[1]); 35 | await page2.goto('https://www.whatsmyip.org/'); 36 | } catch (e) { 37 | console.log(e); 38 | } 39 | 40 | //await browser.close(); 41 | })(); 42 | 43 | const server = new ProxyChain.Server({ 44 | // Port where the server will listen. By default 8000.
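// The per-tab routing happens in prepareRequestFunction below: it reads the
// User-Agent header of each incoming request and selects the matching upstream
// proxy from the `proxies` array.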
45 | port: 8000, 46 | 47 | // Enables verbose logging 48 | verbose: true, 49 | 50 | prepareRequestFunction: ({ 51 | request, 52 | username, 53 | password, 54 | hostname, 55 | port, 56 | isHttp, 57 | }) => { 58 | var upstreamProxyUrl; 59 | 60 | if (request.headers['user-agent'] === uas[0]) { 61 | upstreamProxyUrl = proxies[0]; 62 | } 63 | 64 | if (request.headers['user-agent'] === uas[1]) { 65 | upstreamProxyUrl = proxies[1]; 66 | } 67 | 68 | console.log('Using proxy: ' + upstreamProxyUrl); 69 | 70 | return { upstreamProxyUrl }; 71 | }, 72 | }); 73 | 74 | server.listen(() => { 75 | console.log(`Router Proxy server is listening on port ${8000}`); 76 | }); -------------------------------------------------------------------------------- /examples/pluggable.js: -------------------------------------------------------------------------------- 1 | module.exports = class Pluggable { 2 | constructor(options = {}) { 3 | const { 4 | chromeFlags = [ 5 | '--no-sandbox', 6 | '--disable-setuid-sandbox', 7 | '--disable-dev-shm-usage', 8 | '--disable-accelerated-2d-canvas', 9 | '--disable-gpu', 10 | '--window-size=1920x1080', 11 | '--hide-scrollbars', 12 | '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', 13 | ], 14 | headless = true, 15 | } = options; 16 | 17 | this.chromeFlags = chromeFlags; 18 | this.headless = headless; 19 | } 20 | 21 | async close_browser() { 22 | await this.browser.close(); 23 | } 24 | 25 | // Callback invoked after metadata has been gathered 26 | async handle_metadata(args) { 27 | // store scraping metadata somewhere 28 | } 29 | 30 | // Callback invoked after all keywords have been scraped 31 | async handle_results(args) { 32 | // store the results somewhere 33 | } 34 | 35 | // Callback invoked before a keyword is scraped. 36 | async before_keyword_scraped(args) { 37 | console.log('before keyword scraped.'); 38 | } 39 | 40 | // Callback invoked after a keyword has been scraped. 
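// se_scraper.js currently passes { results, num_keywords, num_requests, keyword }
// to before_keyword_scraped(); presumably the same argument shape is intended
// here once the TODO below is implemented.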
41 | // TODO: implement this 42 | async after_keyword_scraped(args) { 43 | console.log('after keyword scraped.') 44 | } 45 | 46 | async start_browser(args={}) { 47 | const puppeteer = require('puppeteer'); 48 | 49 | let launch_args = { 50 | args: args.chromeFlags || this.chromeFlags, 51 | headless: args.headless, 52 | }; 53 | 54 | if (launch_args.headless === undefined) { 55 | launch_args.headless = this.headless; 56 | } 57 | 58 | this.browser = await puppeteer.launch(launch_args); 59 | console.log('Loaded custom function get_browser()'); 60 | console.log(launch_args); 61 | 62 | return this.browser; 63 | } 64 | 65 | async do_work(page) { 66 | // do some scraping work and return results and num_requests 67 | 68 | } 69 | }; -------------------------------------------------------------------------------- /examples/pluggable_example.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | const resolve = require('path').resolve; 3 | 4 | (async () => { 5 | let browser_config = { 6 | test_evasion: false, 7 | log_http_headers: true, 8 | log_ip_address: true, 9 | random_user_agent: false, 10 | apply_evasion_techniques: false, 11 | screen_output: false, 12 | custom_func: resolve('./examples/pluggable.js'), 13 | headless: false, 14 | }; 15 | 16 | let scrape_job = { 17 | search_engine: 'google', 18 | keywords: ['news usa'], 19 | num_pages: 1, 20 | }; 21 | 22 | var scraper = new se_scraper.ScrapeManager(browser_config); 23 | 24 | await scraper.start(); 25 | 26 | var results = await scraper.scrape(scrape_job); 27 | 28 | console.dir(results, {depth: null, colors: true}); 29 | 30 | await scraper.quit(); 31 | })(); 32 | -------------------------------------------------------------------------------- /examples/proxies.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/proxyresults.json', 6 | log_ip_address: true, 7 | // a file with one proxy per line. 
Example: 8 | // socks5://78.94.172.42:1080 9 | // http://118.174.233.10:48400 10 | proxy_file: '/home/nikolai/.proxies', // one proxy per line 11 | // whether to use proxies only 12 | // when this is set to true, se-scraper will not use 13 | // your default IP address in a browser 14 | use_proxies_only: true, 15 | }; 16 | 17 | let scrape_job = { 18 | search_engine: 'google', 19 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 20 | num_pages: 1, 21 | }; 22 | 23 | var scraper = new se_scraper.ScrapeManager(browser_config); 24 | await scraper.start(); 25 | 26 | var results = await scraper.scrape(scrape_job); 27 | console.dir(results, {depth: null, colors: true}); 28 | await scraper.quit(); 29 | })(); 30 | -------------------------------------------------------------------------------- /examples/quickstart.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | test_evasion: false, 6 | log_http_headers: false, 7 | log_ip_address: false, 8 | random_user_agent: false, 9 | apply_evasion_techniques: true, 10 | screen_output: false, 11 | html_output: false, 12 | clean_html_output: true, 13 | }; 14 | 15 | let scrape_job = { 16 | search_engine: 'google', 17 | keywords: ['buy a nice car'], 18 | num_pages: 1, 19 | google_settings: { 20 | "gl": "us", 21 | "hl": "en", 22 | "start": 0, 23 | "num": 10 24 | } 25 | }; 26 | 27 | var scraper = new se_scraper.ScrapeManager(browser_config); 28 | 29 | await scraper.start(); 30 | 31 | var results = await scraper.scrape(scrape_job); 32 | 33 | console.dir(results, {depth: null, colors: true}); 34 | 35 | await scraper.quit(); 36 | })(); 37 | -------------------------------------------------------------------------------- /examples/reusing.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./../src/node_scraper.js'); 2 | 3 | (async () => { 4 | let browser_config = { 5 | output_file: 'examples/results/data.json', 6 | }; 7 | 8 | let scrape_job = { 9 | search_engine: 'google', 10 | keywords: ['news', 'se-scraper'], 11 | num_pages: 1, 12 | }; 13 | 14 | let scrape_job2 = { 15 | search_engine: 'bing', 16 | keywords: ['test', 'what a wonderful world'], 17 | num_pages: 1, 18 | }; 19 | 20 | var scraper = new se_scraper.ScrapeManager(browser_config); 21 | await scraper.start(); 22 | 23 | var results = await scraper.scrape(scrape_job); 24 | console.dir(results, {depth: null, colors: true}); 25 | 26 | var results2 = await scraper.scrape(scrape_job2); 27 | console.dir(results2, {depth: null, colors: true}); 28 | 29 | await scraper.quit(); 30 | })(); 31 | -------------------------------------------------------------------------------- /examples/test_cluster.js: -------------------------------------------------------------------------------- 1 | const { Cluster } = require('../../puppeteer-cluster/dist/index.js'); 2 | var fs = require('fs'); 3 | var os = require("os"); 4 | 5 | const PROXY_FILE = '/home/nikolai/.proxies'; 6 | 7 | function read_items_from_file(fname) { 8 | let kws = fs.readFileSync(fname).toString().split(os.EOL); 9 | // clean keywords 10 | kws = kws.filter((kw) => { 11 | return kw.trim().length > 0; 12 | }); 13 | return kws; 14 | } 15 | 16 | (async () => { 17 | 18 | let browserArgs = [ 19 | '--disable-infobars', 20 | '--window-position=0,0', 21 | '--ignore-certifcate-errors', 22 | '--ignore-certifcate-errors-spki-list', 
23 | '--no-sandbox', 24 | '--disable-setuid-sandbox', 25 | '--disable-dev-shm-usage', 26 | '--disable-accelerated-2d-canvas', 27 | '--disable-gpu', 28 | '--window-size=1920x1080', 29 | '--hide-scrollbars', 30 | ]; 31 | 32 | let proxies = read_items_from_file(PROXY_FILE); 33 | 34 | console.dir(proxies); 35 | 36 | // each new call to workerInstance() will 37 | // left pop() one element from this list 38 | // maxConcurrency should be equal to perBrowserOptions.length 39 | 40 | // the first browser config with home IP 41 | let perBrowserOptions = [{ 42 | headless: false, 43 | ignoreHTTPSErrors: true, 44 | args: browserArgs 45 | }]; 46 | 47 | for (var proxy of proxies) { 48 | perBrowserOptions.push({ 49 | headless: false, 50 | ignoreHTTPSErrors: true, 51 | args: browserArgs.concat(`--proxy-server=${proxy}`) 52 | }) 53 | } 54 | 55 | const cluster = await Cluster.launch({ 56 | monitor: true, 57 | timeout: 12 * 60 * 60 * 1000, // 12 hours in ms 58 | concurrency: Cluster.CONCURRENCY_BROWSER, 59 | maxConcurrency: perBrowserOptions.length, 60 | puppeteerOptions: { 61 | headless: false, 62 | args: browserArgs, 63 | ignoreHTTPSErrors: true, 64 | }, 65 | perBrowserOptions: perBrowserOptions 66 | }); 67 | 68 | // Event handler to be called in case of problems 69 | cluster.on('taskerror', (err, data) => { 70 | console.log(`Error crawling ${data}: ${err.message}`); 71 | }); 72 | 73 | 74 | await cluster.task(async ({ page, data: url }) => { 75 | await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 20000}); 76 | const pageTitle = await page.evaluate(() => document.title); 77 | console.log(`Page title of ${url} is ${pageTitle}`); 78 | console.log(await page.content()); 79 | }); 80 | 81 | for(var i = 0; i < perBrowserOptions.length; i++) { 82 | await cluster.queue('http://ipinfo.io/json'); 83 | } 84 | 85 | await cluster.idle(); 86 | await cluster.close(); 87 | })(); 88 | -------------------------------------------------------------------------------- /examples/test_promise.js: -------------------------------------------------------------------------------- 1 | class Test { 2 | constructor(options = {}) { 3 | const { 4 | config = {}, 5 | } = options; 6 | 7 | this.config = config; 8 | } 9 | 10 | run(vars) { 11 | 12 | console.log(this.config) 13 | } 14 | } 15 | 16 | let o1 = new Test({config: {a: Math.random()}}); 17 | let o2 = new Test({config: {a: Math.random()}}); 18 | 19 | o1.run() 20 | o2.run() 21 | 22 | // (async () => { 23 | // 24 | // let prom = []; 25 | // 26 | // for (var i = 0; i < 3; i++) { 27 | // var obj = new Test({ 28 | // config: {a: Math.random()}, 29 | // }); 30 | // prom.push(new Promise(resolve => { 31 | // setTimeout(() => { new Test({ 32 | // config: {a: Math.random()}, 33 | // }).run(); resolve() }, 1000); 34 | // })); 35 | // } 36 | // 37 | // let res = await Promise.all(prom); 38 | // console.log(res); 39 | // 40 | // })(); -------------------------------------------------------------------------------- /examples/test_proxyflag.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | 3 | (async () => { 4 | const browser = await puppeteer.launch({ 5 | args: [ 6 | // SET PROXY HERE 7 | '--proxy-server=socks5://IP:PORT', 8 | '--disable-infobars', 9 | '--window-position=0,0', 10 | '--ignore-certifcate-errors', 11 | '--ignore-certifcate-errors-spki-list', 12 | '--disable-setuid-sandbox', 13 | '--disable-dev-shm-usage', 14 | '--disable-accelerated-2d-canvas', 15 | '--disable-gpu', 16 | 
'--window-size=1920x1080', 17 | '--hide-scrollbars', 18 | '--disable-notifications', 19 | '--no-sandbox', 20 | '--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36' 21 | ], 22 | headless: true 23 | }); 24 | var page = await browser.newPage(); 25 | await page.setViewport({width: 1920, height: 926}); 26 | await page.goto('http://ipinfo.io/json'); 27 | console.log(await page.content()); 28 | await browser.close(); 29 | })(); -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./src/node_scraper.js'); 2 | var Scraper = require('./src/modules/se_scraper'); 3 | 4 | async function scrape(browser_config, scrape_config) { 5 | // scrape config overwrites the browser_config 6 | Object.assign(browser_config, scrape_config); 7 | 8 | var scraper = new se_scraper.ScrapeManager(browser_config); 9 | 10 | await scraper.start(); 11 | 12 | var results = await scraper.scrape(scrape_config); 13 | 14 | await scraper.quit(); 15 | 16 | return results; 17 | } 18 | 19 | module.exports = { 20 | scrape: scrape, 21 | ScrapeManager: se_scraper.ScrapeManager, 22 | Scraper: Scraper, 23 | }; 24 | -------------------------------------------------------------------------------- /jformat.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import sys 3 | import json 4 | 5 | if len(sys.argv) == 2: 6 | print(pprint.pformat(json.load(open(sys.argv[1])))) -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "se-scraper", 3 | "version": "1.5.7", 4 | "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo", 5 | "homepage": "https://scrapeulous.com/", 6 | "main": "index.js", 7 | "scripts": { 8 | "test": "mocha test test/modules" 9 | }, 10 | "keywords": [ 11 | "scraping", 12 | "search-engines", 13 | "google", 14 | "bing", 15 | "web-scraping" 16 | ], 17 | "author": "Nikolai Tschacher (https://incolumitas.com/)", 18 | "repository": { 19 | "type": "git", 20 | "url": "https://github.com/NikolaiT/se-scraper" 21 | }, 22 | "license": "ISC", 23 | "dependencies": { 24 | "cheerio": "^1.0.0-rc.3", 25 | "debug": "^4.1.1", 26 | "got": "^9.6.0", 27 | "lodash": "^4.17.14", 28 | "puppeteer": "^2.0.0", 29 | "puppeteer-cluster": "^0.18.0", 30 | "puppeteer-extra": "^2.1.3", 31 | "puppeteer-extra-plugin-stealth": "^2.2.2", 32 | "user-agents": "^1.0.378", 33 | "winston": "^3.2.1" 34 | }, 35 | "devDependencies": { 36 | "bluebird": "^3.7.2", 37 | "chai": "^4.2.0", 38 | "chai-string": "^1.5.0", 39 | "express": "^4.17.1", 40 | "http-mitm-proxy": "^0.8.2", 41 | "key-cert": "^1.0.1", 42 | "mocha": "^6.1.4", 43 | "ua-parser-js": "^0.7.21" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /run.js: -------------------------------------------------------------------------------- 1 | const se_scraper = require('./index.js'); 2 | 3 | // those options need to be provided on startup 4 | // and cannot give to se-scraper on scrape() calls 5 | let browser_config = { 6 | // the user agent to scrape with 7 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 
Safari/537.36', 8 | // if random_user_agent is set to True, a random user agent is chosen 9 | random_user_agent: false, 10 | // whether to start the browser in headless mode 11 | headless: false, 12 | // whether debug information should be printed 13 | // level 0: print nothing 14 | // level 1: print most important info 15 | // ... 16 | // level 4: print all shit nobody wants to know 17 | debug_level: 1, 18 | // specify flags passed to chrome here 19 | chrome_flags: [], 20 | // path to js module that extends functionality 21 | // this module should export the functions: 22 | // get_browser, handle_metadata, close_browser 23 | // must be an absolute path to the module 24 | //custom_func: resolve('examples/pluggable.js'), 25 | custom_func: '', 26 | // use a proxy for all connections 27 | // example: 'socks5://78.94.172.42:1080' 28 | // example: 'http://118.174.233.10:48400' 29 | proxy: '', 30 | // a file with one proxy per line. Example: 31 | // socks5://78.94.172.42:1080 32 | // http://118.174.233.10:48400 33 | proxy_file: '', 34 | puppeteer_cluster_config: { 35 | timeout: 10 * 60 * 1000, // max timeout set to 10 minutes 36 | monitor: false, 37 | concurrency: 1, // one scraper per tab 38 | maxConcurrency: 1, // scrape with 1 tab 39 | } 40 | }; 41 | 42 | (async () => { 43 | // scrape config can change on each scrape() call 44 | let scrape_config = { 45 | // which search engine to scrape 46 | search_engine: 'duckduckgo', 47 | // an array of keywords to scrape 48 | keywords: ['cloud service'], 49 | // the number of pages to scrape for each keyword 50 | num_pages: 1, 51 | 52 | // OPTIONAL PARAMS BELOW: 53 | // google_settings: { 54 | // gl: 'us', // The gl parameter determines the Google country to use for the query. 55 | // hl: 'fr', // The hl parameter determines the Google UI language to return results. 56 | // start: 0, // Determines the results offset to use, defaults to 0. 57 | // num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. 58 | // }, 59 | // instead of keywords you can specify a keyword_file. this overwrites the keywords array 60 | keyword_file: '', 61 | // how long to sleep between requests. a random sleep interval within the range [a,b] 62 | // is drawn before every request. empty string for no sleeping. 63 | sleep_range: '', 64 | // path to output file, data will be stored in JSON 65 | output_file: '', 66 | // whether to prevent images, css, fonts from being loaded 67 | // will speed up scraping a great deal 68 | block_assets: false, 69 | // check if headless chrome escapes common detection techniques 70 | // this is a quick test and should be used for debugging 71 | test_evasion: false, 72 | apply_evasion_techniques: true, 73 | // log ip address data 74 | log_ip_address: false, 75 | // log http headers 76 | log_http_headers: false, 77 | }; 78 | 79 | let results = await se_scraper.scrape(browser_config, scrape_config); 80 | console.dir(results, {depth: null, colors: true}); 81 | })(); 82 | 83 | -------------------------------------------------------------------------------- /se-scraper.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/captcha_solver.js: -------------------------------------------------------------------------------- 1 | /* 2 | There are essentially two strategies to handle a search engine showing you a captcha: 3 | 4 | 1. 
Solve the captcha 5 | https://github.com/ecthros/uncaptcha2 6 | or use a captcha solving service such as https://anti-captcha.com/mainpage 7 | 8 | 2. Switch your IP address with rotating proxies 9 | 10 | */ 11 | 12 | /** 13 | * @name download recaptcha2 audio captcha 14 | * 15 | * There are several issues: 16 | * 17 | * Google sees that we are using an automated browser. 18 | * 19 | * In the worst case we have to completely control the browser ourselves without puppeteer. 20 | * 21 | * https://github.com/ecthros/uncaptcha2 22 | * 23 | * See here: 24 | * 25 | * https://gist.github.com/tegansnyder/c3aeae4d57768c58247ae6c4e5acd3d1 26 | * 27 | * https://github.com/GoogleChrome/puppeteer/issues/3039 28 | * 29 | * https://intoli.com/blog/making-chrome-headless-undetectable/ 30 | * 31 | * @desc Go to the https://www.google.com/recaptcha/api2/demo demo page and download the captcha 32 | */ 33 | 34 | const puppeteer = require('puppeteer'); 35 | const fs = require('fs'); 36 | const got = require('got'); 37 | 38 | try { 39 | (async () => { 40 | const browser = await puppeteer.launch({ 41 | args: [ 42 | '--proxy-server=socks5://78.94.172.42:1080', 43 | '--no-sandbox', 44 | '--disable-setuid-sandbox', 45 | '--disable-dev-shm-usage', 46 | '--disable-accelerated-2d-canvas', 47 | '--disable-gpu', 48 | '--window-size=1920x1080', 49 | '--hide-scrollbars', 50 | '--user-agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0"', 51 | ], 52 | headless: false, 53 | }); 54 | const page = await browser.newPage() 55 | await page.goto('https://www.google.com/recaptcha/api2/demo') 56 | 57 | await page.waitFor(1000); 58 | 59 | const frames = page.frames(); 60 | 61 | console.info('Available frames', frames.map(frame => frame.name())); 62 | console.info('Available frame urls', frames.map(frame => frame.url())); 63 | 64 | const frame = frames.find(frame => frame.url().includes('/recaptcha/api2/anchor?')); 65 | const content_frame = frames.find(frame => frame.url().includes('/recaptcha/api2/bframe?')); 66 | 67 | await frame.waitForSelector('#recaptcha-anchor', { timeout: 10000 }); 68 | await page.waitFor(1000); 69 | const button = await frame.$('#recaptcha-anchor'); 70 | await button.click(); 71 | 72 | await content_frame.waitForSelector('#recaptcha-audio-button'); 73 | 74 | const audio_button = await content_frame.$('#recaptcha-audio-button'); 75 | await audio_button.click(); 76 | await page.waitFor(1000); 77 | 78 | await content_frame.waitForSelector('.rc-audiochallenge-tdownload-link'); 79 | 80 | let download_link = await content_frame.evaluate(() => { 81 | return document.querySelectorAll('.rc-audiochallenge-tdownload-link').getAttribute('href'); 82 | }); 83 | console.log('Got audio download link: ', download_link); 84 | got.stream(download_link).pipe(fs.createWriteStream('audio.mp3')); 85 | 86 | await browser.close(); 87 | })() 88 | } catch (err) { 89 | console.error(err) 90 | } 91 | 92 | /* 93 | translate this shit into js: https://github.com/ecthros/uncaptcha2/blob/master/queryAPI.py 94 | */ 95 | async function translate_audio_file() { 96 | } -------------------------------------------------------------------------------- /src/concurrency-implementation.js: -------------------------------------------------------------------------------- 1 | const { Browser } = require('puppeteer-cluster/dist/concurrency/builtInConcurrency'); 2 | const debug = require('debug')('se-scraper:CustomConcurrency'); 3 | const { timeoutExecute } = require('puppeteer-cluster/dist/util'); 4 | 5 | const 
BROWSER_TIMEOUT = 5000; 6 | 7 | class CustomConcurrency extends Browser { 8 | 9 | async init() {} 10 | async close() {} 11 | 12 | async workerInstance() { 13 | const options = this.options.perBrowserOptions.shift(); 14 | debug('Launch puppeteer instance with options=%o', options); 15 | let chrome = await this.puppeteer.launch(options); 16 | let page; 17 | let context; 18 | 19 | return { 20 | jobInstance: async () => { 21 | await timeoutExecute(BROWSER_TIMEOUT, (async () => { 22 | context = await chrome.createIncognitoBrowserContext(); 23 | page = await context.newPage(); 24 | })()); 25 | 26 | return { 27 | resources: { 28 | page, 29 | }, 30 | 31 | close: async () => { 32 | await timeoutExecute(BROWSER_TIMEOUT, context.close()); 33 | }, 34 | }; 35 | }, 36 | 37 | close: async () => { 38 | await chrome.close(); 39 | }, 40 | 41 | repair: async () => { 42 | debug('Starting repair'); 43 | try { 44 | // will probably fail, but just in case the repair was not necessary 45 | await chrome.close(); 46 | } catch (e) {} 47 | 48 | // just relaunch as there is only one page per browser 49 | chrome = await this.puppeteer.launch(options); 50 | }, 51 | }; 52 | } 53 | }; 54 | 55 | module.exports = CustomConcurrency; -------------------------------------------------------------------------------- /src/modules/bing.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | const Scraper = require('./se_scraper'); 3 | 4 | class BingScraper extends Scraper { 5 | 6 | async parse_async(html) { 7 | 8 | let results = await this.page.evaluate(() => { 9 | 10 | let _text = (el, s) => { 11 | let n = el.querySelector(s); 12 | 13 | if (n) { 14 | return n.innerText; 15 | } else { 16 | return ''; 17 | } 18 | }; 19 | 20 | let _attr = (el, s, attr) => { 21 | let n = el.querySelector(s); 22 | 23 | if (n) { 24 | return n.getAttribute(attr); 25 | } else { 26 | return null; 27 | } 28 | }; 29 | 30 | let results = { 31 | num_results: '', 32 | no_results: false, 33 | effective_query: '', 34 | results: [], 35 | ads: [], 36 | right_side_ads: [], 37 | }; 38 | 39 | let num_results_el = document.querySelector('#b_content .sb_count'); 40 | 41 | if (num_results_el) { 42 | results.num_results = num_results_el.innerText; 43 | } 44 | 45 | let organic_results = document.querySelectorAll('#b_content #b_results .b_algo'); 46 | 47 | organic_results.forEach((el) => { 48 | 49 | let serp_obj = { 50 | link: _attr(el, 'h2 a', 'href'), 51 | title: _text(el, 'h2'), 52 | snippet: _text(el, '.b_caption p'), 53 | visible_link: _text(el, 'cite'), 54 | }; 55 | 56 | results.results.push(serp_obj); 57 | }); 58 | 59 | // check if no results 60 | results.no_results = (results.results.length === 0); 61 | 62 | // parse bing ads 63 | let ads = document.querySelectorAll('#b_results .b_ad .sb_add'); 64 | 65 | ads.forEach((el) => { 66 | 67 | let ad_obj = { 68 | title: _text(el, 'h2 a'), 69 | snippet: _text(el, '.b_caption p'), 70 | visible_link: _text(el, '.b_adurl cite'), 71 | tracking_link: _attr(el, 'h2 a', 'href'), 72 | }; 73 | 74 | results.ads.push(ad_obj); 75 | }); 76 | 77 | // right side ads 78 | let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add'); 79 | 80 | right_side_ads.forEach((el) => { 81 | 82 | let ad_obj = { 83 | title: _text(el, 'h2 a'), 84 | snippet: _text(el, '.b_caption p'), 85 | visible_link: _text(el, '.b_adurl cite'), 86 | tracking_link: _attr(el, 'h2 a', 'href'), 87 | }; 88 | 89 | results.right_side_ads.push(ad_obj); 90 | }); 91 | 92 | 93 | let 
effective_query_el = document.querySelector('#sp_requery a'); 94 | 95 | if (effective_query_el) { 96 | results.effective_query = effective_query_el.innerText; 97 | } 98 | 99 | return results; 100 | }); 101 | 102 | results.results = this.clean_results(results.results, ['title', 'link']); 103 | results.ads = this.clean_results(results.ads, ['title', 'visible_link', 'tracking_link']); 104 | results.time = (new Date()).toUTCString(); 105 | return results; 106 | } 107 | 108 | async load_start_page() { 109 | let startUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/'; 110 | 111 | if (this.config.bing_settings) { 112 | startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`; 113 | if (this.config.bing_settings.bing_domain) { 114 | startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`; 115 | } else { 116 | startUrl = `https://www.bing.com/search?`; 117 | } 118 | 119 | for (var key in this.config.bing_settings) { 120 | if (key !== 'bing_domain') { 121 | startUrl += `${key}=${this.config.bing_settings[key]}&` 122 | } 123 | } 124 | } 125 | 126 | await this.page.goto(startUrl); 127 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 128 | 129 | return true; 130 | } 131 | 132 | async search_keyword(keyword) { 133 | const input = await this.page.$('input[name="q"]'); 134 | await this.set_input_value(`input[name="q"]`, keyword); 135 | await this.sleep(50); 136 | await input.focus(); 137 | await this.page.keyboard.press("Enter"); 138 | } 139 | 140 | async next_page() { 141 | let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); 142 | if (!next_page_link) { 143 | return false; 144 | } 145 | 146 | this.last_response = await Promise.all([ 147 | next_page_link.click(), // The promise resolves after navigation has finished 148 | this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation 149 | ]); 150 | 151 | return true; 152 | } 153 | 154 | async wait_for_results() { 155 | await this.page.waitForSelector('#b_content', { timeout: this.STANDARD_TIMEOUT }); 156 | } 157 | 158 | async detected() { 159 | // TODO: I was actually never detected by bing. those are good boys. 160 | } 161 | } 162 | 163 | 164 | class BingNewsScraper extends Scraper { 165 | 166 | parse(html) { 167 | // load the page source into cheerio 168 | const $ = cheerio.load(html); 169 | 170 | // perform queries 171 | const results = []; 172 | $('#algocore .newsitem').each((i, link) => { 173 | results.push({ 174 | link: $(link).attr('url'), 175 | title: $(link).find('a.title').text(), 176 | snippet: $(link).find('.snippet').text(), 177 | date: $(link).find('.source span').last().text(), 178 | }) 179 | }); 180 | 181 | const cleaned = this.clean_results(results, ['title', 'link']); 182 | 183 | return { 184 | time: (new Date()).toUTCString(), 185 | results: cleaned, 186 | } 187 | } 188 | 189 | async load_start_page() { 190 | let startUrl = 'https://www.bing.com/news/search?'; 191 | 192 | try { 193 | await this.page.goto(startUrl); 194 | if (this.config.set_manual_settings === true) { 195 | console.log('Sleeping 30 seconds. 
Set your settings now.'); 196 | await this.sleep(30000); 197 | } 198 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 199 | } catch (e) { 200 | return false; 201 | } 202 | 203 | return true; 204 | } 205 | 206 | async search_keyword(keyword) { 207 | const input = await this.page.$('input[name="q"]'); 208 | await this.set_input_value(`input[name="q"]`, keyword); 209 | await this.sleep(50); 210 | await input.focus(); 211 | await this.page.keyboard.press("Enter"); 212 | } 213 | 214 | async next_page() { 215 | let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); 216 | if (!next_page_link) { 217 | return false; 218 | } 219 | 220 | this.last_response = await Promise.all([ 221 | next_page_link.click(), // The promise resolves after navigation has finished 222 | this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation 223 | ]); 224 | 225 | return true; 226 | } 227 | 228 | async wait_for_results() { 229 | await this.page.waitForSelector('#news', { timeout: this.STANDARD_TIMEOUT }); 230 | } 231 | 232 | async detected() { 233 | // TODO: I was actually never detected by bing news. 234 | } 235 | } 236 | 237 | module.exports = { 238 | BingNewsScraper: BingNewsScraper, 239 | BingScraper: BingScraper, 240 | }; 241 | -------------------------------------------------------------------------------- /src/modules/duckduckgo.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | const Scraper = require('./se_scraper'); 3 | const debug = require('debug')('se-scraper:DuckduckgoScraper'); 4 | 5 | class DuckduckgoScraper extends Scraper { 6 | 7 | parse(html) { 8 | debug('parse'); 9 | // load the page source into cheerio 10 | const $ = cheerio.load(html); 11 | 12 | // perform queries 13 | const results = []; 14 | const organicSelector = ($('#links .result--sep').length > 0) ? 
`#links #rld-${this.page_num - 1} ~ .result .result__body` : '#links .result__body'; 15 | $(organicSelector).each((i, link) => { 16 | results.push({ 17 | link: $(link).find('.result__title .result__a').attr('href'), 18 | title: $(link).find('.result__title .result__a').text(), 19 | date: $(link).find('.result__timestamp').text(), 20 | snippet: $(link).find('.result__snippet').text(), 21 | visible_link: $(link).find('.result__url').attr('href'), 22 | }); 23 | }); 24 | 25 | const ads = []; 26 | $('.results--ads .result').each((i, element) => { 27 | ads.push({ 28 | visible_link: $(element).find('.result__url').text(), 29 | tracking_link: $(element).find('.result__title .result__a').attr('href'), 30 | title: $(element).find('.result__title .result__a').text(), 31 | snippet: $(element).find('.result__snippet').text(), 32 | }) 33 | }); 34 | 35 | let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || ''; 36 | 37 | const cleaned = this.clean_results(results, ['title', 'link']); 38 | 39 | return { 40 | time: (new Date()).toUTCString(), 41 | effective_query: effective_query, 42 | results: cleaned, 43 | ads: ads, 44 | } 45 | } 46 | 47 | async load_start_page() { 48 | debug('load_start_page'); 49 | let startUrl = 'https://duckduckgo.com/'; 50 | 51 | this.last_response = await this.page.goto(startUrl); 52 | await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); 53 | 54 | return true; 55 | } 56 | 57 | async search_keyword(keyword) { 58 | debug('search_keyword'); 59 | const input = await this.page.$('input[name="q"]'); 60 | await this.set_input_value(`input[name="q"]`, keyword); 61 | await this.sleep(50); 62 | await input.focus(); 63 | await this.page.keyboard.press("Enter"); 64 | } 65 | 66 | async next_page() { 67 | debug('next_page'); 68 | let next_page_link = await this.page.$('.result.result--more a', {timeout: this.STANDARD_TIMEOUT}); 69 | if (!next_page_link) { 70 | return false; 71 | } 72 | await next_page_link.click(); 73 | await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT }); 74 | 75 | return true; 76 | } 77 | 78 | async wait_for_results() { 79 | debug('wait_for_results'); 80 | await this.page.waitForSelector('.result__body', { timeout: this.STANDARD_TIMEOUT }); 81 | } 82 | 83 | async detected() { 84 | } 85 | } 86 | 87 | module.exports = { 88 | DuckduckgoScraper: DuckduckgoScraper, 89 | }; -------------------------------------------------------------------------------- /src/modules/infospace.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | const Scraper = require('./se_scraper'); 3 | 4 | class InfospaceScraper extends Scraper { 5 | 6 | parse(html) { 7 | // load the page source into cheerio 8 | const $ = cheerio.load(html); 9 | 10 | // perform queries 11 | const results = []; 12 | $('.result').each((i, link) => { 13 | results.push({ 14 | link: $(link).find('a.title').attr('href'), 15 | title: $(link).find('a.title').text(), 16 | snippet: $(link).find('.description').text(), 17 | visible_link: $(link).find('.url').text(), 18 | }) 19 | }); 20 | 21 | const cleaned = []; 22 | for (var i=0; i < results.length; i++) { 23 | let res = results[i]; 24 | if (res.link && res.link.trim()) { 25 | res.rank = this.result_rank++; 26 | cleaned.push(res); 27 | } 28 | } 29 | 30 | let no_results = this.no_results( 31 | ['No search results were found for'], 32 | $('.layout__mainline').text() 33 | ); 34 | 35 | return { 36 | time: (new Date()).toUTCString(), 37 | 
no_results: no_results, 38 | num_results: '', 39 | results: cleaned, 40 | } 41 | } 42 | 43 | async load_start_page() { 44 | 45 | let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html'; 46 | 47 | try { 48 | this.last_response = await this.page.goto(startUrl); 49 | await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); 50 | } catch (e) { 51 | return false; 52 | } 53 | return true; 54 | } 55 | 56 | async search_keyword(keyword) { 57 | const input = await this.page.$('input[id="q"]'); 58 | await this.set_input_value('input[id="q"]', keyword); 59 | await this.sleep(50); 60 | await input.focus(); 61 | await this.page.keyboard.press("Enter"); 62 | } 63 | 64 | async next_page() { 65 | let next_page_link = await this.page.$('a.next', {timeout: 1000}); 66 | if (!next_page_link) { 67 | return false; 68 | } 69 | await next_page_link.click(); 70 | this.last_response = await this.page.waitForNavigation(); 71 | 72 | return true; 73 | } 74 | 75 | async wait_for_results() { 76 | await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector. 77 | } 78 | 79 | async detected() { 80 | } 81 | } 82 | 83 | class WebcrawlerNewsScraper extends Scraper { 84 | 85 | parse(html) { 86 | // load the page source into cheerio 87 | const $ = cheerio.load(html); 88 | 89 | // perform queries 90 | const results = []; 91 | $('.article').each((i, link) => { 92 | let source = $(link).find('.source').text(); 93 | let date = source.split(',')[1] || ''; 94 | results.push({ 95 | link: $(link).find('a').attr('href'), 96 | title: $(link).find('.title').text(), 97 | publisher: $(link).find('.source').text(), 98 | date: date, 99 | snippet: $(link).find('.description').text(), 100 | }); 101 | }); 102 | 103 | const cleaned = this.clean_results(results, ['title', 'link']); 104 | 105 | return { 106 | time: (new Date()).toUTCString(), 107 | results: cleaned 108 | } 109 | } 110 | 111 | async load_start_page() { 112 | try { 113 | this.last_response = await this.page.goto('https://www.webcrawler.com/?qc=news'); 114 | await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); 115 | } catch (e) { 116 | return false; 117 | } 118 | return true; 119 | } 120 | 121 | async search_keyword(keyword) { 122 | const input = await this.page.$('input[name="q"]'); 123 | await this.set_input_value('input[name="q"]', keyword); 124 | await this.sleep(50); 125 | await input.focus(); 126 | await this.page.keyboard.press("Enter"); 127 | } 128 | 129 | async next_page() { 130 | let next_page_link = await this.page.$('.pagination__num--next', {timeout: 1000}); 131 | if (!next_page_link) { 132 | return false; 133 | } 134 | await next_page_link.click(); 135 | await this.page.waitForNavigation(); 136 | 137 | return true; 138 | } 139 | 140 | async wait_for_results() { 141 | await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); 142 | } 143 | 144 | async detected() { 145 | } 146 | } 147 | 148 | module.exports = { 149 | InfospaceScraper: InfospaceScraper, 150 | WebcrawlerNewsScraper: WebcrawlerNewsScraper, 151 | }; -------------------------------------------------------------------------------- /src/modules/metadata.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | 3 | module.exports = { 4 | get_ip_data: get_ip_data, 5 | get_http_headers: get_http_headers, 6 | }; 7 | 8 | async function get_ip_data(page) { 9 | await 
page.goto('https://ipinfo.io/json', { 10 | waitLoad: true, 11 | waitNetworkIdle: true 12 | }); 13 | let json = await page.content({ 14 | timeout: 20000 15 | }); 16 | const $ = cheerio.load(json); 17 | let ipinfo_text = $('pre').text(); 18 | return JSON.parse(ipinfo_text); 19 | } 20 | 21 | async function get_http_headers(page) { 22 | await page.goto('https://httpbin.org/get', { 23 | waitLoad: true, 24 | waitNetworkIdle: true 25 | }); 26 | let headers = await page.content(); 27 | 28 | const $ = cheerio.load(headers); 29 | let headers_text = $('pre').text(); 30 | return JSON.parse(headers_text); 31 | } -------------------------------------------------------------------------------- /src/modules/se_scraper.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const meta = require('./metadata.js'); 3 | const debug = require('debug')('se-scraper:Scraper'); 4 | /* 5 | Get useful JS knowledge and get awesome... 6 | 7 | Read this shit: https://javascript.info/class-inheritance 8 | And this: https://medium.freecodecamp.org/here-are-examples-of-everything-new-in-ecmascript-2016-2017-and-2018-d52fa3b5a70e 9 | */ 10 | 11 | module.exports = class Scraper { 12 | constructor(options = {}) { 13 | debug('constructor'); 14 | const { 15 | config = {}, 16 | context = {}, 17 | pluggable = null, 18 | page = null, 19 | } = options; 20 | 21 | this.page = page; 22 | this.last_response = null; // the last response object 23 | this.metadata = { 24 | scraping_detected: false, 25 | }; 26 | this.pluggable = pluggable; 27 | this.config = config; 28 | this.logger = this.config.logger; 29 | this.context = context; 30 | 31 | this.proxy = config.proxy; 32 | this.keywords = config.keywords; 33 | 34 | this.STANDARD_TIMEOUT = 10000; 35 | this.SOLVE_CAPTCHA_TIME = 45000; 36 | 37 | this.results = {}; 38 | this.result_rank = 1; 39 | // keep track of the requests done 40 | this.num_requests = 0; 41 | // keep track of the keywords searched 42 | this.num_keywords = 0; 43 | 44 | let settings = this.config[`${this.config.search_engine}_settings`]; 45 | if (settings) { 46 | if (typeof settings === 'string') { 47 | settings = JSON.parse(settings); 48 | this.config[`${this.config.search_engine}_settings`] = settings; 49 | } 50 | } 51 | } 52 | 53 | async run({page, data, worker}) { 54 | 55 | debug('worker=%o', worker, this.config.keywords); 56 | 57 | if (page) { 58 | this.page = page; 59 | } 60 | 61 | await this.page.setViewport({ width: 1920, height: 1040 }); 62 | let do_continue = true; 63 | 64 | if (this.config.scrape_from_file.length <= 0) { 65 | do_continue = await this.load_search_engine(); 66 | } 67 | 68 | if (!do_continue) { 69 | console.error('Failed to load the search engine: load_search_engine()'); 70 | } else { 71 | await this.scraping_loop(); 72 | } 73 | 74 | return { 75 | results: this.results, 76 | metadata: this.metadata, 77 | num_requests: this.num_requests, 78 | } 79 | } 80 | 81 | /** 82 | * Action that runs only once in the beginning of the 83 | * scraping procedure. 84 | * 85 | * @returns {Promise} true if everything is correct. 
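* Applies the headless-detection evasions, optional asset blocking, the optional
* test_evasion screenshot and IP/header logging, then hands off to load_start_page().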
86 | */ 87 | async load_search_engine() { 88 | 89 | if (this.config.apply_evasion_techniques === true) { 90 | // prevent detection by evading common detection techniques 91 | await evadeChromeHeadlessDetection(this.page); 92 | } 93 | 94 | // block some assets to speed up scraping 95 | if (this.config.block_assets === true) { 96 | await this.page.setRequestInterception(true); 97 | this.page.on('request', (req) => { 98 | let type = req.resourceType(); 99 | const block = ['stylesheet', 'font', 'image', 'media']; 100 | if (block.includes(type)) { 101 | req.abort(); 102 | } else { 103 | req.continue(); 104 | } 105 | }); 106 | } 107 | 108 | if (this.config.test_evasion === true) { 109 | // Navigate to the page that will perform the tests. 110 | const testUrl = 'https://bot.sannysoft.com'; 111 | await this.page.goto(testUrl); 112 | // Save a screenshot of the results. 113 | await this.page.screenshot({path: 'headless-evasion-result.png'}); 114 | } 115 | 116 | if (this.config.log_http_headers === true) { 117 | this.metadata.http_headers = await meta.get_http_headers(this.page); 118 | debug('this.metadata.http_headers=%O', this.metadata.http_headers); 119 | } 120 | 121 | if (this.config.log_ip_address === true) { 122 | let ipinfo = await meta.get_ip_data(this.page); 123 | this.metadata.ipinfo = ipinfo; 124 | debug('this.metadata.ipinfo', this.metadata.ipinfo); 125 | } 126 | 127 | // check that our proxy is working by confirming 128 | // that ipinfo.io sees the proxy IP address 129 | if (this.proxy && this.config.log_ip_address === true) { 130 | debug(`${this.metadata.ipinfo.ip} vs ${this.proxy}`); 131 | 132 | // if the ip returned by ipinfo is not a substring of our proxystring, get the heck outta here 133 | if (!this.proxy.includes(this.metadata.ipinfo.ip)) { 134 | throw new Error(`Proxy output ip ${this.proxy} does not match with provided one`); 135 | } else { 136 | this.logger.info(`Using valid Proxy: ${this.proxy}`); 137 | } 138 | 139 | } 140 | 141 | return await this.load_start_page(); 142 | } 143 | 144 | /** 145 | * Each scraper basically iterates over a list of 146 | * keywords and a list of pages. This is the generic 147 | * method for that. 148 | * 149 | * @returns {Promise} 150 | */ 151 | async scraping_loop() { 152 | for (var keyword of this.keywords) { 153 | this.num_keywords++; 154 | this.keyword = keyword; 155 | this.results[keyword] = {}; 156 | this.result_rank = 1; 157 | 158 | try { 159 | 160 | if (this.pluggable && this.pluggable.before_keyword_scraped) { 161 | await this.pluggable.before_keyword_scraped({ 162 | results: this.results, 163 | num_keywords: this.num_keywords, 164 | num_requests: this.num_requests, 165 | keyword: keyword, 166 | }); 167 | } 168 | 169 | this.page_num = 1; 170 | 171 | // load scraped page from file if `scrape_from_file` is given 172 | if (this.config.scrape_from_file.length <= 0) { 173 | await this.search_keyword(keyword); 174 | } else { 175 | this.last_response = await this.page.goto(this.config.scrape_from_file); 176 | } 177 | 178 | // when searching the keyword fails, num_requests will not 179 | // be incremented. 180 | this.num_requests++; 181 | 182 | do { 183 | 184 | this.logger.info(`${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${this.page_num}`); 185 | 186 | await this.wait_for_results(); 187 | 188 | if (this.config.sleep_range) { 189 | await this.random_sleep(); 190 | } 191 | 192 | let html = await this.page.content(); 193 | let parsed = this.parse(html); 194 | this.results[keyword][this.page_num] = parsed ? 
parsed : await this.parse_async(html); 195 | 196 | if (this.config.screen_output) { 197 | this.results[keyword][this.page_num].screenshot = await this.page.screenshot({ 198 | encoding: 'base64', 199 | fullPage: false, 200 | }); 201 | } 202 | 203 | if (this.config.html_output) { 204 | 205 | if (this.config.clean_html_output) { 206 | await this.page.evaluate(() => { 207 | // remove script and style tags 208 | Array.prototype.slice.call(document.getElementsByTagName('script')).forEach( 209 | function(item) { 210 | item.remove(); 211 | }); 212 | Array.prototype.slice.call(document.getElementsByTagName('style')).forEach( 213 | function(item) { 214 | item.remove(); 215 | }); 216 | 217 | // remove all comment nodes 218 | var nodeIterator = document.createNodeIterator( 219 | document.body, 220 | NodeFilter.SHOW_COMMENT, 221 | { acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } } 222 | ); 223 | while(nodeIterator.nextNode()){ 224 | var commentNode = nodeIterator.referenceNode; 225 | commentNode.remove(); 226 | } 227 | }); 228 | } 229 | 230 | if (this.config.clean_data_images) { 231 | await this.page.evaluate(() => { 232 | Array.prototype.slice.call(document.getElementsByTagName('img')).forEach( 233 | function(item) { 234 | let src = item.getAttribute('src'); 235 | if (src && src.startsWith('data:')) { 236 | item.setAttribute('src', ''); 237 | } 238 | }); 239 | }); 240 | } 241 | 242 | let html_contents = await this.page.content(); 243 | // https://stackoverflow.com/questions/27841112/how-to-remove-white-space-between-html-tags-using-javascript 244 | // TODO: not sure if this is safe! 245 | html_contents = html_contents.replace(/>\s+</g, '><'); 246 | this.results[keyword][this.page_num].html = html_contents; 247 | } 248 | 249 | this.page_num += 1; 250 | 251 | // only load the next page when the while loop 252 | // will run another iteration 253 | if (this.page_num <= this.config.num_pages) { 254 | 255 | let next_page_loaded = await this.next_page(); 256 | 257 | if (next_page_loaded === false) { 258 | break; 259 | } else { 260 | this.num_requests++; 261 | } 262 | } 263 | 264 | } while (this.page_num <= this.config.num_pages); 265 | 266 | } catch (e) { 267 | 268 | this.logger.warn(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e.message}`); 269 | debug('this.last_response=%O', this.last_response); 270 | 271 | if (this.config.take_screenshot_on_error) { 272 | await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` }); 273 | } 274 | 275 | this.metadata.scraping_detected = await this.detected(); 276 | 277 | if (this.metadata.scraping_detected === true) { 278 | this.logger.warn(`${this.config.search_engine_name} detected the scraping!`); 279 | 280 | if (this.config.is_local === true) { 281 | this.logger.info(`You have ${this.SOLVE_CAPTCHA_TIME}ms to enter the captcha.`); 282 | await this.sleep(this.SOLVE_CAPTCHA_TIME); 283 | // expect that user filled out necessary captcha 284 | } else { 285 | if (this.config.throw_on_detection === true) { 286 | throw( e ); 287 | } else { 288 | return; 289 | } 290 | } 291 | } else { 292 | // some other error, quit scraping process if stuff is broken 293 | if (this.config.throw_on_detection === true) { 294 | throw( e ); 295 | } else { 296 | return; 297 | } 298 | } 299 | } 300 | } 301 | } 302 | 303 | /** 304 | * Generic function to append queryArgs to a search engine url. 305 | * 306 | * @param baseUrl The base url to use for the build process.
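 *
 * Rough illustration with hypothetical values (not taken from the source):
 * given `search_engine: 'google'` and `google_settings: { hl: 'en', num: 10 }`,
 * the loop below turns a baseUrl of 'https://www.google.com/search?' into
 * 'https://www.google.com/search?hl=en&num=10&' (values are appended verbatim,
 * with no URL encoding, and a trailing '&' is left in place).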
307 | */ 308 | build_start_url(baseUrl) { 309 | let settings = this.config[`${this.config.search_engine}_settings`]; 310 | 311 | if (settings) { 312 | for (var key in settings) { 313 | baseUrl += `${key}=${settings[key]}&` 314 | } 315 | 316 | this.logger.info('Using startUrl: ' + baseUrl); 317 | 318 | return baseUrl; 319 | } 320 | 321 | return false; 322 | } 323 | 324 | sleep(ms) { 325 | return new Promise(resolve => { 326 | setTimeout(resolve, ms) 327 | }) 328 | } 329 | 330 | async random_sleep() { 331 | const [min, max] = this.config.sleep_range; 332 | let rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number 333 | this.logger.info(`Sleeping for ${rand}s`); 334 | await this.sleep(rand * 1000); 335 | } 336 | 337 | async set_input_value(selector, value) { 338 | await this.page.waitFor(selector); 339 | await this.page.evaluate((value, selector) => { 340 | return document.querySelector(selector).value = value; 341 | }, value, selector); 342 | } 343 | 344 | no_results(needles, html) { 345 | for (let needle of needles) { 346 | if (html.includes(needle)) { 347 | this.logger.warn(`HTML contains needle ${needle}. no_results=true`); 348 | return true; 349 | } 350 | } 351 | return false; 352 | } 353 | 354 | /* 355 | Throw away all elements that do not have data in the 356 | specified attributes. Most be of value string. 357 | */ 358 | clean_results(results, attributes) { 359 | const cleaned = []; 360 | for (var res of results) { 361 | let goodboy = true; 362 | for (var attr of attributes) { 363 | if (!res[attr] || !res[attr].trim()) { 364 | goodboy = false; 365 | break; 366 | } 367 | } 368 | if (goodboy) { 369 | res.rank = this.result_rank++; 370 | cleaned.push(res); 371 | } 372 | } 373 | return cleaned; 374 | } 375 | 376 | parse(html) { 377 | 378 | } 379 | 380 | async parse_async(html) { 381 | 382 | } 383 | 384 | /** 385 | * 386 | * @returns true if startpage was loaded correctly. 387 | */ 388 | async load_start_page() { 389 | 390 | } 391 | 392 | /** 393 | * Searches the keyword by inputting it into the form and hitting enter 394 | * or something similar. 395 | * 396 | * @param keyword 397 | * @returns {Promise} 398 | */ 399 | async search_keyword(keyword) { 400 | 401 | } 402 | 403 | /** 404 | * 405 | * @returns true if the next page was loaded correctely 406 | */ 407 | async next_page() { 408 | 409 | } 410 | 411 | async wait_for_results() { 412 | 413 | } 414 | 415 | async detected() { 416 | 417 | } 418 | }; 419 | 420 | // This is where we'll put the code to get around the tests. 421 | async function evadeChromeHeadlessDetection(page) { 422 | 423 | // Pass the Webdriver Test. 424 | await page.evaluateOnNewDocument(() => { 425 | const newProto = navigator.__proto__; 426 | delete newProto.webdriver; 427 | navigator.__proto__ = newProto; 428 | }); 429 | 430 | // Pass the Chrome Test. 431 | await page.evaluateOnNewDocument(() => { 432 | // We can mock this in as much depth as we need for the test. 
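        // For instance (hypothetical check, not taken from any particular detection
        // library), a page might only probe for the presence and rough shape of this
        // object, e.g. `const looksHeadless = !window.chrome || !window.chrome.runtime;`,
        // so a shallow mock of the documented keys is usually enough to pass.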
433 | const mockObj = { 434 | app: { 435 | isInstalled: false, 436 | }, 437 | webstore: { 438 | onInstallStageChanged: {}, 439 | onDownloadProgress: {}, 440 | }, 441 | runtime: { 442 | PlatformOs: { 443 | MAC: 'mac', 444 | WIN: 'win', 445 | ANDROID: 'android', 446 | CROS: 'cros', 447 | LINUX: 'linux', 448 | OPENBSD: 'openbsd', 449 | }, 450 | PlatformArch: { 451 | ARM: 'arm', 452 | X86_32: 'x86-32', 453 | X86_64: 'x86-64', 454 | }, 455 | PlatformNaclArch: { 456 | ARM: 'arm', 457 | X86_32: 'x86-32', 458 | X86_64: 'x86-64', 459 | }, 460 | RequestUpdateCheckStatus: { 461 | THROTTLED: 'throttled', 462 | NO_UPDATE: 'no_update', 463 | UPDATE_AVAILABLE: 'update_available', 464 | }, 465 | OnInstalledReason: { 466 | INSTALL: 'install', 467 | UPDATE: 'update', 468 | CHROME_UPDATE: 'chrome_update', 469 | SHARED_MODULE_UPDATE: 'shared_module_update', 470 | }, 471 | OnRestartRequiredReason: { 472 | APP_UPDATE: 'app_update', 473 | OS_UPDATE: 'os_update', 474 | PERIODIC: 'periodic', 475 | }, 476 | }, 477 | }; 478 | 479 | window.navigator.chrome = mockObj; 480 | window.chrome = mockObj; 481 | }); 482 | 483 | // Pass the Permissions Test. 484 | await page.evaluateOnNewDocument(() => { 485 | const originalQuery = window.navigator.permissions.query; 486 | window.navigator.permissions.__proto__.query = parameters => 487 | parameters.name === 'notifications' 488 | ? Promise.resolve({state: Notification.permission}) 489 | : originalQuery(parameters); 490 | 491 | // Inspired by: https://github.com/ikarienator/phantomjs_hide_and_seek/blob/master/5.spoofFunctionBind.js 492 | const oldCall = Function.prototype.call; 493 | 494 | function call() { 495 | return oldCall.apply(this, arguments); 496 | } 497 | 498 | Function.prototype.call = call; 499 | 500 | const nativeToStringFunctionString = Error.toString().replace(/Error/g, "toString"); 501 | const oldToString = Function.prototype.toString; 502 | 503 | function functionToString() { 504 | if (this === window.navigator.permissions.query) { 505 | return "function query() { [native code] }"; 506 | } 507 | if (this === functionToString) { 508 | return nativeToStringFunctionString; 509 | } 510 | return oldCall.call(oldToString, this); 511 | } 512 | 513 | Function.prototype.toString = functionToString; 514 | }); 515 | 516 | // Pass the Plugins Length Test. 517 | await page.evaluateOnNewDocument(() => { 518 | // Overwrite the `plugins` property to use a custom getter. 519 | Object.defineProperty(navigator, 'plugins', { 520 | // This just needs to have `length > 0` for the current test, 521 | // but we could mock the plugins too if necessary. 522 | get: () => [1, 2, 3, 4, 5] 523 | }); 524 | }); 525 | 526 | // Pass the Languages Test. 527 | await page.evaluateOnNewDocument(() => { 528 | // Overwrite the `plugins` property to use a custom getter. 
529 | Object.defineProperty(navigator, 'languages', { 530 | get: () => ['en-US', 'en'] 531 | }); 532 | }); 533 | 534 | // Pass the iframe Test 535 | await page.evaluateOnNewDocument(() => { 536 | Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { 537 | get: function () { 538 | return window; 539 | } 540 | }); 541 | }); 542 | 543 | // Pass toString test, though it breaks console.debug() from working 544 | await page.evaluateOnNewDocument(() => { 545 | window.console.debug = () => { 546 | return null; 547 | }; 548 | }); 549 | } 550 | -------------------------------------------------------------------------------- /src/modules/yandex.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const Scraper = require('./se_scraper'); 4 | 5 | class YandexScraper extends Scraper { 6 | 7 | constructor(...args) { 8 | super(...args); 9 | } 10 | 11 | async parse_async(html) { 12 | 13 | let results = await this.page.evaluate(() => { 14 | let serp_items = document.querySelectorAll('.serp-item'); 15 | const data = []; 16 | serp_items.forEach((item) => { 17 | let obj = { 18 | is_ad: false, 19 | }; 20 | try { 21 | if (item) { 22 | 23 | let linkElement = item.querySelector('h2 a.link'); 24 | 25 | if (linkElement) { 26 | obj.link = linkElement.getAttribute('href'); 27 | obj.title = linkElement.innerText; 28 | } 29 | 30 | 31 | let label = item.querySelector('.organic__subtitle .label'); 32 | 33 | if (label) { 34 | let labelText = label.innerText; 35 | 36 | if (labelText) { 37 | labelText = labelText.trim().toLowerCase(); 38 | console.log(labelText); 39 | let ad_labels = ['ad', 'werbung', 'реклама', 'anuncio']; 40 | obj.is_ad = ad_labels.includes(labelText); 41 | } 42 | } 43 | 44 | obj.snippet = item.querySelector('.text-container.typo').innerText; 45 | obj.visible_link = item.querySelector('.typo_type_greenurl').innerText; 46 | 47 | if (obj.title) { 48 | data.push(obj); 49 | } 50 | } 51 | } catch (e) { 52 | } 53 | }); 54 | return data; 55 | }); 56 | 57 | let num_results = await this.page.evaluate(() => { 58 | let num_results = document.querySelector('.serp-adv__found'); 59 | if (num_results) { 60 | return num_results.innerText; 61 | } 62 | }); 63 | 64 | const cleaned = this.clean_results(results, ['title', 'link' , 'snippet']); 65 | 66 | return { 67 | time: (new Date()).toUTCString(), 68 | num_results: num_results, 69 | results: cleaned, 70 | }; 71 | } 72 | 73 | async load_start_page() { 74 | let startUrl = 'https://yandex.com'; 75 | 76 | this.logger.info('Using startUrl: ' + startUrl); 77 | 78 | this.last_response = await this.page.goto(startUrl); 79 | 80 | await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT }); 81 | 82 | return true; 83 | } 84 | 85 | async search_keyword(keyword) { 86 | const input = await this.page.$('input[name="text"]'); 87 | await this.set_input_value(`input[name="text"]`, keyword); 88 | await this.sleep(50); 89 | await input.focus(); 90 | await this.page.keyboard.press("Enter"); 91 | } 92 | 93 | async next_page() { 94 | let next_page_link = await this.page.$('.pager .pager__item_kind_next', {timeout: 1000}); 95 | if (!next_page_link) { 96 | return false; 97 | } 98 | await next_page_link.click(); 99 | 100 | return true; 101 | } 102 | 103 | async wait_for_results() { 104 | await this.page.waitForSelector('.main__content', { timeout: this.STANDARD_TIMEOUT }); 105 | } 106 | 107 | async detected() { 108 | 109 | } 110 | } 111 | 112 | module.exports = { 113 | YandexScraper: 
YandexScraper, 114 | }; -------------------------------------------------------------------------------- /src/node_scraper.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const fs = require('fs'); 4 | const os = require('os'); 5 | const _ = require('lodash'); 6 | const { createLogger, format, transports } = require('winston'); 7 | const { combine, timestamp, printf } = format; 8 | const debug = require('debug')('se-scraper:ScrapeManager'); 9 | const { Cluster } = require('puppeteer-cluster'); 10 | 11 | const UserAgent = require('user-agents'); 12 | const google = require('./modules/google.js'); 13 | const bing = require('./modules/bing.js'); 14 | const yandex = require('./modules/yandex.js'); 15 | const infospace = require('./modules/infospace.js'); 16 | const duckduckgo = require('./modules/duckduckgo.js'); 17 | const CustomConcurrencyImpl = require('./concurrency-implementation'); 18 | 19 | const MAX_ALLOWED_BROWSERS = 6; 20 | 21 | function write_results(fname, data) { 22 | fs.writeFileSync(fname, data, (err) => { 23 | if (err) throw err; 24 | console.log(`Results written to file ${fname}`); 25 | }); 26 | } 27 | 28 | function read_keywords_from_file(fname) { 29 | let kws = fs.readFileSync(fname).toString().split(os.EOL); 30 | // clean keywords 31 | kws = kws.filter((kw) => { 32 | return kw.trim().length > 0; 33 | }); 34 | return kws; 35 | } 36 | 37 | 38 | function getScraper(search_engine, args) { 39 | if (typeof search_engine === 'string') { 40 | return new { 41 | google: google.GoogleScraper, 42 | google_news_old: google.GoogleNewsOldScraper, 43 | google_news: google.GoogleNewsScraper, 44 | google_image: google.GoogleImageScraper, 45 | bing: bing.BingScraper, 46 | yandex: yandex.YandexScraper, 47 | bing_news: bing.BingNewsScraper, 48 | duckduckgo: duckduckgo.DuckduckgoScraper, 49 | infospace: infospace.InfospaceScraper, 50 | webcrawler: infospace.WebcrawlerNewsScraper, 51 | }[search_engine](args); 52 | } else if (typeof search_engine === 'function') { 53 | return new search_engine(args); 54 | } else { 55 | throw new Error(`search_engine must either be a string of class (function)`); 56 | } 57 | } 58 | 59 | 60 | class ScrapeManager { 61 | 62 | constructor(config, context={}) { 63 | 64 | this.cluster = null; 65 | this.pluggable = null; 66 | this.scraper = null; 67 | this.context = context; 68 | 69 | this.config = _.defaults(config, { 70 | // the user agent to scrape with 71 | user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', 72 | // if random_user_agent is set to True, a random user agent is chosen 73 | random_user_agent: false, 74 | // whether to select manual settings in visible mode 75 | set_manual_settings: false, 76 | // log ip address data 77 | log_ip_address: false, 78 | // log http headers 79 | log_http_headers: false, 80 | // how long to sleep between requests. a random sleep interval within the range [a,b] 81 | // is drawn before every request. empty string for no sleeping. 
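        // e.g. sleep_range: [5, 10] (hypothetical values) sleeps a random 5-10 seconds
        // before each request; the default of null disables the extra sleep.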
82 | sleep_range: null, 83 | // which search engine to scrape 84 | search_engine: 'google', 85 | search_engine_name: 'google', 86 | logger: createLogger({ 87 | level: 'info', 88 | format: combine( 89 | timestamp(), 90 | printf(({ level, message, timestamp }) => { 91 | return `${timestamp} [${level}] ${message}`; 92 | }) 93 | ), 94 | transports: [ 95 | new transports.Console() 96 | ] 97 | }), 98 | keywords: ['nodejs rocks',], 99 | // whether to start the browser in headless mode 100 | headless: true, 101 | // specify flags passed to chrome here 102 | // About our defaults values https://peter.sh/experiments/chromium-command-line-switches/ 103 | chrome_flags: [ 104 | '--disable-infobars', 105 | '--window-position=0,0', 106 | '--ignore-certifcate-errors', 107 | '--ignore-certifcate-errors-spki-list', 108 | '--no-sandbox', 109 | '--disable-setuid-sandbox', 110 | '--disable-dev-shm-usage', 111 | '--disable-accelerated-2d-canvas', 112 | '--disable-gpu', 113 | '--window-size=1920,1040', 114 | '--start-fullscreen', 115 | '--hide-scrollbars', 116 | '--disable-notifications', 117 | ], 118 | // the number of pages to scrape for each keyword 119 | num_pages: 1, 120 | // path to output file, data will be stored in JSON 121 | output_file: '', 122 | // whether to also passthru all the html output of the serp pages 123 | html_output: false, 124 | // whether to strip JS and CSS from the html_output 125 | // has only an effect if `html_output` is true 126 | clean_html_output: true, 127 | // remove all data images from the html 128 | clean_data_images: true, 129 | // whether to return a screenshot of serp pages as b64 data 130 | screen_output: false, 131 | // Scrape url from local file. Mainly used for testing. 132 | scrape_from_file: '', 133 | // whether to prevent images, css, fonts and media from being loaded 134 | // will speed up scraping a great deal 135 | block_assets: true, 136 | // path to js module that extends functionality 137 | // this module should export the functions: 138 | // get_browser, handle_metadata, close_browser 139 | //custom_func: resolve('examples/pluggable.js'), 140 | custom_func: null, 141 | throw_on_detection: false, 142 | // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080'] 143 | proxies: null, 144 | // a file with one proxy per line. 
Example: 145 | // socks5://78.94.172.42:1080 146 | // http://118.174.233.10:48400 147 | proxy_file: '', 148 | // whether to use proxies only 149 | // when this is set to true, se-scraper will not use 150 | // your default IP address 151 | use_proxies_only: false, 152 | // check if headless chrome escapes common detection techniques 153 | // this is a quick test and should be used for debugging 154 | test_evasion: false, 155 | apply_evasion_techniques: true, 156 | // settings for puppeteer-cluster 157 | puppeteer_cluster_config: { 158 | timeout: 30 * 60 * 1000, // max timeout set to 30 minutes 159 | monitor: false, 160 | concurrency: Cluster.CONCURRENCY_BROWSER, 161 | maxConcurrency: 1, 162 | } 163 | }); 164 | 165 | this.logger = this.config.logger; 166 | 167 | if (config.sleep_range) { 168 | // parse an array 169 | config.sleep_range = eval(config.sleep_range); 170 | 171 | if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') { 172 | throw "sleep_range is not a valid array of two integers."; 173 | } 174 | } 175 | 176 | if (fs.existsSync(this.config.keyword_file)) { 177 | this.config.keywords = read_keywords_from_file(this.config.keyword_file); 178 | } 179 | 180 | if (this.config.proxies && this.config.proxy_file) { 181 | throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.'); 182 | } 183 | 184 | if (this.config.proxy_file) { 185 | this.config.proxies = read_keywords_from_file(this.config.proxy_file); 186 | this.logger.info(`${this.config.proxies.length} proxies read from file.`); 187 | } 188 | 189 | if (!this.config.proxies && this.config.use_proxies_only) { 190 | throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only'); 191 | } 192 | 193 | debug('this.config=%O', this.config); 194 | } 195 | 196 | /* 197 | * Launches the puppeteer cluster or browser. 198 | * 199 | * Returns true if the browser was successfully launched. Otherwise will return false. 200 | */ 201 | async start() { 202 | 203 | if (this.config.custom_func) { 204 | if (fs.existsSync(this.config.custom_func)) { 205 | try { 206 | const PluggableClass = require(this.config.custom_func); 207 | this.pluggable = new PluggableClass({ 208 | config: this.config, 209 | context: this.context 210 | }); 211 | } catch (exception) { 212 | console.error(exception); 213 | return false; 214 | } 215 | } else { 216 | console.error(`File "${this.config.custom_func}" does not exist!`); 217 | return false; 218 | } 219 | } 220 | 221 | const chrome_flags = _.clone(this.config.chrome_flags); 222 | 223 | if (this.pluggable && this.pluggable.start_browser) { 224 | launch_args.config = this.config; 225 | this.browser = await this.pluggable.start_browser({ 226 | config: this.config, 227 | }); 228 | this.page = await this.browser.newPage(); 229 | } else { 230 | // if no custom start_browser functionality was given 231 | // use puppeteer-cluster for scraping 232 | 233 | let proxies; 234 | // if we have at least one proxy, always use CONCURRENCY_BROWSER 235 | // and set maxConcurrency to this.config.proxies.length + 1 236 | // else use whatever this.configuration was passed 237 | if (this.config.proxies && this.config.proxies.length > 0) { 238 | 239 | // because we use real browsers, we ran out of memory on normal laptops 240 | // when using more than maybe 5 or 6 browsers. 
241 | // therefore hardcode a limit here 242 | // TODO not sure this what we want 243 | this.numClusters = Math.min( 244 | this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1), 245 | MAX_ALLOWED_BROWSERS 246 | ); 247 | proxies = _.clone(this.config.proxies); 248 | 249 | // Insert a first config without proxy if use_proxy_only is false 250 | if (this.config.use_proxies_only === false) { 251 | proxies.unshift(null); 252 | } 253 | 254 | } else { 255 | this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency; 256 | proxies = _.times(this.numClusters, null); 257 | } 258 | 259 | this.logger.info(`Using ${this.numClusters} clusters.`); 260 | 261 | // Give the per browser options 262 | const perBrowserOptions = _.map(proxies, (proxy) => { 263 | const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent; 264 | let args = chrome_flags.concat([`--user-agent=${userAgent}`]); 265 | 266 | if (proxy) { 267 | args = args.concat([`--proxy-server=${proxy}`]); 268 | } 269 | 270 | return { 271 | headless: this.config.headless, 272 | ignoreHTTPSErrors: true, 273 | args 274 | }; 275 | }); 276 | 277 | debug('perBrowserOptions=%O', perBrowserOptions) 278 | 279 | this.cluster = await Cluster.launch({ 280 | monitor: this.config.puppeteer_cluster_config.monitor, 281 | timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes 282 | concurrency: CustomConcurrencyImpl, 283 | maxConcurrency: this.numClusters, 284 | puppeteerOptions: { 285 | perBrowserOptions: perBrowserOptions 286 | } 287 | }); 288 | } 289 | } 290 | 291 | /* 292 | * Scrapes the keywords specified by the config. 293 | */ 294 | async scrape(scrape_config = {}) { 295 | 296 | if (!scrape_config.keywords && !scrape_config.keyword_file) { 297 | throw new Error('Either keywords or keyword_file must be supplied to scrape()'); 298 | } 299 | 300 | Object.assign(this.config, scrape_config); 301 | 302 | var results = {}; 303 | var num_requests = 0; 304 | var metadata = {}; 305 | var startTime = Date.now(); 306 | 307 | this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine; 308 | 309 | this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`); 310 | 311 | if (this.pluggable && this.pluggable.start_browser) { 312 | 313 | this.scraper = getScraper(this.config.search_engine, { 314 | config: this.config, 315 | context: this.context, 316 | pluggable: this.pluggable, 317 | page: this.page, 318 | }); 319 | 320 | var {results, metadata, num_requests} = await this.scraper.run(this.page); 321 | 322 | } else { 323 | // Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine. 324 | // https://github.com/GoogleChrome/puppeteer/issues/678 325 | // The question is: Is it possible to set proxies per Page? Per Browser? 326 | // as far as I can see, puppeteer cluster uses the same puppeteerOptions 327 | // for every browser instance. We will use our custom puppeteer-cluster version. 
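                // Worked example with hypothetical numbers: N = 10 keywords, K = 4 proxies and
                // use_proxies_only = false give numClusters = K + 1 = 5 browsers (one per proxy
                // plus one over the direct connection, capped at MAX_ALLOWED_BROWSERS = 6), so
                // each browser gets 10 / 5 = 2 keywords and, with num_pages M = 3, issues
                // roughly 2 * 3 = 6 requests to the search engine.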
328 | // https://www.npmjs.com/package/proxy-chain 329 | // this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077 330 | let chunks = []; 331 | for (var n = 0; n < this.numClusters; n++) { 332 | chunks.push([]); 333 | } 334 | for (var k = 0; k < this.config.keywords.length; k++) { 335 | chunks[k % this.numClusters].push(this.config.keywords[k]); 336 | } 337 | 338 | debug('chunks=%o', chunks); 339 | 340 | let execPromises = []; 341 | for (var c = 0; c < chunks.length; c++) { 342 | const config = _.clone(this.config); 343 | config.keywords = chunks[c]; 344 | 345 | var obj = getScraper(this.config.search_engine, { 346 | config: config, 347 | context: {}, 348 | pluggable: this.pluggable, 349 | }); 350 | 351 | var boundMethod = obj.run.bind(obj); 352 | execPromises.push(this.cluster.execute({}, boundMethod)); 353 | } 354 | 355 | let promiseReturns = await Promise.all(execPromises); 356 | 357 | // Merge results and metadata per keyword 358 | for (let promiseReturn of promiseReturns) { 359 | Object.assign(results, promiseReturn.results); 360 | Object.assign(metadata, promiseReturn.metadata); 361 | num_requests += promiseReturn.num_requests; 362 | } 363 | } 364 | 365 | let timeDelta = Date.now() - startTime; 366 | let ms_per_request = timeDelta/num_requests; 367 | 368 | this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); 369 | this.logger.info(`On average ms/request: ${ms_per_request}ms/request`); 370 | 371 | if (this.pluggable && this.pluggable.handle_results) { 372 | await this.pluggable.handle_results(results); 373 | } 374 | 375 | metadata.elapsed_time = timeDelta.toString(); 376 | metadata.ms_per_keyword = ms_per_request.toString(); 377 | metadata.num_requests = num_requests; 378 | 379 | debug('metadata=%O', metadata); 380 | 381 | if (this.pluggable && this.pluggable.handle_metadata) { 382 | await this.pluggable.handle_metadata(metadata); 383 | } 384 | 385 | if (this.config.output_file) { 386 | this.logger.info(`Writing results to ${this.config.output_file}`); 387 | write_results(this.config.output_file, JSON.stringify(results, null, 4)); 388 | } 389 | 390 | return { 391 | results: results, 392 | metadata: metadata || {}, 393 | }; 394 | } 395 | 396 | /* 397 | * Quit the puppeteer cluster/browser. 
398 | */ 399 | async quit() { 400 | if (this.pluggable && this.pluggable.close_browser) { 401 | await this.pluggable.close_browser(); 402 | } else { 403 | await this.cluster.idle(); 404 | await this.cluster.close(); 405 | } 406 | } 407 | } 408 | 409 | module.exports = { 410 | ScrapeManager: ScrapeManager, 411 | }; 412 | -------------------------------------------------------------------------------- /test/html_output.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const { createLogger, transports } = require('winston'); 4 | const http = require('http'); 5 | const https = require('https'); 6 | const assert = require('assert'); 7 | const path = require('path'); 8 | const keyCert = require('key-cert'); 9 | const Promise = require('bluebird'); 10 | const Proxy = require('http-mitm-proxy'); 11 | 12 | const debug = require('debug')('se-scraper:test'); 13 | const se_scraper = require('../'); 14 | 15 | const httpPort = 3012; 16 | const httpsPort = httpPort + 1; 17 | const proxyPort = httpPort + 2; 18 | 19 | const fakeSearchEngine = express(); 20 | fakeSearchEngine.get('/search', (req, res) => { 21 | debug('q=%s', req.query.q); 22 | const pageNumber = ((req.query.start/10) || 0) + 1; 23 | res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); 24 | }); 25 | fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); 26 | 27 | describe('Config', function(){ 28 | 29 | let httpServer, httpsServer, proxy; 30 | before(async function(){ 31 | // Here mount our fake engine in both http and https listen server 32 | httpServer = http.createServer(fakeSearchEngine); 33 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 34 | 35 | proxy = Proxy(); 36 | proxy.onRequest((ctx, callback) => { 37 | ctx.proxyToServerRequestOptions.host = 'localhost'; 38 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 39 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 40 | debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); 41 | return callback(); 42 | }); 43 | 44 | await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); 45 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 46 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 47 | debug('Fake http search engine servers started'); 48 | }); 49 | 50 | after(function(){ 51 | httpsServer.close(); 52 | httpServer.close(); 53 | proxy.close(); 54 | }); 55 | 56 | describe('html_output', function(){ 57 | 58 | const testLogger = createLogger({ 59 | transports: [ 60 | new transports.Console({ 61 | level: 'error' 62 | }) 63 | ] 64 | }); 65 | 66 | /** 67 | * Test html_output option 68 | */ 69 | it('html_output single page single keyword', async function () { 70 | 71 | const scrape_job = { 72 | search_engine: 'google', 73 | /* TODO refactor start_url 74 | google_settings: { 75 | start_url: 'http://localhost:' + httpPort 76 | }, 77 | */ 78 | keywords: ['test keyword'], 79 | }; 80 | 81 | var scraper = new se_scraper.ScrapeManager({ 82 | throw_on_detection: true, 83 | logger: testLogger, 84 | html_output: true, 85 | //clean_html_output: false, 86 | //clean_data_images: false, 87 | // TODO refactor start_url so we can use-it instead of depending of the proxy for this test 88 | proxies: ['http://localhost:' + proxyPort], 89 | use_proxies_only: true, 90 | }); 91 | await scraper.start(); 92 | const { results } = await scraper.scrape(scrape_job); 93 | await scraper.quit(); 94 | 95 | assert(results['test keyword']['1'].html.length > 1000, 'Html of google page 1 should be provided'); 96 | 97 | }); 98 | 99 | }); 100 | 101 | }); -------------------------------------------------------------------------------- /test/mocks/duckduckgo/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | DuckDuckGo — Privacy, simplified. 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 63 | 64 | 65 | 66 |
147 | 148 | 149 | -------------------------------------------------------------------------------- /test/modules/bing.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const puppeteer = require('puppeteer'); 4 | const { createLogger, transports } = require('winston'); 5 | const http = require('http'); 6 | const https = require('https'); 7 | const assert = require('assert'); 8 | const path = require('path'); 9 | const keyCert = require('key-cert'); 10 | const Promise = require('bluebird'); 11 | const Proxy = require('http-mitm-proxy'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const { BingScraper } = require('../../src/modules/bing'); 15 | 16 | const httpPort = 3012; 17 | const httpsPort = httpPort + 1; 18 | const proxyPort = httpPort + 2; 19 | 20 | const fakeSearchEngine = express(); 21 | fakeSearchEngine.get('/search', (req, res, next) => { 22 | debug('q=%s', req.query.q); 23 | const pageNumber = Math.round((req.query.first || 0) /10) + 1; 24 | res.sendFile(path.join(__dirname, '../mocks/bing/' + req.query.q + '_page' + pageNumber + '.html')); 25 | }); 26 | fakeSearchEngine.use(express.static('test/mocks/bing', {extensions: ['html']})); 27 | 28 | describe('Module Bing', function(){ 29 | 30 | let httpServer, httpsServer, proxy; 31 | before(async function(){ 32 | // Here mount our fake engine in both http and https listen server 33 | httpServer = http.createServer(fakeSearchEngine); 34 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 35 | 36 | proxy = Proxy(); 37 | proxy.onRequest((ctx, callback) => { 38 | ctx.proxyToServerRequestOptions.host = 'localhost'; 39 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 40 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 41 | debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port); 42 | return callback(); 43 | }); 44 | 45 | await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); 46 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 47 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 48 | debug('Fake http search engine servers started'); 49 | }); 50 | 51 | after(function(){ 52 | proxy.close(); 53 | httpsServer.close(); 54 | httpServer.close(); 55 | }); 56 | 57 | let browser; 58 | let page; 59 | beforeEach(async function(){ 60 | debug('Start a new browser'); 61 | browser = await puppeteer.launch({ 62 | //dumpio: true, 63 | //headless: false, 64 | ignoreHTTPSErrors: true, 65 | args: [ '--proxy-server=http://localhost:' + proxyPort ] 66 | }); 67 | debug('Open a fresh page'); 68 | page = await browser.newPage(); 69 | }); 70 | 71 | afterEach(async function(){ 72 | await browser.close(); 73 | }); 74 | 75 | const testLogger = createLogger({ 76 | transports: [ 77 | new transports.Console({ 78 | level: 'error' 79 | }) 80 | ] 81 | }); 82 | 83 | it('one keyword one page', function(){ 84 | const bingScraper = new BingScraper({ 85 | config: { 86 | search_engine_name: 'bing', 87 | throw_on_detection: true, 88 | keywords: ['test keyword'], 89 | logger: testLogger, 90 | scrape_from_file: '', 91 | } 92 | }); 93 | bingScraper.STANDARD_TIMEOUT = 500; 94 | return bingScraper.run({page}).then(({results, metadata, num_requests}) => { 95 | assert.strictEqual(num_requests, 1, 'Must do one request'); 96 | assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed'); 97 | }); 98 | }); 99 | 100 | it('one keyword 3 pages', function () { 101 | const bingScraper = new BingScraper({ 102 | config: { 103 | search_engine_name: 'bing', 104 | throw_on_detection: true, 105 | keywords: ['test keyword'], 106 | logger: testLogger, 107 | scrape_from_file: '', 108 | num_pages: 3, 109 | } 110 | }); 111 | bingScraper.STANDARD_TIMEOUT = 500; 112 | return bingScraper.run({page}).then(({results, metadata, num_requests}) => { 113 | assert.strictEqual(num_requests, 3, 'Must three requests'); 114 | assert.strictEqual(results['test keyword']['1'].results.length, 6, 'Must have 6 organic results parsed on page 1'); 115 | assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1'); 116 | assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); 117 | assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keywords - TestLink', 'Title not matching on first organic result page 2'); 118 | assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); 119 | assert.strictEqual(results['test keyword']['3'].results[0].title, 'Keyword Driven Testing | TestComplete', 'Title not matching on first organic result page 3'); 120 | }); 121 | }); 122 | 123 | }); -------------------------------------------------------------------------------- /test/modules/duckduckgo.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const puppeteer = 
require('puppeteer'); 4 | const { createLogger, transports } = require('winston'); 5 | const http = require('http'); 6 | const https = require('https'); 7 | const assert = require('assert'); 8 | const path = require('path'); 9 | const keyCert = require('key-cert'); 10 | const Promise = require('bluebird'); 11 | const Proxy = require('http-mitm-proxy'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const { DuckduckgoScraper } = require('../../src/modules/duckduckgo'); 15 | 16 | const httpPort = 3012; 17 | const httpsPort = httpPort + 1; 18 | const proxyPort = httpPort + 2; 19 | 20 | const fakeSearchEngine = express(); 21 | fakeSearchEngine.use(express.urlencoded({ extended: true })) 22 | fakeSearchEngine.get('/', (req, res, next) => { 23 | if(!req.query.q){ 24 | return next(); 25 | } 26 | debug('q=%s page=%d', req.query.q, req.query.page); 27 | const pageNumber = req.query.page; 28 | res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.query.q + '_page' + pageNumber + '.html')); 29 | }); 30 | fakeSearchEngine.post('/html', (req, res) => { 31 | debug('body=%o', req.body); 32 | const pageNumber = 1; 33 | res.sendFile(path.join(__dirname, '../mocks/duckduckgo/' + req.body.q + '_page' + pageNumber + '.html')); 34 | }); 35 | fakeSearchEngine.use(express.static('test/mocks/duckduckgo', {extensions: ['html']})); 36 | 37 | describe('Module DuckDuckGo', function(){ 38 | 39 | let httpServer, httpsServer, proxy; 40 | before(async function(){ 41 | // Here mount our fake engine in both http and https listen server 42 | httpServer = http.createServer(fakeSearchEngine); 43 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 44 | 45 | proxy = Proxy(); 46 | proxy.onRequest((ctx, callback) => { 47 | ctx.proxyToServerRequestOptions.host = 'localhost'; 48 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 49 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 50 | debug('proxy askedHost=%s method=%s url=%s toPort=%s', 51 | ctx.clientToProxyRequest.headers.host, 52 | ctx.clientToProxyRequest.method, 53 | ctx.clientToProxyRequest.url, 54 | ctx.proxyToServerRequestOptions.port 55 | ); 56 | return callback(); 57 | }); 58 | 59 | await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); 60 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 61 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 62 | debug('Fake http search engine servers started'); 63 | }); 64 | 65 | after(function(){ 66 | proxy.close(); 67 | httpsServer.close(); 68 | httpServer.close(); 69 | }); 70 | 71 | let browser; 72 | let page; 73 | beforeEach(async function(){ 74 | debug('Start a new browser'); 75 | browser = await puppeteer.launch({ 76 | //dumpio: true, 77 | //headless: false, 78 | ignoreHTTPSErrors: true, 79 | args: [ '--proxy-server=http://localhost:' + proxyPort ] 80 | }); 81 | debug('Open a fresh page'); 82 | page = await browser.newPage(); 83 | }); 84 | 85 | afterEach(async function(){ 86 | await browser.close(); 87 | }); 88 | 89 | const testLogger = createLogger({ 90 | transports: [ 91 | new transports.Console({ 92 | level: 'error' 93 | }) 94 | ] 95 | }); 96 | 97 | it('one keyword one page', function(){ 98 | const duckduckgoScraper = new DuckduckgoScraper({ 99 | config: { 100 | search_engine_name: 'duckduckgo', 101 | throw_on_detection: true, 102 | keywords: ['test keyword'], 103 | logger: testLogger, 104 | scrape_from_file: '', 105 | } 106 | }); 107 | duckduckgoScraper.STANDARD_TIMEOUT = 1000; 108 | return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => { 109 | assert.strictEqual(num_requests, 1, 'Must do one request'); 110 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); 111 | }); 112 | }); 113 | 114 | it('one keyword 3 pages', function () { 115 | this.timeout(4000); 116 | const duckduckgoScraper = new DuckduckgoScraper({ 117 | config: { 118 | search_engine_name: 'google', 119 | throw_on_detection: true, 120 | keywords: ['test keyword'], 121 | logger: testLogger, 122 | scrape_from_file: '', 123 | num_pages: 3, 124 | } 125 | }); 126 | duckduckgoScraper.STANDARD_TIMEOUT = 1000; 127 | return duckduckgoScraper.run({page}).then(({results, metadata, num_requests}) => { 128 | assert.strictEqual(num_requests, 3, 'Must three requests'); 129 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); 130 | assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tests | TestComplete Documentation', 'Title not matching on first organic result page 1'); 131 | debug('results page 1 %O',results['test keyword']['1'].results); 132 | debug('results page 2 %O', results['test keyword']['2'].results); 133 | assert.strictEqual(results['test keyword']['2'].results.length, 19, 'Must have 19 organic results parsed on page 2'); 134 | assert.strictEqual(results['test keyword']['2'].results[0].title, 'Quest Diagnostics: Test Directory', 'Title not matching on first organic result page 1'); 135 | assert.strictEqual(results['test keyword']['3'].results.length, 48, 'Must have 48 organic results parsed on page 3'); 136 | assert.strictEqual(results['test keyword']['3'].results[0].title, 'Java Keywords Quiz - Sporcle', 'Title not matching on 
first organic result page 1'); 137 | }); 138 | }); 139 | 140 | }); -------------------------------------------------------------------------------- /test/modules/google.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const puppeteer = require('puppeteer'); 4 | const { createLogger, transports } = require('winston'); 5 | const http = require('http'); 6 | const https = require('https'); 7 | const assert = require('assert'); 8 | const path = require('path'); 9 | const keyCert = require('key-cert'); 10 | const Promise = require('bluebird'); 11 | const Proxy = require('http-mitm-proxy'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const { GoogleScraper } = require('../../src/modules/google'); 15 | 16 | const httpPort = 3012; 17 | const httpsPort = httpPort + 1; 18 | const proxyPort = httpPort + 2; 19 | 20 | const fakeSearchEngine = express(); 21 | fakeSearchEngine.get('/search', (req, res) => { 22 | debug('q=%s', req.query.q); 23 | const pageNumber = ((req.query.start/10) || 0) + 1; 24 | res.sendFile(path.join(__dirname, '../mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); 25 | }); 26 | fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); 27 | 28 | describe('Module Google', function(){ 29 | 30 | let httpServer, httpsServer, proxy; 31 | before(async function(){ 32 | // Here mount our fake engine in both http and https listen server 33 | httpServer = http.createServer(fakeSearchEngine); 34 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 35 | 36 | proxy = Proxy(); 37 | proxy.onRequest((ctx, callback) => { 38 | ctx.proxyToServerRequestOptions.host = 'localhost'; 39 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; 40 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 41 | debug('connection proxied askedHost=%s toPort=%s', ctx.clientToProxyRequest.headers.host, ctx.proxyToServerRequestOptions.port); 42 | return callback(); 43 | }); 44 | 45 | await Promise.promisify(proxy.listen, { context: proxy })({ port: proxyPort }); 46 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 47 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 48 | debug('Fake http search engine servers started'); 49 | }); 50 | 51 | after(function(){ 52 | proxy.close(); 53 | httpsServer.close(); 54 | httpServer.close(); 55 | }); 56 | 57 | let browser; 58 | let page; 59 | beforeEach(async function(){ 60 | debug('Start a new browser'); 61 | browser = await puppeteer.launch({ 62 | //dumpio: true, 63 | //headless: false, 64 | ignoreHTTPSErrors: true, 65 | args: [ '--proxy-server=http://localhost:' + proxyPort ] 66 | }); 67 | debug('Open a fresh page'); 68 | page = await browser.newPage(); 69 | }); 70 | 71 | afterEach(async function(){ 72 | await browser.close(); 73 | }); 74 | 75 | const testLogger = createLogger({ 76 | transports: [ 77 | new transports.Console({ 78 | level: 'error' 79 | }) 80 | ] 81 | }); 82 | 83 | it('one keyword one page', function(){ 84 | const googleScraper = new GoogleScraper({ 85 | config: { 86 | search_engine_name: 'google', 87 | throw_on_detection: true, 88 | keywords: ['test keyword'], 89 | logger: testLogger, 90 | scrape_from_file: '', 91 | } 92 | }); 93 | googleScraper.STANDARD_TIMEOUT = 500; 94 | return googleScraper.run({page}).then(({results, metadata, num_requests}) => { 95 | assert.strictEqual(num_requests, 1, 'Must do one request'); 96 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed'); 97 | }); 98 | }); 99 | 100 | it('one keyword 3 pages', function () { 101 | const googleScraper = new GoogleScraper({ 102 | config: { 103 | search_engine_name: 'google', 104 | throw_on_detection: true, 105 | keywords: ['test keyword'], 106 | logger: testLogger, 107 | scrape_from_file: '', 108 | num_pages: 3, 109 | } 110 | }); 111 | googleScraper.STANDARD_TIMEOUT = 500; 112 | return googleScraper.run({page}).then(({results, metadata, num_requests}) => { 113 | assert.strictEqual(num_requests, 3, 'Must three requests'); 114 | assert.strictEqual(results['test keyword']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); 115 | assert.strictEqual(results['test keyword']['1'].results[0].title, 'Keyword Tool (FREE) ᐈ #1 Google Keyword Planner Alternative', 'Title not matching on first organic result page 1'); 116 | assert.strictEqual(results['test keyword']['2'].results.length, 10, 'Must have 10 organic results parsed on page 2'); 117 | assert.strictEqual(results['test keyword']['2'].results[0].title, 'Keyword Research | The Beginner\'s Guide to SEO - Moz', 'Title not matching on first organic result page 1'); 118 | assert.strictEqual(results['test keyword']['3'].results.length, 10, 'Must have 10 organic results parsed on page 3'); 119 | assert.strictEqual(results['test keyword']['3'].results[0].title, 'The ACT Keyword Study Plan — NerdCoach', 'Title not matching on first organic result page 1'); 120 | }); 121 | }); 122 | 123 | }); -------------------------------------------------------------------------------- /test/proxy.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 
const express = require('express'); 3 | const { createLogger, transports } = require('winston'); 4 | const http = require('http'); 5 | const https = require('https'); 6 | const assert = require('assert'); 7 | const keyCert = require('key-cert'); 8 | const Promise = require('bluebird'); 9 | const Proxy = require('http-mitm-proxy'); 10 | 11 | const debug = require('debug')('se-scraper:test'); 12 | const se_scraper = require('../'); 13 | const Scraper = require('../src/modules/se_scraper'); 14 | 15 | const httpPort = 3012; 16 | const httpsPort = httpPort + 1; 17 | const proxyPort = httpPort + 2; 18 | 19 | const fakeSearchEngine = express(); 20 | fakeSearchEngine.set('trust proxy', 'loopback'); 21 | fakeSearchEngine.get('/test-proxy', (req, res) => { 22 | debug('fake-search-engine req.hostname=%s', req.hostname); 23 | //debug('req to', req.socket.localAddress, req.socket.localPort); 24 | res.send(req.hostname); 25 | }); 26 | 27 | describe('Config', function(){ 28 | 29 | let httpServer, httpsServer, proxy; 30 | before(async function(){ 31 | // Here mount our fake engine in both http and https listen server 32 | httpServer = http.createServer(fakeSearchEngine); 33 | httpsServer = https.createServer(await keyCert(), fakeSearchEngine); 34 | 35 | proxy = Proxy(); 36 | proxy.onRequest((ctx, callback) => { 37 | ctx.proxyToServerRequestOptions.host = 'localhost'; 38 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort; 39 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 40 | debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); 41 | return callback(); 42 | }); 43 | 44 | await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); 45 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 46 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 47 | debug('Fake http search engine servers started'); 48 | }); 49 | 50 | after(function(){ 51 | httpsServer.close(); 52 | httpServer.close(); 53 | proxy.close(); 54 | }); 55 | 56 | describe('proxies', function(){ 57 | 58 | class MockScraperTestProxy extends Scraper { 59 | 60 | async load_start_page(){ 61 | return true; 62 | } 63 | 64 | async search_keyword(){ 65 | await this.page.goto('http://test.local:' + httpPort + '/test-proxy'); 66 | } 67 | 68 | async parse_async(){ 69 | const bodyHandle = await this.page.$('body'); 70 | return await this.page.evaluate(body => body.innerHTML, bodyHandle); 71 | } 72 | } 73 | 74 | const testLogger = createLogger({ 75 | transports: [ 76 | new transports.Console({ 77 | level: 'error' 78 | }) 79 | ] 80 | }); 81 | 82 | /** 83 | * Jobs will be executed 2 by 2 through the proxy and direct connection 84 | * THIS TEST NEED TO HAVE test.local 127.0.0.1 in /etc/hosts because chrome bypass localhost even with proxy set 85 | */ 86 | it('one proxy given, use_proxies_only=false', async function () { 87 | 88 | const scrape_job = { 89 | search_engine: MockScraperTestProxy, 90 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 91 | }; 92 | 93 | var scraper = new se_scraper.ScrapeManager({ 94 | throw_on_detection: true, 95 | proxies: ['http://localhost:' + proxyPort], 96 | // default is use_proxies_only: false, 97 | logger: testLogger, 98 | }); 99 | await scraper.start(); 100 | 101 | const { results } = await scraper.scrape(scrape_job); 102 | assert.strictEqual(results['news']['1'], 'test.local'); 103 | assert.strictEqual(results['some stuff']['1'], 
'ProxiedThroughFakeEngine'); 104 | assert.strictEqual(results['i work too much']['1'], 'test.local'); 105 | assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine'); 106 | assert.strictEqual(results['javascript is hard']['1'], 'test.local'); 107 | 108 | await scraper.quit(); 109 | }); 110 | 111 | /** 112 | * Jobs will be executed 1 by 1 through the proxy 113 | */ 114 | it('one proxy given, use_proxies_only=true', async function () { 115 | 116 | const scrape_job = { 117 | search_engine: MockScraperTestProxy, 118 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 119 | }; 120 | 121 | var scraper = new se_scraper.ScrapeManager({ 122 | throw_on_detection: true, 123 | proxies: ['http://localhost:' + proxyPort], 124 | use_proxies_only: true, 125 | logger: testLogger, 126 | }); 127 | await scraper.start(); 128 | 129 | const { results } = await scraper.scrape(scrape_job); 130 | assert.strictEqual(results['news']['1'], 'ProxiedThroughFakeEngine'); 131 | assert.strictEqual(results['some stuff']['1'], 'ProxiedThroughFakeEngine'); 132 | assert.strictEqual(results['i work too much']['1'], 'ProxiedThroughFakeEngine'); 133 | assert.strictEqual(results['what to do?']['1'], 'ProxiedThroughFakeEngine'); 134 | assert.strictEqual(results['javascript is hard']['1'], 'ProxiedThroughFakeEngine'); 135 | 136 | await scraper.quit(); 137 | }); 138 | 139 | it('zero proxy given, use_proxies_only=true', async function () { 140 | 141 | const scrape_job = { 142 | search_engine: MockScraperTestProxy, 143 | keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'], 144 | }; 145 | 146 | await assert.rejects(async () => { 147 | var scraper = new se_scraper.ScrapeManager({ 148 | throw_on_detection: true, 149 | use_proxies_only: true, 150 | logger: testLogger, 151 | }); 152 | await scraper.start(); 153 | const { results } = await scraper.scrape(scrape_job); 154 | await scraper.quit(); 155 | }, /Must provide at least one proxy in proxies if you enable use_proxies_only/); 156 | 157 | }); 158 | 159 | }); 160 | 161 | }); -------------------------------------------------------------------------------- /test/user_agent.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const express = require('express'); 3 | const { createLogger, transports } = require('winston'); 4 | const http = require('http'); 5 | const https = require('https'); 6 | const assert = require('assert'); 7 | const keyCert = require('key-cert'); 8 | const Promise = require('bluebird'); 9 | const Proxy = require('http-mitm-proxy'); 10 | const UAParser = require('ua-parser-js'); 11 | const _ = require('lodash'); 12 | 13 | const debug = require('debug')('se-scraper:test'); 14 | const se_scraper = require('../'); 15 | const Scraper = require('../src/modules/se_scraper'); 16 | 17 | const httpPort = 3012; 18 | const httpsPort = httpPort + 1; 19 | const proxyPort = httpPort + 2; 20 | 21 | const fakeSearchEngine = express(); 22 | fakeSearchEngine.set('trust proxy', 'loopback'); 23 | fakeSearchEngine.get('/test-user_agent', (req, res) => { 24 | debug('fake-search-engine req.headers.user-agent=%s', req.headers['user-agent']); 25 | res.send(req.headers['user-agent']); 26 | }); 27 | 28 | describe('Config', function(){ 29 | 30 | let httpServer, httpsServer, proxy; 31 | before(async function(){ 32 | // Here mount our fake engine in both http and https listen server 33 | httpServer = http.createServer(fakeSearchEngine); 34 | httpsServer = 
https.createServer(await keyCert(), fakeSearchEngine); 35 | 36 | proxy = Proxy(); 37 | proxy.onRequest((ctx, callback) => { 38 | ctx.proxyToServerRequestOptions.host = 'localhost'; 39 | ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort; 40 | ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; 41 | debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); 42 | return callback(); 43 | }); 44 | 45 | await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); 46 | await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); 47 | await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); 48 | debug('Fake http search engine servers started'); 49 | }); 50 | 51 | after(function(){ 52 | httpsServer.close(); 53 | httpServer.close(); 54 | proxy.close(); 55 | }); 56 | 57 | describe('user_agent', function(){ 58 | 59 | class MockScraperTestUserAgent extends Scraper { 60 | 61 | async load_start_page(){ 62 | return true; 63 | } 64 | 65 | async search_keyword(){ 66 | await this.page.goto('http://localhost:' + httpPort + '/test-user_agent'); 67 | } 68 | 69 | async parse_async(){ 70 | const bodyHandle = await this.page.$('body'); 71 | return await this.page.evaluate(body => body.innerHTML, bodyHandle); 72 | } 73 | } 74 | 75 | const testLogger = createLogger({ 76 | transports: [ 77 | new transports.Console({ 78 | level: 'error' 79 | }) 80 | ] 81 | }); 82 | 83 | /** 84 | * Test user_agent option 85 | */ 86 | it('fixed user_agent', async function () { 87 | 88 | const scrape_job = { 89 | search_engine: MockScraperTestUserAgent, 90 | keywords: ['javascript is hard'], 91 | }; 92 | 93 | var scraper = new se_scraper.ScrapeManager({ 94 | throw_on_detection: true, 95 | logger: testLogger, 96 | user_agent: 'THIS IS A USERAGENT 42.0' 97 | }); 98 | await scraper.start(); 99 | 100 | const { results } = await scraper.scrape(scrape_job); 101 | assert.strictEqual(results['javascript is hard']['1'], 'THIS IS A USERAGENT 42.0'); 102 | 103 | await scraper.quit(); 104 | }); 105 | 106 | /** 107 | * Test random_user_agent option 108 | * TODO generated user_agent should be different for each keyword 109 | * TODO this test will sometimes fail because user_agent not very random :-( 110 | */ 111 | it('random_user_agent', async function () { 112 | 113 | const scrape_job = { 114 | search_engine: MockScraperTestUserAgent, 115 | keywords: ['news'], 116 | }; 117 | 118 | const NUMBER_OF_EXEC = 10; 119 | 120 | const uaList = await Promise.map(_.range(NUMBER_OF_EXEC), async (i) => { 121 | const scraper = new se_scraper.ScrapeManager({ 122 | throw_on_detection: true, 123 | logger: testLogger, 124 | random_user_agent: true, 125 | }); 126 | await scraper.start(); 127 | const { results: { news } } = await scraper.scrape(scrape_job); 128 | await scraper.quit(); 129 | return news['1']; 130 | }); 131 | 132 | uaList.forEach((userAgent) => { 133 | const uaParsed = UAParser(userAgent); 134 | assert(uaParsed.browser.name, 'UserAgent should have a browser name detected'); 135 | assert(uaParsed.os.name, 'UserAgent should have a os name detected'); 136 | }); 137 | 138 | assert( _.chain(uaList).countBy().toPairs().sortBy(e => e[1]).last().value()[1] < (NUMBER_OF_EXEC * 0.4), 'Each user agent should appear less than 40% of the time' ); 139 | 140 | }); 141 | 142 | }); 143 | 144 | }); --------------------------------------------------------------------------------
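// A minimal end-to-end sketch of the public API, assembled from the calls the
// test files above exercise (ScrapeManager, start(), scrape(), quit()). The
// option values are illustrative only; see the defaults in src/node_scraper.js
// for the full set of configuration keys.
'use strict';
const se_scraper = require('se-scraper');

(async () => {
    const scraper = new se_scraper.ScrapeManager({
        search_engine: 'google',
        num_pages: 1,
        headless: true,
        output_file: 'results.json', // optional: also write the results to disk as JSON
    });

    await scraper.start();

    // scrape() requires either `keywords` or `keyword_file`
    const { results, metadata } = await scraper.scrape({
        keywords: ['test keyword'],
    });
    console.log(JSON.stringify(results, null, 2));
    console.log('requests made:', metadata.num_requests);

    await scraper.quit();
})();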