├── .gitignore ├── CHANGELOG.md ├── README.md ├── example ├── .dockerignore ├── Dockerfile ├── README.md ├── docker-compose.yaml ├── example │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── book.py │ │ └── movie.py ├── requirements.txt ├── run.py └── scrapy.cfg ├── gerapy_selenium ├── __init__.py ├── __version__.py ├── downloadermiddlewares.py ├── pretend.py ├── request.py └── settings.py ├── requirements.txt ├── setup.py └── tests └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | .idea/ 7 | 8 | # User-specific stuff 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/**/usage.statistics.xml 12 | .idea/**/dictionaries 13 | .idea/**/shelf 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 35 | # .idea/artifacts 36 | # .idea/compiler.xml 37 | # .idea/jarRepositories.xml 38 | # .idea/modules.xml 39 | # .idea/*.iml 40 | # .idea/modules 41 | # *.iml 42 | # *.ipr 43 | 44 | # CMake 45 | cmake-build-*/ 46 | 47 | # Mongo Explorer plugin 48 | .idea/**/mongoSettings.xml 49 | 50 | # File-based project format 51 | *.iws 52 | 53 | # IntelliJ 54 | out/ 55 | 56 | # mpeltonen/sbt-idea plugin 57 | .idea_modules/ 58 | 59 | # JIRA plugin 60 | atlassian-ide-plugin.xml 61 | 62 | # Cursive Clojure plugin 63 | .idea/replstate.xml 64 | 65 | # Crashlytics plugin (for Android Studio and IntelliJ) 66 | com_crashlytics_export_strings.xml 67 | crashlytics.properties 68 | crashlytics-build.properties 69 | fabric.properties 70 | 71 | # Editor-based Rest Client 72 | .idea/httpRequests 73 | 74 | # Android studio 3.1+ serialized cache file 75 | .idea/caches/build_file_checksums.ser 76 | 77 | ### Python template 78 | # Byte-compiled / optimized / DLL files 79 | __pycache__/ 80 | *.py[cod] 81 | *$py.class 82 | 83 | # C extensions 84 | *.so 85 | 86 | # Distribution / packaging 87 | .Python 88 | build/ 89 | develop-eggs/ 90 | dist/ 91 | downloads/ 92 | eggs/ 93 | .eggs/ 94 | lib/ 95 | lib64/ 96 | parts/ 97 | sdist/ 98 | var/ 99 | wheels/ 100 | pip-wheel-metadata/ 101 | share/python-wheels/ 102 | *.egg-info/ 103 | .installed.cfg 104 | *.egg 105 | MANIFEST 106 | 107 | # PyInstaller 108 | # Usually these files are written by a python script from a template 109 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
110 | *.manifest
111 | *.spec
112 | 
113 | # Installer logs
114 | pip-log.txt
115 | pip-delete-this-directory.txt
116 | 
117 | # Unit test / coverage reports
118 | htmlcov/
119 | .tox/
120 | .nox/
121 | .coverage
122 | .coverage.*
123 | .cache
124 | nosetests.xml
125 | coverage.xml
126 | *.cover
127 | *.py,cover
128 | .hypothesis/
129 | .pytest_cache/
130 | cover/
131 | 
132 | # Translations
133 | *.mo
134 | *.pot
135 | 
136 | # Django stuff:
137 | *.log
138 | local_settings.py
139 | db.sqlite3
140 | db.sqlite3-journal
141 | 
142 | # Flask stuff:
143 | instance/
144 | .webassets-cache
145 | 
146 | # Scrapy stuff:
147 | .scrapy
148 | 
149 | # Sphinx documentation
150 | docs/_build/
151 | 
152 | # PyBuilder
153 | .pybuilder/
154 | target/
155 | 
156 | # Jupyter Notebook
157 | .ipynb_checkpoints
158 | 
159 | # IPython
160 | profile_default/
161 | ipython_config.py
162 | 
163 | # pyenv
164 | # For a library or package, you might want to ignore these files since the code is
165 | # intended to run in multiple environments; otherwise, check them in:
166 | # .python-version
167 | 
168 | # pipenv
169 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
170 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
171 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
172 | # install all needed dependencies.
173 | #Pipfile.lock
174 | 
175 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
176 | __pypackages__/
177 | 
178 | # Celery stuff
179 | celerybeat-schedule
180 | celerybeat.pid
181 | 
182 | # SageMath parsed files
183 | *.sage.py
184 | 
185 | # Environments
186 | .env
187 | .venv
188 | env/
189 | venv/
190 | ENV/
191 | env.bak/
192 | venv.bak/
193 | 
194 | # Spyder project settings
195 | .spyderproject
196 | .spyproject
197 | 
198 | # Rope project settings
199 | .ropeproject
200 | 
201 | # mkdocs documentation
202 | /site
203 | 
204 | # mypy
205 | .mypy_cache/
206 | .dmypy.json
207 | dmypy.json
208 | 
209 | # Pyre type checker
210 | .pyre/
211 | 
212 | # pytype static type analyzer
213 | .pytype/
214 | 
215 | # Cython debug symbols
216 | cython_debug/
217 | 
218 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Gerapy Selenium Changelog
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Gerapy Selenium
2 | 
3 | This package adds Selenium support to Scrapy; it is also a
4 | module of [Gerapy](https://github.com/Gerapy/Gerapy).
5 | 
6 | ## Installation
7 | 
8 | ```shell script
9 | pip3 install gerapy-selenium
10 | ```
11 | 
12 | ## Usage
13 | 
14 | You can use `SeleniumRequest` to specify a request that should be rendered with Selenium.
15 | 
16 | For example:
17 | 
18 | ```python
19 | yield SeleniumRequest(detail_url, callback=self.parse_detail)
20 | ```
21 | 
22 | You also need to enable `SeleniumMiddleware` in `DOWNLOADER_MIDDLEWARES`:
23 | 
24 | ```python
25 | DOWNLOADER_MIDDLEWARES = {
26 |     'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware': 543,
27 | }
28 | ```
29 | 
30 | Congratulations, you've finished all of the required configuration.
31 | 
32 | If you run the Spider again, Selenium will be started to render every
33 | web page for which you configured a SeleniumRequest.
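
Putting the two pieces together, a minimal spider might look like the sketch below (the start URL and selectors are placeholders, not part of this package):

```python
import scrapy
from gerapy_selenium import SeleniumRequest


class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        # placeholder URL for a JavaScript-rendered listing page
        yield SeleniumRequest('https://example.com/list', callback=self.parse_index)

    def parse_index(self, response):
        # response contains the HTML rendered by Selenium
        for href in response.css('a::attr(href)').extract():
            yield SeleniumRequest(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        yield {'title': response.css('title::text').extract_first()}
```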
34 | 
35 | ## Settings
36 | 
37 | GerapySelenium provides some optional settings.
38 | 
39 | ### Concurrency
40 | 
41 | You can use Scrapy's own setting to control the concurrency of Selenium,
42 | for example:
43 | 
44 | ```python
45 | CONCURRENT_REQUESTS = 3
46 | ```
47 | 
48 | ### Pretend as Real Browser
49 | 
50 | Some websites detect WebDriver or headless mode; GerapySelenium can make
51 | Chromium pretend to be a normal browser by injecting scripts. This is enabled by default.
52 | 
53 | You can disable it to speed things up if the website does not detect WebDriver:
54 | 
55 | ```python
56 | GERAPY_SELENIUM_PRETEND = False
57 | ```
58 | 
59 | You can also use the `pretend` attribute of `SeleniumRequest` to override this
60 | configuration.
61 | 
62 | ### Logging Level
63 | 
64 | By default, Selenium logs all debug messages, so GerapySelenium
65 | sets Selenium's logging level to WARNING.
66 | 
67 | If you want to see more logs from Selenium, you can change this setting:
68 | 
69 | ```python
70 | import logging
71 | GERAPY_SELENIUM_LOGGING_LEVEL = logging.DEBUG
72 | ```
73 | 
74 | ### Download Timeout
75 | 
76 | Selenium may take some time to render the required web page. You can change this setting; the default is `30` seconds:
77 | 
78 | ```python
79 | # selenium timeout
80 | GERAPY_SELENIUM_DOWNLOAD_TIMEOUT = 30
81 | ```
82 | 
83 | ### Headless
84 | 
85 | By default, Selenium runs in headless mode. You can change this to `False`
86 | as needed; the default is `True`:
87 | 
88 | ```python
89 | GERAPY_SELENIUM_HEADLESS = False
90 | ```
91 | 
92 | ### Window Size
93 | 
94 | You can also set the width and height of the Selenium window:
95 | 
96 | ```python
97 | GERAPY_SELENIUM_WINDOW_WIDTH = 1400
98 | GERAPY_SELENIUM_WINDOW_HEIGHT = 700
99 | ```
100 | 
101 | The defaults are 1400 and 700.
102 | 
103 | ## SeleniumRequest
104 | 
105 | `SeleniumRequest` provides arguments which can override the global settings above.
106 | 
107 | * url: request url
108 | * callback: callback
109 | * wait_for: CSS selector of an element to wait for before the page is returned
110 | * script: JavaScript to execute after the page is loaded
111 | * proxy: proxy to use for this request, e.g. `http://x.x.x.x:x`
112 | * sleep: time to sleep after the page is loaded, overrides `GERAPY_SELENIUM_SLEEP`
113 | * timeout: load timeout, overrides `GERAPY_SELENIUM_DOWNLOAD_TIMEOUT`
114 | * pretend: pretend as a normal browser, overrides `GERAPY_SELENIUM_PRETEND`
115 | * screenshot: screenshot configuration, e.g. `{'selector': '.item'}` to capture a
116 |   specific element (an `xpath` key is also supported), overrides
117 |   `GERAPY_SELENIUM_SCREENSHOT`
118 | 
119 | For example, you can configure SeleniumRequest as:
120 | 
121 | ```python
122 | from gerapy_selenium import SeleniumRequest
123 | 
124 | def parse(self, response):
125 |     yield SeleniumRequest(url,
126 |         callback=self.parse_detail,
127 |         wait_for='title',
128 |         script='console.log(document)',
129 |         sleep=2)
130 | ```
131 | 
132 | Then Selenium will:
133 | * wait for title to load
134 | * execute the `console.log(document)` script
135 | * sleep for 2s
136 | * return the rendered web page content
137 | 
138 | ## Example
139 | 
140 | For more details, please see [example](./example).
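
The example's `book` spider also requests element screenshots. When `screenshot` is enabled, the middleware attaches the captured PNG to `response.meta['screenshot']` as a `BytesIO`, so a callback can persist it roughly like the sketch below (the output file name is only illustrative):

```python
def parse_detail(self, response):
    screenshot = response.meta.get('screenshot')
    if screenshot:
        # BytesIO holding the PNG captured by SeleniumMiddleware
        with open('detail.png', 'wb') as f:
            f.write(screenshot.getvalue())
    yield {'name': response.css('.name::text').extract_first()}
```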
141 | 142 | Also you can directly run with Docker: 143 | 144 | ``` 145 | docker run germey/gerapy-selenium-example 146 | ``` 147 | 148 | Outputs: 149 | 150 | ```shell script 151 | 2020-07-13 01:49:13 [scrapy.utils.log] INFO: Scrapy 2.2.0 started (bot: example) 152 | 2020-07-13 01:49:13 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.7 (default, May 6 2020, 04:59:01) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d 10 Sep 2019), cryptography 2.8, Platform Darwin-19.4.0-x86_64-i386-64bit 153 | 2020-07-13 01:49:13 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor 154 | 2020-07-13 01:49:13 [scrapy.crawler] INFO: Overridden settings: 155 | {'BOT_NAME': 'example', 156 | 'CONCURRENT_REQUESTS': 3, 157 | 'NEWSPIDER_MODULE': 'example.spiders', 158 | 'RETRY_HTTP_CODES': [403, 500, 502, 503, 504], 159 | 'SPIDER_MODULES': ['example.spiders']} 160 | 2020-07-13 01:49:13 [scrapy.extensions.telnet] INFO: Telnet Password: 83c276fb41754bd0 161 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled extensions: 162 | ['scrapy.extensions.corestats.CoreStats', 163 | 'scrapy.extensions.telnet.TelnetConsole', 164 | 'scrapy.extensions.memusage.MemoryUsage', 165 | 'scrapy.extensions.logstats.LogStats'] 166 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled downloader middlewares: 167 | ['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 168 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 169 | 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 170 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 171 | 'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware', 172 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware', 173 | 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 174 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 175 | 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 176 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 177 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', 178 | 'scrapy.downloadermiddlewares.stats.DownloaderStats'] 179 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled spider middlewares: 180 | ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 181 | 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 182 | 'scrapy.spidermiddlewares.referer.RefererMiddleware', 183 | 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 184 | 'scrapy.spidermiddlewares.depth.DepthMiddleware'] 185 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled item pipelines: 186 | [] 187 | 2020-07-13 01:49:13 [scrapy.core.engine] INFO: Spider opened 188 | 2020-07-13 01:49:13 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 189 | 2020-07-13 01:49:13 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023 190 | 2020-07-13 01:49:13 [example.spiders.book] INFO: crawling https://dynamic5.scrape.center/page/1 191 | 2020-07-13 01:49:13 [gerapy.selenium] DEBUG: processing request 192 | 2020-07-13 01:49:13 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 193 | 2020-07-13 01:49:14 [gerapy.selenium] DEBUG: crawling 
https://dynamic5.scrape.center/page/1 194 | 2020-07-13 01:49:19 [gerapy.selenium] DEBUG: waiting for .item .name finished 195 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: wait for .item .name finished 196 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: close selenium 197 | 2020-07-13 01:49:20 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None) 198 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: processing request 199 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: processing request 200 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: processing request 201 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 202 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 203 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 204 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/26855315 205 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/26861389 206 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/26898909 207 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: waiting for .item .name finished 208 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: wait for .item .name finished 209 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: close selenium 210 | 2020-07-13 01:49:24 [scrapy.core.engine] DEBUG: Crawled (200) (referer: https://dynamic5.scrape.center/page/1) 211 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: processing request 212 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 213 | 2020-07-13 01:49:25 [scrapy.core.scraper] DEBUG: Scraped from <200 https://dynamic5.scrape.center/detail/26861389> 214 | {'name': '壁穴ヘブンホール', 215 | 'score': '5.6', 216 | 'tags': ['BL漫画', '小基漫', 'BL', '『又腐又基』', 'BLコミック']} 217 | 2020-07-13 01:49:25 [gerapy.selenium] DEBUG: waiting for .item .name finished 218 | 2020-07-13 01:49:25 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/page/2 219 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: wait for .item .name finished 220 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: close selenium 221 | 2020-07-13 01:49:26 [scrapy.core.engine] DEBUG: Crawled (200) (referer: https://dynamic5.scrape.center/page/1) 222 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: processing request 223 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 224 | 2020-07-13 01:49:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://dynamic5.scrape.center/detail/26855315> 225 | 
{'name': '冒险小虎队', 'score': '9.4', 'tags': ['冒险小虎队', '童年', '冒险', '推理', '小时候读的']} 226 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: waiting for .item .name finished 227 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/27047626 228 | 2020-07-13 01:49:27 [gerapy.selenium] DEBUG: wait for .item .name finished 229 | 2020-07-13 01:49:27 [gerapy.selenium] DEBUG: close selenium 230 | ... 231 | ``` 232 | -------------------------------------------------------------------------------- /example/.dockerignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
101 | __pypackages__/
102 | 
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 | 
107 | # SageMath parsed files
108 | *.sage.py
109 | 
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 | 
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 | 
123 | # Rope project settings
124 | .ropeproject
125 | 
126 | # mkdocs documentation
127 | /site
128 | 
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 | 
134 | # Pyre type checker
135 | .pyre/
136 | 
137 | # pytype static type analyzer
138 | .pytype/
139 | 
140 | # Cython debug symbols
141 | cython_debug/
142 | 
143 | 
--------------------------------------------------------------------------------
/example/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM selenium/standalone-chrome:85.0-chromedriver-85.0-20200907
2 | USER root
3 | 
4 | RUN apt-get update && \
5 |     apt-get install -y python3-distutils && \
6 |     curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
7 |     python3 get-pip.py
8 | 
9 | WORKDIR /code
10 | COPY requirements.txt .
11 | RUN python3 -m pip install -r requirements.txt
12 | COPY . .
13 | CMD python3 run.py
14 | 
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Gerapy Selenium Example
2 | 
3 | ## Run
4 | 
5 | There are two ways to run this example:
6 | 
7 | ### Run with Python
8 | 
9 | ```shell script
10 | pip3 install -r requirements.txt
11 | # make sure Chrome and a matching chromedriver are available locally
12 | python3 run.py
13 | ```
14 | 
15 | ### Run with Docker
16 | 
17 | ```shell script
18 | docker run germey/gerapy-selenium-example
19 | ```
20 | 
--------------------------------------------------------------------------------
/example/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 |   gerapy-selenium-example:
4 |     container_name: 'gerapy-selenium-example'
5 |     restart: always
6 |     build: .
7 | image: 'germey/gerapy-selenium-example' 8 | command: 'python3 run.py' -------------------------------------------------------------------------------- /example/example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapySelenium/b8c8685d389c0d77093d6c1c0918fc382eacc879/example/example/__init__.py -------------------------------------------------------------------------------- /example/example/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class BookItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = Field() 15 | tags = Field() 16 | score = Field() 17 | 18 | 19 | class MovieItem(Item): 20 | # define the fields for your item here like: 21 | # name = scrapy.Field() 22 | name = Field() 23 | categories = Field() 24 | score = Field() 25 | -------------------------------------------------------------------------------- /example/example/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ExampleSpiderMiddleware: 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ExampleDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 
63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /example/example/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ExamplePipeline: 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /example/example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for example project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'example' 13 | 14 | SPIDER_MODULES = ['example.spiders'] 15 | NEWSPIDER_MODULE = 'example.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'example (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | CONCURRENT_REQUESTS = 3 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | # DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | # } 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'example.middlewares.ExampleSpiderMiddleware': 543, 50 | # } 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 54 | DOWNLOADER_MIDDLEWARES = { 55 | 'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware': 543, 56 | } 57 | 58 | # Enable or disable extensions 59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 60 | # EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | # } 63 | 64 | # Configure item pipelines 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 66 | # ITEM_PIPELINES = { 67 | # 'example.pipelines.ExamplePipeline': 300, 68 | # } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 72 | # AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | # AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | # AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | # AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | # HTTPCACHE_ENABLED = True 86 | # HTTPCACHE_EXPIRATION_SECS = 0 87 | # HTTPCACHE_DIR = 'httpcache' 88 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | 91 | RETRY_HTTP_CODES = [403, 500, 502, 503, 504] 92 | 93 | GERAPY_SELENIUM_HEADLESS = True 94 | 95 | LOG_LEVEL = 'DEBUG' 96 | 97 | GERAPY_SELENIUM_PRETEND = True 98 | 99 | # GERAPY_SELENIUM_SCREENSHOT = { 100 | # 
'selector': '.item' 101 | # } 102 | 103 | GERAPY_SELENIUM_DOWNLOAD_TIMEOUT = 10 104 | -------------------------------------------------------------------------------- /example/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /example/example/spiders/book.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | from example.items import BookItem 5 | from gerapy_selenium import SeleniumRequest 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class BookSpider(scrapy.Spider): 12 | name = 'book' 13 | allowed_domains = ['dynamic5.scrape.center'] 14 | base_url = 'https://dynamic5.scrape.center' 15 | 16 | def start_requests(self): 17 | """ 18 | first page 19 | :return: 20 | """ 21 | start_url = f'{self.base_url}/page/1' 22 | logger.info('crawling %s', start_url) 23 | yield SeleniumRequest(start_url, callback=self.parse_index, wait_for='.item .name') 24 | 25 | def parse_index(self, response): 26 | """ 27 | extract books and get next page 28 | :param response: 29 | :return: 30 | """ 31 | items = response.css('.item') 32 | for item in items: 33 | href = item.css('.top a::attr(href)').extract_first() 34 | detail_url = response.urljoin(href) 35 | yield SeleniumRequest(detail_url, callback=self.parse_detail, wait_for='.item .name', priority=2, 36 | screenshot={'selector': '.item'}) 37 | 38 | # next page 39 | match = re.search(r'page/(\d+)', response.url) 40 | if not match: return 41 | page = int(match.group(1)) + 1 42 | next_url = f'{self.base_url}/page/{page}' 43 | yield SeleniumRequest(next_url, callback=self.parse_index, wait_for='.item .name', ) 44 | 45 | def parse_detail(self, response): 46 | """ 47 | process detail info of book 48 | :param response: 49 | :return: 50 | """ 51 | name = response.css('.name::text').extract_first() 52 | tags = response.css('.tags button span::text').extract() 53 | score = response.css('.score::text').extract_first() 54 | tags = [tag.strip() for tag in tags] if tags else [] 55 | score = score.strip() if score else None 56 | item = BookItem(name=name, tags=tags, score=score) 57 | yield item 58 | -------------------------------------------------------------------------------- /example/example/spiders/movie.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from gerapy_selenium import SeleniumRequest 4 | from scrapy import Request, signals 5 | from example.items import MovieItem 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class MovieSpider(scrapy.Spider): 12 | name = 'movie' 13 | allowed_domains = ['antispider1.scrape.center'] 14 | base_url = 'https://antispider1.scrape.center' 15 | max_page = 10 16 | 17 | def start_requests(self): 18 | """ 19 | first page 20 | :return: 21 | """ 22 | for page in range(1, self.max_page + 1): 23 | url = f'{self.base_url}/page/{page}' 24 | logger.debug('start url %s', url) 25 | cookies = { 26 | 'name': 'germey' 27 | } 28 | yield SeleniumRequest(url, callback=self.parse_index, priority=10, wait_for='.item', pretend=True, cookies=cookies) 29 | 30 | def parse_index(self, response): 31 | """ 32 
| extract movies 33 | :param response: 34 | :return: 35 | """ 36 | items = response.css('.item') 37 | for item in items: 38 | href = item.css('a::attr(href)').extract_first() 39 | detail_url = response.urljoin(href) 40 | logger.info('detail url %s', detail_url) 41 | yield SeleniumRequest(detail_url, callback=self.parse_detail, wait_for='.item') 42 | 43 | def parse_detail(self, response): 44 | """ 45 | process detail info of book 46 | :param response: 47 | :return: 48 | """ 49 | name = response.css('h2::text').extract_first() 50 | categories = response.css('.categories button span::text').extract() 51 | score = response.css('.score::text').extract_first() 52 | categories = [category.strip() for category in categories] if categories else [] 53 | score = score.strip() if score else None 54 | yield MovieItem(name=name, categories=categories, score=score) 55 | -------------------------------------------------------------------------------- /example/requirements.txt: -------------------------------------------------------------------------------- 1 | gerapy-selenium -------------------------------------------------------------------------------- /example/run.py: -------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | 3 | execute('scrapy crawl book'.split()) 4 | -------------------------------------------------------------------------------- /example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /gerapy_selenium/__init__.py: -------------------------------------------------------------------------------- 1 | from .downloadermiddlewares import SeleniumMiddleware 2 | from .request import SeleniumRequest 3 | -------------------------------------------------------------------------------- /gerapy_selenium/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 0, '3') 2 | 3 | version = __version__ = '.'.join(map(str, VERSION)) -------------------------------------------------------------------------------- /gerapy_selenium/downloadermiddlewares.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import BytesIO 3 | from scrapy.http import HtmlResponse 4 | from scrapy.utils.python import global_object_name 5 | from selenium.common.exceptions import TimeoutException 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.support.wait import WebDriverWait 8 | from gerapy_selenium.pretend import SCRIPTS as PRETEND_SCRIPTS 9 | from gerapy_selenium.settings import * 10 | import urllib.parse 11 | from selenium import webdriver 12 | from selenium.webdriver import ChromeOptions 13 | from selenium.webdriver.support import expected_conditions as EC 14 | from twisted.internet.threads import deferToThread 15 | 16 | logger = logging.getLogger('gerapy.selenium') 17 | 18 | 19 | class SeleniumMiddleware(object): 20 | """ 21 | Downloader middleware handling the requests with Selenium 22 | """ 23 | 24 | def _retry(self, request, reason, spider): 25 | """ 26 | get retry request 27 | :param 
request: 28 | :param reason: 29 | :param spider: 30 | :return: 31 | """ 32 | if not self.retry_enabled: 33 | return 34 | 35 | retries = request.meta.get('retry_times', 0) + 1 36 | retry_times = self.max_retry_times 37 | 38 | if 'max_retry_times' in request.meta: 39 | retry_times = request.meta['max_retry_times'] 40 | 41 | stats = spider.crawler.stats 42 | if retries <= retry_times: 43 | logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s", 44 | {'request': request, 'retries': retries, 'reason': reason}, 45 | extra={'spider': spider}) 46 | retryreq = request.copy() 47 | retryreq.meta['retry_times'] = retries 48 | retryreq.dont_filter = True 49 | retryreq.priority = request.priority + self.priority_adjust 50 | 51 | if isinstance(reason, Exception): 52 | reason = global_object_name(reason.__class__) 53 | 54 | stats.inc_value('retry/count') 55 | stats.inc_value('retry/reason_count/%s' % reason) 56 | return retryreq 57 | else: 58 | stats.inc_value('retry/max_reached') 59 | logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s", 60 | {'request': request, 'retries': retries, 'reason': reason}, 61 | extra={'spider': spider}) 62 | 63 | @classmethod 64 | def from_crawler(cls, crawler): 65 | """ 66 | init the middleware 67 | :param crawler: 68 | :return: 69 | """ 70 | settings = crawler.settings 71 | logging_level = settings.get('GERAPY_SELENIUM_LOGGING_LEVEL', GERAPY_SELENIUM_LOGGING_LEVEL) 72 | logging.getLogger('selenium.webdriver.remote.remote_connection').setLevel(logging_level) 73 | logging.getLogger('urllib3.connectionpool').setLevel(logging_level) 74 | 75 | # init settings 76 | cls.window_width = settings.get('GERAPY_SELENIUM_WINDOW_WIDTH', GERAPY_SELENIUM_WINDOW_WIDTH) 77 | cls.window_height = settings.get('GERAPY_SELENIUM_WINDOW_HEIGHT', GERAPY_SELENIUM_WINDOW_HEIGHT) 78 | cls.headless = settings.get('GERAPY_SELENIUM_HEADLESS', GERAPY_SELENIUM_HEADLESS) 79 | cls.ignore_https_errors = settings.get('GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS', 80 | GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS) 81 | cls.executable_path = settings.get('GERAPY_SELENIUM_EXECUTABLE_PATH', GERAPY_SELENIUM_EXECUTABLE_PATH) 82 | cls.disable_extensions = settings.get('GERAPY_SELENIUM_DISABLE_EXTENSIONS', 83 | GERAPY_SELENIUM_DISABLE_EXTENSIONS) 84 | cls.hide_scrollbars = settings.get('GERAPY_SELENIUM_HIDE_SCROLLBARS', GERAPY_SELENIUM_HIDE_SCROLLBARS) 85 | cls.mute_audio = settings.get('GERAPY_SELENIUM_MUTE_AUDIO', GERAPY_SELENIUM_MUTE_AUDIO) 86 | cls.no_sandbox = settings.get('GERAPY_SELENIUM_NO_SANDBOX', GERAPY_SELENIUM_NO_SANDBOX) 87 | cls.disable_setuid_sandbox = settings.get('GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX', 88 | GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX) 89 | cls.disable_gpu = settings.get('GERAPY_SELENIUM_DISABLE_GPU', GERAPY_SELENIUM_DISABLE_GPU) 90 | cls.download_timeout = settings.get('GERAPY_SELENIUM_DOWNLOAD_TIMEOUT', 91 | settings.get('DOWNLOAD_TIMEOUT', GERAPY_SELENIUM_DOWNLOAD_TIMEOUT)) 92 | 93 | cls.screenshot = settings.get('GERAPY_SELENIUM_SCREENSHOT', GERAPY_SELENIUM_SCREENSHOT) 94 | cls.pretend = settings.get('GERAPY_SELENIUM_PRETEND', GERAPY_SELENIUM_PRETEND) 95 | cls.sleep = settings.get('GERAPY_SELENIUM_SLEEP', GERAPY_SELENIUM_SLEEP) 96 | cls.retry_enabled = settings.getbool('RETRY_ENABLED') 97 | cls.max_retry_times = settings.getint('RETRY_TIMES') 98 | cls.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) 99 | cls.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') 100 | 101 | return cls() 102 | 103 | def 
_process_request(self, request, spider):
104 |         """
105 |         use selenium to process the request
106 |         :param request:
107 |         :param spider:
108 |         :return:
109 |         """
110 |         kwargs = {}
111 |         options = ChromeOptions()
112 |         kwargs['options'] = options
113 |         if self.headless:
114 |             options.add_argument('--headless')
115 |         if self.pretend:
116 |             options.add_experimental_option('excludeSwitches', ['enable-automation'])
117 |             options.add_experimental_option('useAutomationExtension', False)
118 |         if self.executable_path:
119 |             kwargs['executable_path'] = self.executable_path
120 |         if self.window_width and self.window_height:
121 |             options.add_argument(f'--window-size={self.window_width},{self.window_height}')
122 |         if self.disable_gpu:
123 |             options.add_argument('--disable-gpu')
124 |         if self.hide_scrollbars:
125 |             options.add_argument('--hide-scrollbars')
126 |         if self.ignore_https_errors:
127 |             options.add_argument('--ignore-certificate-errors')
128 |         if self.disable_extensions:
129 |             options.add_argument('--disable-extensions')
130 |         if self.mute_audio:
131 |             options.add_argument('--mute-audio')
132 |         if self.no_sandbox:
133 |             options.add_argument('--no-sandbox')
134 |         if self.disable_setuid_sandbox:
135 |             options.add_argument('--disable-setuid-sandbox')
136 | 
137 |         # get selenium meta
138 |         selenium_meta = request.meta.get('selenium') or {}
139 |         logger.debug('selenium_meta %s', selenium_meta)
140 | 
141 |         # set proxy
142 |         _proxy = request.meta.get('proxy')
143 |         if selenium_meta.get('proxy') is not None:
144 |             _proxy = selenium_meta.get('proxy')
145 |         if _proxy:
146 |             options.add_argument('--proxy-server=' + _proxy)
147 | 
148 |         browser = webdriver.Chrome(**kwargs)
149 |         browser.set_window_size(self.window_width, self.window_height)
150 | 
151 |         # pretend as normal browser
152 |         _pretend = self.pretend
153 |         if selenium_meta.get('pretend') is not None:
154 |             _pretend = selenium_meta.get('pretend')
155 |         if _pretend:
156 |             for script in PRETEND_SCRIPTS:
157 |                 browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
158 |                     'source': script
159 |                 })
160 | 
161 |         _timeout = self.download_timeout
162 |         if selenium_meta.get('timeout') is not None:
163 |             _timeout = selenium_meta.get('timeout')
164 |         browser.set_page_load_timeout(_timeout)
165 | 
166 |         try:
167 |             browser.get(request.url)
168 |         except TimeoutException:
169 |             browser.close()
170 |             return self._retry(request, 504, spider)
171 | 
172 |         # set cookies
173 |         parse_result = urllib.parse.urlsplit(request.url)
174 |         domain = parse_result.hostname
175 |         _cookies = request.cookies or []
176 |         if isinstance(request.cookies, dict):
177 |             _cookies = [{'name': k, 'value': v, 'domain': domain}
178 |                         for k, v in request.cookies.items()]
179 |         else:
180 |             for _cookie in _cookies:
181 |                 if isinstance(_cookie, dict) and 'domain' not in _cookie.keys():
182 |                     _cookie['domain'] = domain
183 |         for _cookie in _cookies:
184 |             browser.add_cookie(_cookie)
185 |         if _cookies:
186 |             browser.refresh()
187 | 
188 |         # wait for dom loaded
189 |         if selenium_meta.get('wait_for'):
190 |             _wait_for = selenium_meta.get('wait_for')
191 |             try:
192 |                 logger.debug('waiting for %s', _wait_for)
193 |                 WebDriverWait(browser, _timeout).until(
194 |                     EC.presence_of_element_located((By.CSS_SELECTOR, _wait_for))
195 |                 )
196 |             except TimeoutException:
197 |                 logger.error('error waiting for %s of %s', _wait_for, request.url)
198 |                 browser.close()
199 |                 return self._retry(request, 504, spider)
200 | 
201 |         # evaluate script
202 |         if selenium_meta.get('script'):
203 |             _script = selenium_meta.get('script')
204 |             logger.debug('evaluating %s', _script)
205 |             browser.execute_script(_script)
206 | 
207 |         # sleep
208 |         _sleep = self.sleep
209 |         if selenium_meta.get('sleep') is not None:
210 |             _sleep = selenium_meta.get('sleep')
211 |         if _sleep is not None:
212 |             logger.debug('sleep for %ss', _sleep)
213 |             time.sleep(_sleep)
214 | 
215 |         body = browser.page_source
216 | 
217 |         # screenshot
218 |         _screenshot = self.screenshot
219 |         if selenium_meta.get('screenshot') is not None:
220 |             _screenshot = selenium_meta.get('screenshot')
221 |         screenshot_result = None
222 |         if _screenshot is not None:
223 |             logger.debug('taking screenshot using args %s', _screenshot)
224 |             if 'selector' in _screenshot:
225 |                 screenshot_result = browser.find_element_by_css_selector(_screenshot['selector']).screenshot_as_png
226 |             elif 'xpath' in _screenshot:
227 |                 screenshot_result = browser.find_element_by_xpath(_screenshot['xpath']).screenshot_as_png
228 |             else:
229 |                 screenshot_result = browser.get_screenshot_as_png()
230 |             if isinstance(screenshot_result, bytes):
231 |                 screenshot_result = BytesIO(screenshot_result)
232 | 
233 |         # close page and browser
234 |         logger.debug('close selenium')
235 |         browser.close()
236 | 
237 |         response = HtmlResponse(
238 |             request.url,
239 |             status=200,
240 |             body=body,
241 |             encoding='utf-8',
242 |             request=request
243 |         )
244 |         if screenshot_result:
245 |             response.meta['screenshot'] = screenshot_result
246 |         return response
247 | 
248 |     def process_request(self, request, spider):
249 |         """
250 |         process request using selenium
251 |         :param request:
252 |         :param spider:
253 |         :return:
254 |         """
255 |         logger.debug('processing request %s', request)
256 |         return deferToThread(self._process_request, request, spider)
257 |         # return self._process_request(request, spider)
258 | 
259 |     def _spider_closed(self):
260 |         pass
261 | 
262 |     def spider_closed(self):
263 |         """
264 |         callback when spider closed
265 |         :return:
266 |         """
267 |         return deferToThread(self._spider_closed)
268 | 
--------------------------------------------------------------------------------
/gerapy_selenium/pretend.py:
--------------------------------------------------------------------------------
1 | SET_WEBDRIVER = '''Object.defineProperty(navigator, 'webdriver', {get: () => undefined})'''
2 | SET_USER_AGENT = '''Object.defineProperty(navigator, 'userAgent', {get: () => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'})'''
3 | SET_APP_VERSION = '''Object.defineProperty(navigator, 'appVersion', {get: () => '5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'})'''
4 | EXTEND_LANGUAGES = '''Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh', 'en', 'zh-TW', 'ja']})'''
5 | EXTEND_PLUGINS = '''Object.defineProperty(navigator, 'plugins', {get: () => [0, 1, 2, 3, 4]})'''
6 | EXTEND_MIME_TYPES = '''Object.defineProperty(navigator, 'mimeTypes', {get: () => [0, 1, 2, 3, 4]})'''
7 | SET_WEBGL = '''
8 | const getParameter = WebGLRenderingContext.getParameter
9 | WebGLRenderingContext.prototype.getParameter = (parameter) => {
10 |     if (parameter === 37445) {
11 |         return 'Intel Open Source Technology Center'
12 |     }
13 |     if (parameter === 37446) {
14 |         return 'Mesa DRI Intel(R) Ivybridge Mobile '
15 |     }
16 |     return getParameter(parameter)
17 | }
18 | '''
19 | SET_CHROME_INFO = '''
20 | Object.defineProperty(window, 'chrome', {
21 |     "app": {
22 |         "isInstalled": false,
23 |         "InstallState": {"DISABLED":
"disabled", "INSTALLED": "installed", "NOT_INSTALLED": "not_installed"}, 24 | "RunningState": {"CANNOT_RUN": "cannot_run", "READY_TO_RUN": "ready_to_run", "RUNNING": "running"} 25 | }, 26 | "runtime": { 27 | "OnInstalledReason": { 28 | "CHROME_UPDATE": "chrome_update", 29 | "INSTALL": "install", 30 | "SHARED_MODULE_UPDATE": "shared_module_update", 31 | "UPDATE": "update" 32 | }, 33 | "OnRestartRequiredReason": {"APP_UPDATE": "app_update", "OS_UPDATE": "os_update", "PERIODIC": "periodic"}, 34 | "PlatformArch": { 35 | "ARM": "arm", 36 | "ARM64": "arm64", 37 | "MIPS": "mips", 38 | "MIPS64": "mips64", 39 | "X86_32": "x86-32", 40 | "X86_64": "x86-64" 41 | }, 42 | "PlatformNaclArch": {"ARM": "arm", "MIPS": "mips", "MIPS64": "mips64", "X86_32": "x86-32", "X86_64": "x86-64"}, 43 | "PlatformOs": { 44 | "ANDROID": "android", 45 | "CROS": "cros", 46 | "LINUX": "linux", 47 | "MAC": "mac", 48 | "OPENBSD": "openbsd", 49 | "WIN": "win" 50 | }, 51 | "RequestUpdateCheckStatus": { 52 | "NO_UPDATE": "no_update", 53 | "THROTTLED": "throttled", 54 | "UPDATE_AVAILABLE": "update_available" 55 | } 56 | } 57 | }) 58 | ''' 59 | 60 | SET_PERMISSION = ''' 61 | const originalQuery = window.navigator.permissions.query; 62 | return window.navigator.permissions.query = (parameters) => ( 63 | parameters.name === 'notifications' ? 64 | Promise.resolve({ state: Notification.permission }) : 65 | originalQuery(parameters) 66 | ) 67 | ''' 68 | 69 | SCRIPTS = [ 70 | SET_WEBDRIVER, 71 | SET_USER_AGENT, 72 | SET_APP_VERSION, 73 | EXTEND_LANGUAGES, 74 | EXTEND_PLUGINS, 75 | EXTEND_MIME_TYPES, 76 | SET_CHROME_INFO, 77 | SET_PERMISSION, 78 | SET_WEBGL, 79 | ] 80 | -------------------------------------------------------------------------------- /gerapy_selenium/request.py: -------------------------------------------------------------------------------- 1 | from scrapy import Request 2 | import copy 3 | 4 | 5 | class SeleniumRequest(Request): 6 | """ 7 | Scrapy ``Request`` subclass providing additional arguments 8 | """ 9 | 10 | def __init__(self, url, callback=None, wait_for=None, script=None, proxy=None, 11 | sleep=None, timeout=None, pretend=None, screenshot=None, meta=None, *args, 12 | **kwargs): 13 | """ 14 | :param url: request url 15 | :param callback: callback 16 | :param wait_for: wait for some element to load, also supports dict 17 | :param script: script to execute 18 | :param proxy: use proxy for this time, like `http://x.x.x.x:x` 19 | :param sleep: time to sleep after loaded, override `GERAPY_SELENIUM_SLEEP` 20 | :param timeout: load timeout, override `GERAPY_SELENIUM_DOWNLOAD_TIMEOUT` 21 | :param pretend: pretend as normal browser, override `GERAPY_SELENIUM_PRETEND` 22 | :param screenshot: ignored resource types, see 23 | https://miyakogi.github.io/pyppeteer/_modules/pyppeteer/page.html#Page.screenshot, 24 | override `GERAPY_SELENIUM_SCREENSHOT` 25 | :param args: 26 | :param kwargs: 27 | """ 28 | # use meta info to save args 29 | meta = copy.deepcopy(meta) or {} 30 | selenium_meta = meta.get('selenium') or {} 31 | 32 | self.wait_for = selenium_meta.get('wait_for') if selenium_meta.get('wait_for') is not None else wait_for 33 | self.script = selenium_meta.get('script') if selenium_meta.get('script') is not None else script 34 | self.sleep = selenium_meta.get('sleep') if selenium_meta.get('sleep') is not None else sleep 35 | self.proxy = selenium_meta.get('proxy') if selenium_meta.get('proxy') is not None else proxy 36 | self.pretend = selenium_meta.get('pretend') if selenium_meta.get('pretend') is not None else pretend 37 
| self.timeout = selenium_meta.get('timeout') if selenium_meta.get('timeout') is not None else timeout 38 | self.screenshot = selenium_meta.get('screenshot') if selenium_meta.get( 39 | 'screenshot') is not None else screenshot 40 | 41 | selenium_meta = meta.setdefault('selenium', {}) 42 | selenium_meta['wait_for'] = self.wait_for 43 | selenium_meta['script'] = self.script 44 | selenium_meta['sleep'] = self.sleep 45 | selenium_meta['proxy'] = self.proxy 46 | selenium_meta['pretend'] = self.pretend 47 | selenium_meta['timeout'] = self.timeout 48 | selenium_meta['screenshot'] = self.screenshot 49 | 50 | super().__init__(url, callback, meta=meta, *args, **kwargs) 51 | -------------------------------------------------------------------------------- /gerapy_selenium/settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | # selenium logging level 4 | GERAPY_SELENIUM_LOGGING_LEVEL = logging.WARNING 5 | 6 | # selenium timeout 7 | GERAPY_SELENIUM_DOWNLOAD_TIMEOUT = 30 8 | 9 | # selenium browser window 10 | GERAPY_SELENIUM_WINDOW_WIDTH = 1400 11 | GERAPY_SELENIUM_WINDOW_HEIGHT = 700 12 | 13 | # selenium settings 14 | GERAPY_SELENIUM_HEADLESS = True 15 | GERAPY_SELENIUM_EXECUTABLE_PATH = None 16 | GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS = False 17 | GERAPY_SELENIUM_PRETEND = True 18 | 19 | # selenium args 20 | GERAPY_SELENIUM_DISABLE_EXTENSIONS = True 21 | GERAPY_SELENIUM_HIDE_SCROLLBARS = True 22 | GERAPY_SELENIUM_MUTE_AUDIO = True 23 | GERAPY_SELENIUM_NO_SANDBOX = True 24 | GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX = True 25 | GERAPY_SELENIUM_DISABLE_GPU = True 26 | 27 | GERAPY_SELENIUM_SCREENSHOT = None 28 | GERAPY_SELENIUM_SLEEP = 1 29 | 30 | 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy>=1.5.0 2 | selenium>=3.5.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from os.path import join, isfile 5 | from os import walk 6 | import io 7 | import os 8 | import sys 9 | from shutil import rmtree 10 | from setuptools import find_packages, setup, Command 11 | 12 | 13 | def read_file(filename): 14 | with open(filename) as fp: 15 | return fp.read().strip() 16 | 17 | 18 | def read_requirements(filename): 19 | return [line.strip() for line in read_file(filename).splitlines() 20 | if not line.startswith('#')] 21 | 22 | 23 | NAME = 'gerapy-selenium' 24 | FOLDER = 'gerapy_selenium' 25 | DESCRIPTION = 'Selenium Components for Scrapy & Gerapy' 26 | URL = 'https://github.com/Gerapy/GerapySelenium' 27 | EMAIL = 'cqc@cuiqingcai.com' 28 | AUTHOR = 'Germey' 29 | REQUIRES_PYTHON = '>=3.6.0' 30 | VERSION = None 31 | 32 | REQUIRED = read_requirements('requirements.txt') 33 | 34 | here = os.path.abspath(os.path.dirname(__file__)) 35 | 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | about = {} 43 | if not VERSION: 44 | with open(os.path.join(here, FOLDER, '__version__.py')) as f: 45 | exec(f.read(), about) 46 | else: 47 | about['__version__'] = VERSION 48 | 49 | 50 | def package_files(directories): 51 | paths = [] 52 | for item in directories: 53 | if isfile(item): 54 | 
paths.append(join('..', item)) 55 | continue 56 | for (path, directories, filenames) in walk(item): 57 | for filename in filenames: 58 | paths.append(join('..', path, filename)) 59 | return paths 60 | 61 | 62 | class UploadCommand(Command): 63 | description = 'Build and publish the package.' 64 | user_options = [] 65 | 66 | @staticmethod 67 | def status(s): 68 | """Prints things in bold.""" 69 | print('\033[1m{0}\033[0m'.format(s)) 70 | 71 | def initialize_options(self): 72 | pass 73 | 74 | def finalize_options(self): 75 | pass 76 | 77 | def run(self): 78 | try: 79 | self.status('Removing previous builds…') 80 | rmtree(os.path.join(here, 'dist')) 81 | except OSError: 82 | pass 83 | 84 | self.status('Building Source and Wheel (universal) distribution…') 85 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 86 | 87 | self.status('Uploading the package to PyPI via Twine…') 88 | os.system('twine upload dist/*') 89 | 90 | self.status('Pushing git tags…') 91 | os.system('git tag v{0}'.format(about['__version__'])) 92 | os.system('git push --tags') 93 | 94 | sys.exit() 95 | 96 | 97 | setup( 98 | name=NAME, 99 | version=about['__version__'], 100 | description=DESCRIPTION, 101 | long_description=long_description, 102 | long_description_content_type='text/markdown', 103 | author=AUTHOR, 104 | author_email=EMAIL, 105 | python_requires=REQUIRES_PYTHON, 106 | url=URL, 107 | packages=find_packages(exclude=('tests',)), 108 | install_requires=REQUIRED, 109 | include_package_data=True, 110 | license='MIT', 111 | classifiers=[ 112 | 'License :: OSI Approved :: MIT License', 113 | 'Programming Language :: Python :: 3.6', 114 | 'Programming Language :: Python :: 3.7', 115 | 'Programming Language :: Python :: 3.8', 116 | 'Programming Language :: Python :: Implementation :: CPython', 117 | 'Programming Language :: Python :: Implementation :: PyPy' 118 | ], 119 | # $ setup.py publish support. 120 | cmdclass={ 121 | 'upload': UploadCommand, 122 | }, 123 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapySelenium/b8c8685d389c0d77093d6c1c0918fc382eacc879/tests/__init__.py --------------------------------------------------------------------------------