├── .gitignore ├── CHANGELOG.md ├── README.md ├── example ├── .dockerignore ├── Dockerfile ├── README.md ├── docker-compose.yaml ├── example │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── book.py │ │ └── movie.py ├── requirements.txt ├── run.py └── scrapy.cfg ├── gerapy_selenium ├── __init__.py ├── __version__.py ├── downloadermiddlewares.py ├── pretend.py ├── request.py └── settings.py ├── requirements.txt ├── setup.py └── tests └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | .idea/ 7 | 8 | # User-specific stuff 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/**/usage.statistics.xml 12 | .idea/**/dictionaries 13 | .idea/**/shelf 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 35 | # .idea/artifacts 36 | # .idea/compiler.xml 37 | # .idea/jarRepositories.xml 38 | # .idea/modules.xml 39 | # .idea/*.iml 40 | # .idea/modules 41 | # *.iml 42 | # *.ipr 43 | 44 | # CMake 45 | cmake-build-*/ 46 | 47 | # Mongo Explorer plugin 48 | .idea/**/mongoSettings.xml 49 | 50 | # File-based project format 51 | *.iws 52 | 53 | # IntelliJ 54 | out/ 55 | 56 | # mpeltonen/sbt-idea plugin 57 | .idea_modules/ 58 | 59 | # JIRA plugin 60 | atlassian-ide-plugin.xml 61 | 62 | # Cursive Clojure plugin 63 | .idea/replstate.xml 64 | 65 | # Crashlytics plugin (for Android Studio and IntelliJ) 66 | com_crashlytics_export_strings.xml 67 | crashlytics.properties 68 | crashlytics-build.properties 69 | fabric.properties 70 | 71 | # Editor-based Rest Client 72 | .idea/httpRequests 73 | 74 | # Android studio 3.1+ serialized cache file 75 | .idea/caches/build_file_checksums.ser 76 | 77 | ### Python template 78 | # Byte-compiled / optimized / DLL files 79 | __pycache__/ 80 | *.py[cod] 81 | *$py.class 82 | 83 | # C extensions 84 | *.so 85 | 86 | # Distribution / packaging 87 | .Python 88 | build/ 89 | develop-eggs/ 90 | dist/ 91 | downloads/ 92 | eggs/ 93 | .eggs/ 94 | lib/ 95 | lib64/ 96 | parts/ 97 | sdist/ 98 | var/ 99 | wheels/ 100 | pip-wheel-metadata/ 101 | share/python-wheels/ 102 | *.egg-info/ 103 | .installed.cfg 104 | *.egg 105 | MANIFEST 106 | 107 | # PyInstaller 108 | # Usually these files are written by a python script from a template 109 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
110 | *.manifest
111 | *.spec
112 | 
113 | # Installer logs
114 | pip-log.txt
115 | pip-delete-this-directory.txt
116 | 
117 | # Unit test / coverage reports
118 | htmlcov/
119 | .tox/
120 | .nox/
121 | .coverage
122 | .coverage.*
123 | .cache
124 | nosetests.xml
125 | coverage.xml
126 | *.cover
127 | *.py,cover
128 | .hypothesis/
129 | .pytest_cache/
130 | cover/
131 | 
132 | # Translations
133 | *.mo
134 | *.pot
135 | 
136 | # Django stuff:
137 | *.log
138 | local_settings.py
139 | db.sqlite3
140 | db.sqlite3-journal
141 | 
142 | # Flask stuff:
143 | instance/
144 | .webassets-cache
145 | 
146 | # Scrapy stuff:
147 | .scrapy
148 | 
149 | # Sphinx documentation
150 | docs/_build/
151 | 
152 | # PyBuilder
153 | .pybuilder/
154 | target/
155 | 
156 | # Jupyter Notebook
157 | .ipynb_checkpoints
158 | 
159 | # IPython
160 | profile_default/
161 | ipython_config.py
162 | 
163 | # pyenv
164 | # For a library or package, you might want to ignore these files since the code is
165 | # intended to run in multiple environments; otherwise, check them in:
166 | # .python-version
167 | 
168 | # pipenv
169 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
170 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
171 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
172 | # install all needed dependencies.
173 | #Pipfile.lock
174 | 
175 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
176 | __pypackages__/
177 | 
178 | # Celery stuff
179 | celerybeat-schedule
180 | celerybeat.pid
181 | 
182 | # SageMath parsed files
183 | *.sage.py
184 | 
185 | # Environments
186 | .env
187 | .venv
188 | env/
189 | venv/
190 | ENV/
191 | env.bak/
192 | venv.bak/
193 | 
194 | # Spyder project settings
195 | .spyderproject
196 | .spyproject
197 | 
198 | # Rope project settings
199 | .ropeproject
200 | 
201 | # mkdocs documentation
202 | /site
203 | 
204 | # mypy
205 | .mypy_cache/
206 | .dmypy.json
207 | dmypy.json
208 | 
209 | # Pyre type checker
210 | .pyre/
211 | 
212 | # pytype static type analyzer
213 | .pytype/
214 | 
215 | # Cython debug symbols
216 | cython_debug/
217 | 
218 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Gerapy Selenium Changelog
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Gerapy Selenium
2 | 
3 | This package adds Selenium support to Scrapy; it is also a
4 | module of [Gerapy](https://github.com/Gerapy/Gerapy).
5 | 
6 | ## Installation
7 | 
8 | ```shell script
9 | pip3 install gerapy-selenium
10 | ```
11 | 
12 | ## Usage
13 | 
14 | You can use `SeleniumRequest` to specify a request that should be rendered with Selenium.
15 | 
16 | For example:
17 | 
18 | ```python
19 | yield SeleniumRequest(detail_url, callback=self.parse_detail)
20 | ```
21 | 
22 | You also need to enable `SeleniumMiddleware` in `DOWNLOADER_MIDDLEWARES`:
23 | 
24 | ```python
25 | DOWNLOADER_MIDDLEWARES = {
26 |     'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware': 543,
27 | }
28 | ```
29 | 
30 | Congratulations, you've finished all of the required configuration.
31 | 
32 | If you run the Spider again, Selenium will be started to render every
33 | web page for which you configured a SeleniumRequest.
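
Putting the two pieces together, a minimal spider might look like the sketch below (the start URL and selectors are placeholders, not part of this package):

```python
import scrapy
from gerapy_selenium import SeleniumRequest


class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        # placeholder URL for a JavaScript-rendered listing page
        yield SeleniumRequest('https://example.com/list', callback=self.parse_index)

    def parse_index(self, response):
        # response contains the HTML rendered by Selenium
        for href in response.css('a::attr(href)').extract():
            yield SeleniumRequest(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        yield {'title': response.css('title::text').extract_first()}
```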
34 | 
35 | ## Settings
36 | 
37 | GerapySelenium provides some optional settings.
38 | 
39 | ### Concurrency
40 | 
41 | You can use Scrapy's own setting to control the concurrency of Selenium,
42 | for example:
43 | 
44 | ```python
45 | CONCURRENT_REQUESTS = 3
46 | ```
47 | 
48 | ### Pretend as Real Browser
49 | 
50 | Some websites detect WebDriver or headless mode; GerapySelenium can make
51 | Chromium pretend to be a normal browser by injecting scripts. This is enabled by default.
52 | 
53 | You can disable it to speed things up if the website does not detect WebDriver:
54 | 
55 | ```python
56 | GERAPY_SELENIUM_PRETEND = False
57 | ```
58 | 
59 | You can also use the `pretend` attribute of `SeleniumRequest` to override this
60 | configuration.
61 | 
62 | ### Logging Level
63 | 
64 | By default, Selenium logs all debug messages, so GerapySelenium
65 | sets Selenium's logging level to WARNING.
66 | 
67 | If you want to see more logs from Selenium, you can change this setting:
68 | 
69 | ```python
70 | import logging
71 | GERAPY_SELENIUM_LOGGING_LEVEL = logging.DEBUG
72 | ```
73 | 
74 | ### Download Timeout
75 | 
76 | Selenium may take some time to render the required web page. You can change this setting; the default is `30` seconds:
77 | 
78 | ```python
79 | # selenium timeout
80 | GERAPY_SELENIUM_DOWNLOAD_TIMEOUT = 30
81 | ```
82 | 
83 | ### Headless
84 | 
85 | By default, Selenium runs in headless mode. You can change this to `False`
86 | as needed; the default is `True`:
87 | 
88 | ```python
89 | GERAPY_SELENIUM_HEADLESS = False
90 | ```
91 | 
92 | ### Window Size
93 | 
94 | You can also set the width and height of the Selenium window:
95 | 
96 | ```python
97 | GERAPY_SELENIUM_WINDOW_WIDTH = 1400
98 | GERAPY_SELENIUM_WINDOW_HEIGHT = 700
99 | ```
100 | 
101 | The defaults are 1400 and 700.
102 | 
103 | ## SeleniumRequest
104 | 
105 | `SeleniumRequest` provides arguments which can override the global settings above.
106 | 
107 | * url: request url
108 | * callback: callback
109 | * wait_for: CSS selector of an element to wait for before the page is returned
110 | * script: JavaScript to execute after the page is loaded
111 | * proxy: proxy to use for this request, e.g. `http://x.x.x.x:x`
112 | * sleep: time to sleep after the page is loaded, overrides `GERAPY_SELENIUM_SLEEP`
113 | * timeout: load timeout, overrides `GERAPY_SELENIUM_DOWNLOAD_TIMEOUT`
114 | * pretend: pretend as a normal browser, overrides `GERAPY_SELENIUM_PRETEND`
115 | * screenshot: screenshot configuration, e.g. `{'selector': '.item'}` to capture a
116 |   specific element (an `xpath` key is also supported), overrides
117 |   `GERAPY_SELENIUM_SCREENSHOT`
118 | 
119 | For example, you can configure SeleniumRequest as:
120 | 
121 | ```python
122 | from gerapy_selenium import SeleniumRequest
123 | 
124 | def parse(self, response):
125 |     yield SeleniumRequest(url,
126 |         callback=self.parse_detail,
127 |         wait_for='title',
128 |         script='console.log(document)',
129 |         sleep=2)
130 | ```
131 | 
132 | Then Selenium will:
133 | * wait for title to load
134 | * execute the `console.log(document)` script
135 | * sleep for 2s
136 | * return the rendered web page content
137 | 
138 | ## Example
139 | 
140 | For more details, please see [example](./example).
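
The example's `book` spider also requests element screenshots. When `screenshot` is enabled, the middleware attaches the captured PNG to `response.meta['screenshot']` as a `BytesIO`, so a callback can persist it roughly like the sketch below (the output file name is only illustrative):

```python
def parse_detail(self, response):
    screenshot = response.meta.get('screenshot')
    if screenshot:
        # BytesIO holding the PNG captured by SeleniumMiddleware
        with open('detail.png', 'wb') as f:
            f.write(screenshot.getvalue())
    yield {'name': response.css('.name::text').extract_first()}
```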
141 | 142 | Also you can directly run with Docker: 143 | 144 | ``` 145 | docker run germey/gerapy-selenium-example 146 | ``` 147 | 148 | Outputs: 149 | 150 | ```shell script 151 | 2020-07-13 01:49:13 [scrapy.utils.log] INFO: Scrapy 2.2.0 started (bot: example) 152 | 2020-07-13 01:49:13 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.7 (default, May 6 2020, 04:59:01) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d 10 Sep 2019), cryptography 2.8, Platform Darwin-19.4.0-x86_64-i386-64bit 153 | 2020-07-13 01:49:13 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor 154 | 2020-07-13 01:49:13 [scrapy.crawler] INFO: Overridden settings: 155 | {'BOT_NAME': 'example', 156 | 'CONCURRENT_REQUESTS': 3, 157 | 'NEWSPIDER_MODULE': 'example.spiders', 158 | 'RETRY_HTTP_CODES': [403, 500, 502, 503, 504], 159 | 'SPIDER_MODULES': ['example.spiders']} 160 | 2020-07-13 01:49:13 [scrapy.extensions.telnet] INFO: Telnet Password: 83c276fb41754bd0 161 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled extensions: 162 | ['scrapy.extensions.corestats.CoreStats', 163 | 'scrapy.extensions.telnet.TelnetConsole', 164 | 'scrapy.extensions.memusage.MemoryUsage', 165 | 'scrapy.extensions.logstats.LogStats'] 166 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled downloader middlewares: 167 | ['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 168 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 169 | 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 170 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 171 | 'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware', 172 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware', 173 | 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 174 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 175 | 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 176 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 177 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', 178 | 'scrapy.downloadermiddlewares.stats.DownloaderStats'] 179 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled spider middlewares: 180 | ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 181 | 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 182 | 'scrapy.spidermiddlewares.referer.RefererMiddleware', 183 | 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 184 | 'scrapy.spidermiddlewares.depth.DepthMiddleware'] 185 | 2020-07-13 01:49:13 [scrapy.middleware] INFO: Enabled item pipelines: 186 | [] 187 | 2020-07-13 01:49:13 [scrapy.core.engine] INFO: Spider opened 188 | 2020-07-13 01:49:13 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 189 | 2020-07-13 01:49:13 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023 190 | 2020-07-13 01:49:13 [example.spiders.book] INFO: crawling https://dynamic5.scrape.center/page/1 191 | 2020-07-13 01:49:13 [gerapy.selenium] DEBUG: processing request 192 | 2020-07-13 01:49:13 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 193 | 2020-07-13 01:49:14 [gerapy.selenium] DEBUG: crawling 
https://dynamic5.scrape.center/page/1 194 | 2020-07-13 01:49:19 [gerapy.selenium] DEBUG: waiting for .item .name finished 195 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: wait for .item .name finished 196 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: close selenium 197 | 2020-07-13 01:49:20 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None) 198 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: processing request 199 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: processing request 200 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: processing request 201 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 202 | 2020-07-13 01:49:20 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 203 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 204 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/26855315 205 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/26861389 206 | 2020-07-13 01:49:21 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/26898909 207 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: waiting for .item .name finished 208 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: wait for .item .name finished 209 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: close selenium 210 | 2020-07-13 01:49:24 [scrapy.core.engine] DEBUG: Crawled (200) (referer: https://dynamic5.scrape.center/page/1) 211 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: processing request 212 | 2020-07-13 01:49:24 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 213 | 2020-07-13 01:49:25 [scrapy.core.scraper] DEBUG: Scraped from <200 https://dynamic5.scrape.center/detail/26861389> 214 | {'name': '壁穴ヘブンホール', 215 | 'score': '5.6', 216 | 'tags': ['BL漫画', '小基漫', 'BL', '『又腐又基』', 'BLコミック']} 217 | 2020-07-13 01:49:25 [gerapy.selenium] DEBUG: waiting for .item .name finished 218 | 2020-07-13 01:49:25 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/page/2 219 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: wait for .item .name finished 220 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: close selenium 221 | 2020-07-13 01:49:26 [scrapy.core.engine] DEBUG: Crawled (200) (referer: https://dynamic5.scrape.center/page/1) 222 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: processing request 223 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: set options {'headless': True, 'dumpio': False, 'devtools': False, 'args': ['--window-size=1400,700', '--disable-extensions', '--hide-scrollbars', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']} 224 | 2020-07-13 01:49:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://dynamic5.scrape.center/detail/26855315> 225 | 
{'name': '冒险小虎队', 'score': '9.4', 'tags': ['冒险小虎队', '童年', '冒险', '推理', '小时候读的']} 226 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: waiting for .item .name finished 227 | 2020-07-13 01:49:26 [gerapy.selenium] DEBUG: crawling https://dynamic5.scrape.center/detail/27047626 228 | 2020-07-13 01:49:27 [gerapy.selenium] DEBUG: wait for .item .name finished 229 | 2020-07-13 01:49:27 [gerapy.selenium] DEBUG: close selenium 230 | ... 231 | ``` 232 | -------------------------------------------------------------------------------- /example/.dockerignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
101 | __pypackages__/
102 | 
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 | 
107 | # SageMath parsed files
108 | *.sage.py
109 | 
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 | 
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 | 
123 | # Rope project settings
124 | .ropeproject
125 | 
126 | # mkdocs documentation
127 | /site
128 | 
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 | 
134 | # Pyre type checker
135 | .pyre/
136 | 
137 | # pytype static type analyzer
138 | .pytype/
139 | 
140 | # Cython debug symbols
141 | cython_debug/
142 | 
143 | 
--------------------------------------------------------------------------------
/example/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM selenium/standalone-chrome:85.0-chromedriver-85.0-20200907
2 | USER root
3 | 
4 | RUN apt-get update && \
5 |     apt-get install -y python3-distutils && \
6 |     curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
7 |     python3 get-pip.py
8 | 
9 | WORKDIR /code
10 | COPY requirements.txt .
11 | RUN python3 -m pip install -r requirements.txt
12 | COPY . .
13 | CMD python3 run.py
14 | 
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Gerapy Selenium Example
2 | 
3 | ## Run
4 | 
5 | There are two ways to run this example:
6 | 
7 | ### Run with Python
8 | 
9 | ```shell script
10 | pip3 install -r requirements.txt
11 | # make sure Chrome and a matching chromedriver are available locally
12 | python3 run.py
13 | ```
14 | 
15 | ### Run with Docker
16 | 
17 | ```shell script
18 | docker run germey/gerapy-selenium-example
19 | ```
20 | 
--------------------------------------------------------------------------------
/example/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 |   gerapy-selenium-example:
4 |     container_name: 'gerapy-selenium-example'
5 |     restart: always
6 |     build: .
7 | image: 'germey/gerapy-selenium-example' 8 | command: 'python3 run.py' -------------------------------------------------------------------------------- /example/example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapySelenium/b8c8685d389c0d77093d6c1c0918fc382eacc879/example/example/__init__.py -------------------------------------------------------------------------------- /example/example/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class BookItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = Field() 15 | tags = Field() 16 | score = Field() 17 | 18 | 19 | class MovieItem(Item): 20 | # define the fields for your item here like: 21 | # name = scrapy.Field() 22 | name = Field() 23 | categories = Field() 24 | score = Field() 25 | -------------------------------------------------------------------------------- /example/example/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ExampleSpiderMiddleware: 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ExampleDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 
63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /example/example/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ExamplePipeline: 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /example/example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for example project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'example' 13 | 14 | SPIDER_MODULES = ['example.spiders'] 15 | NEWSPIDER_MODULE = 'example.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'example (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | CONCURRENT_REQUESTS = 3 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | # DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | # } 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'example.middlewares.ExampleSpiderMiddleware': 543, 50 | # } 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 54 | DOWNLOADER_MIDDLEWARES = { 55 | 'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware': 543, 56 | } 57 | 58 | # Enable or disable extensions 59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 60 | # EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | # } 63 | 64 | # Configure item pipelines 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 66 | # ITEM_PIPELINES = { 67 | # 'example.pipelines.ExamplePipeline': 300, 68 | # } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 72 | # AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | # AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | # AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | # AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | # HTTPCACHE_ENABLED = True 86 | # HTTPCACHE_EXPIRATION_SECS = 0 87 | # HTTPCACHE_DIR = 'httpcache' 88 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | 91 | RETRY_HTTP_CODES = [403, 500, 502, 503, 504] 92 | 93 | GERAPY_SELENIUM_HEADLESS = True 94 | 95 | LOG_LEVEL = 'DEBUG' 96 | 97 | GERAPY_SELENIUM_PRETEND = True 98 | 99 | # GERAPY_SELENIUM_SCREENSHOT = { 100 | # 
'selector': '.item' 101 | # } 102 | 103 | GERAPY_SELENIUM_DOWNLOAD_TIMEOUT = 10 104 | -------------------------------------------------------------------------------- /example/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /example/example/spiders/book.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | from example.items import BookItem 5 | from gerapy_selenium import SeleniumRequest 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class BookSpider(scrapy.Spider): 12 | name = 'book' 13 | allowed_domains = ['dynamic5.scrape.center'] 14 | base_url = 'https://dynamic5.scrape.center' 15 | 16 | def start_requests(self): 17 | """ 18 | first page 19 | :return: 20 | """ 21 | start_url = f'{self.base_url}/page/1' 22 | logger.info('crawling %s', start_url) 23 | yield SeleniumRequest(start_url, callback=self.parse_index, wait_for='.item .name') 24 | 25 | def parse_index(self, response): 26 | """ 27 | extract books and get next page 28 | :param response: 29 | :return: 30 | """ 31 | items = response.css('.item') 32 | for item in items: 33 | href = item.css('.top a::attr(href)').extract_first() 34 | detail_url = response.urljoin(href) 35 | yield SeleniumRequest(detail_url, callback=self.parse_detail, wait_for='.item .name', priority=2, 36 | screenshot={'selector': '.item'}) 37 | 38 | # next page 39 | match = re.search(r'page/(\d+)', response.url) 40 | if not match: return 41 | page = int(match.group(1)) + 1 42 | next_url = f'{self.base_url}/page/{page}' 43 | yield SeleniumRequest(next_url, callback=self.parse_index, wait_for='.item .name', ) 44 | 45 | def parse_detail(self, response): 46 | """ 47 | process detail info of book 48 | :param response: 49 | :return: 50 | """ 51 | name = response.css('.name::text').extract_first() 52 | tags = response.css('.tags button span::text').extract() 53 | score = response.css('.score::text').extract_first() 54 | tags = [tag.strip() for tag in tags] if tags else [] 55 | score = score.strip() if score else None 56 | item = BookItem(name=name, tags=tags, score=score) 57 | yield item 58 | -------------------------------------------------------------------------------- /example/example/spiders/movie.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from gerapy_selenium import SeleniumRequest 4 | from scrapy import Request, signals 5 | from example.items import MovieItem 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class MovieSpider(scrapy.Spider): 12 | name = 'movie' 13 | allowed_domains = ['antispider1.scrape.center'] 14 | base_url = 'https://antispider1.scrape.center' 15 | max_page = 10 16 | 17 | def start_requests(self): 18 | """ 19 | first page 20 | :return: 21 | """ 22 | for page in range(1, self.max_page + 1): 23 | url = f'{self.base_url}/page/{page}' 24 | logger.debug('start url %s', url) 25 | cookies = { 26 | 'name': 'germey' 27 | } 28 | yield SeleniumRequest(url, callback=self.parse_index, priority=10, wait_for='.item', pretend=True, cookies=cookies) 29 | 30 | def parse_index(self, response): 31 | """ 32 
| extract movies 33 | :param response: 34 | :return: 35 | """ 36 | items = response.css('.item') 37 | for item in items: 38 | href = item.css('a::attr(href)').extract_first() 39 | detail_url = response.urljoin(href) 40 | logger.info('detail url %s', detail_url) 41 | yield SeleniumRequest(detail_url, callback=self.parse_detail, wait_for='.item') 42 | 43 | def parse_detail(self, response): 44 | """ 45 | process detail info of book 46 | :param response: 47 | :return: 48 | """ 49 | name = response.css('h2::text').extract_first() 50 | categories = response.css('.categories button span::text').extract() 51 | score = response.css('.score::text').extract_first() 52 | categories = [category.strip() for category in categories] if categories else [] 53 | score = score.strip() if score else None 54 | yield MovieItem(name=name, categories=categories, score=score) 55 | -------------------------------------------------------------------------------- /example/requirements.txt: -------------------------------------------------------------------------------- 1 | gerapy-selenium -------------------------------------------------------------------------------- /example/run.py: -------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | 3 | execute('scrapy crawl book'.split()) 4 | -------------------------------------------------------------------------------- /example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /gerapy_selenium/__init__.py: -------------------------------------------------------------------------------- 1 | from .downloadermiddlewares import SeleniumMiddleware 2 | from .request import SeleniumRequest 3 | -------------------------------------------------------------------------------- /gerapy_selenium/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 0, '3') 2 | 3 | version = __version__ = '.'.join(map(str, VERSION)) -------------------------------------------------------------------------------- /gerapy_selenium/downloadermiddlewares.py: -------------------------------------------------------------------------------- 1 | import time 2 | from io import BytesIO 3 | from scrapy.http import HtmlResponse 4 | from scrapy.utils.python import global_object_name 5 | from selenium.common.exceptions import TimeoutException 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.support.wait import WebDriverWait 8 | from gerapy_selenium.pretend import SCRIPTS as PRETEND_SCRIPTS 9 | from gerapy_selenium.settings import * 10 | import urllib.parse 11 | from selenium import webdriver 12 | from selenium.webdriver import ChromeOptions 13 | from selenium.webdriver.support import expected_conditions as EC 14 | from twisted.internet.threads import deferToThread 15 | 16 | logger = logging.getLogger('gerapy.selenium') 17 | 18 | 19 | class SeleniumMiddleware(object): 20 | """ 21 | Downloader middleware handling the requests with Selenium 22 | """ 23 | 24 | def _retry(self, request, reason, spider): 25 | """ 26 | get retry request 27 | :param 
request: 28 | :param reason: 29 | :param spider: 30 | :return: 31 | """ 32 | if not self.retry_enabled: 33 | return 34 | 35 | retries = request.meta.get('retry_times', 0) + 1 36 | retry_times = self.max_retry_times 37 | 38 | if 'max_retry_times' in request.meta: 39 | retry_times = request.meta['max_retry_times'] 40 | 41 | stats = spider.crawler.stats 42 | if retries <= retry_times: 43 | logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s", 44 | {'request': request, 'retries': retries, 'reason': reason}, 45 | extra={'spider': spider}) 46 | retryreq = request.copy() 47 | retryreq.meta['retry_times'] = retries 48 | retryreq.dont_filter = True 49 | retryreq.priority = request.priority + self.priority_adjust 50 | 51 | if isinstance(reason, Exception): 52 | reason = global_object_name(reason.__class__) 53 | 54 | stats.inc_value('retry/count') 55 | stats.inc_value('retry/reason_count/%s' % reason) 56 | return retryreq 57 | else: 58 | stats.inc_value('retry/max_reached') 59 | logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s", 60 | {'request': request, 'retries': retries, 'reason': reason}, 61 | extra={'spider': spider}) 62 | 63 | @classmethod 64 | def from_crawler(cls, crawler): 65 | """ 66 | init the middleware 67 | :param crawler: 68 | :return: 69 | """ 70 | settings = crawler.settings 71 | logging_level = settings.get('GERAPY_SELENIUM_LOGGING_LEVEL', GERAPY_SELENIUM_LOGGING_LEVEL) 72 | logging.getLogger('selenium.webdriver.remote.remote_connection').setLevel(logging_level) 73 | logging.getLogger('urllib3.connectionpool').setLevel(logging_level) 74 | 75 | # init settings 76 | cls.window_width = settings.get('GERAPY_SELENIUM_WINDOW_WIDTH', GERAPY_SELENIUM_WINDOW_WIDTH) 77 | cls.window_height = settings.get('GERAPY_SELENIUM_WINDOW_HEIGHT', GERAPY_SELENIUM_WINDOW_HEIGHT) 78 | cls.headless = settings.get('GERAPY_SELENIUM_HEADLESS', GERAPY_SELENIUM_HEADLESS) 79 | cls.ignore_https_errors = settings.get('GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS', 80 | GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS) 81 | cls.executable_path = settings.get('GERAPY_SELENIUM_EXECUTABLE_PATH', GERAPY_SELENIUM_EXECUTABLE_PATH) 82 | cls.disable_extensions = settings.get('GERAPY_SELENIUM_DISABLE_EXTENSIONS', 83 | GERAPY_SELENIUM_DISABLE_EXTENSIONS) 84 | cls.hide_scrollbars = settings.get('GERAPY_SELENIUM_HIDE_SCROLLBARS', GERAPY_SELENIUM_HIDE_SCROLLBARS) 85 | cls.mute_audio = settings.get('GERAPY_SELENIUM_MUTE_AUDIO', GERAPY_SELENIUM_MUTE_AUDIO) 86 | cls.no_sandbox = settings.get('GERAPY_SELENIUM_NO_SANDBOX', GERAPY_SELENIUM_NO_SANDBOX) 87 | cls.disable_setuid_sandbox = settings.get('GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX', 88 | GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX) 89 | cls.disable_gpu = settings.get('GERAPY_SELENIUM_DISABLE_GPU', GERAPY_SELENIUM_DISABLE_GPU) 90 | cls.download_timeout = settings.get('GERAPY_SELENIUM_DOWNLOAD_TIMEOUT', 91 | settings.get('DOWNLOAD_TIMEOUT', GERAPY_SELENIUM_DOWNLOAD_TIMEOUT)) 92 | 93 | cls.screenshot = settings.get('GERAPY_SELENIUM_SCREENSHOT', GERAPY_SELENIUM_SCREENSHOT) 94 | cls.pretend = settings.get('GERAPY_SELENIUM_PRETEND', GERAPY_SELENIUM_PRETEND) 95 | cls.sleep = settings.get('GERAPY_SELENIUM_SLEEP', GERAPY_SELENIUM_SLEEP) 96 | cls.retry_enabled = settings.getbool('RETRY_ENABLED') 97 | cls.max_retry_times = settings.getint('RETRY_TIMES') 98 | cls.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) 99 | cls.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') 100 | 101 | return cls() 102 | 103 | def 
_process_request(self, request, spider):
104 |         """
105 |         use selenium to process the request
106 |         :param request:
107 |         :param spider:
108 |         :return:
109 |         """
110 |         kwargs = {}
111 |         options = ChromeOptions()
112 |         kwargs['options'] = options
113 |         if self.headless:
114 |             options.add_argument('--headless')
115 |         if self.pretend:
116 |             options.add_experimental_option('excludeSwitches', ['enable-automation'])
117 |             options.add_experimental_option('useAutomationExtension', False)
118 |         if self.executable_path:
119 |             kwargs['executable_path'] = self.executable_path
120 |         if self.window_width and self.window_height:
121 |             options.add_argument(f'--window-size={self.window_width},{self.window_height}')
122 |         if self.disable_gpu:
123 |             options.add_argument('--disable-gpu')
124 |         if self.hide_scrollbars:
125 |             options.add_argument('--hide-scrollbars')
126 |         if self.ignore_https_errors:
127 |             options.add_argument('--ignore-certificate-errors')
128 |         if self.disable_extensions:
129 |             options.add_argument('--disable-extensions')
130 |         if self.mute_audio:
131 |             options.add_argument('--mute-audio')
132 |         if self.no_sandbox:
133 |             options.add_argument('--no-sandbox')
134 |         if self.disable_setuid_sandbox:
135 |             options.add_argument('--disable-setuid-sandbox')
136 | 
137 |         # get selenium meta
138 |         selenium_meta = request.meta.get('selenium') or {}
139 |         logger.debug('selenium_meta %s', selenium_meta)
140 | 
141 |         # set proxy
142 |         _proxy = request.meta.get('proxy')
143 |         if selenium_meta.get('proxy') is not None:
144 |             _proxy = selenium_meta.get('proxy')
145 |         if _proxy:
146 |             options.add_argument('--proxy-server=' + _proxy)
147 | 
148 |         browser = webdriver.Chrome(**kwargs)
149 |         browser.set_window_size(self.window_width, self.window_height)
150 | 
151 |         # pretend as normal browser
152 |         _pretend = self.pretend
153 |         if selenium_meta.get('pretend') is not None:
154 |             _pretend = selenium_meta.get('pretend')
155 |         if _pretend:
156 |             for script in PRETEND_SCRIPTS:
157 |                 browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
158 |                     'source': script
159 |                 })
160 | 
161 |         _timeout = self.download_timeout
162 |         if selenium_meta.get('timeout') is not None:
163 |             _timeout = selenium_meta.get('timeout')
164 |         browser.set_page_load_timeout(_timeout)
165 | 
166 |         try:
167 |             browser.get(request.url)
168 |         except TimeoutException:
169 |             browser.close()
170 |             return self._retry(request, 504, spider)
171 | 
172 |         # set cookies
173 |         parse_result = urllib.parse.urlsplit(request.url)
174 |         domain = parse_result.hostname
175 |         _cookies = request.cookies or []
176 |         if isinstance(request.cookies, dict):
177 |             _cookies = [{'name': k, 'value': v, 'domain': domain}
178 |                         for k, v in request.cookies.items()]
179 |         else:
180 |             for _cookie in _cookies:
181 |                 if isinstance(_cookie, dict) and 'domain' not in _cookie.keys():
182 |                     _cookie['domain'] = domain
183 |         for _cookie in _cookies:
184 |             browser.add_cookie(_cookie)
185 |         if _cookies:
186 |             browser.refresh()
187 | 
188 |         # wait for dom loaded
189 |         if selenium_meta.get('wait_for'):
190 |             _wait_for = selenium_meta.get('wait_for')
191 |             try:
192 |                 logger.debug('waiting for %s', _wait_for)
193 |                 WebDriverWait(browser, _timeout).until(
194 |                     EC.presence_of_element_located((By.CSS_SELECTOR, _wait_for))
195 |                 )
196 |             except TimeoutException:
197 |                 logger.error('error waiting for %s of %s', _wait_for, request.url)
198 |                 browser.close()
199 |                 return self._retry(request, 504, spider)
200 | 
201 |         # evaluate script
202 |         if selenium_meta.get('script'):
203 |             _script = selenium_meta.get('script')
204 |             logger.debug('evaluating %s', _script)
205 |             browser.execute_script(_script)
206 | 
207 |         # sleep
208 |         _sleep = self.sleep
209 |         if selenium_meta.get('sleep') is not None:
210 |             _sleep = selenium_meta.get('sleep')
211 |         if _sleep is not None:
212 |             logger.debug('sleep for %ss', _sleep)
213 |             time.sleep(_sleep)
214 | 
215 |         body = browser.page_source
216 | 
217 |         # screenshot
218 |         _screenshot = self.screenshot
219 |         if selenium_meta.get('screenshot') is not None:
220 |             _screenshot = selenium_meta.get('screenshot')
221 |         screenshot_result = None
222 |         if _screenshot is not None:
223 |             logger.debug('taking screenshot using args %s', _screenshot)
224 |             if 'selector' in _screenshot:
225 |                 screenshot_result = browser.find_element_by_css_selector(_screenshot['selector']).screenshot_as_png
226 |             elif 'xpath' in _screenshot:
227 |                 screenshot_result = browser.find_element_by_xpath(_screenshot['xpath']).screenshot_as_png
228 |             else:
229 |                 screenshot_result = browser.get_screenshot_as_png()
230 |             if isinstance(screenshot_result, bytes):
231 |                 screenshot_result = BytesIO(screenshot_result)
232 | 
233 |         # close page and browser
234 |         logger.debug('close selenium')
235 |         browser.close()
236 | 
237 |         response = HtmlResponse(
238 |             request.url,
239 |             status=200,
240 |             body=body,
241 |             encoding='utf-8',
242 |             request=request
243 |         )
244 |         if screenshot_result:
245 |             response.meta['screenshot'] = screenshot_result
246 |         return response
247 | 
248 |     def process_request(self, request, spider):
249 |         """
250 |         process request using selenium
251 |         :param request:
252 |         :param spider:
253 |         :return:
254 |         """
255 |         logger.debug('processing request %s', request)
256 |         return deferToThread(self._process_request, request, spider)
257 |         # return self._process_request(request, spider)
258 | 
259 |     def _spider_closed(self):
260 |         pass
261 | 
262 |     def spider_closed(self):
263 |         """
264 |         callback when spider closed
265 |         :return:
266 |         """
267 |         return deferToThread(self._spider_closed)
268 | 
--------------------------------------------------------------------------------
/gerapy_selenium/pretend.py:
--------------------------------------------------------------------------------
1 | SET_WEBDRIVER = '''Object.defineProperty(navigator, 'webdriver', {get: () => undefined})'''
2 | SET_USER_AGENT = '''Object.defineProperty(navigator, 'userAgent', {get: () => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'})'''
3 | SET_APP_VERSION = '''Object.defineProperty(navigator, 'appVersion', {get: () => '5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'})'''
4 | EXTEND_LANGUAGES = '''Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh', 'en', 'zh-TW', 'ja']})'''
5 | EXTEND_PLUGINS = '''Object.defineProperty(navigator, 'plugins', {get: () => [0, 1, 2, 3, 4]})'''
6 | EXTEND_MIME_TYPES = '''Object.defineProperty(navigator, 'mimeTypes', {get: () => [0, 1, 2, 3, 4]})'''
7 | SET_WEBGL = '''
8 | const getParameter = WebGLRenderingContext.getParameter
9 | WebGLRenderingContext.prototype.getParameter = (parameter) => {
10 |     if (parameter === 37445) {
11 |         return 'Intel Open Source Technology Center'
12 |     }
13 |     if (parameter === 37446) {
14 |         return 'Mesa DRI Intel(R) Ivybridge Mobile '
15 |     }
16 |     return getParameter(parameter)
17 | }
18 | '''
19 | SET_CHROME_INFO = '''
20 | Object.defineProperty(window, 'chrome', {
21 |     "app": {
22 |         "isInstalled": false,
23 |         "InstallState": {"DISABLED":
"disabled", "INSTALLED": "installed", "NOT_INSTALLED": "not_installed"}, 24 | "RunningState": {"CANNOT_RUN": "cannot_run", "READY_TO_RUN": "ready_to_run", "RUNNING": "running"} 25 | }, 26 | "runtime": { 27 | "OnInstalledReason": { 28 | "CHROME_UPDATE": "chrome_update", 29 | "INSTALL": "install", 30 | "SHARED_MODULE_UPDATE": "shared_module_update", 31 | "UPDATE": "update" 32 | }, 33 | "OnRestartRequiredReason": {"APP_UPDATE": "app_update", "OS_UPDATE": "os_update", "PERIODIC": "periodic"}, 34 | "PlatformArch": { 35 | "ARM": "arm", 36 | "ARM64": "arm64", 37 | "MIPS": "mips", 38 | "MIPS64": "mips64", 39 | "X86_32": "x86-32", 40 | "X86_64": "x86-64" 41 | }, 42 | "PlatformNaclArch": {"ARM": "arm", "MIPS": "mips", "MIPS64": "mips64", "X86_32": "x86-32", "X86_64": "x86-64"}, 43 | "PlatformOs": { 44 | "ANDROID": "android", 45 | "CROS": "cros", 46 | "LINUX": "linux", 47 | "MAC": "mac", 48 | "OPENBSD": "openbsd", 49 | "WIN": "win" 50 | }, 51 | "RequestUpdateCheckStatus": { 52 | "NO_UPDATE": "no_update", 53 | "THROTTLED": "throttled", 54 | "UPDATE_AVAILABLE": "update_available" 55 | } 56 | } 57 | }) 58 | ''' 59 | 60 | SET_PERMISSION = ''' 61 | const originalQuery = window.navigator.permissions.query; 62 | return window.navigator.permissions.query = (parameters) => ( 63 | parameters.name === 'notifications' ? 64 | Promise.resolve({ state: Notification.permission }) : 65 | originalQuery(parameters) 66 | ) 67 | ''' 68 | 69 | SCRIPTS = [ 70 | SET_WEBDRIVER, 71 | SET_USER_AGENT, 72 | SET_APP_VERSION, 73 | EXTEND_LANGUAGES, 74 | EXTEND_PLUGINS, 75 | EXTEND_MIME_TYPES, 76 | SET_CHROME_INFO, 77 | SET_PERMISSION, 78 | SET_WEBGL, 79 | ] 80 | -------------------------------------------------------------------------------- /gerapy_selenium/request.py: -------------------------------------------------------------------------------- 1 | from scrapy import Request 2 | import copy 3 | 4 | 5 | class SeleniumRequest(Request): 6 | """ 7 | Scrapy ``Request`` subclass providing additional arguments 8 | """ 9 | 10 | def __init__(self, url, callback=None, wait_for=None, script=None, proxy=None, 11 | sleep=None, timeout=None, pretend=None, screenshot=None, meta=None, *args, 12 | **kwargs): 13 | """ 14 | :param url: request url 15 | :param callback: callback 16 | :param wait_for: wait for some element to load, also supports dict 17 | :param script: script to execute 18 | :param proxy: use proxy for this time, like `http://x.x.x.x:x` 19 | :param sleep: time to sleep after loaded, override `GERAPY_SELENIUM_SLEEP` 20 | :param timeout: load timeout, override `GERAPY_SELENIUM_DOWNLOAD_TIMEOUT` 21 | :param pretend: pretend as normal browser, override `GERAPY_SELENIUM_PRETEND` 22 | :param screenshot: ignored resource types, see 23 | https://miyakogi.github.io/pyppeteer/_modules/pyppeteer/page.html#Page.screenshot, 24 | override `GERAPY_SELENIUM_SCREENSHOT` 25 | :param args: 26 | :param kwargs: 27 | """ 28 | # use meta info to save args 29 | meta = copy.deepcopy(meta) or {} 30 | selenium_meta = meta.get('selenium') or {} 31 | 32 | self.wait_for = selenium_meta.get('wait_for') if selenium_meta.get('wait_for') is not None else wait_for 33 | self.script = selenium_meta.get('script') if selenium_meta.get('script') is not None else script 34 | self.sleep = selenium_meta.get('sleep') if selenium_meta.get('sleep') is not None else sleep 35 | self.proxy = selenium_meta.get('proxy') if selenium_meta.get('proxy') is not None else proxy 36 | self.pretend = selenium_meta.get('pretend') if selenium_meta.get('pretend') is not None else pretend 37 
| self.timeout = selenium_meta.get('timeout') if selenium_meta.get('timeout') is not None else timeout 38 | self.screenshot = selenium_meta.get('screenshot') if selenium_meta.get( 39 | 'screenshot') is not None else screenshot 40 | 41 | selenium_meta = meta.setdefault('selenium', {}) 42 | selenium_meta['wait_for'] = self.wait_for 43 | selenium_meta['script'] = self.script 44 | selenium_meta['sleep'] = self.sleep 45 | selenium_meta['proxy'] = self.proxy 46 | selenium_meta['pretend'] = self.pretend 47 | selenium_meta['timeout'] = self.timeout 48 | selenium_meta['screenshot'] = self.screenshot 49 | 50 | super().__init__(url, callback, meta=meta, *args, **kwargs) 51 | -------------------------------------------------------------------------------- /gerapy_selenium/settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | # selenium logging level 4 | GERAPY_SELENIUM_LOGGING_LEVEL = logging.WARNING 5 | 6 | # selenium timeout 7 | GERAPY_SELENIUM_DOWNLOAD_TIMEOUT = 30 8 | 9 | # selenium browser window 10 | GERAPY_SELENIUM_WINDOW_WIDTH = 1400 11 | GERAPY_SELENIUM_WINDOW_HEIGHT = 700 12 | 13 | # selenium settings 14 | GERAPY_SELENIUM_HEADLESS = True 15 | GERAPY_SELENIUM_EXECUTABLE_PATH = None 16 | GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS = False 17 | GERAPY_SELENIUM_PRETEND = True 18 | 19 | # selenium args 20 | GERAPY_SELENIUM_DISABLE_EXTENSIONS = True 21 | GERAPY_SELENIUM_HIDE_SCROLLBARS = True 22 | GERAPY_SELENIUM_MUTE_AUDIO = True 23 | GERAPY_SELENIUM_NO_SANDBOX = True 24 | GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX = True 25 | GERAPY_SELENIUM_DISABLE_GPU = True 26 | 27 | GERAPY_SELENIUM_SCREENSHOT = None 28 | GERAPY_SELENIUM_SLEEP = 1 29 | 30 | 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy>=1.5.0 2 | selenium>=3.5.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from os.path import join, isfile 5 | from os import walk 6 | import io 7 | import os 8 | import sys 9 | from shutil import rmtree 10 | from setuptools import find_packages, setup, Command 11 | 12 | 13 | def read_file(filename): 14 | with open(filename) as fp: 15 | return fp.read().strip() 16 | 17 | 18 | def read_requirements(filename): 19 | return [line.strip() for line in read_file(filename).splitlines() 20 | if not line.startswith('#')] 21 | 22 | 23 | NAME = 'gerapy-selenium' 24 | FOLDER = 'gerapy_selenium' 25 | DESCRIPTION = 'Selenium Components for Scrapy & Gerapy' 26 | URL = 'https://github.com/Gerapy/GerapySelenium' 27 | EMAIL = 'cqc@cuiqingcai.com' 28 | AUTHOR = 'Germey' 29 | REQUIRES_PYTHON = '>=3.6.0' 30 | VERSION = None 31 | 32 | REQUIRED = read_requirements('requirements.txt') 33 | 34 | here = os.path.abspath(os.path.dirname(__file__)) 35 | 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | about = {} 43 | if not VERSION: 44 | with open(os.path.join(here, FOLDER, '__version__.py')) as f: 45 | exec(f.read(), about) 46 | else: 47 | about['__version__'] = VERSION 48 | 49 | 50 | def package_files(directories): 51 | paths = [] 52 | for item in directories: 53 | if isfile(item): 54 | 
paths.append(join('..', item)) 55 | continue 56 | for (path, directories, filenames) in walk(item): 57 | for filename in filenames: 58 | paths.append(join('..', path, filename)) 59 | return paths 60 | 61 | 62 | class UploadCommand(Command): 63 | description = 'Build and publish the package.' 64 | user_options = [] 65 | 66 | @staticmethod 67 | def status(s): 68 | """Prints things in bold.""" 69 | print('\033[1m{0}\033[0m'.format(s)) 70 | 71 | def initialize_options(self): 72 | pass 73 | 74 | def finalize_options(self): 75 | pass 76 | 77 | def run(self): 78 | try: 79 | self.status('Removing previous builds…') 80 | rmtree(os.path.join(here, 'dist')) 81 | except OSError: 82 | pass 83 | 84 | self.status('Building Source and Wheel (universal) distribution…') 85 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 86 | 87 | self.status('Uploading the package to PyPI via Twine…') 88 | os.system('twine upload dist/*') 89 | 90 | self.status('Pushing git tags…') 91 | os.system('git tag v{0}'.format(about['__version__'])) 92 | os.system('git push --tags') 93 | 94 | sys.exit() 95 | 96 | 97 | setup( 98 | name=NAME, 99 | version=about['__version__'], 100 | description=DESCRIPTION, 101 | long_description=long_description, 102 | long_description_content_type='text/markdown', 103 | author=AUTHOR, 104 | author_email=EMAIL, 105 | python_requires=REQUIRES_PYTHON, 106 | url=URL, 107 | packages=find_packages(exclude=('tests',)), 108 | install_requires=REQUIRED, 109 | include_package_data=True, 110 | license='MIT', 111 | classifiers=[ 112 | 'License :: OSI Approved :: MIT License', 113 | 'Programming Language :: Python :: 3.6', 114 | 'Programming Language :: Python :: 3.7', 115 | 'Programming Language :: Python :: 3.8', 116 | 'Programming Language :: Python :: Implementation :: CPython', 117 | 'Programming Language :: Python :: Implementation :: PyPy' 118 | ], 119 | # $ setup.py publish support. 120 | cmdclass={ 121 | 'upload': UploadCommand, 122 | }, 123 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gerapy/GerapySelenium/b8c8685d389c0d77093d6c1c0918fc382eacc879/tests/__init__.py --------------------------------------------------------------------------------