└── src
├── ch13
├── AndServerTest
│ ├── app
│ │ ├── .gitignore
│ │ ├── src
│ │ │ ├── main
│ │ │ │ ├── res
│ │ │ │ │ ├── mipmap-hdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-mdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-xhdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-xxhdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-xxxhdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-anydpi-v26
│ │ │ │ │ │ ├── ic_launcher.xml
│ │ │ │ │ │ └── ic_launcher_round.xml
│ │ │ │ │ ├── values
│ │ │ │ │ │ ├── strings.xml
│ │ │ │ │ │ ├── colors.xml
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ ├── values-night
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ ├── layout
│ │ │ │ │ │ └── activity_main.xml
│ │ │ │ │ └── drawable-v24
│ │ │ │ │ │ └── ic_launcher_foreground.xml
│ │ │ │ ├── java
│ │ │ │ │ └── com
│ │ │ │ │ │ ├── goldze
│ │ │ │ │ │ └── mvvmhabit
│ │ │ │ │ │ │ └── utils
│ │ │ │ │ │ │ └── NativeUtils.java
│ │ │ │ │ │ └── germey
│ │ │ │ │ │ └── andservertest
│ │ │ │ │ │ ├── AppController.java
│ │ │ │ │ │ └── MainActivity.java
│ │ │ │ └── AndroidManifest.xml
│ │ │ ├── test
│ │ │ │ └── java
│ │ │ │ │ └── com
│ │ │ │ │ └── germey
│ │ │ │ │ └── andservertest
│ │ │ │ │ └── ExampleUnitTest.java
│ │ │ └── androidTest
│ │ │ │ └── java
│ │ │ │ └── com
│ │ │ │ └── germey
│ │ │ │ └── andservertest
│ │ │ │ └── ExampleInstrumentedTest.java
│ │ ├── proguard-rules.pro
│ │ └── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── settings.gradle
│ ├── build.gradle
│ ├── gradle.properties
│ └── gradlew.bat
├── files
│ ├── frida_appbasic1.js
│ ├── frida_rpc_app9.js
│ └── frida_appbasic2.js
├── frida_appbasic1_demo.py
├── frida_appbasic2_demo.py
├── andserver_demo.py
├── jeb_demo.py
├── frida_rpc_demo.py
└── ida_demo.py
├── ch15
├── scrapytutorial
│ ├── scrapytutorial
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── quotes.py
│ │ ├── items.py
│ │ ├── extensions.py
│ │ └── pipelines.py
│ ├── run.py
│ ├── scrapy.cfg
│ └── server.py
├── scrapyseleniumdemo
│ ├── scrapyseleniumdemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── book.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ └── settings.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapyspiderdemo
│ ├── scrapyspiderdemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── httpbin.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ └── settings.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapypyppeteerdemo
│ ├── scrapypyppeteerdemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── book.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ └── settings.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapyuniversaldemo
│ ├── scrapyuniversaldemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ ├── movie.py
│ │ │ └── universal.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ ├── utils.py
│ │ ├── loaders.py
│ │ ├── configs
│ │ │ └── movie.json
│ │ └── settings.py
│ ├── scrapy.cfg
│ └── run.py
├── scrapyitempipelinedemo
│ ├── scrapyitempipelinedemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── scrape.py
│ │ ├── items.py
│ │ └── pipelines.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapyspidermiddlewaredemo
│ ├── scrapyspidermiddlewaredemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── httpbin.py
│ │ ├── items.py
│ │ └── pipelines.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapydownloadermiddlewaredemo
│ ├── scrapydownloadermiddlewaredemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── httpbin.py
│ │ ├── items.py
│ │ └── pipelines.py
│ ├── run.py
│ └── scrapy.cfg
├── scrape_selector_demo.py
└── scrape_processor_demo.py
├── ch11
├── learn-ast
│ ├── .babelrc
│ ├── codes
│ │ ├── code3.js
│ │ ├── code2.js
│ │ ├── code1.js
│ │ ├── code5.js
│ │ └── code4.js
│ ├── package.json
│ └── basic
│ │ ├── basic1.js
│ │ └── basic2.js
├── nodejs_demo
│ ├── package.json
│ ├── nodejs_client.py
│ ├── nodejs_main.js
│ └── nodejs_server.js
├── files
│ └── Wasm.wasm
├── execjs_demo.py
├── pywasm_scrape_demo.py
├── wasmer_scrape_demo.py
├── execjs_web_demo.py
└── js_scrape_practice.py
├── ch04
├── files
│ ├── data.csv
│ └── movies.txt
├── rabbitmq_oper_demo
│ ├── scrape_producer.py
│ ├── scrape_consume.py
│ ├── consumer.py
│ └── producer.py
├── text_oper_demo.py
├── csv_oper_demo.py
├── mongodb_demo.py
└── elasticsearch_oper_demo.py
├── ch02
├── files
│ ├── favicon.ico
│ ├── mozilla_cookie.txt
│ └── lwp_cookie.txt
├── urllib_demo
│ ├── robotparser_demo.py
│ ├── request_demo.py
│ ├── request_hander_demo.py
│ └── parse_demo.py
├── httpx_demo.py
├── requests_demo
│ ├── advanced_use.py
│ └── requests_demo.py
└── regx_demo.py
├── ch08
├── files
│ └── slide_captcha.png
├── tesserocr_demo.py
└── opencv_demo.py
├── ch07
├── selenium_demo
│ ├── files
│ │ └── preview.png
│ ├── back_forward.py
│ ├── cookie_oper.py
│ ├── tab_oper.py
│ ├── headless_mode.py
│ ├── exception_handle.py
│ ├── node_interaction.py
│ ├── action_chain.py
│ ├── anti_shield.py
│ ├── switch_frame.py
│ ├── node_selector.py
│ ├── node_info.py
│ ├── simple_demo.py
│ └── delay_wait.py
├── pyppeteer_demo
│ ├── files
│ │ ├── example2.png
│ │ └── eval_example.png
│ ├── dev_mode.py
│ ├── incognito_mode.py
│ ├── prevent_detect.py
│ └── simple_demo.py
├── playwright_demo
│ ├── files
│ │ ├── np_picture.png
│ │ ├── browser-iphone.png
│ │ ├── screenshot-webkit.png
│ │ ├── screenshot-chromium.png
│ │ └── screenshot-firefox.png
│ ├── mobile_web.py
│ ├── simple_demo.py
│ └── event_listen.py
├── css_locate_scrape.py
└── font_scrape.py
├── ch01
└── test.html
├── ch03
├── files
│ └── test.html
└── parsel_demo.py
├── ch10
├── account_pool
│ ├── exceptions.py
│ ├── utils.py
│ ├── server.py
│ ├── run_account_pool.py
│ ├── storages_redis.py
│ ├── tester.py
│ ├── generator.py
│ └── setting.py
├── jwt_simulate_login.py
├── antispider_scrape_with_account_pool.py
└── session_cookie_simulate_login.py
├── ch06
├── coroutine_demo
│ ├── coroutine_simple_demo.py
│ ├── coroutine_task1.py
│ ├── coroutine_task2.py
│ ├── multi_task_coroutine.py
│ ├── bing_callback.py
│ └── coroutine_await_aiohttp.py
├── aiohttp_demo
│ ├── timeout_demo.py
│ ├── post_request.py
│ ├── url_params.py
│ ├── simple_demo.py
│ ├── response_demo.py
│ └── concurrency_demo.py
└── aiohttp_scrape_demo.py
├── ch14
└── ai_extract.md
├── ch12
├── appium_demo.py
└── airtest_script.air
│ └── airtest_script.py
└── ch05
└── scrape_ajax.py
/src/ch13/AndServerTest/app/.gitignore:
--------------------------------------------------------------------------------
1 | /build
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "presets": [
3 | "@babel/preset-env"
4 | ]
5 | }
--------------------------------------------------------------------------------
/src/ch04/files/data.csv:
--------------------------------------------------------------------------------
1 | id,name,age
2 | 10001,Mike,20
3 | 10002,Bob,22
4 | 10003,Jordan,21
5 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "dependencies": {
3 | "express": "^4.17.2"
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/src/ch11/files/Wasm.wasm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch11/files/Wasm.wasm
--------------------------------------------------------------------------------
/src/ch02/files/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch02/files/favicon.ico
--------------------------------------------------------------------------------
/src/ch08/files/slide_captcha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch08/files/slide_captcha.png
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/files/preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/selenium_demo/files/preview.png
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/files/example2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/pyppeteer_demo/files/example2.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/np_picture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/np_picture.png
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/files/eval_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/pyppeteer_demo/files/eval_example.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/browser-iphone.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/browser-iphone.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/screenshot-webkit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-webkit.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/screenshot-chromium.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-chromium.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/screenshot-firefox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-firefox.png
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code3.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code3.js
4 | * @time: 11:18
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const strings = ["\"\x68\x65\x6c\x6c\x6f\"", "\"\x77\x6f\x72\x6c\x64\""];
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code2.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code2.js
4 | * @time: 10:59
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const a = ![];
10 | const b = "abc" == "bcd"
11 | const c = (1 << 3) | 2;
12 | const d = parseInt("5" + "0")
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Mon Jan 17 20:43:10 CST 2022
2 | distributionBase=GRADLE_USER_HOME
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0.2-bin.zip
4 | distributionPath=wrapper/dists
5 | zipStorePath=wrapper/dists
6 | zipStoreBase=GRADLE_USER_HOME
7 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/java/com/goldze/mvvmhabit/utils/NativeUtils.java:
--------------------------------------------------------------------------------
1 | package com.goldze.mvvmhabit.utils;
2 |
3 | public class NativeUtils {
4 |
5 | static {
6 | System.loadLibrary("native");
7 | }
8 |
9 | public static native String encrypt(String str, int offset);
10 | }
11 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 13:48
7 | @project: python3-web-spider-learning
8 | @desc: 15.2 Getting Started with Scrapy (P743)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'quotes'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.9 Integrating Scrapy with Selenium
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'book'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/20 9:19
7 | @project: python3-web-spider-learning
8 | @desc: 15.11 Integrating Scrapy with Pyppeteer (P807)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'book'])
13 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code1.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code1.js
4 | * @time: 2022-01-14 09:45:29
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const a = 3;
10 | let string = "hello";
11 | for (let i = 0; i < a; i++) {
12 | string += "world";
13 | }
14 | console.log("string", string)
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.7 Using Item Pipelines (P781)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'scrape'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.4 Using Spiders (P759)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'httpbin', '--nolog'])
13 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.6 Using Spider Middleware (P775)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'httpbin'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapytutorial.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapytutorial
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class QuoteItem(scrapy.Item):
10 | text = scrapy.Field()
11 | author = scrapy.Field()
12 | tags = scrapy.Field()
13 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/spiders/httpbin.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 |
4 | class HttpbinSpider(scrapy.Spider):
5 | name = 'httpbin'
6 | allowed_domains = ['www.httpbin.org']
7 | start_urls = ['https://www.httpbin.org/get']
8 |
9 | def parse(self, response):
10 | print(response.text)
11 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyspiderdemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyspiderdemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyseleniumdemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyseleniumdemo
12 |
--------------------------------------------------------------------------------
/src/ch01/test.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>This is a Demo</title>
6 | </head>
7 | <body>
8 | <div id="container">
9 | <div class="wrapper">
10 | <h2 class="title">Hello World</h2>
11 | <p class="text">Hello, this is a paragraph.</p>
12 | </div>
13 | </div>
14 | </body>
15 | </html>
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/settings.gradle:
--------------------------------------------------------------------------------
1 | dependencyResolutionManagement {
2 | repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
3 | repositories {
4 | google()
5 | mavenCentral()
6 | jcenter() // Warning: this repository is going to shut down soon
7 | }
8 | }
9 | rootProject.name = "AndServerTest"
10 | include ':app'
11 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.5 Using Downloader Middleware (P770)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'httpbin', '--nolog'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapypyppeteerdemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapypyppeteerdemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyuniversaldemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyuniversaldemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyitempipelinedemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyitempipelinedemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class ScrapyspiderdemoItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | pass
13 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values/strings.xml:
--------------------------------------------------------------------------------
1 |
2 | AndServerTest
3 | Start Server
4 | Stop Server
5 | The server is started
6 | The server is stopped
7 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyspidermiddlewaredemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyspidermiddlewaredemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapydownloadermiddlewaredemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapydownloadermiddlewaredemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | from scrapy import Item, Field
7 |
8 |
9 | class BookItem(Item):
10 | name = Field()
11 | tags = Field()
12 | score = Field()
13 | cover = Field()
14 | price = Field()
15 |
--------------------------------------------------------------------------------
/src/ch03/files/test.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class DemoItem(scrapy.Item):
10 | origin = scrapy.Field()
11 | headers = scrapy.Field()
12 | args = scrapy.Field()
13 | url = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class ScrapydownloadermiddlewaredemoItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | pass
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 | from scrapy import Item, Field
8 |
9 |
10 | class BookItem(Item):
11 | name = Field()
12 | tags = Field()
13 | score = Field()
14 | cover = Field()
15 | price = Field()
16 |
--------------------------------------------------------------------------------
/src/ch02/files/mozilla_cookie.txt:
--------------------------------------------------------------------------------
1 | # Netscape HTTP Cookie File
2 | # http://curl.haxx.se/rfc/cookie_spec.html
3 | # This is a generated file! Do not edit.
4 |
5 | .baidu.com TRUE / FALSE 1672303248 BAIDUID 4DF8C4AA1B53D13A4C0A711C60505CAB:FG=1
6 | .baidu.com TRUE / FALSE 3788250895 BIDUPSID 4DF8C4AA1B53D13A3F8EC394C3CC9551
7 | .baidu.com TRUE / FALSE 3788250895 PSTM 1640767247
8 | www.baidu.com FALSE / FALSE 1640767548 BD_NOT_HTTPS 1
9 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/exceptions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: exceptions.py
6 | @time: 2022/1/12 10:36
7 | @project: python3-web-spider-learning
8 | @desc: Custom exceptions
9 | """
10 |
11 |
12 | class InitException(Exception):
13 | def __str__(self):
14 | """
15 | init error
16 | :return:
17 | """
18 | return repr('init failed')
19 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "learn-ast",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1"
8 | },
9 | "author": "",
10 | "license": "ISC",
11 | "devDependencies": {
12 | "@babel/cli": "^7.16.8",
13 | "@babel/core": "^7.16.7",
14 | "@babel/preset-env": "^7.16.8"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 | from scrapy import Item, Field
8 |
9 |
10 | class MovieItem(Item):
11 | name = Field()
12 | cover = Field()
13 | categories = Field()
14 | published_at = Field()
15 | drama = Field()
16 | score = Field()
17 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values/colors.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <resources>
3 | <color name="purple_200">#FFBB86FC</color>
4 | <color name="purple_500">#FF6200EE</color>
5 | <color name="purple_700">#FF3700B3</color>
6 | <color name="teal_200">#FF03DAC5</color>
7 | <color name="teal_700">#FF018786</color>
8 | <color name="black">#FF000000</color>
9 | <color name="white">#FFFFFFFF</color>
10 | </resources>
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class MovieItem(scrapy.Item):
10 | name = scrapy.Field()
11 | categories = scrapy.Field()
12 | score = scrapy.Field()
13 | drama = scrapy.Field()
14 | directors = scrapy.Field()
15 | actors = scrapy.Field()
16 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyspiderdemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyseleniumdemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapypyppeteerdemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyuniversaldemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code5.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code5.js
4 | * @time: 2022-01-14 11:40
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const s = "3|1|2".split("|");
10 | let x = 0;
11 | while (true) {
12 | switch (s[x++]) {
13 | case "1":
14 | const a = 1;
15 | continue;
16 | case "2":
17 | const b = 3;
18 | continue;
19 | case "3":
20 | const c = 0;
21 | continue;
22 | }
23 | break;
24 | }
--------------------------------------------------------------------------------
/src/ch13/files/frida_appbasic1.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: frida_appbasic1.js
4 | * @time: 2022-01-17 17:21
5 | * @project: python3-web-spider-learning
6 | * @desc: frida Appbasic1 Hook script
7 | */
8 |
9 | Java.perform(() => {
10 | let MainActivity = Java.use('com.germey.appbasic1.MainActivity')
11 | console.log('start hook')
12 | MainActivity.getMessage.implementation = (arg1, arg2) => {
13 | send('Start Hook!')
14 | return '6'
15 | }
16 | })
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyspidermiddlewaredemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapydownloadermiddlewaredemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: utils.py
6 | @time: 2022/1/20 15:52
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | import json
11 | from os.path import join, dirname, realpath
12 |
13 |
14 | def get_config(name):
15 | path = join(dirname(realpath(__file__)), 'configs', f'{name}.json')
16 | with open(path, 'r', encoding='utf-8') as f:
17 | return json.loads(f.read())
18 |
--------------------------------------------------------------------------------
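get_config resolves a rule file by name from the package's configs directory, so loading the bundled movie rules takes a single call. A minimal usage sketch, assuming the scrapyuniversaldemo package is importable from the project root:

from scrapyuniversaldemo.utils import get_config

# Reads configs/movie.json relative to the scrapyuniversaldemo package
config = get_config('movie')
print(config)
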
/src/ch13/AndServerTest/app/src/test/java/com/germey/andservertest/ExampleUnitTest.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import org.junit.Test;
4 |
5 | import static org.junit.Assert.*;
6 |
7 | /**
8 | * Example local unit test, which will execute on the development machine (host).
9 | *
10 | * @see <a href="http://d.android.com/tools/testing">Testing documentation</a>
11 | */
12 | public class ExampleUnitTest {
13 | @Test
14 | public void addition_isCorrect() {
15 | assertEquals(4, 2 + 2);
16 | }
17 | }
--------------------------------------------------------------------------------
/src/ch13/files/frida_rpc_app9.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: frida_rpc_app9.js
4 | * @time: 20:06
5 | * @project: python3-web-spider-learning
6 | * @desc: frida RPC App9 Hook script
7 | */
8 |
9 | rpc.exports = {
10 | encrypt(string, offset) {
11 | let token = null;
12 | Java.perform(function () {
13 | var util = Java.use("com.goldze.mvvmhabit.utils.NativeUtils").$new();
14 | token = util.encrypt(string, offset)
15 | });
16 | return token;
17 | }
18 | }
--------------------------------------------------------------------------------
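The rpc.exports block above is meant to be driven from Python through frida's RPC bridge. A minimal sketch of such a driver, assuming the target process name 'App9' (the repo's own frida_rpc_demo.py is not shown in this dump, so this is illustrative only):

import frida

CODE = open('files/frida_rpc_app9.js', encoding='utf-8').read()

# Attach to the running app over USB and inject the hook script above
session = frida.get_usb_device().attach('App9')
script = session.create_script(CODE)
script.load()

# Call the 'encrypt' function defined in rpc.exports
token = script.exports.encrypt('hello', 3)
print(token)
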
/src/ch07/selenium_demo/back_forward.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: back_forward.py
6 | @time: 2022/1/7 15:21
7 | @project: python3-web-spider-learning
8 | @desc: Back and forward navigation (P221)
9 | """
10 | import time
11 |
12 | from selenium import webdriver
13 |
14 | browser = webdriver.Chrome()
15 | browser.get('https://www.baidu.com/')
16 | browser.get('https://www.taobao.com/')
17 | browser.get('https://www.python.org')
18 | browser.back()
19 | time.sleep(1)
20 | browser.forward()
21 | browser.close()
22 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/nodejs_client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: nodejs_client.py
6 | @time: 2022/1/13 22:13
7 | @project: python3-web-spider-learning
8 | @desc: Calling a Node.js service from Python (P453)
9 | """
10 |
11 | import requests
12 |
13 | data = {
14 | "name": "凯文-杜兰特",
15 | "image": "durant.png",
16 | "birthday": "1988-09-29",
17 | "height": "208cm",
18 | "weight": "108.9KG"
19 | }
20 |
21 | url = 'http://localhost:3000'
22 | response = requests.post(url, json=data)
23 | print(response.text)
24 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_simple_demo.py
6 | @time: 2022/1/6 17:23
7 | @project: python3-web-spider-learning
8 | @desc: Defining a coroutine (P194)
9 | """
10 | import asyncio
11 |
12 |
13 | async def execute(x):
14 | print('Number:', x)
15 |
16 | coroutine = execute(1)
17 | print('Coroutine:', coroutine)
18 | print('After calling execute')
19 |
20 | loop = asyncio.get_event_loop()
21 | # Register the coroutine object with the event loop
22 | loop.run_until_complete(coroutine)
23 | print('After calling loop')
24 |
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/dev_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: dev_mode.py
6 | @time: 2022/1/10 9:27
7 | @project: python3-web-spider-learning
8 | @desc: Debug mode (P247)
9 | """
10 | import asyncio
11 |
12 | from pyppeteer import launch
13 |
14 |
15 | async def main():
16 | browser = await launch(devtools=True, args=['--disable-infobars'])
17 | page = await browser.newPage()
18 | await page.goto('https://www.baidu.com')
19 | await asyncio.sleep(100)
20 |
21 | asyncio.get_event_loop().run_until_complete(main())
22 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/basic/basic1.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: basic1.js
4 | * @time: 2022-01-14 09:47:06
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | import {parse} from "@babel/parser"
10 | import generate from "@babel/generator"
11 | import fs from "fs"
12 |
13 | const code = fs.readFileSync("../codes/code1.js", "utf-8")
14 | let ast = parse(code)
15 | console.log(ast)
16 | console.log(ast.program.body)
17 |
18 | const {code: output} = generate(ast, {
19 | retainLines: true,
20 | comments: false,
21 | });
22 | console.log(output)
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/loaders.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: loaders.py
6 | @time: 2022/1/20 16:36
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 |
11 | from scrapy.loader import ItemLoader
12 | from itemloaders.processors import TakeFirst, Identity, Compose
13 |
14 |
15 | class MovieItemLoader(ItemLoader):
16 | default_output_processor = TakeFirst()
17 | categories_out = Identity()
18 | score_out = Compose(TakeFirst(), str.strip)
19 | drama_out = Compose(TakeFirst(), str.strip)
--------------------------------------------------------------------------------
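MovieItemLoader only declares output processors; field values still have to be fed in from a spider callback. A minimal sketch of how such a loader is typically used (the XPath expressions below are placeholders for illustration, not the repo's actual rules):

from scrapyuniversaldemo.items import MovieItem
from scrapyuniversaldemo.loaders import MovieItemLoader

def parse_detail(response):
    loader = MovieItemLoader(item=MovieItem(), response=response)
    # Placeholder selectors, for illustration only
    loader.add_xpath('name', '//h2/text()')
    loader.add_xpath('categories', '//div[@class="categories"]//span/text()')
    loader.add_xpath('score', '//p[@class="score"]/text()')
    # The TakeFirst/Identity/Compose output processors run here
    return loader.load_item()
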
/src/ch02/files/lwp_cookie.txt:
--------------------------------------------------------------------------------
1 | #LWP-Cookies-2.0
2 | Set-Cookie3: BAIDUID="658C2C37B45D9239BAC08ECC578950E0:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2022-12-29 08:42:08Z"; comment=bd; version=0
3 | Set-Cookie3: BIDUPSID=658C2C37B45D92392188E29355D808F6; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-01-16 11:56:15Z"; version=0
4 | Set-Cookie3: PSTM=1640767327; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-01-16 11:56:15Z"; version=0
5 | Set-Cookie3: BD_NOT_HTTPS=1; path="/"; domain="www.baidu.com"; path_spec; expires="2021-12-29 08:47:08Z"; version=0
6 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/build.gradle:
--------------------------------------------------------------------------------
1 | // Top-level build file where you can add configuration options common to all sub-projects/modules.
2 | buildscript {
3 | repositories {
4 | google()
5 | mavenCentral()
6 | }
7 | dependencies {
8 | classpath 'com.android.tools.build:gradle:4.1.3'
9 | classpath 'com.yanzhenjie.andserver:plugin:2.1.9'
10 | // NOTE: Do not place your application dependencies here; they belong
11 | // in the individual module build.gradle files
12 | }
13 | }
14 |
15 |
16 | task clean(type: Delete) {
17 | delete rootProject.buildDir
18 | }
--------------------------------------------------------------------------------
/src/ch15/scrape_selector_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_selector_demo.py
6 | @time: 2022/1/19 14:08
7 | @project: python3-web-spider-learning
8 | @desc: 15.3 Using Selectors (P754)
9 | """
10 | from scrapy import Selector
11 |
12 |
13 | def selector_demo():
14 | # Use the Selector directly, without a running crawler
15 | body = '<html><head><title>Hello World</title></head><body></body></html>'
16 | selector = Selector(text=body)
17 | title = selector.xpath('//title/text()').extract_first()
18 | print(title)
19 |
20 |
21 | if __name__ == '__main__':
22 | selector_demo()
23 |
--------------------------------------------------------------------------------
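The same Selector object also supports CSS selectors and regular expressions, which the standalone demo above could exercise with two more lines. A small companion sketch:

from scrapy import Selector

body = '<html><head><title>Hello World</title></head><body></body></html>'
selector = Selector(text=body)
# CSS equivalent of the XPath query in scrape_selector_demo.py
print(selector.css('title::text').extract_first())
# Regex applied to the XPath result
print(selector.xpath('//title/text()').re_first(r'Hello\s(\w+)'))
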
/src/ch07/selenium_demo/cookie_oper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: cookie_oper.py
6 | @time: 2022/1/7 15:28
7 | @project: python3-web-spider-learning
8 | @desc: Cookie operations (P222)
9 | """
10 | from selenium import webdriver
11 |
12 | browser = webdriver.Chrome()
13 | browser.get('https://www.zhihu.com/explore')
14 | print(browser.get_cookies())
15 | browser.add_cookie({'name': 'name',
16 | 'domain': 'www.zhihu.com',
17 | 'value': 'germey'})
18 | print(browser.get_cookies())
19 | browser.delete_all_cookies()
20 | print(browser.get_cookies())
21 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_task1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_task1.py
6 | @time: 2022/1/6 17:31
7 | @project: python3-web-spider-learning
8 | @desc: Using coroutine tasks (P194)
9 | """
10 | import asyncio
11 |
12 |
13 | async def execute(x):
14 | print('Number:', x)
15 | return x
16 |
17 | coroutine = execute(1)
18 | print('Coroutine:', coroutine)
19 | print('After calling execute')
20 |
21 | loop = asyncio.get_event_loop()
22 | task = loop.create_task(coroutine)
23 | print('Task:', task)
24 | loop.run_until_complete(task)
25 | print('Task:', task)
26 | print('After calling loop')
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_task2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_task2.py
6 | @time: 2022/1/6 18:50
7 | @project: python3-web-spider-learning
8 | @desc: Using coroutine tasks (P195)
9 | """
10 | import asyncio
11 |
12 |
13 | async def execute(x):
14 | print('Number:', x)
15 | return x
16 |
17 | coroutine = execute(1)
18 | print('Coroutine:', coroutine)
19 | print('After calling execute')
20 |
21 | task = asyncio.ensure_future(coroutine)
22 | print('Task:', task)
23 | loop = asyncio.get_event_loop()
24 | loop.run_until_complete(task)
25 | print('Task:', task)
26 | print('After calling loop')
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/tab_oper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: tab_oper.py
6 | @time: 2022/1/7 15:32
7 | @project: python3-web-spider-learning
8 | @desc: Tab management (P222)
9 | """
10 | import time
11 |
12 | from selenium import webdriver
13 |
14 | browser = webdriver.Chrome()
15 | browser.get('https://www.baidu.com')
16 | browser.execute_script('window.open()')
17 | print(browser.window_handles)
18 | browser.switch_to.window(browser.window_handles[1])
19 | browser.get('https://www.taobao.com')
20 | time.sleep(1)
21 | browser.switch_to.window(browser.window_handles[0])
22 | browser.get('https://python.org')
--------------------------------------------------------------------------------
/src/ch13/frida_appbasic1_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: frida_appbasic1_demo.py
6 | @time: 2022/1/17 17:20
7 | @project: python3-web-spider-learning
8 | @desc: 13.5 Frida的使用,AppBasic1(P645)
9 | """
10 | import sys
11 |
12 | import frida
13 |
14 | CODE = open('files/frida_appbasic1.js', encoding='utf-8').read()
15 | PROCESS_NAME = 'AppBasic1'
16 |
17 |
18 | def on_message(message, data):
19 | print(message)
20 |
21 |
22 | process = frida.get_usb_device().attach(PROCESS_NAME)
23 | script = process.create_script(CODE)
24 | script.on('message', on_message)
25 | script.load()
26 | sys.stdin.read()
27 |
--------------------------------------------------------------------------------
/src/ch13/frida_appbasic2_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: frida_appbasic2_demo.py
6 | @time: 2022/1/17 17:43
7 | @project: python3-web-spider-learning
8 | @desc: 13.5 Frida的使用,AppBasic2(P648)
9 | """
10 | import sys
11 |
12 | import frida
13 |
14 | CODE = open('files/frida_appbasic2.js', encoding='utf-8').read()
15 | PROCESS_NAME = 'AppBasic2'
16 |
17 |
18 | def on_message(message, data):
19 | print(message)
20 |
21 |
22 | process = frida.get_usb_device().attach(PROCESS_NAME)
23 | script = process.create_script(CODE)
24 | script.on('message', on_message)
25 | script.load()
26 | sys.stdin.read()
27 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/timeout_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: timeout_demo.py
6 | @time: 2022/1/6 19:58
7 | @project: python3-web-spider-learning
8 | @desc: 超时设置(P205)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | timeout = aiohttp.ClientTimeout(total=1)
17 | async with aiohttp.ClientSession(timeout=timeout) as session:
18 | async with session.get('https://www.httpbin.org/get') as response:
19 | print('status:', response.status)
20 |
21 |
22 | if __name__ == '__main__':
23 | asyncio.get_event_loop().run_until_complete(main())
24 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/headless_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: headless_mode.py
6 | @time: 2022/1/7 15:49
7 | @project: python3-web-spider-learning
8 | @desc: 无头模式(P225)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver import ChromeOptions
12 | import os
13 |
14 | option = ChromeOptions()
15 | option.add_argument('--headless')
16 | browser = webdriver.Chrome(options=option)
17 | browser.set_window_size(1366, 768)
18 | browser.get('https://www.baidu.com')
19 |
20 | if not os.path.exists('files'):
21 | os.makedirs('files')
22 |
23 | browser.get_screenshot_as_file('files/preview.png')
24 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/exception_handle.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: exception_handle.py
6 | @time: 2022/1/7 15:35
7 | @project: python3-web-spider-learning
8 | @desc: 异常处理(P223)
9 | """
10 | from selenium import webdriver
11 | from selenium.common.exceptions import TimeoutException, NoSuchElementException
12 |
13 | browser = webdriver.Chrome()
14 | try:
15 | browser.get('https://www.baidu.com')
16 | except TimeoutException:
17 | print('Time Out')
18 |
19 | try:
20 | browser.find_element_by_id('hello')
21 | except NoSuchElementException:
22 | print('No Element')
23 | finally:
24 | browser.close()
25 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/post_request.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: post_request.py
6 | @time: 2022/1/6 19:52
7 | @project: python3-web-spider-learning
8 | @desc: POST请求(P203)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | data = {
17 | 'name': 'germey',
18 | 'age': 25
19 | }
20 | async with aiohttp.ClientSession() as session:
21 | async with session.post('https://www.httpbin.org/post', data=data) as response:
22 | print(await response.text())
23 |
24 |
25 | if __name__ == '__main__':
26 | asyncio.get_event_loop().run_until_complete(main())
27 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/url_params.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: url_params.py
6 | @time: 2022/1/6 19:49
7 | @project: python3-web-spider-learning
8 | @desc: URL参数设置(P203)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | params = {
17 | 'name': 'germey',
18 | 'age': 25
19 | }
20 | async with aiohttp.ClientSession() as session:
21 | async with session.get('https://www.httpbin.org/get', params=params) as response:
22 | print(await response.text())
23 |
24 |
25 | if __name__ == '__main__':
26 | asyncio.get_event_loop().run_until_complete(main())
27 |
--------------------------------------------------------------------------------
/src/ch14/ai_extract.md:
--------------------------------------------------------------------------------
1 | # Intelligent Parsing: Implementation Notes
2 |
3 | ## 1 Detail-page parsing
4 | 1. Extract the title: collect the page's h nodes, compare each node's text with the text of the title node, and take the most similar candidate as the detail-page title (a minimal sketch follows this list)
5 | 2. Extract the time: apply a set of meta rules plus datetime-matching rules to obtain the publish time
6 | 3. Extract the body: preprocess the page (remove useless tags together with their content, remove empty tag pairs, remove noise tags), score each node by text density and symbol density, take the highest-scoring node as the one containing the body, and join the pieces to produce the body text
7 |
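A minimal sketch of the title step, assuming an lxml-parsed page and using difflib's SequenceMatcher as the similarity measure (the metric and the h1-h3 scope are illustrative choices, not necessarily the exact ones used in the book's implementation):

```python
from difflib import SequenceMatcher

from lxml.html import fromstring


def extract_title(html):
    """Pick the h node whose text is most similar to the <title> text."""
    tree = fromstring(html)
    title = (tree.findtext('.//title') or '').strip()
    best, best_score = title, 0.0
    for node in tree.xpath('//h1 | //h2 | //h3'):
        text = node.text_content().strip()
        if not text:
            continue
        score = SequenceMatcher(None, text, title).ratio()
        if score > best_score:
            best, best_score = text, score
    return best


html = ('<html><head><title>This is a Movie - Example Site</title></head>'
        '<body><h2>This is a Movie</h2><p>body...</p></body></html>')
print(extract_title(html))  # -> This is a Movie
```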
8 | ## 2 List-page parsing
9 | 1. Preprocess the data: clean the page content in the same way as the body-extraction preprocessing for detail pages
10 | 2. Select group nodes: use a parent-node selector plus constraints (minimum number of siblings, minimum and maximum text length of member nodes, minimum similarity between siblings) to collect qualifying group nodes (see the sketch after this list)
11 | 3. Merge group nodes: merge and categorize the group nodes with a simple clustering method
12 | 4. Pick the best group node: score each group by member-node count, average text-length distribution, and text density, and keep the highest-scoring group
13 | 5. Extract titles and links: compute a confidence value from title length to find the best node path, then extract the title and link from each member node
14 |
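A rough sketch of step 2 (selecting group nodes), where sibling similarity is approximated by requiring siblings to share the same structural signature; the signature function and the thresholds are illustrative assumptions, not the exact rules used in the book:

```python
from lxml.html import fromstring


def signature(node):
    # Crude structural signature: the node's tag plus the tags of its direct children
    return node.tag, tuple(child.tag for child in node if isinstance(child.tag, str))


def candidate_groups(html, min_siblings=4, min_text=5, max_text=200):
    tree = fromstring(html)
    groups = []
    for parent in tree.xpath('//body//*'):
        children = [c for c in parent if isinstance(c.tag, str)]
        if len(children) < min_siblings:
            continue
        # Siblings must look alike and carry a sensible amount of text
        if len({signature(c) for c in children}) != 1:
            continue
        texts = [c.text_content().strip() for c in children]
        if all(min_text <= len(t) <= max_text for t in texts):
            groups.append(parent)
    return groups


html = ('<html><body><ul>' +
        ''.join(f'<li><a href="/item/{i}">News item number {i}</a></li>' for i in range(6)) +
        '</ul></body></html>')
print([g.tag for g in candidate_groups(html)])  # -> ['ul']
```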
15 | ## 3 Telling list pages and detail pages apart
16 | Train an SVM model on page features (text density, number and ratio of hyperlink nodes, symbol density, number of list clusters, meta information, similarity between the body title and the title node text); after data processing and training, the resulting classifier is used to distinguish list pages from detail pages.
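A schematic sketch of that classifier, assuming the features above have already been computed into fixed-length numeric vectors; scikit-learn's SVC stands in for the SVM, and the feature values and labels below are illustrative placeholders rather than real training data:

```python
from sklearn.svm import SVC

# Each row: [text_density, link_node_ratio, symbol_density, list_cluster_count]
# (made-up numbers; a real feature extractor would compute these per page)
X = [
    [0.72, 0.05, 0.09, 1],   # detail page
    [0.68, 0.08, 0.11, 0],   # detail page
    [0.21, 0.55, 0.02, 9],   # list page
    [0.18, 0.61, 0.03, 12],  # list page
]
y = [0, 0, 1, 1]  # 0 = detail page, 1 = list page

model = SVC(kernel='rbf')
model.fit(X, y)
print(model.predict([[0.25, 0.50, 0.02, 8]]))  # expected: [1], i.e. a list page
```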
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/node_interaction.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: node_interaction.py
6 | @time: 2022/1/7 10:20
7 | @project: python3-web-spider-learning
8 | @desc: 节点交互(P216)
9 | """
10 | import time
11 |
12 | from selenium import webdriver
13 |
14 | browser = webdriver.Chrome()
15 | browser.get('https://www.taobao.com')
16 | # 得到搜索框
17 | input = browser.find_element_by_id('q')
18 | # 输入搜索词“iPhone”
19 | input.send_keys('iPhone')
20 | time.sleep(1)
21 | # 清空搜索框
22 | input.clear()
23 | # 输入搜索词“iPad”
24 | input.send_keys('iPad')
25 | # 得到搜索按钮
26 | button = browser.find_element_by_class_name('btn-search')
27 | # 点击搜索按钮
28 | button.click()
29 |
--------------------------------------------------------------------------------
/src/ch11/execjs_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: execjs_demo.py
6 | @time: 2022/1/13 21:35
7 | @project: python3-web-spider-learning
8 | @desc: 11.5 使用Python模拟执行javascript(P446)
9 | """
10 |
11 | import execjs
12 | import json
13 |
14 | item = {
15 | "name": "勒布朗-詹姆斯",
16 | "image": "james.png",
17 | "birthday": "1984-12-30",
18 | "height": "206cm",
19 | "weight": "113.4KG"
20 | }
21 |
22 | file = 'files/execjs_crypto.js'
23 | node = execjs.get()
24 | ctx = node.compile(open(file).read())
25 |
26 | js = f"getToken({json.dumps(item, ensure_ascii=False)})"
27 | print(js)
28 | result = ctx.eval(js)
29 | print(result)
30 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/multi_task_coroutine.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: multi_task_coroutine.py
6 | @time: 2022/1/6 18:58
7 | @project: python3-web-spider-learning
8 | @desc: 多任务协程(P196)
9 | """
10 | import asyncio
11 |
12 | import requests
13 |
14 |
15 | async def request():
16 | url = 'https://www.baidu.com'
17 | status = requests.get(url)
18 | return status
19 |
20 |
21 | tasks = [asyncio.ensure_future(request()) for _ in range(5)]
22 | print('Task:', tasks)
23 |
24 | loop = asyncio.get_event_loop()
25 | loop.run_until_complete(asyncio.wait(tasks))
26 |
27 | for task in tasks:
28 | print('Task Result:', task.result())
29 |
--------------------------------------------------------------------------------
/src/ch11/pywasm_scrape_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: pywasm_scrape_demo.py
6 | @time: 2022/1/14 15:00
7 | @project: python3-web-spider-learning
8 | @desc: 11.11 WebAssembly案例分析和爬取实战(P495)
9 | """
10 | import time
11 |
12 | import pywasm
13 | import requests
14 |
15 | BASE_URL = 'https://spa14.scrape.center'
16 | TOTAL_PAGE = 10
17 |
18 | runtime = pywasm.load('files/Wasm.wasm')
19 | for i in range(TOTAL_PAGE):
20 | offset = i * 10
21 | sign = runtime.exec('encrypt', [offset, int(time.time())])
22 | url = f'{BASE_URL}/api/movie/?limit=10&offset={offset}&sign={sign}'
23 | response = requests.get(url)
24 | print(response.json())
25 |
--------------------------------------------------------------------------------
/src/ch13/files/frida_appbasic2.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 |  * @file: frida_appbasic2.js
4 | * @time: 2022-01-17 17:39
5 | * @project: python3-web-spider-learning
6 | * @desc: frida Appbasic2 Hook script
7 | */
8 |
9 | Java.perform(function () {
10 | Interceptor.attach(Module.findExportByName('libnative.so', 'Java_com_appbasic2_MainActivity_getMessage'), {
11 | onEnter: function (args) {
12 | send('hook onEnter')
13 | send('args[1]=' + args[2])
14 | send('args[2]=' + args[3])
15 | },
16 | onLeave: function (val) {
17 | send('hook Leave')
18 | val.replace(Java.vm.getEnv().newStringUtf('5'))
19 | }
20 | })
21 | })
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/spiders/httpbin.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import Request
3 |
4 | from ch15.scrapyspidermiddlewaredemo.scrapyspidermiddlewaredemo.items import DemoItem
5 |
6 |
7 | class HttpbinSpider(scrapy.Spider):
8 | name = 'httpbin'
9 | allowed_domains = ['www.httpbin.org']
10 | start_url = 'https://www.httpbin.org/get'
11 |
12 | def start_requests(self):
13 | for i in range(5):
14 | url = f'{self.start_url}?query={i}'
15 | yield Request(url, callback=self.parse)
16 |
17 | def parse(self, response):
18 | item = DemoItem(**response.json())
19 | print('Status:', response.status)
20 | yield item
21 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/action_chain.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: action_chain.py
6 | @time: 2022/1/7 10:25
7 | @project: python3-web-spider-learning
8 | @desc: 动作链(P217)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver import ActionChains
12 |
13 | browser = webdriver.Chrome()
14 | url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
15 | browser.get(url)
16 | browser.switch_to.frame('iframeResult')
17 | source = browser.find_element_by_css_selector('#draggable')
18 | target = browser.find_element_by_css_selector('#droppable')
19 | actions = ActionChains(browser)
20 | actions.drag_and_drop(source, target)
21 | actions.perform()
22 |
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/mobile_web.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: mobile_web.py
6 | @time: 2022/1/10 19:48
7 | @project: python3-web-spider-learning
8 | @desc: 支持移动端浏览器(P261)
9 | """
10 | from playwright.sync_api import sync_playwright
11 |
12 | with sync_playwright() as p:
13 | iphone_12_pro_max = p.devices['iPhone 12 Pro Max']
14 | browser = p.webkit.launch(headless=False)
15 | context = browser.new_context(**iphone_12_pro_max, locale='zh-CN')
16 | page = context.new_page()
17 | page.goto('https://www.whatismybrowser.com')
18 | page.wait_for_load_state(state='networkidle')
19 | page.screenshot(path='files/browser-iphone.png')
20 | browser.close()
21 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/bing_callback.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: bing_callback.py
6 | @time: 2022/1/6 18:53
7 | @project: python3-web-spider-learning
8 | @desc: 绑定回调(P196)
9 | """
10 | import asyncio
11 |
12 | import requests
13 |
14 |
15 | async def request():
16 | url = 'https://www.baidu.com'
17 | status = requests.get(url)
18 | return status
19 |
20 |
21 | def callback(task):
22 | print('Status:', task.result())
23 |
24 |
25 | coroutine = request()
26 | task = asyncio.ensure_future(coroutine)
27 | task.add_done_callback(callback)
28 | print('Task:', task)
29 |
30 | loop = asyncio.get_event_loop()
31 | loop.run_until_complete(task)
32 | print('Task:', task)
33 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: utils.py
6 | @time: 2022/1/12 10:23
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | import re
11 |
12 |
13 | def parse_redis_connection_string(connection_string):
14 | """
15 | parse a redis connection string, for example:
16 | redis://[password]@host:port
17 | rediss://[password]@host:port
18 | :param connection_string:
19 | :return:
20 | """
21 | result = re.match(r'rediss?://(.*?)@(.*?):(\d+)(?:/(\d+))?', connection_string)
22 | return (result.group(2), int(result.group(3)), result.group(1) or None, int(result.group(4) or 0)) \
23 | if result else ('localhost', 6379, None, 0)
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: server.py
6 | @time: 2022/1/19 18:38
7 | @project: python3-web-spider-learning
8 | @desc: 15.8 Extension的使用(P793)
9 | """
10 | from flask import Flask, request, jsonify
11 | from loguru import logger
12 |
13 | app = Flask(__name__)
14 |
15 |
16 | @app.route('/notify', methods=['POST'])
17 | def receive():
18 | post_data = request.get_json()
19 | event = post_data.get('event')
20 | data = post_data.get('data')
21 | logger.debug(f'received event {event}, data {data}')
22 | return jsonify(status='success')
23 |
24 |
25 | if __name__ == '__main__':
26 | app.run(debug=True, host='0.0.0.0', port=5000)
27 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code4.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code4.js
4 | * @time: 11:29
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const _0x16c18d = function () {
10 | if (!![[]]) {
11 | console.log("hello world");
12 | } else {
13 | console.log("this");
14 | console.log("is");
15 | console.log("dead");
16 | console.log("code");
17 | }
18 | };
19 | const _0x1f7292 = function () {
20 | if ("xmv2nOdfy2N".charAt(4) !== String.fromCharCode(110)) {
21 | console.log("this");
22 | console.log("is");
23 | console.log("dead");
24 | console.log("code");
25 | } else {
26 | console.log("nice to meet you");
27 | }
28 | };
29 |
30 | _0x16c18d();
31 | _0x1f7292();
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/robotparser_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: robotparser_demo.py
6 | @time: 2021/12/31 13:39
7 | @project: python3-web-spider-learning
8 | @desc: Robots协议(P46)
9 | """
10 |
11 | from urllib.robotparser import RobotFileParser
12 |
13 |
14 | def print_can_fetch(rp, spider, url):
15 | print(rp.can_fetch(spider, url))
16 |
17 |
18 | if __name__ == '__main__':
19 | rp = RobotFileParser()
20 | rp.set_url('https://www.baidu.com/robots.txt')
21 | rp.read()
22 | print_can_fetch(rp, 'Baiduspider', 'https://www.baidu.com')
23 | print_can_fetch(rp, 'Baiduspider', 'https://www.baidu.com/homepage/')
24 | print_can_fetch(rp, 'Googlebot', 'https://www.baidu.com/homepage/')
25 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/anti_shield.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: anti_shield.py
6 | @time: 2022/1/7 15:40
7 | @project: python3-web-spider-learning
8 | @desc: 反屏蔽(P224)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver import ChromeOptions
12 |
13 | option = ChromeOptions()
14 | option.add_experimental_option('excludeSwitches', ['enable-automation'])
15 | option.add_experimental_option('useAutomationExtension', False)
16 | browser = webdriver.Chrome(options=option)
17 | browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
18 | 'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
19 | })
20 | browser.get('https://antispider1.scrape.center')
21 |
22 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/switch_frame.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: switch_frame.py
6 | @time: 2022/1/7 10:41
7 | @project: python3-web-spider-learning
8 | @desc: 切换Frame(P219)
9 | """
10 | from selenium import webdriver
11 | from selenium.common.exceptions import NoSuchElementException
12 |
13 | browser = webdriver.Chrome()
14 | url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
15 | browser.get(url)
16 | browser.switch_to.frame('iframeResult')
17 | try:
18 | logo = browser.find_element_by_class_name('logo')
19 | except NoSuchElementException:
20 | print('No Logo')
21 |
22 | browser.switch_to.parent_frame()
23 | logo = browser.find_element_by_class_name('logo')
24 | print(logo)
25 | print(logo.text)
26 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/6 19:21
7 | @project: python3-web-spider-learning
8 | @desc: aiohttp基本实例(P202)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def fetch(session, url):
16 | async with session.get(url) as response:
17 | return await response.text(), response.status
18 |
19 |
20 | async def main():
21 | async with aiohttp.ClientSession() as session:
22 | html, status = await fetch(session, 'https://cuiqingcai.com')
23 | print(f'html: {html[:100]}...')
24 | print(f'status: {status}')
25 |
26 |
27 | if __name__ == '__main__':
28 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
29 | asyncio.run(main())
30 |
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/incognito_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: incognito_mode.py
6 | @time: 2022/1/10 18:26
7 | @project: python3-web-spider-learning
8 | @desc: 无痕模式(P252)
9 | """
10 | import asyncio
11 |
12 | from pyppeteer import launch
13 |
14 | width, height = 1366, 768
15 |
16 |
17 | async def main():
18 | # 设置浏览器窗口大小
19 | browser = await launch(headless=False, args=['--disable-infobars', f'--window-size={width}, {height}'])
20 | context = await browser.createIncognitoBrowserContext()
21 | page = await context.newPage()
22 | # 设置页面大小
23 | await page.setViewport({'width': width, 'height': height})
24 | await page.goto('https://www.baidu.com/')
25 | await asyncio.sleep(100)
26 |
27 |
28 | asyncio.get_event_loop().run_until_complete(main())
29 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/node_selector.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: node_selector.py
6 | @time: 2022/1/7 10:04
7 | @project: python3-web-spider-learning
8 | @desc: 查找节点(P215-P216)
9 | """
10 | from selenium import webdriver
11 |
12 | browser = webdriver.Chrome()
13 | browser.get('https://www.taobao.com')
14 |
15 |
16 | def get_single_node():
17 | input_first = browser.find_element_by_id('q')
18 | input_second = browser.find_element_by_css_selector('#q')
19 | input_third = browser.find_element_by_xpath('//*[@id="q"]')
20 | print(input_first, input_second, input_third)
21 |
22 |
23 | def get_nodes():
24 | lis = browser.find_elements_by_css_selector('.service-bd li')
25 | print(lis)
26 |
27 |
28 | if __name__ == '__main__':
29 | get_nodes()
30 | browser.close()
31 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/proguard-rules.pro:
--------------------------------------------------------------------------------
1 | # Add project specific ProGuard rules here.
2 | # You can control the set of applied configuration files using the
3 | # proguardFiles setting in build.gradle.
4 | #
5 | # For more details, see
6 | # http://developer.android.com/guide/developing/tools/proguard.html
7 |
8 | # If your project uses WebView with JS, uncomment the following
9 | # and specify the fully qualified class name to the JavaScript interface
10 | # class:
11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview {
12 | # public *;
13 | #}
14 |
15 | # Uncomment this to preserve the line number information for
16 | # debugging stack traces.
17 | #-keepattributes SourceFile,LineNumberTable
18 |
19 | # If you keep the line number information, uncomment this to
20 | # hide the original source file name.
21 | #-renamesourcefileattribute SourceFile
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/java/com/germey/andservertest/AppController.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import com.goldze.mvvmhabit.utils.NativeUtils;
4 | import com.yanzhenjie.andserver.annotation.GetMapping;
5 | import com.yanzhenjie.andserver.annotation.QueryParam;
6 | import com.yanzhenjie.andserver.annotation.RestController;
7 |
8 | import org.json.JSONObject;
9 |
10 | import java.util.HashMap;
11 | import java.util.Map;
12 |
13 | @RestController
14 | public class AppController {
15 |
16 | @GetMapping("/encrypt")
17 | public JSONObject login(@QueryParam("string") String string,
18 | @QueryParam("offset") int offset) {
19 | Map<String, String> map = new HashMap<>();
20 | String sign = NativeUtils.encrypt(string, offset);
21 | map.put("sign", sign);
22 | return new JSONObject(map);
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/spiders/quotes.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 | from ch15.scrapytutorial.scrapytutorial.items import QuoteItem
4 |
5 |
6 | class QuotesSpider(scrapy.Spider):
7 | name = 'quotes'
8 | allowed_domains = ['quotes.toscrape.com']
9 | start_urls = ['http://quotes.toscrape.com/']
10 |
11 | def parse(self, response):
12 | quotes = response.css('.quote')
13 | for quote in quotes:
14 | item = QuoteItem()
15 | item['text'] = quote.css('.text::text').extract_first()
16 | item['author'] = quote.css('.author::text').extract_first()
17 | item['tags'] = quote.css('.tags .tag::text').extract()
18 | yield item
19 |
20 | next = response.css('.pager .next a::attr("href")').extract_first()
21 | url = response.urljoin(next)
22 | yield scrapy.Request(url=url, callback=self.parse)
23 |
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/prevent_detect.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: prevent_detect.py
6 | @time: 2022/1/10 18:12
7 | @project: python3-web-spider-learning
8 | @desc: 防止检测(P248-P250)
9 | """
10 | import asyncio
11 |
12 | from pyppeteer import launch
13 |
14 | width, height = 1366, 768
15 |
16 |
17 | async def main():
18 | # 设置浏览器窗口大小
19 | browser = await launch(headless=False, args=['--disable-infobars', f'--window-size={width}, {height}'])
20 | page = await browser.newPage()
21 | # 设置页面大小
22 | await page.setViewport({'width': width, 'height': height})
23 | await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {get: ()=> undefined})')
24 | await page.goto('https://antispider1.scrape.center/')
25 | await asyncio.sleep(100)
26 |
27 |
28 | asyncio.get_event_loop().run_until_complete(main())
29 |
--------------------------------------------------------------------------------
/src/ch11/wasmer_scrape_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: wasmer_scrape_demo.py
6 | @time: 2022/1/14 17:08
7 | @project: python3-web-spider-learning
8 | @desc: wasmer库实战
9 | """
10 | import time
11 |
12 | import requests
13 | from wasmer import engine, Store, Module, Instance
14 | from wasmer_compiler_cranelift import Compiler
15 |
16 | # 读取wasm文件
17 | store = Store(engine.JIT(Compiler))
18 | module = Module(store, open('files/Wasm.wasm', 'rb').read())
19 | instance = Instance(module)
20 |
21 | BASE_URL = 'https://spa14.scrape.center'
22 | TOTAL_PAGE = 10
23 |
24 | for i in range(TOTAL_PAGE):
25 | offset = i * 10
26 | sign = instance.exports.encrypt(offset, int(time.time()))
27 | url = f'{BASE_URL}/api/movie/?limit=10&offset={offset}&sign={sign}'
28 | response = requests.get(url)
29 | print(response.json())
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/response_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: response_demo.py
6 | @time: 2022/1/6 19:54
7 | @project: python3-web-spider-learning
8 | @desc: 响应(P205)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | data = {
17 | 'name': 'germey',
18 | 'age': 25
19 | }
20 | async with aiohttp.ClientSession() as session:
21 | async with session.post('https://www.httpbin.org/post', data=data) as response:
22 | print('status:', response.status)
23 | print('headers:', response.headers)
24 | print('body:', await response.text())
25 | print('bytes:', await response.read())
26 | print('json:', await response.json())
27 |
28 |
29 | if __name__ == '__main__':
30 | asyncio.get_event_loop().run_until_complete(main())
31 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/node_info.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: node_info.py
6 | @time: 2022/1/7 10:36
7 | @project: python3-web-spider-learning
8 | @desc: 获取节点信息(P218)
9 | """
10 | from selenium import webdriver
11 |
12 | browser = webdriver.Chrome()
13 | url = 'https://spa2.scrape.center/'
14 | browser.get(url)
15 |
16 |
17 | def get_attr():
18 | logo = browser.find_element_by_class_name('logo-image')
19 | print(logo)
20 | print(logo.get_attribute('src'))
21 |
22 |
23 | def get_text():
24 | input = browser.find_element_by_class_name('logo-title')
25 | print(input.text)
26 |
27 |
28 | def get_other_info():
29 | input = browser.find_element_by_class_name('logo-title')
30 | print(input.id)
31 | print(input.location)
32 | print(input.tag_name)
33 | print(input.size)
34 |
35 |
36 | if __name__ == '__main__':
37 | get_other_info()
38 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values/themes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values-night/themes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/androidTest/java/com/germey/andservertest/ExampleInstrumentedTest.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import android.content.Context;
4 |
5 | import androidx.test.platform.app.InstrumentationRegistry;
6 | import androidx.test.ext.junit.runners.AndroidJUnit4;
7 |
8 | import org.junit.Test;
9 | import org.junit.runner.RunWith;
10 |
11 | import static org.junit.Assert.*;
12 |
13 | /**
14 | * Instrumented test, which will execute on an Android device.
15 | *
16 | * @see Testing documentation
17 | */
18 | @RunWith(AndroidJUnit4.class)
19 | public class ExampleInstrumentedTest {
20 | @Test
21 | public void useAppContext() {
22 | // Context of the app under test.
23 | Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
24 | assertEquals("com.germey.andservertest", appContext.getPackageName());
25 | }
26 | }
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/scrape_producer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_producer.py
6 | @time: 2022/1/6 15:10
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ实战 生产者(P171)
9 | """
10 | import pickle
11 |
12 | import pika
13 | import requests
14 |
15 | MAX_PRIORITY = 100
16 | TOTAL = 100
17 | QUEUE_NAME = 'scrape_queue'
18 |
19 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
20 | channel = connection.channel()
21 | channel.queue_declare(queue=QUEUE_NAME, durable=True)
22 |
23 | for i in range(1, TOTAL + 1):
24 | url = f'http://ssr1.scrape.center/detail/{i}'
25 | request = requests.Request('GET', url)
26 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME,
27 | properties=pika.BasicProperties(delivery_mode=2),
28 | body=pickle.dumps(request))
29 | print(f'Put request of {url}')
30 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/7 9:41
7 | @project: python3-web-spider-learning
8 | @desc: Selenium基本用法(P213)
9 | """
10 |
11 | from selenium import webdriver
12 | from selenium.webdriver.common.by import By
13 | from selenium.webdriver.common.keys import Keys
14 | from selenium.webdriver.support.wait import WebDriverWait
15 | from selenium.webdriver.support import expected_conditions as EC
16 |
17 | browser = webdriver.Chrome()
18 | try:
19 | browser.get('https://www.baidu.com')
20 | input = browser.find_element_by_id('kw')
21 | input.send_keys('Python')
22 | input.send_keys(Keys.ENTER)
23 | wait = WebDriverWait(browser, 10)
24 | wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
25 | print(browser.current_url)
26 | print(browser.get_cookies())
27 | print(browser.page_source)
28 | finally:
29 | browser.close()
--------------------------------------------------------------------------------
/src/ch13/andserver_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: andserver_demo.py
6 | @time: 2022/1/17 23:07
7 | @project: python3-web-spider-learning
8 | @desc: 13.10 基于AndServer-RPC模拟执行so文件(Python爬取数据)(P691)
9 | """
10 | import requests
11 |
12 | BASE_URL = 'https://app9.scrape.center'
13 | INDEX_URL = BASE_URL + '/api/movie?limit={limit}&offset={offset}&token={token}'
14 | ANDSERVER_URL = 'http://localhost:8080/encrypt?string={string}&offset={offset}'
15 | MAX_PAGE = 10
16 | LIMIT = 10
17 |
18 |
19 | def get_token(string, offset):
20 | andserver_url = ANDSERVER_URL.format(string=string, offset=offset)
21 | return requests.get(andserver_url).json().get('sign')
22 |
23 |
24 | for i in range(MAX_PAGE):
25 | offset = i * LIMIT
26 | token = get_token("/api/movie", offset)
27 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
28 | response = requests.get(index_url)
29 | print("response:", response.json())
30 |
--------------------------------------------------------------------------------
/src/ch13/jeb_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: jeb_demo.py
6 | @time: 2022/1/17 10:29
7 | @project: python3-web-spider-learning
8 | @desc: 13.2 JEB的使用(P624)
9 | """
10 | import base64
11 | import hashlib
12 | import time
13 |
14 | import requests
15 |
16 | INDEX_URL = 'https://app5.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}'
17 | MAX_PAGE = 10
18 | LIMIT = 10
19 |
20 |
21 | def get_token(args):
22 | timestamp = str(int(time.time()))
23 | args.append(timestamp)
24 | sign = hashlib.sha1(','.join(args).encode('utf-8')).hexdigest()
25 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8')
26 |
27 |
28 | for i in range(MAX_PAGE):
29 | offset = i * LIMIT
30 | token = get_token(args=['/api/movie'])
31 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
32 | response = requests.get(index_url)
33 | print('response:', response.json())
34 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/concurrency_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: concurrency_demo.py
6 | @time: 2022/1/6 20:01
7 | @project: python3-web-spider-learning
8 | @desc: 并发限制(P206)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 | CONCURRENCY = 5
15 | URL = 'https://www.baidu.com'
16 |
17 | semaphore = asyncio.Semaphore(CONCURRENCY)
18 | session = None
19 |
20 |
21 | async def scrape_api():
22 | async with semaphore:
23 | print('scraping', URL)
24 | async with session.get(URL) as response:
25 | await asyncio.sleep(1)
26 | return await response.text()
27 |
28 |
29 | async def main():
30 | global session
31 | session = aiohttp.ClientSession()
32 | scrape_index_tasks = [asyncio.ensure_future(scrape_api()) for _ in range(10000)]
33 | await asyncio.gather(*scrape_index_tasks)
34 |
35 |
36 | if __name__ == '__main__':
37 | asyncio.get_event_loop().run_until_complete(main())
38 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_await_aiohttp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_await_aiohttp.py
6 | @time: 2022/1/6 19:05
7 | @project: python3-web-spider-learning
8 | @desc: 协程实现,await、aiohttp的使用(P197)
9 | """
10 | import asyncio
11 | import time
12 |
13 | import aiohttp
14 |
15 | start = time.time()
16 |
17 |
18 | async def get(url):
19 | session = aiohttp.ClientSession()
20 | response = await session.get(url)
21 | await response.text()
22 | await session.close()
23 | return response
24 |
25 |
26 | async def request():
27 | url = 'https://www.httpbin.org/delay/5'
28 | print('Waiting for', url)
29 | response = await get(url)
30 | print('Get response from', url, 'response', response)
31 |
32 |
33 | tasks = [asyncio.ensure_future(request()) for _ in range(10)]
34 | loop = asyncio.get_event_loop()
35 | loop.run_until_complete(asyncio.wait(tasks))
36 |
37 | end = time.time()
38 | print('Cost time:', end - start)
39 |
--------------------------------------------------------------------------------
/src/ch13/frida_rpc_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: frida_rpc_demo.py
6 | @time: 2022/1/17 20:11
7 | @project: python3-web-spider-learning
8 | @desc: 13.9 基于Frida-RPC 模拟执行so文件(P683)
9 | """
10 | import frida
11 | import requests
12 |
13 | BASE_URL = 'https://app9.scrape.center'
14 | INDEX_URL = BASE_URL + '/api/movie?limit={limit}&offset={offset}&token={token}'
15 | MAX_PAGE = 10
16 | LIMIT = 10
17 |
18 | session = frida.get_usb_device().attach('App9')
19 | source = open('files/frida_rpc_app9.js', encoding='utf-8').read()
20 | script = session.create_script(source)
21 | script.load()
22 |
23 |
24 | def get_token(string, offset):
25 | return script.exports.encrypt(string, offset)
26 |
27 |
28 | for i in range(MAX_PAGE):
29 | offset = i * LIMIT
30 | token = get_token('/api/movie', offset)
31 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
32 | response = requests.get(index_url)
33 | print('response', response.json())
34 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/nodejs_main.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: nodejs_main.js
4 | * @time: 22:01
5 | * @project: python3-web-spider-learning
6 | * @desc: 11.6 使用Node.js模拟执行JavaScript(P451)
7 | */
8 |
9 | const CryptoJS = require("./files/crypto.js")
10 |
11 | function getToken(player) {
12 | let key = CryptoJS.enc.Utf8.parse("fipFfVsZsTda94hJNKJfLoaqyqMZFFimwLt");
13 | const {name, birthday, height, weight} = player;
14 | let base64Name = CryptoJS.enc.Base64.stringify(CryptoJS.enc.Utf8.parse(name));
15 | let encrypted = CryptoJS.DES.encrypt(
16 | `${base64Name}${birthday}${height}${weight}`,
17 | key, {
18 | mode: CryptoJS.mode.ECB,
19 | padding: CryptoJS.pad.Pkcs7,
20 | }
21 | );
22 | return encrypted.toString();
23 | }
24 |
25 | const player = {
26 | "name": "凯文-杜兰特",
27 | "image": "durant.png",
28 | "birthday": "1988-09-29",
29 | "height": "208cm",
30 | "weight": "108.9KG"
31 | }
32 |
33 | console.log(getToken(player))
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/scrape_consume.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_consume.py
6 | @time: 2022/1/6 15:10
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ实战 消费者(P172)
9 | """
10 | import pickle
11 |
12 | import pika
13 | import requests
14 |
15 | MAX_PRIORITY = 100
16 | QUEUE_NAME = 'scrape_queue'
17 |
18 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
19 | channel = connection.channel()
20 | session = requests.Session()
21 |
22 |
23 | def scrape(request):
24 | try:
25 | response = session.send(request.prepare())
26 | print(f'success scraped {response.url}')
27 | except requests.RequestException:
28 | print(f'error occurred when scraping {request.url}')
29 |
30 |
31 | while True:
32 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
33 | if body:
34 | request = pickle.loads(body)
35 | print(f'Get {request}')
36 | scrape(request)
37 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/AndroidManifest.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
13 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/20 11:29
7 | @project: python3-web-spider-learning
8 | @desc: 15.12 Scrapy规则化爬虫(实战,P818)
9 | """
10 | import argparse
11 |
12 | from scrapy.crawler import CrawlerProcess
13 | from scrapy.utils.project import get_project_settings
14 |
15 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo.utils import get_config
16 |
17 | parser = argparse.ArgumentParser(description='Universal Spider')
18 | parser.add_argument('name', help='name of spider to run')
19 | args = parser.parse_args()
20 | name = args.name
21 |
22 |
23 | def run():
24 | config = get_config(name)
25 | spider = config.get('spider', 'universal')
26 | project_settings = get_project_settings()
27 | settings = dict(project_settings.copy())
28 | settings.update(config.get('settings'))
29 | process = CrawlerProcess(settings)
30 | process.crawl(spider, **{'name': name})
31 | process.start()
32 |
33 |
34 | if __name__ == '__main__':
35 | run()
36 |
--------------------------------------------------------------------------------
/src/ch10/jwt_simulate_login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: jwt_simulate_login.py
6 | @time: 2022/1/12 9:49
7 | @project: python3-web-spider-learning
8 | @desc: 10.3 基于JWT的模拟登录爬取实战(P381)
9 | """
10 | from urllib.parse import urljoin
11 | import requests
12 |
13 | BASE_URL = 'https://login3.scrape.center/'
14 | LOGIN_URL = urljoin(BASE_URL, '/api/login')
15 | INDEX_URL = urljoin(BASE_URL, '/api/book')
16 | USERNAME = 'admin'
17 | PASSWORD = 'admin'
18 |
19 | response_login = requests.post(LOGIN_URL, json={
20 | 'username': USERNAME,
21 | 'password': PASSWORD
22 | })
23 | data = response_login.json()
24 | print('Response JSON:', data)
25 | # 获取token jwt
26 | jwt = data.get('token')
27 | print('JWT:', jwt)
28 |
29 | headers = {
30 | 'Authorization': f'jwt {jwt}'
31 | }
32 | response_index = requests.get(INDEX_URL, params={
33 | 'limit': 18,
34 | 'offset': 0
35 | }, headers=headers)
36 | print('Response Status', response_index.status_code)
37 | print('Response URL', response_index.url)
38 | print('Response Data', response_index.json())
39 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/delay_wait.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: delay_wait.py
6 | @time: 2022/1/7 15:05
7 | @project: python3-web-spider-learning
8 | @desc: 延时等待(P220)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver.common.by import By
12 | from selenium.webdriver.support.wait import WebDriverWait
13 | from selenium.webdriver.support import expected_conditions as EC
14 |
15 |
16 | def implicit_wait():
17 | browser = webdriver.Chrome()
18 | browser.implicitly_wait(10)
19 | browser.get('https://spa2.scrape.center/')
20 | input = browser.find_element_by_class_name('logo-image')
21 | print(input)
22 |
23 |
24 | def explicit_wait():
25 | browser = webdriver.Chrome()
26 | browser.get('https://www.taobao.com/')
27 | wait = WebDriverWait(browser, 10)
28 | input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
29 | button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
30 | print(input, button)
31 |
32 |
33 | if __name__ == '__main__':
34 | explicit_wait()
35 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: server.py
6 | @time: 2022/1/12 14:00
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | from flask import Flask, g
11 |
12 | from ch10.account_pool.setting import GENERATOR_MAP
13 | from ch10.account_pool.storages_redis import RedisClient
14 | from loguru import logger
15 |
16 | app = Flask(__name__)
17 |
18 | account = 'account'
19 | credential = 'credential'
20 |
21 |
22 | @app.route('/')
23 | def index():
24 | return '<h2>Welcome to Cookie Pool System</h2>'
25 |
26 |
27 | def get_conn():
28 | for website in GENERATOR_MAP:
29 | if not hasattr(g, website):
30 | setattr(g, f'{website}_{credential}', RedisClient(credential, website))
31 | setattr(g, f'{website}_{account}', RedisClient(account, website))
32 | return g
33 |
34 |
35 | @app.route('/<website>/random')
36 | def random(website):
37 | g = get_conn()
38 | result = getattr(g, f'{website}_{credential}').random()
39 | logger.debug(f'get credential {result}')
40 | return result
41 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/nodejs_server.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: nodejs_server.js
4 | * @time: 22:09
5 | * @project: python3-web-spider-learning
6 | * @desc: 搭建nodejs服务(P453)
7 | */
8 |
9 | const CryptoJS = require("./crypto.js")
10 | const express = require("express")
11 | const app = express();
12 | const port = 3000;
13 | app.use(express.json())
14 |
15 |
16 | function getToken(player) {
17 | let key = CryptoJS.enc.Utf8.parse("fipFfVsZsTda94hJNKJfLoaqyqMZFFimwLt");
18 | const {name, birthday, height, weight} = player;
19 | let base64Name = CryptoJS.enc.Base64.stringify(CryptoJS.enc.Utf8.parse(name));
20 | let encrypted = CryptoJS.DES.encrypt(
21 | `${base64Name}${birthday}${height}${weight}`,
22 | key, {
23 | mode: CryptoJS.mode.ECB,
24 | padding: CryptoJS.pad.Pkcs7,
25 | }
26 | );
27 | return encrypted.toString();
28 | }
29 |
30 | app.post("/", (req, res)=> {
31 | const data = req.body;
32 | res.send(getToken(data))
33 | });
34 |
35 | app.listen(port, ()=> {
36 | console.log(`Example app listening on port ${port}`);
37 | })
--------------------------------------------------------------------------------
/src/ch13/ida_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: ida_demo.py
6 | @time: 2022/1/17 19:19
7 | @project: python3-web-spider-learning
8 | @desc: 13.8 IDA Pro静态分析和动态调试so文件(汇编代码调试)(P679)
9 | """
10 | import requests
11 | import hashlib
12 | import time
13 | import base64
14 |
15 |
16 | def get_token(value, offset):
17 | array = []
18 | array.append(value)
19 | array.append('9fdLnciVh4FxQbri')
20 | array.append(str(offset))
21 | timestamp = str(int(time.time()))
22 | array.append(timestamp)
23 | sign = hashlib.sha1(','.join(array).encode('utf-8')).hexdigest()
24 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8')
25 |
26 |
27 | INDEX_URL = 'https://app8.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}'
28 | MAX_PAGE = 10
29 | LIMIT = 10
30 |
31 |
32 | for i in range(MAX_PAGE):
33 | offset = i * LIMIT
34 | token = get_token('/api/movie', offset)
35 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
36 | response = requests.get(index_url)
37 | print('response', response.json())
38 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradle.properties:
--------------------------------------------------------------------------------
1 | # Project-wide Gradle settings.
2 | # IDE (e.g. Android Studio) users:
3 | # Gradle settings configured through the IDE *will override*
4 | # any settings specified in this file.
5 | # For more details on how to configure your build environment visit
6 | # http://www.gradle.org/docs/current/userguide/build_environment.html
7 | # Specifies the JVM arguments used for the daemon process.
8 | # The setting is particularly useful for tweaking memory settings.
9 | org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
10 | # When configured, Gradle will run in incubating parallel mode.
11 | # This option should only be used with decoupled projects. More details, visit
12 | # http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
13 | # org.gradle.parallel=true
14 | # AndroidX package structure to make it clearer which packages are bundled with the
15 | # Android operating system, and which are packaged with your app"s APK
16 | # https://developer.android.com/topic/libraries/support-library/androidx-rn
17 | android.useAndroidX=true
18 | # Automatically convert third-party libraries to use AndroidX
19 | android.enableJetifier=true
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/spiders/httpbin.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import Request
3 |
4 |
5 | class HttpbinSpider(scrapy.Spider):
6 | name = 'httpbin'
7 | allowed_domains = ['www.httpbin.org']
8 | start_url = 'https://www.httpbin.org/get'
9 | headers = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
11 | }
12 | cookies = {'name': 'germey',
13 | 'age': '26'}
14 |
15 | def start_requests(self):
16 | for offset in range(5):
17 | url = self.start_url + f'?offset={offset}'
18 | yield Request(url, headers=self.headers,
19 | cookies=self.cookies,
20 | callback=self.parse_response,
21 | meta={'offset': offset})
22 |
23 | def parse_response(self, response):
24 | print('url:', response.url)
25 | print('request:', response.request)
26 | print('status:', response.status)
27 | print('headers:', response.headers)
28 | print('text:', response.text)
29 | print('meta:', response.meta)
30 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/run_account_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run_account_pool.py
6 | @time: 2022/1/12 14:27
7 | @project: python3-web-spider-learning
8 | @desc: 运行账号池
9 | """
10 | from ch10.account_pool.setting import ENABLE_IMPORT_DATA
11 | from ch10.account_pool.storages_redis import RedisClient
12 | from scheduler import Scheduler
13 | import argparse
14 |
15 | parser = argparse.ArgumentParser(description='AccountPool')
16 | parser.add_argument('website', type=str, help='website')
17 | parser.add_argument('--processor', type=str, help='processor to run')
18 | args = parser.parse_args()
19 | website = args.website
20 |
21 | if __name__ == '__main__':
22 | if ENABLE_IMPORT_DATA:
23 | conn = RedisClient('account', website)
24 | start = 1
25 | end = 20
26 | for i in range(start, end + 1):
27 | username = password = f'admin{i}'
28 | conn.set(username, password)
29 | conn.close()
30 |
31 | # if processor set, just run it
32 | if args.processor:
33 | getattr(Scheduler(), f'run_{args.processor}')(website)
34 | else:
35 | Scheduler().run(website)
36 |
--------------------------------------------------------------------------------
/src/ch15/scrape_processor_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_processor_demo.py
6 | @time: 2022/1/20 10:52
7 | @project: python3-web-spider-learning
8 | @desc: 15.12 Scrapy规则化爬虫(P816)
9 | """
10 | from itemloaders.processors import TakeFirst, Join, Compose, MapCompose, SelectJmes
11 |
12 |
13 | def takefirst():
14 | # 返回列表的第一个非空值
15 | processor = TakeFirst()
16 | print(processor(['', 1, 2, 3]))
17 |
18 |
19 | def join():
20 | # 把列表拼接成字符串
21 | processor = Join()
22 | print(processor(['one', 'two', 'three']))
23 |
24 | processor = Join(',')
25 | print(processor(['one', 'two', 'three']))
26 |
27 |
28 | def compose():
29 | # 使用多个函数组合构造而成
30 | processor = Compose(str.upper, lambda s: s.strip())
31 | print(processor(' hello world'))
32 |
33 |
34 | def map_compose():
35 | # 和compose类似,迭代处理一个列表输入值
36 | processor = MapCompose(str.upper, lambda s: s.strip())
37 | print(processor(['Hello', 'World', 'Python']))
38 |
39 |
40 | def select_jmes():
41 | # 查询JSON,传入Key,返回查询所得的Value
42 | processor = SelectJmes('foo')
43 | print(processor({'foo': 'bar'}))
44 |
45 |
46 | if __name__ == '__main__':
47 | select_jmes()
48 |
--------------------------------------------------------------------------------
/src/ch12/appium_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: appium_demo.py
6 | @time: 2022/1/16 2:26
7 | @project: python3-web-spider-learning
8 | @desc: 12.4 Appium的使用(P557)
9 | """
10 | from appium import webdriver
11 | from selenium.webdriver.common.by import By
12 | from selenium.webdriver.support.wait import WebDriverWait
13 | from selenium.webdriver.support import expected_conditions as EC
14 |
15 | server = 'http://localhost:4723/wd/hub'
16 | desired_capabilities = {
17 | "platformName": "Android",
18 | "appium:deviceName": "VirtualBox",
19 | "appium:appPackage": "com.goldze.mvvmhabit",
20 | "appium:appActivity": "com.goldze.mvvmhabit.ui.MainActivity",
21 | "appium:noReset": True
22 | }
23 |
24 | # 启动示例App
25 | driver = webdriver.Remote(server, desired_capabilities)
26 | wait = WebDriverWait(driver, 30)
27 | # 等到所有电影条目都加载之后
28 | wait.until(EC.presence_of_element_located((By.XPATH, '//android.support.v7.widget.RecyclerView/android.widget.LinearLayout')))
29 | window_size = driver.get_window_size()
30 | width, height = window_size.get('width'), window_size.get('height')
31 | # 前两个表示初始位置,后两个表示滑动的结束位置,1000表示滑动时间为1秒
32 | driver.swipe(width * 0.5, height * 0.8, width * 0.5, height * 0.2, 1000)
--------------------------------------------------------------------------------
/src/ch11/execjs_web_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: execjs_web_demo.py
6 | @time: 2022/1/14 9:22
7 | @project: python3-web-spider-learning
8 | @desc: 11.7 浏览器环境下JavaScript的模拟执行(P457)
9 | """
10 | import requests
11 | from playwright.sync_api import sync_playwright
12 |
13 | BASE_URL = "https://spa2.scrape.center"
14 | INDEX_URL = BASE_URL + "/api/movie?limit={limit}&offset={offset}&token={token}"
15 | MAX_PAGE = 10
16 | LIMIT = 10
17 |
18 | # Start Playwright and launch a headless Chromium browser
19 | context = sync_playwright().start()
20 | browser = context.chromium.launch()
21 | # Create a new page
22 | page = browser.new_page()
23 | # Set up a route that replaces the JS loaded by the browser with a local copy
24 | page.route(
25 | "/js/chunk-10192a00.243cb8b7.js",
26 | lambda route: route.fulfill(path="files/chunk.js")
27 | )
28 | page.goto(BASE_URL)
29 |
30 |
31 | def get_token(offset):
32 |     # Run the page's encrypt function via evaluate
33 | result = page.evaluate('''()=> {
34 | return window.encrypt("%s", "%s")
35 | }''' % ('/api/movie', offset))
36 | return result
37 |
38 |
39 | for i in range(MAX_PAGE):
40 | offset = i * LIMIT
41 | token = get_token(offset)
42 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
43 | response = requests.get(index_url)
44 | print('response:', response.json())
45 |
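Note: page.route matches glob patterns against the full request URL (unless a base URL is configured on the context), so if the exact-path pattern above does not intercept the script in your Playwright version, a wildcard prefix is a safer choice. An alternative pattern (an assumption, not taken from the original script):

    page.route(
        "**/chunk-10192a00.243cb8b7.js",
        lambda route: route.fulfill(path="files/chunk.js")
    )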
--------------------------------------------------------------------------------
/src/ch04/text_oper_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: text_oper_demo.py
6 | @time: 2022/1/5 19:00
7 | @project: python3-web-spider-learning
8 | @desc: 4.1 TXT text storage (P128~P130)
9 | """
10 | import os
11 | import re
12 |
13 | import requests
14 | from pyquery import PyQuery as pq
15 |
16 | url = 'https://ssr1.scrape.center'
17 | html = requests.get(url).text
18 | doc = pq(html)
19 | items = doc('.el-card').items()
20 |
21 | if not os.path.exists('files'):
22 | os.makedirs('files')
23 |
24 | file = open('files/movies.txt', 'w', encoding='utf-8')
25 | for item in items:
26 |     # Movie title
27 | name = item.find('a > h2').text()
28 | file.write(f'名称:{name}\n')
29 |     # Categories
30 | categories = [item.text() for item in item.find('.categories button span').items()]
31 | file.write(f'类别:{categories}\n')
32 |     # Release date
33 | published_at = item.find('.info:contains(上映)').text()
34 | published_at = re.search('(\d{4}-\d{2}-\d{2})', published_at).group(1) \
35 | if published_at and re.search('(\d{4}-\d{2}-\d{2})', published_at) else None
36 | file.write(f'上映时间:{published_at}\n')
37 |     # Score
38 | score = item.find('p.score').text()
39 | file.write(f'评分:{score}\n')
40 | file.write(f'{"=" * 50}\n')
41 |
42 | file.close()
43 |
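Note: the script opens files/movies.txt explicitly and closes it only at the very end; an equivalent and slightly safer pattern (a sketch, not part of the original file) is a with-block, which closes the file even if an exception occurs while writing:

    with open('files/movies.txt', 'w', encoding='utf-8') as file:
        for item in items:
            ...  # same extraction and file.write() calls as above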
--------------------------------------------------------------------------------
/src/ch04/csv_oper_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: csv_oper_demo.py
6 | @time: 2022/1/5 19:20
7 | @project: python3-web-spider-learning
8 | @desc: 4.3 CSV file storage (P134~P138)
9 | """
10 | import csv
11 |
12 |
13 | def write_to_csv():
14 | with open('files/data.csv', 'w', newline='') as csv_file:
15 | writer = csv.writer(csv_file)
16 | writer.writerow(['id', 'name', 'age'])
17 | writer.writerow(['10001', 'Mike', 20])
18 | writer.writerow(['10002', 'Bob', 22])
19 | writer.writerow(['10003', 'Jordan', 21])
20 |
21 |
22 | def write_dict_to_csv():
23 | with open('files/data.csv', 'w', encoding='utf-8', newline='') as csv_file:
24 |         fieldnames = ['id', 'name', 'age']
25 |         writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
26 | writer.writeheader()
27 | writer.writerow({'id': '10001', 'name': 'Mike', 'age': 20})
28 | writer.writerow({'id': '10002', 'name': 'Bob', 'age': 22})
29 | writer.writerow({'id': '10003', 'name': 'Jordan', 'age': 21})
30 |
31 |
32 | def read_csv():
33 | with open('files/data.csv', 'r', encoding='utf-8') as csv_file:
34 | reader = csv.reader(csv_file)
35 | for row in reader:
36 | print(row)
37 |
38 |
39 | if __name__ == '__main__':
40 | write_dict_to_csv()
41 | read_csv()
42 |
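Note: rows can also be read back as dictionaries keyed by the header, mirroring write_dict_to_csv. A minimal sketch (not part of the original file):

    def read_csv_as_dicts():
        with open('files/data.csv', 'r', encoding='utf-8') as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                print(row['id'], row['name'], row['age'])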
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/extensions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: extensions.py
6 | @time: 2022/1/19 18:43
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | import requests
11 | from scrapy import signals
12 |
13 | NOTIFICATION_URL = 'http://localhost:5000/notify'
14 |
15 |
16 | class NotificationExtension:
17 |     def spider_opened(self, spider):
18 | requests.post(NOTIFICATION_URL, json={
19 | 'event': 'SPIDER_OPENED',
20 | 'data': {'spider_name': spider.name}
21 | })
22 |
23 | def spider_closed(self, spider):
24 | requests.post(NOTIFICATION_URL, json={
25 | 'event': 'SPIDER_CLOSED',
26 | 'data': {'spider_name': spider.name}
27 | })
28 |
29 | def item_scraped(self, item, spider):
30 | requests.post(NOTIFICATION_URL, json={
31 | 'event': 'ITEM_SCRAPED',
32 | 'data': {'spider_name': spider.name, 'item': dict(item)}
33 | })
34 |
35 | @classmethod
36 | def from_crawler(cls, crawler):
37 | ext = cls()
38 |         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
39 | crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
40 | crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
41 | return ext
42 |
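Note: an extension only runs if it is registered in the project settings. This project presumably enables it in scrapytutorial/settings.py (not included in this section); the standard Scrapy form is:

    EXTENSIONS = {
        'scrapytutorial.extensions.NotificationExtension': 100,
    }

NOTIFICATION_URL points at a local HTTP endpoint, presumably the small server in scrapytutorial/server.py listed in the project tree.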
--------------------------------------------------------------------------------
/src/ch04/files/movies.txt:
--------------------------------------------------------------------------------
1 | 名称:霸王别姬 - Farewell My Concubine
2 | 类别:['剧情', '爱情']
3 | 上映时间:1993-07-26
4 | 评分:9.5
5 | ==================================================
6 | 名称:这个杀手不太冷 - Léon
7 | 类别:['剧情', '动作', '犯罪']
8 | 上映时间:1994-09-14
9 | 评分:9.5
10 | ==================================================
11 | 名称:肖申克的救赎 - The Shawshank Redemption
12 | 类别:['剧情', '犯罪']
13 | 上映时间:1994-09-10
14 | 评分:9.5
15 | ==================================================
16 | 名称:泰坦尼克号 - Titanic
17 | 类别:['剧情', '爱情', '灾难']
18 | 上映时间:1998-04-03
19 | 评分:9.5
20 | ==================================================
21 | 名称:罗马假日 - Roman Holiday
22 | 类别:['剧情', '喜剧', '爱情']
23 | 上映时间:1953-08-20
24 | 评分:9.5
25 | ==================================================
26 | 名称:唐伯虎点秋香 - Flirting Scholar
27 | 类别:['喜剧', '爱情', '古装']
28 | 上映时间:1993-07-01
29 | 评分:9.5
30 | ==================================================
31 | 名称:乱世佳人 - Gone with the Wind
32 | 类别:['剧情', '爱情', '历史', '战争']
33 | 上映时间:1939-12-15
34 | 评分:9.5
35 | ==================================================
36 | 名称:喜剧之王 - The King of Comedy
37 | 类别:['剧情', '喜剧', '爱情']
38 | 上映时间:1999-02-13
39 | 评分:9.5
40 | ==================================================
41 | 名称:楚门的世界 - The Truman Show
42 | 类别:['剧情', '科幻']
43 | 上映时间:None
44 | 评分:9.0
45 | ==================================================
46 | 名称:狮子王 - The Lion King
47 | 类别:['动画', '歌舞', '冒险']
48 | 上映时间:1995-07-15
49 | 评分:9.0
50 | ==================================================
51 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/build.gradle:
--------------------------------------------------------------------------------
1 | plugins {
2 | id 'com.android.application'
3 | id 'com.yanzhenjie.andserver'
4 | }
5 |
6 | android {
7 | compileSdk 31
8 |
9 | defaultConfig {
10 | applicationId "com.germey.andservertest"
11 | minSdk 16
12 | targetSdk 31
13 | versionCode 1
14 | versionName "1.0"
15 |
16 | testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
17 | }
18 |
19 | buildTypes {
20 | release {
21 | minifyEnabled false
22 | proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
23 | }
24 | }
25 | compileOptions {
26 | sourceCompatibility 1.8
27 | targetCompatibility 1.8
28 | }
29 |
30 | sourceSets {
31 | main {
32 | jniLibs.srcDirs = ["libs"]
33 | }
34 | }
35 | }
36 |
37 | dependencies {
38 | implementation 'androidx.appcompat:appcompat:1.4.1'
39 | implementation 'com.google.android.material:material:1.5.0'
40 | implementation 'androidx.constraintlayout:constraintlayout:2.0.4'
41 | testImplementation 'junit:junit:4.+'
42 | androidTestImplementation 'androidx.test.ext:junit:1.1.3'
43 | androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
44 | implementation 'com.yanzhenjie.andserver:api:2.1.9'
45 | annotationProcessor 'com.yanzhenjie.andserver:processor:2.1.9'
46 | }
--------------------------------------------------------------------------------
/src/ch10/account_pool/storages_redis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: storages_redis.py
6 | @time: 2022/1/12 10:18
7 | @project: python3-web-spider-learning
8 | @desc: Storage module: uses Redis as the account pool's storage backend. Data structures:
9 |        account:<website>    - hash mapping username to password
10 |        credential:<website> - hash mapping username to cookie string
11 | """
12 | import random
13 |
14 | from ch10.account_pool.setting import *
15 | import redis
16 |
17 |
18 | class RedisClient:
19 | def __init__(self, type, website, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
20 | self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)
21 |         # Key type ('account' or 'credential')
22 |         self.type = type
23 |         # Website name
24 |         self.website = website
25 |
26 | def name(self):
27 | return f'{self.type}:{self.website}'
28 |
29 | def set(self, username, value):
30 | return self.db.hset(self.name(), username, value)
31 |
32 | def get(self, username):
33 | return self.db.hget(self.name(), username)
34 |
35 | def delete(self, username):
36 | return self.db.hdel(self.name(), username)
37 |
38 | def count(self):
39 | return self.db.hlen(self.name())
40 |
41 | def random(self):
42 |         # Randomly pick one credential (cookie)
43 | return random.choice(self.db.hvals(self.name()))
44 |
45 | def usernames(self):
46 | return self.db.hkeys(self.name())
47 |
48 | def all(self):
49 | return self.db.hgetall(self.name())
50 |
51 | def close(self):
52 | self.db.close()
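Note: a minimal usage sketch of RedisClient (assuming a local Redis instance and the antispider6 website name used elsewhere in this chapter):

    conn = RedisClient(type='account', website='antispider6')
    conn.set('admin', 'admin')          # username -> password
    print(conn.count(), conn.random())  # number of accounts, one random value
    conn.close()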
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/layout/activity_main.xml:
--------------------------------------------------------------------------------
[activity_main.xml markup was stripped from this dump; the layout defines the "toggle_server" Button and "server_status" TextView that MainActivity.java binds to.]
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/spiders/movie.py:
--------------------------------------------------------------------------------
1 | from itemloaders import ItemLoader
2 | from itemloaders.processors import TakeFirst, Identity, Compose
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 |
6 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo.items import MovieItem
7 |
8 |
9 | class MovieSpider(CrawlSpider):
10 | name = 'movie'
11 | allowed_domains = ['ssr1.scrape.center']
12 | start_urls = ['https://ssr1.scrape.center/']
13 |
14 | rules = (
15 | Rule(LinkExtractor(restrict_css='.item .name'), follow=True, callback='parse_detail'),
16 | Rule(LinkExtractor(restrict_css='.next'), follow=True),
17 | )
18 |
19 | def parse_detail(self, response):
20 | loader = MovieItemLoader(item=MovieItem(), response=response)
21 | loader.add_css('name', '.item h2::text')
22 | loader.add_css('categories', '.categories button span::text')
23 | loader.add_css('cover', '.cover::attr(src)')
24 | loader.add_css('published_at', '.info span::text', re='(\d{4}-\d{2}-\d{2})\s?上映')
25 | loader.add_xpath('score', '//p[contains(@class, "score")]/text()')
26 | loader.add_xpath('drama', '//div[contains(@class, "drama")]/p/text()')
27 | yield loader.load_item()
28 |
29 |
30 | class MovieItemLoader(ItemLoader):
31 | default_output_processor = TakeFirst()
32 | categories_out = Identity()
33 | score_out = Compose(TakeFirst(), str.strip)
34 | drama_out = Compose(TakeFirst(), str.strip)
35 |
--------------------------------------------------------------------------------
/src/ch02/httpx_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: httpx_demo.py
6 | @time: 2022/1/4 11:32
7 | @project: python3-web-spider-learning
8 | @desc: 2.4 Using httpx (P75~P78)
9 | """
10 | import asyncio
11 |
12 | import httpx
13 |
14 |
15 | def httpx_demo():
16 | response = httpx.get('https://www.httpbin.org/get')
17 | print(response.status_code)
18 | print(response.headers)
19 | print(response.text)
20 |
21 |
22 | def httpx_with_user_agent():
23 | headers = {
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)'
25 |                       ' Chrome/52.0.2743.116 Safari/537.36'
26 | }
27 | response = httpx.get('https://www.httpbin.org/get', headers=headers)
28 | print(response.text)
29 |
30 |
31 | def http2_demo():
32 | client = httpx.Client(http2=True)
33 | response = client.get('https://spa16.scrape.center/')
34 | print(response.text)
35 |
36 |
37 | def client_demo():
38 | url = 'https://www.httpbin.org/headers'
39 | headers = {'User-Agent': 'my-app/0.0.1'}
40 | with httpx.Client(headers=headers) as client:
41 | r = client.get(url)
42 | print(r.json()['headers']['User-Agent'])
43 |
44 |
45 | async def fetch(url):
46 |     # Asynchronous request
47 | async with httpx.AsyncClient(http2=True) as client:
48 | response = await client.get(url)
49 | print(response.text)
50 |
51 |
52 | if __name__ == '__main__':
53 | asyncio.get_event_loop().run_until_complete(fetch('https://www.httpbin.org/get'))
54 |
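Note: http2=True requires the optional HTTP/2 dependencies, installed with pip install "httpx[http2]"; without them httpx raises an error when the client is created. The entry point could equivalently use asyncio.run:

    if __name__ == '__main__':
        asyncio.run(fetch('https://www.httpbin.org/get'))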
--------------------------------------------------------------------------------
/src/ch07/css_locate_scrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: css_locate_scrape.py
6 | @time: 2022/1/10 21:45
7 | @project: python3-web-spider-learning
8 | @desc: 7.7 CSS position-offset anti-scraping and a scraping example (P282)
9 | """
10 | import re
11 |
12 | from selenium import webdriver
13 | from selenium.webdriver.common.by import By
14 | from selenium.webdriver.support.wait import WebDriverWait
15 | from selenium.webdriver.support import expected_conditions as EC
16 | from pyquery import PyQuery as pq
17 |
18 |
19 | def parse_name(name_html):
20 |     # Handle the special case where the whole name is rendered directly
21 | has_whole = name_html('.whole')
22 | if has_whole:
23 | return name_html.text()
24 | else:
25 | chars = name_html('.char')
26 | items = []
27 | for char in chars.items():
28 |             # Extract the character and its left offset
29 | items.append({
30 | 'text': char.text().strip(),
31 | 'left': int(re.search('(\d+)px', char.attr('style')).group(1))
32 | })
33 |         # Sort the characters by their left offset
34 |         items = sorted(items, key=lambda x: x['left'], reverse=False)
35 |         # Join the characters back together
36 | return ''.join([item.get('text') for item in items])
37 |
38 |
39 | browser = webdriver.Chrome()
40 | browser.get('https://antispider3.scrape.center/')
41 | WebDriverWait(browser, 10) \
42 | .until(EC.presence_of_element_located((By.CSS_SELECTOR, '.item')))
43 | html = browser.page_source
44 | doc = pq(html)
45 | names = doc('.item .name')
46 | for name_html in names.items():
47 | name = parse_name(name_html)
48 | print(name)
49 | browser.close()
50 |
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/10 19:28
7 | @project: python3-web-spider-learning
8 | @desc: Basic usage of Playwright (P257)
9 | """
10 | import asyncio
11 | import os
12 |
13 | from playwright.async_api import async_playwright
14 | from playwright.sync_api import sync_playwright
15 |
16 |
17 | def sync_demo():
18 | with sync_playwright() as p:
19 | for browser_type in [p.chromium, p.firefox, p.webkit]:
20 | browser = browser_type.launch(headless=False)
21 | page = browser.new_page()
22 | page.goto('https://www.baidu.com')
23 |
24 | if not os.path.exists('files'):
25 | os.makedirs('files')
26 |
27 | page.screenshot(path=f'files/screenshot-{browser_type.name}.png')
28 | print(page.title())
29 | browser.close()
30 |
31 |
32 | async def async_demo():
33 | async with async_playwright() as p:
34 | for browser_type in [p.chromium, p.firefox, p.webkit]:
35 | browser = await browser_type.launch(headless=False)
36 | page = await browser.new_page()
37 | await page.goto('https://www.baidu.com')
38 |
39 | if not os.path.exists('files'):
40 | os.makedirs('files')
41 |
42 | await page.screenshot(path=f'files/screenshot-{browser_type.name}.png')
43 | print(await page.title())
44 | await browser.close()
45 |
46 |
47 | if __name__ == '__main__':
48 |     # Synchronous mode
49 |     # sync_demo()
50 |
51 |     # Asynchronous mode
52 | asyncio.run(async_demo())
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/7 17:23
7 | @project: python3-web-spider-learning
8 | @desc: Basic usage of Pyppeteer (P243)
9 | """
10 | import asyncio
11 | import os
12 |
13 | from pyppeteer import launch
14 | from pyquery import PyQuery as pq
15 |
16 |
17 | async def simple_demo():
18 | browser = await launch()
19 | page = await browser.newPage()
20 | await page.goto('https://spa2.scrape.center/')
21 | await page.waitForSelector('.item .name')
22 | doc = pq(await page.content())
23 | names = [item.text() for item in doc('.item .name').items()]
24 | print('Name:', names)
25 | await browser.close()
26 |
27 |
28 | async def simple_demo2():
29 | width, height = 1366, 768
30 | browser = await launch()
31 | page = await browser.newPage()
32 | await page.setViewport({'width': width, 'height': height})
33 | await page.goto('https://spa2.scrape.center/')
34 | await page.waitForSelector('.item .name')
35 | await asyncio.sleep(2)
36 |
37 | if not os.path.exists('files'):
38 | os.makedirs('files')
39 |
40 | await page.screenshot(path='files/example2.png')
41 | dimensions = await page.evaluate('''() =>{
42 | return {
43 | width: document.documentElement.clientWidth,
44 | height: document.documentElement.clientHeight,
45 | deviceScaleFactor: window.devicePixelRatio,
46 | }
47 | }''')
48 |
49 | print(dimensions)
50 | await browser.close()
51 |
52 |
53 | if __name__ == '__main__':
54 | asyncio.get_event_loop().run_until_complete(simple_demo2())
55 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | import pymongo
9 | from scrapy.exceptions import DropItem
10 |
11 |
12 | class TextPipeline:
13 | def __init__(self):
14 |         # Maximum allowed length of the text field
15 | self.limit = 50
16 |
17 | def process_item(self, item, spider):
18 | if item['text']:
19 | if len(item['text']) > self.limit:
20 | item['text'] = item['text'][0:self.limit].rstrip() + '...'
21 | return item
22 | else:
23 |             raise DropItem('Missing Text')
24 |
25 |
26 | class MongoDBPipeline:
27 | def __init__(self, connection_string, database):
28 | self.connection_string = connection_string
29 | self.database = database
30 |
31 | @classmethod
32 | def from_crawler(cls, crawler):
33 | return cls(
34 | connection_string=crawler.settings.get('MONGODB_CONNECTION_STRING'),
35 | database=crawler.settings.get('MONGODE_DATABASE')
36 | )
37 |
38 | def open_spider(self, spider):
39 |         # Called when the spider is opened
40 | self.client = pymongo.MongoClient(self.connection_string)
41 | self.db = self.client[self.database]
42 |
43 | def process_item(self, item, spider):
44 |         # Insert the item into MongoDB
45 | name = item.__class__.__name__
46 | self.db[name].insert_one(dict(item))
47 | return item
48 |
49 | def close_spider(self, spider):
50 |         # Called when the spider is closed
51 | self.client.close()
52 |
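Note: these pipelines take effect only when registered in ITEM_PIPELINES in the project settings (settings.py is not shown in this section); lower numbers run earlier, so the text-trimming pipeline should come before the MongoDB one, e.g.:

    ITEM_PIPELINES = {
        'scrapytutorial.pipelines.TextPipeline': 300,
        'scrapytutorial.pipelines.MongoDBPipeline': 400,
    }

Also note that the database name is read here under the setting key 'MONGODE_DATABASE', so whatever key settings.py defines must match it exactly (or both should be renamed to MONGODB_DATABASE).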
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/spiders/book.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import scrapy
4 | from gerapy_selenium import SeleniumRequest
5 |
6 | from ch15.scrapyseleniumdemo.scrapyseleniumdemo.items import BookItem
7 |
8 |
9 | class BookSpider(scrapy.Spider):
10 | name = 'book'
11 | allowed_domains = ['spa5.scrape.center']
12 | base_url = 'https://spa5.scrape.center'
13 |
14 | def start_requests(self):
15 | start_url = f'{self.base_url}/page/1'
16 | yield SeleniumRequest(start_url, callback=self.parse_index)
17 |
18 | def parse_index(self, response):
19 |         items = response.css('.item')
20 | for item in items:
21 | href = item.css('.top a::attr(href)').extract_first()
22 | detail_url = response.urljoin(href)
23 | yield SeleniumRequest(detail_url, callback=self.parse_detail, priority=2)
24 |
25 | match = re.search(r'page/(\d+)', response.url)
26 | if not match:
27 | return
28 | page = int(match.group(1)) + 1
29 | next_url = f'{self.base_url}/page/{page}'
30 | yield SeleniumRequest(next_url, callback=self.parse_index)
31 |
32 | def parse_detail(self, response):
33 | name = response.css('.name::text').extract_first()
34 | tags = response.css('.tags button span::text').extract()
35 | score = response.css('.score::text').extract_first()
36 | price = response.css('.price span::text').extract_first()
37 | cover = response.css('.cover::attr(src)').extract_first()
38 | tags = [tag.strip() for tag in tags] if tags else []
39 | score = score.strip() if score else None
40 | item = BookItem(name=name, tags=tags, score=score, price=price, cover=cover)
41 | yield item
42 |
--------------------------------------------------------------------------------
/src/ch02/requests_demo/advanced_use.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: advanced_use.py
6 | @time: 2021/12/31 15:13
7 | @project: python3-web-spider-learning
8 | @desc: Advanced usage of requests (P55~P63)
9 | """
10 | import requests
11 | import urllib3
12 | from requests import Session, Request
13 |
14 | urllib3.disable_warnings()
15 |
16 |
17 | def upload_file():
18 | files = {
19 | 'file': open('../files/favicon.ico', 'rb')
20 | }
21 | r = requests.post('https://www.httpbin.org/post', files=files)
22 | print(r.text)
23 |
24 |
25 | def print_cookie():
26 | r = requests.get('https://www.baidu.com')
27 | print(r.cookies)
28 | for key, value in r.cookies.items():
29 | print(key + '=' + value)
30 |
31 |
32 | def print_https_with_verify():
33 | r = requests.get('https://ssr2.scrape.center/', verify=False)
34 | print(r.status_code)
35 |
36 |
37 | def print_with_timeout():
38 | r = requests.get('https://www.httpbin.org/get', timeout=1)
39 | print(r.status_code)
40 |
41 |
42 | def print_with_auth():
43 | r = requests.get('https://ssr3.scrape.center/', auth=('admin', 'admin'))
44 | print(r.status_code)
45 |
46 |
47 | def print_prepared_request():
48 | url = 'https://www.httpbin.org/post'
49 | data = {'name': 'germey'}
50 | headers = {
51 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)'
52 |                       ' Chrome/52.0.2743.116 Safari/537.36'
53 | }
54 | s = Session()
55 | req = Request('POST', url, data=data, headers=headers)
56 | prepped = s.prepare_request(req)
57 | r = s.send(prepped)
58 | print(r.text)
59 |
60 |
61 | if __name__ == '__main__':
62 | print_prepared_request()
63 |
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/consumer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: consumer.py
6 | @time: 2022/1/6 14:32
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ consumer examples (P167)
9 | """
10 | import pika
11 |
12 | MAX_PRIORITY = 100
13 | QUEUE_NAME = 'scrape'
14 | connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
15 | channel = connection.channel()
16 |
17 |
18 | def callback(ch, method, properties, body):
19 | print(f'Get {body}')
20 |
21 |
22 | def simple_consume():
23 | channel.queue_declare(queue=QUEUE_NAME)
24 | channel.basic_consume(queue=QUEUE_NAME, auto_ack=True, on_message_callback=callback)
25 | channel.start_consuming()
26 |
27 |
28 | def on_demand_consume():
29 | channel.queue_declare(queue=QUEUE_NAME)
30 | while True:
31 | input()
32 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
33 | if body:
34 | print(f'Get {body}')
35 |
36 |
37 | def priority_consume():
38 | channel.queue_declare(queue=QUEUE_NAME, arguments={
39 | 'x-max-priority': MAX_PRIORITY
40 | })
41 |
42 | while True:
43 | input()
44 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
45 | if body:
46 | print(f'Get {body}')
47 |
48 |
49 | def persistence_consume():
50 | channel.queue_declare(queue=QUEUE_NAME, arguments={
51 | 'x-max-priority': MAX_PRIORITY
52 | }, durable=True)
53 |
54 | while True:
55 | input()
56 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
57 | if body:
58 | print(f'Get {body}')
59 |
60 |
61 | if __name__ == '__main__':
62 | on_demand_consume()
63 |
--------------------------------------------------------------------------------
/src/ch07/font_scrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: font_scrape.py
6 | @time: 2022/1/10 21:55
7 | @project: python3-web-spider-learning
8 | @desc: 7.8 Font-based anti-scraping and a scraping example (P287)
9 |     Tricky part: the score characters are controlled by CSS icon styles
10 | """
11 | import re
12 |
13 | import requests
14 | from selenium import webdriver
15 | from selenium.webdriver.common.by import By
16 | from selenium.webdriver.support import expected_conditions as EC
17 | from selenium.webdriver.support.wait import WebDriverWait
18 | from pyquery import PyQuery as pq
19 |
20 | url = 'https://antispider4.scrape.center/css/app.654ba59e.css'
21 |
22 | response = requests.get(url)
23 | pattern = re.compile('.icon-(.*?):before\{content:"(.*?)"\}')
24 | results = re.findall(pattern, response.text)
25 | icon_map = {item[0]: item[1] for item in results}
26 |
27 |
28 | def parse_score(item):
29 | elements = item('.icon')
30 | icon_values = []
31 | for element in elements.items():
32 | class_name = (element.attr('class'))
33 |         # Extract the icon code from the CSS class
34 | icon_key = re.search('icon-(\d+)', class_name).group(1)
35 |         # Look up the real character
36 | icon_value = icon_map.get(icon_key)
37 | icon_values.append(icon_value)
38 |     # Join the characters to form the score
39 | return ''.join(icon_values)
40 |
41 |
42 | browser = webdriver.Chrome()
43 | browser.get('https://antispider4.scrape.center/')
44 | WebDriverWait(browser, 10) \
45 | .until(EC.presence_of_element_located((By.CSS_SELECTOR, '.item')))
46 | html = browser.page_source
47 | doc = pq(html)
48 | items = doc('.item')
49 | for item in items.items():
50 |     name = item('.name').text()
51 | categories = [o.text() for o in item('.categories button').items()]
52 | score = parse_score(item)
53 | print(f'name: {name} categories: {categories} score: {score}')
54 | browser.close()
55 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/spiders/book.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import re
3 |
4 | import scrapy
5 | from scrapy import Request
6 |
7 | from ch15.scrapypyppeteerdemo.scrapypyppeteerdemo.items import BookItem
8 |
9 | if hasattr(asyncio, 'WindowsSelectorEventLoopPolicy'):  # only needed (and only defined) on Windows
10 |     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
11 |
12 | class BookSpider(scrapy.Spider):
13 | name = 'book'
14 | allowed_domains = ['spa5.scrape.center']
15 | base_url = 'https://spa5.scrape.center'
16 |
17 | def start_requests(self):
18 | start_url = f'{self.base_url}/page/1'
19 | yield Request(start_url, callback=self.parse_index)
20 |
21 | def parse_index(self, response):
22 | items = response.css('.item')
23 | for item in items:
24 | href = item.css('.top a::attr(href)').extract_first()
25 | detail_url = response.urljoin(href)
26 | yield Request(detail_url, callback=self.parse_detail, priority=2)
27 |
28 | match = re.search(r'page/(\d+)', response.url)
29 | if not match:
30 | return
31 | page = int(match.group(1)) + 1
32 | next_url = f'{self.base_url}/page/{page}'
33 | yield Request(next_url, callback=self.parse_index)
34 |
35 | def parse_detail(self, response):
36 | name = response.css('.name::text').extract_first()
37 | tags = response.css('.tags button span::text').extract()
38 | score = response.css('.score::text').extract_first()
39 | price = response.css('.price span::text').extract_first()
40 | cover = response.css('.cover::attr(src)').extract_first()
41 | tags = [tag.strip() for tag in tags] if tags else []
42 | score = score.strip() if score else None
43 | item = BookItem(name=name, tags=tags, score=score, price=price, cover=cover)
44 | yield item
45 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/configs/movie.json:
--------------------------------------------------------------------------------
1 | {
2 | "spider": "universal",
3 | "type": "电影",
4 | "home": "https://ssr1.scrape.center/",
5 | "settings": {
6 | "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
7 | },
8 | "start_urls": [
9 | "https://ssr1.scrape.center/"
10 | ],
11 | "allowed_domains": [
12 | "ssr1.scrape.center"
13 | ],
14 | "rules": [
15 | {
16 | "link_extractor": {
17 | "restrict_css": ".item .name"
18 | },
19 | "follow": true,
20 | "callback": "parse_detail"
21 | },
22 | {
23 | "link_extractor": {
24 | "restrict_css": ".next"
25 | },
26 | "follow": true
27 | }
28 | ],
29 | "item": {
30 | "class": "MovieItem",
31 | "loader": "MovieItemLoader",
32 | "attrs": {
33 | "name": [
34 | {
35 | "method": "css",
36 | "arg": ".item h2::text"
37 | }
38 | ],
39 | "categories": [
40 | {
41 | "method": "css",
42 | "arg": ".categories button span::text"
43 | }
44 | ],
45 | "cover": [
46 | {
47 | "method": "css",
48 | "arg": ".cover::attr(src)"
49 | }
50 | ],
51 | "published_at": [
52 | {
53 | "method": "css",
54 | "arg": ".info span::text",
55 | "re": "(\\d{4}-\\d{2}-\\d{2})\\s?上映"
56 | }
57 | ],
58 | "score": [
59 | {
60 | "method": "xpath",
61 | "arg": "//p[contains(@class, \"score\")]/text()"
62 | }
63 | ],
64 | "drama": [
65 | {
66 | "method": "xpath",
67 | "arg": "//div[contains(@class, \"drama\")]/p/text()"
68 | }
69 | ]
70 | }
71 | }
72 | }
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/drawable-v24/ic_launcher_foreground.xml:
--------------------------------------------------------------------------------
[ic_launcher_foreground.xml markup was stripped from this dump; it contains the project's vector-drawable launcher foreground asset.]
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/spiders/universal.py:
--------------------------------------------------------------------------------
1 | from scrapy.linkextractors import LinkExtractor
2 | from scrapy.spiders import CrawlSpider, Rule
3 |
4 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo import items, loaders
5 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo.utils import get_config
6 |
7 |
8 | class UniversalSpider(CrawlSpider):
9 | name = 'universal'
10 |
11 | def __init__(self, name, *args, **kwargs):
12 | config = get_config(name)
13 | self.config = config
14 | self.start_urls = config.get('start_urls')
15 | self.allowed_domains = config.get('allowed_domains')
16 | rules = []
17 | for rule_kwargs in config.get('rules'):
18 | link_extractor = LinkExtractor(**rule_kwargs.get('link_extractor'))
19 | rule_kwargs['link_extractor'] = link_extractor
20 | rule = Rule(**rule_kwargs)
21 | rules.append(rule)
22 | self.rules = rules
23 | super(UniversalSpider, self).__init__(*args, **kwargs)
24 |
25 | def parse_detail(self, response):
26 | item = self.config.get('item')
27 | if item:
28 | cls = getattr(items, item.get('class'))()
29 | loader = getattr(loaders, item.get('loader'))(cls, response=response)
30 | for key, value in item.get('attrs').items():
31 | for extractor in value:
32 | if extractor.get('method') == 'xpath':
33 | loader.add_xpath(key, extractor.get('arg'), **{'re': extractor.get('re')})
34 | if extractor.get('method') == 'css':
35 | loader.add_css(key, extractor.get('arg'), **{'re': extractor.get('re')})
36 | if extractor.get('method') == 'value':
37 |                         loader.add_value(key, extractor.get('arg'), **{'re': extractor.get('re')})
38 | yield loader.load_item()
39 |
40 |
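Note on usage: UniversalSpider is driven entirely by the JSON config loaded through get_config (configs/movie.json above). A simplified launch sketch, assuming the spider argument name is forwarded to __init__ and ignoring the per-config "settings" block (run.py is not shown in this section):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('universal', name='movie')
    process.start()

Equivalently, scrapy crawl universal -a name=movie from the project directory.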
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/producer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: producer.py
6 | @time: 2022/1/6 14:32
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ producer examples (P169)
9 | """
10 | import pika
11 |
12 | MAX_PRIORITY = 100
13 | QUEUE_NAME = 'scrape'
14 | connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
15 | channel = connection.channel()
16 |
17 |
18 | def simple_producer():
19 | channel.queue_declare(queue=QUEUE_NAME)
20 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME, body='Hello World!')
21 |
22 |
23 | def on_demand_producer():
24 | channel.queue_declare(queue=QUEUE_NAME)
25 | while True:
26 | data = input()
27 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME, body=data)
28 | print(f'Put {data}')
29 |
30 |
31 | def priority_producer():
32 | channel.queue_declare(queue=QUEUE_NAME, arguments={
33 | 'x-max-priority': MAX_PRIORITY
34 | })
35 |
36 | while True:
37 | data, priority = input().split()
38 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME,
39 | properties=pika.BasicProperties(priority=int(priority)),
40 | body=data)
41 | print(f'Put {data}')
42 |
43 |
44 | def persistence_producer():
45 | channel.queue_declare(queue=QUEUE_NAME, arguments={
46 | 'x-max-priority': MAX_PRIORITY
47 | }, durable=True)
48 |
49 | while True:
50 | data, priority = input().split()
51 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME,
52 | properties=pika.BasicProperties(
53 | priority=int(priority),
54 | delivery_mode=2
55 | ),
56 | body=data)
57 | print(f'Put {data}')
58 |
59 |
60 | if __name__ == '__main__':
61 | priority_producer()
62 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/tester.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: tester.py
6 | @time: 2022/1/12 10:46
7 | @project: python3-web-spider-learning
8 | @desc: Tester module: checks for invalid cookies and removes them from Redis
9 | """
10 | from ch10.account_pool.exceptions import InitException
11 | from ch10.account_pool.setting import *
12 | from ch10.account_pool.storages_redis import RedisClient
13 | from loguru import logger
14 | import requests
15 |
16 | class BaseTester:
17 | def __init__(self, website=None):
18 | self.website = website
19 | if not self.website:
20 | raise InitException
21 | self.account_operator = RedisClient(type='account', website=self.website)
22 | self.credential_operator = RedisClient(type='credential', website=self.website)
23 |
24 | def test(self, username, credential):
25 | raise NotImplementedError
26 |
27 | def run(self):
28 | credentials = self.credential_operator.all()
29 | for username, credential in credentials.items():
30 | self.test(username, credential)
31 |
32 |
33 | class Antispider6Tester(BaseTester):
34 | def __init__(self, website=None):
35 | super().__init__(website)
36 |
37 | def test(self, username, credential):
38 | logger.info(f'testing credential for {username}')
39 | try:
40 | test_url = TEST_URL_MAP[self.website]
41 | response = requests.get(test_url, headers={
42 | 'Cookie': credential
43 | }, timeout=TEST_TIMEOUT, allow_redirects=False)
44 | if response.status_code == 200:
45 | logger.info('credential is valid')
46 | else:
47 | logger.info('credential is not valid, delete it')
48 | self.credential_operator.delete(username)
49 | except Exception as e:
50 | logger.error(f'test failed: {e}')
51 | logger.info('credential is not valid, delete it')
52 | self.credential_operator.delete(username)
--------------------------------------------------------------------------------
/src/ch04/mongodb_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: mongodb_demo.py
6 | @time: 2022/1/5 20:16
7 | @project: python3-web-spider-learning
8 | @desc: 4.5 MongoDB document storage (P144~P150)
9 | """
10 | import pymongo
11 |
12 |
13 | def insert_data():
14 |     # Insert documents
15 | student = {
16 | 'id': '20170101',
17 | 'name': 'Jordan',
18 | 'age': 20,
19 | 'gender': 'male'
20 | }
21 | result = collection.insert_one(student)
22 | print(result)
23 |
24 | student2 = {
25 | 'id': '20170102',
26 | 'name': 'Mike',
27 | 'age': 21,
28 | 'gender': 'male'
29 | }
30 | result = collection.insert_one(student2)
31 | print(result.inserted_id)
32 |
33 |
34 | def select_data():
35 |     # Query documents
36 | result = collection.find_one({'name': 'Mike'})
37 | print(type(result))
38 | print(result)
39 |
40 | results = collection.find({'age': {'$gt': 20}})
41 | print(results)
42 | for result in results:
43 | print(result)
44 |
45 |
46 | def counts():
47 |     # Count documents (count_documents requires a filter; {} matches all)
48 |     count = collection.count_documents({})
49 | print(count)
50 |
51 |
52 | def sort():
53 |     # Sort by name
54 | results = collection.find().sort('name', pymongo.ASCENDING)
55 | print([result['name'] for result in results])
56 |
57 |
58 | def skip():
59 |     # Skip the first two results
60 | results = collection.find().sort('name', pymongo.ASCENDING).skip(2)
61 | print([result['name'] for result in results])
62 |
63 |
64 | def update_data():
65 | condition = {'name': 'Mike'}
66 | student = collection.find_one(condition)
67 | student['age'] = 25
68 | result = collection.update_one(condition, {'$set': student})
69 | print(result)
70 | print(result.matched_count, result.modified_count)
71 |
72 |
73 | if __name__ == '__main__':
74 |     # Connect to MongoDB
75 |     client = pymongo.MongoClient(host='localhost', port=27017)
76 |     # Select the test database
77 |     db = client['test']
78 |     # Select the students collection
79 |     collection = db.students
80 | update_data()
81 |
--------------------------------------------------------------------------------
/src/ch11/js_scrape_practice.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: js_scrape_practice.py
6 | @time: 2022/1/14 18:50
7 | @project: python3-web-spider-learning
8 | @desc: 11.13 JavaScript reverse-engineering crawl in practice (P507)
9 |     Target: scrape https://spa6.scrape.center/
10 |     Key difficulties:
11 |     (1) The list-page Ajax API takes an encrypted token parameter
12 |     (2) The detail-page URL contains an encrypted id
13 |     (3) The detail-page Ajax API takes both an encrypted id and an encrypted token
14 |     (4) The Ajax API is time-limited and starts returning 401 after a while
15 |     (5) The front-end JavaScript is minified and obfuscated
16 |     Reverse-engineering approach:
17 |     (1) Search globally for "token" to find where the Ajax request is built, and set a breakpoint there
18 |     (2) Analyze the list-page encryption by inspecting the variables: put /api/movie into a list, append the current
19 |         timestamp, join with commas, SHA1-hash it, join the digest with the timestamp again, and Base64-encode the result
20 |     (3) Analyze the detail-page encrypted id: hook btoa (Tampermonkey injection works well); the id is a fixed secret string plus the numeric id, Base64-encoded
21 |     (4) Analyze the detail-page Ajax token: it is built the same way as the list-page token
22 | """
23 | import base64
24 | import hashlib
25 | import time
26 |
27 | import requests
28 |
29 | INDEX_URL = 'https://spa6.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}'
30 | DETAIL_URL = 'https://spa6.scrape.center/api/movie/{id}?token={token}'
31 | LIMIT = 10
32 | OFFSET = 0
33 | SECRET = 'ef34#teuq0btua#(-57w1q5o5--j@98xygimlyfxs*-!i-0-mb'
34 |
35 |
36 | # Build the token
37 | def get_token(args: list):
38 | timestamp = str(int(time.time()))
39 | args.append(timestamp)
40 | sign = hashlib.sha1(','.join(args).encode('utf-8')).hexdigest()
41 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8')
42 |
43 |
44 | args = ['/api/movie']
45 | token = get_token(args)
46 | # Build the list-page URL
47 | index_url = INDEX_URL.format(limit=LIMIT, offset=OFFSET, token=token)
48 | response = requests.get(index_url)
49 | print('response:', response.json())
50 |
51 | result = response.json()
52 |
53 | for item in result['results']:
54 | id = item['id']
55 | encrypt_id = base64.b64encode((SECRET + str(id)).encode('utf-8')).decode('utf-8')
56 | args = [f'/api/movie/{encrypt_id}']
57 | token = get_token(args=args)
58 |     # Build the detail-page URL
59 | detail_url = DETAIL_URL.format(id=encrypt_id, token=token)
60 | response = requests.get(detail_url)
61 | print('detail response:', response.json())
62 |
--------------------------------------------------------------------------------
/src/ch03/parsel_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: parsel_demo.py
6 | @time: 2022/1/5 16:42
7 | @project: python3-web-spider-learning
8 | @desc: 3.4 Using parsel (P124~P127)
9 | """
10 | from parsel import Selector
11 |
12 | html = '''
13 |
22 | '''
23 |
24 |
25 | def parsel_demo():
26 | selector = Selector(text=html)
27 | items = selector.css('.item-0')
28 | print(len(items), type(items), items)
29 | items2 = selector.xpath('//li[contains(@class, "item-0")]')
30 | print(len(items2), type(items2), items2)
31 |
32 |
33 | def extract_text():
34 | selector = Selector(text=html)
35 | items = selector.css('.item-0')
36 | for item in items:
37 | text = item.xpath('.//text()').get()
38 | print(text)
39 |
40 | result = selector.xpath('//li[contains(@class, "item-0")]//text()').getall()
41 | print(result)
42 |
43 |
44 | def extract_attrs():
45 | selector = Selector(text=html)
46 | result = selector.css('.item-0.active a::attr(href)').get()
47 | print(result)
48 |
49 | result = selector.xpath('//li[contains(@class, "item-0") and contains(@class, "active")]/a/@href').get()
50 | print(result)
51 |
52 |
53 | def extract_re():
54 | selector = Selector(text=html)
55 | result = selector.css('.item-0').re('link.*')
56 | print(result)
57 |
58 | result = selector.css('.item-0 *::text').re('.*item')
59 | print(result)
60 |
61 |     result = selector.css('.item-0').re_first('<span class="bold">(.*?)</span>')
62 | print(result)
63 |
64 |
65 | if __name__ == '__main__':
66 | extract_re()
67 |
--------------------------------------------------------------------------------
/src/ch08/tesserocr_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: tesserocr_demo.py
6 | @time: 2022/1/11 9:37
7 | @project: python3-web-spider-learning
8 | @desc: 8.1 Recognizing graphic CAPTCHAs with OCR (P296)
9 | """
10 | import re
11 | import time
12 | from io import BytesIO
13 |
14 | import numpy as np
15 | import tesserocr
16 | from PIL import Image
17 | from retrying import retry
18 | from selenium import webdriver
19 | from selenium.common.exceptions import TimeoutException
20 | from selenium.webdriver.common.by import By
21 | from selenium.webdriver.support.wait import WebDriverWait
22 | from selenium.webdriver.support import expected_conditions as EC
23 |
24 |
25 | def preprocess(image):
26 | image = image.convert('L')
27 | array = np.array(image)
28 | array = np.where(array > 115, 255, 0)
29 | image = Image.fromarray(array.astype('uint8'))
30 | return image
31 |
32 |
33 | @retry(stop_max_attempt_number=10, retry_on_result=lambda x: x is False)
34 | def login():
35 | """
36 | 最大尝试10次
37 | """
38 | browser.get('https://captcha7.scrape.center/')
39 | browser.find_element_by_css_selector('.username input[type="text"]').send_keys('admin')
40 | browser.find_element_by_css_selector('.password input[type="password"]').send_keys('admin')
41 | captcha = browser.find_element_by_css_selector('#captcha')
42 | image = Image.open(BytesIO(captcha.screenshot_as_png))
43 | image = preprocess(image)
44 | captcha = tesserocr.image_to_text(image)
45 | captcha = re.sub('[^A-Za-z0-9]', '', captcha)
46 | print("Captcha:", captcha)
47 | browser.find_element_by_css_selector('.captcha input[type="text"]').send_keys(captcha)
48 | browser.find_element_by_css_selector('.login').click()
49 |
50 | try:
51 | WebDriverWait(browser, 4).until(EC.presence_of_element_located((By.XPATH, '//h2[contains(., "登录成功")]')))
52 | time.sleep(10)
53 | browser.close()
54 | return True
55 | except TimeoutException:
56 | return False
57 |
58 |
59 | if __name__ == '__main__':
60 | browser = webdriver.Chrome()
61 | login()
62 |
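Note: the find_element_by_css_selector helpers used above were removed in Selenium 4.3; with the By import already present, the equivalent call under the current API looks like this (an adaptation, not part of the original script):

    browser.find_element(By.CSS_SELECTOR, '.username input[type="text"]').send_keys('admin')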
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/java/com/germey/andservertest/MainActivity.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import androidx.appcompat.app.AppCompatActivity;
4 |
5 | import android.os.Bundle;
6 | import android.util.Log;
7 | import android.view.View;
8 | import android.widget.Button;
9 | import android.widget.TextView;
10 |
11 | import com.yanzhenjie.andserver.AndServer;
12 | import com.yanzhenjie.andserver.Server;
13 |
14 | import java.util.concurrent.TimeUnit;
15 |
16 | public class MainActivity extends AppCompatActivity {
17 |
18 | private Server server;
19 | private Button button;
20 | private TextView textView;
21 |
22 | @Override
23 | protected void onCreate(Bundle savedInstanceState) {
24 | super.onCreate(savedInstanceState);
25 | setContentView(R.layout.activity_main);
26 | button = findViewById(R.id.toggle_server);
27 | textView = findViewById(R.id.server_status);
28 | server = AndServer.webServer(getApplicationContext())
29 | .port(8080)
30 | .timeout(10, TimeUnit.SECONDS)
31 | .listener(new Server.ServerListener() {
32 | @Override
33 | public void onStarted() {
34 | button.setText(R.string.stop_server);
35 | textView.setText(R.string.server_started);
36 | }
37 |
38 | @Override
39 | public void onStopped() {
40 | button.setText(R.string.start_server);
41 | textView.setText(R.string.server_stopped);
42 | }
43 |
44 | @Override
45 | public void onException(Exception e) {
46 | Log.d("AndServer", e.toString());
47 | }
48 | })
49 | .build();
50 | button.setText(R.string.start_server);
51 | textView.setText(R.string.server_stopped);
52 | }
53 |
54 | public void toggleServer(View view) {
55 | if (!server.isRunning()) {
56 | server.startup();
57 | } else {
58 | server.shutdown();
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/src/ch10/antispider_scrape_with_account_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: antispider_scrape_with_account_pool.py
6 | @time: 2022/1/12 17:18
7 | @project: python3-web-spider-learning
8 | @desc: Scrape pages using the account pool (P394)
9 | """
10 | import asyncio
11 | from pyquery import PyQuery as pq
12 | from loguru import logger
13 | import aiohttp
14 | from aiohttp import TCPConnector
15 |
16 | MAX_ID = 20
17 | CONCURRENCY = 2
18 | TARGET_URL = 'https://antispider6.scrape.center'
19 | ACCOUNT_POOL_URL = 'http://localhost:6789/antispider6/random'
20 |
21 | semaphore = asyncio.Semaphore(CONCURRENCY)
22 |
23 |
24 | async def parse_detail(html):
25 | doc = pq(html)
26 | title = doc('.item h2').text()
27 | categories = [item.text() for item in doc('.item .categories span').items()]
28 | cover = doc('.item .cover').attr('src')
29 | score = doc('.item .score').text()
30 | drama = doc('.item .drama').text().strip()
31 |
32 | return {
33 | 'title': title,
34 | 'categories': categories,
35 | 'cover': cover,
36 | 'score': score,
37 | 'drama': drama
38 | }
39 |
40 |
41 | async def fetch_credential(session):
42 | async with session.get(ACCOUNT_POOL_URL) as response:
43 | return await response.text()
44 |
45 |
46 | async def scrape_detail(session, url):
47 | async with semaphore:
48 | credential = await fetch_credential(session)
49 | headers = {'cookie': credential}
50 | logger.debug(f'scrape {url} using credential {credential}')
51 | async with session.get(url, headers=headers) as response:
52 | html = await response.text()
53 | data = await parse_detail(html)
54 | logger.debug(f'data {data}')
55 |
56 |
57 | async def main():
58 | session = aiohttp.ClientSession(connector=TCPConnector(ssl=False))
59 | tasks = []
60 | for i in range(1, MAX_ID + 1):
61 | url = f'{TARGET_URL}/detail/{i}'
62 | task = asyncio.ensure_future(scrape_detail(session, url))
63 | tasks.append(task)
64 | await asyncio.gather(*tasks)
65 |
66 |
67 | if __name__ == '__main__':
68 | asyncio.get_event_loop().run_until_complete(main())
69 |
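Note: the ClientSession created in main() is never closed, which produces an "Unclosed client session" warning on exit. A sketch of the same logic with the session managed by an async context manager:

    async def main():
        async with aiohttp.ClientSession(connector=TCPConnector(ssl=False)) as session:
            tasks = [asyncio.ensure_future(scrape_detail(session, f'{TARGET_URL}/detail/{i}'))
                     for i in range(1, MAX_ID + 1)]
            await asyncio.gather(*tasks)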
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/spiders/scrape.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import Request
3 |
4 | from ch15.scrapyitempipelinedemo.scrapyitempipelinedemo.items import MovieItem
5 |
6 |
7 | class ScrapeSpider(scrapy.Spider):
8 | name = 'scrape'
9 | allowed_domains = ['ssr1.scrape.center']
10 | base_url = 'https://ssr1.scrape.center'
11 | max_page = 10
12 |
13 | def start_requests(self):
14 | for i in range(1, self.max_page + 1):
15 | url = f'{self.base_url}/page/{i}'
16 | yield Request(url, callback=self.parse_index)
17 |
18 | def parse_index(self, response):
19 | for item in response.css('.item'):
20 | href = item.css('.name::attr(href)').extract_first()
21 | url = response.urljoin(href)
22 | yield Request(url, callback=self.parse_detail)
23 |
24 | def parse_detail(self, response):
25 | item = MovieItem()
26 | item['name'] = response.xpath('//div[contains(@class, "item")]//h2/text()').extract_first()
27 | item['categories'] = response.xpath('//button[contains(@class, "category")]/span/text()').extract()
28 | item['score'] = response.css('.score::text').re_first('[\d\.]+')
29 | item['drama'] = response.css('.drama p::text').extract_first().strip()
30 | item['directors'] = []
31 | directors = response.xpath('//div[contains(@class, "directors")]//div[contains(@class, "director")]')
32 | for director in directors:
33 | director_image = director.xpath('.//img[@class="image"]/@src').extract_first()
34 | director_name = director.xpath('.//p[contains(@class, "name")]/text()').extract_first()
35 | item['directors'].append({
36 | 'name': director_name,
37 | 'image': director_image
38 | })
39 | item['actors'] = []
40 | actors = response.css('.actors .actor')
41 | for actor in actors:
42 | actor_image = actor.css('.actor .image::attr(src)').extract_first()
43 | actor_name = actor.css('.actor .name::text').extract_first()
44 | item['actors'].append({
45 | 'name': actor_name,
46 | 'image': actor_image
47 | })
48 | yield item
49 |
--------------------------------------------------------------------------------
/src/ch05/scrape_ajax.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_ajax.py
6 | @time: 2022/1/6 15:46
7 | @project: python3-web-spider-learning
8 | @desc: 5.3 Ajax analysis and a scraping example (P184~P190)
9 | """
10 | import logging
11 |
12 | import pymongo
13 | import requests
14 |
15 | logging.basicConfig(level=logging.INFO,
16 | format='%(asctime)s - %(levelname)s: %(message)s')
17 |
18 | INDEX_URL = 'https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}'
19 |
20 |
21 | def scrape_api(url):
22 | """
23 |     Scrape an API URL and return the parsed JSON
24 | """
25 | logging.info('scraping %s...', url)
26 | try:
27 | response = requests.get(url)
28 | if response.status_code == 200:
29 | return response.json()
30 | logging.error('get invalid status code %s while scraping %s', response.status_code, url)
31 | except requests.RequestException:
32 | logging.error('error occurred while scraping %s', url, exc_info=True)
33 |
34 |
35 | LIMIT = 10
36 |
37 |
38 | def scrape_index(page):
39 | """
40 |     Scrape a list page
41 | """
42 | url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
43 | return scrape_api(url)
44 |
45 |
46 | DETAIL_URL = 'https://spa1.scrape.center/api/movie/{id}'
47 |
48 |
49 | def scrape_detail(id):
50 | """
51 |     Scrape a detail page
52 | """
53 | url = DETAIL_URL.format(id=id)
54 | return scrape_api(url)
55 |
56 |
57 | TOTAL_PAGE = 10
58 | MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
59 | MONGO_DB_NAME = 'movies'
60 | MONGO_COLLECTION_NAME = 'movies'
61 |
62 | client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
63 | db = client[MONGO_DB_NAME]
64 | collection = db[MONGO_COLLECTION_NAME]
65 |
66 |
67 | def save_data(data):
68 | collection.update_one({
69 | 'name': data.get('name')
70 | }, {'$set': data}, upsert=True)
71 |
72 |
73 | def main():
74 | for page in range(1, TOTAL_PAGE + 1):
75 | index_data = scrape_index(page)
76 | for item in index_data.get('results'):
77 | id = item.get('id')
78 | detail_data = scrape_detail(id)
79 | logging.info('detail data %s', detail_data)
80 | save_data(detail_data)
81 | logging.info('data saved successfully')
82 |
83 |
84 | if __name__ == '__main__':
85 | main()
86 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: generator.py
6 | @time: 2022/1/12 10:34
7 | @project: python3-web-spider-learning
8 | @desc: Generator module: pulls each account from the storage module, simulates a login, and saves the cookies produced by a successful login back to the storage module
9 | """
10 |
11 | import requests
12 | from loguru import logger
13 |
14 | from ch10.account_pool.exceptions import InitException
15 | from ch10.account_pool.storages_redis import RedisClient
16 |
17 |
18 | class BaseGenerator:
19 | def __init__(self, website=None):
20 | self.website = website
21 | if not self.website:
22 | raise InitException
23 | self.account_operator = RedisClient(type='account', website=self.website)
24 | self.credential_operator = RedisClient(type='credential', website=self.website)
25 |
26 | def generate(self, username, password):
27 | raise NotImplementedError
28 |
29 | def init(self):
30 | pass
31 |
32 | def run(self):
33 | self.init()
34 | logger.debug('start to run generator')
35 | for username, password in self.account_operator.all().items():
36 | if self.credential_operator.get(username):
37 | continue
38 |             logger.debug(f'start to generate credential of {username}')
39 | self.generate(username, password)
40 |
41 |
42 | class Antispider6Generator(BaseGenerator):
43 | def generate(self, username, password):
44 | if self.credential_operator.get(username):
45 | logger.debug(f'credential of {username} exists, skip')
46 | return
47 | login_url = 'https://antispider6.scrape.center/login'
48 | s = requests.Session()
49 | try:
50 | s.post(login_url, data={
51 | 'username': username,
52 | 'password': password
53 | })
54 | result = []
55 | for cookie in s.cookies:
56 | print(cookie.name, cookie.value)
57 | result.append(f'{cookie.name}={cookie.value}')
58 | result = ';'.join(result)
59 | if len(result) > 0:
60 | logger.debug(f'get {username} credential {result}')
61 | self.credential_operator.set(username, result)
62 | except Exception as e:
63 | logger.error(f'get {username} credential failed: {e}')
64 |
--------------------------------------------------------------------------------
/src/ch10/session_cookie_simulate_login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: session_cookie_simulate_login.py
6 | @time: 2022/1/12 9:12
7 | @project: python3-web-spider-learning
8 | @desc: 10.2 Simulated login and scraping based on sessions and cookies (P376)
9 | """
10 | import time
11 | from urllib.parse import urljoin
12 | import requests
13 | from selenium import webdriver
14 |
15 | BASE_URL = 'https://login2.scrape.center/'
16 | LOGIN_URL = urljoin(BASE_URL, '/login')
17 | INDEX_URL = urljoin(BASE_URL, '/page/1')
18 | USERNAME = 'admin'
19 | PASSWORD = 'admin'
20 |
21 |
22 | def simul_login_with_cookies():
23 |     # Log in to the site
24 | response_login = requests.post(LOGIN_URL, data={
25 | 'username': USERNAME,
26 | 'password': PASSWORD
27 | }, allow_redirects=False)
28 |
29 |     # Save the cookies
30 | cookies = response_login.cookies
31 | print('Cookies:', cookies)
32 |
33 |     # Request the listing page with the cookies
34 | response_index = requests.get(INDEX_URL, cookies=cookies)
35 | print('Response Status', response_index.status_code)
36 | print('Response URL', response_index.url)
37 |
38 |
39 | def simul_login_with_session():
40 | session = requests.Session()
41 |
42 |     # Log in to the site
43 | response_login = session.post(LOGIN_URL, data={
44 | 'username': USERNAME,
45 | 'password': PASSWORD
46 | })
47 |
48 | # Save the Cookies
49 | cookies = session.cookies
50 | print('Cookies:', cookies)
51 |
52 | # Visit the listing page (the session carries the cookies automatically)
53 | response_index = session.get(INDEX_URL)
54 | print('Response Status', response_index.status_code)
55 | print('Response URL', response_index.url)
56 |
57 |
58 | def simul_login_with_selenium():
59 | browser = webdriver.Chrome()
60 | browser.get(BASE_URL)
61 | browser.find_element_by_css_selector('input[name="username"]').send_keys(USERNAME)
62 | browser.find_element_by_css_selector('input[name="password"]').send_keys(PASSWORD)
63 | browser.find_element_by_css_selector('input[type="submit"]').click()
64 | time.sleep(10)
65 |
66 | # Get the Cookie information from the browser object
67 | cookies = browser.get_cookies()
68 | print('Cookies:', cookies)
69 | browser.close()
70 |
71 | # Put the Cookie information into the requests session
72 | session = requests.Session()
73 | for cookie in cookies:
74 | session.cookies.set(cookie['name'], cookie['value'])
75 |
76 | response_index = session.get(INDEX_URL)
77 | print('Response Status', response_index.status_code)
78 | print('Response URL', response_index.url)
79 |
80 |
81 | if __name__ == '__main__':
82 | simul_login_with_selenium()
83 |
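Note that the find_element_by_css_selector helpers used above were removed in newer Selenium 4 releases. A small equivalent sketch of the login step with the By locator API, using the same constants as the script above:

from selenium import webdriver
from selenium.webdriver.common.by import By

BASE_URL = 'https://login2.scrape.center/'
USERNAME = PASSWORD = 'admin'

browser = webdriver.Chrome()
browser.get(BASE_URL)
# Same selectors as above, expressed with the Selenium 4 locator API
browser.find_element(By.CSS_SELECTOR, 'input[name="username"]').send_keys(USERNAME)
browser.find_element(By.CSS_SELECTOR, 'input[name="password"]').send_keys(PASSWORD)
browser.find_element(By.CSS_SELECTOR, 'input[type="submit"]').click()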
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/request_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: request_demo.py
6 | @time: 2021/12/29 15:17
7 | @project: python3-web-spider-learning
8 | @desc: urllib.request module examples (P30~P34)
9 | """
10 | import socket
11 | import urllib.request
12 | import urllib.error
13 | import urllib.parse
14 |
15 |
16 | def print_content(url='https://www.python.org'):
17 | response = urllib.request.urlopen(url)
18 | # Print the page source
19 | print(response.read().decode('utf-8'))
20 |
21 |
22 | def print_response_type(url='https://www.python.org'):
23 | response = urllib.request.urlopen(url)
24 | # Print the response type
25 | print(type(response))
26 |
27 |
28 | def print_status(url='https://www.python.org'):
29 | response = urllib.request.urlopen(url)
30 | # Print the response status code
31 | print(response.status)
32 |
33 |
34 | def print_header(name='Server', url='https://www.python.org'):
35 | response = urllib.request.urlopen(url)
36 | # Print the response headers
37 | print(response.getheaders())
38 | if name:
39 | # Print the value of the specified response header
40 | print(response.getheader(name))
41 |
42 |
43 | def print_content_with_data(url='https://www.httpbin.org/post'):
44 | data = bytes(urllib.parse.urlencode({'name': 'germey'}), encoding='utf-8')
45 | # Use the data parameter
46 | response = urllib.request.urlopen(url, data=data)
47 | print(response.read().decode('utf-8'))
48 |
49 |
50 | def print_content_with_timeout(url='https://www.httpbin.org/get'):
51 | # Use the timeout parameter
52 | response = urllib.request.urlopen(url, timeout=0.1)
53 | print(response.read())
54 |
55 |
56 | def print_content_with_try_except(url='https://www.httpbin.org/get'):
57 | # Catch the timeout error raised by the timeout parameter
58 | try:
59 | urllib.request.urlopen(url, timeout=0.1)
60 | except urllib.error.URLError as e:
61 | if isinstance(e.reason, socket.timeout):
62 | print('TIME OUT')
63 |
64 |
65 | def print_content_with_request(url='https://www.httpbin.org/post'):
66 | # Specify the User-Agent and Host in the headers
67 | headers = {
68 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
69 | 'Host': 'www.httpbin.org'
70 | }
71 |
72 | data_dict = {'name': 'germey'}
73 | # Convert the dict data to a byte stream
74 | data = bytes(urllib.parse.urlencode(data_dict), encoding='utf-8')
75 | # Construct a Request object
76 | req = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
77 | response = urllib.request.urlopen(req)
78 | print(response.read().decode('utf-8'))
79 |
80 |
81 | if __name__ == '__main__':
82 | print_content_with_request()
83 |
--------------------------------------------------------------------------------
/src/ch02/requests_demo/requests_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: requests_demo.py
6 | @time: 2021/12/31 13:52
7 | @project: python3-web-spider-learning
8 | @desc: Basic usage of requests (P48~P55)
9 | """
10 | import os
11 | import re
12 |
13 | import requests
14 |
15 |
16 | def print_get_request():
17 | r = requests.get('https://www.baidu.com')
18 | print(type(r))
19 | print(r.status_code)
20 | print(type(r.text))
21 | print(r.text[:100])
22 | print(r.cookies)
23 |
24 |
25 | def print_request():
26 | r = requests.get('https://www.httpbin.org/get')
27 | r = requests.post('https://www.httpbin.org/post')
28 | r = requests.put('https://www.httpbin.org/put')
29 | r = requests.delete('https://www.httpbin.org/delete')
30 | r = requests.patch('https://www.httpbin.org/patch')
31 |
32 |
33 | def print_get_with_params(url, params):
34 | r = requests.get(url, params=params)
35 | print(r.text)
36 |
37 |
38 | def print_json():
39 | r = requests.get('https://www.httpbin.org/get')
40 | print(type(r.text))
41 | print(r.json())
42 | print(type(r.json()))
43 |
44 |
45 | def fetch_web():
46 | r = requests.get('https://ssr1.scrape.center/')
47 | pattern = re.compile('<h2.*?>(.*?)</h2>', re.S)
48 | titles = re.findall(pattern, r.text)
49 | print(titles)
50 |
51 |
52 | def get_favicon():
53 | if not os.path.exists('../files'):
54 | os.mkdir('../files')
55 |
56 | r = requests.get('https://scrape.center/favicon.ico')
57 | with open('../files/favicon.ico', 'wb') as f:
58 | f.write(r.content)
59 |
60 |
61 | def print_get_with_headers():
62 | headers = {
63 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)'
64 | ' Chrome/52.0.2743.116 Safari/537.36'
65 | }
66 | r = requests.get('https://ssr1.scrape.center/', headers=headers)
67 | print(r.text)
68 |
69 |
70 | def print_post():
71 | data = {
72 | 'name': 'germey',
73 | 'age': '25'
74 | }
75 | r = requests.post("https://www.httpbin.org/post", data=data)
76 | print(r.text)
77 |
78 |
79 | def check_request():
80 | r = requests.get('https://ssr1.scrape.center/')
81 | exit() if not r.status_code == requests.codes.ok else print('Request Successfully')
82 |
83 |
84 | if __name__ == '__main__':
85 | # url = 'https://www.httpbin.org/get'
86 | # data = {
87 | # 'name': 'germey',
88 | # 'age': 25
89 | # }
90 | #
91 | # print_get_with_params(url, data)
92 |
93 | check_request()
94 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_scrape_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: aiohttp_scrape_demo.py
6 | @time: 2022/1/6 20:06
7 | @project: python3-web-spider-learning
8 | @desc: 6.3 Practical asynchronous scraping with aiohttp (P207~P211)
9 | """
10 | import asyncio
11 | import json
12 | import logging
13 | from aiohttp import TCPConnector
14 | import aiohttp
15 | from motor.motor_asyncio import AsyncIOMotorClient
16 |
17 | logging.basicConfig(level=logging.INFO,
18 | format='%(asctime)s - %(levelname)s: %(message)s')
19 |
20 | INDEX_URL = 'https://spa5.scrape.center/api/book?limit=18&offset={offset}'
21 | DETAIL_URL = 'https://spa5.scrape.center/api/book/{id}'
22 | PAGE_SIZE = 18
23 | PAGE_NUMBER = 100
24 | CONCURRENCY = 5
25 |
26 | semaphore = asyncio.Semaphore(CONCURRENCY)
27 | session = None
28 |
29 |
30 | async def scrape_api(url):
31 | """
32 | Generic scraping method
33 | """
34 | async with semaphore:
35 | try:
36 | logging.info('scraping %s', url)
37 | async with session.get(url) as response:
38 | return await response.json()
39 | except aiohttp.ClientError:
40 | logging.error('error occurred while scraping %s', url, exc_info=True)
41 |
42 |
43 | async def scrape_index(page):
44 | """
45 | Scrape a listing page
46 | """
47 | url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
48 | return await scrape_api(url)
49 |
50 |
51 | MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
52 | MONGO_DB_NAME = 'books'
53 | MONGO_COLLECTION_NAME = 'books'
54 |
55 | client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
56 | db = client[MONGO_DB_NAME]
57 | collection = db[MONGO_COLLECTION_NAME]
58 |
59 |
60 | async def save_data(data):
61 | logging.info('saving data %s', data)
62 | if data:
63 | return await collection.update_one({
64 | 'id': data.get('id')
65 | }, {
66 | '$set': data
67 | }, upsert=True)
68 |
69 |
70 | async def scrape_detail(id):
71 | url = DETAIL_URL.format(id=id)
72 | data = await scrape_api(url)
73 | await save_data(data)
74 |
75 |
76 | async def main():
77 | global session
78 | session = aiohttp.ClientSession(connector=TCPConnector(ssl=False))
79 | scrape_index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(1, PAGE_NUMBER + 1)]
80 | results = await asyncio.gather(*scrape_index_tasks)
81 | logging.info('results %s', json.dumps(results, ensure_ascii=False, indent=2))
82 |
83 | # Collect the IDs of all books
84 | ids = []
85 | for index_data in results:
86 | if not index_data:
87 | continue
88 | for item in index_data.get('results'):
89 | ids.append(item.get('id'))
90 |
91 | scrape_detail_tasks = [asyncio.ensure_future(scrape_detail(id)) for id in ids]
92 | await asyncio.wait(scrape_detail_tasks)
93 | await session.close()
94 |
95 |
96 | if __name__ == '__main__':
97 | asyncio.get_event_loop().run_until_complete(main())
98 |
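On Python 3.10 and later, calling asyncio.get_event_loop() outside a running loop is deprecated; a drop-in alternative entry point for the same main() coroutine is:

if __name__ == '__main__':
    asyncio.run(main())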
--------------------------------------------------------------------------------
/src/ch04/elasticsearch_oper_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: elasticsearch_oper_demo.py
6 | @time: 2022/1/6 9:15
7 | @project: python3-web-spider-learning
8 | @desc: 4.7 Elasticsearch search-engine storage (P161~P166)
9 | """
10 | from elasticsearch import Elasticsearch
11 |
12 |
13 | def create_index():
14 | result = es.indices.create(index='news', ignore=400)
15 | print(result)
16 |
17 |
18 | def delete_index():
19 | result = es.indices.delete(index='news', ignore=[400, 404])
20 | print(result)
21 |
22 |
23 | def insert_data():
24 | es.indices.create(index='news', ignore=400)
25 |
26 | data = {
27 | 'title': '乘风破浪不负韶华,奋斗青春圆梦高考',
28 | 'url': 'http://view.indws.qq.com/a/EDU20210416007322200'
29 | }
30 | result = es.create(index='news', id=1, body=data)
31 | print(result)
32 |
33 |
34 | def update_data():
35 | data = {
36 | 'title': '乘风破浪不负韶华,奋斗青春圆梦高考',
37 | 'url': 'http://view.indws.qq.com/a/EDU20210416007322200',
38 | 'date': '2021-07-05'
39 | }
40 | result = es.update(index='news', body=data, id=1, ignore=400)
41 | print(result)
42 |
43 |
44 | def delete_data():
45 | result = es.delete(index='news', id=1)
46 | print(result)
47 |
48 |
49 | def select_data():
50 | mapping = {
51 | 'properties': {
52 | 'title': {
53 | 'type': 'text',
54 | 'analyzer': 'ik_max_word',
55 | 'search_analyzer': 'ik_max_word'
56 | }
57 | }
58 | }
59 | es.indices.delete(index='news', ignore=[400, 404])
60 | es.indices.create(index='news', ignore=400)
61 | result = es.indices.put_mapping(index='news', body=mapping)
62 | print(result)
63 |
64 | datas = [
65 | {
66 | 'title': '高考结局大不同',
67 | 'url': 'https://k.sina.com.cn/article_7571064628_1c3454734001011lz9.html',
68 | },
69 | {
70 | 'title': '进入职业大洗牌时代,“吃香”职业还吃香吗?',
71 | 'url': 'https://new.qq.com/omn/20210828/20210828A025LK00.html',
72 | },
73 | {
74 | 'title': '乘风破浪不负韶华,奋斗青春圆梦高考',
75 | 'url': 'http://view.inews.qq.com/a/EDU2021041600732200',
76 | },
77 | {
78 | 'title': '他,活出了我们理想的样子',
79 | 'url': 'https://new.qq.com/omn/20210821/20210821A020ID00.html',
80 | }
81 | ]
82 |
83 | for data in datas:
84 | es.index(index='news', body=data)
85 |
86 | result = es.search(index='news')
87 | print(result)
88 |
89 |
90 | def full_text_search():
91 | dsl = {
92 | 'query': {
93 | 'match': {
94 | 'title': '高考 圆梦'
95 | }
96 | }
97 | }
98 | result = es.search(index='news', body=dsl)
99 | print(result)
100 |
101 |
102 | if __name__ == '__main__':
103 | es = Elasticsearch()
104 | full_text_search()
105 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/src/ch08/opencv_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: opencv_demo.py
6 | @time: 2022/1/11 10:41
7 | @project: python3-web-spider-learning
8 | @desc: 8.2 Using OpenCV to detect the gap in a slider CAPTCHA (P298~P303)
9 | """
10 | import cv2
11 |
12 | GAUSSIAN_BLUR_KERNEL_SIZE = (5, 5)
13 | GAUSSIAN_BLUR_SIGMA_X = 0
14 | CANNY_THRESHOLD1 = 200
15 | CANNY_THRESHOLD2 = 450
16 |
17 |
18 | def get_gaussian_blur_image(image):
19 | """
20 | Return the image after Gaussian blurring
21 | """
22 | return cv2.GaussianBlur(image, GAUSSIAN_BLUR_KERNEL_SIZE, GAUSSIAN_BLUR_SIGMA_X)
23 |
24 |
25 | def get_canny_image(image):
26 | """
27 | Return the image after Canny edge detection
28 | """
29 | return cv2.Canny(image, CANNY_THRESHOLD1, CANNY_THRESHOLD2)
30 |
31 |
32 | def get_contours(image):
33 | """
34 | Return the contour information
35 | """
36 | contours, _ = cv2.findContours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
37 | return contours
38 |
39 |
40 | def get_contour_area_thrshold(image_width, image_height):
41 | """
42 | Define the lower and upper bounds of the target contour's area
43 | """
44 | contour_area_min = (image_width * 0.15) * (image_height * 0.25) * 0.8
45 | contour_area_max = (image_width * 0.15) * (image_height * 0.25) * 1.2
46 | return contour_area_min, contour_area_max
47 |
48 |
49 | def get_arc_threshold(image_width, image_height):
50 | """
51 | Define the lower and upper bounds of the target contour's perimeter
52 | """
53 | arc_length_min = ((image_width * 0.15) + (image_height * 0.25)) * 2 * 0.8
54 | arc_length_max = ((image_width * 0.15) + (image_height * 0.25)) * 2 * 1.2
55 | return arc_length_min, arc_length_max
56 |
57 |
58 | def get_offset_threshold(image_width):
59 | """
60 | Define the lower and upper bounds of the gap's horizontal offset
61 | """
62 | offset_min = 0.2 * image_width
63 | offset_max = 0.85 * image_width
64 | return offset_min, offset_max
65 |
66 |
67 | if __name__ == '__main__':
68 | image_raw = cv2.imread('files/slide_captcha.png')
69 | # Get the width and height of the image
70 | image_height, image_width, _ = image_raw.shape
71 | image_gaussian_blur = get_gaussian_blur_image(image_raw)
72 | cv2.imwrite('files/image_gaussian_blur.png', image_gaussian_blur)
73 | image_canny = get_canny_image(image_gaussian_blur)
74 | cv2.imwrite('files/image_canny.png', image_canny)
75 | contours = get_contours(image_canny)
76 |
77 | contour_area_min, contour_area_max = get_contour_area_thrshold(image_width, image_height)
78 | arc_length_min, arc_length_max = get_arc_threshold(image_width, image_height)
79 | offset_min, offset_max = get_offset_threshold(image_width)
80 | offset = None
81 |
82 | for contour in contours:
83 | x, y, w, h = cv2.boundingRect(contour)
84 | # Check whether this contour satisfies the gap conditions
85 | if contour_area_min < cv2.contourArea(contour) < contour_area_max and \
86 | arc_length_min < cv2.arcLength(contour, True) < arc_length_max and \
87 | offset_min < x < offset_max:
88 | # Mark it with a bounding rectangle
89 | cv2.rectangle(image_raw, (x, y), (x + w, y + h), (0, 0, 255), 2)
90 | offset = x
91 |
92 | cv2.imwrite('files/image_label.png', image_raw)
93 | print('offset:', offset)
94 |
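The script above only reports the horizontal offset of the gap; solving the CAPTCHA still requires dragging the slider by roughly that distance. A rough Selenium sketch, where the CSS selector of the slider button and the initial left margin of the puzzle piece are assumptions for illustration:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By


def drag_slider(browser, offset, piece_left=60):
    # piece_left: assumed starting x position of the puzzle piece inside the image
    slider = browser.find_element(By.CSS_SELECTOR, '.slider-button')  # assumed selector
    ActionChains(browser) \
        .click_and_hold(slider) \
        .move_by_offset(offset - piece_left, 0) \
        .release() \
        .perform()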
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/event_listen.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: event_listen.py
6 | @time: 2022/1/10 19:56
7 | @project: python3-web-spider-learning
8 | @desc: Event listening (P263)
9 | """
10 | import re
11 |
12 | from playwright.sync_api import sync_playwright
13 |
14 |
15 | def sync_on_response(response):
16 | # Print the status and URL of every response
17 | # print(f'Status {response.status}: {response.url}')
18 |
19 | if '/api/movie/' in response.url and response.status == 200:
20 | print(response.json())
21 |
22 |
23 | def sync():
24 | with sync_playwright() as p:
25 | browser = p.chromium.launch(headless=False)
26 | page = browser.new_page()
27 | page.on('response', sync_on_response)
28 | page.goto('https://spa6.scrape.center/')
29 | page.wait_for_load_state('networkidle')
30 | browser.close()
31 |
32 |
33 | def get_web_source():
34 | with sync_playwright() as p:
35 | browser = p.chromium.launch(headless=False)
36 | page = browser.new_page()
37 | page.goto('https://spa6.scrape.center/')
38 | page.wait_for_load_state('networkidle')
39 | html = page.content()
40 | print(html)
41 | browser.close()
42 |
43 |
44 | def get_node_attr():
45 | with sync_playwright() as p:
46 | browser = p.chromium.launch(headless=False)
47 | page = browser.new_page()
48 | page.goto('https://spa6.scrape.center/')
49 | page.wait_for_load_state('networkidle')
50 | href = page.get_attribute('a.name', 'href')
51 | print(href)
52 | browser.close()
53 |
54 |
55 | def get_node_attrs():
56 | with sync_playwright() as p:
57 | browser = p.chromium.launch(headless=False)
58 | page = browser.new_page()
59 | page.goto('https://spa6.scrape.center/')
60 | page.wait_for_load_state('networkidle')
61 | elements = page.query_selector_all('a.name')
62 | for element in elements:
63 | print(element.get_attribute('href'))
64 | print(element.text_content())
65 | browser.close()
66 |
67 |
68 | def get_node():
69 | with sync_playwright() as p:
70 | browser = p.chromium.launch(headless=False)
71 | page = browser.new_page()
72 | page.goto('https://spa6.scrape.center/')
73 | page.wait_for_load_state('networkidle')
74 | element = page.query_selector('a.name')
75 | print(element.get_attribute('href'))
76 | print(element.text_content())
77 | browser.close()
78 |
79 |
80 | def route_demo():
81 | with sync_playwright() as p:
82 | browser = p.chromium.launch(headless=False)
83 | page = browser.new_page()
84 |
85 | def cancel_request(route, request):
86 | route.abort()
87 |
88 | page.route(re.compile(r"(\.png)|(\.jpg)"), cancel_request)
89 | page.goto("https://spa6.scrape.center/")
90 | page.wait_for_load_state('networkidle')
91 | page.screenshot(path='files/np_picture.png')
92 | browser.close()
93 |
94 |
95 | if __name__ == '__main__':
96 | route_demo()
97 |
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/request_hander_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: request_hander_demo.py
6 | @time: 2021/12/29 16:32
7 | @project: python3-web-spider-learning
8 | @desc: Authentication, proxies and Cookies (P35-P36)
9 | """
10 |
11 | import http.cookiejar
12 | import os
13 | import urllib.request
14 | from urllib.error import URLError
15 | from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler
16 | from urllib.request import ProxyHandler, build_opener
17 |
18 |
19 | def valid():
20 | username = 'admin'
21 | password = 'admin'
22 | url = 'https://ssr3.scrape.center/'
23 |
24 | p = HTTPPasswordMgrWithDefaultRealm()
25 | p.add_password(None, url, username, password)
26 | auth_handler = HTTPBasicAuthHandler(p)
27 | opener = build_opener(auth_handler)
28 |
29 | try:
30 | result = opener.open(url)
31 | html = result.read().decode('utf-8')
32 | print(html)
33 | except URLError as e:
34 | print(e.reason)
35 |
36 |
37 | def proxy():
38 | proxy_hander = ProxyHandler({
39 | 'http': 'http://127.0.0.1:8080',
40 | 'https': 'https://127.0.0.1:8080'
41 | })
42 |
43 | opener = build_opener(proxy_hander)
44 | try:
45 | response = opener.open('https://www.baidu.com')
46 | print(response.read().decode('utf-8'))
47 | except URLError as e:
48 | print(e.reason)
49 |
50 |
51 | def cookie_values():
52 | # Declare a CookieJar object
53 | cookie = http.cookiejar.CookieJar()
54 | # Build a handler
55 | handler = urllib.request.HTTPCookieProcessor(cookie)
56 | # Build an opener
57 | opener = urllib.request.build_opener(handler)
58 | response = opener.open('https://www.baidu.com')
59 | for item in cookie:
60 | print(item.name + '=' + item.value)
61 |
62 |
63 | def cookie_mozilla_content():
64 | if not os.path.exists('../files'):
65 | os.mkdir('../files')
66 |
67 | filename = '../files/mozilla_cookie.txt'
68 | cookie = http.cookiejar.MozillaCookieJar(filename)
69 | handler = urllib.request.HTTPCookieProcessor(cookie)
70 | opener = urllib.request.build_opener(handler)
71 | response = opener.open('https://www.baidu.com')
72 | cookie.save(ignore_discard=True, ignore_expires=True)
73 |
74 |
75 | def cookie_lwp_content():
76 | if not os.path.exists('../files'):
77 | os.mkdir('../files')
78 |
79 | filename = '../files/lwp_cookie.txt'
80 | cookie = http.cookiejar.LWPCookieJar(filename)
81 | handler = urllib.request.HTTPCookieProcessor(cookie)
82 | opener = urllib.request.build_opener(handler)
83 | response = opener.open('https://www.baidu.com')
84 | cookie.save(ignore_discard=True, ignore_expires=True)
85 |
86 |
87 | def cookie_use_lwp():
88 | cookie = http.cookiejar.LWPCookieJar()
89 | cookie.load('../files/lwp_cookie.txt', ignore_discard=True, ignore_expires=True)
90 | handler = urllib.request.HTTPCookieProcessor(cookie)
91 | opener = urllib.request.build_opener(handler)
92 | response = opener.open('https://www.baidu.com')
93 | print(response.read().decode('utf-8'))
94 |
95 |
96 | if __name__ == '__main__':
97 | valid()
98 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/basic/basic2.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: basic2.js
4 | * @time: 2022-01-14 10:00:25
5 | * @project: python3-web-spider-learning
6 | * @desc: AST operations
7 | */
8 |
9 | import {parse} from "@babel/parser";
10 | import traverse from "@babel/traverse";
11 | import generate from "@babel/generator";
12 | import * as types from "@babel/types"
13 | import fs from "fs";
14 |
15 | const code = fs.readFileSync("../codes/code1.js", "utf-8")
16 | let ast = parse(code);
17 |
18 | function traverse_nodes() {
19 | // Traverse the AST nodes
20 | traverse(ast, {
21 | enter(path) {
22 | console.log(path)
23 | }
24 | })
25 | }
26 |
27 | function modify_value1() {
28 | // Change assigned values by modifying the AST
29 | traverse(ast, {
30 | enter(path) {
31 | let node = path.node;
32 | if (node.type === "NumericLiteral" && node.value === 3) {
33 | node.value = 5;
34 | }
35 | if (node.type === "StringLiteral" && node.value === "hello") {
36 | node.value = "hi";
37 | }
38 | },
39 | })
40 | const {code: output } = generate(ast, {
41 | retainLines: true,
42 | });
43 |
44 | console.log(output);
45 | }
46 |
47 | function modify_value2() {
48 | // Change assigned values by modifying the AST (node-type visitors)
49 | traverse(ast, {
50 | NumericLiteral(path) {
51 | if (path.node.value === 3) {
52 | path.node.value = 5;
53 | }
54 | },
55 | StringLiteral(path) {
56 | if (path.node.value === "hello") {
57 | path.node.value = "hi";
58 | }
59 | }
60 | })
61 | const {code: output } = generate(ast, {
62 | comments: false
63 | });
64 |
65 | console.log(output);
66 | }
67 |
68 | function delete_node() {
69 | // Remove all console.log calls
70 | traverse(ast, {
71 | CallExpression(path) {
72 | let node = path.node;
73 | if (
74 | node.callee.object.name === "console" &&
75 | node.callee.property.name === "log"
76 | ) {
77 | path.remove();
78 | }
79 | },
80 | });
81 |
82 | const {code: output } = generate(ast, {
83 | comments: false
84 | });
85 |
86 | console.log(output);
87 | }
88 |
89 | function add_node() {
90 | // Add: const b = a + 1;
91 | const code = "const a = 1;";
92 | let ast = parse(code);
93 | traverse(ast, {
94 | VariableDeclaration(path) {
95 | let init = types.binaryExpression(
96 | "+",
97 | types.identifier("a"),
98 | types.numericLiteral(1)
99 | );
100 | let declarator = types.variableDeclarator(types.identifier("b"), init);
101 | let declaration = types.variableDeclaration("const", [declarator]);
102 | path.insertAfter(declaration);
103 | path.stop();
104 | },
105 | });
106 | const output = generate(ast, {
107 | retainLines: true,
108 | }).code;
109 | console.log(output);
110 | }
111 |
112 | add_node()
113 |
114 |
--------------------------------------------------------------------------------
/src/ch12/airtest_script.air/airtest_script.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: airtest_script.py
6 | @time: 2022/1/16 2:45
7 | @project: python3-web-spider-learning
8 | @desc: 12.7 Practical App scraping based on Airtest (P586)
9 | """
10 | import os
11 |
12 | from airtest.core.api import stop_app, start_app, keyevent, swipe, connect_device
13 | import json
14 | from loguru import logger
15 | from poco.drivers.android.uiautomation import AndroidUiautomationPoco
16 |
17 | poco = AndroidUiautomationPoco(use_airtest_input=True, screenshot_each_action=False)
18 | window_width, window_height = poco.get_screen_size()
19 | PACKAGE_NAME = "com.goldze.mvvmhabit"
20 | TOTAL_NUMBER = 100
21 |
22 |
23 | def scrape_index():
24 | elements = poco(f'{PACKAGE_NAME}:id/item')
25 | elements.wait_for_appearance()
26 | return elements
27 |
28 |
29 | def scrape_detail(element):
30 | logger.debug(f'scraping {element}')
31 | element.click()
32 | panel = poco(f'{PACKAGE_NAME}:id/content')
33 | panel.wait_for_appearance()
34 | title = poco(f'{PACKAGE_NAME}:id/title').attr('text')
35 | categories = poco(f'{PACKAGE_NAME}:id/categories_value').attr('text')
36 | score = poco(f'{PACKAGE_NAME}:id/score_value').attr('text')
37 | published_at = poco(f'{PACKAGE_NAME}:id/published_at_value').attr('text')
38 | drama = poco(f'{PACKAGE_NAME}:id/drama_value').attr('text')
39 | keyevent('BACK')
40 | return {
41 | 'title': title,
42 | 'categories': categories,
43 | 'score': score,
44 | 'published_at': published_at,
45 | 'drama': drama,
46 | }
47 |
48 |
49 | def scroll_up():
50 | """
51 | Swipe up
52 | """
53 | swipe((window_width * 0.5, window_height * 0.8),
54 | vector=[0, -0.5], duration=1)
55 |
56 |
57 | OUTPUT_FOLDER = 'movie'
58 | os.path.exists(OUTPUT_FOLDER) or os.makedirs(OUTPUT_FOLDER)
59 |
60 |
61 | def save_data(element_data):
62 | """
63 | Save the data
64 | """
65 | with open(f'{OUTPUT_FOLDER}/{element_data.get("title")}.json', 'w', encoding='utf-8') as f:
66 | f.write(json.dumps(element_data, ensure_ascii=False, indent=2))
67 | logger.debug(f'saved as file {element_data.get("title")}.json')
68 |
69 |
70 | def main():
71 | scraped_titles = []
72 | while len(scraped_titles) < TOTAL_NUMBER:
73 | elements = scrape_index()
74 | for element in elements:
75 | element_title = element.offspring(f'{PACKAGE_NAME}:id/tv_title')
76 | if not element_title.exists():
77 | continue
78 | title = element_title.attr('text')
79 | logger.debug(f'get title {title}')
80 | if title in scraped_titles:
81 | continue
82 | _, element_y = element.get_position()
83 | if element_y > 0.7:
84 | scroll_up()
85 | element_data = scrape_detail(element)
86 | scraped_titles.append(title)
87 | logger.debug(f'scraped data {element_data}')
88 |
89 |
90 | if __name__ == '__main__':
91 | connect_device("Android://127.0.0.1:5037/192.168.1.26:5555")
92 | stop_app(PACKAGE_NAME)
93 | start_app(PACKAGE_NAME)
94 | main()
95 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/setting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: setting.py
6 | @time: 2022/1/12 10:21
7 | @project: python3-web-spider-learning
8 | @desc: Configuration of Redis and the scheduled-task intervals
9 | """
10 |
11 | import platform
12 | from os.path import dirname, abspath, join
13 | from environs import Env
14 | from loguru import logger
15 | from utils import parse_redis_connection_string
16 |
17 | env = Env()
18 | env.read_env()
19 |
20 | # definition of flags
21 | IS_WINDOWS = platform.system().lower() == 'windows'
22 |
23 | # definition of dirs
24 | ROOT_DIR = dirname(dirname(abspath(__file__)))
25 | LOG_DIR = join(ROOT_DIR, env.str('LOG_DIR', 'logs'))
26 |
27 | # definition of environments
28 | DEV_MODE, TEST_MODE, PROD_MODE = 'dev', 'test', 'prod'
29 | APP_ENV = env.str('APP_ENV', DEV_MODE).lower()
30 | APP_DEBUG = env.bool('APP_DEBUG', True if APP_ENV == DEV_MODE else False)
31 | APP_DEV = IS_DEV = APP_ENV == DEV_MODE
32 | APP_PROD = IS_PROD = APP_ENV == PROD_MODE
33 | APP_TEST = IS_TEST = APP_ENV == TEST_MODE
34 |
35 | # redis host
36 | REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1')
37 | # redis port
38 | REDIS_PORT = env.int('REDIS_PORT', 6379)
39 | # redis password, if no password, set it to None
40 | REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)
41 | # redis db, if no choice, set it to 0
42 | REDIS_DB = env.int('REDIS_DB', 0)
43 | # redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0
44 | REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None)
45 |
46 | if REDIS_CONNECTION_STRING:
47 | REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB = parse_redis_connection_string(REDIS_CONNECTION_STRING)
48 |
49 | # redis hash table key name
50 | REDIS_ACCOUNT_KEY = env.str('REDIS_ACCOUNT_KEY', 'accounts:%s')
51 | REDIS_CREDENTIAL_KEY = env.str('REDIS_CREDENTIAL_KEY', 'credential:%s')
52 |
53 | # integrated generator
54 | GENERATOR_MAP = {
55 | 'antispider6': 'Antispider6Generator'
56 | }
57 |
58 | # integrated tester
59 | TESTER_MAP = {
60 | 'antispider6': 'Antispider6Tester'
61 | }
62 |
63 | # definition of tester cycle, it will test every CYCLE_TESTER seconds
64 | CYCLE_TESTER = env.int('CYCLE_TESTER', 600)
65 | # definition of generator cycle, it will generate credentials every CYCLE_GENERATOR seconds
66 | CYCLE_GENERATOR = env.int('CYCLE_GENERATOR', 600)
67 | GET_TIMEOUT = env.int('GET_TIMEOUT', 10)
68 |
69 | # definition of tester
70 | TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
71 | TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10)
72 | TEST_BATCH = env.int('TEST_BATCH', 20)
73 | # test url
74 | TEST_URL_MAP = {
75 | 'antispider6': 'https://antispider6.scrape.center/'
76 | }
77 |
78 | # definition of api
79 | API_HOST = env.str('API_HOST', '127.0.0.1')
80 | API_PORT = env.int('API_PORT', 6789)
81 | API_THREADED = env.bool('API_THREADED', True)
82 |
83 | # flags of enable
84 | ENABLE_IMPORT_DATA = env.bool('ENABLE_IMPORT_DATA', False)
85 | ENABLE_TESTER = env.bool('ENABLE_TESTER', True)
86 | ENABLE_GENERATOR = env.bool('ENABLE_GENERATOR', True)
87 | ENABLE_SERVER = env.bool('ENABLE_SERVER', True)
88 |
89 | logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week',
90 | retention='20 days')
91 | logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week')
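Since every value above is read through environs, the pool can be configured without code changes via environment variables or a .env file next to the project. A hypothetical .env example (the keys match the env.* lookups above; the values are placeholders only):

REDIS_HOST=127.0.0.1
REDIS_PORT=6379
APP_ENV=prod
CYCLE_TESTER=600
CYCLE_GENERATOR=600
API_PORT=6789
ENABLE_TESTER=true
ENABLE_GENERATOR=true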
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/parse_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: parse_demo.py
6 | @time: 2021/12/29 16:49
7 | @project: python3-web-spider-learning
8 | @desc: parse module examples (P40~P44)
9 | """
10 | from urllib.parse import urlparse, urlunparse, urlsplit, urlunsplit, urljoin, urlencode, parse_qs, parse_qsl, quote, \
11 | unquote
12 |
13 |
14 | class UrlLibDemo:
15 | def __init__(self):
16 | self.base_url = None
17 | self.scheme = ''
18 | self.allow_fragments = True
19 | self.data = None
20 |
21 | def print_urlparse(self):
22 | # Parse a URL
23 | result = urlparse(self.base_url, scheme=self.scheme, allow_fragments=self.allow_fragments)
24 | print(type(result))
25 | print(result)
26 |
27 | def print_urlunparse(self):
28 | # Construct a URL
29 | print(urlunparse(self.data))
30 |
31 | def print_urlsplit(self):
32 | # Split the URL and return its 5 parts
33 | print(urlsplit(self.base_url))
34 |
35 | def print_urlunsplit(self):
36 | # Combine the parts back into a complete URL
37 | print(urlunsplit(self.data))
38 |
39 | def print_urljoin(self, other_url):
40 | # Use the scheme, netloc and path of base_url to fill in the missing parts of the new link
41 | print(urljoin(self.base_url, other_url))
42 |
43 | def print_urlencode(self, params):
44 | # Serialize the params dict into a GET query string
45 | print(self.base_url + urlencode(params))
46 |
47 | def print_parse_qs(self, query):
48 | # Convert a GET query string back to a dict
49 | print(parse_qs(query))
50 |
51 | def print_parse_qsl(self, query):
52 | # Convert a GET query string back to a list of tuples
53 | print(parse_qsl(query))
54 |
55 | def print_quote(self, keyword):
56 | # Convert the content to URL-encoded format
57 | print(self.base_url + quote(keyword))
58 |
59 | def print_unquote(self):
60 | # Decode the URL
61 | print(unquote(self.base_url))
62 |
63 |
64 | if __name__ == '__main__':
65 | urllib_demo = UrlLibDemo()
66 | # urllib_demo.base_url = 'https://www.baidu.com/index.html#comment'
67 | # urllib_demo.allow_fragments = False
68 | #
69 | # urllib_demo.print_urlparse()
70 |
71 | # urllib_demo.data = ['https', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
72 | # urllib_demo.print_urlunparse()
73 |
74 | # urllib_demo.base_url = 'https://www.baidu.com/index.html;user?id=5#comment'
75 | # urllib_demo.print_urlsplit()
76 |
77 | # urllib_demo.data = ['https', 'www.baidu.com', 'index.html', 'a=6', 'comment']
78 | # urllib_demo.print_urlunsplit()
79 |
80 | # urllib_demo.base_url = 'https://www.baidu.com'
81 | # urllib_demo.print_urljoin('FAQ.html')
82 |
83 | # urllib_demo.base_url = 'https://www.baidu.com?'
84 | # params = {
85 | # 'name': 'germey',
86 | # 'age': 25
87 | # }
88 | # urllib_demo.print_urlencode(params)
89 |
90 | # query = 'name=germey&age=25'
91 | # urllib_demo.print_parse_qs(query)
92 |
93 | # urllib_demo.print_parse_qsl(query)
94 |
95 | # keyword = '壁纸'
96 | # urllib_demo.base_url = 'https://www.baidu.com/s?wd='
97 | # urllib_demo.print_quote(keyword)
98 |
99 | urllib_demo.base_url = 'https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
100 | urllib_demo.print_unquote()
101 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | import pymongo
9 | from elasticsearch import Elasticsearch
10 | from scrapy import Request
11 | from scrapy.exceptions import DropItem
12 | from scrapy.pipelines.images import ImagesPipeline
13 |
14 |
15 | class MongoDBPipeline:
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | cls.connect_string = crawler.settings.get('MONGODB_CONNECTION_STRING')
19 | cls.database = crawler.settings.get('MONGODB_DATABASE')
20 | cls.collection = crawler.settings.get('MONGODB_COLLECTION')
21 | return cls()
22 |
23 | def open_spider(self, spider):
24 | self.client = pymongo.MongoClient(self.connect_string)
25 | self.db = self.client[self.database]
26 |
27 | def process_item(self, item, spider):
28 | collection = self.db[self.collection]
29 | collection.update_one({
30 | 'name': item['name']
31 | }, {
32 | '$set': dict(item)
33 | }, True)
34 | return item
35 |
36 | def close_spider(self, spider):
37 | self.client.close()
38 |
39 |
40 | class ElasticsearchPipeline:
41 | @classmethod
42 | def from_crawler(cls, crawler):
43 | cls.connection_string = crawler.settings.get('ELASTICSEARCH_CONNECTION_STRING')
44 | cls.index = crawler.settings.get('ELASTICSEARCH_INDEX')
45 | return cls()
46 |
47 | def open_spider(self, spider):
48 | self.conn = Elasticsearch([self.connection_string])
49 | if not self.conn.indices.exists(index=self.index):
50 | self.conn.indices.create(index=self.index)
51 |
52 | def process_item(self, item, spider):
53 | self.conn.index(index=self.index, body=dict(item), id=hash(item['name']))
54 | return item
55 |
56 | def close_spider(self, spider):
57 | self.conn.transport.close()
58 |
59 |
60 | class ImagePipeline(ImagesPipeline):
61 | def file_path(self, request, response=None, info=None, *, item=None):
62 | movie = request.meta['movie']
63 | type = request.meta['type']
64 | name = request.meta['name']
65 | file_name = f'{movie}/{type}/{name}.jpg'
66 | return file_name
67 |
68 | def item_completed(self, results, item, info):
69 | image_paths = [x['path'] for ok, x in results if ok]
70 | if not image_paths:
71 | raise DropItem('Image Downloaded Failed')
72 | return item
73 |
74 | def get_media_requests(self, item, info):
75 | for director in item['directors']:
76 | director_name = director['name']
77 | director_image = director['image']
78 | yield Request(director_image, meta={
79 | 'name': director_name,
80 | 'type': 'director',
81 | 'movie': item['name']
82 | })
83 |
84 | for actor in item['actors']:
85 | actor_name = actor['name']
86 | actor_image = actor['image']
87 | yield Request(actor_image, meta={
88 | 'name': actor_name,
89 | 'type': 'actor',
90 | 'movie': item['name']
91 | })
92 |
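These pipelines take effect only when they are enabled and configured in the project's settings.py. An illustrative fragment is sketched below; the priorities, image folder and connection values are example choices rather than the project's actual configuration, while the setting names themselves are the ones read in from_crawler above (plus IMAGES_STORE, which ImagesPipeline subclasses require):

ITEM_PIPELINES = {
    'scrapyitempipelinedemo.pipelines.ImagePipeline': 300,
    'scrapyitempipelinedemo.pipelines.MongoDBPipeline': 500,
    'scrapyitempipelinedemo.pipelines.ElasticsearchPipeline': 600,
}
IMAGES_STORE = './images'
MONGODB_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGODB_DATABASE = 'movies'
MONGODB_COLLECTION = 'movies'
ELASTICSEARCH_CONNECTION_STRING = 'http://localhost:9200'
ELASTICSEARCH_INDEX = 'movies'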
--------------------------------------------------------------------------------
/src/ch02/regx_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: regx_demo.py
6 | @time: 2022/1/4 9:33
7 | @project: python3-web-spider-learning
8 | @desc: 2.3 Regular expressions (P66~P73)
9 | """
10 | import re
11 |
12 | html = '''<div id="songs-list">
13 | <h2 class="title">经典老歌</h2>
14 | <p class="introduction">
15 | 经典老歌列表
16 | </p>
17 | <ul id="list" class="list-group">
18 | <li data-view="2">一路上有你</li>
19 | <li data-view="7">
20 | <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
21 | </li>
22 | <li data-view="4" class="active">
23 | <a href="/3.mp3" singer="齐秦">往事随风</a>
24 | </li>
25 | <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
26 | <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
27 | <li data-view="5">
28 | <a href="/6.mp3" singer="邓丽君">但愿人长久</a>
29 | </li>
30 | </ul>
31 | </div>'''
32 |
33 |
34 | def regex_match():
35 | content = 'Hello 123 4567 World_This is a Regex Demo'
36 | print(len(content))
37 | result = re.match('^Hello\s\d\d\d\s\d{4}\s\w{10}', content)
38 | print(result)
39 | print(result.group())
40 | print(result.span())
41 |
42 |
43 | def match_object():
44 | # Match a target with a capturing group
45 | content = 'Hello 1234567 World_This is a Regex Demo'
46 | result = re.match('^Hello\s(\d+)\sWorld', content)
47 | print(result)
48 | print(result.group())
49 | print(result.group(1))
50 | print(result.span())
51 |
52 |
53 | def common_match():
54 | # Generic matching
55 | content = 'Hello 123 4567 World_This is a Regex Demo'
56 | result = re.match('^Hello.*Demo$', content)
57 | print(result)
58 | print(result.group())
59 | print(result.span())
60 |
61 |
62 | def greedy_match():
63 | # Greedy vs. non-greedy matching
64 | content = 'Hello 123 4567 World_This is a Regex Demo'
65 | result = re.match('^He.*?(\d+).*Demo$', content)
66 | print(result)
67 | print(result.group())
68 | print(result.span())
69 |
70 |
71 | def match_with_modifier():
72 | # Using modifiers
73 | content = '''Hello 1234567 World_This
74 | is a Regex Demo'''
75 | result = re.match('^He.*?(\d+).*?Demo$', content, re.S)
76 | print(result.group(1))
77 |
78 |
79 | def transferred_match():
80 | # Escaping special characters
81 | content = '(百度)www.baidu.com'
82 | result = re.match('\(百度\)www\.baidu\.com', content)
83 | print(result)
84 |
85 |
86 | def search_match():
87 | regx = '<li.*?active.*?singer="(.*?)">(.*?)</a>'
88 | result = re.search(regx, html, re.S)
89 | if result:
90 | print(result.group(1), result.group(2))
91 |
92 |
93 | def findall_match():
94 | regx = '<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>'
95 | results = re.findall(regx, html, re.S)
96 | print(results)
97 | print(type(results))
98 | for result in results:
99 | print(result)
100 | print(result[0], result[1], result[2])
101 |
102 |
103 | def sub_match():
104 | # Substitution
105 | content = '54aK54yr5oiR54ix5L2g'
106 | content = re.sub('\d+', '', content)
107 | print(content)
108 |
109 |
110 | def sub_html_match():
111 | content = re.sub('<a.*?>|</a>', '', html)
112 | print(content)
113 | results = re.findall('<li.*?>(.*?)</li>', content, re.S)
114 | for result in results:
115 | print(result.strip())
116 |
117 |
118 | if __name__ == '__main__':
119 | sub_html_match()
120 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapyspiderdemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapyspiderdemo'
11 |
12 | SPIDER_MODULES = ['scrapyspiderdemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapyspiderdemo.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'scrapyspiderdemo (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = True
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | #CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | #DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | #CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | #COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | #TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | #DEFAULT_REQUEST_HEADERS = {
41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 | # 'Accept-Language': 'en',
43 | #}
44 |
45 | # Enable or disable spider middlewares
46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47 | #SPIDER_MIDDLEWARES = {
48 | # 'scrapyspiderdemo.middlewares.ScrapyspiderdemoSpiderMiddleware': 543,
49 | #}
50 |
51 | # Enable or disable downloader middlewares
52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53 | #DOWNLOADER_MIDDLEWARES = {
54 | # 'scrapyspiderdemo.middlewares.ScrapyspiderdemoDownloaderMiddleware': 543,
55 | #}
56 |
57 | # Enable or disable extensions
58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
59 | #EXTENSIONS = {
60 | # 'scrapy.extensions.telnet.TelnetConsole': None,
61 | #}
62 |
63 | # Configure item pipelines
64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65 | #ITEM_PIPELINES = {
66 | # 'scrapyspiderdemo.pipelines.ScrapyspiderdemoPipeline': 300,
67 | #}
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71 | #AUTOTHROTTLE_ENABLED = True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY = 5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY = 60
76 | # The average number of requests Scrapy should be sending in parallel to
77 | # each remote server
78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG = False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED = True
85 | #HTTPCACHE_EXPIRATION_SECS = 0
86 | #HTTPCACHE_DIR = 'httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES = []
88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapyseleniumdemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapyseleniumdemo'
11 |
12 | SPIDER_MODULES = ['scrapyseleniumdemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapyseleniumdemo.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'scrapyseleniumdemo (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | #CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | #DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | #CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | #COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | #TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | #DEFAULT_REQUEST_HEADERS = {
41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 | # 'Accept-Language': 'en',
43 | #}
44 |
45 | # Enable or disable spider middlewares
46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47 | #SPIDER_MIDDLEWARES = {
48 | # 'scrapyseleniumdemo.middlewares.ScrapyseleniumdemoSpiderMiddleware': 543,
49 | #}
50 |
51 | # Enable or disable downloader middlewares
52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53 | DOWNLOADER_MIDDLEWARES = {
54 | 'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware': 543,
55 | }
56 |
57 | # Enable or disable extensions
58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
59 | #EXTENSIONS = {
60 | # 'scrapy.extensions.telnet.TelnetConsole': None,
61 | #}
62 |
63 | # Configure item pipelines
64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65 | #ITEM_PIPELINES = {
66 | # 'scrapyseleniumdemo.pipelines.ScrapyseleniumdemoPipeline': 300,
67 | #}
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71 | #AUTOTHROTTLE_ENABLED = True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY = 5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY = 60
76 | # The average number of requests Scrapy should be sending in parallel to
77 | # each remote server
78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG = False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED = True
85 | #HTTPCACHE_EXPIRATION_SECS = 0
86 | #HTTPCACHE_DIR = 'httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES = []
88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
90 | CONCURRENT_REQUESTS = 6
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapyuniversaldemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapyuniversaldemo'
11 |
12 | SPIDER_MODULES = ['scrapyuniversaldemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapyuniversaldemo.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'scrapyuniversaldemo (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = True
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | #CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | #DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | #CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | #COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | #TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | #DEFAULT_REQUEST_HEADERS = {
41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 | # 'Accept-Language': 'en',
43 | #}
44 |
45 | # Enable or disable spider middlewares
46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47 | #SPIDER_MIDDLEWARES = {
48 | # 'scrapyuniversaldemo.middlewares.ScrapyuniversaldemoSpiderMiddleware': 543,
49 | #}
50 |
51 | # Enable or disable downloader middlewares
52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53 | #DOWNLOADER_MIDDLEWARES = {
54 | # 'scrapyuniversaldemo.middlewares.ScrapyuniversaldemoDownloaderMiddleware': 543,
55 | #}
56 |
57 | # Enable or disable extensions
58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
59 | #EXTENSIONS = {
60 | # 'scrapy.extensions.telnet.TelnetConsole': None,
61 | #}
62 |
63 | # Configure item pipelines
64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65 | #ITEM_PIPELINES = {
66 | # 'scrapyuniversaldemo.pipelines.ScrapyuniversaldemoPipeline': 300,
67 | #}
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71 | #AUTOTHROTTLE_ENABLED = True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY = 5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY = 60
76 | # The average number of requests Scrapy should be sending in parallel to
77 | # each remote server
78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG = False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED = True
85 | #HTTPCACHE_EXPIRATION_SECS = 0
86 | #HTTPCACHE_DIR = 'httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES = []
88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapypyppeteerdemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapypyppeteerdemo'
11 |
12 | SPIDER_MODULES = ['scrapypyppeteerdemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapypyppeteerdemo.spiders'
14 |
15 | TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | #USER_AGENT = 'scrapypyppeteerdemo (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | CONCURRENT_REQUESTS = 3
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | #DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'scrapypyppeteerdemo.middlewares.ScrapypyppeteerdemoSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54 | DOWNLOADER_MIDDLEWARES = {
55 | 'scrapypyppeteerdemo.middlewares.PyppeteerMiddleware': 543,
56 | }
57 |
58 | # Enable or disable extensions
59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
66 | #ITEM_PIPELINES = {
67 | # 'scrapypyppeteerdemo.pipelines.ScrapypyppeteerdemoPipeline': 300,
68 | #}
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
--------------------------------------------------------------------------------