└── src
├── ch13
├── AndServerTest
│ ├── app
│ │ ├── .gitignore
│ │ ├── src
│ │ │ ├── main
│ │ │ │ ├── res
│ │ │ │ │ ├── mipmap-hdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-mdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-xhdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-xxhdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-xxxhdpi
│ │ │ │ │ │ ├── ic_launcher.webp
│ │ │ │ │ │ └── ic_launcher_round.webp
│ │ │ │ │ ├── mipmap-anydpi-v26
│ │ │ │ │ │ ├── ic_launcher.xml
│ │ │ │ │ │ └── ic_launcher_round.xml
│ │ │ │ │ ├── values
│ │ │ │ │ │ ├── strings.xml
│ │ │ │ │ │ ├── colors.xml
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ ├── values-night
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ ├── layout
│ │ │ │ │ │ └── activity_main.xml
│ │ │ │ │ └── drawable-v24
│ │ │ │ │ │ └── ic_launcher_foreground.xml
│ │ │ │ ├── java
│ │ │ │ │ └── com
│ │ │ │ │ │ ├── goldze
│ │ │ │ │ │ └── mvvmhabit
│ │ │ │ │ │ │ └── utils
│ │ │ │ │ │ │ └── NativeUtils.java
│ │ │ │ │ │ └── germey
│ │ │ │ │ │ └── andservertest
│ │ │ │ │ │ ├── AppController.java
│ │ │ │ │ │ └── MainActivity.java
│ │ │ │ └── AndroidManifest.xml
│ │ │ ├── test
│ │ │ │ └── java
│ │ │ │ │ └── com
│ │ │ │ │ └── germey
│ │ │ │ │ └── andservertest
│ │ │ │ │ └── ExampleUnitTest.java
│ │ │ └── androidTest
│ │ │ │ └── java
│ │ │ │ └── com
│ │ │ │ └── germey
│ │ │ │ └── andservertest
│ │ │ │ └── ExampleInstrumentedTest.java
│ │ ├── proguard-rules.pro
│ │ └── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── settings.gradle
│ ├── build.gradle
│ ├── gradle.properties
│ └── gradlew.bat
├── files
│ ├── frida_appbasic1.js
│ ├── frida_rpc_app9.js
│ └── frida_appbasic2.js
├── frida_appbasic1_demo.py
├── frida_appbasic2_demo.py
├── andserver_demo.py
├── jeb_demo.py
├── frida_rpc_demo.py
└── ida_demo.py
├── ch15
├── scrapytutorial
│ ├── scrapytutorial
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── quotes.py
│ │ ├── items.py
│ │ ├── extensions.py
│ │ └── pipelines.py
│ ├── run.py
│ ├── scrapy.cfg
│ └── server.py
├── scrapyseleniumdemo
│ ├── scrapyseleniumdemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── book.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ └── settings.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapyspiderdemo
│ ├── scrapyspiderdemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── httpbin.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ └── settings.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapypyppeteerdemo
│ ├── scrapypyppeteerdemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── book.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ └── settings.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapyuniversaldemo
│ ├── scrapyuniversaldemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ ├── movie.py
│ │ │ └── universal.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ ├── utils.py
│ │ ├── loaders.py
│ │ ├── configs
│ │ │ └── movie.json
│ │ └── settings.py
│ ├── scrapy.cfg
│ └── run.py
├── scrapyitempipelinedemo
│ ├── scrapyitempipelinedemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── scrape.py
│ │ ├── items.py
│ │ └── pipelines.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapyspidermiddlewaredemo
│ ├── scrapyspidermiddlewaredemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── httpbin.py
│ │ ├── items.py
│ │ └── pipelines.py
│ ├── run.py
│ └── scrapy.cfg
├── scrapydownloadermiddlewaredemo
│ ├── scrapydownloadermiddlewaredemo
│ │ ├── __init__.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── httpbin.py
│ │ ├── items.py
│ │ └── pipelines.py
│ ├── run.py
│ └── scrapy.cfg
├── scrape_selector_demo.py
└── scrape_processor_demo.py
├── ch11
├── learn-ast
│ ├── .babelrc
│ ├── codes
│ │ ├── code3.js
│ │ ├── code2.js
│ │ ├── code1.js
│ │ ├── code5.js
│ │ └── code4.js
│ ├── package.json
│ └── basic
│ │ ├── basic1.js
│ │ └── basic2.js
├── nodejs_demo
│ ├── package.json
│ ├── nodejs_client.py
│ ├── nodejs_main.js
│ └── nodejs_server.js
├── files
│ └── Wasm.wasm
├── execjs_demo.py
├── pywasm_scrape_demo.py
├── wasmer_scrape_demo.py
├── execjs_web_demo.py
└── js_scrape_practice.py
├── ch04
├── files
│ ├── data.csv
│ └── movies.txt
├── rabbitmq_oper_demo
│ ├── scrape_producer.py
│ ├── scrape_consume.py
│ ├── consumer.py
│ └── producer.py
├── text_oper_demo.py
├── csv_oper_demo.py
├── mongodb_demo.py
└── elasticsearch_oper_demo.py
├── ch02
├── files
│ ├── favicon.ico
│ ├── mozilla_cookie.txt
│ └── lwp_cookie.txt
├── urllib_demo
│ ├── robotparser_demo.py
│ ├── request_demo.py
│ ├── request_hander_demo.py
│ └── parse_demo.py
├── httpx_demo.py
├── requests_demo
│ ├── advanced_use.py
│ └── requests_demo.py
└── regx_demo.py
├── ch08
├── files
│ └── slide_captcha.png
├── tesserocr_demo.py
└── opencv_demo.py
├── ch07
├── selenium_demo
│ ├── files
│ │ └── preview.png
│ ├── back_forward.py
│ ├── cookie_oper.py
│ ├── tab_oper.py
│ ├── headless_mode.py
│ ├── exception_handle.py
│ ├── node_interaction.py
│ ├── action_chain.py
│ ├── anti_shield.py
│ ├── switch_frame.py
│ ├── node_selector.py
│ ├── node_info.py
│ ├── simple_demo.py
│ └── delay_wait.py
├── pyppeteer_demo
│ ├── files
│ │ ├── example2.png
│ │ └── eval_example.png
│ ├── dev_mode.py
│ ├── incognito_mode.py
│ ├── prevent_detect.py
│ └── simple_demo.py
├── playwright_demo
│ ├── files
│ │ ├── np_picture.png
│ │ ├── browser-iphone.png
│ │ ├── screenshot-webkit.png
│ │ ├── screenshot-chromium.png
│ │ └── screenshot-firefox.png
│ ├── mobile_web.py
│ ├── simple_demo.py
│ └── event_listen.py
├── css_locate_scrape.py
└── font_scrape.py
├── ch01
└── test.html
├── ch03
├── files
│ └── test.html
└── parsel_demo.py
├── ch10
├── account_pool
│ ├── exceptions.py
│ ├── utils.py
│ ├── server.py
│ ├── run_account_pool.py
│ ├── storages_redis.py
│ ├── tester.py
│ ├── generator.py
│ └── setting.py
├── jwt_simulate_login.py
├── antispider_scrape_with_account_pool.py
└── session_cookie_simulate_login.py
├── ch06
├── coroutine_demo
│ ├── coroutine_simple_demo.py
│ ├── coroutine_task1.py
│ ├── coroutine_task2.py
│ ├── multi_task_coroutine.py
│ ├── bing_callback.py
│ └── coroutine_await_aiohttp.py
├── aiohttp_demo
│ ├── timeout_demo.py
│ ├── post_request.py
│ ├── url_params.py
│ ├── simple_demo.py
│ ├── response_demo.py
│ └── concurrency_demo.py
└── aiohttp_scrape_demo.py
├── ch14
└── ai_extract.md
├── ch12
├── appium_demo.py
└── airtest_script.air
│ └── airtest_script.py
└── ch05
└── scrape_ajax.py
/src/ch13/AndServerTest/app/.gitignore:
--------------------------------------------------------------------------------
1 | /build
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "presets": [
3 | "@babel/preset-env"
4 | ]
5 | }
--------------------------------------------------------------------------------
/src/ch04/files/data.csv:
--------------------------------------------------------------------------------
1 | id,name,age
2 | 10001,Mike,20
3 | 10002,Bob,22
4 | 10003,Jordan,21
5 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "dependencies": {
3 | "express": "^4.17.2"
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/src/ch11/files/Wasm.wasm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch11/files/Wasm.wasm
--------------------------------------------------------------------------------
/src/ch02/files/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch02/files/favicon.ico
--------------------------------------------------------------------------------
/src/ch08/files/slide_captcha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch08/files/slide_captcha.png
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/files/preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/selenium_demo/files/preview.png
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/files/example2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/pyppeteer_demo/files/example2.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/np_picture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/np_picture.png
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/files/eval_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/pyppeteer_demo/files/eval_example.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/browser-iphone.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/browser-iphone.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/screenshot-webkit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-webkit.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/screenshot-chromium.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-chromium.png
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/files/screenshot-firefox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-firefox.png
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code3.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code3.js
4 | * @time: 11:18
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const strings = ["\"\x68\x65\x6c\x6c\x6f\"", "\"\x77\x6f\x72\x6c\x64\""];
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code2.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code2.js
4 | * @time: 10:59
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const a = ![];
10 | const b = "abc" == "bcd"
11 | const c = (1 << 3) | 2;
12 | const d = parseInt("5" + "0")
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Mon Jan 17 20:43:10 CST 2022
2 | distributionBase=GRADLE_USER_HOME
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0.2-bin.zip
4 | distributionPath=wrapper/dists
5 | zipStorePath=wrapper/dists
6 | zipStoreBase=GRADLE_USER_HOME
7 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/java/com/goldze/mvvmhabit/utils/NativeUtils.java:
--------------------------------------------------------------------------------
1 | package com.goldze.mvvmhabit.utils;
2 |
3 | public class NativeUtils {
4 |
5 | static {
6 | System.loadLibrary("native");
7 | }
8 |
9 | public static native String encrypt(String str, int offset);
10 | }
11 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 13:48
7 | @project: python3-web-spider-learning
8 | @desc: 15.2 Getting Started with Scrapy (P743)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'quotes'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.9 Integrating Scrapy with Selenium
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'book'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/20 9:19
7 | @project: python3-web-spider-learning
8 | @desc: 15.11 Integrating Scrapy with Pyppeteer (P807)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'book'])
13 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code1.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code1.js
4 | * @time: 2022-01-14 09:45:29
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const a = 3;
10 | let string = "hello";
11 | for (let i = 0; i < a; i++) {
12 | string += "world";
13 | }
14 | console.log("string", string)
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.7 Using Item Pipelines (P781)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'scrape'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.4 Using Spiders (P759)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'httpbin', '--nolog'])
13 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.6 Using Spider Middleware (P775)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'httpbin'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapytutorial.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapytutorial
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class QuoteItem(scrapy.Item):
10 | text = scrapy.Field()
11 | author = scrapy.Field()
12 | tags = scrapy.Field()
13 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/spiders/httpbin.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 |
4 | class HttpbinSpider(scrapy.Spider):
5 | name = 'httpbin'
6 | allowed_domains = ['www.httpbin.org']
7 | start_urls = ['https://www.httpbin.org/get']
8 |
9 | def parse(self, response):
10 | print(response.text)
11 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyspiderdemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyspiderdemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyseleniumdemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyseleniumdemo
12 |
--------------------------------------------------------------------------------
/src/ch01/test.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 | <meta charset="UTF-8">
5 | <title>This is a Demo</title>
6 | </head>
7 | <body>
8 | <div id="container">
9 | <div class="wrapper">
10 | <h2 class="title">Hello World</h2>
11 | <p class="text">Hello, this is a paragraph.</p>
12 | </div>
13 | </div>
14 | </body>
15 | </html>
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/settings.gradle:
--------------------------------------------------------------------------------
1 | dependencyResolutionManagement {
2 | repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
3 | repositories {
4 | google()
5 | mavenCentral()
6 | jcenter() // Warning: this repository is going to shut down soon
7 | }
8 | }
9 | rootProject.name = "AndServerTest"
10 | include ':app'
11 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/19 14:55
7 | @project: python3-web-spider-learning
8 | @desc: 15.5 Using Downloader Middleware (P770)
9 | """
10 | from scrapy.cmdline import execute
11 |
12 | execute(['scrapy', 'crawl', 'httpbin', '--nolog'])
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapypyppeteerdemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapypyppeteerdemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyuniversaldemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyuniversaldemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyitempipelinedemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyitempipelinedemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class ScrapyspiderdemoItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | pass
13 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values/strings.xml:
--------------------------------------------------------------------------------
1 |
2 | AndServerTest
3 | Start Server
4 | Stop Server
5 | The server is started
6 | The server is stopped
7 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapyspidermiddlewaredemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapyspidermiddlewaredemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapydownloadermiddlewaredemo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapydownloadermiddlewaredemo
12 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | from scrapy import Item, Field
7 |
8 |
9 | class BookItem(Item):
10 | name = Field()
11 | tags = Field()
12 | score = Field()
13 | cover = Field()
14 | price = Field()
15 |
--------------------------------------------------------------------------------
/src/ch03/files/test.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class DemoItem(scrapy.Item):
10 | origin = scrapy.Field()
11 | headers = scrapy.Field()
12 | args = scrapy.Field()
13 | url = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class ScrapydownloadermiddlewaredemoItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | pass
13 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 | from scrapy import Item, Field
8 |
9 |
10 | class BookItem(Item):
11 | name = Field()
12 | tags = Field()
13 | score = Field()
14 | cover = Field()
15 | price = Field()
16 |
--------------------------------------------------------------------------------
/src/ch02/files/mozilla_cookie.txt:
--------------------------------------------------------------------------------
1 | # Netscape HTTP Cookie File
2 | # http://curl.haxx.se/rfc/cookie_spec.html
3 | # This is a generated file! Do not edit.
4 |
5 | .baidu.com TRUE / FALSE 1672303248 BAIDUID 4DF8C4AA1B53D13A4C0A711C60505CAB:FG=1
6 | .baidu.com TRUE / FALSE 3788250895 BIDUPSID 4DF8C4AA1B53D13A3F8EC394C3CC9551
7 | .baidu.com TRUE / FALSE 3788250895 PSTM 1640767247
8 | www.baidu.com FALSE / FALSE 1640767548 BD_NOT_HTTPS 1
9 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/exceptions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: exceptions.py
6 | @time: 2022/1/12 10:36
7 | @project: python3-web-spider-learning
8 | @desc: Custom exceptions
9 | """
10 |
11 |
12 | class InitException(Exception):
13 | def __str__(self):
14 | """
15 | init error
16 | :return:
17 | """
18 | return repr('init failed')
19 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "learn-ast",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1"
8 | },
9 | "author": "",
10 | "license": "ISC",
11 | "devDependencies": {
12 | "@babel/cli": "^7.16.8",
13 | "@babel/core": "^7.16.7",
14 | "@babel/preset-env": "^7.16.8"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 | from scrapy import Item, Field
8 |
9 |
10 | class MovieItem(Item):
11 | name = Field()
12 | cover = Field()
13 | categories = Field()
14 | published_at = Field()
15 | drama = Field()
16 | score = Field()
17 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values/colors.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <resources>
3 | <color name="purple_200">#FFBB86FC</color>
4 | <color name="purple_500">#FF6200EE</color>
5 | <color name="purple_700">#FF3700B3</color>
6 | <color name="teal_200">#FF03DAC5</color>
7 | <color name="teal_700">#FF018786</color>
8 | <color name="black">#FF000000</color>
9 | <color name="white">#FFFFFFFF</color>
10 | </resources>
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class MovieItem(scrapy.Item):
10 | name = scrapy.Field()
11 | categories = scrapy.Field()
12 | score = scrapy.Field()
13 | drama = scrapy.Field()
14 | directors = scrapy.Field()
15 | actors = scrapy.Field()
16 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyspiderdemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyseleniumdemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapypyppeteerdemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyuniversaldemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code5.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code5.js
4 | * @time: 2022-01-14 11:40
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const s = "3|1|2".split("|");
10 | let x = 0;
11 | while (true) {
12 | switch (s[x++]) {
13 | case "1":
14 | const a = 1;
15 | continue;
16 | case "2":
17 | const b = 3;
18 | continue;
19 | case "3":
20 | const c = 0;
21 | continue;
22 | }
23 | break;
24 | }
--------------------------------------------------------------------------------
/src/ch13/files/frida_appbasic1.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: frida_appbasic1.js
4 | * @time: 2022-01-17 17:21
5 | * @project: python3-web-spider-learning
6 | * @desc: frida Appbasic1 Hook script
7 | */
8 |
9 | Java.perform(() => {
10 | let MainActivity = Java.use('com.germey.appbasic1.MainActivity')
11 | console.log('start hook')
12 | MainActivity.getMessage.implementation = (arg1, arg2) => {
13 | send('Start Hook!')
14 | return '6'
15 | }
16 | })
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapyspidermiddlewaredemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class ScrapydownloadermiddlewaredemoPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: utils.py
6 | @time: 2022/1/20 15:52
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | import json
11 | from os.path import join, dirname, realpath
12 |
13 |
14 | def get_config(name):
15 | path = join(dirname(realpath(__file__)), 'configs', f'{name}.json')
16 | with open(path, 'r', encoding='utf-8') as f:
17 | return json.loads(f.read())
18 |
--------------------------------------------------------------------------------
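get_config resolves a rule file by name from the package's configs directory, so loading the bundled movie rules takes a single call. A minimal usage sketch, assuming the scrapyuniversaldemo package is importable from the project root:

from scrapyuniversaldemo.utils import get_config

# Reads configs/movie.json relative to the scrapyuniversaldemo package
config = get_config('movie')
print(config)
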
/src/ch13/AndServerTest/app/src/test/java/com/germey/andservertest/ExampleUnitTest.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import org.junit.Test;
4 |
5 | import static org.junit.Assert.*;
6 |
7 | /**
8 | * Example local unit test, which will execute on the development machine (host).
9 | *
10 | * @see <a href="http://d.android.com/tools/testing">Testing documentation</a>
11 | */
12 | public class ExampleUnitTest {
13 | @Test
14 | public void addition_isCorrect() {
15 | assertEquals(4, 2 + 2);
16 | }
17 | }
--------------------------------------------------------------------------------
/src/ch13/files/frida_rpc_app9.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: frida_rpc_app9.js
4 | * @time: 20:06
5 | * @project: python3-web-spider-learning
6 | * @desc: frida RPC App9 Hook script
7 | */
8 |
9 | rpc.exports = {
10 | encrypt(string, offset) {
11 | let token = null;
12 | Java.perform(function () {
13 | var util = Java.use("com.goldze.mvvmhabit.utils.NativeUtils").$new();
14 | token = util.encrypt(string, offset)
15 | });
16 | return token;
17 | }
18 | }
--------------------------------------------------------------------------------
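The rpc.exports block above is meant to be driven from Python through frida's RPC bridge. A minimal sketch of such a driver, assuming the target process name 'App9' (the repo's own frida_rpc_demo.py is not shown in this dump, so this is illustrative only):

import frida

CODE = open('files/frida_rpc_app9.js', encoding='utf-8').read()

# Attach to the running app over USB and inject the hook script above
session = frida.get_usb_device().attach('App9')
script = session.create_script(CODE)
script.load()

# Call the 'encrypt' function defined in rpc.exports
token = script.exports.encrypt('hello', 3)
print(token)
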
/src/ch07/selenium_demo/back_forward.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: back_forward.py
6 | @time: 2022/1/7 15:21
7 | @project: python3-web-spider-learning
8 | @desc: Back and forward navigation (P221)
9 | """
10 | import time
11 |
12 | from selenium import webdriver
13 |
14 | browser = webdriver.Chrome()
15 | browser.get('https://www.baidu.com/')
16 | browser.get('https://www.taobao.com/')
17 | browser.get('https://www.python.org')
18 | browser.back()
19 | time.sleep(1)
20 | browser.forward()
21 | browser.close()
22 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/nodejs_client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: nodejs_client.py
6 | @time: 2022/1/13 22:13
7 | @project: python3-web-spider-learning
8 | @desc: Calling a Node.js service from Python (P453)
9 | """
10 |
11 | import requests
12 |
13 | data = {
14 | "name": "凯文-杜兰特",
15 | "image": "durant.png",
16 | "birthday": "1988-09-29",
17 | "height": "208cm",
18 | "weight": "108.9KG"
19 | }
20 |
21 | url = 'http://localhost:3000'
22 | response = requests.post(url, json=data)
23 | print(response.text)
24 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_simple_demo.py
6 | @time: 2022/1/6 17:23
7 | @project: python3-web-spider-learning
8 | @desc: Defining a coroutine (P194)
9 | """
10 | import asyncio
11 |
12 |
13 | async def execute(x):
14 | print('Number:', x)
15 |
16 | coroutine = execute(1)
17 | print('Coroutine:', coroutine)
18 | print('After calling execute')
19 |
20 | loop = asyncio.get_event_loop()
21 | # Register the coroutine object with the event loop
22 | loop.run_until_complete(coroutine)
23 | print('After calling loop')
24 |
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/dev_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: dev_mode.py
6 | @time: 2022/1/10 9:27
7 | @project: python3-web-spider-learning
8 | @desc: Debug mode (P247)
9 | """
10 | import asyncio
11 |
12 | from pyppeteer import launch
13 |
14 |
15 | async def main():
16 | browser = await launch(devtools=True, args=['--disable-infobars'])
17 | page = await browser.newPage()
18 | await page.goto('https://www.baidu.com')
19 | await asyncio.sleep(100)
20 |
21 | asyncio.get_event_loop().run_until_complete(main())
22 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/basic/basic1.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: basic1.js
4 | * @time: 2022-01-14 09:47:06
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | import {parse} from "@babel/parser"
10 | import generate from "@babel/generator"
11 | import fs from "fs"
12 |
13 | const code = fs.readFileSync("../codes/code1.js", "utf-8")
14 | let ast = parse(code)
15 | console.log(ast)
16 | console.log(ast.program.body)
17 |
18 | const {code: output} = generate(ast, {
19 | retainLines: true,
20 | comments: false,
21 | });
22 | console.log(output)
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/loaders.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: loaders.py
6 | @time: 2022/1/20 16:36
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 |
11 | from scrapy.loader import ItemLoader
12 | from itemloaders.processors import TakeFirst, Identity, Compose
13 |
14 |
15 | class MovieItemLoader(ItemLoader):
16 | default_output_processor = TakeFirst()
17 | categories_out = Identity()
18 | score_out = Compose(TakeFirst(), str.strip)
19 | drama_out = Compose(TakeFirst(), str.strip)
--------------------------------------------------------------------------------
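MovieItemLoader only declares output processors; field values still have to be fed in from a spider callback. A minimal sketch of how such a loader is typically used (the XPath expressions below are placeholders for illustration, not the repo's actual rules):

from scrapyuniversaldemo.items import MovieItem
from scrapyuniversaldemo.loaders import MovieItemLoader

def parse_detail(response):
    loader = MovieItemLoader(item=MovieItem(), response=response)
    # Placeholder selectors, for illustration only
    loader.add_xpath('name', '//h2/text()')
    loader.add_xpath('categories', '//div[@class="categories"]//span/text()')
    loader.add_xpath('score', '//p[@class="score"]/text()')
    # The TakeFirst/Identity/Compose output processors run here
    return loader.load_item()
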
/src/ch02/files/lwp_cookie.txt:
--------------------------------------------------------------------------------
1 | #LWP-Cookies-2.0
2 | Set-Cookie3: BAIDUID="658C2C37B45D9239BAC08ECC578950E0:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2022-12-29 08:42:08Z"; comment=bd; version=0
3 | Set-Cookie3: BIDUPSID=658C2C37B45D92392188E29355D808F6; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-01-16 11:56:15Z"; version=0
4 | Set-Cookie3: PSTM=1640767327; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-01-16 11:56:15Z"; version=0
5 | Set-Cookie3: BD_NOT_HTTPS=1; path="/"; domain="www.baidu.com"; path_spec; expires="2021-12-29 08:47:08Z"; version=0
6 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/build.gradle:
--------------------------------------------------------------------------------
1 | // Top-level build file where you can add configuration options common to all sub-projects/modules.
2 | buildscript {
3 | repositories {
4 | google()
5 | mavenCentral()
6 | }
7 | dependencies {
8 | classpath 'com.android.tools.build:gradle:4.1.3'
9 | classpath 'com.yanzhenjie.andserver:plugin:2.1.9'
10 | // NOTE: Do not place your application dependencies here; they belong
11 | // in the individual module build.gradle files
12 | }
13 | }
14 |
15 |
16 | task clean(type: Delete) {
17 | delete rootProject.buildDir
18 | }
--------------------------------------------------------------------------------
/src/ch15/scrape_selector_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_selector_demo.py
6 | @time: 2022/1/19 14:08
7 | @project: python3-web-spider-learning
8 | @desc: 15.3 Using Selectors (P754)
9 | """
10 | from scrapy import Selector
11 |
12 |
13 | def selector_demo():
14 | # Use the Selector directly, without a running crawler
15 | body = '<html><head><title>Hello World</title></head><body></body></html>'
16 | selector = Selector(text=body)
17 | title = selector.xpath('//title/text()').extract_first()
18 | print(title)
19 |
20 |
21 | if __name__ == '__main__':
22 | selector_demo()
23 |
--------------------------------------------------------------------------------
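The same Selector object also supports CSS selectors and regular expressions, which the standalone demo above could exercise with two more lines. A small companion sketch:

from scrapy import Selector

body = '<html><head><title>Hello World</title></head><body></body></html>'
selector = Selector(text=body)
# CSS equivalent of the XPath query in scrape_selector_demo.py
print(selector.css('title::text').extract_first())
# Regex applied to the XPath result
print(selector.xpath('//title/text()').re_first(r'Hello\s(\w+)'))
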
/src/ch07/selenium_demo/cookie_oper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: cookie_oper.py
6 | @time: 2022/1/7 15:28
7 | @project: python3-web-spider-learning
8 | @desc: Cookie operations (P222)
9 | """
10 | from selenium import webdriver
11 |
12 | browser = webdriver.Chrome()
13 | browser.get('https://www.zhihu.com/explore')
14 | print(browser.get_cookies())
15 | browser.add_cookie({'name': 'name',
16 | 'domain': 'www.zhihu.com',
17 | 'value': 'germey'})
18 | print(browser.get_cookies())
19 | browser.delete_all_cookies()
20 | print(browser.get_cookies())
21 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_task1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_task1.py
6 | @time: 2022/1/6 17:31
7 | @project: python3-web-spider-learning
8 | @desc: Using coroutine tasks (P194)
9 | """
10 | import asyncio
11 |
12 |
13 | async def execute(x):
14 | print('Number:', x)
15 | return x
16 |
17 | coroutine = execute(1)
18 | print('Coroutine:', coroutine)
19 | print('After calling execute')
20 |
21 | loop = asyncio.get_event_loop()
22 | task = loop.create_task(coroutine)
23 | print('Task:', task)
24 | loop.run_until_complete(task)
25 | print('Task:', task)
26 | print('After calling loop')
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_task2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_task2.py
6 | @time: 2022/1/6 18:50
7 | @project: python3-web-spider-learning
8 | @desc: Using coroutine tasks (P195)
9 | """
10 | import asyncio
11 |
12 |
13 | async def execute(x):
14 | print('Number:', x)
15 | return x
16 |
17 | coroutine = execute(1)
18 | print('Coroutine:', coroutine)
19 | print('After calling execute')
20 |
21 | task = asyncio.ensure_future(coroutine)
22 | print('Task:', task)
23 | loop = asyncio.get_event_loop()
24 | loop.run_until_complete(task)
25 | print('Task:', task)
26 | print('After calling loop')
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/tab_oper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: tab_oper.py
6 | @time: 2022/1/7 15:32
7 | @project: python3-web-spider-learning
8 | @desc: Tab management (P222)
9 | """
10 | import time
11 |
12 | from selenium import webdriver
13 |
14 | browser = webdriver.Chrome()
15 | browser.get('https://www.baidu.com')
16 | browser.execute_script('window.open()')
17 | print(browser.window_handles)
18 | browser.switch_to.window(browser.window_handles[1])
19 | browser.get('https://www.taobao.com')
20 | time.sleep(1)
21 | browser.switch_to.window(browser.window_handles[0])
22 | browser.get('https://python.org')
--------------------------------------------------------------------------------
/src/ch13/frida_appbasic1_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: frida_appbasic1_demo.py
6 | @time: 2022/1/17 17:20
7 | @project: python3-web-spider-learning
8 | @desc: 13.5 Frida的使用,AppBasic1(P645)
9 | """
10 | import sys
11 |
12 | import frida
13 |
14 | CODE = open('files/frida_appbasic1.js', encoding='utf-8').read()
15 | PROCESS_NAME = 'AppBasic1'
16 |
17 |
18 | def on_message(message, data):
19 | print(message)
20 |
21 |
22 | process = frida.get_usb_device().attach(PROCESS_NAME)
23 | script = process.create_script(CODE)
24 | script.on('message', on_message)
25 | script.load()
26 | sys.stdin.read()
27 |
--------------------------------------------------------------------------------
/src/ch13/frida_appbasic2_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: frida_appbasic2_demo.py
6 | @time: 2022/1/17 17:43
7 | @project: python3-web-spider-learning
8 | @desc: 13.5 Frida的使用,AppBasic2(P648)
9 | """
10 | import sys
11 |
12 | import frida
13 |
14 | CODE = open('files/frida_appbasic2.js', encoding='utf-8').read()
15 | PROCESS_NAME = 'AppBasic2'
16 |
17 |
18 | def on_message(message, data):
19 | print(message)
20 |
21 |
22 | process = frida.get_usb_device().attach(PROCESS_NAME)
23 | script = process.create_script(CODE)
24 | script.on('message', on_message)
25 | script.load()
26 | sys.stdin.read()
27 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/timeout_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: timeout_demo.py
6 | @time: 2022/1/6 19:58
7 | @project: python3-web-spider-learning
8 | @desc: 超时设置(P205)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | timeout = aiohttp.ClientTimeout(total=1)
17 | async with aiohttp.ClientSession(timeout=timeout) as session:
18 | async with session.get('https://www.httpbin.org/get') as response:
19 | print('status:', response.status)
20 |
21 |
22 | if __name__ == '__main__':
23 | asyncio.get_event_loop().run_until_complete(main())
24 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/headless_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: headless_mode.py
6 | @time: 2022/1/7 15:49
7 | @project: python3-web-spider-learning
8 | @desc: 无头模式(P225)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver import ChromeOptions
12 | import os
13 |
14 | option = ChromeOptions()
15 | option.add_argument('--headless')
16 | browser = webdriver.Chrome(options=option)
17 | browser.set_window_size(1366, 768)
18 | browser.get('https://www.baidu.com')
19 |
20 | if not os.path.exists('files'):
21 | os.makedirs('files')
22 |
23 | browser.get_screenshot_as_file('files/preview.png')
24 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/exception_handle.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: exception_handle.py
6 | @time: 2022/1/7 15:35
7 | @project: python3-web-spider-learning
8 | @desc: 异常处理(P223)
9 | """
10 | from selenium import webdriver
11 | from selenium.common.exceptions import TimeoutException, NoSuchElementException
12 |
13 | browser = webdriver.Chrome()
14 | try:
15 | browser.get('https://www.baidu.com')
16 | except TimeoutException:
17 | print('Time Out')
18 |
19 | try:
20 | browser.find_element_by_id('hello')
21 | except NoSuchElementException:
22 | print('No Element')
23 | finally:
24 | browser.close()
25 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/post_request.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: post_request.py
6 | @time: 2022/1/6 19:52
7 | @project: python3-web-spider-learning
8 | @desc: POST请求(P203)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | data = {
17 | 'name': 'germey',
18 | 'age': 25
19 | }
20 | async with aiohttp.ClientSession() as session:
21 | async with session.post('https://www.httpbin.org/post', data=data) as response:
22 | print(await response.text())
23 |
24 |
25 | if __name__ == '__main__':
26 | asyncio.get_event_loop().run_until_complete(main())
27 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/url_params.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: url_params.py
6 | @time: 2022/1/6 19:49
7 | @project: python3-web-spider-learning
8 | @desc: URL参数设置(P203)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | params = {
17 | 'name': 'germey',
18 | 'age': 25
19 | }
20 | async with aiohttp.ClientSession() as session:
21 | async with session.get('https://www.httpbin.org/get', params=params) as response:
22 | print(await response.text())
23 |
24 |
25 | if __name__ == '__main__':
26 | asyncio.get_event_loop().run_until_complete(main())
27 |
--------------------------------------------------------------------------------
/src/ch14/ai_extract.md:
--------------------------------------------------------------------------------
1 | # Intelligent Parsing: Implementation Notes
2 |
3 | ## 1 Detail-page parsing
4 | 1. Extract the title: collect the page's h nodes, compare each node's text with the text of the title node, and take the most similar candidate as the detail-page title (a minimal sketch follows this list)
5 | 2. Extract the time: apply a set of meta rules plus datetime-matching rules to obtain the publish time
6 | 3. Extract the body: preprocess the page (remove useless tags together with their content, remove empty tag pairs, remove noise tags), score each node by text density and symbol density, take the highest-scoring node as the one containing the body, and join the pieces to produce the body text
7 |
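A minimal sketch of the title step, assuming an lxml-parsed page and using difflib's SequenceMatcher as the similarity measure (the metric and the h1-h3 scope are illustrative choices, not necessarily the exact ones used in the book's implementation):

```python
from difflib import SequenceMatcher

from lxml.html import fromstring


def extract_title(html):
    """Pick the h node whose text is most similar to the <title> text."""
    tree = fromstring(html)
    title = (tree.findtext('.//title') or '').strip()
    best, best_score = title, 0.0
    for node in tree.xpath('//h1 | //h2 | //h3'):
        text = node.text_content().strip()
        if not text:
            continue
        score = SequenceMatcher(None, text, title).ratio()
        if score > best_score:
            best, best_score = text, score
    return best


html = ('<html><head><title>This is a Movie - Example Site</title></head>'
        '<body><h2>This is a Movie</h2><p>body...</p></body></html>')
print(extract_title(html))  # -> This is a Movie
```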
8 | ## 2 List-page parsing
9 | 1. Preprocess the data: clean the page content in the same way as the body-extraction preprocessing for detail pages
10 | 2. Select group nodes: use a parent-node selector plus constraints (minimum number of siblings, minimum and maximum text length of member nodes, minimum similarity between siblings) to collect qualifying group nodes (see the sketch after this list)
11 | 3. Merge group nodes: merge and categorize the group nodes with a simple clustering method
12 | 4. Pick the best group node: score each group by member-node count, average text-length distribution, and text density, and keep the highest-scoring group
13 | 5. Extract titles and links: compute a confidence value from title length to find the best node path, then extract the title and link from each member node
14 |
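A rough sketch of step 2 (selecting group nodes), where sibling similarity is approximated by requiring siblings to share the same structural signature; the signature function and the thresholds are illustrative assumptions, not the exact rules used in the book:

```python
from lxml.html import fromstring


def signature(node):
    # Crude structural signature: the node's tag plus the tags of its direct children
    return node.tag, tuple(child.tag for child in node if isinstance(child.tag, str))


def candidate_groups(html, min_siblings=4, min_text=5, max_text=200):
    tree = fromstring(html)
    groups = []
    for parent in tree.xpath('//body//*'):
        children = [c for c in parent if isinstance(c.tag, str)]
        if len(children) < min_siblings:
            continue
        # Siblings must look alike and carry a sensible amount of text
        if len({signature(c) for c in children}) != 1:
            continue
        texts = [c.text_content().strip() for c in children]
        if all(min_text <= len(t) <= max_text for t in texts):
            groups.append(parent)
    return groups


html = ('<html><body><ul>' +
        ''.join(f'<li><a href="/item/{i}">News item number {i}</a></li>' for i in range(6)) +
        '</ul></body></html>')
print([g.tag for g in candidate_groups(html)])  # -> ['ul']
```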
15 | ## 3 Telling list pages and detail pages apart
16 | Train an SVM model on page features (text density, number and ratio of hyperlink nodes, symbol density, number of list clusters, meta information, similarity between the body title and the title node text); after data processing and training, the resulting classifier is used to distinguish list pages from detail pages.
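A schematic sketch of that classifier, assuming the features above have already been computed into fixed-length numeric vectors; scikit-learn's SVC stands in for the SVM, and the feature values and labels below are illustrative placeholders rather than real training data:

```python
from sklearn.svm import SVC

# Each row: [text_density, link_node_ratio, symbol_density, list_cluster_count]
# (made-up numbers; a real feature extractor would compute these per page)
X = [
    [0.72, 0.05, 0.09, 1],   # detail page
    [0.68, 0.08, 0.11, 0],   # detail page
    [0.21, 0.55, 0.02, 9],   # list page
    [0.18, 0.61, 0.03, 12],  # list page
]
y = [0, 0, 1, 1]  # 0 = detail page, 1 = list page

model = SVC(kernel='rbf')
model.fit(X, y)
print(model.predict([[0.25, 0.50, 0.02, 8]]))  # expected: [1], i.e. a list page
```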
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/node_interaction.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: node_interaction.py
6 | @time: 2022/1/7 10:20
7 | @project: python3-web-spider-learning
8 | @desc: 节点交互(P216)
9 | """
10 | import time
11 |
12 | from selenium import webdriver
13 |
14 | browser = webdriver.Chrome()
15 | browser.get('https://www.taobao.com')
16 | # 得到搜索框
17 | input = browser.find_element_by_id('q')
18 | # 输入搜索词“iPhone”
19 | input.send_keys('iPhone')
20 | time.sleep(1)
21 | # 清空搜索框
22 | input.clear()
23 | # 输入搜索词“iPad”
24 | input.send_keys('iPad')
25 | # 得到搜索按钮
26 | button = browser.find_element_by_class_name('btn-search')
27 | # 点击搜索按钮
28 | button.click()
29 |
--------------------------------------------------------------------------------
/src/ch11/execjs_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: execjs_demo.py
6 | @time: 2022/1/13 21:35
7 | @project: python3-web-spider-learning
8 | @desc: 11.5 使用Python模拟执行javascript(P446)
9 | """
10 |
11 | import execjs
12 | import json
13 |
14 | item = {
15 | "name": "勒布朗-詹姆斯",
16 | "image": "james.png",
17 | "birthday": "1984-12-30",
18 | "height": "206cm",
19 | "weight": "113.4KG"
20 | }
21 |
22 | file = 'files/execjs_crypto.js'
23 | node = execjs.get()
24 | ctx = node.compile(open(file).read())
25 |
26 | js = f"getToken({json.dumps(item, ensure_ascii=False)})"
27 | print(js)
28 | result = ctx.eval(js)
29 | print(result)
30 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/multi_task_coroutine.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: multi_task_coroutine.py
6 | @time: 2022/1/6 18:58
7 | @project: python3-web-spider-learning
8 | @desc: 多任务协程(P196)
9 | """
10 | import asyncio
11 |
12 | import requests
13 |
14 |
15 | async def request():
16 | url = 'https://www.baidu.com'
17 | status = requests.get(url)
18 | return status
19 |
20 |
21 | tasks = [asyncio.ensure_future(request()) for _ in range(5)]
22 | print('Task:', tasks)
23 |
24 | loop = asyncio.get_event_loop()
25 | loop.run_until_complete(asyncio.wait(tasks))
26 |
27 | for task in tasks:
28 | print('Task Result:', task.result())
29 |
--------------------------------------------------------------------------------
/src/ch11/pywasm_scrape_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: pywasm_scrape_demo.py
6 | @time: 2022/1/14 15:00
7 | @project: python3-web-spider-learning
8 | @desc: 11.11 WebAssembly案例分析和爬取实战(P495)
9 | """
10 | import time
11 |
12 | import pywasm
13 | import requests
14 |
15 | BASE_URL = 'https://spa14.scrape.center'
16 | TOTAL_PAGE = 10
17 |
18 | runtime = pywasm.load('files/Wasm.wasm')
19 | for i in range(TOTAL_PAGE):
20 | offset = i * 10
21 | sign = runtime.exec('encrypt', [offset, int(time.time())])
22 | url = f'{BASE_URL}/api/movie/?limit=10&offset={offset}&sign={sign}'
23 | response = requests.get(url)
24 | print(response.json())
25 |
--------------------------------------------------------------------------------
/src/ch13/files/frida_appbasic2.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 |  * @file: frida_appbasic2.js
4 | * @time: 2022-01-17 17:39
5 | * @project: python3-web-spider-learning
6 | * @desc: frida Appbasic2 Hook script
7 | */
8 |
9 | Java.perform(function () {
10 | Interceptor.attach(Module.findExportByName('libnative.so', 'Java_com_appbasic2_MainActivity_getMessage'), {
11 | onEnter: function (args) {
12 | send('hook onEnter')
13 | send('args[1]=' + args[2])
14 | send('args[2]=' + args[3])
15 | },
16 | onLeave: function (val) {
17 | send('hook Leave')
18 | val.replace(Java.vm.getEnv().newStringUtf('5'))
19 | }
20 | })
21 | })
--------------------------------------------------------------------------------
/src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/spiders/httpbin.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import Request
3 |
4 | from ch15.scrapyspidermiddlewaredemo.scrapyspidermiddlewaredemo.items import DemoItem
5 |
6 |
7 | class HttpbinSpider(scrapy.Spider):
8 | name = 'httpbin'
9 | allowed_domains = ['www.httpbin.org']
10 | start_url = 'https://www.httpbin.org/get'
11 |
12 | def start_requests(self):
13 | for i in range(5):
14 | url = f'{self.start_url}?query={i}'
15 | yield Request(url, callback=self.parse)
16 |
17 | def parse(self, response):
18 | item = DemoItem(**response.json())
19 | print('Status:', response.status)
20 | yield item
21 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/action_chain.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: action_chain.py
6 | @time: 2022/1/7 10:25
7 | @project: python3-web-spider-learning
8 | @desc: 动作链(P217)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver import ActionChains
12 |
13 | browser = webdriver.Chrome()
14 | url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
15 | browser.get(url)
16 | browser.switch_to.frame('iframeResult')
17 | source = browser.find_element_by_css_selector('#draggable')
18 | target = browser.find_element_by_css_selector('#droppable')
19 | actions = ActionChains(browser)
20 | actions.drag_and_drop(source, target)
21 | actions.perform()
22 |
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/mobile_web.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: mobile_web.py
6 | @time: 2022/1/10 19:48
7 | @project: python3-web-spider-learning
8 | @desc: 支持移动端浏览器(P261)
9 | """
10 | from playwright.sync_api import sync_playwright
11 |
12 | with sync_playwright() as p:
13 | iphone_12_pro_max = p.devices['iPhone 12 Pro Max']
14 | browser = p.webkit.launch(headless=False)
15 | context = browser.new_context(**iphone_12_pro_max, locale='zh-CN')
16 | page = context.new_page()
17 | page.goto('https://www.whatismybrowser.com')
18 | page.wait_for_load_state(state='networkidle')
19 | page.screenshot(path='files/browser-iphone.png')
20 | browser.close()
21 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/bing_callback.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: bing_callback.py
6 | @time: 2022/1/6 18:53
7 | @project: python3-web-spider-learning
8 | @desc: 绑定回调(P196)
9 | """
10 | import asyncio
11 |
12 | import requests
13 |
14 |
15 | async def request():
16 | url = 'https://www.baidu.com'
17 | status = requests.get(url)
18 | return status
19 |
20 |
21 | def callback(task):
22 | print('Status:', task.result())
23 |
24 |
25 | coroutine = request()
26 | task = asyncio.ensure_future(coroutine)
27 | task.add_done_callback(callback)
28 | print('Task:', task)
29 |
30 | loop = asyncio.get_event_loop()
31 | loop.run_until_complete(task)
32 | print('Task:', task)
33 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: utils.py
6 | @time: 2022/1/12 10:23
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | import re
11 |
12 |
13 | def parse_redis_connection_string(connection_string):
14 | """
15 | parse a redis connection string, for example:
16 | redis://[password]@host:port
17 | rediss://[password]@host:port
18 | :param connection_string:
19 | :return:
20 | """
21 | result = re.match(r'rediss?://(.*?)@(.*?):(\d+)(?:/(\d+))?', connection_string)
22 | return (result.group(2), int(result.group(3)), result.group(1) or None, int(result.group(4) or 0)) \
23 | if result else ('localhost', 6379, None, 0)
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: server.py
6 | @time: 2022/1/19 18:38
7 | @project: python3-web-spider-learning
8 | @desc: 15.8 Extension的使用(P793)
9 | """
10 | from flask import Flask, request, jsonify
11 | from loguru import logger
12 |
13 | app = Flask(__name__)
14 |
15 |
16 | @app.route('/notify', methods=['POST'])
17 | def receive():
18 | post_data = request.get_json()
19 | event = post_data.get('event')
20 | data = post_data.get('data')
21 | logger.debug(f'received event {event}, data {data}')
22 | return jsonify(status='success')
23 |
24 |
25 | if __name__ == '__main__':
26 | app.run(debug=True, host='0.0.0.0', port=5000)
27 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/codes/code4.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: code4.js
4 | * @time: 11:29
5 | * @project: python3-web-spider-learning
6 | * @desc:
7 | */
8 |
9 | const _0x16c18d = function () {
10 | if (!![[]]) {
11 | console.log("hello world");
12 | } else {
13 | console.log("this");
14 | console.log("is");
15 | console.log("dead");
16 | console.log("code");
17 | }
18 | };
19 | const _0x1f7292 = function () {
20 | if ("xmv2nOdfy2N".charAt(4) !== String.fromCharCode(110)) {
21 | console.log("this");
22 | console.log("is");
23 | console.log("dead");
24 | console.log("code");
25 | } else {
26 | console.log("nice to meet you");
27 | }
28 | };
29 |
30 | _0x16c18d();
31 | _0x1f7292();
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/robotparser_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: robotparser_demo.py
6 | @time: 2021/12/31 13:39
7 | @project: python3-web-spider-learning
8 | @desc: Robots协议(P46)
9 | """
10 |
11 | from urllib.robotparser import RobotFileParser
12 |
13 |
14 | def print_can_fetch(rp, spider, url):
15 | print(rp.can_fetch(spider, url))
16 |
17 |
18 | if __name__ == '__main__':
19 | rp = RobotFileParser()
20 | rp.set_url('https://www.baidu.com/robots.txt')
21 | rp.read()
22 | print_can_fetch(rp, 'Baiduspider', 'https://www.baidu.com')
23 | print_can_fetch(rp, 'Baiduspider', 'https://www.baidu.com/homepage/')
24 | print_can_fetch(rp, 'Googlebot', 'https://www.baidu.com/homepage/')
25 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/anti_shield.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: anti_shield.py
6 | @time: 2022/1/7 15:40
7 | @project: python3-web-spider-learning
8 | @desc: 反屏蔽(P224)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver import ChromeOptions
12 |
13 | option = ChromeOptions()
14 | option.add_experimental_option('excludeSwitches', ['enable-automation'])
15 | option.add_experimental_option('useAutomationExtension', False)
16 | browser = webdriver.Chrome(options=option)
17 | browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
18 | 'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
19 | })
20 | browser.get('https://antispider1.scrape.center')
21 |
22 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/switch_frame.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: switch_frame.py
6 | @time: 2022/1/7 10:41
7 | @project: python3-web-spider-learning
8 | @desc: 切换Frame(P219)
9 | """
10 | from selenium import webdriver
11 | from selenium.common.exceptions import NoSuchElementException
12 |
13 | browser = webdriver.Chrome()
14 | url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
15 | browser.get(url)
16 | browser.switch_to.frame('iframeResult')
17 | try:
18 | logo = browser.find_element_by_class_name('logo')
19 | except NoSuchElementException:
20 | print('No Logo')
21 |
22 | browser.switch_to.parent_frame()
23 | logo = browser.find_element_by_class_name('logo')
24 | print(logo)
25 | print(logo.text)
26 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/6 19:21
7 | @project: python3-web-spider-learning
8 | @desc: aiohttp基本实例(P202)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def fetch(session, url):
16 | async with session.get(url) as response:
17 | return await response.text(), response.status
18 |
19 |
20 | async def main():
21 | async with aiohttp.ClientSession() as session:
22 | html, status = await fetch(session, 'https://cuiqingcai.com')
23 | print(f'html: {html[:100]}...')
24 | print(f'status: {status}')
25 |
26 |
27 | if __name__ == '__main__':
28 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
29 | asyncio.run(main())
30 |
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/incognito_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: incognito_mode.py
6 | @time: 2022/1/10 18:26
7 | @project: python3-web-spider-learning
8 | @desc: 无痕模式(P252)
9 | """
10 | import asyncio
11 |
12 | from pyppeteer import launch
13 |
14 | width, height = 1366, 768
15 |
16 |
17 | async def main():
18 | # 设置浏览器窗口大小
19 | browser = await launch(headless=False, args=['--disable-infobars', f'--window-size={width}, {height}'])
20 | context = await browser.createIncognitoBrowserContext()
21 | page = await context.newPage()
22 | # 设置页面大小
23 | await page.setViewport({'width': width, 'height': height})
24 | await page.goto('https://www.baidu.com/')
25 | await asyncio.sleep(100)
26 |
27 |
28 | asyncio.get_event_loop().run_until_complete(main())
29 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/node_selector.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: node_selector.py
6 | @time: 2022/1/7 10:04
7 | @project: python3-web-spider-learning
8 | @desc: 查找节点(P215-P216)
9 | """
10 | from selenium import webdriver
11 |
12 | browser = webdriver.Chrome()
13 | browser.get('https://www.taobao.com')
14 |
15 |
16 | def get_single_node():
17 | input_first = browser.find_element_by_id('q')
18 | input_second = browser.find_element_by_css_selector('#q')
19 | input_third = browser.find_element_by_xpath('//*[@id="q"]')
20 | print(input_first, input_second, input_third)
21 |
22 |
23 | def get_nodes():
24 | lis = browser.find_elements_by_css_selector('.service-bd li')
25 | print(lis)
26 |
27 |
28 | if __name__ == '__main__':
29 | get_nodes()
30 | browser.close()
31 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/proguard-rules.pro:
--------------------------------------------------------------------------------
1 | # Add project specific ProGuard rules here.
2 | # You can control the set of applied configuration files using the
3 | # proguardFiles setting in build.gradle.
4 | #
5 | # For more details, see
6 | # http://developer.android.com/guide/developing/tools/proguard.html
7 |
8 | # If your project uses WebView with JS, uncomment the following
9 | # and specify the fully qualified class name to the JavaScript interface
10 | # class:
11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview {
12 | # public *;
13 | #}
14 |
15 | # Uncomment this to preserve the line number information for
16 | # debugging stack traces.
17 | #-keepattributes SourceFile,LineNumberTable
18 |
19 | # If you keep the line number information, uncomment this to
20 | # hide the original source file name.
21 | #-renamesourcefileattribute SourceFile
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/java/com/germey/andservertest/AppController.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import com.goldze.mvvmhabit.utils.NativeUtils;
4 | import com.yanzhenjie.andserver.annotation.GetMapping;
5 | import com.yanzhenjie.andserver.annotation.QueryParam;
6 | import com.yanzhenjie.andserver.annotation.RestController;
7 |
8 | import org.json.JSONObject;
9 |
10 | import java.util.HashMap;
11 | import java.util.Map;
12 |
13 | @RestController
14 | public class AppController {
15 |
16 | @GetMapping("/encrypt")
17 | public JSONObject login(@QueryParam("string") String string,
18 | @QueryParam("offset") int offset) {
19 | Map<String, String> map = new HashMap<>();
20 | String sign = NativeUtils.encrypt(string, offset);
21 | map.put("sign", sign);
22 | return new JSONObject(map);
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/spiders/quotes.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 | from ch15.scrapytutorial.scrapytutorial.items import QuoteItem
4 |
5 |
6 | class QuotesSpider(scrapy.Spider):
7 | name = 'quotes'
8 | allowed_domains = ['quotes.toscrape.com']
9 | start_urls = ['http://quotes.toscrape.com/']
10 |
11 | def parse(self, response):
12 | quotes = response.css('.quote')
13 | for quote in quotes:
14 | item = QuoteItem()
15 | item['text'] = quote.css('.text::text').extract_first()
16 | item['author'] = quote.css('.author::text').extract_first()
17 | item['tags'] = quote.css('.tags .tag::text').extract()
18 | yield item
19 |
20 | next = response.css('.pager .next a::attr("href")').extract_first()
21 | url = response.urljoin(next)
22 | yield scrapy.Request(url=url, callback=self.parse)
23 |
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/prevent_detect.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: prevent_detect.py
6 | @time: 2022/1/10 18:12
7 | @project: python3-web-spider-learning
8 | @desc: 防止检测(P248-P250)
9 | """
10 | import asyncio
11 |
12 | from pyppeteer import launch
13 |
14 | width, height = 1366, 768
15 |
16 |
17 | async def main():
18 | # 设置浏览器窗口大小
19 | browser = await launch(headless=False, args=['--disable-infobars', f'--window-size={width}, {height}'])
20 | page = await browser.newPage()
21 | # 设置页面大小
22 | await page.setViewport({'width': width, 'height': height})
23 | await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {get: ()=> undefined})')
24 | await page.goto('https://antispider1.scrape.center/')
25 | await asyncio.sleep(100)
26 |
27 |
28 | asyncio.get_event_loop().run_until_complete(main())
29 |
--------------------------------------------------------------------------------
/src/ch11/wasmer_scrape_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: wasmer_scrape_demo.py
6 | @time: 2022/1/14 17:08
7 | @project: python3-web-spider-learning
8 | @desc: wasmer库实战
9 | """
10 | import time
11 |
12 | import requests
13 | from wasmer import engine, Store, Module, Instance
14 | from wasmer_compiler_cranelift import Compiler
15 |
16 | # 读取wasm文件
17 | store = Store(engine.JIT(Compiler))
18 | module = Module(store, open('files/Wasm.wasm', 'rb').read())
19 | instance = Instance(module)
20 |
21 | BASE_URL = 'https://spa14.scrape.center'
22 | TOTAL_PAGE = 10
23 |
24 | for i in range(TOTAL_PAGE):
25 | offset = i * 10
26 | sign = instance.exports.encrypt(offset, int(time.time()))
27 | url = f'{BASE_URL}/api/movie/?limit=10&offset={offset}&sign={sign}'
28 | response = requests.get(url)
29 | print(response.json())
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/response_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: response_demo.py
6 | @time: 2022/1/6 19:54
7 | @project: python3-web-spider-learning
8 | @desc: 响应(P205)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 |
15 | async def main():
16 | data = {
17 | 'name': 'germey',
18 | 'age': 25
19 | }
20 | async with aiohttp.ClientSession() as session:
21 | async with session.post('https://www.httpbin.org/post', data=data) as response:
22 | print('status:', response.status)
23 | print('headers:', response.headers)
24 | print('body:', await response.text())
25 | print('bytes:', await response.read())
26 | print('json:', await response.json())
27 |
28 |
29 | if __name__ == '__main__':
30 | asyncio.get_event_loop().run_until_complete(main())
31 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/node_info.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: node_info.py
6 | @time: 2022/1/7 10:36
7 | @project: python3-web-spider-learning
8 | @desc: 获取节点信息(P218)
9 | """
10 | from selenium import webdriver
11 |
12 | browser = webdriver.Chrome()
13 | url = 'https://spa2.scrape.center/'
14 | browser.get(url)
15 |
16 |
17 | def get_attr():
18 | logo = browser.find_element_by_class_name('logo-image')
19 | print(logo)
20 | print(logo.get_attribute('src'))
21 |
22 |
23 | def get_text():
24 | input = browser.find_element_by_class_name('logo-title')
25 | print(input.text)
26 |
27 |
28 | def get_other_info():
29 | input = browser.find_element_by_class_name('logo-title')
30 | print(input.id)
31 | print(input.location)
32 | print(input.tag_name)
33 | print(input.size)
34 |
35 |
36 | if __name__ == '__main__':
37 | get_other_info()
38 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values/themes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/values-night/themes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/androidTest/java/com/germey/andservertest/ExampleInstrumentedTest.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import android.content.Context;
4 |
5 | import androidx.test.platform.app.InstrumentationRegistry;
6 | import androidx.test.ext.junit.runners.AndroidJUnit4;
7 |
8 | import org.junit.Test;
9 | import org.junit.runner.RunWith;
10 |
11 | import static org.junit.Assert.*;
12 |
13 | /**
14 | * Instrumented test, which will execute on an Android device.
15 | *
16 | * @see Testing documentation
17 | */
18 | @RunWith(AndroidJUnit4.class)
19 | public class ExampleInstrumentedTest {
20 | @Test
21 | public void useAppContext() {
22 | // Context of the app under test.
23 | Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
24 | assertEquals("com.germey.andservertest", appContext.getPackageName());
25 | }
26 | }
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/scrape_producer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_producer.py
6 | @time: 2022/1/6 15:10
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ实战 生产者(P171)
9 | """
10 | import pickle
11 |
12 | import pika
13 | import requests
14 |
15 | MAX_PRIORITY = 100
16 | TOTAL = 100
17 | QUEUE_NAME = 'scrape_queue'
18 |
19 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
20 | channel = connection.channel()
21 | channel.queue_declare(queue=QUEUE_NAME, durable=True)
22 |
23 | for i in range(1, TOTAL + 1):
24 | url = f'http://ssr1.scrape.center/detail/{i}'
25 | request = requests.Request('GET', url)
26 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME,
27 | properties=pika.BasicProperties(delivery_mode=2),
28 | body=pickle.dumps(request))
29 | print(f'Put request of {url}')
30 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/7 9:41
7 | @project: python3-web-spider-learning
8 | @desc: Selenium基本用法(P213)
9 | """
10 |
11 | from selenium import webdriver
12 | from selenium.webdriver.common.by import By
13 | from selenium.webdriver.common.keys import Keys
14 | from selenium.webdriver.support.wait import WebDriverWait
15 | from selenium.webdriver.support import expected_conditions as EC
16 |
17 | browser = webdriver.Chrome()
18 | try:
19 | browser.get('https://www.baidu.com')
20 | input = browser.find_element_by_id('kw')
21 | input.send_keys('Python')
22 | input.send_keys(Keys.ENTER)
23 | wait = WebDriverWait(browser, 10)
24 | wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
25 | print(browser.current_url)
26 | print(browser.get_cookies())
27 | print(browser.page_source)
28 | finally:
29 | browser.close()
--------------------------------------------------------------------------------
/src/ch13/andserver_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: andserver_demo.py
6 | @time: 2022/1/17 23:07
7 | @project: python3-web-spider-learning
8 | @desc: 13.10 基于AndServer-RPC模拟执行so文件(Python爬取数据)(P691)
9 | """
10 | import requests
11 |
12 | BASE_URL = 'https://app9.scrape.center'
13 | INDEX_URL = BASE_URL + '/api/movie?limit={limit}&offset={offset}&token={token}'
14 | ANDSERVER_URL = 'http://localhost:8080/encrypt?string={string}&offset={offset}'
15 | MAX_PAGE = 10
16 | LIMIT = 10
17 |
18 |
19 | def get_token(string, offset):
20 | andserver_url = ANDSERVER_URL.format(string=string, offset=offset)
21 | return requests.get(andserver_url).json().get('sign')
22 |
23 |
24 | for i in range(MAX_PAGE):
25 | offset = i * LIMIT
26 | token = get_token("/api/movie", offset)
27 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
28 | response = requests.get(index_url)
29 | print("response:", response.json())
30 |
--------------------------------------------------------------------------------
/src/ch13/jeb_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: jeb_demo.py
6 | @time: 2022/1/17 10:29
7 | @project: python3-web-spider-learning
8 | @desc: 13.2 JEB的使用(P624)
9 | """
10 | import base64
11 | import hashlib
12 | import time
13 |
14 | import requests
15 |
16 | INDEX_URL = 'https://app5.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}'
17 | MAX_PAGE = 10
18 | LIMIT = 10
19 |
20 |
21 | def get_token(args):
22 | timestamp = str(int(time.time()))
23 | args.append(timestamp)
24 | sign = hashlib.sha1(','.join(args).encode('utf-8')).hexdigest()
25 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8')
26 |
27 |
28 | for i in range(MAX_PAGE):
29 | offset = i * LIMIT
30 | token = get_token(args=['/api/movie'])
31 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
32 | response = requests.get(index_url)
33 | print('response:', response.json())
34 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_demo/concurrency_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: concurrency_demo.py
6 | @time: 2022/1/6 20:01
7 | @project: python3-web-spider-learning
8 | @desc: 并发限制(P206)
9 | """
10 | import asyncio
11 |
12 | import aiohttp
13 |
14 | CONCURRENCY = 5
15 | URL = 'https://www.baidu.com'
16 |
17 | semaphore = asyncio.Semaphore(CONCURRENCY)
18 | session = None
19 |
20 |
21 | async def scrape_api():
22 | async with semaphore:
23 | print('scraping', URL)
24 | async with session.get(URL) as response:
25 | await asyncio.sleep(1)
26 | return await response.text()
27 |
28 |
29 | async def main():
30 | global session
31 | session = aiohttp.ClientSession()
32 | scrape_index_tasks = [asyncio.ensure_future(scrape_api()) for _ in range(10000)]
33 | await asyncio.gather(*scrape_index_tasks)
34 |
35 |
36 | if __name__ == '__main__':
37 | asyncio.get_event_loop().run_until_complete(main())
38 |
--------------------------------------------------------------------------------
/src/ch06/coroutine_demo/coroutine_await_aiohttp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: coroutine_await_aiohttp.py
6 | @time: 2022/1/6 19:05
7 | @project: python3-web-spider-learning
8 | @desc: 协程实现,await、aiohttp的使用(P197)
9 | """
10 | import asyncio
11 | import time
12 |
13 | import aiohttp
14 |
15 | start = time.time()
16 |
17 |
18 | async def get(url):
19 | session = aiohttp.ClientSession()
20 | response = await session.get(url)
21 | await response.text()
22 | await session.close()
23 | return response
24 |
25 |
26 | async def request():
27 | url = 'https://www.httpbin.org/delay/5'
28 | print('Waiting for', url)
29 | response = await get(url)
30 | print('Get response from', url, 'response', response)
31 |
32 |
33 | tasks = [asyncio.ensure_future(request()) for _ in range(10)]
34 | loop = asyncio.get_event_loop()
35 | loop.run_until_complete(asyncio.wait(tasks))
36 |
37 | end = time.time()
38 | print('Cost time:', end - start)
39 |
--------------------------------------------------------------------------------
/src/ch13/frida_rpc_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: frida_rpc_demo.py
6 | @time: 2022/1/17 20:11
7 | @project: python3-web-spider-learning
8 | @desc: 13.9 基于Frida-RPC 模拟执行so文件(P683)
9 | """
10 | import frida
11 | import requests
12 |
13 | BASE_URL = 'https://app9.scrape.center'
14 | INDEX_URL = BASE_URL + '/api/movie?limit={limit}&offset={offset}&token={token}'
15 | MAX_PAGE = 10
16 | LIMIT = 10
17 |
18 | session = frida.get_usb_device().attach('App9')
19 | source = open('files/frida_rpc_app9.js', encoding='utf-8').read()
20 | script = session.create_script(source)
21 | script.load()
22 |
23 |
24 | def get_token(string, offset):
25 | return script.exports.encrypt(string, offset)
26 |
27 |
28 | for i in range(MAX_PAGE):
29 | offset = i * LIMIT
30 | token = get_token('/api/movie', offset)
31 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
32 | response = requests.get(index_url)
33 | print('response', response.json())
34 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/nodejs_main.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: nodejs_main.js
4 | * @time: 22:01
5 | * @project: python3-web-spider-learning
6 | * @desc: 11.6 使用Node.js模拟执行JavaScript(P451)
7 | */
8 |
9 | const CryptoJS = require("./files/crypto.js")
10 |
11 | function getToken(player) {
12 | let key = CryptoJS.enc.Utf8.parse("fipFfVsZsTda94hJNKJfLoaqyqMZFFimwLt");
13 | const {name, birthday, height, weight} = player;
14 | let base64Name = CryptoJS.enc.Base64.stringify(CryptoJS.enc.Utf8.parse(name));
15 | let encrypted = CryptoJS.DES.encrypt(
16 | `${base64Name}${birthday}${height}${weight}`,
17 | key, {
18 | mode: CryptoJS.mode.ECB,
19 | padding: CryptoJS.pad.Pkcs7,
20 | }
21 | );
22 | return encrypted.toString();
23 | }
24 |
25 | const player = {
26 | "name": "凯文-杜兰特",
27 | "image": "durant.png",
28 | "birthday": "1988-09-29",
29 | "height": "208cm",
30 | "weight": "108.9KG"
31 | }
32 |
33 | console.log(getToken(player))
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/scrape_consume.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_consume.py
6 | @time: 2022/1/6 15:10
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ实战 消费者(P172)
9 | """
10 | import pickle
11 |
12 | import pika
13 | import requests
14 |
15 | MAX_PRIORITY = 100
16 | QUEUE_NAME = 'scrape_queue'
17 |
18 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
19 | channel = connection.channel()
20 | session = requests.Session()
21 |
22 |
23 | def scrape(request):
24 | try:
25 | response = session.send(request.prepare())
26 | print(f'success scraped {response.url}')
27 | except requests.RequestException:
28 | print(f'error occurred when scraping {request.url}')
29 |
30 |
31 | while True:
32 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
33 | if body:
34 | request = pickle.loads(body)
35 | print(f'Get {request}')
36 | scrape(request)
37 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/AndroidManifest.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
13 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run.py
6 | @time: 2022/1/20 11:29
7 | @project: python3-web-spider-learning
8 | @desc: 15.12 Scrapy规则化爬虫(实战,P818)
9 | """
10 | import argparse
11 |
12 | from scrapy.crawler import CrawlerProcess
13 | from scrapy.utils.project import get_project_settings
14 |
15 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo.utils import get_config
16 |
17 | parser = argparse.ArgumentParser(description='Universal Spider')
18 | parser.add_argument('name', help='name of spider to run')
19 | args = parser.parse_args()
20 | name = args.name
21 |
22 |
23 | def run():
24 | config = get_config(name)
25 | spider = config.get('spider', 'universal')
26 | project_settings = get_project_settings()
27 | settings = dict(project_settings.copy())
28 | settings.update(config.get('settings'))
29 | process = CrawlerProcess(settings)
30 | process.crawl(spider, **{'name': name})
31 | process.start()
32 |
33 |
34 | if __name__ == '__main__':
35 | run()
36 |
--------------------------------------------------------------------------------
/src/ch10/jwt_simulate_login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: jwt_simulate_login.py
6 | @time: 2022/1/12 9:49
7 | @project: python3-web-spider-learning
8 | @desc: 10.3 基于JWT的模拟登录爬取实战(P381)
9 | """
10 | from urllib.parse import urljoin
11 | import requests
12 |
13 | BASE_URL = 'https://login3.scrape.center/'
14 | LOGIN_URL = urljoin(BASE_URL, '/api/login')
15 | INDEX_URL = urljoin(BASE_URL, '/api/book')
16 | USERNAME = 'admin'
17 | PASSWORD = 'admin'
18 |
19 | response_login = requests.post(LOGIN_URL, json={
20 | 'username': USERNAME,
21 | 'password': PASSWORD
22 | })
23 | data = response_login.json()
24 | print('Response JSON:', data)
25 | # 获取token jwt
26 | jwt = data.get('token')
27 | print('JWT:', jwt)
28 |
29 | headers = {
30 | 'Authorization': f'jwt {jwt}'
31 | }
32 | response_index = requests.get(INDEX_URL, params={
33 | 'limit': 18,
34 | 'offset': 0
35 | }, headers=headers)
36 | print('Response Status', response_index.status_code)
37 | print('Response URL', response_index.url)
38 | print('Response Data', response_index.json())
39 |
--------------------------------------------------------------------------------
/src/ch07/selenium_demo/delay_wait.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: delay_wait.py
6 | @time: 2022/1/7 15:05
7 | @project: python3-web-spider-learning
8 | @desc: 延时等待(P220)
9 | """
10 | from selenium import webdriver
11 | from selenium.webdriver.common.by import By
12 | from selenium.webdriver.support.wait import WebDriverWait
13 | from selenium.webdriver.support import expected_conditions as EC
14 |
15 |
16 | def implicit_wait():
17 | browser = webdriver.Chrome()
18 | browser.implicitly_wait(10)
19 | browser.get('https://spa2.scrape.center/')
20 | input = browser.find_element_by_class_name('logo-image')
21 | print(input)
22 |
23 |
24 | def explicit_wait():
25 | browser = webdriver.Chrome()
26 | browser.get('https://www.taobao.com/')
27 | wait = WebDriverWait(browser, 10)
28 | input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
29 | button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
30 | print(input, button)
31 |
32 |
33 | if __name__ == '__main__':
34 | explicit_wait()
35 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: server.py
6 | @time: 2022/1/12 14:00
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | from flask import Flask, g
11 |
12 | from ch10.account_pool.setting import GENERATOR_MAP
13 | from ch10.account_pool.storages_redis import RedisClient
14 | from loguru import logger
15 |
16 | app = Flask(__name__)
17 |
18 | account = 'account'
19 | credential = 'credential'
20 |
21 |
22 | @app.route('/')
23 | def index():
24 | return '<h2>Welcome to Cookie Pool System</h2>'
25 |
26 |
27 | def get_conn():
28 | for website in GENERATOR_MAP:
29 | if not hasattr(g, website):
30 | setattr(g, f'{website}_{credential}', RedisClient(credential, website))
31 | setattr(g, f'{website}_{account}', RedisClient(account, website))
32 | return g
33 |
34 |
35 | @app.route('/<website>/random')
36 | def random(website):
37 | g = get_conn()
38 | result = getattr(g, f'{website}_{credential}').random()
39 | logger.debug(f'get credential {result}')
40 | return result
41 |
--------------------------------------------------------------------------------
/src/ch11/nodejs_demo/nodejs_server.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: nodejs_server.js
4 | * @time: 22:09
5 | * @project: python3-web-spider-learning
6 | * @desc: 搭建nodejs服务(P453)
7 | */
8 |
9 | const CryptoJS = require("./crypto.js")
10 | const express = require("express")
11 | const app = express();
12 | const port = 3000;
13 | app.use(express.json())
14 |
15 |
16 | function getToken(player) {
17 | let key = CryptoJS.enc.Utf8.parse("fipFfVsZsTda94hJNKJfLoaqyqMZFFimwLt");
18 | const {name, birthday, height, weight} = player;
19 | let base64Name = CryptoJS.enc.Base64.stringify(CryptoJS.enc.Utf8.parse(name));
20 | let encrypted = CryptoJS.DES.encrypt(
21 | `${base64Name}${birthday}${height}${weight}`,
22 | key, {
23 | mode: CryptoJS.mode.ECB,
24 | padding: CryptoJS.pad.Pkcs7,
25 | }
26 | );
27 | return encrypted.toString();
28 | }
29 |
30 | app.post("/", (req, res)=> {
31 | const data = req.body;
32 | res.send(getToken(data))
33 | });
34 |
35 | app.listen(port, ()=> {
36 | console.log(`Example app listening on port ${port}`);
37 | })
--------------------------------------------------------------------------------
/src/ch13/ida_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: ida_demo.py
6 | @time: 2022/1/17 19:19
7 | @project: python3-web-spider-learning
8 | @desc: 13.8 IDA Pro静态分析和动态调试so文件(汇编代码调试)(P679)
9 | """
10 | import requests
11 | import hashlib
12 | import time
13 | import base64
14 |
15 |
16 | def get_token(value, offset):
17 | array = []
18 | array.append(value)
19 | array.append('9fdLnciVh4FxQbri')
20 | array.append(str(offset))
21 | timestamp = str(int(time.time()))
22 | array.append(timestamp)
23 | sign = hashlib.sha1(','.join(array).encode('utf-8')).hexdigest()
24 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8')
25 |
26 |
27 | INDEX_URL = 'https://app8.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}'
28 | MAX_PAGE = 10
29 | LIMIT = 10
30 |
31 |
32 | for i in range(MAX_PAGE):
33 | offset = i * LIMIT
34 | token = get_token('/api/movie', offset)
35 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
36 | response = requests.get(index_url)
37 | print('response', response.json())
38 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradle.properties:
--------------------------------------------------------------------------------
1 | # Project-wide Gradle settings.
2 | # IDE (e.g. Android Studio) users:
3 | # Gradle settings configured through the IDE *will override*
4 | # any settings specified in this file.
5 | # For more details on how to configure your build environment visit
6 | # http://www.gradle.org/docs/current/userguide/build_environment.html
7 | # Specifies the JVM arguments used for the daemon process.
8 | # The setting is particularly useful for tweaking memory settings.
9 | org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
10 | # When configured, Gradle will run in incubating parallel mode.
11 | # This option should only be used with decoupled projects. More details, visit
12 | # http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
13 | # org.gradle.parallel=true
14 | # AndroidX package structure to make it clearer which packages are bundled with the
15 | # Android operating system, and which are packaged with your app"s APK
16 | # https://developer.android.com/topic/libraries/support-library/androidx-rn
17 | android.useAndroidX=true
18 | # Automatically convert third-party libraries to use AndroidX
19 | android.enableJetifier=true
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/spiders/httpbin.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import Request
3 |
4 |
5 | class HttpbinSpider(scrapy.Spider):
6 | name = 'httpbin'
7 | allowed_domains = ['www.httpbin.org']
8 | start_url = 'https://www.httpbin.org/get'
9 | headers = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
11 | }
12 | cookies = {'name': 'germey',
13 | 'age': '26'}
14 |
15 | def start_requests(self):
16 | for offset in range(5):
17 | url = self.start_url + f'?offset={offset}'
18 | yield Request(url, headers=self.headers,
19 | cookies=self.cookies,
20 | callback=self.parse_response,
21 | meta={'offset': offset})
22 |
23 | def parse_response(self, response):
24 | print('url:', response.url)
25 | print('request:', response.request)
26 | print('status:', response.status)
27 | print('headers:', response.headers)
28 | print('text:', response.text)
29 | print('meta:', response.meta)
30 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/run_account_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: run_account_pool.py
6 | @time: 2022/1/12 14:27
7 | @project: python3-web-spider-learning
8 | @desc: 运行账号池
9 | """
10 | from ch10.account_pool.setting import ENABLE_IMPORT_DATA
11 | from ch10.account_pool.storages_redis import RedisClient
12 | from scheduler import Scheduler
13 | import argparse
14 |
15 | parser = argparse.ArgumentParser(description='AccountPool')
16 | parser.add_argument('website', type=str, help='website')
17 | parser.add_argument('--processor', type=str, help='processor to run')
18 | args = parser.parse_args()
19 | website = args.website
20 |
21 | if __name__ == '__main__':
22 | if ENABLE_IMPORT_DATA:
23 | conn = RedisClient('account', website)
24 | start = 1
25 | end = 20
26 | for i in range(start, end + 1):
27 | username = password = f'admin{i}'
28 | conn.set(username, password)
29 | conn.close()
30 |
31 | # if processor set, just run it
32 | if args.processor:
33 | getattr(Scheduler(), f'run_{args.processor}')(website)
34 | else:
35 | Scheduler().run(website)
36 |
--------------------------------------------------------------------------------
/src/ch15/scrape_processor_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_processor_demo.py
6 | @time: 2022/1/20 10:52
7 | @project: python3-web-spider-learning
8 | @desc: 15.12 Scrapy规则化爬虫(P816)
9 | """
10 | from itemloaders.processors import TakeFirst, Join, Compose, MapCompose, SelectJmes
11 |
12 |
13 | def takefirst():
14 | # 返回列表的第一个非空值
15 | processor = TakeFirst()
16 | print(processor(['', 1, 2, 3]))
17 |
18 |
19 | def join():
20 | # 把列表拼接成字符串
21 | processor = Join()
22 | print(processor(['one', 'two', 'three']))
23 |
24 | processor = Join(',')
25 | print(processor(['one', 'two', 'three']))
26 |
27 |
28 | def compose():
29 | # 使用多个函数组合构造而成
30 | processor = Compose(str.upper, lambda s: s.strip())
31 | print(processor(' hello world'))
32 |
33 |
34 | def map_compose():
35 | # 和compose类似,迭代处理一个列表输入值
36 | processor = MapCompose(str.upper, lambda s: s.strip())
37 | print(processor(['Hello', 'World', 'Python']))
38 |
39 |
40 | def select_jmes():
41 | # 查询JSON,传入Key,返回查询所得的Value
42 | processor = SelectJmes('foo')
43 | print(processor({'foo': 'bar'}))
44 |
45 |
46 | if __name__ == '__main__':
47 | select_jmes()
48 |
--------------------------------------------------------------------------------
/src/ch12/appium_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: appium_demo.py
6 | @time: 2022/1/16 2:26
7 | @project: python3-web-spider-learning
8 | @desc: 12.4 Appium的使用(P557)
9 | """
10 | from appium import webdriver
11 | from selenium.webdriver.common.by import By
12 | from selenium.webdriver.support.wait import WebDriverWait
13 | from selenium.webdriver.support import expected_conditions as EC
14 |
15 | server = 'http://localhost:4723/wd/hub'
16 | desired_capabilities = {
17 | "platformName": "Android",
18 | "appium:deviceName": "VirtualBox",
19 | "appium:appPackage": "com.goldze.mvvmhabit",
20 | "appium:appActivity": "com.goldze.mvvmhabit.ui.MainActivity",
21 | "appium:noReset": True
22 | }
23 |
24 | # 启动示例App
25 | driver = webdriver.Remote(server, desired_capabilities)
26 | wait = WebDriverWait(driver, 30)
27 | # 等到所有电影条目都加载之后
28 | wait.until(EC.presence_of_element_located((By.XPATH, '//android.support.v7.widget.RecyclerView/android.widget.LinearLayout')))
29 | window_size = driver.get_window_size()
30 | width, height = window_size.get('width'), window_size.get('height')
31 | # 前两个表示初始位置,后两个表示滑动的结束位置,1000表示滑动时间为1秒
32 | driver.swipe(width * 0.5, height * 0.8, width * 0.5, height * 0.2, 1000)
--------------------------------------------------------------------------------
/src/ch11/execjs_web_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: execjs_web_demo.py
6 | @time: 2022/1/14 9:22
7 | @project: python3-web-spider-learning
8 | @desc: 11.7 浏览器环境下JavaScript的模拟执行(P457)
9 | """
10 | import requests
11 | from playwright.sync_api import sync_playwright
12 |
13 | BASE_URL = "https://spa2.scrape.center"
14 | INDEX_URL = BASE_URL + "/api/movie?limit={limit}&offset={offset}&token={token}"
15 | MAX_PAGE = 10
16 | LIMIT = 10
17 |
18 | # Start Playwright and launch a headless Chromium browser
19 | context = sync_playwright().start()
20 | browser = context.chromium.launch()
21 | # Create a new page
22 | page = browser.new_page()
23 | # Set up a route that replaces the JS loaded by the browser with a local copy
24 | page.route(
25 | "/js/chunk-10192a00.243cb8b7.js",
26 | lambda route: route.fulfill(path="files/chunk.js")
27 | )
28 | page.goto(BASE_URL)
29 |
30 |
31 | def get_token(offset):
32 |     # Run the page's encrypt function via evaluate
33 | result = page.evaluate('''()=> {
34 | return window.encrypt("%s", "%s")
35 | }''' % ('/api/movie', offset))
36 | return result
37 |
38 |
39 | for i in range(MAX_PAGE):
40 | offset = i * LIMIT
41 | token = get_token(offset)
42 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token)
43 | response = requests.get(index_url)
44 | print('response:', response.json())
45 |
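Note: page.route matches glob patterns against the full request URL (unless a base URL is configured on the context), so if the exact-path pattern above does not intercept the script in your Playwright version, a wildcard prefix is a safer choice. An alternative pattern (an assumption, not taken from the original script):

    page.route(
        "**/chunk-10192a00.243cb8b7.js",
        lambda route: route.fulfill(path="files/chunk.js")
    )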
--------------------------------------------------------------------------------
/src/ch04/text_oper_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: text_oper_demo.py
6 | @time: 2022/1/5 19:00
7 | @project: python3-web-spider-learning
8 | @desc: 4.1 TXT text storage (P128~P130)
9 | """
10 | import os
11 | import re
12 |
13 | import requests
14 | from pyquery import PyQuery as pq
15 |
16 | url = 'https://ssr1.scrape.center'
17 | html = requests.get(url).text
18 | doc = pq(html)
19 | items = doc('.el-card').items()
20 |
21 | if not os.path.exists('files'):
22 | os.makedirs('files')
23 |
24 | file = open('files/movies.txt', 'w', encoding='utf-8')
25 | for item in items:
26 |     # Movie title
27 | name = item.find('a > h2').text()
28 | file.write(f'名称:{name}\n')
29 |     # Categories
30 | categories = [item.text() for item in item.find('.categories button span').items()]
31 | file.write(f'类别:{categories}\n')
32 |     # Release date
33 | published_at = item.find('.info:contains(上映)').text()
34 | published_at = re.search('(\d{4}-\d{2}-\d{2})', published_at).group(1) \
35 | if published_at and re.search('(\d{4}-\d{2}-\d{2})', published_at) else None
36 | file.write(f'上映时间:{published_at}\n')
37 |     # Score
38 | score = item.find('p.score').text()
39 | file.write(f'评分:{score}\n')
40 | file.write(f'{"=" * 50}\n')
41 |
42 | file.close()
43 |
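Note: the script opens files/movies.txt explicitly and closes it only at the very end; an equivalent and slightly safer pattern (a sketch, not part of the original file) is a with-block, which closes the file even if an exception occurs while writing:

    with open('files/movies.txt', 'w', encoding='utf-8') as file:
        for item in items:
            ...  # same extraction and file.write() calls as above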
--------------------------------------------------------------------------------
/src/ch04/csv_oper_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: csv_oper_demo.py
6 | @time: 2022/1/5 19:20
7 | @project: python3-web-spider-learning
8 | @desc: 4.3 CSV file storage (P134~P138)
9 | """
10 | import csv
11 |
12 |
13 | def write_to_csv():
14 | with open('files/data.csv', 'w', newline='') as csv_file:
15 | writer = csv.writer(csv_file)
16 | writer.writerow(['id', 'name', 'age'])
17 | writer.writerow(['10001', 'Mike', 20])
18 | writer.writerow(['10002', 'Bob', 22])
19 | writer.writerow(['10003', 'Jordan', 21])
20 |
21 |
22 | def write_dict_to_csv():
23 | with open('files/data.csv', 'w', encoding='utf-8', newline='') as csv_file:
24 |         fieldnames = ['id', 'name', 'age']
25 |         writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
26 | writer.writeheader()
27 | writer.writerow({'id': '10001', 'name': 'Mike', 'age': 20})
28 | writer.writerow({'id': '10002', 'name': 'Bob', 'age': 22})
29 | writer.writerow({'id': '10003', 'name': 'Jordan', 'age': 21})
30 |
31 |
32 | def read_csv():
33 | with open('files/data.csv', 'r', encoding='utf-8') as csv_file:
34 | reader = csv.reader(csv_file)
35 | for row in reader:
36 | print(row)
37 |
38 |
39 | if __name__ == '__main__':
40 | write_dict_to_csv()
41 | read_csv()
42 |
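Note: rows can also be read back as dictionaries keyed by the header, mirroring write_dict_to_csv. A minimal sketch (not part of the original file):

    def read_csv_as_dicts():
        with open('files/data.csv', 'r', encoding='utf-8') as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                print(row['id'], row['name'], row['age'])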
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/extensions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: extensions.py
6 | @time: 2022/1/19 18:43
7 | @project: python3-web-spider-learning
8 | @desc:
9 | """
10 | import requests
11 | from scrapy import signals
12 |
13 | NOTIFICATION_URL = 'http://localhost:5000/notify'
14 |
15 |
16 | class NotificationExtension:
17 |     def spider_opened(self, spider):
18 | requests.post(NOTIFICATION_URL, json={
19 | 'event': 'SPIDER_OPENED',
20 | 'data': {'spider_name': spider.name}
21 | })
22 |
23 | def spider_closed(self, spider):
24 | requests.post(NOTIFICATION_URL, json={
25 | 'event': 'SPIDER_CLOSED',
26 | 'data': {'spider_name': spider.name}
27 | })
28 |
29 | def item_scraped(self, item, spider):
30 | requests.post(NOTIFICATION_URL, json={
31 | 'event': 'ITEM_SCRAPED',
32 | 'data': {'spider_name': spider.name, 'item': dict(item)}
33 | })
34 |
35 | @classmethod
36 | def from_crawler(cls, crawler):
37 | ext = cls()
38 |         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
39 | crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
40 | crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
41 | return ext
42 |
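Note: an extension only runs if it is registered in the project settings. This project presumably enables it in scrapytutorial/settings.py (not included in this section); the standard Scrapy form is:

    EXTENSIONS = {
        'scrapytutorial.extensions.NotificationExtension': 100,
    }

NOTIFICATION_URL points at a local HTTP endpoint, presumably the small server in scrapytutorial/server.py listed in the project tree.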
--------------------------------------------------------------------------------
/src/ch04/files/movies.txt:
--------------------------------------------------------------------------------
1 | 名称:霸王别姬 - Farewell My Concubine
2 | 类别:['剧情', '爱情']
3 | 上映时间:1993-07-26
4 | 评分:9.5
5 | ==================================================
6 | 名称:这个杀手不太冷 - Léon
7 | 类别:['剧情', '动作', '犯罪']
8 | 上映时间:1994-09-14
9 | 评分:9.5
10 | ==================================================
11 | 名称:肖申克的救赎 - The Shawshank Redemption
12 | 类别:['剧情', '犯罪']
13 | 上映时间:1994-09-10
14 | 评分:9.5
15 | ==================================================
16 | 名称:泰坦尼克号 - Titanic
17 | 类别:['剧情', '爱情', '灾难']
18 | 上映时间:1998-04-03
19 | 评分:9.5
20 | ==================================================
21 | 名称:罗马假日 - Roman Holiday
22 | 类别:['剧情', '喜剧', '爱情']
23 | 上映时间:1953-08-20
24 | 评分:9.5
25 | ==================================================
26 | 名称:唐伯虎点秋香 - Flirting Scholar
27 | 类别:['喜剧', '爱情', '古装']
28 | 上映时间:1993-07-01
29 | 评分:9.5
30 | ==================================================
31 | 名称:乱世佳人 - Gone with the Wind
32 | 类别:['剧情', '爱情', '历史', '战争']
33 | 上映时间:1939-12-15
34 | 评分:9.5
35 | ==================================================
36 | 名称:喜剧之王 - The King of Comedy
37 | 类别:['剧情', '喜剧', '爱情']
38 | 上映时间:1999-02-13
39 | 评分:9.5
40 | ==================================================
41 | 名称:楚门的世界 - The Truman Show
42 | 类别:['剧情', '科幻']
43 | 上映时间:None
44 | 评分:9.0
45 | ==================================================
46 | 名称:狮子王 - The Lion King
47 | 类别:['动画', '歌舞', '冒险']
48 | 上映时间:1995-07-15
49 | 评分:9.0
50 | ==================================================
51 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/build.gradle:
--------------------------------------------------------------------------------
1 | plugins {
2 | id 'com.android.application'
3 | id 'com.yanzhenjie.andserver'
4 | }
5 |
6 | android {
7 | compileSdk 31
8 |
9 | defaultConfig {
10 | applicationId "com.germey.andservertest"
11 | minSdk 16
12 | targetSdk 31
13 | versionCode 1
14 | versionName "1.0"
15 |
16 | testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
17 | }
18 |
19 | buildTypes {
20 | release {
21 | minifyEnabled false
22 | proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
23 | }
24 | }
25 | compileOptions {
26 | sourceCompatibility 1.8
27 | targetCompatibility 1.8
28 | }
29 |
30 | sourceSets {
31 | main {
32 | jniLibs.srcDirs = ["libs"]
33 | }
34 | }
35 | }
36 |
37 | dependencies {
38 | implementation 'androidx.appcompat:appcompat:1.4.1'
39 | implementation 'com.google.android.material:material:1.5.0'
40 | implementation 'androidx.constraintlayout:constraintlayout:2.0.4'
41 | testImplementation 'junit:junit:4.+'
42 | androidTestImplementation 'androidx.test.ext:junit:1.1.3'
43 | androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
44 | implementation 'com.yanzhenjie.andserver:api:2.1.9'
45 | annotationProcessor 'com.yanzhenjie.andserver:processor:2.1.9'
46 | }
--------------------------------------------------------------------------------
/src/ch10/account_pool/storages_redis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: storages_redis.py
6 | @time: 2022/1/12 10:18
7 | @project: python3-web-spider-learning
8 | @desc: Storage module: uses Redis as the account pool's storage backend. Data structures:
9 |        account:<website>    - hash mapping username to password
10 |        credential:<website> - hash mapping username to cookie string
11 | """
12 | import random
13 |
14 | from ch10.account_pool.setting import *
15 | import redis
16 |
17 |
18 | class RedisClient:
19 | def __init__(self, type, website, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
20 | self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)
21 |         # Key type ('account' or 'credential')
22 |         self.type = type
23 |         # Website name
24 |         self.website = website
25 |
26 | def name(self):
27 | return f'{self.type}:{self.website}'
28 |
29 | def set(self, username, value):
30 | return self.db.hset(self.name(), username, value)
31 |
32 | def get(self, username):
33 | return self.db.hget(self.name(), username)
34 |
35 | def delete(self, username):
36 | return self.db.hdel(self.name(), username)
37 |
38 | def count(self):
39 | return self.db.hlen(self.name())
40 |
41 | def random(self):
42 |         # Randomly pick one credential (cookie)
43 | return random.choice(self.db.hvals(self.name()))
44 |
45 | def usernames(self):
46 | return self.db.hkeys(self.name())
47 |
48 | def all(self):
49 | return self.db.hgetall(self.name())
50 |
51 | def close(self):
52 | self.db.close()
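Note: a minimal usage sketch of RedisClient (assuming a local Redis instance and the antispider6 website name used elsewhere in this chapter):

    conn = RedisClient(type='account', website='antispider6')
    conn.set('admin', 'admin')          # username -> password
    print(conn.count(), conn.random())  # number of accounts, one random value
    conn.close()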
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/layout/activity_main.xml:
--------------------------------------------------------------------------------
[activity_main.xml markup was stripped from this dump; the layout defines the "toggle_server" Button and "server_status" TextView that MainActivity.java binds to.]
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/spiders/movie.py:
--------------------------------------------------------------------------------
1 | from itemloaders import ItemLoader
2 | from itemloaders.processors import TakeFirst, Identity, Compose
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 |
6 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo.items import MovieItem
7 |
8 |
9 | class MovieSpider(CrawlSpider):
10 | name = 'movie'
11 | allowed_domains = ['ssr1.scrape.center']
12 | start_urls = ['https://ssr1.scrape.center/']
13 |
14 | rules = (
15 | Rule(LinkExtractor(restrict_css='.item .name'), follow=True, callback='parse_detail'),
16 | Rule(LinkExtractor(restrict_css='.next'), follow=True),
17 | )
18 |
19 | def parse_detail(self, response):
20 | loader = MovieItemLoader(item=MovieItem(), response=response)
21 | loader.add_css('name', '.item h2::text')
22 | loader.add_css('categories', '.categories button span::text')
23 | loader.add_css('cover', '.cover::attr(src)')
24 | loader.add_css('published_at', '.info span::text', re='(\d{4}-\d{2}-\d{2})\s?上映')
25 | loader.add_xpath('score', '//p[contains(@class, "score")]/text()')
26 | loader.add_xpath('drama', '//div[contains(@class, "drama")]/p/text()')
27 | yield loader.load_item()
28 |
29 |
30 | class MovieItemLoader(ItemLoader):
31 | default_output_processor = TakeFirst()
32 | categories_out = Identity()
33 | score_out = Compose(TakeFirst(), str.strip)
34 | drama_out = Compose(TakeFirst(), str.strip)
35 |
--------------------------------------------------------------------------------
/src/ch02/httpx_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: httpx_demo.py
6 | @time: 2022/1/4 11:32
7 | @project: python3-web-spider-learning
8 | @desc: 2.4 Using httpx (P75~P78)
9 | """
10 | import asyncio
11 |
12 | import httpx
13 |
14 |
15 | def httpx_demo():
16 | response = httpx.get('https://www.httpbin.org/get')
17 | print(response.status_code)
18 | print(response.headers)
19 | print(response.text)
20 |
21 |
22 | def httpx_with_user_agent():
23 | headers = {
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)'
25 |                       ' Chrome/52.0.2743.116 Safari/537.36'
26 | }
27 | response = httpx.get('https://www.httpbin.org/get', headers=headers)
28 | print(response.text)
29 |
30 |
31 | def http2_demo():
32 | client = httpx.Client(http2=True)
33 | response = client.get('https://spa16.scrape.center/')
34 | print(response.text)
35 |
36 |
37 | def client_demo():
38 | url = 'https://www.httpbin.org/headers'
39 | headers = {'User-Agent': 'my-app/0.0.1'}
40 | with httpx.Client(headers=headers) as client:
41 | r = client.get(url)
42 | print(r.json()['headers']['User-Agent'])
43 |
44 |
45 | async def fetch(url):
46 |     # Asynchronous request
47 | async with httpx.AsyncClient(http2=True) as client:
48 | response = await client.get(url)
49 | print(response.text)
50 |
51 |
52 | if __name__ == '__main__':
53 | asyncio.get_event_loop().run_until_complete(fetch('https://www.httpbin.org/get'))
54 |
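Note: http2=True requires the optional HTTP/2 dependencies, installed with pip install "httpx[http2]"; without them httpx raises an error when the client is created. The entry point could equivalently use asyncio.run:

    if __name__ == '__main__':
        asyncio.run(fetch('https://www.httpbin.org/get'))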
--------------------------------------------------------------------------------
/src/ch07/css_locate_scrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: css_locate_scrape.py
6 | @time: 2022/1/10 21:45
7 | @project: python3-web-spider-learning
8 | @desc: 7.7 CSS position-offset anti-scraping and a scraping example (P282)
9 | """
10 | import re
11 |
12 | from selenium import webdriver
13 | from selenium.webdriver.common.by import By
14 | from selenium.webdriver.support.wait import WebDriverWait
15 | from selenium.webdriver.support import expected_conditions as EC
16 | from pyquery import PyQuery as pq
17 |
18 |
19 | def parse_name(name_html):
20 |     # Handle the special case where the whole name is rendered directly
21 | has_whole = name_html('.whole')
22 | if has_whole:
23 | return name_html.text()
24 | else:
25 | chars = name_html('.char')
26 | items = []
27 | for char in chars.items():
28 |             # Extract the character and its left offset
29 | items.append({
30 | 'text': char.text().strip(),
31 | 'left': int(re.search('(\d+)px', char.attr('style')).group(1))
32 | })
33 |         # Sort the characters by their left offset
34 |         items = sorted(items, key=lambda x: x['left'], reverse=False)
35 |         # Join the characters back together
36 | return ''.join([item.get('text') for item in items])
37 |
38 |
39 | browser = webdriver.Chrome()
40 | browser.get('https://antispider3.scrape.center/')
41 | WebDriverWait(browser, 10) \
42 | .until(EC.presence_of_element_located((By.CSS_SELECTOR, '.item')))
43 | html = browser.page_source
44 | doc = pq(html)
45 | names = doc('.item .name')
46 | for name_html in names.items():
47 | name = parse_name(name_html)
48 | print(name)
49 | browser.close()
50 |
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/10 19:28
7 | @project: python3-web-spider-learning
8 | @desc: Basic usage of Playwright (P257)
9 | """
10 | import asyncio
11 | import os
12 |
13 | from playwright.async_api import async_playwright
14 | from playwright.sync_api import sync_playwright
15 |
16 |
17 | def sync_demo():
18 | with sync_playwright() as p:
19 | for browser_type in [p.chromium, p.firefox, p.webkit]:
20 | browser = browser_type.launch(headless=False)
21 | page = browser.new_page()
22 | page.goto('https://www.baidu.com')
23 |
24 | if not os.path.exists('files'):
25 | os.makedirs('files')
26 |
27 | page.screenshot(path=f'files/screenshot-{browser_type.name}.png')
28 | print(page.title())
29 | browser.close()
30 |
31 |
32 | async def async_demo():
33 | async with async_playwright() as p:
34 | for browser_type in [p.chromium, p.firefox, p.webkit]:
35 | browser = await browser_type.launch(headless=False)
36 | page = await browser.new_page()
37 | await page.goto('https://www.baidu.com')
38 |
39 | if not os.path.exists('files'):
40 | os.makedirs('files')
41 |
42 | await page.screenshot(path=f'files/screenshot-{browser_type.name}.png')
43 | print(await page.title())
44 | await browser.close()
45 |
46 |
47 | if __name__ == '__main__':
48 |     # Synchronous mode
49 |     # sync_demo()
50 |
51 |     # Asynchronous mode
52 | asyncio.run(async_demo())
--------------------------------------------------------------------------------
/src/ch07/pyppeteer_demo/simple_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: simple_demo.py
6 | @time: 2022/1/7 17:23
7 | @project: python3-web-spider-learning
8 | @desc: Basic usage of Pyppeteer (P243)
9 | """
10 | import asyncio
11 | import os
12 |
13 | from pyppeteer import launch
14 | from pyquery import PyQuery as pq
15 |
16 |
17 | async def simple_demo():
18 | browser = await launch()
19 | page = await browser.newPage()
20 | await page.goto('https://spa2.scrape.center/')
21 | await page.waitForSelector('.item .name')
22 | doc = pq(await page.content())
23 | names = [item.text() for item in doc('.item .name').items()]
24 | print('Name:', names)
25 | await browser.close()
26 |
27 |
28 | async def simple_demo2():
29 | width, height = 1366, 768
30 | browser = await launch()
31 | page = await browser.newPage()
32 | await page.setViewport({'width': width, 'height': height})
33 | await page.goto('https://spa2.scrape.center/')
34 | await page.waitForSelector('.item .name')
35 | await asyncio.sleep(2)
36 |
37 | if not os.path.exists('files'):
38 | os.makedirs('files')
39 |
40 | await page.screenshot(path='files/example2.png')
41 | dimensions = await page.evaluate('''() =>{
42 | return {
43 | width: document.documentElement.clientWidth,
44 | height: document.documentElement.clientHeight,
45 | deviceScaleFactor: window.devicePixelRatio,
46 | }
47 | }''')
48 |
49 | print(dimensions)
50 | await browser.close()
51 |
52 |
53 | if __name__ == '__main__':
54 | asyncio.get_event_loop().run_until_complete(simple_demo2())
55 |
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | import pymongo
9 | from scrapy.exceptions import DropItem
10 |
11 |
12 | class TextPipeline:
13 | def __init__(self):
14 |         # Maximum allowed length of the text field
15 | self.limit = 50
16 |
17 | def process_item(self, item, spider):
18 | if item['text']:
19 | if len(item['text']) > self.limit:
20 | item['text'] = item['text'][0:self.limit].rstrip() + '...'
21 | return item
22 | else:
23 |             raise DropItem('Missing Text')
24 |
25 |
26 | class MongoDBPipeline:
27 | def __init__(self, connection_string, database):
28 | self.connection_string = connection_string
29 | self.database = database
30 |
31 | @classmethod
32 | def from_crawler(cls, crawler):
33 | return cls(
34 | connection_string=crawler.settings.get('MONGODB_CONNECTION_STRING'),
35 | database=crawler.settings.get('MONGODE_DATABASE')
36 | )
37 |
38 | def open_spider(self, spider):
39 |         # Called when the spider is opened
40 | self.client = pymongo.MongoClient(self.connection_string)
41 | self.db = self.client[self.database]
42 |
43 | def process_item(self, item, spider):
44 |         # Insert the item into MongoDB
45 | name = item.__class__.__name__
46 | self.db[name].insert_one(dict(item))
47 | return item
48 |
49 | def close_spider(self, spider):
50 |         # Called when the spider is closed
51 | self.client.close()
52 |
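Note: these pipelines take effect only when registered in ITEM_PIPELINES in the project settings (settings.py is not shown in this section); lower numbers run earlier, so the text-trimming pipeline should come before the MongoDB one, e.g.:

    ITEM_PIPELINES = {
        'scrapytutorial.pipelines.TextPipeline': 300,
        'scrapytutorial.pipelines.MongoDBPipeline': 400,
    }

Also note that the database name is read here under the setting key 'MONGODE_DATABASE', so whatever key settings.py defines must match it exactly (or both should be renamed to MONGODB_DATABASE).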
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/spiders/book.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import scrapy
4 | from gerapy_selenium import SeleniumRequest
5 |
6 | from ch15.scrapyseleniumdemo.scrapyseleniumdemo.items import BookItem
7 |
8 |
9 | class BookSpider(scrapy.Spider):
10 | name = 'book'
11 | allowed_domains = ['spa5.scrape.center']
12 | base_url = 'https://spa5.scrape.center'
13 |
14 | def start_requests(self):
15 | start_url = f'{self.base_url}/page/1'
16 | yield SeleniumRequest(start_url, callback=self.parse_index)
17 |
18 | def parse_index(self, response):
19 |         items = response.css('.item')
20 | for item in items:
21 | href = item.css('.top a::attr(href)').extract_first()
22 | detail_url = response.urljoin(href)
23 | yield SeleniumRequest(detail_url, callback=self.parse_detail, priority=2)
24 |
25 | match = re.search(r'page/(\d+)', response.url)
26 | if not match:
27 | return
28 | page = int(match.group(1)) + 1
29 | next_url = f'{self.base_url}/page/{page}'
30 | yield SeleniumRequest(next_url, callback=self.parse_index)
31 |
32 | def parse_detail(self, response):
33 | name = response.css('.name::text').extract_first()
34 | tags = response.css('.tags button span::text').extract()
35 | score = response.css('.score::text').extract_first()
36 | price = response.css('.price span::text').extract_first()
37 | cover = response.css('.cover::attr(src)').extract_first()
38 | tags = [tag.strip() for tag in tags] if tags else []
39 | score = score.strip() if score else None
40 | item = BookItem(name=name, tags=tags, score=score, price=price, cover=cover)
41 | yield item
42 |
--------------------------------------------------------------------------------
/src/ch02/requests_demo/advanced_use.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: advanced_use.py
6 | @time: 2021/12/31 15:13
7 | @project: python3-web-spider-learning
8 | @desc: Advanced usage of requests (P55~P63)
9 | """
10 | import requests
11 | import urllib3
12 | from requests import Session, Request
13 |
14 | urllib3.disable_warnings()
15 |
16 |
17 | def upload_file():
18 | files = {
19 | 'file': open('../files/favicon.ico', 'rb')
20 | }
21 | r = requests.post('https://www.httpbin.org/post', files=files)
22 | print(r.text)
23 |
24 |
25 | def print_cookie():
26 | r = requests.get('https://www.baidu.com')
27 | print(r.cookies)
28 | for key, value in r.cookies.items():
29 | print(key + '=' + value)
30 |
31 |
32 | def print_https_with_verify():
33 | r = requests.get('https://ssr2.scrape.center/', verify=False)
34 | print(r.status_code)
35 |
36 |
37 | def print_with_timeout():
38 | r = requests.get('https://www.httpbin.org/get', timeout=1)
39 | print(r.status_code)
40 |
41 |
42 | def print_with_auth():
43 | r = requests.get('https://ssr3.scrape.center/', auth=('admin', 'admin'))
44 | print(r.status_code)
45 |
46 |
47 | def print_prepared_request():
48 | url = 'https://www.httpbin.org/post'
49 | data = {'name': 'germey'}
50 | headers = {
51 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)'
52 |                       ' Chrome/52.0.2743.116 Safari/537.36'
53 | }
54 | s = Session()
55 | req = Request('POST', url, data=data, headers=headers)
56 | prepped = s.prepare_request(req)
57 | r = s.send(prepped)
58 | print(r.text)
59 |
60 |
61 | if __name__ == '__main__':
62 | print_prepared_request()
63 |
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/consumer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: consumer.py
6 | @time: 2022/1/6 14:32
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ consumer examples (P167)
9 | """
10 | import pika
11 |
12 | MAX_PRIORITY = 100
13 | QUEUE_NAME = 'scrape'
14 | connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
15 | channel = connection.channel()
16 |
17 |
18 | def callback(ch, method, properties, body):
19 | print(f'Get {body}')
20 |
21 |
22 | def simple_consume():
23 | channel.queue_declare(queue=QUEUE_NAME)
24 | channel.basic_consume(queue=QUEUE_NAME, auto_ack=True, on_message_callback=callback)
25 | channel.start_consuming()
26 |
27 |
28 | def on_demand_consume():
29 | channel.queue_declare(queue=QUEUE_NAME)
30 | while True:
31 | input()
32 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
33 | if body:
34 | print(f'Get {body}')
35 |
36 |
37 | def priority_consume():
38 | channel.queue_declare(queue=QUEUE_NAME, arguments={
39 | 'x-max-priority': MAX_PRIORITY
40 | })
41 |
42 | while True:
43 | input()
44 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
45 | if body:
46 | print(f'Get {body}')
47 |
48 |
49 | def persistence_consume():
50 | channel.queue_declare(queue=QUEUE_NAME, arguments={
51 | 'x-max-priority': MAX_PRIORITY
52 | }, durable=True)
53 |
54 | while True:
55 | input()
56 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True)
57 | if body:
58 | print(f'Get {body}')
59 |
60 |
61 | if __name__ == '__main__':
62 | on_demand_consume()
63 |
--------------------------------------------------------------------------------
/src/ch07/font_scrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: font_scrape.py
6 | @time: 2022/1/10 21:55
7 | @project: python3-web-spider-learning
8 | @desc: 7.8 Font-based anti-scraping and a scraping example (P287)
9 |     Tricky part: the score characters are controlled by CSS icon styles
10 | """
11 | import re
12 |
13 | import requests
14 | from selenium import webdriver
15 | from selenium.webdriver.common.by import By
16 | from selenium.webdriver.support import expected_conditions as EC
17 | from selenium.webdriver.support.wait import WebDriverWait
18 | from pyquery import PyQuery as pq
19 |
20 | url = 'https://antispider4.scrape.center/css/app.654ba59e.css'
21 |
22 | response = requests.get(url)
23 | pattern = re.compile('.icon-(.*?):before\{content:"(.*?)"\}')
24 | results = re.findall(pattern, response.text)
25 | icon_map = {item[0]: item[1] for item in results}
26 |
27 |
28 | def parse_score(item):
29 | elements = item('.icon')
30 | icon_values = []
31 | for element in elements.items():
32 | class_name = (element.attr('class'))
33 |         # Extract the icon code from the CSS class
34 | icon_key = re.search('icon-(\d+)', class_name).group(1)
35 |         # Look up the real character
36 | icon_value = icon_map.get(icon_key)
37 | icon_values.append(icon_value)
38 |     # Join the characters to form the score
39 | return ''.join(icon_values)
40 |
41 |
42 | browser = webdriver.Chrome()
43 | browser.get('https://antispider4.scrape.center/')
44 | WebDriverWait(browser, 10) \
45 | .until(EC.presence_of_element_located((By.CSS_SELECTOR, '.item')))
46 | html = browser.page_source
47 | doc = pq(html)
48 | items = doc('.item')
49 | for item in items.items():
50 |     name = item('.name').text()
51 | categories = [o.text() for o in item('.categories button').items()]
52 | score = parse_score(item)
53 | print(f'name: {name} categories: {categories} score: {score}')
54 | browser.close()
55 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/spiders/book.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import re
3 |
4 | import scrapy
5 | from scrapy import Request
6 |
7 | from ch15.scrapypyppeteerdemo.scrapypyppeteerdemo.items import BookItem
8 |
9 | if hasattr(asyncio, 'WindowsSelectorEventLoopPolicy'):  # only needed (and only defined) on Windows
10 |     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
11 |
12 | class BookSpider(scrapy.Spider):
13 | name = 'book'
14 | allowed_domains = ['spa5.scrape.center']
15 | base_url = 'https://spa5.scrape.center'
16 |
17 | def start_requests(self):
18 | start_url = f'{self.base_url}/page/1'
19 | yield Request(start_url, callback=self.parse_index)
20 |
21 | def parse_index(self, response):
22 | items = response.css('.item')
23 | for item in items:
24 | href = item.css('.top a::attr(href)').extract_first()
25 | detail_url = response.urljoin(href)
26 | yield Request(detail_url, callback=self.parse_detail, priority=2)
27 |
28 | match = re.search(r'page/(\d+)', response.url)
29 | if not match:
30 | return
31 | page = int(match.group(1)) + 1
32 | next_url = f'{self.base_url}/page/{page}'
33 | yield Request(next_url, callback=self.parse_index)
34 |
35 | def parse_detail(self, response):
36 | name = response.css('.name::text').extract_first()
37 | tags = response.css('.tags button span::text').extract()
38 | score = response.css('.score::text').extract_first()
39 | price = response.css('.price span::text').extract_first()
40 | cover = response.css('.cover::attr(src)').extract_first()
41 | tags = [tag.strip() for tag in tags] if tags else []
42 | score = score.strip() if score else None
43 | item = BookItem(name=name, tags=tags, score=score, price=price, cover=cover)
44 | yield item
45 |
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/configs/movie.json:
--------------------------------------------------------------------------------
1 | {
2 | "spider": "universal",
3 | "type": "电影",
4 | "home": "https://ssr1.scrape.center/",
5 | "settings": {
6 | "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
7 | },
8 | "start_urls": [
9 | "https://ssr1.scrape.center/"
10 | ],
11 | "allowed_domains": [
12 | "ssr1.scrape.center"
13 | ],
14 | "rules": [
15 | {
16 | "link_extractor": {
17 | "restrict_css": ".item .name"
18 | },
19 | "follow": true,
20 | "callback": "parse_detail"
21 | },
22 | {
23 | "link_extractor": {
24 | "restrict_css": ".next"
25 | },
26 | "follow": true
27 | }
28 | ],
29 | "item": {
30 | "class": "MovieItem",
31 | "loader": "MovieItemLoader",
32 | "attrs": {
33 | "name": [
34 | {
35 | "method": "css",
36 | "arg": ".item h2::text"
37 | }
38 | ],
39 | "categories": [
40 | {
41 | "method": "css",
42 | "arg": ".categories button span::text"
43 | }
44 | ],
45 | "cover": [
46 | {
47 | "method": "css",
48 | "arg": ".cover::attr(src)"
49 | }
50 | ],
51 | "published_at": [
52 | {
53 | "method": "css",
54 | "arg": ".info span::text",
55 | "re": "(\\d{4}-\\d{2}-\\d{2})\\s?上映"
56 | }
57 | ],
58 | "score": [
59 | {
60 | "method": "xpath",
61 | "arg": "//p[contains(@class, \"score\")]/text()"
62 | }
63 | ],
64 | "drama": [
65 | {
66 | "method": "xpath",
67 | "arg": "//div[contains(@class, \"drama\")]/p/text()"
68 | }
69 | ]
70 | }
71 | }
72 | }
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/drawable-v24/ic_launcher_foreground.xml:
--------------------------------------------------------------------------------
[ic_launcher_foreground.xml markup was stripped from this dump; it contains the project's vector-drawable launcher foreground asset.]
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/spiders/universal.py:
--------------------------------------------------------------------------------
1 | from scrapy.linkextractors import LinkExtractor
2 | from scrapy.spiders import CrawlSpider, Rule
3 |
4 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo import items, loaders
5 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo.utils import get_config
6 |
7 |
8 | class UniversalSpider(CrawlSpider):
9 | name = 'universal'
10 |
11 | def __init__(self, name, *args, **kwargs):
12 | config = get_config(name)
13 | self.config = config
14 | self.start_urls = config.get('start_urls')
15 | self.allowed_domains = config.get('allowed_domains')
16 | rules = []
17 | for rule_kwargs in config.get('rules'):
18 | link_extractor = LinkExtractor(**rule_kwargs.get('link_extractor'))
19 | rule_kwargs['link_extractor'] = link_extractor
20 | rule = Rule(**rule_kwargs)
21 | rules.append(rule)
22 | self.rules = rules
23 | super(UniversalSpider, self).__init__(*args, **kwargs)
24 |
25 | def parse_detail(self, response):
26 | item = self.config.get('item')
27 | if item:
28 | cls = getattr(items, item.get('class'))()
29 | loader = getattr(loaders, item.get('loader'))(cls, response=response)
30 | for key, value in item.get('attrs').items():
31 | for extractor in value:
32 | if extractor.get('method') == 'xpath':
33 | loader.add_xpath(key, extractor.get('arg'), **{'re': extractor.get('re')})
34 | if extractor.get('method') == 'css':
35 | loader.add_css(key, extractor.get('arg'), **{'re': extractor.get('re')})
36 | if extractor.get('method') == 'value':
37 |                         loader.add_value(key, extractor.get('arg'), **{'re': extractor.get('re')})
38 | yield loader.load_item()
39 |
40 |
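Note on usage: UniversalSpider is driven entirely by the JSON config loaded through get_config (configs/movie.json above). A simplified launch sketch, assuming the spider argument name is forwarded to __init__ and ignoring the per-config "settings" block (run.py is not shown in this section):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('universal', name='movie')
    process.start()

Equivalently, scrapy crawl universal -a name=movie from the project directory.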
--------------------------------------------------------------------------------
/src/ch04/rabbitmq_oper_demo/producer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: producer.py
6 | @time: 2022/1/6 14:32
7 | @project: python3-web-spider-learning
8 | @desc: RabbitMQ producer examples (P169)
9 | """
10 | import pika
11 |
12 | MAX_PRIORITY = 100
13 | QUEUE_NAME = 'scrape'
14 | connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
15 | channel = connection.channel()
16 |
17 |
18 | def simple_producer():
19 | channel.queue_declare(queue=QUEUE_NAME)
20 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME, body='Hello World!')
21 |
22 |
23 | def on_demand_producer():
24 | channel.queue_declare(queue=QUEUE_NAME)
25 | while True:
26 | data = input()
27 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME, body=data)
28 | print(f'Put {data}')
29 |
30 |
31 | def priority_producer():
32 | channel.queue_declare(queue=QUEUE_NAME, arguments={
33 | 'x-max-priority': MAX_PRIORITY
34 | })
35 |
36 | while True:
37 | data, priority = input().split()
38 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME,
39 | properties=pika.BasicProperties(priority=int(priority)),
40 | body=data)
41 | print(f'Put {data}')
42 |
43 |
44 | def persistence_producer():
45 | channel.queue_declare(queue=QUEUE_NAME, arguments={
46 | 'x-max-priority': MAX_PRIORITY
47 | }, durable=True)
48 |
49 | while True:
50 | data, priority = input().split()
51 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME,
52 | properties=pika.BasicProperties(
53 | priority=int(priority),
54 | delivery_mode=2
55 | ),
56 | body=data)
57 | print(f'Put {data}')
58 |
59 |
60 | if __name__ == '__main__':
61 | priority_producer()
62 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/tester.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: tester.py
6 | @time: 2022/1/12 10:46
7 | @project: python3-web-spider-learning
8 | @desc: Tester module: checks for invalid cookies and removes them from Redis
9 | """
10 | from ch10.account_pool.exceptions import InitException
11 | from ch10.account_pool.setting import *
12 | from ch10.account_pool.storages_redis import RedisClient
13 | from loguru import logger
14 | import requests
15 |
16 | class BaseTester:
17 | def __init__(self, website=None):
18 | self.website = website
19 | if not self.website:
20 | raise InitException
21 | self.account_operator = RedisClient(type='account', website=self.website)
22 | self.credential_operator = RedisClient(type='credential', website=self.website)
23 |
24 | def test(self, username, credential):
25 | raise NotImplementedError
26 |
27 | def run(self):
28 | credentials = self.credential_operator.all()
29 | for username, credential in credentials.items():
30 | self.test(username, credential)
31 |
32 |
33 | class Antispider6Tester(BaseTester):
34 | def __init__(self, website=None):
35 | super().__init__(website)
36 |
37 | def test(self, username, credential):
38 | logger.info(f'testing credential for {username}')
39 | try:
40 | test_url = TEST_URL_MAP[self.website]
41 | response = requests.get(test_url, headers={
42 | 'Cookie': credential
43 | }, timeout=TEST_TIMEOUT, allow_redirects=False)
44 | if response.status_code == 200:
45 | logger.info('credential is valid')
46 | else:
47 | logger.info('credential is not valid, delete it')
48 | self.credential_operator.delete(username)
49 | except Exception as e:
50 | logger.error(f'test failed: {e}')
51 | logger.info('credential is not valid, delete it')
52 | self.credential_operator.delete(username)
--------------------------------------------------------------------------------
/src/ch04/mongodb_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: mongodb_demo.py
6 | @time: 2022/1/5 20:16
7 | @project: python3-web-spider-learning
8 | @desc: 4.5 MongoDB document storage (P144~P150)
9 | """
10 | import pymongo
11 |
12 |
13 | def insert_data():
14 |     # Insert documents
15 | student = {
16 | 'id': '20170101',
17 | 'name': 'Jordan',
18 | 'age': 20,
19 | 'gender': 'male'
20 | }
21 | result = collection.insert_one(student)
22 | print(result)
23 |
24 | student2 = {
25 | 'id': '20170102',
26 | 'name': 'Mike',
27 | 'age': 21,
28 | 'gender': 'male'
29 | }
30 | result = collection.insert_one(student2)
31 | print(result.inserted_id)
32 |
33 |
34 | def select_data():
35 |     # Query documents
36 | result = collection.find_one({'name': 'Mike'})
37 | print(type(result))
38 | print(result)
39 |
40 | results = collection.find({'age': {'$gt': 20}})
41 | print(results)
42 | for result in results:
43 | print(result)
44 |
45 |
46 | def counts():
47 |     # Count documents (count_documents requires a filter; {} matches all)
48 |     count = collection.count_documents({})
49 | print(count)
50 |
51 |
52 | def sort():
53 |     # Sort by name
54 | results = collection.find().sort('name', pymongo.ASCENDING)
55 | print([result['name'] for result in results])
56 |
57 |
58 | def skip():
59 |     # Skip the first two results
60 | results = collection.find().sort('name', pymongo.ASCENDING).skip(2)
61 | print([result['name'] for result in results])
62 |
63 |
64 | def update_data():
65 | condition = {'name': 'Mike'}
66 | student = collection.find_one(condition)
67 | student['age'] = 25
68 | result = collection.update_one(condition, {'$set': student})
69 | print(result)
70 | print(result.matched_count, result.modified_count)
71 |
72 |
73 | if __name__ == '__main__':
74 |     # Connect to MongoDB
75 |     client = pymongo.MongoClient(host='localhost', port=27017)
76 |     # Select the test database
77 |     db = client['test']
78 |     # Select the students collection
79 |     collection = db.students
80 | update_data()
81 |
--------------------------------------------------------------------------------
/src/ch11/js_scrape_practice.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: js_scrape_practice.py
6 | @time: 2022/1/14 18:50
7 | @project: python3-web-spider-learning
8 | @desc: 11.13 JavaScript reverse-engineering crawl in practice (P507)
9 |     Target: scrape https://spa6.scrape.center/
10 |     Key difficulties:
11 |     (1) The list-page Ajax API takes an encrypted token parameter
12 |     (2) The detail-page URL contains an encrypted id
13 |     (3) The detail-page Ajax API takes both an encrypted id and an encrypted token
14 |     (4) The Ajax API is time-limited and starts returning 401 after a while
15 |     (5) The front-end JavaScript is minified and obfuscated
16 |     Reverse-engineering approach:
17 |     (1) Search globally for "token" to find where the Ajax request is built, and set a breakpoint there
18 |     (2) Analyze the list-page encryption by inspecting the variables: put /api/movie into a list, append the current
19 |         timestamp, join with commas, SHA1-hash it, join the digest with the timestamp again, and Base64-encode the result
20 |     (3) Analyze the detail-page encrypted id: hook btoa (Tampermonkey injection works well); the id is a fixed secret string plus the numeric id, Base64-encoded
21 |     (4) Analyze the detail-page Ajax token: it is built the same way as the list-page token
22 | """
23 | import base64
24 | import hashlib
25 | import time
26 |
27 | import requests
28 |
29 | INDEX_URL = 'https://spa6.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}'
30 | DETAIL_URL = 'https://spa6.scrape.center/api/movie/{id}?token={token}'
31 | LIMIT = 10
32 | OFFSET = 0
33 | SECRET = 'ef34#teuq0btua#(-57w1q5o5--j@98xygimlyfxs*-!i-0-mb'
34 |
35 |
36 | # Build the token
37 | def get_token(args: list):
38 | timestamp = str(int(time.time()))
39 | args.append(timestamp)
40 | sign = hashlib.sha1(','.join(args).encode('utf-8')).hexdigest()
41 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8')
42 |
43 |
44 | args = ['/api/movie']
45 | token = get_token(args)
46 | # Build the list-page URL
47 | index_url = INDEX_URL.format(limit=LIMIT, offset=OFFSET, token=token)
48 | response = requests.get(index_url)
49 | print('response:', response.json())
50 |
51 | result = response.json()
52 |
53 | for item in result['results']:
54 | id = item['id']
55 | encrypt_id = base64.b64encode((SECRET + str(id)).encode('utf-8')).decode('utf-8')
56 | args = [f'/api/movie/{encrypt_id}']
57 | token = get_token(args=args)
58 |     # Build the detail-page URL
59 | detail_url = DETAIL_URL.format(id=encrypt_id, token=token)
60 | response = requests.get(detail_url)
61 | print('detail response:', response.json())
62 |
--------------------------------------------------------------------------------
/src/ch03/parsel_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: parsel_demo.py
6 | @time: 2022/1/5 16:42
7 | @project: python3-web-spider-learning
8 | @desc: 3.4 Using parsel (P124~P127)
9 | """
10 | from parsel import Selector
11 |
12 | html = '''
13 |
22 | '''
23 |
24 |
25 | def parsel_demo():
26 | selector = Selector(text=html)
27 | items = selector.css('.item-0')
28 | print(len(items), type(items), items)
29 | items2 = selector.xpath('//li[contains(@class, "item-0")]')
30 | print(len(items2), type(items2), items2)
31 |
32 |
33 | def extract_text():
34 | selector = Selector(text=html)
35 | items = selector.css('.item-0')
36 | for item in items:
37 | text = item.xpath('.//text()').get()
38 | print(text)
39 |
40 | result = selector.xpath('//li[contains(@class, "item-0")]//text()').getall()
41 | print(result)
42 |
43 |
44 | def extract_attrs():
45 | selector = Selector(text=html)
46 | result = selector.css('.item-0.active a::attr(href)').get()
47 | print(result)
48 |
49 | result = selector.xpath('//li[contains(@class, "item-0") and contains(@class, "active")]/a/@href').get()
50 | print(result)
51 |
52 |
53 | def extract_re():
54 | selector = Selector(text=html)
55 | result = selector.css('.item-0').re('link.*')
56 | print(result)
57 |
58 | result = selector.css('.item-0 *::text').re('.*item')
59 | print(result)
60 |
61 |     result = selector.css('.item-0').re_first('<span class="bold">(.*?)</span>')
62 | print(result)
63 |
64 |
65 | if __name__ == '__main__':
66 | extract_re()
67 |
--------------------------------------------------------------------------------
/src/ch08/tesserocr_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: tesserocr_demo.py
6 | @time: 2022/1/11 9:37
7 | @project: python3-web-spider-learning
8 | @desc: 8.1 Recognizing graphic CAPTCHAs with OCR (P296)
9 | """
10 | import re
11 | import time
12 | from io import BytesIO
13 |
14 | import numpy as np
15 | import tesserocr
16 | from PIL import Image
17 | from retrying import retry
18 | from selenium import webdriver
19 | from selenium.common.exceptions import TimeoutException
20 | from selenium.webdriver.common.by import By
21 | from selenium.webdriver.support.wait import WebDriverWait
22 | from selenium.webdriver.support import expected_conditions as EC
23 |
24 |
25 | def preprocess(image):
26 | image = image.convert('L')
27 | array = np.array(image)
28 | array = np.where(array > 115, 255, 0)
29 | image = Image.fromarray(array.astype('uint8'))
30 | return image
31 |
32 |
33 | @retry(stop_max_attempt_number=10, retry_on_result=lambda x: x is False)
34 | def login():
35 | """
36 | 最大尝试10次
37 | """
38 | browser.get('https://captcha7.scrape.center/')
39 | browser.find_element_by_css_selector('.username input[type="text"]').send_keys('admin')
40 | browser.find_element_by_css_selector('.password input[type="password"]').send_keys('admin')
41 | captcha = browser.find_element_by_css_selector('#captcha')
42 | image = Image.open(BytesIO(captcha.screenshot_as_png))
43 | image = preprocess(image)
44 | captcha = tesserocr.image_to_text(image)
45 | captcha = re.sub('[^A-Za-z0-9]', '', captcha)
46 | print("Captcha:", captcha)
47 | browser.find_element_by_css_selector('.captcha input[type="text"]').send_keys(captcha)
48 | browser.find_element_by_css_selector('.login').click()
49 |
50 | try:
51 | WebDriverWait(browser, 4).until(EC.presence_of_element_located((By.XPATH, '//h2[contains(., "登录成功")]')))
52 | time.sleep(10)
53 | browser.close()
54 | return True
55 | except TimeoutException:
56 | return False
57 |
58 |
59 | if __name__ == '__main__':
60 | browser = webdriver.Chrome()
61 | login()
62 |
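Note: the find_element_by_css_selector helpers used above were removed in Selenium 4.3; with the By import already present, the equivalent call under the current API looks like this (an adaptation, not part of the original script):

    browser.find_element(By.CSS_SELECTOR, '.username input[type="text"]').send_keys('admin')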
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/java/com/germey/andservertest/MainActivity.java:
--------------------------------------------------------------------------------
1 | package com.germey.andservertest;
2 |
3 | import androidx.appcompat.app.AppCompatActivity;
4 |
5 | import android.os.Bundle;
6 | import android.util.Log;
7 | import android.view.View;
8 | import android.widget.Button;
9 | import android.widget.TextView;
10 |
11 | import com.yanzhenjie.andserver.AndServer;
12 | import com.yanzhenjie.andserver.Server;
13 |
14 | import java.util.concurrent.TimeUnit;
15 |
16 | public class MainActivity extends AppCompatActivity {
17 |
18 | private Server server;
19 | private Button button;
20 | private TextView textView;
21 |
22 | @Override
23 | protected void onCreate(Bundle savedInstanceState) {
24 | super.onCreate(savedInstanceState);
25 | setContentView(R.layout.activity_main);
26 | button = findViewById(R.id.toggle_server);
27 | textView = findViewById(R.id.server_status);
28 | server = AndServer.webServer(getApplicationContext())
29 | .port(8080)
30 | .timeout(10, TimeUnit.SECONDS)
31 | .listener(new Server.ServerListener() {
32 | @Override
33 | public void onStarted() {
34 | button.setText(R.string.stop_server);
35 | textView.setText(R.string.server_started);
36 | }
37 |
38 | @Override
39 | public void onStopped() {
40 | button.setText(R.string.start_server);
41 | textView.setText(R.string.server_stopped);
42 | }
43 |
44 | @Override
45 | public void onException(Exception e) {
46 | Log.d("AndServer", e.toString());
47 | }
48 | })
49 | .build();
50 | button.setText(R.string.start_server);
51 | textView.setText(R.string.server_stopped);
52 | }
53 |
54 | public void toggleServer(View view) {
55 | if (!server.isRunning()) {
56 | server.startup();
57 | } else {
58 | server.shutdown();
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/src/ch10/antispider_scrape_with_account_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: antispider_scrape_with_account_pool.py
6 | @time: 2022/1/12 17:18
7 | @project: python3-web-spider-learning
8 | @desc: Scrape pages using the account pool (P394)
9 | """
10 | import asyncio
11 | from pyquery import PyQuery as pq
12 | from loguru import logger
13 | import aiohttp
14 | from aiohttp import TCPConnector
15 |
16 | MAX_ID = 20
17 | CONCURRENCY = 2
18 | TARGET_URL = 'https://antispider6.scrape.center'
19 | ACCOUNT_POOL_URL = 'http://localhost:6789/antispider6/random'
20 |
21 | semaphore = asyncio.Semaphore(CONCURRENCY)
22 |
23 |
24 | async def parse_detail(html):
25 | doc = pq(html)
26 | title = doc('.item h2').text()
27 | categories = [item.text() for item in doc('.item .categories span').items()]
28 | cover = doc('.item .cover').attr('src')
29 | score = doc('.item .score').text()
30 | drama = doc('.item .drama').text().strip()
31 |
32 | return {
33 | 'title': title,
34 | 'categories': categories,
35 | 'cover': cover,
36 | 'score': score,
37 | 'drama': drama
38 | }
39 |
40 |
41 | async def fetch_credential(session):
42 | async with session.get(ACCOUNT_POOL_URL) as response:
43 | return await response.text()
44 |
45 |
46 | async def scrape_detail(session, url):
47 | async with semaphore:
48 | credential = await fetch_credential(session)
49 | headers = {'cookie': credential}
50 | logger.debug(f'scrape {url} using credential {credential}')
51 | async with session.get(url, headers=headers) as response:
52 | html = await response.text()
53 | data = await parse_detail(html)
54 | logger.debug(f'data {data}')
55 |
56 |
57 | async def main():
58 | session = aiohttp.ClientSession(connector=TCPConnector(ssl=False))
59 | tasks = []
60 | for i in range(1, MAX_ID + 1):
61 | url = f'{TARGET_URL}/detail/{i}'
62 | task = asyncio.ensure_future(scrape_detail(session, url))
63 | tasks.append(task)
64 | await asyncio.gather(*tasks)
65 |
66 |
67 | if __name__ == '__main__':
68 | asyncio.get_event_loop().run_until_complete(main())
69 |
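Note: the ClientSession created in main() is never closed, which produces an "Unclosed client session" warning on exit. A sketch of the same logic with the session managed by an async context manager:

    async def main():
        async with aiohttp.ClientSession(connector=TCPConnector(ssl=False)) as session:
            tasks = [asyncio.ensure_future(scrape_detail(session, f'{TARGET_URL}/detail/{i}'))
                     for i in range(1, MAX_ID + 1)]
            await asyncio.gather(*tasks)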
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/spiders/scrape.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import Request
3 |
4 | from ch15.scrapyitempipelinedemo.scrapyitempipelinedemo.items import MovieItem
5 |
6 |
7 | class ScrapeSpider(scrapy.Spider):
8 | name = 'scrape'
9 | allowed_domains = ['ssr1.scrape.center']
10 | base_url = 'https://ssr1.scrape.center'
11 | max_page = 10
12 |
13 | def start_requests(self):
14 | for i in range(1, self.max_page + 1):
15 | url = f'{self.base_url}/page/{i}'
16 | yield Request(url, callback=self.parse_index)
17 |
18 | def parse_index(self, response):
19 | for item in response.css('.item'):
20 | href = item.css('.name::attr(href)').extract_first()
21 | url = response.urljoin(href)
22 | yield Request(url, callback=self.parse_detail)
23 |
24 | def parse_detail(self, response):
25 | item = MovieItem()
26 | item['name'] = response.xpath('//div[contains(@class, "item")]//h2/text()').extract_first()
27 | item['categories'] = response.xpath('//button[contains(@class, "category")]/span/text()').extract()
28 | item['score'] = response.css('.score::text').re_first('[\d\.]+')
29 | item['drama'] = response.css('.drama p::text').extract_first().strip()
30 | item['directors'] = []
31 | directors = response.xpath('//div[contains(@class, "directors")]//div[contains(@class, "director")]')
32 | for director in directors:
33 | director_image = director.xpath('.//img[@class="image"]/@src').extract_first()
34 | director_name = director.xpath('.//p[contains(@class, "name")]/text()').extract_first()
35 | item['directors'].append({
36 | 'name': director_name,
37 | 'image': director_image
38 | })
39 | item['actors'] = []
40 | actors = response.css('.actors .actor')
41 | for actor in actors:
42 | actor_image = actor.css('.actor .image::attr(src)').extract_first()
43 | actor_name = actor.css('.actor .name::text').extract_first()
44 | item['actors'].append({
45 | 'name': actor_name,
46 | 'image': actor_image
47 | })
48 | yield item
49 |
--------------------------------------------------------------------------------
/src/ch05/scrape_ajax.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: scrape_ajax.py
6 | @time: 2022/1/6 15:46
7 | @project: python3-web-spider-learning
8 | @desc: 5.3 Ajax analysis and a scraping example (P184~P190)
9 | """
10 | import logging
11 |
12 | import pymongo
13 | import requests
14 |
15 | logging.basicConfig(level=logging.INFO,
16 | format='%(asctime)s - %(levelname)s: %(message)s')
17 |
18 | INDEX_URL = 'https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}'
19 |
20 |
21 | def scrape_api(url):
22 | """
23 |     Scrape an API URL and return the parsed JSON
24 | """
25 | logging.info('scraping %s...', url)
26 | try:
27 | response = requests.get(url)
28 | if response.status_code == 200:
29 | return response.json()
30 | logging.error('get invalid status code %s while scraping %s', response.status_code, url)
31 | except requests.RequestException:
32 | logging.error('error occurred while scraping %s', url, exc_info=True)
33 |
34 |
35 | LIMIT = 10
36 |
37 |
38 | def scrape_index(page):
39 | """
40 |     Scrape a list page
41 | """
42 | url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
43 | return scrape_api(url)
44 |
45 |
46 | DETAIL_URL = 'https://spa1.scrape.center/api/movie/{id}'
47 |
48 |
49 | def scrape_detail(id):
50 | """
51 |     Scrape a detail page
52 | """
53 | url = DETAIL_URL.format(id=id)
54 | return scrape_api(url)
55 |
56 |
57 | TOTAL_PAGE = 10
58 | MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
59 | MONGO_DB_NAME = 'movies'
60 | MONGO_COLLECTION_NAME = 'movies'
61 |
62 | client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
63 | db = client[MONGO_DB_NAME]
64 | collection = db[MONGO_COLLECTION_NAME]
65 |
66 |
67 | def save_data(data):
68 | collection.update_one({
69 | 'name': data.get('name')
70 | }, {'$set': data}, upsert=True)
71 |
72 |
73 | def main():
74 | for page in range(1, TOTAL_PAGE + 1):
75 | index_data = scrape_index(page)
76 | for item in index_data.get('results'):
77 | id = item.get('id')
78 | detail_data = scrape_detail(id)
79 | logging.info('detail data %s', detail_data)
80 | save_data(detail_data)
81 | logging.info('data saved successfully')
82 |
83 |
84 | if __name__ == '__main__':
85 | main()
86 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: generator.py
6 | @time: 2022/1/12 10:34
7 | @project: python3-web-spider-learning
8 | @desc: Generator module: pulls each account from the storage module, simulates a login, and saves the cookies produced by a successful login back to the storage module
9 | """
10 |
11 | import requests
12 | from loguru import logger
13 |
14 | from ch10.account_pool.exceptions import InitException
15 | from ch10.account_pool.storages_redis import RedisClient
16 |
17 |
18 | class BaseGenerator:
19 | def __init__(self, website=None):
20 | self.website = website
21 | if not self.website:
22 | raise InitException
23 | self.account_operator = RedisClient(type='account', website=self.website)
24 | self.credential_operator = RedisClient(type='credential', website=self.website)
25 |
26 | def generate(self, username, password):
27 | raise NotImplementedError
28 |
29 | def init(self):
30 | pass
31 |
32 | def run(self):
33 | self.init()
34 | logger.debug('start to run generator')
35 | for username, password in self.account_operator.all().items():
36 | if self.credential_operator.get(username):
37 | continue
38 |             logger.debug(f'start to generate credential of {username}')
39 | self.generate(username, password)
40 |
41 |
42 | class Antispider6Generator(BaseGenerator):
43 | def generate(self, username, password):
44 | if self.credential_operator.get(username):
45 | logger.debug(f'credential of {username} exists, skip')
46 | return
47 | login_url = 'https://antispider6.scrape.center/login'
48 | s = requests.Session()
49 | try:
50 | s.post(login_url, data={
51 | 'username': username,
52 | 'password': password
53 | })
54 | result = []
55 | for cookie in s.cookies:
56 | print(cookie.name, cookie.value)
57 | result.append(f'{cookie.name}={cookie.value}')
58 | result = ';'.join(result)
59 | if len(result) > 0:
60 | logger.debug(f'get {username} credential {result}')
61 | self.credential_operator.set(username, result)
62 | except Exception as e:
63 | logger.error(f'get {username} credential failed: {e}')
64 |
--------------------------------------------------------------------------------
/src/ch10/session_cookie_simulate_login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: session_cookie_simulate_login.py
6 | @time: 2022/1/12 9:12
7 | @project: python3-web-spider-learning
8 | @desc: 10.2 Simulated login and scraping based on sessions and cookies (P376)
9 | """
10 | import time
11 | from urllib.parse import urljoin
12 | import requests
13 | from selenium import webdriver
14 |
15 | BASE_URL = 'https://login2.scrape.center/'
16 | LOGIN_URL = urljoin(BASE_URL, '/login')
17 | INDEX_URL = urljoin(BASE_URL, '/page/1')
18 | USERNAME = 'admin'
19 | PASSWORD = 'admin'
20 |
21 |
22 | def simul_login_with_cookies():
23 |     # Log in to the site
24 | response_login = requests.post(LOGIN_URL, data={
25 | 'username': USERNAME,
26 | 'password': PASSWORD
27 | }, allow_redirects=False)
28 |
29 |     # Save the cookies
30 | cookies = response_login.cookies
31 | print('Cookies:', cookies)
32 |
33 |     # Request the listing page with the cookies
34 | response_index = requests.get(INDEX_URL, cookies=cookies)
35 | print('Response Status', response_index.status_code)
36 | print('Response URL', response_index.url)
37 |
38 |
39 | def simul_login_with_session():
40 | session = requests.Session()
41 |
42 |     # Log in to the site
43 | response_login = session.post(LOGIN_URL, data={
44 | 'username': USERNAME,
45 | 'password': PASSWORD
46 | })
47 |
48 | # Save the Cookies
49 | cookies = session.cookies
50 | print('Cookies:', cookies)
51 |
52 | # Visit the listing page (the session carries the cookies automatically)
53 | response_index = session.get(INDEX_URL)
54 | print('Response Status', response_index.status_code)
55 | print('Response URL', response_index.url)
56 |
57 |
58 | def simul_login_with_selenium():
59 | browser = webdriver.Chrome()
60 | browser.get(BASE_URL)
61 | browser.find_element_by_css_selector('input[name="username"]').send_keys(USERNAME)
62 | browser.find_element_by_css_selector('input[name="password"]').send_keys(PASSWORD)
63 | browser.find_element_by_css_selector('input[type="submit"]').click()
64 | time.sleep(10)
65 |
66 | # Get the Cookie information from the browser object
67 | cookies = browser.get_cookies()
68 | print('Cookies:', cookies)
69 | browser.close()
70 |
71 | # Put the Cookie information into the requests session
72 | session = requests.Session()
73 | for cookie in cookies:
74 | session.cookies.set(cookie['name'], cookie['value'])
75 |
76 | response_index = session.get(INDEX_URL)
77 | print('Response Status', response_index.status_code)
78 | print('Response URL', response_index.url)
79 |
80 |
81 | if __name__ == '__main__':
82 | simul_login_with_selenium()
83 |
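Note that the find_element_by_css_selector helpers used above were removed in newer Selenium 4 releases. A small equivalent sketch of the login step with the By locator API, using the same constants as the script above:

from selenium import webdriver
from selenium.webdriver.common.by import By

BASE_URL = 'https://login2.scrape.center/'
USERNAME = PASSWORD = 'admin'

browser = webdriver.Chrome()
browser.get(BASE_URL)
# Same selectors as above, expressed with the Selenium 4 locator API
browser.find_element(By.CSS_SELECTOR, 'input[name="username"]').send_keys(USERNAME)
browser.find_element(By.CSS_SELECTOR, 'input[name="password"]').send_keys(PASSWORD)
browser.find_element(By.CSS_SELECTOR, 'input[type="submit"]').click()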
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/request_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: request_demo.py
6 | @time: 2021/12/29 15:17
7 | @project: python3-web-spider-learning
8 | @desc: urllib.request module examples (P30~P34)
9 | """
10 | import socket
11 | import urllib.request
12 | import urllib.error
13 | import urllib.parse
14 |
15 |
16 | def print_content(url='https://www.python.org'):
17 | response = urllib.request.urlopen(url)
18 | # Print the page source
19 | print(response.read().decode('utf-8'))
20 |
21 |
22 | def print_response_type(url='https://www.python.org'):
23 | response = urllib.request.urlopen(url)
24 | # Print the response type
25 | print(type(response))
26 |
27 |
28 | def print_status(url='https://www.python.org'):
29 | response = urllib.request.urlopen(url)
30 | # Print the response status code
31 | print(response.status)
32 |
33 |
34 | def print_header(name='Server', url='https://www.python.org'):
35 | response = urllib.request.urlopen(url)
36 | # Print the response headers
37 | print(response.getheaders())
38 | if name:
39 | # Print the value of the specified response header
40 | print(response.getheader(name))
41 |
42 |
43 | def print_content_with_data(url='https://www.httpbin.org/post'):
44 | data = bytes(urllib.parse.urlencode({'name': 'germey'}), encoding='utf-8')
45 | # Use the data parameter
46 | response = urllib.request.urlopen(url, data=data)
47 | print(response.read().decode('utf-8'))
48 |
49 |
50 | def print_content_with_timeout(url='https://www.httpbin.org/get'):
51 | # Use the timeout parameter
52 | response = urllib.request.urlopen(url, timeout=0.1)
53 | print(response.read())
54 |
55 |
56 | def print_content_with_try_except(url='https://www.httpbin.org/get'):
57 | # Catch the timeout error raised by the timeout parameter
58 | try:
59 | urllib.request.urlopen(url, timeout=0.1)
60 | except urllib.error.URLError as e:
61 | if isinstance(e.reason, socket.timeout):
62 | print('TIME OUT')
63 |
64 |
65 | def print_content_with_request(url='https://www.httpbin.org/post'):
66 | # Specify the User-Agent and Host in the headers
67 | headers = {
68 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
69 | 'Host': 'www.httpbin.org'
70 | }
71 |
72 | data_dict = {'name': 'germey'}
73 | # Convert the dict data to a byte stream
74 | data = bytes(urllib.parse.urlencode(data_dict), encoding='utf-8')
75 | # Construct a Request object
76 | req = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
77 | response = urllib.request.urlopen(req)
78 | print(response.read().decode('utf-8'))
79 |
80 |
81 | if __name__ == '__main__':
82 | print_content_with_request()
83 |
--------------------------------------------------------------------------------
/src/ch02/requests_demo/requests_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: requests_demo.py
6 | @time: 2021/12/31 13:52
7 | @project: python3-web-spider-learning
8 | @desc: Basic usage of requests (P48~P55)
9 | """
10 | import os
11 | import re
12 |
13 | import requests
14 |
15 |
16 | def print_get_request():
17 | r = requests.get('https://www.baidu.com')
18 | print(type(r))
19 | print(r.status_code)
20 | print(type(r.text))
21 | print(r.text[:100])
22 | print(r.cookies)
23 |
24 |
25 | def print_request():
26 | r = requests.get('https://www.httpbin.org/get')
27 | r = requests.post('https://www.httpbin.org/post')
28 | r = requests.put('https://www.httpbin.org/put')
29 | r = requests.delete('https://www.httpbin.org/delete')
30 | r = requests.patch('https://www.httpbin.org/patch')
31 |
32 |
33 | def print_get_with_params(url, params):
34 | r = requests.get(url, params=params)
35 | print(r.text)
36 |
37 |
38 | def print_json():
39 | r = requests.get('https://www.httpbin.org/get')
40 | print(type(r.text))
41 | print(r.json())
42 | print(type(r.json()))
43 |
44 |
45 | def fetch_web():
46 | r = requests.get('https://ssr1.scrape.center/')
47 | pattern = re.compile('<h2.*?>(.*?)</h2>', re.S)
48 | titles = re.findall(pattern, r.text)
49 | print(titles)
50 |
51 |
52 | def get_favicon():
53 | if not os.path.exists('../files'):
54 | os.mkdir('../files')
55 |
56 | r = requests.get('https://scrape.center/favicon.ico')
57 | with open('../files/favicon.ico', 'wb') as f:
58 | f.write(r.content)
59 |
60 |
61 | def print_get_with_headers():
62 | headers = {
63 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)'
64 | ' Chrome/52.0.2743.116 Safari/537.36'
65 | }
66 | r = requests.get('https://ssr1.scrape.center/', headers=headers)
67 | print(r.text)
68 |
69 |
70 | def print_post():
71 | data = {
72 | 'name': 'germey',
73 | 'age': '25'
74 | }
75 | r = requests.post("https://www.httpbin.org/post", data=data)
76 | print(r.text)
77 |
78 |
79 | def check_request():
80 | r = requests.get('https://ssr1.scrape.center/')
81 | exit() if not r.status_code == requests.codes.ok else print('Request Successfully')
82 |
83 |
84 | if __name__ == '__main__':
85 | # url = 'https://www.httpbin.org/get'
86 | # data = {
87 | # 'name': 'germey',
88 | # 'age': 25
89 | # }
90 | #
91 | # print_get_with_params(url, data)
92 |
93 | check_request()
94 |
--------------------------------------------------------------------------------
/src/ch06/aiohttp_scrape_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: aiohttp_scrape_demo.py
6 | @time: 2022/1/6 20:06
7 | @project: python3-web-spider-learning
8 | @desc: 6.3 Practical asynchronous scraping with aiohttp (P207~P211)
9 | """
10 | import asyncio
11 | import json
12 | import logging
13 | from aiohttp import TCPConnector
14 | import aiohttp
15 | from motor.motor_asyncio import AsyncIOMotorClient
16 |
17 | logging.basicConfig(level=logging.INFO,
18 | format='%(asctime)s - %(levelname)s: %(message)s')
19 |
20 | INDEX_URL = 'https://spa5.scrape.center/api/book?limit=18&offset={offset}'
21 | DETAIL_URL = 'https://spa5.scrape.center/api/book/{id}'
22 | PAGE_SIZE = 18
23 | PAGE_NUMBER = 100
24 | CONCURRENCY = 5
25 |
26 | semaphore = asyncio.Semaphore(CONCURRENCY)
27 | session = None
28 |
29 |
30 | async def scrape_api(url):
31 | """
32 | Generic scraping method
33 | """
34 | async with semaphore:
35 | try:
36 | logging.info('scraping %s', url)
37 | async with session.get(url) as response:
38 | return await response.json()
39 | except aiohttp.ClientError:
40 | logging.error('error occurred while scraping %s', url, exc_info=True)
41 |
42 |
43 | async def scrape_index(page):
44 | """
45 | Scrape a listing page
46 | """
47 | url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
48 | return await scrape_api(url)
49 |
50 |
51 | MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
52 | MONGO_DB_NAME = 'books'
53 | MONGO_COLLECTION_NAME = 'books'
54 |
55 | client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
56 | db = client[MONGO_DB_NAME]
57 | collection = db[MONGO_COLLECTION_NAME]
58 |
59 |
60 | async def save_data(data):
61 | logging.info('saving data %s', data)
62 | if data:
63 | return await collection.update_one({
64 | 'id': data.get('id')
65 | }, {
66 | '$set': data
67 | }, upsert=True)
68 |
69 |
70 | async def scrape_detail(id):
71 | url = DETAIL_URL.format(id=id)
72 | data = await scrape_api(url)
73 | await save_data(data)
74 |
75 |
76 | async def main():
77 | global session
78 | session = aiohttp.ClientSession(connector=TCPConnector(ssl=False))
79 | scrape_index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(1, PAGE_NUMBER + 1)]
80 | results = await asyncio.gather(*scrape_index_tasks)
81 | logging.info('results %s', json.dumps(results, ensure_ascii=False, indent=2))
82 |
83 | # Collect the IDs of all books
84 | ids = []
85 | for index_data in results:
86 | if not index_data:
87 | continue
88 | for item in index_data.get('results'):
89 | ids.append(item.get('id'))
90 |
91 | scrape_detail_tasks = [asyncio.ensure_future(scrape_detail(id)) for id in ids]
92 | await asyncio.wait(scrape_detail_tasks)
93 | await session.close()
94 |
95 |
96 | if __name__ == '__main__':
97 | asyncio.get_event_loop().run_until_complete(main())
98 |
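On Python 3.10 and later, calling asyncio.get_event_loop() outside a running loop is deprecated; a drop-in alternative entry point for the same main() coroutine is:

if __name__ == '__main__':
    asyncio.run(main())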
--------------------------------------------------------------------------------
/src/ch04/elasticsearch_oper_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: elasticsearch_oper_demo.py
6 | @time: 2022/1/6 9:15
7 | @project: python3-web-spider-learning
8 | @desc: 4.7 Elasticsearch search-engine storage (P161~P166)
9 | """
10 | from elasticsearch import Elasticsearch
11 |
12 |
13 | def create_index():
14 | result = es.indices.create(index='news', ignore=400)
15 | print(result)
16 |
17 |
18 | def delete_index():
19 | result = es.indices.delete(index='news', ignore=[400, 404])
20 | print(result)
21 |
22 |
23 | def insert_data():
24 | es.indices.create(index='news', ignore=400)
25 |
26 | data = {
27 | 'title': '乘风破浪不负韶华,奋斗青春圆梦高考',
28 | 'url': 'http://view.indws.qq.com/a/EDU20210416007322200'
29 | }
30 | result = es.create(index='news', id=1, body=data)
31 | print(result)
32 |
33 |
34 | def update_data():
35 | data = {
36 | 'title': '乘风破浪不负韶华,奋斗青春圆梦高考',
37 | 'url': 'http://view.indws.qq.com/a/EDU20210416007322200',
38 | 'date': '2021-07-05'
39 | }
40 | result = es.update(index='news', body=data, id=1, ignore=400)
41 | print(result)
42 |
43 |
44 | def delete_data():
45 | result = es.delete(index='news', id=1)
46 | print(result)
47 |
48 |
49 | def select_data():
50 | mapping = {
51 | 'properties': {
52 | 'title': {
53 | 'type': 'text',
54 | 'analyzer': 'ik_max_word',
55 | 'search_analyzer': 'ik_max_word'
56 | }
57 | }
58 | }
59 | es.indices.delete(index='news', ignore=[400, 404])
60 | es.indices.create(index='news', ignore=400)
61 | result = es.indices.put_mapping(index='news', body=mapping)
62 | print(result)
63 |
64 | datas = [
65 | {
66 | 'title': '高考结局大不同',
67 | 'url': 'https://k.sina.com.cn/article_7571064628_1c3454734001011lz9.html',
68 | },
69 | {
70 | 'title': '进入职业大洗牌时代,“吃香”职业还吃香吗?',
71 | 'url': 'https://new.qq.com/omn/20210828/20210828A025LK00.html',
72 | },
73 | {
74 | 'title': '乘风破浪不负韶华,奋斗青春圆梦高考',
75 | 'url': 'http://view.inews.qq.com/a/EDU2021041600732200',
76 | },
77 | {
78 | 'title': '他,活出了我们理想的样子',
79 | 'url': 'https://new.qq.com/omn/20210821/20210821A020ID00.html',
80 | }
81 | ]
82 |
83 | for data in datas:
84 | es.index(index='news', body=data)
85 |
86 | result = es.search(index='news')
87 | print(result)
88 |
89 |
90 | def full_text_search():
91 | dsl = {
92 | 'query': {
93 | 'match': {
94 | 'title': '高考 圆梦'
95 | }
96 | }
97 | }
98 | result = es.search(index='news', body=dsl)
99 | print(result)
100 |
101 |
102 | if __name__ == '__main__':
103 | es = Elasticsearch()
104 | full_text_search()
105 |
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/src/ch08/opencv_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: opencv_demo.py
6 | @time: 2022/1/11 10:41
7 | @project: python3-web-spider-learning
8 | @desc: 8.2 Using OpenCV to detect the gap in a slider CAPTCHA (P298~P303)
9 | """
10 | import cv2
11 |
12 | GAUSSIAN_BLUR_KERNEL_SIZE = (5, 5)
13 | GAUSSIAN_BLUR_SIGMA_X = 0
14 | CANNY_THRESHOLD1 = 200
15 | CANNY_THRESHOLD2 = 450
16 |
17 |
18 | def get_gaussian_blur_image(image):
19 | """
20 | Return the image after Gaussian blurring
21 | """
22 | return cv2.GaussianBlur(image, GAUSSIAN_BLUR_KERNEL_SIZE, GAUSSIAN_BLUR_SIGMA_X)
23 |
24 |
25 | def get_canny_image(image):
26 | """
27 | Return the image after Canny edge detection
28 | """
29 | return cv2.Canny(image, CANNY_THRESHOLD1, CANNY_THRESHOLD2)
30 |
31 |
32 | def get_contours(image):
33 | """
34 | Return the contour information
35 | """
36 | contours, _ = cv2.findContours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
37 | return contours
38 |
39 |
40 | def get_contour_area_thrshold(image_width, image_height):
41 | """
42 | Define the lower and upper bounds of the target contour's area
43 | """
44 | contour_area_min = (image_width * 0.15) * (image_height * 0.25) * 0.8
45 | contour_area_max = (image_width * 0.15) * (image_height * 0.25) * 1.2
46 | return contour_area_min, contour_area_max
47 |
48 |
49 | def get_arc_threshold(image_width, image_height):
50 | """
51 | Define the lower and upper bounds of the target contour's perimeter
52 | """
53 | arc_length_min = ((image_width * 0.15) + (image_height * 0.25)) * 2 * 0.8
54 | arc_length_max = ((image_width * 0.15) + (image_height * 0.25)) * 2 * 1.2
55 | return arc_length_min, arc_length_max
56 |
57 |
58 | def get_offset_threshold(image_width):
59 | """
60 | Define the lower and upper bounds of the gap's horizontal offset
61 | """
62 | offset_min = 0.2 * image_width
63 | offset_max = 0.85 * image_width
64 | return offset_min, offset_max
65 |
66 |
67 | if __name__ == '__main__':
68 | image_raw = cv2.imread('files/slide_captcha.png')
69 | # Get the width and height of the image
70 | image_height, image_width, _ = image_raw.shape
71 | image_gaussian_blur = get_gaussian_blur_image(image_raw)
72 | cv2.imwrite('files/image_gaussian_blur.png', image_gaussian_blur)
73 | image_canny = get_canny_image(image_gaussian_blur)
74 | cv2.imwrite('files/image_canny.png', image_canny)
75 | contours = get_contours(image_canny)
76 |
77 | contour_area_min, contour_area_max = get_contour_area_thrshold(image_width, image_height)
78 | arc_length_min, arc_length_max = get_arc_threshold(image_width, image_height)
79 | offset_min, offset_max = get_offset_threshold(image_width)
80 | offset = None
81 |
82 | for contour in contours:
83 | x, y, w, h = cv2.boundingRect(contour)
84 | # Check whether this contour satisfies the gap conditions
85 | if contour_area_min < cv2.contourArea(contour) < contour_area_max and \
86 | arc_length_min < cv2.arcLength(contour, True) < arc_length_max and \
87 | offset_min < x < offset_max:
88 | # Mark it with a bounding rectangle
89 | cv2.rectangle(image_raw, (x, y), (x + w, y + h), (0, 0, 255), 2)
90 | offset = x
91 |
92 | cv2.imwrite('files/image_label.png', image_raw)
93 | print('offset:', offset)
94 |
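The script above only reports the horizontal offset of the gap; solving the CAPTCHA still requires dragging the slider by roughly that distance. A rough Selenium sketch, where the CSS selector of the slider button and the initial left margin of the puzzle piece are assumptions for illustration:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By


def drag_slider(browser, offset, piece_left=60):
    # piece_left: assumed starting x position of the puzzle piece inside the image
    slider = browser.find_element(By.CSS_SELECTOR, '.slider-button')  # assumed selector
    ActionChains(browser) \
        .click_and_hold(slider) \
        .move_by_offset(offset - piece_left, 0) \
        .release() \
        .perform()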
--------------------------------------------------------------------------------
/src/ch07/playwright_demo/event_listen.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: event_listen.py
6 | @time: 2022/1/10 19:56
7 | @project: python3-web-spider-learning
8 | @desc: Event listening (P263)
9 | """
10 | import re
11 |
12 | from playwright.sync_api import sync_playwright
13 |
14 |
15 | def sync_on_response(response):
16 | # Print the status and URL of every response
17 | # print(f'Status {response.status}: {response.url}')
18 |
19 | if '/api/movie/' in response.url and response.status == 200:
20 | print(response.json())
21 |
22 |
23 | def sync():
24 | with sync_playwright() as p:
25 | browser = p.chromium.launch(headless=False)
26 | page = browser.new_page()
27 | page.on('response', sync_on_response)
28 | page.goto('https://spa6.scrape.center/')
29 | page.wait_for_load_state('networkidle')
30 | browser.close()
31 |
32 |
33 | def get_web_source():
34 | with sync_playwright() as p:
35 | browser = p.chromium.launch(headless=False)
36 | page = browser.new_page()
37 | page.goto('https://spa6.scrape.center/')
38 | page.wait_for_load_state('networkidle')
39 | html = page.content()
40 | print(html)
41 | browser.close()
42 |
43 |
44 | def get_node_attr():
45 | with sync_playwright() as p:
46 | browser = p.chromium.launch(headless=False)
47 | page = browser.new_page()
48 | page.goto('https://spa6.scrape.center/')
49 | page.wait_for_load_state('networkidle')
50 | href = page.get_attribute('a.name', 'href')
51 | print(href)
52 | browser.close()
53 |
54 |
55 | def get_node_attrs():
56 | with sync_playwright() as p:
57 | browser = p.chromium.launch(headless=False)
58 | page = browser.new_page()
59 | page.goto('https://spa6.scrape.center/')
60 | page.wait_for_load_state('networkidle')
61 | elements = page.query_selector_all('a.name')
62 | for element in elements:
63 | print(element.get_attribute('href'))
64 | print(element.text_content())
65 | browser.close()
66 |
67 |
68 | def get_node():
69 | with sync_playwright() as p:
70 | browser = p.chromium.launch(headless=False)
71 | page = browser.new_page()
72 | page.goto('https://spa6.scrape.center/')
73 | page.wait_for_load_state('networkidle')
74 | element = page.query_selector('a.name')
75 | print(element.get_attribute('href'))
76 | print(element.text_content())
77 | browser.close()
78 |
79 |
80 | def route_demo():
81 | with sync_playwright() as p:
82 | browser = p.chromium.launch(headless=False)
83 | page = browser.new_page()
84 |
85 | def cancel_request(route, request):
86 | route.abort()
87 |
88 | page.route(re.compile(r"(\.png)|(\.jpg)"), cancel_request)
89 | page.goto("https://spa6.scrape.center/")
90 | page.wait_for_load_state('networkidle')
91 | page.screenshot(path='files/np_picture.png')
92 | browser.close()
93 |
94 |
95 | if __name__ == '__main__':
96 | route_demo()
97 |
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/request_hander_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: request_hander_demo.py
6 | @time: 2021/12/29 16:32
7 | @project: python3-web-spider-learning
8 | @desc: Authentication, proxies and Cookies (P35-P36)
9 | """
10 |
11 | import http.cookiejar
12 | import os
13 | import urllib.request
14 | from urllib.error import URLError
15 | from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler
16 | from urllib.request import ProxyHandler, build_opener
17 |
18 |
19 | def valid():
20 | username = 'admin'
21 | password = 'admin'
22 | url = 'https://ssr3.scrape.center/'
23 |
24 | p = HTTPPasswordMgrWithDefaultRealm()
25 | p.add_password(None, url, username, password)
26 | auth_handler = HTTPBasicAuthHandler(p)
27 | opener = build_opener(auth_handler)
28 |
29 | try:
30 | result = opener.open(url)
31 | html = result.read().decode('utf-8')
32 | print(html)
33 | except URLError as e:
34 | print(e.reason)
35 |
36 |
37 | def proxy():
38 | proxy_hander = ProxyHandler({
39 | 'http': 'http://127.0.0.1:8080',
40 | 'https': 'https://127.0.0.1:8080'
41 | })
42 |
43 | opener = build_opener(proxy_hander)
44 | try:
45 | response = opener.open('https://www.baidu.com')
46 | print(response.read().decode('utf-8'))
47 | except URLError as e:
48 | print(e.reason)
49 |
50 |
51 | def cookie_values():
52 | # Declare a CookieJar object
53 | cookie = http.cookiejar.CookieJar()
54 | # Build a handler
55 | handler = urllib.request.HTTPCookieProcessor(cookie)
56 | # Build an opener
57 | opener = urllib.request.build_opener(handler)
58 | response = opener.open('https://www.baidu.com')
59 | for item in cookie:
60 | print(item.name + '=' + item.value)
61 |
62 |
63 | def cookie_mozilla_content():
64 | if not os.path.exists('../files'):
65 | os.mkdir('../files')
66 |
67 | filename = '../files/mozilla_cookie.txt'
68 | cookie = http.cookiejar.MozillaCookieJar(filename)
69 | handler = urllib.request.HTTPCookieProcessor(cookie)
70 | opener = urllib.request.build_opener(handler)
71 | response = opener.open('https://www.baidu.com')
72 | cookie.save(ignore_discard=True, ignore_expires=True)
73 |
74 |
75 | def cookie_lwp_content():
76 | if not os.path.exists('../files'):
77 | os.mkdir('../files')
78 |
79 | filename = '../files/lwp_cookie.txt'
80 | cookie = http.cookiejar.LWPCookieJar(filename)
81 | handler = urllib.request.HTTPCookieProcessor(cookie)
82 | opener = urllib.request.build_opener(handler)
83 | response = opener.open('https://www.baidu.com')
84 | cookie.save(ignore_discard=True, ignore_expires=True)
85 |
86 |
87 | def cookie_use_lwp():
88 | cookie = http.cookiejar.LWPCookieJar()
89 | cookie.load('../files/lwp_cookie.txt', ignore_discard=True, ignore_expires=True)
90 | handler = urllib.request.HTTPCookieProcessor(cookie)
91 | opener = urllib.request.build_opener(handler)
92 | response = opener.open('https://www.baidu.com')
93 | print(response.read().decode('utf-8'))
94 |
95 |
96 | if __name__ == '__main__':
97 | valid()
98 |
--------------------------------------------------------------------------------
/src/ch11/learn-ast/basic/basic2.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author: HuRuiFeng
3 | * @file: basic2.js
4 | * @time: 2022-01-14 10:00:25
5 | * @project: python3-web-spider-learning
6 | * @desc: AST operations
7 | */
8 |
9 | import {parse} from "@babel/parser";
10 | import traverse from "@babel/traverse";
11 | import generate from "@babel/generator";
12 | import * as types from "@babel/types"
13 | import fs from "fs";
14 |
15 | const code = fs.readFileSync("../codes/code1.js", "utf-8")
16 | let ast = parse(code);
17 |
18 | function traverse_nodes() {
19 | // Traverse the AST nodes
20 | traverse(ast, {
21 | enter(path) {
22 | console.log(path)
23 | }
24 | })
25 | }
26 |
27 | function modify_value1() {
28 | // Change assigned values by modifying the AST
29 | traverse(ast, {
30 | enter(path) {
31 | let node = path.node;
32 | if (node.type === "NumericLiteral" && node.value === 3) {
33 | node.value = 5;
34 | }
35 | if (node.type === "StringLiteral" && node.value === "hello") {
36 | node.value = "hi";
37 | }
38 | },
39 | })
40 | const {code: output } = generate(ast, {
41 | retainLines: true,
42 | });
43 |
44 | console.log(output);
45 | }
46 |
47 | function modify_value2() {
48 | // Change assigned values by modifying the AST (node-type visitors)
49 | traverse(ast, {
50 | NumericLiteral(path) {
51 | if (path.node.value === 3) {
52 | path.node.value = 5;
53 | }
54 | },
55 | StringLiteral(path) {
56 | if (path.node.value === "hello") {
57 | path.node.value = "hi";
58 | }
59 | }
60 | })
61 | const {code: output } = generate(ast, {
62 | comments: false
63 | });
64 |
65 | console.log(output);
66 | }
67 |
68 | function delete_node() {
69 | // Remove all console.log calls
70 | traverse(ast, {
71 | CallExpression(path) {
72 | let node = path.node;
73 | if (
74 | node.callee.object.name === "console" &&
75 | node.callee.property.name === "log"
76 | ) {
77 | path.remove();
78 | }
79 | },
80 | });
81 |
82 | const {code: output } = generate(ast, {
83 | comments: false
84 | });
85 |
86 | console.log(output);
87 | }
88 |
89 | function add_node() {
90 | // Add: const b = a + 1;
91 | const code = "const a = 1;";
92 | let ast = parse(code);
93 | traverse(ast, {
94 | VariableDeclaration(path) {
95 | let init = types.binaryExpression(
96 | "+",
97 | types.identifier("a"),
98 | types.numericLiteral(1)
99 | );
100 | let declarator = types.variableDeclarator(types.identifier("b"), init);
101 | let declaration = types.variableDeclaration("const", [declarator]);
102 | path.insertAfter(declaration);
103 | path.stop();
104 | },
105 | });
106 | const output = generate(ast, {
107 | retainLines: true,
108 | }).code;
109 | console.log(output);
110 | }
111 |
112 | add_node()
113 |
114 |
--------------------------------------------------------------------------------
/src/ch12/airtest_script.air/airtest_script.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: airtest_script.py
6 | @time: 2022/1/16 2:45
7 | @project: python3-web-spider-learning
8 | @desc: 12.7 Practical App scraping based on Airtest (P586)
9 | """
10 | import os
11 |
12 | from airtest.core.api import stop_app, start_app, keyevent, swipe, connect_device
13 | import json
14 | from loguru import logger
15 | from poco.drivers.android.uiautomation import AndroidUiautomationPoco
16 |
17 | poco = AndroidUiautomationPoco(use_airtest_input=True, screenshot_each_action=False)
18 | window_width, window_height = poco.get_screen_size()
19 | PACKAGE_NAME = "com.goldze.mvvmhabit"
20 | TOTAL_NUMBER = 100
21 |
22 |
23 | def scrape_index():
24 | elements = poco(f'{PACKAGE_NAME}:id/item')
25 | elements.wait_for_appearance()
26 | return elements
27 |
28 |
29 | def scrape_detail(element):
30 | logger.debug(f'scraping {element}')
31 | element.click()
32 | panel = poco(f'{PACKAGE_NAME}:id/content')
33 | panel.wait_for_appearance()
34 | title = poco(f'{PACKAGE_NAME}:id/title').attr('text')
35 | categories = poco(f'{PACKAGE_NAME}:id/categories_value').attr('text')
36 | score = poco(f'{PACKAGE_NAME}:id/score_value').attr('text')
37 | published_at = poco(f'{PACKAGE_NAME}:id/published_at_value').attr('text')
38 | drama = poco(f'{PACKAGE_NAME}:id/drama_value').attr('text')
39 | keyevent('BACK')
40 | return {
41 | 'title': title,
42 | 'categories': categories,
43 | 'score': score,
44 | 'published_at': published_at,
45 | 'drama': drama,
46 | }
47 |
48 |
49 | def scroll_up():
50 | """
51 | Swipe up
52 | """
53 | swipe((window_width * 0.5, window_height * 0.8),
54 | vector=[0, -0.5], duration=1)
55 |
56 |
57 | OUTPUT_FOLDER = 'movie'
58 | os.path.exists(OUTPUT_FOLDER) or os.makedirs(OUTPUT_FOLDER)
59 |
60 |
61 | def save_data(element_data):
62 | """
63 | Save the data
64 | """
65 | with open(f'{OUTPUT_FOLDER}/{element_data.get("title")}.json', 'w', encoding='utf-8') as f:
66 | f.write(json.dumps(element_data, ensure_ascii=False, indent=2))
67 | logger.debug(f'saved as file {element_data.get("title")}.json')
68 |
69 |
70 | def main():
71 | scraped_titles = []
72 | while len(scraped_titles) < TOTAL_NUMBER:
73 | elements = scrape_index()
74 | for element in elements:
75 | element_title = element.offspring(f'{PACKAGE_NAME}:id/tv_title')
76 | if not element_title.exists():
77 | continue
78 | title = element_title.attr('text')
79 | logger.debug(f'get title {title}')
80 | if title in scraped_titles:
81 | continue
82 | _, element_y = element.get_position()
83 | if element_y > 0.7:
84 | scroll_up()
85 | element_data = scrape_detail(element)
86 | scraped_titles.append(title)
87 | logger.debug(f'scraped data {element_data}')
88 |
89 |
90 | if __name__ == '__main__':
91 | connect_device("Android://127.0.0.1:5037/192.168.1.26:5555")
92 | stop_app(PACKAGE_NAME)
93 | start_app(PACKAGE_NAME)
94 | main()
95 |
--------------------------------------------------------------------------------
/src/ch10/account_pool/setting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: setting.py
6 | @time: 2022/1/12 10:21
7 | @project: python3-web-spider-learning
8 | @desc: Configuration of Redis and the scheduled-task intervals
9 | """
10 |
11 | import platform
12 | from os.path import dirname, abspath, join
13 | from environs import Env
14 | from loguru import logger
15 | from utils import parse_redis_connection_string
16 |
17 | env = Env()
18 | env.read_env()
19 |
20 | # definition of flags
21 | IS_WINDOWS = platform.system().lower() == 'windows'
22 |
23 | # definition of dirs
24 | ROOT_DIR = dirname(dirname(abspath(__file__)))
25 | LOG_DIR = join(ROOT_DIR, env.str('LOG_DIR', 'logs'))
26 |
27 | # definition of environments
28 | DEV_MODE, TEST_MODE, PROD_MODE = 'dev', 'test', 'prod'
29 | APP_ENV = env.str('APP_ENV', DEV_MODE).lower()
30 | APP_DEBUG = env.bool('APP_DEBUG', True if APP_ENV == DEV_MODE else False)
31 | APP_DEV = IS_DEV = APP_ENV == DEV_MODE
32 | APP_PROD = IS_PROD = APP_ENV == PROD_MODE
33 | APP_TEST = IS_TEST = APP_ENV == TEST_MODE
34 |
35 | # redis host
36 | REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1')
37 | # redis port
38 | REDIS_PORT = env.int('REDIS_PORT', 6379)
39 | # redis password, if no password, set it to None
40 | REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)
41 | # redis db, if no choice, set it to 0
42 | REDIS_DB = env.int('REDIS_DB', 0)
43 | # redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0
44 | REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None)
45 |
46 | if REDIS_CONNECTION_STRING:
47 | REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB = parse_redis_connection_string(REDIS_CONNECTION_STRING)
48 |
49 | # redis hash table key name
50 | REDIS_ACCOUNT_KEY = env.str('REDIS_ACCOUNT_KEY', 'accounts:%s')
51 | REDIS_CREDENTIAL_KEY = env.str('REDIS_CREDENTIAL_KEY', 'credential:%s')
52 |
53 | # integrated generator
54 | GENERATOR_MAP = {
55 | 'antispider6': 'Antispider6Generator'
56 | }
57 |
58 | # integrated tester
59 | TESTER_MAP = {
60 | 'antispider6': 'Antispider6Tester'
61 | }
62 |
63 | # definition of tester cycle, it will test every CYCLE_TESTER seconds
64 | CYCLE_TESTER = env.int('CYCLE_TESTER', 600)
65 | # definition of generator cycle, it will generate credentials every CYCLE_GENERATOR seconds
66 | CYCLE_GENERATOR = env.int('CYCLE_GENERATOR', 600)
67 | GET_TIMEOUT = env.int('GET_TIMEOUT', 10)
68 |
69 | # definition of tester
70 | TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
71 | TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10)
72 | TEST_BATCH = env.int('TEST_BATCH', 20)
73 | # test url
74 | TEST_URL_MAP = {
75 | 'antispider6': 'https://antispider6.scrape.center/'
76 | }
77 |
78 | # definition of api
79 | API_HOST = env.str('API_HOST', '127.0.0.1')
80 | API_PORT = env.int('API_PORT', 6789)
81 | API_THREADED = env.bool('API_THREADED', True)
82 |
83 | # flags of enable
84 | ENABLE_IMPORT_DATA = env.bool('ENABLE_IMPORT_DATA', False)
85 | ENABLE_TESTER = env.bool('ENABLE_TESTER', True)
86 | ENABLE_GENERATOR = env.bool('ENABLE_GENERATOR', True)
87 | ENABLE_SERVER = env.bool('ENABLE_SERVER', True)
88 |
89 | logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week',
90 | retention='20 days')
91 | logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week')
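Since every value above is read through environs, the pool can be configured without code changes via environment variables or a .env file next to the project. A hypothetical .env example (the keys match the env.* lookups above; the values are placeholders only):

REDIS_HOST=127.0.0.1
REDIS_PORT=6379
APP_ENV=prod
CYCLE_TESTER=600
CYCLE_GENERATOR=600
API_PORT=6789
ENABLE_TESTER=true
ENABLE_GENERATOR=true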
--------------------------------------------------------------------------------
/src/ch02/urllib_demo/parse_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: parse_demo.py
6 | @time: 2021/12/29 16:49
7 | @project: python3-web-spider-learning
8 | @desc: parse module examples (P40~P44)
9 | """
10 | from urllib.parse import urlparse, urlunparse, urlsplit, urlunsplit, urljoin, urlencode, parse_qs, parse_qsl, quote, \
11 | unquote
12 |
13 |
14 | class UrlLibDemo:
15 | def __init__(self):
16 | self.base_url = None
17 | self.scheme = ''
18 | self.allow_fragments = True
19 | self.data = None
20 |
21 | def print_urlparse(self):
22 | # Parse a URL
23 | result = urlparse(self.base_url, scheme=self.scheme, allow_fragments=self.allow_fragments)
24 | print(type(result))
25 | print(result)
26 |
27 | def print_urlunparse(self):
28 | # Construct a URL
29 | print(urlunparse(self.data))
30 |
31 | def print_urlsplit(self):
32 | # Split the URL and return its 5 parts
33 | print(urlsplit(self.base_url))
34 |
35 | def print_urlunsplit(self):
36 | # Combine the parts back into a complete URL
37 | print(urlunsplit(self.data))
38 |
39 | def print_urljoin(self, other_url):
40 | # Use the scheme, netloc and path of base_url to fill in the missing parts of the new link
41 | print(urljoin(self.base_url, other_url))
42 |
43 | def print_urlencode(self, params):
44 | # Serialize the params dict into a GET query string
45 | print(self.base_url + urlencode(params))
46 |
47 | def print_parse_qs(self, query):
48 | # Convert a GET query string back to a dict
49 | print(parse_qs(query))
50 |
51 | def print_parse_qsl(self, query):
52 | # Convert a GET query string back to a list of tuples
53 | print(parse_qsl(query))
54 |
55 | def print_quote(self, keyword):
56 | # Convert the content to URL-encoded format
57 | print(self.base_url + quote(keyword))
58 |
59 | def print_unquote(self):
60 | # Decode the URL
61 | print(unquote(self.base_url))
62 |
63 |
64 | if __name__ == '__main__':
65 | urllib_demo = UrlLibDemo()
66 | # urllib_demo.base_url = 'https://www.baidu.com/index.html#comment'
67 | # urllib_demo.allow_fragments = False
68 | #
69 | # urllib_demo.print_urlparse()
70 |
71 | # urllib_demo.data = ['https', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
72 | # urllib_demo.print_urlunparse()
73 |
74 | # urllib_demo.base_url = 'https://www.baidu.com/index.html;user?id=5#comment'
75 | # urllib_demo.print_urlsplit()
76 |
77 | # urllib_demo.data = ['https', 'www.baidu.com', 'index.html', 'a=6', 'comment']
78 | # urllib_demo.print_urlunsplit()
79 |
80 | # urllib_demo.base_url = 'https://www.baidu.com'
81 | # urllib_demo.print_urljoin('FAQ.html')
82 |
83 | # urllib_demo.base_url = 'https://www.baidu.com?'
84 | # params = {
85 | # 'name': 'germey',
86 | # 'age': 25
87 | # }
88 | # urllib_demo.print_urlencode(params)
89 |
90 | # query = 'name=germey&age=25'
91 | # urllib_demo.print_parse_qs(query)
92 |
93 | # urllib_demo.print_parse_qsl(query)
94 |
95 | # keyword = '壁纸'
96 | # urllib_demo.base_url = 'https://www.baidu.com/s?wd='
97 | # urllib_demo.print_quote(keyword)
98 |
99 | urllib_demo.base_url = 'https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
100 | urllib_demo.print_unquote()
101 |
--------------------------------------------------------------------------------
/src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | import pymongo
9 | from elasticsearch import Elasticsearch
10 | from scrapy import Request
11 | from scrapy.exceptions import DropItem
12 | from scrapy.pipelines.images import ImagesPipeline
13 |
14 |
15 | class MongoDBPipeline:
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | cls.connect_string = crawler.settings.get('MONGODB_CONNECTION_STRING')
19 | cls.database = crawler.settings.get('MONGODB_DATABASE')
20 | cls.collection = crawler.settings.get('MONGODB_COLLECTION')
21 | return cls()
22 |
23 | def open_spider(self, spider):
24 | self.client = pymongo.MongoClient(self.connect_string)
25 | self.db = self.client[self.database]
26 |
27 | def process_item(self, item, spider):
28 | collection = self.db[self.collection]
29 | collection.update_one({
30 | 'name': item['name']
31 | }, {
32 | '$set': dict(item)
33 | }, True)
34 | return item
35 |
36 | def close_spider(self, spider):
37 | self.client.close()
38 |
39 |
40 | class ElasticsearchPipeline:
41 | @classmethod
42 | def from_crawler(cls, crawler):
43 | cls.connection_string = crawler.settings.get('ELASTICSEARCH_CONNECTION_STRING')
44 | cls.index = crawler.settings.get('ELASTICSEARCH_INDEX')
45 | return cls()
46 |
47 | def open_spider(self, spider):
48 | self.conn = Elasticsearch([self.connection_string])
49 | if not self.conn.indices.exists(index=self.index):
50 | self.conn.indices.create(index=self.index)
51 |
52 | def process_item(self, item, spider):
53 | self.conn.index(index=self.index, body=dict(item), id=hash(item['name']))
54 | return item
55 |
56 | def close_spider(self, spider):
57 | self.conn.transport.close()
58 |
59 |
60 | class ImagePipeline(ImagesPipeline):
61 | def file_path(self, request, response=None, info=None, *, item=None):
62 | movie = request.meta['movie']
63 | type = request.meta['type']
64 | name = request.meta['name']
65 | file_name = f'{movie}/{type}/{name}.jpg'
66 | return file_name
67 |
68 | def item_completed(self, results, item, info):
69 | image_paths = [x['path'] for ok, x in results if ok]
70 | if not image_paths:
71 | raise DropItem('Image Downloaded Failed')
72 | return item
73 |
74 | def get_media_requests(self, item, info):
75 | for director in item['directors']:
76 | director_name = director['name']
77 | director_image = director['image']
78 | yield Request(director_image, meta={
79 | 'name': director_name,
80 | 'type': 'director',
81 | 'movie': item['name']
82 | })
83 |
84 | for actor in item['actors']:
85 | actor_name = actor['name']
86 | actor_image = actor['image']
87 | yield Request(actor_image, meta={
88 | 'name': actor_name,
89 | 'type': 'actor',
90 | 'movie': item['name']
91 | })
92 |
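These pipelines take effect only when they are enabled and configured in the project's settings.py. An illustrative fragment is sketched below; the priorities, image folder and connection values are example choices rather than the project's actual configuration, while the setting names themselves are the ones read in from_crawler above (plus IMAGES_STORE, which ImagesPipeline subclasses require):

ITEM_PIPELINES = {
    'scrapyitempipelinedemo.pipelines.ImagePipeline': 300,
    'scrapyitempipelinedemo.pipelines.MongoDBPipeline': 500,
    'scrapyitempipelinedemo.pipelines.ElasticsearchPipeline': 600,
}
IMAGES_STORE = './images'
MONGODB_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGODB_DATABASE = 'movies'
MONGODB_COLLECTION = 'movies'
ELASTICSEARCH_CONNECTION_STRING = 'http://localhost:9200'
ELASTICSEARCH_INDEX = 'movies'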
--------------------------------------------------------------------------------
/src/ch02/regx_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @author: HuRuiFeng
5 | @file: regx_demo.py
6 | @time: 2022/1/4 9:33
7 | @project: python3-web-spider-learning
8 | @desc: 2.3 Regular expressions (P66~P73)
9 | """
10 | import re
11 |
12 | html = '''<div id="songs-list">
13 | <h2 class="title">经典老歌</h2>
14 | <p class="introduction">
15 | 经典老歌列表
16 | </p>
17 | <ul id="list" class="list-group">
18 | <li data-view="2">一路上有你</li>
19 | <li data-view="7">
20 | <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
21 | </li>
22 | <li data-view="4" class="active">
23 | <a href="/3.mp3" singer="齐秦">往事随风</a>
24 | </li>
25 | <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
26 | <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
27 | <li data-view="5">
28 | <a href="/6.mp3" singer="邓丽君">但愿人长久</a>
29 | </li>
30 | </ul>
31 | </div>'''
32 |
33 |
34 | def regex_match():
35 | content = 'Hello 123 4567 World_This is a Regex Demo'
36 | print(len(content))
37 | result = re.match('^Hello\s\d\d\d\s\d{4}\s\w{10}', content)
38 | print(result)
39 | print(result.group())
40 | print(result.span())
41 |
42 |
43 | def match_object():
44 | # Match a target with a capturing group
45 | content = 'Hello 1234567 World_This is a Regex Demo'
46 | result = re.match('^Hello\s(\d+)\sWorld', content)
47 | print(result)
48 | print(result.group())
49 | print(result.group(1))
50 | print(result.span())
51 |
52 |
53 | def common_match():
54 | # Generic matching
55 | content = 'Hello 123 4567 World_This is a Regex Demo'
56 | result = re.match('^Hello.*Demo$', content)
57 | print(result)
58 | print(result.group())
59 | print(result.span())
60 |
61 |
62 | def greedy_match():
63 | # Greedy vs. non-greedy matching
64 | content = 'Hello 123 4567 World_This is a Regex Demo'
65 | result = re.match('^He.*?(\d+).*Demo$', content)
66 | print(result)
67 | print(result.group())
68 | print(result.span())
69 |
70 |
71 | def match_with_modifier():
72 | # Using modifiers
73 | content = '''Hello 1234567 World_This
74 | is a Regex Demo'''
75 | result = re.match('^He.*?(\d+).*?Demo$', content, re.S)
76 | print(result.group(1))
77 |
78 |
79 | def transferred_match():
80 | # Escaping special characters
81 | content = '(百度)www.baidu.com'
82 | result = re.match('\(百度\)www\.baidu\.com', content)
83 | print(result)
84 |
85 |
86 | def search_match():
87 | regx = '<li.*?active.*?singer="(.*?)">(.*?)</a>'
88 | result = re.search(regx, html, re.S)
89 | if result:
90 | print(result.group(1), result.group(2))
91 |
92 |
93 | def findall_match():
94 | regx = '<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>'
95 | results = re.findall(regx, html, re.S)
96 | print(results)
97 | print(type(results))
98 | for result in results:
99 | print(result)
100 | print(result[0], result[1], result[2])
101 |
102 |
103 | def sub_match():
104 | # Substitution
105 | content = '54aK54yr5oiR54ix5L2g'
106 | content = re.sub('\d+', '', content)
107 | print(content)
108 |
109 |
110 | def sub_html_match():
111 | content = re.sub('<a.*?>|</a>', '', html)
112 | print(content)
113 | results = re.findall('<li.*?>(.*?)</li>', content, re.S)
114 | for result in results:
115 | print(result.strip())
116 |
117 |
118 | if __name__ == '__main__':
119 | sub_html_match()
120 |
--------------------------------------------------------------------------------
/src/ch15/scrapyspiderdemo/scrapyspiderdemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapyspiderdemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapyspiderdemo'
11 |
12 | SPIDER_MODULES = ['scrapyspiderdemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapyspiderdemo.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'scrapyspiderdemo (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = True
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | #CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | #DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | #CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | #COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | #TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | #DEFAULT_REQUEST_HEADERS = {
41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 | # 'Accept-Language': 'en',
43 | #}
44 |
45 | # Enable or disable spider middlewares
46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47 | #SPIDER_MIDDLEWARES = {
48 | # 'scrapyspiderdemo.middlewares.ScrapyspiderdemoSpiderMiddleware': 543,
49 | #}
50 |
51 | # Enable or disable downloader middlewares
52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53 | #DOWNLOADER_MIDDLEWARES = {
54 | # 'scrapyspiderdemo.middlewares.ScrapyspiderdemoDownloaderMiddleware': 543,
55 | #}
56 |
57 | # Enable or disable extensions
58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
59 | #EXTENSIONS = {
60 | # 'scrapy.extensions.telnet.TelnetConsole': None,
61 | #}
62 |
63 | # Configure item pipelines
64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65 | #ITEM_PIPELINES = {
66 | # 'scrapyspiderdemo.pipelines.ScrapyspiderdemoPipeline': 300,
67 | #}
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71 | #AUTOTHROTTLE_ENABLED = True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY = 5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY = 60
76 | # The average number of requests Scrapy should be sending in parallel to
77 | # each remote server
78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG = False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED = True
85 | #HTTPCACHE_EXPIRATION_SECS = 0
86 | #HTTPCACHE_DIR = 'httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES = []
88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
--------------------------------------------------------------------------------
/src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapyseleniumdemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapyseleniumdemo'
11 |
12 | SPIDER_MODULES = ['scrapyseleniumdemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapyseleniumdemo.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'scrapyseleniumdemo (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | #CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | #DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | #CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | #COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | #TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | #DEFAULT_REQUEST_HEADERS = {
41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 | # 'Accept-Language': 'en',
43 | #}
44 |
45 | # Enable or disable spider middlewares
46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47 | #SPIDER_MIDDLEWARES = {
48 | # 'scrapyseleniumdemo.middlewares.ScrapyseleniumdemoSpiderMiddleware': 543,
49 | #}
50 |
51 | # Enable or disable downloader middlewares
52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53 | DOWNLOADER_MIDDLEWARES = {
54 | 'gerapy_selenium.downloadermiddlewares.SeleniumMiddleware': 543,
55 | }
56 |
57 | # Enable or disable extensions
58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
59 | #EXTENSIONS = {
60 | # 'scrapy.extensions.telnet.TelnetConsole': None,
61 | #}
62 |
63 | # Configure item pipelines
64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65 | #ITEM_PIPELINES = {
66 | # 'scrapyseleniumdemo.pipelines.ScrapyseleniumdemoPipeline': 300,
67 | #}
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71 | #AUTOTHROTTLE_ENABLED = True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY = 5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY = 60
76 | # The average number of requests Scrapy should be sending in parallel to
77 | # each remote server
78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG = False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED = True
85 | #HTTPCACHE_EXPIRATION_SECS = 0
86 | #HTTPCACHE_DIR = 'httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES = []
88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
90 | CONCURRENT_REQUESTS = 6
--------------------------------------------------------------------------------
/src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapyuniversaldemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapyuniversaldemo'
11 |
12 | SPIDER_MODULES = ['scrapyuniversaldemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapyuniversaldemo.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'scrapyuniversaldemo (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = True
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | #CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | #DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | #CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | #COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | #TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | #DEFAULT_REQUEST_HEADERS = {
41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 | # 'Accept-Language': 'en',
43 | #}
44 |
45 | # Enable or disable spider middlewares
46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
47 | #SPIDER_MIDDLEWARES = {
48 | # 'scrapyuniversaldemo.middlewares.ScrapyuniversaldemoSpiderMiddleware': 543,
49 | #}
50 |
51 | # Enable or disable downloader middlewares
52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
53 | #DOWNLOADER_MIDDLEWARES = {
54 | # 'scrapyuniversaldemo.middlewares.ScrapyuniversaldemoDownloaderMiddleware': 543,
55 | #}
56 |
57 | # Enable or disable extensions
58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
59 | #EXTENSIONS = {
60 | # 'scrapy.extensions.telnet.TelnetConsole': None,
61 | #}
62 |
63 | # Configure item pipelines
64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
65 | #ITEM_PIPELINES = {
66 | # 'scrapyuniversaldemo.pipelines.ScrapyuniversaldemoPipeline': 300,
67 | #}
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
71 | #AUTOTHROTTLE_ENABLED = True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY = 5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY = 60
76 | # The average number of requests Scrapy should be sending in parallel to
77 | # each remote server
78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG = False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED = True
85 | #HTTPCACHE_EXPIRATION_SECS = 0
86 | #HTTPCACHE_DIR = 'httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES = []
88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
--------------------------------------------------------------------------------
/src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scrapypyppeteerdemo project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'scrapypyppeteerdemo'
11 |
12 | SPIDER_MODULES = ['scrapypyppeteerdemo.spiders']
13 | NEWSPIDER_MODULE = 'scrapypyppeteerdemo.spiders'
14 |
15 | TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | #USER_AGENT = 'scrapypyppeteerdemo (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | CONCURRENT_REQUESTS = 3
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | #DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'scrapypyppeteerdemo.middlewares.ScrapypyppeteerdemoSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54 | DOWNLOADER_MIDDLEWARES = {
55 | 'scrapypyppeteerdemo.middlewares.PyppeteerMiddleware': 543,
56 | }
57 |
58 | # Enable or disable extensions
59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
66 | #ITEM_PIPELINES = {
67 | # 'scrapypyppeteerdemo.pipelines.ScrapypyppeteerdemoPipeline': 300,
68 | #}
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
--------------------------------------------------------------------------------