└── src ├── ch13 ├── AndServerTest │ ├── app │ │ ├── .gitignore │ │ ├── src │ │ │ ├── main │ │ │ │ ├── res │ │ │ │ │ ├── mipmap-hdpi │ │ │ │ │ │ ├── ic_launcher.webp │ │ │ │ │ │ └── ic_launcher_round.webp │ │ │ │ │ ├── mipmap-mdpi │ │ │ │ │ │ ├── ic_launcher.webp │ │ │ │ │ │ └── ic_launcher_round.webp │ │ │ │ │ ├── mipmap-xhdpi │ │ │ │ │ │ ├── ic_launcher.webp │ │ │ │ │ │ └── ic_launcher_round.webp │ │ │ │ │ ├── mipmap-xxhdpi │ │ │ │ │ │ ├── ic_launcher.webp │ │ │ │ │ │ └── ic_launcher_round.webp │ │ │ │ │ ├── mipmap-xxxhdpi │ │ │ │ │ │ ├── ic_launcher.webp │ │ │ │ │ │ └── ic_launcher_round.webp │ │ │ │ │ ├── mipmap-anydpi-v26 │ │ │ │ │ │ ├── ic_launcher.xml │ │ │ │ │ │ └── ic_launcher_round.xml │ │ │ │ │ ├── values │ │ │ │ │ │ ├── strings.xml │ │ │ │ │ │ ├── colors.xml │ │ │ │ │ │ └── themes.xml │ │ │ │ │ ├── values-night │ │ │ │ │ │ └── themes.xml │ │ │ │ │ ├── layout │ │ │ │ │ │ └── activity_main.xml │ │ │ │ │ └── drawable-v24 │ │ │ │ │ │ └── ic_launcher_foreground.xml │ │ │ │ ├── java │ │ │ │ │ └── com │ │ │ │ │ │ ├── goldze │ │ │ │ │ │ └── mvvmhabit │ │ │ │ │ │ │ └── utils │ │ │ │ │ │ │ └── NativeUtils.java │ │ │ │ │ │ └── germey │ │ │ │ │ │ └── andservertest │ │ │ │ │ │ ├── AppController.java │ │ │ │ │ │ └── MainActivity.java │ │ │ │ └── AndroidManifest.xml │ │ │ ├── test │ │ │ │ └── java │ │ │ │ │ └── com │ │ │ │ │ └── germey │ │ │ │ │ └── andservertest │ │ │ │ │ └── ExampleUnitTest.java │ │ │ └── androidTest │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── germey │ │ │ │ └── andservertest │ │ │ │ └── ExampleInstrumentedTest.java │ │ ├── proguard-rules.pro │ │ └── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── settings.gradle │ ├── build.gradle │ ├── gradle.properties │ └── gradlew.bat ├── files │ ├── frida_appbasic1.js │ ├── frida_rpc_app9.js │ └── frida_appbasic2.js ├── frida_appbasic1_demo.py ├── frida_appbasic2_demo.py ├── andserver_demo.py ├── jeb_demo.py ├── frida_rpc_demo.py └── ida_demo.py ├── ch15 ├── scrapytutorial │ ├── scrapytutorial │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── quotes.py │ │ ├── items.py │ │ ├── extensions.py │ │ └── pipelines.py │ ├── run.py │ ├── scrapy.cfg │ └── server.py ├── scrapyseleniumdemo │ ├── scrapyseleniumdemo │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── book.py │ │ ├── items.py │ │ ├── pipelines.py │ │ └── settings.py │ ├── run.py │ └── scrapy.cfg ├── scrapyspiderdemo │ ├── scrapyspiderdemo │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── httpbin.py │ │ ├── items.py │ │ ├── pipelines.py │ │ └── settings.py │ ├── run.py │ └── scrapy.cfg ├── scrapypyppeteerdemo │ ├── scrapypyppeteerdemo │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── book.py │ │ ├── items.py │ │ ├── pipelines.py │ │ └── settings.py │ ├── run.py │ └── scrapy.cfg ├── scrapyuniversaldemo │ ├── scrapyuniversaldemo │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── movie.py │ │ │ └── universal.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── utils.py │ │ ├── loaders.py │ │ ├── configs │ │ │ └── movie.json │ │ └── settings.py │ ├── scrapy.cfg │ └── run.py ├── scrapyitempipelinedemo │ ├── scrapyitempipelinedemo │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── scrape.py │ │ ├── items.py │ │ └── pipelines.py │ ├── run.py │ └── scrapy.cfg ├── scrapyspidermiddlewaredemo │ ├── scrapyspidermiddlewaredemo │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── httpbin.py │ │ ├── items.py │ │ └── 
pipelines.py │ ├── run.py │ └── scrapy.cfg ├── scrapydownloadermiddlewaredemo │ ├── scrapydownloadermiddlewaredemo │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── httpbin.py │ │ ├── items.py │ │ └── pipelines.py │ ├── run.py │ └── scrapy.cfg ├── scrape_selector_demo.py └── scrape_processor_demo.py ├── ch11 ├── learn-ast │ ├── .babelrc │ ├── codes │ │ ├── code3.js │ │ ├── code2.js │ │ ├── code1.js │ │ ├── code5.js │ │ └── code4.js │ ├── package.json │ └── basic │ │ ├── basic1.js │ │ └── basic2.js ├── nodejs_demo │ ├── package.json │ ├── nodejs_client.py │ ├── nodejs_main.js │ └── nodejs_server.js ├── files │ └── Wasm.wasm ├── execjs_demo.py ├── pywasm_scrape_demo.py ├── wasmer_scrape_demo.py ├── execjs_web_demo.py └── js_scrape_practice.py ├── ch04 ├── files │ ├── data.csv │ └── movies.txt ├── rabbitmq_oper_demo │ ├── scrape_producer.py │ ├── scrape_consume.py │ ├── consumer.py │ └── producer.py ├── text_oper_demo.py ├── csv_oper_demo.py ├── mongodb_demo.py └── elasticsearch_oper_demo.py ├── ch02 ├── files │ ├── favicon.ico │ ├── mozilla_cookie.txt │ └── lwp_cookie.txt ├── urllib_demo │ ├── robotparser_demo.py │ ├── request_demo.py │ ├── request_hander_demo.py │ └── parse_demo.py ├── httpx_demo.py ├── requests_demo │ ├── advanced_use.py │ └── requests_demo.py └── regx_demo.py ├── ch08 ├── files │ └── slide_captcha.png ├── tesserocr_demo.py └── opencv_demo.py ├── ch07 ├── selenium_demo │ ├── files │ │ └── preview.png │ ├── back_forward.py │ ├── cookie_oper.py │ ├── tab_oper.py │ ├── headless_mode.py │ ├── exception_handle.py │ ├── node_interaction.py │ ├── action_chain.py │ ├── anti_shield.py │ ├── switch_frame.py │ ├── node_selector.py │ ├── node_info.py │ ├── simple_demo.py │ └── delay_wait.py ├── pyppeteer_demo │ ├── files │ │ ├── example2.png │ │ └── eval_example.png │ ├── dev_mode.py │ ├── incognito_mode.py │ ├── prevent_detect.py │ └── simple_demo.py ├── playwright_demo │ ├── files │ │ ├── np_picture.png │ │ ├── browser-iphone.png │ │ ├── screenshot-webkit.png │ │ ├── screenshot-chromium.png │ │ └── screenshot-firefox.png │ ├── mobile_web.py │ ├── simple_demo.py │ └── event_listen.py ├── css_locate_scrape.py └── font_scrape.py ├── ch01 └── test.html ├── ch03 ├── files │ └── test.html └── parsel_demo.py ├── ch10 ├── account_pool │ ├── exceptions.py │ ├── utils.py │ ├── server.py │ ├── run_account_pool.py │ ├── storages_redis.py │ ├── tester.py │ ├── generator.py │ └── setting.py ├── jwt_simulate_login.py ├── antispider_scrape_with_account_pool.py └── session_cookie_simulate_login.py ├── ch06 ├── coroutine_demo │ ├── coroutine_simple_demo.py │ ├── coroutine_task1.py │ ├── coroutine_task2.py │ ├── multi_task_coroutine.py │ ├── bing_callback.py │ └── coroutine_await_aiohttp.py ├── aiohttp_demo │ ├── timeout_demo.py │ ├── post_request.py │ ├── url_params.py │ ├── simple_demo.py │ ├── response_demo.py │ └── concurrency_demo.py └── aiohttp_scrape_demo.py ├── ch14 └── ai_extract.md ├── ch12 ├── appium_demo.py └── airtest_script.air │ └── airtest_script.py └── ch05 └── scrape_ajax.py /src/ch13/AndServerTest/app/.gitignore: -------------------------------------------------------------------------------- 1 | /build -------------------------------------------------------------------------------- /src/ch15/scrapytutorial/scrapytutorial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch15/scrapyspiderdemo/scrapyspiderdemo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | "@babel/preset-env" 4 | ] 5 | } -------------------------------------------------------------------------------- /src/ch04/files/data.csv: -------------------------------------------------------------------------------- 1 | id,name,age 2 | 10001,Mike,20 3 | 10002,Bob,22 4 | 10003,Jordan,21 5 | -------------------------------------------------------------------------------- /src/ch11/nodejs_demo/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "express": "^4.17.2" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /src/ch11/files/Wasm.wasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch11/files/Wasm.wasm -------------------------------------------------------------------------------- /src/ch02/files/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch02/files/favicon.ico -------------------------------------------------------------------------------- /src/ch08/files/slide_captcha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch08/files/slide_captcha.png -------------------------------------------------------------------------------- /src/ch07/selenium_demo/files/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/selenium_demo/files/preview.png 
-------------------------------------------------------------------------------- /src/ch07/pyppeteer_demo/files/example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/pyppeteer_demo/files/example2.png -------------------------------------------------------------------------------- /src/ch07/playwright_demo/files/np_picture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/np_picture.png -------------------------------------------------------------------------------- /src/ch07/pyppeteer_demo/files/eval_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/pyppeteer_demo/files/eval_example.png -------------------------------------------------------------------------------- /src/ch07/playwright_demo/files/browser-iphone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/browser-iphone.png -------------------------------------------------------------------------------- /src/ch07/playwright_demo/files/screenshot-webkit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-webkit.png -------------------------------------------------------------------------------- /src/ch07/playwright_demo/files/screenshot-chromium.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-chromium.png -------------------------------------------------------------------------------- /src/ch07/playwright_demo/files/screenshot-firefox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch07/playwright_demo/files/screenshot-firefox.png -------------------------------------------------------------------------------- /src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher.webp 
-------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Relph1119/python3-web-spider-learning/HEAD/src/ch13/AndServerTest/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp -------------------------------------------------------------------------------- /src/ch15/scrapytutorial/scrapytutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /src/ch15/scrapyspiderdemo/scrapyspiderdemo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/codes/code3.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: code3.js 4 | * @time: 11:18 5 | * @project: python3-web-spider-learning 6 | * @desc: 7 | */ 8 | 9 | const strings = ["\"\x68\x65\x6c\x6c\x6f\"", "\"\x77\x6f\x72\x6c\x64\""]; -------------------------------------------------------------------------------- /src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/codes/code2.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: code2.js 4 | * @time: 10:59 5 | * @project: python3-web-spider-learning 6 | * @desc: 7 | */ 8 | 9 | const a = ![]; 10 | const b = "abc" == "bcd" 11 | const c = (1 << 3) | 2; 12 | const d = parseInt("5" + "0") -------------------------------------------------------------------------------- /src/ch13/AndServerTest/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Mon Jan 17 20:43:10 CST 2022 2 | distributionBase=GRADLE_USER_HOME 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0.2-bin.zip 4 | distributionPath=wrapper/dists 5 | zipStorePath=wrapper/dists 6 | zipStoreBase=GRADLE_USER_HOME 7 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/java/com/goldze/mvvmhabit/utils/NativeUtils.java: -------------------------------------------------------------------------------- 1 | package com.goldze.mvvmhabit.utils; 2 | 3 | public class NativeUtils { 4 | 5 | static { 6 | System.loadLibrary("native"); 7 | } 8 | 9 | public static native String encrypt(String str, int offset); 10 | } 11 | -------------------------------------------------------------------------------- /src/ch15/scrapytutorial/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/19 13:48 7 | @project: python3-web-spider-learning 8 | @desc: 15.2 Scrapy入门(P743) 9 | """ 10 | from scrapy.cmdline import execute 11 | 12 | execute(['scrapy', 'crawl', 'quotes']) 13 | -------------------------------------------------------------------------------- /src/ch15/scrapyseleniumdemo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/19 14:55 7 | @project: python3-web-spider-learning 8 | @desc: 15.9 Scrapy对接Selenium 9 | """ 10 | from scrapy.cmdline import execute 11 | 12 | execute(['scrapy', 'crawl', 'book']) 13 | -------------------------------------------------------------------------------- /src/ch15/scrapypyppeteerdemo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/20 9:19 7 | @project: python3-web-spider-learning 8 | @desc: 15.11 Scrapy对接Pyppeteer(P807) 9 | """ 10 | from scrapy.cmdline import execute 11 | 12 | execute(['scrapy', 'crawl', 'book']) 13 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/codes/code1.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: code1.js 4 | * @time: 2022-01-14 09:45:29 5 | * @project: python3-web-spider-learning 6 | * @desc: 7 | */ 8 | 9 | const a = 3; 10 | let string = "hello"; 11 | for (let i = 0; i < a; i++) { 12 | string += "world"; 13 | } 14 | console.log("string", string) -------------------------------------------------------------------------------- /src/ch15/scrapyitempipelinedemo/run.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/19 14:55 7 | @project: python3-web-spider-learning 8 | @desc: 15.7 Item Pipeline的使用(P781) 9 | """ 10 | from scrapy.cmdline import execute 11 | 12 | execute(['scrapy', 'crawl', 'scrape']) 13 | -------------------------------------------------------------------------------- /src/ch15/scrapyspiderdemo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/19 14:55 7 | @project: python3-web-spider-learning 8 | @desc: 15.4 Spider的使用(P759) 9 | """ 10 | from scrapy.cmdline import execute 11 | 12 | execute(['scrapy', 'crawl', 'httpbin', '--nolog']) 13 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/ch15/scrapyspidermiddlewaredemo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/19 14:55 7 | @project: python3-web-spider-learning 8 | @desc: 15.6 Spider Middleware的使用(P775) 9 | """ 10 | from scrapy.cmdline import execute 11 | 12 | execute(['scrapy', 'crawl', 'httpbin']) 13 | -------------------------------------------------------------------------------- /src/ch15/scrapytutorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapytutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapytutorial 12 | -------------------------------------------------------------------------------- /src/ch15/scrapytutorial/scrapytutorial/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class QuoteItem(scrapy.Item): 10 | text = scrapy.Field() 11 | author = scrapy.Field() 12 | tags = scrapy.Field() 13 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/spiders/httpbin.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class HttpbinSpider(scrapy.Spider): 5 | name = 'httpbin' 6 | allowed_domains = ['www.httpbin.org'] 7 | start_urls = ['https://www.httpbin.org/get'] 8 | 9 | def parse(self, response): 10 | print(response.text) 11 | -------------------------------------------------------------------------------- 
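The scrapydownloadermiddlewaredemo project above ships only its spider and boilerplate in this dump; the middleware it demonstrates is not shown here. As a rough illustration of the kind of component that demo exercises, the following is a minimal, hypothetical Downloader Middleware that sets a User-Agent and logs the response status. The class name, header value, and priority number are assumptions for illustration, not code taken from this repository.

```python
# Hypothetical middlewares.py sketch for a project like scrapydownloadermiddlewaredemo.
# Illustrative only -- not the repository's actual middleware.


class CustomUserAgentMiddleware:
    """Attach a fixed User-Agent before each request is downloaded."""

    def process_request(self, request, spider):
        # Returning None lets the request continue through the middleware chain.
        request.headers['User-Agent'] = 'scrapydownloadermiddlewaredemo/1.0 (example)'
        return None

    def process_response(self, request, response, spider):
        # Responses can be inspected or replaced here; this one only logs the status.
        spider.logger.debug('Got status %s for %s', response.status, request.url)
        return response


# Enabling it would require an entry such as the following in the project settings:
# DOWNLOADER_MIDDLEWARES = {
#     'scrapydownloadermiddlewaredemo.middlewares.CustomUserAgentMiddleware': 543,
# }
```

Because the demo's httpbin spider requests https://www.httpbin.org/get, which echoes request headers back, the effect of such a middleware is directly visible in the printed response body.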
/src/ch15/scrapyspiderdemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapyspiderdemo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapyspiderdemo 12 | -------------------------------------------------------------------------------- /src/ch15/scrapyseleniumdemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapyseleniumdemo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapyseleniumdemo 12 | -------------------------------------------------------------------------------- /src/ch01/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | This is a Demo 6 | 7 | 8 |
9 | 10 | Hello World 11 | Hello, this is a paragraph. 12 | 13 |
14 | 15 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/settings.gradle: -------------------------------------------------------------------------------- 1 | dependencyResolutionManagement { 2 | repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) 3 | repositories { 4 | google() 5 | mavenCentral() 6 | jcenter() // Warning: this repository is going to shut down soon 7 | } 8 | } 9 | rootProject.name = "AndServerTest" 10 | include ':app' 11 | -------------------------------------------------------------------------------- /src/ch15/scrapydownloadermiddlewaredemo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/19 14:55 7 | @project: python3-web-spider-learning 8 | @desc:15.5 Downloader Middleware的使用(P770) 9 | """ 10 | from scrapy.cmdline import execute 11 | 12 | execute(['scrapy', 'crawl', 'httpbin', '--nolog']) 13 | -------------------------------------------------------------------------------- /src/ch15/scrapypyppeteerdemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapypyppeteerdemo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapypyppeteerdemo 12 | -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapyuniversaldemo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapyuniversaldemo 12 | -------------------------------------------------------------------------------- /src/ch15/scrapyitempipelinedemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapyitempipelinedemo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapyitempipelinedemo 12 | -------------------------------------------------------------------------------- /src/ch15/scrapyspiderdemo/scrapyspiderdemo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class ScrapyspiderdemoItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/values/strings.xml: -------------------------------------------------------------------------------- 1 | 2 | AndServerTest 3 | Start Server 4 | Stop Server 5 | The server is started 6 | The server is stopped 7 | 
-------------------------------------------------------------------------------- /src/ch15/scrapyspidermiddlewaredemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapyspidermiddlewaredemo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapyspidermiddlewaredemo 12 | -------------------------------------------------------------------------------- /src/ch15/scrapydownloadermiddlewaredemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapydownloadermiddlewaredemo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapydownloadermiddlewaredemo 12 | -------------------------------------------------------------------------------- /src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy import Item, Field 7 | 8 | 9 | class BookItem(Item): 10 | name = Field() 11 | tags = Field() 12 | score = Field() 13 | cover = Field() 14 | price = Field() 15 | -------------------------------------------------------------------------------- /src/ch03/files/test.html: -------------------------------------------------------------------------------- 1 |
2 | 9 |
-------------------------------------------------------------------------------- /src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class DemoItem(scrapy.Item): 10 | origin = scrapy.Field() 11 | headers = scrapy.Field() 12 | args = scrapy.Field() 13 | url = scrapy.Field() 14 | -------------------------------------------------------------------------------- /src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class ScrapydownloadermiddlewaredemoItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | from scrapy import Item, Field 8 | 9 | 10 | class BookItem(Item): 11 | name = Field() 12 | tags = Field() 13 | score = Field() 14 | cover = Field() 15 | price = Field() 16 | -------------------------------------------------------------------------------- /src/ch02/files/mozilla_cookie.txt: -------------------------------------------------------------------------------- 1 | # Netscape HTTP Cookie File 2 | # http://curl.haxx.se/rfc/cookie_spec.html 3 | # This is a generated file! Do not edit. 
4 | 5 | .baidu.com TRUE / FALSE 1672303248 BAIDUID 4DF8C4AA1B53D13A4C0A711C60505CAB:FG=1 6 | .baidu.com TRUE / FALSE 3788250895 BIDUPSID 4DF8C4AA1B53D13A3F8EC394C3CC9551 7 | .baidu.com TRUE / FALSE 3788250895 PSTM 1640767247 8 | www.baidu.com FALSE / FALSE 1640767548 BD_NOT_HTTPS 1 9 | -------------------------------------------------------------------------------- /src/ch10/account_pool/exceptions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: exceptions.py 6 | @time: 2022/1/12 10:36 7 | @project: python3-web-spider-learning 8 | @desc: 自定义异常 9 | """ 10 | 11 | 12 | class InitException(Exception): 13 | def __str__(self): 14 | """ 15 | init error 16 | :return: 17 | """ 18 | return repr('init failed') 19 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "learn-ast", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "devDependencies": { 12 | "@babel/cli": "^7.16.8", 13 | "@babel/core": "^7.16.7", 14 | "@babel/preset-env": "^7.16.8" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | from scrapy import Item, Field 8 | 9 | 10 | class MovieItem(Item): 11 | name = Field() 12 | cover = Field() 13 | categories = Field() 14 | published_at = Field() 15 | drama = Field() 16 | score = Field() 17 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/values/colors.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | #FFBB86FC 4 | #FF6200EE 5 | #FF3700B3 6 | #FF03DAC5 7 | #FF018786 8 | #FF000000 9 | #FFFFFFFF 10 | -------------------------------------------------------------------------------- /src/ch15/scrapyitempipelinedemo/scrapyitempipelinedemo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class MovieItem(scrapy.Item): 10 | name = scrapy.Field() 11 | categories = scrapy.Field() 12 | score = scrapy.Field() 13 | drama = scrapy.Field() 14 | directors = scrapy.Field() 15 | actors = scrapy.Field() 16 | -------------------------------------------------------------------------------- /src/ch15/scrapyspiderdemo/scrapyspiderdemo/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class ScrapyspiderdemoPipeline: 12 | def process_item(self, 
item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /src/ch15/scrapyseleniumdemo/scrapyseleniumdemo/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class ScrapyseleniumdemoPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /src/ch15/scrapypyppeteerdemo/scrapypyppeteerdemo/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class ScrapypyppeteerdemoPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class ScrapyuniversaldemoPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/codes/code5.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: code5.js 4 | * @time: 2022-01-14 11:40 5 | * @project: python3-web-spider-learning 6 | * @desc: 7 | */ 8 | 9 | const s = "3|1|2".split("|"); 10 | let x = 0; 11 | while (true) { 12 | switch (s[x++]) { 13 | case "1": 14 | const a = 1; 15 | continue; 16 | case "2": 17 | const b = 3; 18 | continue; 19 | case "3": 20 | const c = 0; 21 | continue; 22 | } 23 | break; 24 | } -------------------------------------------------------------------------------- /src/ch13/files/frida_appbasic1.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: frida_appbasic1.js 4 | * @time: 2022-01-17 17:21 5 | * @project: python3-web-spider-learning 6 | * @desc: frida Appbasic1 Hook script 7 | */ 8 | 9 | Java.perform(() => { 10 | let MainActivity = Java.use('com.germey.appbasic1.MainActivity') 11 | console.log('start hook') 12 | MainActivity.getMessage.implementation = (arg1, arg2) => { 13 | send('Start Hook!') 14 | return '6' 15 | } 16 | }) -------------------------------------------------------------------------------- /src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the 
ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class ScrapyspidermiddlewaredemoPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /src/ch15/scrapydownloadermiddlewaredemo/scrapydownloadermiddlewaredemo/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class ScrapydownloadermiddlewaredemoPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: utils.py 6 | @time: 2022/1/20 15:52 7 | @project: python3-web-spider-learning 8 | @desc: 9 | """ 10 | import json 11 | from os.path import join, dirname, realpath 12 | 13 | 14 | def get_config(name): 15 | path = join(dirname(realpath(__file__)), 'configs', f'{name}.json') 16 | with open(path, 'r', encoding='utf-8') as f: 17 | return json.loads(f.read()) 18 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/test/java/com/germey/andservertest/ExampleUnitTest.java: -------------------------------------------------------------------------------- 1 | package com.germey.andservertest; 2 | 3 | import org.junit.Test; 4 | 5 | import static org.junit.Assert.*; 6 | 7 | /** 8 | * Example local unit test, which will execute on the development machine (host). 
9 | * 10 | * @see Testing documentation 11 | */ 12 | public class ExampleUnitTest { 13 | @Test 14 | public void addition_isCorrect() { 15 | assertEquals(4, 2 + 2); 16 | } 17 | } -------------------------------------------------------------------------------- /src/ch13/files/frida_rpc_app9.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: frida_rpc_app9.js 4 | * @time: 20:06 5 | * @project: python3-web-spider-learning 6 | * @desc: frida RPC App9 Hook script 7 | */ 8 | 9 | rpc.exports = { 10 | encrypt(string, offset) { 11 | let token = null; 12 | Java.perform(function () { 13 | var util = Java.use("com.goldze.mvvmhabit.utils.NativeUtils").$new(); 14 | token = util.encrypt(string, offset) 15 | }); 16 | return token; 17 | } 18 | } -------------------------------------------------------------------------------- /src/ch07/selenium_demo/back_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: back_forward.py 6 | @time: 2022/1/7 15:21 7 | @project: python3-web-spider-learning 8 | @desc: 前进和后退(P221) 9 | """ 10 | import time 11 | 12 | from selenium import webdriver 13 | 14 | browser = webdriver.Chrome() 15 | browser.get('https://www.baidu.com/') 16 | browser.get('https://www.taobao.com/') 17 | browser.get('https://www.python.org') 18 | browser.back() 19 | time.sleep(1) 20 | browser.forward() 21 | browser.close() 22 | -------------------------------------------------------------------------------- /src/ch11/nodejs_demo/nodejs_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: nodejs_client.py 6 | @time: 2022/1/13 22:13 7 | @project: python3-web-spider-learning 8 | @desc: Python调用Node.js服务(P453) 9 | """ 10 | 11 | import requests 12 | 13 | data = { 14 | "name": "凯文-杜兰特", 15 | "image": "durant.png", 16 | "birthday": "1988-09-29", 17 | "height": "208cm", 18 | "weight": "108.9KG" 19 | } 20 | 21 | url = 'http://localhost:3000' 22 | response = requests.post(url, json=data) 23 | print(response.text) 24 | -------------------------------------------------------------------------------- /src/ch06/coroutine_demo/coroutine_simple_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: coroutine_simple_demo.py 6 | @time: 2022/1/6 17:23 7 | @project: python3-web-spider-learning 8 | @desc: 定义协程(P194) 9 | """ 10 | import asyncio 11 | 12 | 13 | async def execute(x): 14 | print('Number:', x) 15 | 16 | coroutine = execute(1) 17 | print('Coroutine:', coroutine) 18 | print('After calling execute') 19 | 20 | loop = asyncio.get_event_loop() 21 | # 将协程对象注册到事件循环上 22 | loop.run_until_complete(coroutine) 23 | print('After calling loop') 24 | -------------------------------------------------------------------------------- /src/ch07/pyppeteer_demo/dev_mode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: dev_mode.py 6 | @time: 2022/1/10 9:27 7 | @project: python3-web-spider-learning 8 | @desc: 调试模式(P247) 9 | """ 10 | import asyncio 11 | 12 | from pyppeteer import launch 13 | 14 | 15 | async def main(): 16 | browser = await launch(devtools=True, 
args=['--disable-infobars']) 17 | page = await browser.newPage() 18 | await page.goto('https://www.baidu.com') 19 | await asyncio.sleep(100) 20 | 21 | asyncio.get_event_loop().run_until_complete(main()) 22 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/basic/basic1.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: basic1.js 4 | * @time: 2022-01-14 09:47:06 5 | * @project: python3-web-spider-learning 6 | * @desc: 7 | */ 8 | 9 | import {parse} from "@babel/parser" 10 | import generate from "@babel/generator" 11 | import fs from "fs" 12 | 13 | const code = fs.readFileSync("../codes/code1.js", "utf-8") 14 | let ast = parse(code) 15 | console.log(ast) 16 | console.log(ast.program.body) 17 | 18 | const {code: output} = generate(ast, { 19 | ratainLines: true, 20 | comments: false, 21 | }); 22 | console.log(output) -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/scrapyuniversaldemo/loaders.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: loaders.py 6 | @time: 2022/1/20 16:36 7 | @project: python3-web-spider-learning 8 | @desc: 9 | """ 10 | 11 | from scrapy.loader import ItemLoader 12 | from itemloaders.processors import TakeFirst, Identity, Compose 13 | 14 | 15 | class MovieItemLoader(ItemLoader): 16 | default_output_processor = TakeFirst() 17 | categories_out = Identity() 18 | score_out = Compose(TakeFirst(), str.strip) 19 | drama_out = Compose(TakeFirst(), str.strip) -------------------------------------------------------------------------------- /src/ch02/files/lwp_cookie.txt: -------------------------------------------------------------------------------- 1 | #LWP-Cookies-2.0 2 | Set-Cookie3: BAIDUID="658C2C37B45D9239BAC08ECC578950E0:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2022-12-29 08:42:08Z"; comment=bd; version=0 3 | Set-Cookie3: BIDUPSID=658C2C37B45D92392188E29355D808F6; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-01-16 11:56:15Z"; version=0 4 | Set-Cookie3: PSTM=1640767327; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-01-16 11:56:15Z"; version=0 5 | Set-Cookie3: BD_NOT_HTTPS=1; path="/"; domain="www.baidu.com"; path_spec; expires="2021-12-29 08:47:08Z"; version=0 6 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/build.gradle: -------------------------------------------------------------------------------- 1 | // Top-level build file where you can add configuration options common to all sub-projects/modules. 
2 | buildscript { 3 | repositories { 4 | google() 5 | mavenCentral() 6 | } 7 | dependencies { 8 | classpath 'com.android.tools.build:gradle:4.1.3' 9 | classpath 'com.yanzhenjie.andserver:plugin:2.1.9' 10 | // NOTE: Do not place your application dependencies here; they belong 11 | // in the individual module build.gradle files 12 | } 13 | } 14 | 15 | 16 | task clean(type: Delete) { 17 | delete rootProject.buildDir 18 | } -------------------------------------------------------------------------------- /src/ch15/scrape_selector_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: scrape_selector_demo.py 6 | @time: 2022/1/19 14:08 7 | @project: python3-web-spider-learning 8 | @desc: 15.3 Selector的使用(P754) 9 | """ 10 | from scrapy import Selector 11 | 12 | 13 | def selector_demo(): 14 | # 直接使用 15 | body = 'Hello World' 16 | selector = Selector(text=body) 17 | title = selector.xpath('//title/text()').extract_first() 18 | print(title) 19 | 20 | 21 | if __name__ == '__main__': 22 | selector_demo() 23 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/cookie_oper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: cookie_oper.py 6 | @time: 2022/1/7 15:28 7 | @project: python3-web-spider-learning 8 | @desc: Cookie操作(P222) 9 | """ 10 | from selenium import webdriver 11 | 12 | browser = webdriver.Chrome() 13 | browser.get('https://www.zhihu.com/explore') 14 | print(browser.get_cookies()) 15 | browser.add_cookie({'name': 'name', 16 | 'domain': 'www.zhihu.com', 17 | 'value': 'germey'}) 18 | print(browser.get_cookies()) 19 | browser.delete_all_cookies() 20 | print(browser.get_cookies()) 21 | -------------------------------------------------------------------------------- /src/ch06/coroutine_demo/coroutine_task1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: coroutine_task1.py 6 | @time: 2022/1/6 17:31 7 | @project: python3-web-spider-learning 8 | @desc: 协程task的使用(P194) 9 | """ 10 | import asyncio 11 | 12 | 13 | async def execute(x): 14 | print('Number:', x) 15 | return x 16 | 17 | coroutine = execute(1) 18 | print('Coroutine:', coroutine) 19 | print('After calling execute') 20 | 21 | loop = asyncio.get_event_loop() 22 | task = loop.create_task(coroutine) 23 | print('Task:', task) 24 | loop.run_until_complete(task) 25 | print('Task:', task) 26 | print('After calling loop') -------------------------------------------------------------------------------- /src/ch06/coroutine_demo/coroutine_task2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: coroutine_task2.py 6 | @time: 2022/1/6 18:50 7 | @project: python3-web-spider-learning 8 | @desc: 协程task的使用(P195) 9 | """ 10 | import asyncio 11 | 12 | 13 | async def execute(x): 14 | print('Number:', x) 15 | return x 16 | 17 | coroutine = execute(1) 18 | print('Coroutine:', coroutine) 19 | print('After calling execute') 20 | 21 | task = asyncio.ensure_future(coroutine) 22 | print('Task:', task) 23 | loop = asyncio.get_event_loop() 24 | loop.run_until_complete(task) 25 | print('Task:', task) 26 | print('After calling 
loop') -------------------------------------------------------------------------------- /src/ch07/selenium_demo/tab_oper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: tab_oper.py 6 | @time: 2022/1/7 15:32 7 | @project: python3-web-spider-learning 8 | @desc: 选项卡管理(P222) 9 | """ 10 | import time 11 | 12 | from selenium import webdriver 13 | 14 | browser = webdriver.Chrome() 15 | browser.get('https://www.baidu.com') 16 | browser.execute_script('window.open()') 17 | print(browser.window_handles) 18 | browser.switch_to.window(browser.window_handles[1]) 19 | browser.get('https://www.taobao.com') 20 | time.sleep(1) 21 | browser.switch_to.window(browser.window_handles[0]) 22 | browser.get('https://python.org') -------------------------------------------------------------------------------- /src/ch13/frida_appbasic1_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: frida_demo.py 6 | @time: 2022/1/17 17:20 7 | @project: python3-web-spider-learning 8 | @desc: 13.5 Frida的使用,AppBasic1(P645) 9 | """ 10 | import sys 11 | 12 | import frida 13 | 14 | CODE = open('files/frida_appbasic1.js', encoding='utf-8').read() 15 | PROCESS_NAME = 'AppBasic1' 16 | 17 | 18 | def on_message(message, data): 19 | print(message) 20 | 21 | 22 | process = frida.get_usb_device().attach(PROCESS_NAME) 23 | script = process.create_script(CODE) 24 | script.on('message', on_message) 25 | script.load() 26 | sys.stdin.read() 27 | -------------------------------------------------------------------------------- /src/ch13/frida_appbasic2_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: frida_appbasic2_demo.py 6 | @time: 2022/1/17 17:43 7 | @project: python3-web-spider-learning 8 | @desc: 13.5 Frida的使用,AppBasic2(P648) 9 | """ 10 | import sys 11 | 12 | import frida 13 | 14 | CODE = open('files/frida_appbasic2.js', encoding='utf-8').read() 15 | PROCESS_NAME = 'AppBasic2' 16 | 17 | 18 | def on_message(message, data): 19 | print(message) 20 | 21 | 22 | process = frida.get_usb_device().attach(PROCESS_NAME) 23 | script = process.create_script(CODE) 24 | script.on('message', on_message) 25 | script.load() 26 | sys.stdin.read() 27 | -------------------------------------------------------------------------------- /src/ch06/aiohttp_demo/timeout_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: timeout_demo.py 6 | @time: 2022/1/6 19:58 7 | @project: python3-web-spider-learning 8 | @desc: 超时设置(P205) 9 | """ 10 | import asyncio 11 | 12 | import aiohttp 13 | 14 | 15 | async def main(): 16 | timeout = aiohttp.ClientTimeout(total=1) 17 | async with aiohttp.ClientSession(timeout=timeout) as session: 18 | async with session.get('https://www.httpbin.org/get') as response: 19 | print('status:', response.status) 20 | 21 | 22 | if __name__ == '__main__': 23 | asyncio.get_event_loop().run_until_complete(main()) 24 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/headless_mode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 
encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: headless_mode.py 6 | @time: 2022/1/7 15:49 7 | @project: python3-web-spider-learning 8 | @desc: 无头模式(P225) 9 | """ 10 | from selenium import webdriver 11 | from selenium.webdriver import ChromeOptions 12 | import os 13 | 14 | option = ChromeOptions() 15 | option.add_argument('--headless') 16 | browser = webdriver.Chrome(options=option) 17 | browser.set_window_size(1366, 768) 18 | browser.get('https://www.baidu.com') 19 | 20 | if not os.path.exists('files'): 21 | os.makedirs('files') 22 | 23 | browser.get_screenshot_as_file('files/preview.png') 24 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/exception_handle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: exception_handle.py 6 | @time: 2022/1/7 15:35 7 | @project: python3-web-spider-learning 8 | @desc: 异常处理(P223) 9 | """ 10 | from selenium import webdriver 11 | from selenium.common.exceptions import TimeoutException, NoSuchElementException 12 | 13 | browser = webdriver.Chrome() 14 | try: 15 | browser.get('https://www.baidu.com') 16 | except TimeoutException: 17 | print('Time Out') 18 | 19 | try: 20 | browser.find_element_by_id('hello') 21 | except NoSuchElementException: 22 | print('No Element') 23 | finally: 24 | browser.close() 25 | -------------------------------------------------------------------------------- /src/ch06/aiohttp_demo/post_request.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: post_request.py 6 | @time: 2022/1/6 19:52 7 | @project: python3-web-spider-learning 8 | @desc: POST请求(P203) 9 | """ 10 | import asyncio 11 | 12 | import aiohttp 13 | 14 | 15 | async def main(): 16 | data = { 17 | 'name': 'germey', 18 | 'age': 25 19 | } 20 | async with aiohttp.ClientSession() as session: 21 | async with session.post('https://www.httpbin.org/post', data=data) as response: 22 | print(await response.text()) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.get_event_loop().run_until_complete(main()) 27 | -------------------------------------------------------------------------------- /src/ch06/aiohttp_demo/url_params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: url_params.py 6 | @time: 2022/1/6 19:49 7 | @project: python3-web-spider-learning 8 | @desc: URL参数设置(P203) 9 | """ 10 | import asyncio 11 | 12 | import aiohttp 13 | 14 | 15 | async def main(): 16 | params = { 17 | 'name': 'germey', 18 | 'age': 25 19 | } 20 | async with aiohttp.ClientSession() as session: 21 | async with session.get('https://www.httpbin.org/get', params=params) as response: 22 | print(await response.text()) 23 | 24 | 25 | if __name__ == '__main__': 26 | asyncio.get_event_loop().run_until_complete(main()) 27 | -------------------------------------------------------------------------------- /src/ch14/ai_extract.md: -------------------------------------------------------------------------------- 1 | # 智能解析实现思路 2 | 3 | ## 1 详情页智能解析实现思路 4 | 1. 提取标题:提取页面的h节点,将内容与title节点的文本进行比较,取出相似度最高的内容,即详情页的标题 5 | 2. 提取时间:通过设置meta规则和时间匹配规则,得到时间 6 | 3. 提取正文:将正文进行预处理(删除无用标签和其中的内容、删除标签对、删除噪声标签),通过计算文本密度和符号密度,根据得到的分数,取出分数最高的节点,即为正文内容所在的节点,将各节点进行拼接,得到正文 7 | 8 | ## 2 列表页智能解析实现思路 9 | 1. 
数据预处理:将内容进行预处理(和详情页的提取正文中的预处理一致) 10 | 2. 选取组节点:通过父节点选择器以及相关的限制条件(限制兄弟节点数量、限制成员节点的文本内容最小长度、限制成员节点的文本内容最大长度、限制兄弟节点的相似度),得到符合要求的组节点 11 | 3. 合并组节点:通过简单的聚类方法,将组节点进行合并分类 12 | 4. 挑选最佳组节点:通过成员节点数量、平均字数分布、文本密度计算分数,选出分数最高的组节点 13 | 5. 提取标题和链接:根据标题长度计算置信度,得到最优节点路径,并通过成员节点提取标题和链接 14 | 15 | ## 3 智能分辨列表页和详情页 16 |   采用SVM模型,通过页面的特征(文本密度、超链接节点的数量和比例、符号密度、列表簇的数量、meta信息、正文标题和title内容的相似度),处理数据和训练模型,得到最终的分类模型,用于分辨列表页和详情页。 -------------------------------------------------------------------------------- /src/ch07/selenium_demo/node_interaction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: node_interaction.py 6 | @time: 2022/1/7 10:20 7 | @project: python3-web-spider-learning 8 | @desc: 节点交互(P216) 9 | """ 10 | import time 11 | 12 | from selenium import webdriver 13 | 14 | browser = webdriver.Chrome() 15 | browser.get('https://www.taobao.com') 16 | # 得到搜索框 17 | input = browser.find_element_by_id('q') 18 | # 输入搜索词“iPhone” 19 | input.send_keys('iPhone') 20 | time.sleep(1) 21 | # 清空搜索框 22 | input.clear() 23 | # 输入搜索词“iPad” 24 | input.send_keys('iPad') 25 | # 得到搜索按钮 26 | button = browser.find_element_by_class_name('btn-search') 27 | # 点击搜索按钮 28 | button.click() 29 | -------------------------------------------------------------------------------- /src/ch11/execjs_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: execjs_demo.py 6 | @time: 2022/1/13 21:35 7 | @project: python3-web-spider-learning 8 | @desc: 11.5 使用Python模拟执行javascript(P446) 9 | """ 10 | 11 | import execjs 12 | import json 13 | 14 | item = { 15 | "name": "勒布朗-詹姆斯", 16 | "image": "james.png", 17 | "birthday": "1984-12-30", 18 | "height": "206cm", 19 | "weight": "113.4KG" 20 | } 21 | 22 | file = 'files/execjs_crypto.js' 23 | node = execjs.get() 24 | ctx = node.compile(open(file).read()) 25 | 26 | js = f"getToken({json.dumps(item, ensure_ascii=False)})" 27 | print(js) 28 | result = ctx.eval(js) 29 | print(result) 30 | -------------------------------------------------------------------------------- /src/ch06/coroutine_demo/multi_task_coroutine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: multi_task_coroutine.py 6 | @time: 2022/1/6 18:58 7 | @project: python3-web-spider-learning 8 | @desc: 多任务协程(P196) 9 | """ 10 | import asyncio 11 | 12 | import requests 13 | 14 | 15 | async def request(): 16 | url = 'https://www.baidu.com' 17 | status = requests.get(url) 18 | return status 19 | 20 | 21 | tasks = [asyncio.ensure_future(request()) for _ in range(5)] 22 | print('Task:', tasks) 23 | 24 | loop = asyncio.get_event_loop() 25 | loop.run_until_complete(asyncio.wait(tasks)) 26 | 27 | for task in tasks: 28 | print('Task Result:', task.result()) 29 | -------------------------------------------------------------------------------- /src/ch11/pywasm_scrape_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: pywasm_scrape_demo.py 6 | @time: 2022/1/14 15:00 7 | @project: python3-web-spider-learning 8 | @desc: 11.11 WebAssembly案例分析和爬取实战(P495) 9 | """ 10 | import time 11 | 12 | import pywasm 13 | import requests 14 | 15 | BASE_URL = 'https://spa14.scrape.center' 16 | 
TOTAL_PAGE = 10 17 | 18 | runtime = pywasm.load('files/Wasm.wasm') 19 | for i in range(TOTAL_PAGE): 20 | offset = i * 10 21 | sign = runtime.exec('encrypt', [offset, int(time.time())]) 22 | url = f'{BASE_URL}/api/movie/?limit=10&offset={offset}&sign={sign}' 23 | response = requests.get(url) 24 | print(response.json()) 25 | -------------------------------------------------------------------------------- /src/ch13/files/frida_appbasic2.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: frida_appbasic1.js 4 | * @time: 2022-01-17 17:39 5 | * @project: python3-web-spider-learning 6 | * @desc: frida Appbasic2 Hook script 7 | */ 8 | 9 | Java.perform(function () { 10 | Interceptor.attach(Module.findExportByName('libnative.so', 'Java_com_appbasic2_MainActivity_getMessage'), { 11 | onEnter: function (args) { 12 | send('hook onEnter') 13 | send('args[1]=' + args[2]) 14 | send('args[2]=' + args[3]) 15 | }, 16 | onLeave: function (val) { 17 | send('hook Leave') 18 | val.replace(Java.vm.getEnv().newStringUtf('5')) 19 | } 20 | }) 21 | }) -------------------------------------------------------------------------------- /src/ch15/scrapyspidermiddlewaredemo/scrapyspidermiddlewaredemo/spiders/httpbin.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy import Request 3 | 4 | from ch15.scrapyspidermiddlewaredemo.scrapyspidermiddlewaredemo.items import DemoItem 5 | 6 | 7 | class HttpbinSpider(scrapy.Spider): 8 | name = 'httpbin' 9 | allowed_domains = ['www.httpbin.org'] 10 | start_url = 'https://www.httpbin.org/get' 11 | 12 | def start_requests(self): 13 | for i in range(5): 14 | url = f'{self.start_url}?query={i}' 15 | yield Request(url, callback=self.parse) 16 | 17 | def parse(self, response): 18 | item = DemoItem(**response.json()) 19 | print('Status:', response.status) 20 | yield item 21 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/action_chain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: action_chain.py 6 | @time: 2022/1/7 10:25 7 | @project: python3-web-spider-learning 8 | @desc: 动作链(P217) 9 | """ 10 | from selenium import webdriver 11 | from selenium.webdriver import ActionChains 12 | 13 | browser = webdriver.Chrome() 14 | url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable' 15 | browser.get(url) 16 | browser.switch_to.frame('iframeResult') 17 | source = browser.find_element_by_css_selector('#draggable') 18 | target = browser.find_element_by_css_selector('#droppable') 19 | actions = ActionChains(browser) 20 | actions.drag_and_drop(source, target) 21 | actions.perform() 22 | -------------------------------------------------------------------------------- /src/ch07/playwright_demo/mobile_web.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: mobile_web.py 6 | @time: 2022/1/10 19:48 7 | @project: python3-web-spider-learning 8 | @desc: 支持移动端浏览器(P261) 9 | """ 10 | from playwright.sync_api import sync_playwright 11 | 12 | with sync_playwright() as p: 13 | iphone_12_pro_max = p.devices['iPhone 12 Pro Max'] 14 | browser = p.webkit.launch(headless=False) 15 | context = browser.new_context(**iphone_12_pro_max, locale='zh-CN') 16 | page = 
context.new_page() 17 | page.goto('https://www.whatismybrowser.com') 18 | page.wait_for_load_state(state='networkidle') 19 | page.screenshot(path='files/browser-iphone.png') 20 | browser.close() 21 | -------------------------------------------------------------------------------- /src/ch06/coroutine_demo/bing_callback.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: bing_callback.py 6 | @time: 2022/1/6 18:53 7 | @project: python3-web-spider-learning 8 | @desc: 绑定回调(P196) 9 | """ 10 | import asyncio 11 | 12 | import requests 13 | 14 | 15 | async def request(): 16 | url = 'https://www.baidu.com' 17 | status = requests.get(url) 18 | return status 19 | 20 | 21 | def callback(task): 22 | print('Status:', task.result()) 23 | 24 | 25 | coroutine = request() 26 | task = asyncio.ensure_future(coroutine) 27 | task.add_done_callback(callback) 28 | print('Task:', task) 29 | 30 | loop = asyncio.get_event_loop() 31 | loop.run_until_complete(task) 32 | print('Task:', task) 33 | -------------------------------------------------------------------------------- /src/ch10/account_pool/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: utils.py 6 | @time: 2022/1/12 10:23 7 | @project: python3-web-spider-learning 8 | @desc: 9 | """ 10 | import re 11 | 12 | 13 | def parse_redis_connection_string(connection_string): 14 | """ 15 | parse a redis connection string, for example: 16 | redis://[password]@host:port 17 | rediss://[password]@host:port 18 | :param connection_string: 19 | :return: 20 | """ 21 | result = re.match('rediss?:\/\/(.*?)@(.*?):(\d+)\/(\d+)', connection_string) 22 | return result.group(2), int(result.group(3)), (result.group(1) or None), (result.group(4) or 0) if result \ 23 | else ('localhost', 6379, None) -------------------------------------------------------------------------------- /src/ch15/scrapytutorial/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: server.py 6 | @time: 2022/1/19 18:38 7 | @project: python3-web-spider-learning 8 | @desc: 15.8 Extension的使用(P793) 9 | """ 10 | from flask import Flask, request, jsonify 11 | from loguru import logger 12 | 13 | app = Flask(__name__) 14 | 15 | 16 | @app.route('/notify', methods=['POST']) 17 | def receive(): 18 | post_data = request.get_json() 19 | event = post_data.get('event') 20 | data = post_data.get('data') 21 | logger.debug(f'received event {event}, data {data}') 22 | return jsonify(status='success') 23 | 24 | 25 | if __name__ == '__main__': 26 | app.run(debug=True, host='0.0.0.0', port=5000) 27 | -------------------------------------------------------------------------------- /src/ch11/learn-ast/codes/code4.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: code4.js 4 | * @time: 11:29 5 | * @project: python3-web-spider-learning 6 | * @desc: 7 | */ 8 | 9 | const _0x16c18d = function () { 10 | if (!![[]]) { 11 | console.log("hello world"); 12 | } else { 13 | console.log("this"); 14 | console.log("is"); 15 | console.log("dead"); 16 | console.log("code"); 17 | } 18 | }; 19 | const _0x1f7292 = function () { 20 | if ("xmv2nOdfy2N".charAt(4) !== String.fromCharCode(110)) { 21 | 
console.log("this"); 22 | console.log("is"); 23 | console.log("dead"); 24 | console.log("code"); 25 | } else { 26 | console.log("nice to meet you"); 27 | } 28 | }; 29 | 30 | _0x16c18d(); 31 | _0x1f7292(); -------------------------------------------------------------------------------- /src/ch02/urllib_demo/robotparser_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: robotparser_demo.py 6 | @time: 2021/12/31 13:39 7 | @project: python3-web-spider-learning 8 | @desc: Robots协议(P46) 9 | """ 10 | 11 | from urllib.robotparser import RobotFileParser 12 | 13 | 14 | def print_can_fetch(rp, spider, url): 15 | print(rp.can_fetch(spider, url)) 16 | 17 | 18 | if __name__ == '__main__': 19 | rp = RobotFileParser() 20 | rp.set_url('https://www.baidu.com/robots.txt') 21 | rp.read() 22 | print_can_fetch(rp, 'Baiduspider', 'https://www.baidu.com') 23 | print_can_fetch(rp, 'Baiduspider', 'https://www.baidu.com/homepage/') 24 | print_can_fetch(rp, 'Googlebot', 'https://www.baidu.com/homepage/') 25 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/anti_shield.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: anti_shield.py 6 | @time: 2022/1/7 15:40 7 | @project: python3-web-spider-learning 8 | @desc: 反屏蔽(P224) 9 | """ 10 | from selenium import webdriver 11 | from selenium.webdriver import ChromeOptions 12 | 13 | option = ChromeOptions() 14 | option.add_experimental_option('excludeSwitches', ['enable-automation']) 15 | option.add_experimental_option('useAutomationExtension', False) 16 | browser = webdriver.Chrome(options=option) 17 | browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { 18 | 'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})' 19 | }) 20 | browser.get('https://antispider1.scrape.center') 21 | 22 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/switch_frame.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: switch_frame.py 6 | @time: 2022/1/7 10:41 7 | @project: python3-web-spider-learning 8 | @desc: 切换Frame(P219) 9 | """ 10 | from selenium import webdriver 11 | from selenium.common.exceptions import NoSuchElementException 12 | 13 | browser = webdriver.Chrome() 14 | url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable' 15 | browser.get(url) 16 | browser.switch_to.frame('iframeResult') 17 | try: 18 | logo = browser.find_element_by_class_name('logo') 19 | except NoSuchElementException: 20 | print('No Logo') 21 | 22 | browser.switch_to.parent_frame() 23 | logo = browser.find_element_by_class_name('logo') 24 | print(logo) 25 | print(logo.text) 26 | -------------------------------------------------------------------------------- /src/ch06/aiohttp_demo/simple_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: simple_demo.py 6 | @time: 2022/1/6 19:21 7 | @project: python3-web-spider-learning 8 | @desc: aiohttp基本实例(P202) 9 | """ 10 | import asyncio 11 | 12 | import aiohttp 13 | 14 | 15 | async def fetch(session, url): 16 | async 
with session.get(url) as response: 17 | return await response.text(), response.status 18 | 19 | 20 | async def main(): 21 | async with aiohttp.ClientSession() as session: 22 | html, status = await fetch(session, 'https://cuiqingcai.com') 23 | print(f'html: {html[:100]}...') 24 | print(f'status: {status}') 25 | 26 | 27 | if __name__ == '__main__': 28 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /src/ch07/pyppeteer_demo/incognito_mode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: incognito_mode.py 6 | @time: 2022/1/10 18:26 7 | @project: python3-web-spider-learning 8 | @desc: 无痕模式(P252) 9 | """ 10 | import asyncio 11 | 12 | from pyppeteer import launch 13 | 14 | width, height = 1366, 768 15 | 16 | 17 | async def main(): 18 | # 设置浏览器窗口大小 19 | browser = await launch(headless=False, args=['--disable-infobars', f'--window-size={width}, {height}']) 20 | context = await browser.createIncogniteBrowserContext() 21 | page = await context.newPage() 22 | # 设置页面大小 23 | await page.setViewport({'width': width, 'height': height}) 24 | await page.goto('https://www.baidu.com/') 25 | await asyncio.sleep(100) 26 | 27 | 28 | asyncio.get_event_loop().run_until_complete(main()) 29 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/node_selector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: node_selector.py 6 | @time: 2022/1/7 10:04 7 | @project: python3-web-spider-learning 8 | @desc: 查找节点(P215-P216) 9 | """ 10 | from selenium import webdriver 11 | 12 | browser = webdriver.Chrome() 13 | browser.get('https://www.taobao.com') 14 | 15 | 16 | def get_signal_node(): 17 | input_first = browser.find_element_by_id('q') 18 | input_second = browser.find_element_by_css_selector('#q') 19 | input_third = browser.find_element_by_xpath('//*[@id="q"]') 20 | print(input_first, input_second, input_third) 21 | 22 | 23 | def get_nodes(): 24 | lis = browser.find_elements_by_css_selector('.service-bd li') 25 | print(lis) 26 | 27 | 28 | if __name__ == '__main__': 29 | get_nodes() 30 | browser.close() 31 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/proguard-rules.pro: -------------------------------------------------------------------------------- 1 | # Add project specific ProGuard rules here. 2 | # You can control the set of applied configuration files using the 3 | # proguardFiles setting in build.gradle. 4 | # 5 | # For more details, see 6 | # http://developer.android.com/guide/developing/tools/proguard.html 7 | 8 | # If your project uses WebView with JS, uncomment the following 9 | # and specify the fully qualified class name to the JavaScript interface 10 | # class: 11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview { 12 | # public *; 13 | #} 14 | 15 | # Uncomment this to preserve the line number information for 16 | # debugging stack traces. 17 | #-keepattributes SourceFile,LineNumberTable 18 | 19 | # If you keep the line number information, uncomment this to 20 | # hide the original source file name. 
21 | #-renamesourcefileattribute SourceFile -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/java/com/germey/andservertest/AppController.java: -------------------------------------------------------------------------------- 1 | package com.germey.andservertest; 2 | 3 | import com.goldze.mvvmhabit.utils.NativeUtils; 4 | import com.yanzhenjie.andserver.annotation.GetMapping; 5 | import com.yanzhenjie.andserver.annotation.QueryParam; 6 | import com.yanzhenjie.andserver.annotation.RestController; 7 | 8 | import org.json.JSONObject; 9 | 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | @RestController 14 | public class AppController { 15 | 16 | @GetMapping("/encrypt") 17 | public JSONObject login(@QueryParam("string") String string, 18 | @QueryParam("offset") int offset) { 19 | Map map = new HashMap<>(); 20 | String sign = NativeUtils.encrypt(string, offset); 21 | map.put("sign", sign); 22 | return new JSONObject(map); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/ch15/scrapytutorial/scrapytutorial/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from ch15.scrapytutorial.scrapytutorial.items import QuoteItem 4 | 5 | 6 | class QuotesSpider(scrapy.Spider): 7 | name = 'quotes' 8 | allowed_domains = ['quotes.toscrape.com'] 9 | start_urls = ['http://quotes.toscrape.com/'] 10 | 11 | def parse(self, response): 12 | quotes = response.css('.quote') 13 | for quote in quotes: 14 | item = QuoteItem() 15 | item['text'] = quote.css('.text::text').extract_first() 16 | item['author'] = quote.css('.author::text').extract_first() 17 | item['tags'] = quote.css('.tags .tag::text').extract() 18 | yield item 19 | 20 | next = response.css('.pager .next a::attr("href")').extract_first() 21 | url = response.urljoin(next) 22 | yield scrapy.Request(url=url, callback=self.parse) 23 | -------------------------------------------------------------------------------- /src/ch07/pyppeteer_demo/prevent_detect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: prevent_detect.py 6 | @time: 2022/1/10 18:12 7 | @project: python3-web-spider-learning 8 | @desc: 防止检测(P248-P250) 9 | """ 10 | import asyncio 11 | 12 | from pyppeteer import launch 13 | 14 | width, height = 1366, 768 15 | 16 | 17 | async def main(): 18 | # 设置浏览器窗口大小 19 | browser = await launch(headless=False, args=['--disable-infobars', f'--window-size={width}, {height}']) 20 | page = await browser.newPage() 21 | # 设置页面大小 22 | await page.setViewport({'width': width, 'height': height}) 23 | await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {get: ()=> undefined})') 24 | await page.goto('https://antispider1.scrape.center/') 25 | await asyncio.sleep(100) 26 | 27 | 28 | asyncio.get_event_loop().run_until_complete(main()) 29 | -------------------------------------------------------------------------------- /src/ch11/wasmer_scrape_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: wasmer_scrape_demo.py 6 | @time: 2022/1/14 17:08 7 | @project: python3-web-spider-learning 8 | @desc: wasmer库实战 9 | """ 10 | import time 11 | 12 | import requests 13 | from wasmer import engine, Store, 
Module, Instance 14 | from wasmer_compiler_cranelift.wasmer_compiler_cranelift import Compiler 15 | 16 | # 读取wasm文件 17 | store = Store(engine.JIT(Compiler)) 18 | module = Module(store, open('files/Wasm.wasm', 'rb').read()) 19 | instance = Instance(module) 20 | 21 | BASE_URL = 'https://spa14.scrape.center' 22 | TOTAL_PAGE = 10 23 | 24 | for i in range(TOTAL_PAGE): 25 | offset = i * 10 26 | sign = instance.exports.encrypt(offset, int(time.time())) 27 | url = f'{BASE_URL}/api/movie/?limit=10&offset={offset}&sign={sign}' 28 | response = requests.get(url) 29 | print(response.json()) -------------------------------------------------------------------------------- /src/ch06/aiohttp_demo/response_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: response_demo.py 6 | @time: 2022/1/6 19:54 7 | @project: python3-web-spider-learning 8 | @desc: 响应(P205) 9 | """ 10 | import asyncio 11 | 12 | import aiohttp 13 | 14 | 15 | async def main(): 16 | data = { 17 | 'name': 'germey', 18 | 'age': 25 19 | } 20 | async with aiohttp.ClientSession() as session: 21 | async with session.post('https://www.httpbin.org/post', data=data) as response: 22 | print('status:', response.status) 23 | print('headers:', response.headers) 24 | print('body:', await response.text()) 25 | print('bytes:', await response.read()) 26 | print('json:', await response.json()) 27 | 28 | 29 | if __name__ == '__main__': 30 | asyncio.get_event_loop().run_until_complete(main()) 31 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/node_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: node_info.py 6 | @time: 2022/1/7 10:36 7 | @project: python3-web-spider-learning 8 | @desc: 获取节点信息(P218) 9 | """ 10 | from selenium import webdriver 11 | 12 | browser = webdriver.Chrome() 13 | url = 'https://spa2.scrape.center/' 14 | browser.get(url) 15 | 16 | 17 | def get_attr(): 18 | logo = browser.find_element_by_class_name('logo-image') 19 | print(logo) 20 | print(logo.get_attribute('src')) 21 | 22 | 23 | def get_text(): 24 | input = browser.find_element_by_class_name('logo-title') 25 | print(input.text) 26 | 27 | 28 | def get_other_info(): 29 | input = browser.find_element_by_class_name('logo-title') 30 | print(input.id) 31 | print(input.location) 32 | print(input.tag_name) 33 | print(input.size) 34 | 35 | 36 | if __name__ == '__main__': 37 | get_other_info() 38 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/values/themes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/res/values-night/themes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/androidTest/java/com/germey/andservertest/ExampleInstrumentedTest.java: -------------------------------------------------------------------------------- 1 | package com.germey.andservertest; 2 | 3 | import android.content.Context; 4 | 5 | import androidx.test.platform.app.InstrumentationRegistry; 6 | import 
androidx.test.ext.junit.runners.AndroidJUnit4; 7 | 8 | import org.junit.Test; 9 | import org.junit.runner.RunWith; 10 | 11 | import static org.junit.Assert.*; 12 | 13 | /** 14 | * Instrumented test, which will execute on an Android device. 15 | * 16 | * @see Testing documentation 17 | */ 18 | @RunWith(AndroidJUnit4.class) 19 | public class ExampleInstrumentedTest { 20 | @Test 21 | public void useAppContext() { 22 | // Context of the app under test. 23 | Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); 24 | assertEquals("com.germey.andservertest", appContext.getPackageName()); 25 | } 26 | } -------------------------------------------------------------------------------- /src/ch04/rabbitmq_oper_demo/scrape_producer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: scrape_producer.py 6 | @time: 2022/1/6 15:10 7 | @project: python3-web-spider-learning 8 | @desc: RabbitMQ实战 生产者(P171) 9 | """ 10 | import pickle 11 | 12 | import pika 13 | import requests 14 | 15 | MAX_PRORITY = 100 16 | TOTAL = 100 17 | QUEUE_NAME = 'scrape_queue' 18 | 19 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost')) 20 | channel = connection.channel() 21 | channel.queue_declare(queue=QUEUE_NAME, durable=True) 22 | 23 | for i in range(1, TOTAL + 1): 24 | url = f'http://ssr1.scrape.center/detail/{i}' 25 | request = requests.Request('GET', url) 26 | channel.basic_publish(exchange='', routing_key=QUEUE_NAME, 27 | properties=pika.BasicProperties(delivery_mode=2), 28 | body=pickle.dumps(request)) 29 | print(f'Put request of {url}') 30 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/simple_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: simple_demo.py 6 | @time: 2022/1/7 9:41 7 | @project: python3-web-spider-learning 8 | @desc: Selenium基本用法(P213) 9 | """ 10 | 11 | from selenium import webdriver 12 | from selenium.webdriver.common.by import By 13 | from selenium.webdriver.common.keys import Keys 14 | from selenium.webdriver.support.wait import WebDriverWait 15 | from selenium.webdriver.support import expected_conditions as EC 16 | 17 | browser = webdriver.Chrome() 18 | try: 19 | browser.get('https://www.baidu.com') 20 | input = browser.find_element_by_id('kw') 21 | input.send_keys('Python') 22 | input.send_keys(Keys.ENTER) 23 | wait = WebDriverWait(browser, 10) 24 | wait.until(EC.presence_of_element_located((By.ID, 'content_left'))) 25 | print(browser.current_url) 26 | print(browser.get_cookies()) 27 | print(browser.page_source) 28 | finally: 29 | browser.close() -------------------------------------------------------------------------------- /src/ch13/andserver_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: andserver_demo.py 6 | @time: 2022/1/17 23:07 7 | @project: python3-web-spider-learning 8 | @desc: 13.10 基于AndServer-RPC模拟执行so文件(Python爬取数据)(P691) 9 | """ 10 | import requests 11 | 12 | BASE_URL = 'https://app9.scrape.center' 13 | INDEX_URL = BASE_URL + '/api/movie?limit={limit}&offset={offset}&token={token}' 14 | ANDSERVER_URL = 'http://localhost:8080/encrypt?string={string}&offset={offset}' 15 | MAX_PAGE = 10 16 | 
LIMIT = 10 17 | 18 | 19 | def get_token(string, offset): 20 | andserver_url = ANDSERVER_URL.format(string=string, offset=offset) 21 | return requests.get(andserver_url).json().get('sign') 22 | 23 | 24 | for i in range(MAX_PAGE): 25 | offset = i * LIMIT 26 | token = get_token("/api/movie", offset) 27 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token) 28 | response = requests.get(index_url) 29 | print("response:", response.json()) 30 | -------------------------------------------------------------------------------- /src/ch13/jeb_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: jeb_demo.py 6 | @time: 2022/1/17 10:29 7 | @project: python3-web-spider-learning 8 | @desc: 13.2 JEB的使用(P624) 9 | """ 10 | import base64 11 | import hashlib 12 | import time 13 | 14 | import requests 15 | 16 | INDEX_URL = 'https://app5.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}' 17 | MAX_PAGE = 10 18 | LIMIT = 10 19 | 20 | 21 | def get_token(args): 22 | timestamp = str(int(time.time())) 23 | args.append(timestamp) 24 | sign = hashlib.sha1(','.join(args).encode('utf-8')).hexdigest() 25 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8') 26 | 27 | 28 | for i in range(MAX_PAGE): 29 | offset = i * LIMIT 30 | token = get_token(args=['/api/movie']) 31 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token) 32 | response = requests.get(index_url) 33 | print('response:', response.json()) 34 | -------------------------------------------------------------------------------- /src/ch06/aiohttp_demo/concurrency_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: concurrency_demo.py 6 | @time: 2022/1/6 20:01 7 | @project: python3-web-spider-learning 8 | @desc: 并发限制(P206) 9 | """ 10 | import asyncio 11 | 12 | import aiohttp 13 | 14 | CONCURRENCY = 5 15 | URL = 'https://www.baidu.com' 16 | 17 | semaphoer = asyncio.Semaphore(CONCURRENCY) 18 | session = None 19 | 20 | 21 | async def scrape_api(): 22 | async with semaphoer: 23 | print('scraping', URL) 24 | async with session.get(URL) as response: 25 | await asyncio.sleep(1) 26 | return await response.text() 27 | 28 | 29 | async def main(): 30 | global session 31 | session = aiohttp.ClientSession() 32 | scrape_index_tasks = [asyncio.ensure_future(scrape_api()) for _ in range(10000)] 33 | await asyncio.gather(*scrape_index_tasks) 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.get_event_loop().run_until_complete(main()) 38 | -------------------------------------------------------------------------------- /src/ch06/coroutine_demo/coroutine_await_aiohttp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: coroutine_await_aiohttp.py 6 | @time: 2022/1/6 19:05 7 | @project: python3-web-spider-learning 8 | @desc: 协程实现,await、aiohttp的使用(P197) 9 | """ 10 | import asyncio 11 | import time 12 | 13 | import aiohttp 14 | 15 | start = time.time() 16 | 17 | 18 | async def get(url): 19 | session = aiohttp.ClientSession() 20 | response = await session.get(url) 21 | await response.text() 22 | await session.close() 23 | return response 24 | 25 | 26 | async def request(): 27 | url = 'https://www.httpbin.org/delay/5' 28 | 
print('Waiting for', url) 29 | response = await get(url) 30 | print('Get response from', url, 'response', response) 31 | 32 | 33 | tasks = [asyncio.ensure_future(request()) for _ in range(10)] 34 | loop = asyncio.get_event_loop() 35 | loop.run_until_complete(asyncio.wait(tasks)) 36 | 37 | end = time.time() 38 | print('Cost time:', end - start) 39 | -------------------------------------------------------------------------------- /src/ch13/frida_rpc_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: frida_rpc_demo.py 6 | @time: 2022/1/17 20:11 7 | @project: python3-web-spider-learning 8 | @desc: 13.9 基于Frida-RPC 模拟执行so文件(P683) 9 | """ 10 | import frida 11 | import requests 12 | 13 | BASE_URL = 'https://app9.scrape.center' 14 | INDEX_URL = BASE_URL + '/api/movie?limit={limit}&offset={offset}&token={token}' 15 | MAX_PAGE = 10 16 | LIMIT = 10 17 | 18 | session = frida.get_usb_device().attach('App9') 19 | source = open('files/frida_rpc_app9.js', encoding='utf-8').read() 20 | script = session.create_script(source) 21 | script.load() 22 | 23 | 24 | def get_token(string, offset): 25 | return script.exports.encrypt(string, offset) 26 | 27 | 28 | for i in range(MAX_PAGE): 29 | offset = i * LIMIT 30 | token = get_token('/api/movie', offset) 31 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token) 32 | response = requests.get(index_url) 33 | print('response', response.json()) 34 | -------------------------------------------------------------------------------- /src/ch11/nodejs_demo/nodejs_main.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: nodejs_main.js 4 | * @time: 22:01 5 | * @project: python3-web-spider-learning 6 | * @desc: 11.6 使用Node.js模拟执行JavaScript(P451) 7 | */ 8 | 9 | const CryptoJS = require("./files/crypto.js") 10 | 11 | function getToken(player) { 12 | let key = CryptoJS.enc.Utf8.parse("fipFfVsZsTda94hJNKJfLoaqyqMZFFimwLt"); 13 | const {name, birthday, height, weight} = player; 14 | let base64Name = CryptoJS.enc.Base64.stringify(CryptoJS.enc.Utf8.parse(name)); 15 | let encrypted = CryptoJS.DES.encrypt( 16 | `${base64Name}${birthday}${height}${weight}`, 17 | key, { 18 | mode: CryptoJS.mode.ECB, 19 | padding: CryptoJS.pad.Pkcs7, 20 | } 21 | ); 22 | return encrypted.toString(); 23 | } 24 | 25 | const player = { 26 | "name": "凯文-杜兰特", 27 | "image": "durant.png", 28 | "birthday": "1988-09-29", 29 | "height": "208cm", 30 | "weight": "108.9KG" 31 | } 32 | 33 | console.log(getToken(player)) -------------------------------------------------------------------------------- /src/ch04/rabbitmq_oper_demo/scrape_consume.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: scrape_consume.py 6 | @time: 2022/1/6 15:10 7 | @project: python3-web-spider-learning 8 | @desc: RabbitMQ实战 消费者(P172) 9 | """ 10 | import pickle 11 | 12 | import pika 13 | import requests 14 | 15 | MAX_PRORITY = 100 16 | QUEUE_NAME = 'scrape_queue' 17 | 18 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost')) 19 | channel = connection.channel() 20 | session = requests.Session() 21 | 22 | 23 | def scrape(request): 24 | try: 25 | response = session.send(request.prepare()) 26 | print(f'success scraped {response.url}') 27 | except requests.RequestException: 28 | 
print(f'error occurred when scraping {request.url}') 29 | 30 | 31 | while True: 32 | method_frame, header, body = channel.basic_get(queue=QUEUE_NAME, auto_ack=True) 33 | if body: 34 | request = pickle.loads(body) 35 | print(f'Get {request}') 36 | scrape(request) 37 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/app/src/main/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 13 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/ch15/scrapyuniversaldemo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run.py 6 | @time: 2022/1/20 11:29 7 | @project: python3-web-spider-learning 8 | @desc: 15.12 Scrapy规则化爬虫(实战,P818) 9 | """ 10 | import argparse 11 | 12 | from scrapy.crawler import CrawlerProcess 13 | from scrapy.utils.project import get_project_settings 14 | 15 | from ch15.scrapyuniversaldemo.scrapyuniversaldemo.utils import get_config 16 | 17 | parser = argparse.ArgumentParser(description='Universal Spider') 18 | parser.add_argument('name', help='name of spider to run') 19 | args = parser.parse_args() 20 | name = args.name 21 | 22 | 23 | def run(): 24 | config = get_config(name) 25 | spider = config.get('spider', 'universal') 26 | project_settings = get_project_settings() 27 | settings = dict(project_settings.copy()) 28 | settings.update(config.get('settings')) 29 | process = CrawlerProcess(settings) 30 | process.crawl(spider, **{'name': name}) 31 | process.start() 32 | 33 | 34 | if __name__ == '__main__': 35 | run() 36 | -------------------------------------------------------------------------------- /src/ch10/jwt_simulate_login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: jwt_simulate_login.py 6 | @time: 2022/1/12 9:49 7 | @project: python3-web-spider-learning 8 | @desc: 10.3 基于JWT的模拟登录爬取实战(P381) 9 | """ 10 | from urllib.parse import urljoin 11 | import requests 12 | 13 | BASE_URL = 'https://login3.scrape.center/' 14 | LOGIN_URL = urljoin(BASE_URL, '/api/login') 15 | INDEX_URL = urljoin(BASE_URL, '/api/book') 16 | USERNAME = 'admin' 17 | PASSWORD = 'admin' 18 | 19 | response_login = requests.post(LOGIN_URL, json={ 20 | 'username': USERNAME, 21 | 'password': PASSWORD 22 | }) 23 | data = response_login.json() 24 | print('Response JSON:', data) 25 | # 获取token jwt 26 | jwt = data.get('token') 27 | print('JWT:', jwt) 28 | 29 | headers = { 30 | 'Authorization': f'jwt {jwt}' 31 | } 32 | response_index = requests.get(INDEX_URL, params={ 33 | 'limit': 18, 34 | 'offset': 0 35 | }, headers=headers) 36 | print('Response Status', response_index.status_code) 37 | print('Response URL', response_index.url) 38 | print('Response Data', response_index.json()) 39 | -------------------------------------------------------------------------------- /src/ch07/selenium_demo/delay_wait.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: delay_wait.py 6 | @time: 2022/1/7 15:05 7 | @project: python3-web-spider-learning 8 | @desc: 延时等待(P220) 9 | """ 10 | from selenium import webdriver 11 | from selenium.webdriver.common.by import By 12 
| from selenium.webdriver.support.wait import WebDriverWait 13 | from selenium.webdriver.support import expected_conditions as EC 14 | 15 | 16 | def implicit_wait(): 17 | browser = webdriver.Chrome() 18 | browser.implicitly_wait(10) 19 | browser.get('https://spa2.scrape.center/') 20 | input = browser.find_element_by_class_name('logo-image') 21 | print(input) 22 | 23 | 24 | def explicit_wait(): 25 | browser = webdriver.Chrome() 26 | browser.get('https://www.taobao.com/') 27 | wait = WebDriverWait(browser, 10) 28 | input = wait.until(EC.presence_of_element_located((By.ID, 'q'))) 29 | button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search'))) 30 | print(input, button) 31 | 32 | 33 | if __name__ == '__main__': 34 | explicit_wait() 35 | -------------------------------------------------------------------------------- /src/ch10/account_pool/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: server.py 6 | @time: 2022/1/12 14:00 7 | @project: python3-web-spider-learning 8 | @desc: 9 | """ 10 | from flask import Flask, g 11 | 12 | from ch10.account_pool.setting import GENERATOR_MAP 13 | from ch10.account_pool.storages_redis import RedisClient 14 | from loguru import logger 15 | 16 | app = Flask(__name__) 17 | 18 | account = 'account' 19 | credential = 'credential' 20 | 21 | 22 | @app.route('/') 23 | def index(): 24 | return '

<h2>Welcome to Cookie Pool System</h2>
' 25 | 26 | 27 | def get_conn(): 28 | for website in GENERATOR_MAP: 29 | if not hasattr(g, website): 30 | setattr(g, f'{website}_{credential}', RedisClient(credential, website)) 31 | setattr(g, f'{website}_{account}', RedisClient(account, website)) 32 | return g 33 | 34 | 35 | @app.route('//random') 36 | def random(website): 37 | g = get_conn() 38 | result = getattr(g, f'{website}_{credential}').random() 39 | logger.debug(f'get credential {result}') 40 | return result 41 | -------------------------------------------------------------------------------- /src/ch11/nodejs_demo/nodejs_server.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author: HuRuiFeng 3 | * @file: nodejs_server.js 4 | * @time: 22:09 5 | * @project: python3-web-spider-learning 6 | * @desc: 搭建nodejs服务(P453) 7 | */ 8 | 9 | const CryptoJS = require("./crypto.js") 10 | const express = require("express") 11 | const app = express(); 12 | const port = 3000; 13 | app.use(express.json()) 14 | 15 | 16 | function getToken(player) { 17 | let key = CryptoJS.enc.Utf8.parse("fipFfVsZsTda94hJNKJfLoaqyqMZFFimwLt"); 18 | const {name, birthday, height, weight} = player; 19 | let base64Name = CryptoJS.enc.Base64.stringify(CryptoJS.enc.Utf8.parse(name)); 20 | let encrypted = CryptoJS.DES.encrypt( 21 | `${base64Name}${birthday}${height}${weight}`, 22 | key, { 23 | mode: CryptoJS.mode.ECB, 24 | padding: CryptoJS.pad.Pkcs7, 25 | } 26 | ); 27 | return encrypted.toString(); 28 | } 29 | 30 | app.post("/", (req, res)=> { 31 | const data = req.body; 32 | res.send(getToken(data)) 33 | }); 34 | 35 | app.listen(port, ()=> { 36 | console.log(`Example app listening on port ${port}`); 37 | }) -------------------------------------------------------------------------------- /src/ch13/ida_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: ida_demo.py 6 | @time: 2022/1/17 19:19 7 | @project: python3-web-spider-learning 8 | @desc: 13.8 IDA Pro静态分析和动态调试so文件(汇编代码调试)(P679) 9 | """ 10 | import requests 11 | import hashlib 12 | import time 13 | import base64 14 | 15 | 16 | def get_token(value, offset): 17 | array = [] 18 | array.append(value) 19 | array.append('9fdLnciVh4FxQbri') 20 | array.append(str(offset)) 21 | timestamp = str(int(time.time())) 22 | array.append(timestamp) 23 | sign = hashlib.sha1(','.join(array).encode('utf-8')).hexdigest() 24 | return base64.b64encode(','.join([sign, timestamp]).encode('utf-8')).decode('utf-8') 25 | 26 | 27 | INDEX_URL = 'https://app8.scrape.center/api/movie?limit={limit}&offset={offset}&token={token}' 28 | MAX_PAGE = 10 29 | LIMIT = 10 30 | 31 | 32 | for i in range(MAX_PAGE): 33 | offset = i * LIMIT 34 | token = get_token('/api/movie', offset) 35 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token) 36 | response = requests.get(index_url) 37 | print('response', response.json()) 38 | -------------------------------------------------------------------------------- /src/ch13/AndServerTest/gradle.properties: -------------------------------------------------------------------------------- 1 | # Project-wide Gradle settings. 2 | # IDE (e.g. Android Studio) users: 3 | # Gradle settings configured through the IDE *will override* 4 | # any settings specified in this file. 
5 | # For more details on how to configure your build environment visit 6 | # http://www.gradle.org/docs/current/userguide/build_environment.html 7 | # Specifies the JVM arguments used for the daemon process. 8 | # The setting is particularly useful for tweaking memory settings. 9 | org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8 10 | # When configured, Gradle will run in incubating parallel mode. 11 | # This option should only be used with decoupled projects. More details, visit 12 | # http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects 13 | # org.gradle.parallel=true 14 | # AndroidX package structure to make it clearer which packages are bundled with the 15 | # Android operating system, and which are packaged with your app"s APK 16 | # https://developer.android.com/topic/libraries/support-library/androidx-rn 17 | android.useAndroidX=true 18 | # Automatically convert third-party libraries to use AndroidX 19 | android.enableJetifier=true -------------------------------------------------------------------------------- /src/ch15/scrapyspiderdemo/scrapyspiderdemo/spiders/httpbin.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy import Request 3 | 4 | 5 | class HttpbinSpider(scrapy.Spider): 6 | name = 'httpbin' 7 | allowed_domains = ['www.httpbin.org'] 8 | start_url = 'https://www.httpbin.org/get' 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36' 11 | } 12 | cookies = {'name': 'germey', 13 | 'age': '26'} 14 | 15 | def start_requests(self): 16 | for offset in range(5): 17 | url = self.start_url + f'?offset={offset}' 18 | yield Request(url, headers=self.headers, 19 | cookies=self.cookies, 20 | callback=self.parse_response, 21 | meta={'offset': offset}) 22 | 23 | def parse_response(self, response): 24 | print('url:', response.url) 25 | print('request:', response.request) 26 | print('status:', response.status) 27 | print('headers:', response.headers) 28 | print('text:', response.text) 29 | print('meta:', response.meta) 30 | -------------------------------------------------------------------------------- /src/ch10/account_pool/run_account_pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: run_account_pool.py 6 | @time: 2022/1/12 14:27 7 | @project: python3-web-spider-learning 8 | @desc: 运行账号池 9 | """ 10 | from ch10.account_pool.setting import ENABLE_IMPORT_DATA 11 | from ch10.account_pool.storages_redis import RedisClient 12 | from scheduler import Scheduler 13 | import argparse 14 | 15 | parser = argparse.ArgumentParser(description='AccountPool') 16 | parser.add_argument('website', type=str, help='website') 17 | parser.add_argument('--processor', type=str, help='processor to run') 18 | args = parser.parse_args() 19 | website = args.website 20 | 21 | if __name__ == '__main__': 22 | if ENABLE_IMPORT_DATA: 23 | conn = RedisClient('account', website) 24 | start = 1 25 | end = 20 26 | for i in range(start, end + 1): 27 | username = password = f'admin{i}' 28 | conn.set(username, password) 29 | conn.close() 30 | 31 | # if processor set, just run it 32 | if args.processor: 33 | getattr(Scheduler(), f'run_{args.processor}')(website) 34 | else: 35 | Scheduler().run(website) 36 | -------------------------------------------------------------------------------- 
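run_account_pool.py above imports a Scheduler from a scheduler module that is not part of this listing. The sketch below is only an illustration of how such a scheduler could wire the account pool's generator, tester and API server together; apart from the names actually used in run_account_pool.py and server.py, every import, class, method body and port number here is an assumption, not the book's implementation.

# Hypothetical sketch of a scheduler for the account pool -- not the original code.
# Generator and Tester interfaces are assumed; `app` is the Flask app from server.py.
import multiprocessing
import time

from ch10.account_pool.generator import Generator  # assumed interface
from ch10.account_pool.tester import Tester        # assumed interface
from ch10.account_pool.server import app


class Scheduler:
    def run_generator(self, website, cycle=600):
        # Periodically log in with stored accounts and refresh credentials.
        while True:
            Generator(website).run()
            time.sleep(cycle)

    def run_tester(self, website, cycle=600):
        # Periodically check that stored credentials are still valid.
        while True:
            Tester(website).run()
            time.sleep(cycle)

    def run_server(self, website):
        # Serve the random-credential API defined in server.py.
        app.run(host='0.0.0.0', port=6789)

    def run(self, website):
        # Run generator, tester and server as independent processes.
        for target in (self.run_generator, self.run_tester, self.run_server):
            multiprocessing.Process(target=target, args=(website,)).start()
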
/src/ch15/scrape_processor_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: scrape_processor_demo.py 6 | @time: 2022/1/20 10:52 7 | @project: python3-web-spider-learning 8 | @desc: 15.12 Scrapy规则化爬虫(P816) 9 | """ 10 | from itemloaders.processors import TakeFirst, Join, Compose, MapCompose, SelectJmes 11 | 12 | 13 | def takefirst(): 14 | # 返回列表的第一个非空值 15 | processor = TakeFirst() 16 | print(processor(['', 1, 2, 3])) 17 | 18 | 19 | def join(): 20 | # 把列表拼接成字符串 21 | processor = Join() 22 | print(processor(['one', 'two', 'three'])) 23 | 24 | processor = Join(',') 25 | print(processor(['one', 'two', 'three'])) 26 | 27 | 28 | def compose(): 29 | # 使用多个函数组合构造而成 30 | processor = Compose(str.upper, lambda s: s.strip()) 31 | print(processor(' hello world')) 32 | 33 | 34 | def map_compose(): 35 | # 和compose类似,迭代处理一个列表输入值 36 | processor = MapCompose(str.upper, lambda s: s.strip()) 37 | print(processor(['Hello', 'World', 'Python'])) 38 | 39 | 40 | def select_jmes(): 41 | # 查询JSON,传入Key,返回查询所得的Value 42 | processor = SelectJmes('foo') 43 | print(processor({'foo': 'bar'})) 44 | 45 | 46 | if __name__ == '__main__': 47 | select_jmes() 48 | -------------------------------------------------------------------------------- /src/ch12/appium_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: appium_demo.py 6 | @time: 2022/1/16 2:26 7 | @project: python3-web-spider-learning 8 | @desc: 12.4 Appium的使用(P557) 9 | """ 10 | from appium import webdriver 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | from selenium.webdriver.support import expected_conditions as EC 14 | 15 | server = 'http://localhost:4723/wd/hub' 16 | desired_capabilitis= { 17 | "platformName": "Android", 18 | "appium:deviceName": "VirtualBox", 19 | "appium:appPackage": "com.goldze.mvvmhabit", 20 | "appium:appActivity": "com.goldze.mvvmhabit.ui.MainActivity", 21 | "appium:noReset": True 22 | } 23 | 24 | # 启动示例App 25 | driver = webdriver.Remote(server, desired_capabilitis) 26 | wait = WebDriverWait(driver, 30) 27 | # 等到所有电影条目都加载之后 28 | wait.until(EC.presence_of_element_located((By.XPATH, '//android.support.v7.widget.RecyclerView/android.widget.LinearLayout'))) 29 | window_size = driver.get_window_size() 30 | width, height = window_size.get('width'), window_size.get('height') 31 | # 前两个表示初始位置,后两个表示滑动的结束位置,1000表示滑动时间为1秒 32 | driver.swipe(width * 0.5, height * 0.8, width * 0.5, height * 0.2, 1000) -------------------------------------------------------------------------------- /src/ch11/execjs_web_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: execjs_web_demo.py 6 | @time: 2022/1/14 9:22 7 | @project: python3-web-spider-learning 8 | @desc: 11.7 浏览器环境下JavaScript的模拟执行(P457) 9 | """ 10 | import requests 11 | from playwright.sync_api import sync_playwright 12 | 13 | BASE_URL = "https://spa2.scrape.center" 14 | INDEX_URL = BASE_URL + "/api/movie?limit={limit}&offset={offset}&token={token}" 15 | MAX_PAGE = 10 16 | LIMIT = 10 17 | 18 | # 创建一个无头Chromium浏览器 19 | context = sync_playwright().start() 20 | browser = context.chromium.launch() 21 | # 创建一个新页面 22 | page = browser.new_page() 23 | # 配置路由,将浏览器加载的js替换为本地js 24 
| page.route( 25 | "/js/chunk-10192a00.243cb8b7.js", 26 | lambda route: route.fulfill(path="files/chunk.js") 27 | ) 28 | page.goto(BASE_URL) 29 | 30 | 31 | def get_token(offset): 32 | # 使用evaluate方法模拟执行 33 | result = page.evaluate('''()=> { 34 | return window.encrypt("%s", "%s") 35 | }''' % ('/api/movie', offset)) 36 | return result 37 | 38 | 39 | for i in range(MAX_PAGE): 40 | offset = i * LIMIT 41 | token = get_token(offset) 42 | index_url = INDEX_URL.format(limit=LIMIT, offset=offset, token=token) 43 | response = requests.get(index_url) 44 | print('response:', response.json()) 45 | -------------------------------------------------------------------------------- /src/ch04/text_oper_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: text_oper_demo.py 6 | @time: 2022/1/5 19:00 7 | @project: python3-web-spider-learning 8 | @desc: 4.1 TXT文本存储(P128~P130) 9 | """ 10 | import os 11 | import re 12 | 13 | import requests 14 | from pyquery import PyQuery as pq 15 | 16 | url = 'https://ssr1.scrape.center' 17 | html = requests.get(url).text 18 | doc = pq(html) 19 | items = doc('.el-card').items() 20 | 21 | if not os.path.exists('files'): 22 | os.makedirs('files') 23 | 24 | file = open('files/movies.txt', 'w', encoding='utf-8') 25 | for item in items: 26 | # 电影名称 27 | name = item.find('a > h2').text() 28 | file.write(f'名称:{name}\n') 29 | # 类别 30 | categories = [item.text() for item in item.find('.categories button span').items()] 31 | file.write(f'类别:{categories}\n') 32 | # 上映时间 33 | published_at = item.find('.info:contains(上映)').text() 34 | published_at = re.search('(\d{4}-\d{2}-\d{2})', published_at).group(1) \ 35 | if published_at and re.search('(\d{4}-\d{2}-\d{2})', published_at) else None 36 | file.write(f'上映时间:{published_at}\n') 37 | # 评分 38 | score = item.find('p.score').text() 39 | file.write(f'评分:{score}\n') 40 | file.write(f'{"=" * 50}\n') 41 | 42 | file.close() 43 | -------------------------------------------------------------------------------- /src/ch04/csv_oper_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @author: HuRuiFeng 5 | @file: csv_oper_demo.py 6 | @time: 2022/1/5 19:20 7 | @project: python3-web-spider-learning 8 | @desc: 4.3 CSV文件存储(P134~P138) 9 | """ 10 | import csv 11 | 12 | 13 | def write_to_csv(): 14 | with open('files/data.csv', 'w', newline='') as csv_file: 15 | writer = csv.writer(csv_file) 16 | writer.writerow(['id', 'name', 'age']) 17 | writer.writerow(['10001', 'Mike', 20]) 18 | writer.writerow(['10002', 'Bob', 22]) 19 | writer.writerow(['10003', 'Jordan', 21]) 20 | 21 | 22 | def write_dict_to_csv(): 23 | with open('files/data.csv', 'w', encoding='utf-8', newline='') as csv_file: 24 | filednames = ['id', 'name', 'age'] 25 | writer = csv.DictWriter(csv_file, fieldnames=filednames) 26 | writer.writeheader() 27 | writer.writerow({'id': '10001', 'name': 'Mike', 'age': 20}) 28 | writer.writerow({'id': '10002', 'name': 'Bob', 'age': 22}) 29 | writer.writerow({'id': '10003', 'name': 'Jordan', 'age': 21}) 30 | 31 | 32 | def read_csv(): 33 | with open('files/data.csv', 'r', encoding='utf-8') as csv_file: 34 | reader = csv.reader(csv_file) 35 | for row in reader: 36 | print(row) 37 | 38 | 39 | if __name__ == '__main__': 40 | write_dict_to_csv() 41 | read_csv() 42 | -------------------------------------------------------------------------------- 
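execjs_web_demo.py above starts Playwright by hand (sync_playwright().start()) and never shuts it down, so the headless Chromium keeps running after the loop finishes. If the pattern is reused elsewhere, the objects created in that script can be released explicitly; the three calls below refer to the page, browser and context variables defined there.

# Clean shutdown for the objects created in execjs_web_demo.py.
page.close()      # close the tab used by page.evaluate()
browser.close()   # close the headless Chromium instance
context.stop()    # stop the driver object returned by sync_playwright().start()
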
--------------------------------------------------------------------------------
/src/ch15/scrapytutorial/scrapytutorial/extensions.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
"""
@author: HuRuiFeng
@file: extensions.py
@time: 2022/1/19 18:43
@project: python3-web-spider-learning
@desc: Scrapy extension that POSTs spider lifecycle events (spider opened/closed,
       item scraped) to a local notification endpoint
"""
import requests
from scrapy import signals

NOTIFICATION_URL = 'http://localhost:5000/notify'


class NotificationExtension:
    def spider_opened(self, spider):
        requests.post(NOTIFICATION_URL, json={
            'event': 'SPIDER_OPENED',
            'data': {'spider_name': spider.name}
        })

    def spider_closed(self, spider):
        requests.post(NOTIFICATION_URL, json={
            'event': 'SPIDER_CLOSED',
            'data': {'spider_name': spider.name}
        })

    def item_scraped(self, item, spider):
        requests.post(NOTIFICATION_URL, json={
            'event': 'ITEM_SCRAPED',
            'data': {'spider_name': spider.name, 'item': dict(item)}
        })

    @classmethod
    def from_crawler(cls, crawler):
        # Instantiate the extension and connect its handlers to Scrapy's signals
        ext = cls()
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext
--------------------------------------------------------------------------------
/src/ch04/files/movies.txt:
--------------------------------------------------------------------------------
名称:霸王别姬 - Farewell My Concubine
类别:['剧情', '爱情']
上映时间:1993-07-26
评分:9.5
==================================================
名称:这个杀手不太冷 - Léon
类别:['剧情', '动作', '犯罪']
上映时间:1994-09-14
评分:9.5
==================================================
名称:肖申克的救赎 - The Shawshank Redemption
类别:['剧情', '犯罪']
上映时间:1994-09-10
评分:9.5
==================================================
名称:泰坦尼克号 - Titanic
类别:['剧情', '爱情', '灾难']
上映时间:1998-04-03
评分:9.5
==================================================
名称:罗马假日 - Roman Holiday
类别:['剧情', '喜剧', '爱情']
上映时间:1953-08-20
评分:9.5
==================================================
名称:唐伯虎点秋香 - Flirting Scholar
类别:['喜剧', '爱情', '古装']
上映时间:1993-07-01
评分:9.5
==================================================
名称:乱世佳人 - Gone with the Wind
类别:['剧情', '爱情', '历史', '战争']
上映时间:1939-12-15
评分:9.5
==================================================
名称:喜剧之王 - The King of Comedy
类别:['剧情', '喜剧', '爱情']
上映时间:1999-02-13
评分:9.5
==================================================
名称:楚门的世界 - The Truman Show
类别:['剧情', '科幻']
上映时间:None
评分:9.0
==================================================
名称:狮子王 - The Lion King
类别:['动画', '歌舞', '冒险']
上映时间:1995-07-15
评分:9.0
==================================================
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/build.gradle:
--------------------------------------------------------------------------------
plugins {
    id 'com.android.application'
    id 'com.yanzhenjie.andserver'
}

android {
    compileSdk 31

    defaultConfig {
        applicationId "com.germey.andservertest"
        minSdk 16
        targetSdk 31
        versionCode 1
        versionName "1.0"

        testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
    }

    buildTypes {
        release {
            minifyEnabled false
            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
        }
    }
    compileOptions {
        sourceCompatibility 1.8
        targetCompatibility 1.8
    }

    sourceSets {
        main {
            jniLibs.srcDirs = ["libs"]
        }
    }
}

dependencies {
    implementation 'androidx.appcompat:appcompat:1.4.1'
    implementation 'com.google.android.material:material:1.5.0'
    implementation 'androidx.constraintlayout:constraintlayout:2.0.4'
    testImplementation 'junit:junit:4.+'
    androidTestImplementation 'androidx.test.ext:junit:1.1.3'
    androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
    implementation 'com.yanzhenjie.andserver:api:2.1.9'
    annotationProcessor 'com.yanzhenjie.andserver:processor:2.1.9'
}
--------------------------------------------------------------------------------
/src/ch10/account_pool/storages_redis.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
"""
@author: HuRuiFeng
@file: storages_redis.py
@time: 2022/1/12 10:18
@project: python3-web-spider-learning
@desc: Storage module: uses Redis as the backing store for the account pool.
       Data layout: one hash per pool, named <type>:<website>, whose fields are
       usernames and whose values are the stored account data (e.g. cookie strings).
"""
import random

import redis

from ch10.account_pool.setting import *


class RedisClient:
    def __init__(self, type, website, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
        self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)
        # Pool type (first half of the Redis hash name)
        self.type = type
        # Website name (second half of the Redis hash name)
        self.website = website

    def name(self):
        return f'{self.type}:{self.website}'

    def set(self, username, value):
        return self.db.hset(self.name(), username, value)

    def get(self, username):
        return self.db.hget(self.name(), username)

    def delete(self, username):
        return self.db.hdel(self.name(), username)

    def count(self):
        return self.db.hlen(self.name())

    def random(self):
        # Randomly pick one stored value (e.g. a cookie string)
        return random.choice(self.db.hvals(self.name()))

    def usernames(self):
        return self.db.hkeys(self.name())

    def all(self):
        return self.db.hgetall(self.name())

    def close(self):
        self.db.close()
--------------------------------------------------------------------------------
/src/ch13/AndServerTest/app/src/main/res/layout/activity_main.xml:
--------------------------------------------------------------------------------
1 | 2 | 9 | 10 |