├── .gitignore ├── 01-Cnblogs ├── README.md ├── cnblogs.py ├── demo │ └── demo.gif └── screenshot │ ├── README.md │ ├── db_01.jpg │ └── db_02.jpg ├── 02-Golory_of_Kings ├── Glory_of_Kings.py ├── README.md └── result.jpg ├── 03-MaoYan_Top100 ├── MaoYan_Top100.py ├── README.md └── result.txt ├── 04-Selenium_Taobao ├── README.md ├── __pycache__ │ └── xdaili.cpython-36.pyc ├── demo │ └── demo.gif ├── result │ ├── README.md │ ├── db_01.jpg │ └── db_02.jpg ├── taobao.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── config.cpython-36.pyc │ ├── config.py │ └── proxy_auth_plugin.zip └── xdaili.py ├── 05-Moments ├── .idea │ ├── 05-Moments.iml │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── Readme.md ├── __pycache__ │ ├── config.cpython-36.pyc │ └── processor.cpython-36.pyc ├── config.py ├── moments.py ├── plates │ ├── SDK.png │ ├── content.png │ ├── contents.png │ ├── datetime.png │ ├── device_name.png │ ├── login.png │ ├── moment_db.png │ ├── nickname.png │ └── yes-no.png └── processor.py ├── 06-Github_Login ├── README.md └── github_login.py ├── 07-IGetGet ├── README.md ├── __pycache__ │ └── script.cpython-36.pyc ├── dedao.json ├── script.py └── utils │ ├── app_error.jpg │ ├── charles安装证书页面.png │ ├── connect_error.png │ ├── demo.gif │ ├── mitmproxy证书.png │ ├── mongo_server_error.png │ ├── result_json.png │ ├── 乱码_01.png │ ├── 乱码_02.png │ └── 手机证书安装.png ├── 08-Selenium_Cnki ├── README.md ├── __pycache__ │ ├── config.cpython-36.pyc │ └── handle_code.cpython-36.pyc ├── chaojiying.py ├── cnki.py ├── demo │ ├── demo.gif │ └── 超级鹰积分.jpg └── utils │ ├── config.py │ └── handle.py ├── 09-Bilibili ├── .idea │ ├── bilibili.iml │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── README.md ├── bilibili.py ├── captcha1.png ├── captcha2.png ├── require │ ├── demo.gif │ └── demo_location.png └── utils │ ├── __pycache__ │ └── config.cpython-36.pyc │ └── config.py ├── 10-DouYin ├── README.md ├── __pycache__ │ └── script.cpython-36.pyc ├── plates │ ├── JSONDecodeError.jpg │ ├── TypeError.jpg │ ├── charles.png │ ├── demo.gif │ ├── douyin.json │ ├── video_demo.gif │ ├── video_info_json.png │ └── video_screentshot.png └── script.py ├── 11-Jianshu ├── README.md ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── script.cpython-36.pyc ├── action.py ├── demo │ └── demo.gif ├── result │ ├── __init__.py │ └── jianshu.json ├── script.py └── utils │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── config.cpython-36.pyc │ └── config.py ├── 12-Crack_Jianshu ├── .idea │ ├── Jianshu.iml │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── README.md ├── captcha.png ├── jianshu.py ├── require │ ├── chaojiying.png │ ├── code_demo.png │ ├── demo.gif │ └── 超级鹰返回结果处理示例.png └── utils │ ├── __pycache__ │ ├── chaojiying.cpython-36.pyc │ └── config.cpython-36.pyc │ ├── chaojiying.py │ └── config.py ├── 13-Pyspider_Lagou ├── README.md ├── data │ ├── project.db │ ├── result.db │ ├── scheduler.1d │ ├── scheduler.1h │ ├── scheduler.all │ └── task.db ├── demo.py ├── lagou.py └── result │ ├── db.jpg │ ├── 单步测试结果_01.jpg │ ├── 单步测试结果_02.jpg │ ├── 单步测试结果_03.jpg │ ├── 单步测试结果_04.jpg │ └── 单步测试结果_05.jpg ├── 14-Scrapy_Tutorial ├── README.md ├── demo.gif └── tutorial │ ├── scrapy.cfg │ └── tutorial │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ ├── pipelines.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── main.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py 
│ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── quotes.cpython-36.pyc │ └── quotes.py ├── 15-Scrapy_Images360 ├── README.md ├── images360 │ ├── images360 │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── middlewares.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── images │ │ │ ├── t01a3ee5a4ff05fe133.jpg │ │ │ ├── t01a5f844c4a5d5ed7d.jpg │ │ │ ├── t01ad50ec608cde5fdc.jpg │ │ │ ├── t01aed1278f885e26ec.jpg │ │ │ ├── t01b29ea494ffdab388.jpg │ │ │ ├── t01bf8bb6d4c6b93fff.jpg │ │ │ └── t01c2bb853e048be307.jpg │ │ ├── items.py │ │ ├── main.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── images.cpython-36.pyc │ │ │ └── images.py │ └── scrapy.cfg └── screenshot │ ├── README.md │ ├── demo.gif │ ├── images.jpg │ ├── mongodb.jpg │ └── mysql.jpg ├── 16-vczh ├── .idea │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ ├── vczh.iml │ └── workspace.xml ├── README.md ├── scrapy.cfg └── vczh │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ ├── middlewares.cpython-36.pyc │ ├── pipelines.cpython-36.pyc │ ├── sendemail.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── main.py │ ├── middlewares.py │ ├── pipelines.py │ ├── sendemail.py │ ├── settings.py │ ├── spiders │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── vc.cpython-36.pyc │ └── vc.py │ └── utils │ ├── db_follower.png │ ├── email.png │ ├── followers.png │ ├── huaji.png │ └── log.png ├── 17-City_58 ├── City_58 │ ├── City_58 │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── middlewares.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── main.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ │ ├── 58.py │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ │ ├── 58.cpython-36.pyc │ │ │ │ └── __init__.cpython-36.pyc │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── api.cpython-36.pyc │ │ │ ├── parse.cpython-36.pyc │ │ │ ├── proxy.cpython-36.pyc │ │ │ └── xdaili.cpython-36.pyc │ │ │ ├── api.py │ │ │ ├── parse.py │ │ │ ├── proxy.py │ │ │ └── xdaili.py │ └── scrapy.cfg ├── README.md └── screenshot │ ├── monogdb.jpg │ ├── run_01.jpg │ └── run_02.jpg ├── 18-36kr ├── .idea │ ├── 36kr.iml │ ├── encodings.xml │ ├── inspectionProfiles │ │ └── Project_Default.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── 36kr.py ├── README.md └── utils │ ├── 36kr.txt │ ├── FZSTK.TTF │ ├── __pycache__ │ └── word.cpython-36.pyc │ ├── cloud.jpg │ ├── db.png │ ├── show.jpg │ └── word.py ├── 19-Youku_DanMu ├── .idea │ ├── Youku_DanMu.iml │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── README.md ├── danmu.py └── utils │ ├── FZSTK.TTF │ ├── cloud.jpg │ ├── danmu.txt │ ├── require │ ├── danmu_content.png │ └── danmu_json.png │ ├── show.jpg │ └── word.py ├── 20-Selenium_163 ├── .idea │ ├── 20-Selenium_163Email.iml │ ├── encodings.xml │ ├── inspectionProfiles │ │ └── Project_Default.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── 163.py ├── README.md ├── require │ ├── content_frame.png │ ├── demo.gif │ └── login_frame.png └── utils │ ├── __pycache__ │ └── config.cpython-36.pyc 
│ └── config.py ├── 21-AutoCrawl_DouYin ├── .idea │ ├── DouYin.iml │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── README.md ├── __pycache__ │ ├── config.cpython-36.pyc │ └── scripts.cpython-36.pyc ├── actions.py ├── config.py ├── plates │ ├── demo.gif │ ├── douyin_demo.gif │ ├── start.png │ ├── video_name.png │ ├── video_url.png │ └── 图形点触验证码.png └── scripts.py ├── 22-Stackoverflow ├── .idea │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ ├── stackoverflow.iml │ └── workspace.xml ├── README.md ├── scrapy.cfg └── stackoverflow │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ ├── middlewares.cpython-36.pyc │ ├── pipelines.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── main.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── stack.cpython-36.pyc │ └── stack.py │ └── utils │ ├── Error.png │ └── db.png ├── 23-GithubLogin ├── .idea │ ├── encodings.xml │ ├── github.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── README.md ├── github │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── main.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── logingit.cpython-36.pyc │ │ └── logingit.py │ └── utils │ │ ├── __init__.py │ │ └── acatar.jpg └── scrapy.cfg ├── 24-Dianping ├── README.md ├── demo.py └── utils │ ├── prtsc1.png │ ├── prtsc2.png │ ├── prtsc3.png │ ├── prtsc4.png │ └── prtsc5.png ├── 25-DouYin ├── README.md ├── douyin.py ├── font.py └── shareid.txt ├── README.md └── sogou_wechat_captcha.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | /.idea 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ -------------------------------------------------------------------------------- /01-Cnblogs/README.md: -------------------------------------------------------------------------------- 1 | ## Cnblogs 2 |   使用 urllib 请求库抓取博客园首页最新文章信息并将数据存储到MongoDB, 包含标题、作者、发布时间、阅读量、评论等。 3 | 4 | ## Tip 5 |   博客园的数据请求接口有点特殊, 要认真分析。若有疑问, 可邮箱联系。 6 | 7 | ## Demo 8 | ![GIF](https://github.com/Northxw/Python3_WebSpider/blob/master/01-Cnblogs/demo/demo.gif) 9 | -------------------------------------------------------------------------------- /01-Cnblogs/demo/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/01-Cnblogs/demo/demo.gif -------------------------------------------------------------------------------- /01-Cnblogs/screenshot/README.md: -------------------------------------------------------------------------------- 1 | ## Display 1 2 | ![db_screenshot_01](https://github.com/Northxw/Python3_WebSpider/blob/master/01-Cnblogs/screenshot/db_01.jpg) 3 | 4 | ## Display 2 5 | ![db_screenshot_02](https://github.com/Northxw/Python3_WebSpider/blob/master/01-Cnblogs/screenshot/db_02.jpg) 6 | -------------------------------------------------------------------------------- /01-Cnblogs/screenshot/db_01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/01-Cnblogs/screenshot/db_01.jpg -------------------------------------------------------------------------------- /01-Cnblogs/screenshot/db_02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/01-Cnblogs/screenshot/db_02.jpg -------------------------------------------------------------------------------- /02-Golory_of_Kings/Glory_of_Kings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | Created at 21:27 at Sep 17,2018 4 | @author: Northxw 5 | """ 6 | 7 | import requests 8 | import os 9 | 10 | # 全英雄列表请求链接 11 | herolist_url = 'https://pvp.qq.com/web201605/js/herolist.json' 12 | # 获取数据 13 | response = requests.get(herolist_url).json() 14 | 15 | # 根据英雄的皮肤链接,分析并下载英雄的皮肤 16 | save_dir = 
"E:\Python\Spider\Ex\\01-Spider_Glory_of_Kings\hero_list\\" # 指定下载位置 17 | if not os.path.exists(save_dir): 18 | os.mkdir(save_dir) 19 | 20 | for i in range(len(response)): 21 | # 获取英雄皮肤列表 22 | skin_names = response[i]['skin_name'].split('|') 23 | for cnt in range(len(skin_names)): 24 | # 下载当前英雄的所有皮肤 25 | hero_num = response[i]['ename'] # 英雄序号 26 | hero_name = response[i]['cname'] # 英雄名称 27 | skin_name = skin_names[cnt] # 皮肤名称 28 | 29 | save_file_name = save_dir + str(hero_num) + '-' + hero_name + '-' + skin_name + '.jpg' 30 | skin_url = 'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'.format(hero_num, hero_num, str(cnt+1)) 31 | # 获取图片的位数据(二进制流数据) 32 | response_skin_content = requests.get(skin_url).content 33 | # 保存图片 34 | with open(save_file_name, 'wb') as f: 35 | f.write(response_skin_content) 36 | -------------------------------------------------------------------------------- /02-Golory_of_Kings/README.md: -------------------------------------------------------------------------------- 1 | ## Golory of Kings 2 |   使用requests请求库完成对王者荣耀英雄全皮肤的下载。 3 | 4 | ## Introduction 5 |   代码中的请求接口可通过Chrome开发者工具轻松获取, 具体的爬取过程可以参考公众号"C与python实战"爬虫模块。 6 | 7 | ## Result 8 | ![运行结果截图](https://github.com/Northxw/Python3_WebSpider/blob/master/02-Golory_of_Kings/result.jpg) 9 | -------------------------------------------------------------------------------- /02-Golory_of_Kings/result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/02-Golory_of_Kings/result.jpg -------------------------------------------------------------------------------- /03-MaoYan_Top100/MaoYan_Top100.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | Updated at 14:33 on March 11,2019 4 | @title: Spider Maoyan Top100 5 | @author: Northxw 6 | """ 7 | 8 | import requests 9 | import re 10 | import json 11 | from requests.exceptions import RequestException 12 | from pymongo import MongoClient 13 | import time 14 | 15 | # 创建数据库连接 16 | client = MongoClient('mongodb://localhost:27017/') 17 | db = client.maoyan 18 | collection = db.rank 19 | 20 | def get_one_page(url): 21 | """ 22 | 获取每页的网页源代码 23 | :param url: 请求链接 24 | :return: 网页的文本内容 25 | """ 26 | try: 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 29 | } 30 | response = requests.get(url=url, headers=headers) 31 | if response.status_code == 200: 32 | return response.text 33 | return None 34 | except RequestException: 35 | return None 36 | 37 | def parse_one_page(html): 38 | """ 39 | 使用正则表达式解析网页数据 40 | :param html: 网页的文本内容 41 | :return: 字典 42 | """ 43 | pattern = re.compile( 44 | r'
<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.' 45 | r'*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>
', 46 | re.S 47 | ) 48 | items = re.findall(pattern, html) 49 | for item in items: 50 | yield { 51 | 'index': item[0], 52 | 'image': item[1].split('@')[0], 53 | 'title': item[2].strip(), 54 | 'actor': item[3].strip()[3:] if len(item[3]) > 3 else '', 55 | 'time': item[4].strip()[5:] if len(item[4]) > 5 else '', 56 | 'score': item[5].strip() + item[6].strip() 57 | } 58 | 59 | def write_to_file(content): 60 | with open('result.txt', 'a', encoding='utf-8') as f: 61 | f.write(json.dumps(content, ensure_ascii=False) + '\n') 62 | 63 | def save_to_mongo(item): 64 | """ 65 | 将数据存储到MongoDB 66 | :param dict: 字典类型的数据 67 | :return: None 68 | """ 69 | collection.insert(item) 70 | 71 | def main(offset): 72 | url = 'http://maoyan.com/board/4?offset={}'.format(str(offset)) 73 | html = get_one_page(url) 74 | for item in parse_one_page(html): 75 | write_to_file(item) 76 | save_to_mongo(item) 77 | 78 | if __name__ == '__main__': 79 | for i in range(10): 80 | main(offset=i*10) 81 | time.sleep(1) -------------------------------------------------------------------------------- /03-MaoYan_Top100/README.md: -------------------------------------------------------------------------------- 1 | ## MaoYan Top100 2 |   使用requests请求库获取猫眼电影排行TOP100的电影名称、时间、评分、图片等信息,结果以文本格式保存。 3 | 4 | ## Crawl analysis 5 |   打开目标站点,查看榜单信息,如图: 6 | ![1](https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-11.jpg) 7 | 排名第一的电影是霸王别姬,页面中显示的有效信息有影片名称、主演、上映时间、上映地区、评分、图片等信息。 8 |   翻页规律:按住鼠标滑轮滚动到页面底部,点击下一页,观察页面URL和内容发生的变化,如图: 9 | ![2](https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-12.jpg) 10 | 可以发现页面的URL变成http://maoyan.com/board/4?offset=10, 比之前的URL多了一个参数,那就是offset=10,而目前显示的结果是排行11-20名的电影,初步推断这是一个偏移量的参数。再点击下一页,发现页面的URL变成了http://maoyan.com/board/4?offset=20, 参数offset变成了20,而显示的结果是排行21~30的电影 11 |   由此可以总结出规律,off代表偏移量值,如果偏移量为n,则显示的电影序号就是n+1到n+10,每页显示10个。所以,如果想获取TOP100电影,只需要分开请求10次,而10次的offset参数分别设置为0、10、20、…90即可,这样获取不同的页面之后,再用正则表达式提取出相关信息,就可以得到TOP100的所有电影信息了。 12 | 13 | ## Other 14 | + 目标信息采用正则匹配(当然,完全可以利用xpath,pyquery,css等方法) 15 | + 网页的真实源码可以在Chroem浏览器的开发者模式下的Network监听组件中查看 16 | + 写入文件的时候为了保证输出结果是中文形式而不是Unicode编码,需要将open的encoding参数设置为"utf-8",然后在 f.write 时添加 ensure_ascii 参数并设置为False 17 | 18 | ## Result 19 | ![3](https://qiniu.cuiqingcai.com/wp-content/uploads/2018/02/3-15.jpg) 20 | -------------------------------------------------------------------------------- /04-Selenium_Taobao/README.md: -------------------------------------------------------------------------------- 1 | ## Selenium Taobao 2 |   使用 Selenium+Chrome+Xdaili 爬取淘宝商品数据, 包含商品的图片、名称、价格、购买人数、店铺名称等。 3 | 4 | ## Explain 5 |   本次实践参考崔大"Python3网络爬虫开发实践"第七章。 6 | 7 | ## Demo 8 | ![demo](https://github.com/Northxw/Python3_WebSpider/blob/master/04-Selenium_Taobao/demo/demo.gif) 9 | -------------------------------------------------------------------------------- /04-Selenium_Taobao/__pycache__/xdaili.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/__pycache__/xdaili.cpython-36.pyc -------------------------------------------------------------------------------- /04-Selenium_Taobao/demo/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/demo/demo.gif -------------------------------------------------------------------------------- 
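A minimal sketch (not this repository's `taobao.py`) of how the Selenium + explicit-wait flow described in the README above is typically wired up. The CSS selector and the `s` paging offset are assumptions about Taobao's search page, not values taken from this project; the constants come from `utils/config.py`, which appears below.

```python
from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from utils.config import URL, KEYWORD, TIMEOUT, MAX_PAGE

browser = webdriver.Chrome()
wait = WebDriverWait(browser, TIMEOUT)

def index_page(page):
    """Load one page of search results and wait until the item list renders."""
    # The 's' parameter and the 44-items-per-page step are assumptions about Taobao's search URL.
    browser.get(URL + quote(KEYWORD) + '&s=' + str((page - 1) * 44))
    # '.m-itemlist .items .item' is an assumed selector for one product card.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
    return browser.page_source

if __name__ == '__main__':
    for page in range(1, MAX_PAGE + 1):
        html = index_page(page)
        # ...parse html and save each product record, as in the other spiders
```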
/04-Selenium_Taobao/result/README.md: -------------------------------------------------------------------------------- 1 | ### Display 1 2 | ![result01](https://github.com/Northxw/Python3_WebSpider/blob/master/04-Selenium_Taobao/result/db_01.jpg) 3 | 4 | ### Display 2 5 | ![result02](https://github.com/Northxw/Python3_WebSpider/blob/master/04-Selenium_Taobao/result/db_02.jpg) 6 | -------------------------------------------------------------------------------- /04-Selenium_Taobao/result/db_01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/result/db_01.jpg -------------------------------------------------------------------------------- /04-Selenium_Taobao/result/db_02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/result/db_02.jpg -------------------------------------------------------------------------------- /04-Selenium_Taobao/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/__init__.py -------------------------------------------------------------------------------- /04-Selenium_Taobao/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /04-Selenium_Taobao/utils/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /04-Selenium_Taobao/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # 搜索关键字 4 | KEYWORD = 'iPad' 5 | 6 | # URL 7 | URL = 'https://s.taobao.com/search?q=' 8 | 9 | # 数据库配置 10 | MONGO_URL = 'localhost' 11 | MONGO_DB = 'taobao' 12 | MONGO_COLLECTION = 'products' 13 | 14 | # 加载延时 15 | TIMEOUT = 30 16 | 17 | # 最大页数 18 | MAX_PAGE = 100 -------------------------------------------------------------------------------- /04-Selenium_Taobao/utils/proxy_auth_plugin.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/proxy_auth_plugin.zip -------------------------------------------------------------------------------- /04-Selenium_Taobao/xdaili.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | 3 | from selenium import webdriver 4 | 5 | import zipfile 6 | 7 | class Xdaili(object): 8 | def __init__(self): 9 | """ 10 | 初始化信息 11 | """ 12 | # 代理服务器 13 | self.ip = "forward.xdaili.cn" 14 | self.port = '80' 15 | # 订单号和个人密钥(可在讯代理官网购买) 16 | self.orderno = "ZF2018***********" 17 | self.secert = 
"**********************************" 18 | 19 | def auth(self): 20 | """ 21 | 构造代理 22 | :return: 23 | """ 24 | manifest_json = """ 25 | { 26 | "version": "1.0.0", 27 | "manifest_version": 2, 28 | "name": "Xdaili Proxy", 29 | "permissions": [ 30 | "proxy", 31 | "tabs", 32 | "unlimitedStorage", 33 | "storage", 34 | "", 35 | "webRequest", 36 | "webRequestBlocking" 37 | ], 38 | "background": { 39 | "scripts": ["background.js"] 40 | }, 41 | "minimum_chrome_version":"22.0.0" 42 | } 43 | """ 44 | 45 | background_js = """ 46 | var config = { 47 | mode: "fixed_servers", 48 | rules: { 49 | singleProxy: { 50 | scheme: "http", 51 | host: "%(ip)s", 52 | port: "%(port)s") 53 | }, 54 | bypassList: ["foobar.com"] 55 | } 56 | }; 57 | 58 | chrome.proxy.settings.set({value: config, scope: "regular"}, function() {}); 59 | 60 | function callbackFn(details) { 61 | return { 62 | authCredentials: { 63 | username: "%(orderno)s", 64 | password: "%(secert)s" 65 | } 66 | }; 67 | } 68 | 69 | chrome.webRequest.onAuthRequired.addListener( 70 | callbackFn, 71 | {urls: [""]}, 72 | ['blocking'] 73 | ); 74 | """ % {'ip': self.ip, 'port': self.port, 'orderno': self.orderno, 'secert': self.secert} 75 | playin_file = './utils/proxy_auth_plugin.zip' 76 | with zipfile.ZipFile(playin_file, 'w') as zp: 77 | zp.writestr("manifest.json", manifest_json) 78 | zp.writestr("background.js", background_js) 79 | -------------------------------------------------------------------------------- /05-Moments/.idea/05-Moments.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /05-Moments/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /05-Moments/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /05-Moments/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /05-Moments/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /05-Moments/__pycache__/processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/__pycache__/processor.cpython-36.pyc -------------------------------------------------------------------------------- /05-Moments/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import os 4 | 5 | # 设备类型:Android 或 iOS 6 | PLANTFORM = 'Android' 7 | # 设备名称:可在命令行输入 adb devices -l 获取 8 | DEVICE_NAME = 'vivo_X7' 9 | # APP包名 10 | APP_PACKAGE = 'com.tencent.mm' 11 | # 入口类型 12 | APP_ACTIVITY = '.ui.LauncherUI' 13 | 14 | # APP安装包路径(手机没有安装微信时,通过修改启动参数完成安装并启动微信执行后续操作) 15 | APP = os.path.abspath('.') + '/weixin.apk' 16 | 17 | # 
Appium 服务地址 18 | DRIVER_SERVER = 'http://localhost:4723/wd/hub' 19 | 20 | # 元素加载时间(一般退出重新登录的耗时主要在登录和加载数据界面,可根据设备运行速度灵活调整) 21 | TIMEOUT = 200 22 | 23 | # 微信登录的手机号、密码 24 | USERNAME = '132********' # 你的手机号码 25 | PASSWORD = '123456789' # 你的微信账号密码 26 | 27 | # 滑动点 28 | FLICK_START_X = 300 29 | FLICK_START_Y = 300 30 | FLICK_DISTANCE = 500 31 | 32 | # 滑动的间隔时间 33 | SCROLL_SLEEP_TIME = 3 #设置间隔5秒+是确保新加载的朋友圈节点信息能完全加载出来 34 | 35 | # MYSQL数据库配置 36 | HOST = 'localhost' 37 | USER = 'root' 38 | PASSWORD_ = '123456' 39 | PORT = 3306 40 | DB = 'wechat' 41 | 42 | # MongoDB配置 43 | MONGO_URL = 'localhost' 44 | MONGO_DB = 'wechat' 45 | MONGO_COLLECTION = 'moments' 46 | -------------------------------------------------------------------------------- /05-Moments/plates/SDK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/SDK.png -------------------------------------------------------------------------------- /05-Moments/plates/content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/content.png -------------------------------------------------------------------------------- /05-Moments/plates/contents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/contents.png -------------------------------------------------------------------------------- /05-Moments/plates/datetime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/datetime.png -------------------------------------------------------------------------------- /05-Moments/plates/device_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/device_name.png -------------------------------------------------------------------------------- /05-Moments/plates/login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/login.png -------------------------------------------------------------------------------- /05-Moments/plates/moment_db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/moment_db.png -------------------------------------------------------------------------------- /05-Moments/plates/nickname.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/nickname.png -------------------------------------------------------------------------------- /05-Moments/plates/yes-no.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/yes-no.png -------------------------------------------------------------------------------- /05-Moments/processor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import time 4 | import re 5 | 6 | class Processor(): 7 | def date(self, datetime): 8 | """ 9 | 格式化时间 10 | :param date: 原始时间 11 | :return: 处理后时间 12 | """ 13 | if re.match('\d+分钟前', datetime): 14 | minute = re.match('(\d+)', datetime).group(1) 15 | datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - float(minute) * 60)) 16 | if re.match('\d+小时前', datetime): 17 | hour = re.match('(\d+)', datetime).group(1) 18 | datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - float(hour) * 60 * 60)) 19 | if re.match('昨天', datetime): 20 | datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) 21 | if re.match('\d+天前', datetime): 22 | day = re.match('(\d+)', datetime).group(1) 23 | datetime = time.strftime('%Y-%m-%d', time.localtime((time.time()) - float(day) * 24 * 60 * 60)) 24 | return datetime -------------------------------------------------------------------------------- /06-Github_Login/README.md: -------------------------------------------------------------------------------- 1 | ## Github Login 2 |   模拟登录Github并抓取登录后才可以访问的页面信息, 包括好友动态、个人信息等。 3 | 4 | ## Sort 5 |   **模拟登陆 - requests** 6 | 7 | ## Explain 8 | #### 1.清除Cookies 9 |   清除浏览器中待抓取网站的Cookies: [清除方法](https://blog.csdn.net/panbiao1999/article/details/77880649) 10 | #### 2.浏览器设置Coookies 11 |   设置Cookies的过程发生在请求登录界面后(即:http://github.com/login)。 12 | #### 3.From表单的验证参数 13 |   Form表单的authenticity_token参数可在登陆界面的源码中获取。 14 | 15 | -------------------------------------------------------------------------------- /06-Github_Login/github_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | class Login(object): 7 | def __init__(self): 8 | self.headers = { 9 | 'Host': 'github.com', 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 11 | 'Referer': 'https://github.com' 12 | } 13 | self.login_url = 'https://github.com/login' 14 | self.post_url = 'https://github.com/session' 15 | self.logined_url = 'https://github.com/settings/profile' 16 | self.session = requests.Session() # 维持会话,处理Cookies, 使得我们不同担心Cookies的问题 17 | 18 | def token(self): 19 | response = self.session.get(self.login_url, headers=self.headers) 20 | selector = etree.HTML(response.text) 21 | token = selector.xpath('//*[@id="login"]/form/input[2]/@value') # 获取authenticity_token的值 22 | return token 23 | 24 | 25 | def login(self, email, password): 26 | post_data = { 27 | 'commit': 'Sign in', 28 | 'utf8': '✓', # ✓ 可在"xpath('//*[@id="login"]/form/input[1]/@value')" 位置复制粘贴 29 | 'authenticity_token': self.token(), # 获取隐藏在源码中的authenticity_token值. 
30 | 'login': email, 31 | 'password': password 32 | } 33 | response = self.session.post(self.post_url, data=post_data, headers=self.headers) 34 | if response.status_code == 200: 35 | self.dynamics(response.text) 36 | 37 | response = self.session.get(self.logined_url, headers=self.headers) 38 | if response.status_code == 200: 39 | self.profile(response.text) 40 | 41 | def dynamics(self, html): 42 | selector = etree.HTML(html) 43 | dynamics = selector.xpath('//div[contains(@class, "news")]/div') # 获取动态信息的div标签(需要处理) 44 | print(len(dynamics)) 45 | div_class_values = ['watch_started', 'fork', 'follow', 'repo'] # 所有动态信息的class属性值 46 | for item in dynamics: 47 | value = item.xpath('./@class') # 获取标签的class属性值, 如果没在列表, 则不做处理 48 | print(value) 49 | if value in div_class_values: 50 | text = item.xpath('.//div[contains(@class, "flex-items-baseline")]//text()').strip() 51 | print(text) 52 | 53 | def profile(self, html): 54 | selector = etree.HTML(html) 55 | name = selector.xpath('//input[@id="user_profile_name"]/@value') # 获取用户名称 56 | email = selector.xpath('//select[@id="user_profile_email"]/option[@value!=""]/text()') 57 | print(name, email) 58 | 59 | if __name__ == "__main__": 60 | login = Login() 61 | login.login(email="northxw@163.com", password='your_password') 62 | -------------------------------------------------------------------------------- /07-IGetGet/README.md: -------------------------------------------------------------------------------- 1 | # Project Name 2 |   **IGetGet**,使用 Mitmproxy 的 Mitmdump 组件爬取"得到"App的电子书信息,并将信息存储至Json文件。 3 | 4 | # Sort 5 |   **非自动化爬取App数据** - 通过Python脚本捕获服务器返回的response并处理。 6 | 7 | # Demand 8 | **1. Charles** - 跨平台支持度很好的网络抓包工具。addr: https://www.charlesproxy.com/download/ 9 | 10 | **2. mitmproxy** - 一个支持HTTP、HTTPS的抓包程序,类似Fiddler、Charles的功能, 通过控制台的形式操作。 11 | ``` 12 | pip3 install mitmproxy 13 | ``` 14 | 15 | # Process analysis 16 | #### 1.Charles证书安装 17 |   不同OS安装过程基本一致。打开Charles, 点击"Help->SSL Proxy->Install Charles Root Certificate",即可进入证书安装页面。如图: 18 | 19 | ![CA](https://github.com/Northxw/Python3_WebSpider/blob/master/07-IGetGet/utils/charles%E5%AE%89%E8%A3%85%E8%AF%81%E4%B9%A6%E9%A1%B5%E9%9D%A2.png) 20 | 21 |   具体的证书安装过程请自行谷歌。 22 | 23 | #### 2.手机证书安装 24 |   **前提**:确保Charles的HTTP代理开启,默认端口8888。然后将手机和电脑连接再同一个局域网下。如图: 25 | 26 | ![shouji_ca](https://github.com/Northxw/Python3_WebSpider/blob/master/07-IGetGet/utils/%E6%89%8B%E6%9C%BA%E8%AF%81%E4%B9%A6%E5%AE%89%E8%A3%85.png) 27 | 28 |   然后,在手机浏览器上打开 chls.pro/ssl, 即可自动安装(**安卓尽量使用本机自带浏览器**) 29 | 30 | #### 3.mitmproxy证书安装的Bug 31 |   PC端证书安装请自行谷歌。安装结束后可在用户目录的.mitmproxy目录下找到CA证书,如图: 32 | 33 | ![mitmproxy_ca](https://github.com/Northxw/Python3_WebSpider/blob/master/07-IGetGet/utils/mitmproxy%E8%AF%81%E4%B9%A6.png) 34 | 35 |   手机安装此证书不要局限于"mitmproxy-ca-cert.pem", 可能无法识别为CA证书并安装。可以尝试将上图中1-5中的任何一个传输到手机安装测试,哪个能用即用哪个。 36 | 37 | #### 4.数据库存储失败 38 |   测试期间,若添加数据库的插入操作,命令行就不会显示数据并且手机端网络丢失(当前局域网没有任何出错);而注释掉数据库插入操作,即可正常显示。具体原因尚不清楚,目前暂时将数据存储至Json。 错误如图: 39 | 40 | ![monogo_error](https://github.com/Northxw/Python3_WebSpider/blob/master/07-IGetGet/utils/app_error.jpg) 41 | 42 | # Other 43 |   目前的遗留问题是数据库存储,还没有一个切实可行的解决方案。有知道的可以提交issue。 44 | 45 | # Demo 46 | #### 1.JSON 47 | ![json_screenshot](https://github.com/Northxw/Python3_WebSpider/blob/master/07-IGetGet/utils/result_json.png) 48 | 49 | #### 2.Run Screenshot 50 | ![demo](https://github.com/Northxw/Python3_WebSpider/blob/master/07-IGetGet/utils/demo.gif) 51 | -------------------------------------------------------------------------------- 
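Since MongoDB inserts inside the mitmdump hook fail (see the README above), one workaround that stays within the current JSON approach is to bulk-import `dedao.json` afterwards, outside the mitmdump process. This is only a hedged sketch: it assumes the comma-separated object format that `script.py` writes and the `dedao`/`ebook` database and collection names from its commented-out `DedaoMongo` class.

```python
# Run the capture first (e.g. `mitmdump -s script.py`), then import the JSON offline.
import json
import pymongo

with open('dedao.json', encoding='utf-8') as f:
    # script.py appends pretty-printed objects followed by ', \n', so the file is a
    # comma-separated list of objects rather than a single JSON document.
    raw = f.read().rstrip().rstrip(',')

books = json.loads('[' + raw + ']')

client = pymongo.MongoClient('localhost', 27017)
client['dedao']['ebook'].insert_many(books)
print('imported', len(books), 'ebooks')
```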
/07-IGetGet/__pycache__/script.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/__pycache__/script.cpython-36.pyc -------------------------------------------------------------------------------- /07-IGetGet/script.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Updated at 9:45 at March 18, 2019 5 | @title: 爬取得到APP电子书信息并将数据存储至MongoDB 6 | @author: Northxw 7 | """ 8 | 9 | import time 10 | import json 11 | # import pymongo 12 | from mitmproxy import ctx 13 | 14 | """ 15 | class DedaoMongo(object): 16 | def __init__(self): 17 | # set client 18 | self.client = pymongo.MongoClient('localhst', 27017) 19 | # db 20 | self.db = self.client['dedao'] 21 | # set 22 | self.collection = self.db['ebook'] 23 | 24 | def update_book(self, book_info): 25 | self.collection.insert_one(book_info) 26 | 27 | """ 28 | 29 | def response(flow): 30 | """ 31 | 抓取得到APP电子书信息, 包含书本ID、书名、封面图片、推荐语、发布时间、当前时间、当前价格、内容简介等。 32 | """ 33 | # data_ = DedaoMongo() 34 | url = 'https://entree.igetget.com/ebook2/v1/ebook/list' 35 | if flow.request.url.startswith(url): 36 | text = flow.response.text 37 | data = json.loads(text) 38 | info = ctx.log.info 39 | books = data.get('c').get('list') 40 | 41 | ebooks = list() 42 | # 获取电子书信息 43 | for book in books: 44 | ebook_data = { 45 | # ID 46 | 'id': str(book['id']), 47 | # 书名 48 | 'name': book['operating_title'], 49 | # 封面 50 | 'ico': book['cover'], 51 | # 推荐语 52 | 'share_summary': book['other_share_summary'], 53 | # 发布时间 54 | 'publish_time': book['datetime'], 55 | # 当前价格 56 | 'current_price': book['current_price'], 57 | # 内容简介 58 | 'book_intro': book['book_intro'], 59 | } 60 | # data_.update_book(ebook_data) 61 | 62 | # 终端显示已获取到的信息 63 | info('ID:' + ebook_data['id']) 64 | info('书名:' + ebook_data['name']) 65 | info('推荐语:' + ebook_data['share_summary']) 66 | info('发布时间:' + ebook_data['publish_time']) 67 | info('当前价格:' + '¥{}'.format(ebook_data['current_price'])) 68 | info('封面:' + ebook_data['ico']) 69 | info('内容简介:' + ebook_data['book_intro']) 70 | info('-' * 80) 71 | 72 | # 存储为JSON格式 73 | with open('./dedao.json', 'a', encoding='utf-8') as f: 74 | f.write(json.dumps(ebook_data, indent=2, ensure_ascii=False)) 75 | f.write(', \n') -------------------------------------------------------------------------------- /07-IGetGet/utils/app_error.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/app_error.jpg -------------------------------------------------------------------------------- /07-IGetGet/utils/charles安装证书页面.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/charles安装证书页面.png -------------------------------------------------------------------------------- /07-IGetGet/utils/connect_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/connect_error.png -------------------------------------------------------------------------------- /07-IGetGet/utils/demo.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/demo.gif -------------------------------------------------------------------------------- /07-IGetGet/utils/mitmproxy证书.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/mitmproxy证书.png -------------------------------------------------------------------------------- /07-IGetGet/utils/mongo_server_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/mongo_server_error.png -------------------------------------------------------------------------------- /07-IGetGet/utils/result_json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/result_json.png -------------------------------------------------------------------------------- /07-IGetGet/utils/乱码_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/乱码_01.png -------------------------------------------------------------------------------- /07-IGetGet/utils/乱码_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/乱码_02.png -------------------------------------------------------------------------------- /07-IGetGet/utils/手机证书安装.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/手机证书安装.png -------------------------------------------------------------------------------- /08-Selenium_Cnki/README.md: -------------------------------------------------------------------------------- 1 | # Project Name 2 |   使用Selenium注册并登录中国知网并识别知网的图形验证码。 3 | 4 | # Sort 5 |   **识别验证码** - 常见四位英文数字混合验证码 6 | 7 | # Demand 8 | **1. Selenium** - 建议使用低版本的Python-Selenium库,因为高版本在Chrome中不支持。 9 | ``` 10 | pip3 install selenium==2.48.0 11 | ``` 12 | **2. chromedriver.exe** - download_addr:http://npm.taobao.org/mirrors/chromedriver/ 13 | 14 | **3. 
Chaojiying_Python.rar** - download_addr:http://www.chaojiying.com/download/Chaojiying_Python.rar 15 | 16 | # Process analysis 17 | #### 1.验证码类型 18 |   知网注册页的验证码类型属于常见四位英文和数字组成的验证码。可以在超级鹰的 [验证码类型于价格表](http://www.chaojiying.com/price.html#table-item5) 页面参考。 19 | 20 | #### 2.Python识别库 - tesserocr、pytesseract 21 |   这两个三方库识别精度均较差, 字体略微差异可能就不是正常结果。所以选择超级鹰识别,识别前可做灰度、二值化处理(我这里做了注释选择不用,感觉平台打码精度挺高的),代码如下: 22 | ```Python 23 | def handle_code(image): 24 | """ 25 | 处理验证码 26 | :param image: Image对象 27 | :return: 28 | """ 29 | # 灰度处理 30 | image = image.convert("L") 31 | # 阈值120(可灵活配置) 32 | threshold = 120 33 | table = [] 34 | for i in range(256): # 35 | if i < threshold: 36 | table.append(0) 37 | else: 38 | table.append(1) 39 | # 二值化处理 40 | image = image.point(table, '1') 41 | # 使用tesserocr获取处理结果 42 | result_1 = tesserocr.image_to_text(image).strip() 43 | # 使用pytesseract获取处理结果 44 | result_2 = pytesseract.image_to_string(image).strip() 45 | # print('验证码为:', result) 46 | # 两者识别结果相同再继续程序,否则循环识别。但是代价很大,所以弃用。 47 | return result_1, result_2 48 | ``` 49 | 50 | # Other 51 |   代码可继续扩展,例如:登录后知网文献的爬取,并做数据可视化分析等。 52 | 53 | # Demo 54 | ![程序运行的GIF动态演示图](https://github.com/Northxw/Python3_WebSpider/blob/master/08-Selenium_Cnki/demo/demo.gif) 55 | -------------------------------------------------------------------------------- /08-Selenium_Cnki/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /08-Selenium_Cnki/__pycache__/handle_code.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/__pycache__/handle_code.cpython-36.pyc -------------------------------------------------------------------------------- /08-Selenium_Cnki/chaojiying.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import requests 4 | from hashlib import md5 5 | 6 | class Chaojiying(object): 7 | 8 | def __init__(self, username, password, soft_id): 9 | self.username = username 10 | password = password.encode('utf8') 11 | self.password = md5(password).hexdigest() 12 | self.soft_id = soft_id 13 | self.base_params = { 14 | 'user': self.username, 15 | 'pass2': self.password, 16 | 'softid': self.soft_id, 17 | } 18 | self.headers = { 19 | 'Connection': 'Keep-Alive', 20 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)', 21 | } 22 | 23 | def PostPic(self, im, codetype): 24 | """ 25 | im: 图片字节 26 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html 27 | """ 28 | params = { 29 | 'codetype': codetype, 30 | } 31 | params.update(self.base_params) 32 | files = {'userfile': ('ccc.jpg', im)} 33 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers) 34 | return r.json() 35 | 36 | def ReportError(self, im_id): 37 | """ 38 | im_id:报错题目的图片ID 39 | """ 40 | params = { 41 | 'id': im_id, 42 | } 43 | params.update(self.base_params) 44 | r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers) 45 | return r.json() 46 | 47 | 48 | if __name__ == '__main__': 49 | # 以下均为超级鹰平台提供的样码,没有修改. 
50 | chaojiying = Chaojiying('超级鹰用户名', '超级鹰用户名的密码', '96001') 51 | im = open('a.jpg', 'rb').read() 52 | print(chaojiying.PostPic(im, 1902)) 53 | 54 | -------------------------------------------------------------------------------- /08-Selenium_Cnki/demo/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/demo/demo.gif -------------------------------------------------------------------------------- /08-Selenium_Cnki/demo/超级鹰积分.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/demo/超级鹰积分.jpg -------------------------------------------------------------------------------- /08-Selenium_Cnki/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # 知网用户名 4 | USERNAME = '要注册的用户名' 5 | # 知网密码 6 | PASSWORD = '要注册用的密码' 7 | # 知网邮箱 8 | EMAIL = 'northxw@qq.com' 9 | 10 | # 目标站点 11 | URL = 'http://my.cnki.net/elibregister/commonRegister.aspx' 12 | 13 | # 超级鹰用户名、密码、软件ID、验证码类型 14 | CHAIJIYING_USERNAME = 'Northxw' 15 | CHAOJIYING_PASSWORD = '**********' 16 | CHAIJIYING_SOFT_ID = ******** 17 | CHAOJIYING_KIND = 1902 # 1902代表常见四位英文和数字组成的验证码。验证码类型可以在打码平台的题分表上查看 18 | -------------------------------------------------------------------------------- /08-Selenium_Cnki/utils/handle.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tesserocr 4 | import pytesseract 5 | 6 | def handle_code(image): 7 | """ 8 | 处理验证码 9 | :param image: Image对象 10 | :return: 11 | """ 12 | image = image.convert("L") # 灰度处理 13 | threshold = 120 # 设置阈值为120(可灵活配置) 14 | table = [] 15 | for i in range(256): # 16 | if i < threshold: 17 | table.append(0) 18 | else: 19 | table.append(1) 20 | image = image.point(table, '1') # 二值化处理 21 | result_1 = tesserocr.image_to_text(image).strip() # 使用tesserocr获取处理结果 22 | result_2 = pytesseract.image_to_string(image).strip() # 使用pytesseract获取处理结果 23 | # print('验证码为:', result_1) 24 | return result_1, result_2 25 | -------------------------------------------------------------------------------- /09-Bilibili/.idea/bilibili.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /09-Bilibili/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /09-Bilibili/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /09-Bilibili/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /09-Bilibili/README.md: -------------------------------------------------------------------------------- 1 | # Project name 2 |   使用Selenium模拟登录B站并破解滑动验证码。 3 | 4 | # Sort 5 |   **验证码识别** - 破解滑动验证码 6 | 7 | # Install 8 | **1. 
Selenium** - 建议使用低版本的Python-Selenium库,因为高版本在Chrome中不支持。 9 | ``` 10 | pip3 install selenium==2.48.0 11 | ``` 12 | **2. chromedriver.exe** - download_addr:http://npm.taobao.org/mirrors/chromedriver/, 版本要匹配。 13 | 14 | # Process analysis 15 | **1.验证码节点** 16 | 17 |   B站验证码只要鼠标悬浮滑块就会出现, 当验证码出现后定位节点即可。过程比较繁琐,直接贴出来: 18 | ``` 19 | img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'gt_box'))) 20 | ``` 21 | 22 | **2.获取坐标值** 23 | 24 |   获取的坐标值分别是左上角和右下角, 而前端页面的坐标原点在屏幕左上角并且元素节点一般都是相对位置,所以坐标值部分需要好好理解。比如B站登录界面包含"登录"的div节点其父节点是id=" login-app"的div,如图: 25 | 26 | ![location_demo](https://github.com/Northxw/Python3_WebSpider/blob/master/09-Bilibili/require/demo_location.png) 27 | 28 | **3.缺口偏移量** 29 | 30 |   通过遍历图片的每个坐标点获取两张图片对应像素点的RGB,如果RGB差距在阈值范围内就认为相同,继续比对下一像素点。如果超过阈值,则说明像素点不同,当前位置 31 | 即为缺口位置。 32 | ```Python 33 | def get_gap(self, image1, image2): 34 | """ 35 | 获取缺口偏移量 36 | :param image1: 不带缺口的图片 37 | :param image2: 带缺口的图片 38 | :return: None 39 | """ 40 | left = 60 41 | # 遍历两张图片的每个像素并判断同一位置像素是否相同,不相同的像素点即缺口位置 42 | for i in range(left, image1.size[0]): 43 | for j in range(image1.size[1]): 44 | if not self.is_pixel_equal(image1, image2, i, j): 45 | left = i 46 | return left 47 | return left 48 | 49 | def is_pixel_equal(self, image1, image2, x, y): 50 | """ 51 | 判断像素是否相同 52 | :param image1: 极验原图 53 | :param image2: 缺口图片 54 | :param x: 位置X 55 | :param y: 位置Y 56 | :return: 像素是否相同 57 | """ 58 | # 取两个图片的像素点 59 | pixel1 = image1.load()[x, y] 60 | pixel2 = image2.load()[x, y] 61 | # 阈值60 62 | threshold = 60 63 | # 比较RGB的绝对值是否小于阈值60,如果在阈值内则相同,反之不同 64 | if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs(pixel1[2] - pixel2[2]) < threshold: 65 | return True 66 | else: 67 | return False 68 | ``` 69 | **4.模拟拖动** 70 | 71 |   模拟拖动滑块继承崔大模拟人类行为轨迹的"前段匀加速后段匀减速"。 72 | 73 | **5.点按滑块呼出验证码** 74 | 75 |   点按滑块后, 两到三秒后验证码会自动隐藏, 所以不要添加延时,直接获取。 76 | 77 | # Other 78 |   代码已更新, 正常情况下的破解率应该在50%以上, 主要看服务器判定边界的方式(可能像素差)。 79 | 80 | # Demo 81 | ![demo](https://github.com/Northxw/Python3_WebSpider/blob/master/09-Bilibili/require/demo.gif) 82 | -------------------------------------------------------------------------------- /09-Bilibili/captcha1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/captcha1.png -------------------------------------------------------------------------------- /09-Bilibili/captcha2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/captcha2.png -------------------------------------------------------------------------------- /09-Bilibili/require/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/require/demo.gif -------------------------------------------------------------------------------- /09-Bilibili/require/demo_location.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/require/demo_location.png -------------------------------------------------------------------------------- /09-Bilibili/utils/__pycache__/config.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/utils/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /09-Bilibili/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # B站账号 4 | EMAIL = 'northxw@163.com' 5 | # 密码 6 | PASSWORD = '******' 7 | 8 | # B站登录页URL 9 | URL = 'https://passport.bilibili.com/login' 10 | 11 | BORDER = 10 12 | INIT_LEFT = 51 13 | -------------------------------------------------------------------------------- /10-DouYin/README.md: -------------------------------------------------------------------------------- 1 | # DouYin 2 |   使用 Mitmdump 爬取 "抖音" App短视频信息,包含标题、视频下载地址、作者、发布时间、获赞数等。 3 | 4 | # Sort 5 |   **非自动化爬取App数据** - 基于Mitmproxy的Mitmdump组件实现APP数据的爬取。 6 | 7 | # Explain 8 | #### 1. Charles获取视频接口 9 |  爬取之前先将手机与PC至于同局域网并确保手机WIFI的代理端口为8888,然后打开Charles获取视频请求的链接,如图: 10 | 11 | ![video_url](https://github.com/Northxw/Python3_WebSpider/blob/master/10-DouYin/plates/charles.png) 12 | 13 | #### 2. 手动上滑触发视频请求接口 14 |   自动化滑动刷新有尝试过,但是由于技术有限,不能实现抖音APP的登录,所以用Charles只能获取视频下载链接,而不能获取其他有效信息,比如视频的名称、作者名称、获赞数、转发量等。 15 | 16 | #### 3. Python脚本获取视频信息 17 |   使用Python脚本拦截response爬取视频信息并下载视频,同时将视频信息存储至JSON。 18 | 19 | #### 4. 视频无水印 20 |   如图: 21 | 22 | ![video_demo](https://github.com/Northxw/Python3_WebSpider/blob/master/10-DouYin/plates/video_demo.gif) 23 | 24 | # Other 25 |   获取的数据不能直接存储至MongoDB等数据库,具体原因尚不清楚,若您知道,请提交issuse。 26 | 27 | # Demo Of Screenshot 28 | #### 1.JSON 29 | ![json_result](https://github.com/Northxw/Python3_WebSpider/blob/master/10-DouYin/plates/video_info_json.png) 30 | 31 | #### 2.VIDEO 32 | ![video_screenshot](https://github.com/Northxw/Python3_WebSpider/blob/master/10-DouYin/plates/video_screentshot.png) 33 | 34 | #### 3.Demo 35 | ![gif_show](https://github.com/Northxw/Python3_WebSpider/blob/master/10-DouYin/plates/demo.gif) 36 | -------------------------------------------------------------------------------- /10-DouYin/__pycache__/script.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/__pycache__/script.cpython-36.pyc -------------------------------------------------------------------------------- /10-DouYin/plates/JSONDecodeError.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/JSONDecodeError.jpg -------------------------------------------------------------------------------- /10-DouYin/plates/TypeError.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/TypeError.jpg -------------------------------------------------------------------------------- /10-DouYin/plates/charles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/charles.png -------------------------------------------------------------------------------- /10-DouYin/plates/demo.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/demo.gif -------------------------------------------------------------------------------- /10-DouYin/plates/video_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/video_demo.gif -------------------------------------------------------------------------------- /10-DouYin/plates/video_info_json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/video_info_json.png -------------------------------------------------------------------------------- /10-DouYin/plates/video_screentshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/video_screentshot.png -------------------------------------------------------------------------------- /10-DouYin/script.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Updated at 17:57 at March 19,2019 5 | @title: 爬取抖音App短视频 6 | @author: Northxw 7 | """ 8 | 9 | from mitmproxy import ctx 10 | import json 11 | import requests 12 | import time 13 | import os 14 | 15 | def response(flow): 16 | """ 17 | 抓取抖音标题、APP视频链接、作者、抖音ID、发布时间、获赞数、评论和转发数等信息, 并将结果保存为JSON格式. 18 | :return: None 19 | """ 20 | # 通过Charles获取的抖音视频信息的URL接口 21 | url = 'https://api.amemv.com/' 22 | if flow.request.url.startswith(url): 23 | # 获取服务器返回的响应 24 | text = flow.response.text 25 | # 转化为Json格式 26 | dyjson = json.loads(text) 27 | info = ctx.log.info 28 | 29 | # 获取视频列表 30 | aweme_list = dyjson.get('aweme_list') 31 | # 遍历列表,获取每个视频的相应数据 32 | for i in range(len(aweme_list)): 33 | # 视频标题 34 | title = aweme_list[i].get('share_info').get('share_title') 35 | # 视频链接 36 | videourl = aweme_list[i].get('video').get('play_addr').get('url_list')[0] 37 | # 保存视频 38 | res = requests.get(videourl, stream=True) 39 | # 规范文件命名 40 | _str = ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '.', '..', '?'] 41 | for _ in _str: 42 | if _ in title: 43 | title.replace(_, '') 44 | # 判断文件路径是否存在 45 | save_dir = './video/' 46 | if not os.path.exists(save_dir): 47 | os.mkdir(save_dir) 48 | with open('{}/{}.mp4'.format(save_dir, title), 'wb') as f: 49 | f.write(res.content) 50 | 51 | # 作者名称 52 | nickname = aweme_list[i].get('author').get('nickname') 53 | # 抖音ID 54 | short_id = aweme_list[i].get('author').get('short_id') 55 | # 发布时间 56 | create_time = aweme_list[i].get('create_time') 57 | # 格式化 58 | create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(create_time)) 59 | # 获赞、评论、转发数 60 | digg_count = aweme_list[i].get('statistics').get('digg_count') 61 | comment_count = aweme_list[i].get('statistics').get('comment_count') 62 | share_count = aweme_list[i].get('statistics').get('share_count') 63 | 64 | # 显示所有获取信息 65 | info("标题:" + title) 66 | info("URL:" + videourl) 67 | info("作者: " + nickname) 68 | info("ID: " + short_id) 69 | info("发布时间: " + create_time) 70 | info("获赞:" + str(digg_count)) 71 | info("评论:" + str(comment_count)) 72 | info("转发:" + str(share_count)) 73 | info('-'*80) 74 | 75 | # 保存为json文件 76 
| data = { 77 | 'title': title, 78 | 'url': videourl, 79 | 'nickname': nickname, 80 | 'douyin_id': short_id, 81 | 'create_time': create_time, 82 | 'diggs': digg_count, 83 | 'commments': comment_count, 84 | 'shares': share_count 85 | } 86 | 87 | # 下载视频 88 | with open('./douyin.json', 'a', encoding='utf-8') as f: 89 | f.write(json.dumps(data, indent=2, ensure_ascii=False)) 90 | f.write(', \n') 91 | -------------------------------------------------------------------------------- /11-Jianshu/README.md: -------------------------------------------------------------------------------- 1 | ## JianShu 2 |   结合 Appium 和 mitmdump 实现自动化获取简书"发现"页面的推荐文章信息, 包括文章标题、作者、评论数、点赞数、阅读量等。 3 | 4 | ## Demo 5 | ![程序运行的动态演示图](https://github.com/Northxw/Python3_WebSpider/blob/master/11-Jianshu/demo/demo.gif) 6 | -------------------------------------------------------------------------------- /11-Jianshu/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /11-Jianshu/__pycache__/script.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/__pycache__/script.cpython-36.pyc -------------------------------------------------------------------------------- /11-Jianshu/action.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from appium import webdriver 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.common.exceptions import NoSuchElementException 8 | from time import sleep 9 | from .utils.config import * 10 | 11 | class JianshuAction(object): 12 | def __init__(self): 13 | """ 14 | 初始化信息 15 | """ 16 | # 驱动配置 17 | self.desired_caps = { 18 | "platformName": PLATFORM, 19 | "deviceName": DEVICE_NAME, 20 | "appPackage": APP_PACKAGE, 21 | "appActivity": APP_ACTIVITY 22 | } 23 | self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps) 24 | self.wait = WebDriverWait(self.driver, TIMEOUT) 25 | 26 | def login(self): 27 | """ 28 | 登录 29 | :return: None 30 | """ 31 | # 点击"我的"进入登录界面 32 | tab_login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/tab_more'))) 33 | tab_login.click() 34 | sleep(3) 35 | # 点击"头像"登录简书 36 | image_login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/user_top_info_avatar'))) 37 | image_login.click() 38 | sleep(3) 39 | 40 | # 用户 41 | # account = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/et_account'))) 42 | # account.send_keys(USER_PHONENUMBER) 43 | # 密码 44 | # password = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/et_password'))) 45 | # password.send_keys(PASSWORD) 46 | 47 | # 选择"微信登录"省略输入账号密码的步骤 48 | weixin_login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/iv_wechat'))) 49 | weixin_login.click() 50 | sleep(10) 51 | 52 | # 解释:因为之前已经微信授权,所以这里直接登录进入个人页面 53 | 54 | # 点击"发现"进入文章推荐页面 55 | discorver = self.wait.until(EC.presence_of_element_located((By.ID, 
'com.jianshu.haruki:id/tab_discover'))) 56 | discorver.click() 57 | sleep(3) 58 | 59 | def scroll(self): 60 | """ 61 | 上滑页面、触发请求 62 | :return:None 63 | """ 64 | # 由于推荐页面的文章数目很多,当前仅获取部分文章信息。 65 | count = 1000 # 可灵活配置该参数 66 | while count > 0: 67 | # 模拟拖动 68 | self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y) 69 | sleep(SCROLL_SLEEP_TIME) 70 | count = count - 1 71 | 72 | def main(self): 73 | """ 74 | 主函数 75 | :return: 76 | """ 77 | self.login() 78 | self.scroll() 79 | 80 | if __name__ == '__main__': 81 | action = JianshuAction() 82 | action.main() 83 | -------------------------------------------------------------------------------- /11-Jianshu/demo/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/demo/demo.gif -------------------------------------------------------------------------------- /11-Jianshu/result/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/result/__init__.py -------------------------------------------------------------------------------- /11-Jianshu/script.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created at 20:50 on Nov 13,2018 5 | @title: 自动化抓取简书文章信息 6 | @author: Northxw 7 | """ 8 | 9 | from mitmproxy import ctx 10 | import json 11 | 12 | def response(flow): 13 | """ 14 | 爬取简书"发现"页面的推荐文章信息,包括文章标题、作者、ID、喜欢数、评论数、获赏数、阅读量等 15 | :return: None 16 | """ 17 | url = 'https://s0.jianshuapi.com/' 18 | url_ = 'https://s0.jianshuapi.com/v3/trending/now3?' 
19 | if flow.request.url.startswith(url): 20 | if flow.request.url.startswith(url_): 21 | text = flow.response.text # 获取响应 22 | data = json.loads(text) 23 | info = ctx.log.info 24 | 25 | # 获取文章信息列表 26 | for i in range(len(data)): 27 | # 文章标题 28 | title = data[i].get('object').get('data').get('title') 29 | # ID 30 | id = data[i].get('object').get('data').get('user').get('id') 31 | # 作者 32 | author = data[i].get('object').get('data').get('user').get('nickname') 33 | # 获得的"喜欢" 34 | likes_count = data[i].get('object').get('data').get('likes_count') 35 | # 评论数 36 | comments_count = data[i].get('object').get('data').get('comments_count') 37 | # 获得的"赞赏" 38 | total_rewards_count = data[i].get('object').get('data').get('total_rewards_count') 39 | # 阅读数 40 | views_count = data[i].get('object').get('data').get('views_count') 41 | 42 | # 显示获取的信息 43 | info('总数据' + str(len(data))) 44 | info('文章标题:' + title) 45 | info('作者:' + author) 46 | info('ID:' + str(id)) 47 | info('喜欢:' + str(likes_count)) 48 | info('评论:' + str(comments_count)) 49 | info('赞赏:' + str(total_rewards_count)) 50 | info('阅读量:' + str(views_count)) 51 | info('-'*80) 52 | 53 | # 存储为JSON文件 54 | data_ = { 55 | 'title': title, 56 | 'id': id, 57 | 'author': author, 58 | 'likes': likes_count, 59 | 'comments': comments_count, 60 | 'rewards': total_rewards_count, 61 | 'views': views_count, 62 | } 63 | with open('./result/jianshu.json', 'a', encoding='utf-8') as f: 64 | f.write(json.dumps(data_, indent=2, ensure_ascii=False)) 65 | f.write(', \n') 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /11-Jianshu/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /11-Jianshu/utils/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/utils/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /11-Jianshu/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # Appium 服务器 4 | DRIVER_SERVER = 'http://localhost:4723/wd/hub' 5 | 6 | # 启动参数:设备类型、名称、APP包名、入口类型 7 | PLATFORM = 'Android' 8 | DEVICE_NAME = 'vivo_X7' 9 | APP_PACKAGE = 'com.jianshu.haruki' 10 | APP_ACTIVITY = 'com.baiji.jianshu.MainActivity' 11 | 12 | # 简书账号、密码 13 | USER_PHONENUMBER = '********' 14 | PASSWORD = '********' 15 | 16 | # 等待时间 17 | TIMEOUT = 100 18 | 19 | # 滑动点 20 | FLICK_START_X = 300 21 | FLICK_START_Y = 300 22 | FLICK_DISTANCE = 600 23 | 24 | # 滑动的间隔时间 25 | SCROLL_SLEEP_TIME = 3 26 | -------------------------------------------------------------------------------- /12-Crack_Jianshu/.idea/Jianshu.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /12-Crack_Jianshu/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- 
/12-Crack_Jianshu/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /12-Crack_Jianshu/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /12-Crack_Jianshu/README.md: -------------------------------------------------------------------------------- 1 | # Project name 2 |   使用Selenium模拟登录网页版简书并识别点触式验证码。 3 | 4 | # Sort 5 |   **验证码识别** - 点触验证码 6 | 7 | # Demand 8 | **1. Selenium** 9 | ``` 10 | pip3 install selenium==2.48.0 11 | ``` 12 | **2. chromedriver.exe** - download_addr:http://npm.taobao.org/mirrors/chromedriver/ 13 | 14 | **3. Chaojiying_Python.rar** - download_addr:http://www.chaojiying.com/download/Chaojiying_Python.rar 15 | 16 | # Process analysis 17 | ### 1.不要频繁运行程序模拟登录 18 |   频繁模拟登录并识别验证码后,会出现验证码却来越模糊到难以识别,并且识别后点击"确认"按钮无法登录(或者说登录按键失效的)的情况。如图所示的位置失效: 19 | 20 | ![sure_button](https://github.com/Northxw/Python3_WebSpider/blob/master/12-Crack_Jianshu/require/code_demo.png) 21 | 22 | ### 2.超级鹰 23 |   [超级鹰打码平台](http://www.chaojiying.com/) 打码效率可以达到90%以上。在平台上注册绑定微信后会赠送1000积分,基本够用了。如图是我的积分情况: 24 | 25 | ![jifen](https://github.com/Northxw/Python3_WebSpider/blob/master/12-Crack_Jianshu/require/chaojiying.png) 26 | 27 | ### 3.超级鹰软件ID和验证码类型 28 |   软件ID相当于工作牌(或护照),每次打码都必须携带;验证码类型需要你去 [平台](http://www.chaojiying.com/price.html#table-item5) 确认。例如该项目的验证码类型属于**9004 坐标多选,返回1~4个坐标**。 29 | 30 | ### 4.识别思路(简要) 31 |   首先,获取验证码位置并获取网页截图;然后,裁剪获取验证码图像并以字节流的格式发送给超级鹰打码平台;最后,转化识别结果并使用Selenium点击登录。 32 | 33 | # Other 34 |   代码中pass留空函数为预留功能:爬取简书文章信息。有兴趣可以继续完善。 35 | 36 | # Demo 37 | ![demo](https://github.com/Northxw/Python3_WebSpider/blob/master/12-Crack_Jianshu/require/demo.gif) 38 | -------------------------------------------------------------------------------- /12-Crack_Jianshu/captcha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/captcha.png -------------------------------------------------------------------------------- /12-Crack_Jianshu/require/chaojiying.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/chaojiying.png -------------------------------------------------------------------------------- /12-Crack_Jianshu/require/code_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/code_demo.png -------------------------------------------------------------------------------- /12-Crack_Jianshu/require/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/demo.gif -------------------------------------------------------------------------------- /12-Crack_Jianshu/require/超级鹰返回结果处理示例.png: -------------------------------------------------------------------------------- 
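Step 4 of the 12-Crack_Jianshu README above glosses over turning the Chaojiying response into click positions. For type 9004 the coordinates come back in the `pic_str` field as a string such as `261,86|87,154`; a minimal parsing sketch (applying the captcha element's on-page offset before clicking is left to the caller, as step 4 describes):
```Python
def parse_points(pic_str):
    """Convert a Chaojiying 9004 result such as '261,86|87,154'
    into a list of (x, y) tuples."""
    return [tuple(map(int, point.split(','))) for point in pic_str.split('|')]

# e.g. points = parse_points(chaojiying.PostPic(image_bytes, 9004).get('pic_str', ''))
```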
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/超级鹰返回结果处理示例.png -------------------------------------------------------------------------------- /12-Crack_Jianshu/utils/__pycache__/chaojiying.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/utils/__pycache__/chaojiying.cpython-36.pyc -------------------------------------------------------------------------------- /12-Crack_Jianshu/utils/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/utils/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /12-Crack_Jianshu/utils/chaojiying.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf-8 3 | 4 | import requests 5 | from hashlib import md5 6 | 7 | class Chaojiying_Client(object): 8 | 9 | def __init__(self, username, password, soft_id): 10 | self.username = username 11 | self.password = md5(password.encode('utf-8')).hexdigest() 12 | self.soft_id = soft_id 13 | self.base_params = { 14 | 'user': self.username, 15 | 'pass2': self.password, 16 | 'softid': self.soft_id, 17 | } 18 | self.headers = { 19 | 'Connection': 'Keep-Alive', 20 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)', 21 | } 22 | 23 | def PostPic(self, im, codetype): 24 | """ 25 | im: 图片字节 26 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html 27 | """ 28 | params = { 29 | 'codetype': codetype, 30 | } 31 | params.update(self.base_params) 32 | files = {'userfile': ('ccc.jpg', im)} 33 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers) 34 | return r.json() 35 | 36 | def ReportError(self, im_id): 37 | """ 38 | im_id:报错题目的图片ID 39 | """ 40 | params = { 41 | 'id': im_id, 42 | } 43 | params.update(self.base_params) 44 | r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers) 45 | return r.json() -------------------------------------------------------------------------------- /12-Crack_Jianshu/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # 简书登录界面URL 4 | URL = 'https://www.jianshu.com/sign_in' 5 | 6 | # 邮箱(手机号)、密码 7 | EMAIL = 'northxw@163.com' 8 | PASSWORD = '******' 9 | 10 | # 超级鹰用户名、密码、软件ID、验证码类型 11 | CHAIJIYING_USERNAME = 'Northxw' 12 | CHAOJIYING_PASSWORD = '******' 13 | CHAIJIYING_SOFT_ID = '******' 14 | CHAOJIYING_KIND = 9004 15 | 16 | # 显式加载时间 17 | TIME_OUT = 15 -------------------------------------------------------------------------------- /13-Pyspider_Lagou/README.md: -------------------------------------------------------------------------------- 1 | ## Pyspider Lagou 2 |   本次实践使用国人 **binux** 编写的 Pyspider 框架爬取拉勾网发布的职位信息, 主要包括招聘公司、职位、薪资、岗位要求、职位描述等。 3 | 4 | ## Explain 5 |   本次提交的代码, 通过WebUI界面单步测试未发现Bug, 测试请使用demo.py。若出现599证书问题,可参考"[599 Error 解决方案](https://www.jianshu.com/p/6900cce4e488)"。其他Bug,可邮箱联系。 6 | 7 | ## Demo 8 | ![数据库](https://github.com/Northxw/Python3_WebSpider/blob/master/13-Pyspider_Lagou/result/db.jpg) 9 | 
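About the 599 certificate issue mentioned in **Explain**: independently of the linked article, demo.py and lagou.py below already pass `validate_cert=False` on every fetch, which skips the HTTPS certificate check, e.g.:
```Python
# inside a pyspider Handler method: skip HTTPS certificate validation
# to avoid the 599 error mentioned above
self.crawl('https://www.lagou.com/zhaopin/Python/',
           callback=self.index_page, validate_cert=False)
```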
-------------------------------------------------------------------------------- /13-Pyspider_Lagou/data/project.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/project.db -------------------------------------------------------------------------------- /13-Pyspider_Lagou/data/result.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/result.db -------------------------------------------------------------------------------- /13-Pyspider_Lagou/data/scheduler.1d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/scheduler.1d -------------------------------------------------------------------------------- /13-Pyspider_Lagou/data/scheduler.1h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/scheduler.1h -------------------------------------------------------------------------------- /13-Pyspider_Lagou/data/scheduler.all: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/scheduler.all -------------------------------------------------------------------------------- /13-Pyspider_Lagou/data/task.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/task.db -------------------------------------------------------------------------------- /13-Pyspider_Lagou/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on 2018-11-16 11:48:05 4 | # Project: Lagou 5 | 6 | from pyspider.libs.base_handler import * 7 | import time 8 | 9 | class Handler(BaseHandler): 10 | crawl_config = { 11 | 'headers': { 12 | 'Host': 'www.lagou.com', 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36' 14 | }, 15 | } 16 | 17 | @every(minutes=24 * 60) 18 | def on_start(self): 19 | self.crawl('https://www.lagou.com/zhaopin/Python/', callback=self.index_page, validate_cert=False, 20 | params={'labelWords': 'label'}) 21 | 22 | @config(age=2 * 60 * 60) 23 | def index_page(self, response): 24 | for each in response.doc('.position_link').items(): 25 | self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False) 26 | time.sleep(1) 27 | # 获取下一页链接 28 | next = response.doc('.item_con_pager a:last-child').attr.href 29 | self.crawl(next, callback=self.index_page, validate_cert=False) 30 | 31 | @config(priority=2) 32 | def detail_page(self, response): 33 | return { 34 | "company": response.doc('.job-name > .company').text(), 35 | "job": response.doc('.job-name > .name').text(), 36 | "salary": response.doc('.salary').text(), 37 | "other": response.doc('.job_request span').text().split('/')[1:-1], 38 | "labels": 
response.doc('.job_request li').text(), 39 | "publish_time": "".join(response.doc('.publish_time').text().split()), 40 | "job_advantage": response.doc('.job-advantage > p').text(), 41 | "job_description": response.doc('.job_bt p').text(), 42 | "work_address": response.doc('.work_addr').text().replace('查看地图', '') 43 | } -------------------------------------------------------------------------------- /13-Pyspider_Lagou/lagou.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on 2018-11-16 11:48:05 4 | # Project: Lagou 5 | 6 | from pyspider.libs.base_handler import * 7 | from pymongo import MongoClient 8 | import time 9 | 10 | 11 | class Mongo(object): 12 | def __init__(self): 13 | # 初始化数据库 14 | self.client = MongoClient() 15 | self.db = self.client['lagou'] 16 | self.collection = self.db['python'] 17 | 18 | def insert(self, data): 19 | # 将字典数据插入到数据库 20 | if data: 21 | self.collection.insert(data) 22 | 23 | def __del__(self): 24 | # 关闭数据库连接 25 | self.client.close() 26 | 27 | 28 | class Agent_abuyun(object): 29 | def __init__(self): 30 | self.proxyHost = "proxy.abuyun.com" 31 | self.proxyPort = "9010" 32 | self.proxyUser = "H72RXH024162Y0VD" 33 | self.proxyPass = "E8A5838333933FFE" 34 | 35 | def ip_port(self): 36 | # 代理隧道验证信息 37 | proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { 38 | "host": self.proxyHost, 39 | "port": self.proxyPort, 40 | "user": self.proxyUser, 41 | "pass": self.proxyPass, 42 | } 43 | proxies = { 44 | "http": proxyMeta, 45 | "https": proxyMeta, 46 | } 47 | return proxies 48 | 49 | 50 | class Handler(BaseHandler): 51 | crawl_config = { 52 | 'headers': { 53 | 'Host': 'www.lagou.com', 54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36' 55 | }, 56 | 'proxy': Agent_abuyun().ip_port(), 57 | 'mongo': Mongo(), 58 | } 59 | 60 | @every(minutes=24 * 60) 61 | def on_start(self): 62 | self.crawl('https://www.lagou.com/zhaopin/Python/', callback=self.index_page, validate_cert=False, 63 | params={'labelWords': 'label'}) 64 | 65 | # 设置任务有效期为两个小时(因为一般为30个页面左右) 66 | @config(age=2 * 60 * 60) 67 | def index_page(self, response): 68 | for each in response.doc('.position_link').items(): 69 | self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False) 70 | time.sleep(1) 71 | # 获取下一页链接 72 | next = response.doc('.item_con_pager a:last-child').attr.href 73 | self.crawl(next, callback=self.index_page, validate_cert=False) 74 | 75 | @config(priority=2) 76 | def detail_page(self, response): 77 | return { 78 | "company": response.doc('.job-name > .company').text(), 79 | "job": response.doc('.job-name > .name').text(), 80 | "salary": response.doc('.salary').text(), 81 | "other": response.doc('.job_request span').text().split('/')[1:-1], 82 | "labels": response.doc('.job_request li').text(), 83 | "publish_time": "".join(response.doc('.publish_time').text().split()), 84 | "job_advantage": response.doc('.job-advantage > p').text(), 85 | "job_description": response.doc('.job_bt p').text(), 86 | "work_address": response.doc('.work_addr').text().replace('查看地图', '') 87 | } 88 | 89 | def on_result(self, data): 90 | self.crawl_config['mongo'].insert(data) -------------------------------------------------------------------------------- /13-Pyspider_Lagou/result/db.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/db.jpg -------------------------------------------------------------------------------- /13-Pyspider_Lagou/result/单步测试结果_01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_01.jpg -------------------------------------------------------------------------------- /13-Pyspider_Lagou/result/单步测试结果_02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_02.jpg -------------------------------------------------------------------------------- /13-Pyspider_Lagou/result/单步测试结果_03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_03.jpg -------------------------------------------------------------------------------- /13-Pyspider_Lagou/result/单步测试结果_04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_04.jpg -------------------------------------------------------------------------------- /13-Pyspider_Lagou/result/单步测试结果_05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_05.jpg -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/README.md: -------------------------------------------------------------------------------- 1 | ## Scrapy Tutorial 2 |   Scrapy初体验 - 使用 Scrapy 框架抓取 tutorial 站点的相关信息, 包含文本、作者、标签等。 3 | 4 | ## Tip 5 |   本次实践参考崔大"Python3网络爬虫开发实战"第13章, 在此基础上做了些许优化。代码可在 Pycharm 中直接运行,只需要 **run main.py** 即可。 6 | 7 | ## Demo 8 | ![运行动态图](https://github.com/Northxw/Python3_WebSpider/blob/master/14-Scrapy_Tutorial/demo.gif) 9 | -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/demo.gif -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__init__.py 
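One compatibility note on the MongoPipeline in pipelines.py further down: `Collection.insert()` has been deprecated since pymongo 3.x and was removed in pymongo 4.0, so on a recent pymongo the store step needs `insert_one()` instead. A minimal adjustment, leaving everything else unchanged:
```Python
    def process_item(self, item, spider):
        name = item.__class__.__name__
        # insert_one() stores a single document; pymongo 4 removed insert()
        self.db[name].insert_one(dict(item))
        return item
```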
-------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QuoteItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | text = scrapy.Field() 15 | author = scrapy.Field() 16 | tags = scrapy.Field() 17 | -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/main.py: -------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | 3 | execute('scrapy crawl quotes'.split()) -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exceptions import DropItem 9 | import pymongo 10 | 11 | class TextPipeline(object): 12 | def __init__(self): 13 | self.limit = 50 14 | 15 | def process_item(self, item, spider): 16 | if item['text']: 17 | if len(item['text']) > self.limit: 18 | item['text'] = item['text'][0:self.limit].rstrip() + '...' 
19 | return item 20 | else: 21 | return DropItem('Missing Text') 22 | 23 | class MongoPipeline(object): 24 | def __init__(self, mongo_url, mongo_db): 25 | self.mongo_url = mongo_url 26 | self.mongo_db = mongo_db 27 | 28 | @classmethod 29 | def from_crawler(cls, crawler): 30 | return cls( 31 | mongo_url=crawler.settings.get('MONGO_URL'), 32 | mongo_db = crawler.settings.get('MONGO_DB') 33 | ) 34 | 35 | def open_spider(self, spider): 36 | self.client = pymongo.MongoClient(self.mongo_url) 37 | self.db = self.client[self.mongo_db] 38 | 39 | def process_item(self, item, spider): 40 | name = item.__class__.__name__ 41 | self.db[name].insert(dict(item)) 42 | return item 43 | 44 | def close_spider(self, spider): 45 | self.client.close() -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutorial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutorial' 13 | 14 | SPIDER_MODULES = ['tutorial.spiders'] 15 | NEWSPIDER_MODULE = 'tutorial.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tutorial.middlewares.TutorialDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'tutorial.pipelines.TextPipeline': 300, 69 | 'tutorial.pipelines.MongoPipeline': 400, 70 | } 71 | 72 | # MongoDB SETTINGS 73 | MONGO_URL = 'localhost' 74 | MONGO_DB = 
'tutorial' 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/quotes.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/quotes.cpython-36.pyc -------------------------------------------------------------------------------- /14-Scrapy_Tutorial/tutorial/tutorial/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from ..items import QuoteItem 4 | from traceback import format_exc, print_exc 5 | 6 | class QuotesSpider(scrapy.Spider): 7 | name = 'quotes' 8 | allowed_domains = ['quotes.toscrape.com'] 9 | start_urls = ['http://quotes.toscrape.com/'] 10 | 11 | def parse(self, response): 12 | global next 13 | quotes = response.css('.quote') 14 | for quote in quotes: 15 | item = QuoteItem() 16 | item['text'] = quote.css('.text::text').extract_first() 17 | item['author'] = quote.css('.author::text').extract_first() 18 | item['tags'] = quote.css('.tags .tag::text').extract() 19 | yield item 20 | try: 21 | next = response.css('.pager .next a:attr("href")').extract_first() 22 | except Exception as e: 23 | _ = e # 接收异常 24 | next = None 25 | if next: 26 | url = response.urljoin(next) 27 | yield scrapy.Request(url=url, callback=self.parse) 28 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/README.md: 
-------------------------------------------------------------------------------- 1 | ## Scrapy Images360 2 |   Scrapy实战 - 使用 Scrapy 框架抓取 image.so.com 站点的图片信息。包括图片ID、图片标题、图片下载链接等。 3 | 4 | ## Tip 5 |   本次实践依旧参考崔大"Python3网络爬虫开发实战"第13章, 在原代码的基础上添加了UA中间件(随机User-Agent)。继续加油, 再接再厉! 6 | 7 | ## Demo of Images 8 | ![GIF](https://github.com/Northxw/Python3_WebSpider/blob/master/15-Scrapy_Images360/screenshot/images.jpg) 9 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__init__.py -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/images/t01a3ee5a4ff05fe133.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01a3ee5a4ff05fe133.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/images/t01a5f844c4a5d5ed7d.jpg: -------------------------------------------------------------------------------- 
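The README above notes that a random User-Agent downloader middleware was added. Two details are worth flagging before reusing the UAMiddleware in middlewares.py further down: `random.choices()` returns a list rather than a single string (`random.choice()` picks one), and the first Chrome UA in `ua_list` appears to have been split across two list entries. A minimal sketch of the intended behaviour:
```Python
import random

class RandomUAMiddleware(object):
    # each entry must be one complete User-Agent string
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    ]

    def process_request(self, request, spider):
        # random.choice() returns a single UA string for the header value
        request.headers['User-Agent'] = random.choice(self.ua_list)
```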
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01a5f844c4a5d5ed7d.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/images/t01ad50ec608cde5fdc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01ad50ec608cde5fdc.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/images/t01aed1278f885e26ec.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01aed1278f885e26ec.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/images/t01b29ea494ffdab388.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01b29ea494ffdab388.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/images/t01bf8bb6d4c6b93fff.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01bf8bb6d4c6b93fff.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/images/t01c2bb853e048be307.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01c2bb853e048be307.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | class Images360Item(Item): 11 | # MongoDB、Mysql存储的表格名称 12 | collection = table = 'images' 13 | # ID 14 | id = Field() 15 | # 链接 16 | url = Field() 17 | # 标题 18 | title = Field() 19 | # 缩略图 20 | thumb = Field() 21 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from scrapy.cmdline import execute 4 | 5 | execute("scrapy crawl images".split()) -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # 
https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | import random 9 | import logging 10 | 11 | class UAMiddleware(object): 12 | def __init__(self): 13 | # 添加UA 14 | self.ua_list = [ 15 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 ', 16 | '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36', 17 | 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)', 18 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)', 19 | 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 20 | 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)', 21 | ] 22 | 23 | def process_request(self, request, spider): 24 | user_agent = random.choices(self.ua_list) 25 | request.headers['User-Agent'] = user_agent 26 | # 通过打印日志查看随机User-Agent 27 | # logging.info(request.url) 28 | # logging.info(request.headers['User-Agent']) 29 | 30 | def process_response(self, request, response, spider): 31 | return response 32 | 33 | def process_exception(self, request, exception, spider): 34 | pass -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy import Request 9 | from scrapy.exceptions import DropItem 10 | from scrapy.pipelines.images import ImagesPipeline 11 | import pymongo 12 | import pymysql 13 | 14 | class MongoPipeline(object): 15 | def __init__(self, mongo_url, mongo_db): 16 | self.mongo_url = mongo_url 17 | self.mong_db = mongo_db 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # 通过crawler对象拿到Scrapy的所有核心组件(如全局配置信息)并创建一个Pipeline实例 22 | return cls( 23 | mongo_url=crawler.settings.get('MONGO_URL'), 24 | mongo_db=crawler.settings.get('MONGO_DB') 25 | ) 26 | 27 | def open_spider(self, spider): 28 | # 创建数据库连接对象 29 | self.client = pymongo.MongoClient(self.mongo_url) 30 | # 指定数据库 31 | self.db = self.client[self.mong_db] 32 | 33 | def process_item(self, item, spider): 34 | # 将数据插入到指定的表格 35 | self.db[item.collection].insert(dict(item)) 36 | return item 37 | 38 | def close_spider(self, spider): 39 | # 关闭数据库连接 40 | self.client.close() 41 | 42 | 43 | class MysqlPipeline(): 44 | def __init__(self, host, database, user, password, port): 45 | self.host = host 46 | self.database = database 47 | self.user = user 48 | self.password = password 49 | self.port = port 50 | 51 | @classmethod 52 | def from_crawler(cls, crawler): 53 | return cls( 54 | host=crawler.settings.get('MYSQL_HOST'), 55 | database=crawler.settings.get('MYSQL_DATABASE'), 56 | user=crawler.settings.get('MYSQL_USER'), 57 | password=crawler.settings.get('MYSQL_PASSWORD'), 58 | port=crawler.settings.get('MYSQL_PORT'), 59 | ) 60 | 61 | def open_spider(self, spider): 62 | self.db = pymysql.connect(self.host, self.user, self.password, self.database, charset='utf8', 63 | port=self.port) 64 | self.cursor = self.db.cursor() 65 | 66 | def process_item(self, item, spider): 67 | data = dict(item) 68 | keys = ', '.join(data.keys()) 69 | values = ', '.join(['%s'] * len(data)) 70 | sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values) 71 | self.cursor.execute(sql, tuple(data.values())) 72 | self.db.commit() 73 | return item 74 | 75 | def 
close_spider(self, spider): 76 | self.db.close() 77 | 78 | class ImagePipeline(ImagesPipeline): 79 | def file_path(self, request, response=None, info=None): 80 | url = request.url 81 | file_name = url.split('/')[-1] 82 | return file_name 83 | 84 | def item_completed(self, results, item, info): 85 | image_paths = [x['path'] for ok, x in results if ok] 86 | if not image_paths: 87 | raise DropItem('Image Downloaded Failed') 88 | return item 89 | 90 | def get_media_requests(self, item, info): 91 | yield Request(item['url']) -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for images360 project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'images360' 13 | 14 | SPIDER_MODULES = ['images360.spiders'] 15 | NEWSPIDER_MODULE = 'images360.spiders' 16 | 17 | MAX_PAGE = 50 18 | 19 | # MonogDB Settings 20 | MONGO_URL = 'localhost' 21 | MONGO_DB = 'images360' 22 | 23 | # Mysql Settings 24 | MYSQL_HOST = 'localhost' 25 | MYSQL_DATABASE = 'images360' 26 | MYSQL_USER = 'root' 27 | MYSQL_PASSWORD = '0000' 28 | MYSQL_PORT = 3306 29 | 30 | # Image Path 31 | IMAGES_STORE = './images' 32 | 33 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 34 | #USER_AGENT = 'images360 (+http://www.yourdomain.com)' 35 | 36 | # Obey robots.txt rules 37 | ROBOTSTXT_OBEY = False 38 | 39 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 40 | #CONCURRENT_REQUESTS = 32 41 | 42 | # Configure a delay for requests for the same website (default: 0) 43 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 44 | # See also autothrottle settings and docs 45 | #DOWNLOAD_DELAY = 3 46 | # The download delay setting will honor only one of: 47 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 48 | #CONCURRENT_REQUESTS_PER_IP = 16 49 | 50 | # Disable cookies (enabled by default) 51 | #COOKIES_ENABLED = False 52 | 53 | # Disable Telnet Console (enabled by default) 54 | #TELNETCONSOLE_ENABLED = False 55 | 56 | # Override the default request headers: 57 | #DEFAULT_REQUEST_HEADERS = { 58 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 59 | # 'Accept-Language': 'en', 60 | #} 61 | 62 | # Enable or disable spider middlewares 63 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 64 | #SPIDER_MIDDLEWARES = { 65 | # 'images360.middlewares.Images360SpiderMiddleware': 543, 66 | #} 67 | 68 | # Enable or disable downloader middlewares 69 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 70 | DOWNLOADER_MIDDLEWARES = { 71 | 'images360.middlewares.UAMiddleware': 543, 72 | } 73 | 74 | # Enable or disable extensions 75 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 76 | #EXTENSIONS = { 77 | # 'scrapy.extensions.telnet.TelnetConsole': None, 78 | #} 79 | 80 | # Configure item pipelines 81 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 82 | ITEM_PIPELINES = { 83 | 'images360.pipelines.MongoPipeline': 300, 84 | 
'images360.pipelines.MysqlPipeline': 301, 85 | 'images360.pipelines.ImagePipeline': 302, 86 | } 87 | 88 | # Enable and configure the AutoThrottle extension (disabled by default) 89 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 90 | #AUTOTHROTTLE_ENABLED = True 91 | # The initial download delay 92 | #AUTOTHROTTLE_START_DELAY = 5 93 | # The maximum download delay to be set in case of high latencies 94 | #AUTOTHROTTLE_MAX_DELAY = 60 95 | # The average number of requests Scrapy should be sending in parallel to 96 | # each remote server 97 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 98 | # Enable showing throttling stats for every response received: 99 | #AUTOTHROTTLE_DEBUG = False 100 | 101 | # Enable and configure HTTP caching (disabled by default) 102 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 103 | #HTTPCACHE_ENABLED = True 104 | #HTTPCACHE_EXPIRATION_SECS = 0 105 | #HTTPCACHE_DIR = 'httpcache' 106 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 107 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 108 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/spiders/__pycache__/images.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/spiders/__pycache__/images.cpython-36.pyc -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/images360/spiders/images.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from scrapy import Spider, Request 5 | from urllib.parse import urlencode 6 | from ..items import Images360Item 7 | import json 8 | 9 | class ImagesSpider(scrapy.Spider): 10 | name = 'images' 11 | allowed_domains = ['image.so.com'] 12 | 13 | def start_requests(self): 14 | # GET请求参数 15 | data = { 16 | 'ch': 'photography', 17 | 'listtype': 'new', 18 | } 19 | base_url = 'https://image.so.com/zj?' 
20 | for page in range(1, self.settings.get('MAX_PAGE') + 1): 21 | # 偏移量参数 22 | data['sn'] = page * 30 23 | params = urlencode(data) 24 | # 完整请求链接 25 | url = base_url + params 26 | yield Request(url, self.parse) 27 | 28 | def parse(self, response): 29 | result = json.loads(response.text) 30 | for image in result.get('list'): 31 | item = Images360Item() 32 | item['id'] = image.get('imageid') 33 | item['url'] = image.get('qhimg_url') 34 | item['title'] = image.get('group_title') 35 | item['thumb'] = image.get('qhimg_thumb_url') 36 | yield item 37 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/images360/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = images360.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = images360 12 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/screenshot/README.md: -------------------------------------------------------------------------------- 1 | ## GIF 2 | ![程序运行动态图](https://github.com/Northxw/Python3_WebSpider/blob/master/15-Scrapy_Images360/screenshot/demo.gif) 3 | 4 | ## MongoDB 5 | ![MongoDB](https://github.com/Northxw/Python3_WebSpider/blob/master/15-Scrapy_Images360/screenshot/mongodb.jpg) 6 | 7 | ## Mysql 8 | ![Mysql](https://github.com/Northxw/Python3_WebSpider/blob/master/15-Scrapy_Images360/screenshot/mysql.jpg) 9 | 10 | ## Images 11 | ![Images](https://github.com/Northxw/Python3_WebSpider/blob/master/15-Scrapy_Images360/screenshot/images.jpg) 12 | -------------------------------------------------------------------------------- /15-Scrapy_Images360/screenshot/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/demo.gif -------------------------------------------------------------------------------- /15-Scrapy_Images360/screenshot/images.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/images.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/screenshot/mongodb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/mongodb.jpg -------------------------------------------------------------------------------- /15-Scrapy_Images360/screenshot/mysql.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/mysql.jpg -------------------------------------------------------------------------------- /16-vczh/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /16-vczh/.idea/misc.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /16-vczh/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /16-vczh/.idea/vczh.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /16-vczh/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = vczh.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = vczh 12 | -------------------------------------------------------------------------------- /16-vczh/vczh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__init__.py -------------------------------------------------------------------------------- /16-vczh/vczh/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /16-vczh/vczh/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /16-vczh/vczh/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /16-vczh/vczh/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /16-vczh/vczh/__pycache__/sendemail.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/sendemail.cpython-36.pyc -------------------------------------------------------------------------------- /16-vczh/vczh/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/settings.cpython-36.pyc 
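Stepping back to the 15-Scrapy_Images360 project above before the 16-vczh sources continue: the spider in images.py pages through a JSON endpoint by bumping an `sn` offset (30 results per page). Before running the whole Scrapy project it can help to smoke-test that endpoint on its own. The sketch below is illustrative only — it assumes the `https://image.so.com/zj` endpoint and the response fields used by the spider (`list`, `imageid`, `group_title`, `qhimg_url`) still behave as they do in images.py, which may change on the server side.

```python
# Standalone smoke test for the JSON endpoint paged by images.py (illustrative sketch).
import json
from urllib.parse import urlencode

import requests

params = {'ch': 'photography', 'listtype': 'new', 'sn': 30}  # sn is the result offset: page 1 -> 30
url = 'https://image.so.com/zj?' + urlencode(params)
response = requests.get(url, timeout=10)
response.raise_for_status()

for image in json.loads(response.text).get('list', []):
    # The same fields the spider copies into Images360Item
    print(image.get('imageid'), image.get('group_title'), image.get('qhimg_url'))
```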
-------------------------------------------------------------------------------- /16-vczh/vczh/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy import Item, Field 4 | 5 | class VczhItem(Item): 6 | table = 'followig' 7 | id = Field() 8 | avatar_url = Field() 9 | name = Field() 10 | gender = Field() 11 | headline = Field() 12 | person_url = Field() 13 | follower_count = Field() 14 | answer_count = Field() 15 | articles_count = Field() -------------------------------------------------------------------------------- /16-vczh/vczh/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from scrapy.cmdline import execute 4 | 5 | execute('scrapy crawl vc'.split()) -------------------------------------------------------------------------------- /16-vczh/vczh/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 4 | from scrapy.utils.response import response_status_message 5 | from fake_useragent import UserAgent 6 | import base64 7 | import logging 8 | 9 | class UAMiddleware(object): 10 | def __init__(self): 11 | self.user_agent = UserAgent().random 12 | 13 | def process_request(self, request, spider): 14 | request.headers['User-Agent'] = self.user_agent 15 | 16 | 17 | class ProxyMiddleware(object): 18 | def __init__(self, proxy_server, proxy_user, proxy_pass): 19 | self.proxy_server = proxy_server 20 | self.proxy_user = proxy_user 21 | self.proxy_pass = proxy_pass 22 | self.proxy_auth = "Basic " + base64.urlsafe_b64encode(bytes((self.proxy_user + ":" + self.proxy_pass), "ascii")).decode("utf8") 23 | self.logger = logging.getLogger(__name__) 24 | 25 | @classmethod 26 | def from_crawler(cls, crawler): 27 | return cls( 28 | proxy_server = crawler.settings.get('PROXY_SERVER'), 29 | proxy_user = crawler.settings.get('PROXY_USER'), 30 | proxy_pass = crawler.settings.get('PROXY_PASS') 31 | ) 32 | 33 | def process_request(self, request, spider): 34 | request.meta["proxy"] = self.proxy_server 35 | request.headers["Proxy-Authorization"] = self.proxy_auth 36 | 37 | def process_response(self, request, response, spider): 38 | try: 39 | spider.crawler.stats.inc_value('normal_response') 40 | except Exception as e: 41 | self.logger.error('Response Error: {}'.format(e.args)) 42 | return response 43 | 44 | def process_exception(self, request, exception, spider): 45 | pass 46 | 47 | class DownloadRetryMiddleware(RetryMiddleware): 48 | def process_response(self, request, response, spider): 49 | if response.status in self.retry_http_codes: 50 | reason = response_status_message(response.status) 51 | return self._retry(request, reason, spider) or response 52 | return response 53 | 54 | def process_exception(self, request, exception, spider): 55 | if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \ 56 | and not request.meta.get('dont_retry', False): 57 | return self._retry(request, exception, spider) -------------------------------------------------------------------------------- /16-vczh/vczh/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from scrapy import Request 4 | from scrapy.exceptions import DropItem 5 | from scrapy.pipelines.images import ImagesPipeline 6 | import pymysql 7 | import logging 8 | 9 | # 统计下载图片的总量 10 | COUNT_IMAGES_NUMS 
= {'IMAGES_NUMS': 0} 11 | 12 | class MysqlPipeline(object): 13 | def __init__(self, host, database, user, password, port): 14 | self.host = host 15 | self.database = database 16 | self.user = user 17 | self.password = password 18 | self.port = port 19 | self.logger = logging.getLogger(__name__) 20 | 21 | @classmethod 22 | def from_crawler(cls, crawler): 23 | return cls( 24 | host=crawler.settings.get('MYSQL_HOST'), 25 | database=crawler.settings.get('MYSQL_DB'), 26 | user=crawler.settings.get('MYSQL_USER'), 27 | password=crawler.settings.get('MYSQL_PASSWORD'), 28 | port=crawler.settings.get('MYSQL_PORT') 29 | ) 30 | 31 | def open_spider(self, spider): 32 | self.db = pymysql.connect(self.host, self.user, self.password, self.database, self.port) 33 | self.cursor = self.db.cursor() 34 | 35 | def process_item(self, item, spider): 36 | data = dict(item) 37 | keys = ', '.join(data.keys()) 38 | values = ', '.join(['%s'] * len(data)) 39 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (item.table, keys, values) 40 | try: 41 | self.cursor.execute(sql, tuple(data.values())) 42 | self.db.commit() 43 | # 设置属性Success_InsertDB并自增1 44 | spider.crawler.stats.inc_value('success_insertdb') 45 | except Exception as e: 46 | self.logger.error('Error: {}'.format(e.args)) 47 | self.db.rollback() 48 | return item 49 | 50 | def close_spider(self, spider): 51 | self.db.close() 52 | 53 | 54 | class ImagePipeline(ImagesPipeline): 55 | 56 | def file_path(self, request, response=None, info=None): 57 | url = request.url 58 | file_name = url.split('/')[-1] 59 | return file_name 60 | 61 | def item_completed(self, results, item, info): 62 | image_paths = [x['path'] for ok, x in results if ok] 63 | if not image_paths: 64 | raise DropItem('Image Downloaded Failed') 65 | else: 66 | COUNT_IMAGES_NUMS['IMAGES_NUMS'] += 1 67 | return item 68 | 69 | def get_media_requests(self, item, info): 70 | yield Request(item['avatar_url']) -------------------------------------------------------------------------------- /16-vczh/vczh/sendemail.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import smtplib 4 | from email.mime.text import MIMEText 5 | 6 | class EmailSender(object): 7 | def __init__(self): 8 | # 发送方smtp服务器 9 | self.smtp_host = 'smtp.163.com' 10 | # 发送方邮箱(同于登录smtp服务器) 11 | self.smtp_user = 'northxw@163.com' 12 | # 授权码 13 | self.smtp_authcode = '123456' 14 | # smtp服务器默认端口465 15 | self.smtp_port = 465 16 | # 发送方邮箱 17 | self.sender = 'northxw@163.com' 18 | 19 | def sendEmail(self, recipient_list, email_subject, body): 20 | """ 21 | 发送邮件 22 | :param recipient_list: 收件人列表 23 | :param email_subject: 邮件主题 24 | :param body: 邮件内容 25 | :return: None 26 | """ 27 | # 邮件内容、格式、编码 28 | message = MIMEText(_text=body, _subtype='plain', _charset='utf-8') 29 | # 发件人 30 | message['From'] = self.sender 31 | # 收件人 32 | message['To'] = ', '.join(recipient_list) 33 | # 主题 34 | message['Subject'] = email_subject 35 | try: 36 | # 实例化SMTP_SSL对象 37 | smtpSSLClient = smtplib.SMTP_SSL(self.smtp_host,self.smtp_port) 38 | # 登录 39 | loginResult = smtpSSLClient.login(self.smtp_user, self.smtp_authcode) 40 | # loginRes = (235, b'Authentication successful') 41 | print("Login Result:LoginRes = {}".format(loginResult)) 42 | 43 | if loginResult and loginResult[0] == 235: 44 | print("Successful login, Code = {}".format(loginResult[0])) 45 | smtpSSLClient.sendmail(self.sender, recipient_list, message.as_string()) 46 | print("Successful delivery. 
Message:{}".format(message.as_string())) 47 | else: 48 | print("Login failed, Code = {}".format(str(loginResult[0]))) 49 | 50 | except Exception as e: 51 | print("Failed to send, Exception: e={}".format(e)) 52 | -------------------------------------------------------------------------------- /16-vczh/vczh/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for vczh project 4 | 5 | import time 6 | 7 | BOT_NAME = 'vczh' 8 | 9 | SPIDER_MODULES = ['vczh.spiders'] 10 | NEWSPIDER_MODULE = 'vczh.spiders' 11 | 12 | # Obey robots.txt rules 13 | ROBOTSTXT_OBEY = False 14 | 15 | # 设置延时0.3秒 16 | DOWNLOAD_DELAY = 0.3 17 | 18 | #SPIDER_MIDDLEWARES = { 19 | # 'vczh.middlewares.VczhSpiderMiddleware': 543, 20 | #} 21 | 22 | DOWNLOADER_MIDDLEWARES = { 23 | 'vczh.middlewares.DownloadRetryMiddleware': 100, 24 | 'vczh.middlewares.UAMiddleware': 543, 25 | 'vczh.middlewares.ProxyMiddleware': 544, 26 | } 27 | 28 | ITEM_PIPELINES = { 29 | 'vczh.pipelines.ImagePipeline': 300, 30 | # 'vczh.pipelines.MongoPipeline': 301, 31 | 'vczh.pipelines.MysqlPipeline': 303, 32 | } 33 | 34 | # 爬取最大页码 35 | MAX_PAGE = 155 36 | 37 | # MYSQL SEETINGS 38 | MYSQL_HOST = 'localhost' 39 | MYSQL_USER = 'root' 40 | MYSQL_PASSWORD = '0513' 41 | MYSQL_DB = 'vczh' 42 | MYSQL_PORT = 3306 43 | 44 | # 代理服务器 45 | PROXY_SERVER = "http://http-dyn.abuyun.com:9020" 46 | # 代理服务器隧道验证信息 47 | PROXY_USER = "HR827T805WJ4667D" 48 | PROXY_PASS = "124D18494FF76D09" 49 | 50 | # 图片存储位置 51 | IMAGES_STORE = './images' 52 | 53 | # LOG名称: 加入时间可以保证每次生成的报告不会重叠,也能清楚的知道报告生成时间 54 | LOG_FILE = './logs/{}.log'.format(str(time.strftime("%Y-%m-%d %H_%M_%S"))) 55 | # LOG编码 56 | # LOG_ENCODING = 'utf-8' 57 | # LOG级别: DEBUG级别最低,如果设置DEBUG,所有的log都会记录,不利于查错 58 | LOG_LEVEL = 'WARNING' 59 | 60 | # 邮件发送者 61 | MAIL_FROM = 'northxw@163.com' 62 | # 邮件服务器 63 | MAIL_HOST = 'smtp.163.com' 64 | # 端口 65 | MAIL_PORT = 25 66 | # 发送者 67 | MAIL_USER = 'northxw@163.com' 68 | # 授权码 69 | MAIL_PASS = 'authcode' 70 | 71 | # 邮件接收者列表 72 | RECEIVE_LIST = ['northxw@gmail.com', 'northxw@qq.com', 'northxw@sina.com'] 73 | # 邮件主题 74 | SUBJECT = '爬虫状态报告' -------------------------------------------------------------------------------- /16-vczh/vczh/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /16-vczh/vczh/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /16-vczh/vczh/spiders/__pycache__/vc.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/spiders/__pycache__/vc.cpython-36.pyc -------------------------------------------------------------------------------- /16-vczh/vczh/spiders/vc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import scrapy 5 | import time 6 | import logging 7 | from urllib.parse import urlencode 8 | from scrapy import Request 9 | from ..items import VczhItem 10 | from scrapy.mail import MailSender 11 | from ..pipelines import COUNT_IMAGES_NUMS 12 | 13 | class VcSpider(scrapy.Spider): 14 | start = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 15 | name = 'vc' 16 | allowed_domains = ['www.zhihu.com'] 17 | base_url = 'https://www.zhihu.com/api/v4/members/excited-vczh/followees?' 18 | logger = logging.getLogger(__name__) 19 | 20 | def start_requests(self): 21 | data = { 22 | 'include': 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics', 23 | 'limit': 20 24 | } 25 | for page in range(1, self.settings.get('MAX_PAGE') + 1): 26 | data['offset'] = page * 20 27 | params = urlencode(data) 28 | url = self.base_url + params 29 | yield Request(url, callback=self.parse, errback=self.error_back) 30 | 31 | def parse(self, response): 32 | result = json.loads(response.text) 33 | for data_ in result.get('data'): 34 | item = VczhItem() 35 | item['id'] = data_.get('id') 36 | item['avatar_url'] = data_.get('avatar_url').replace('_is', '') 37 | item['name'] = data_.get('name') 38 | item['gender'] = data_.get('gender') 39 | item['headline'] = data_.get('headline') 40 | item['person_url'] = data_.get('url'), 41 | item['follower_count'] = data_.get('follower_count') 42 | item['answer_count'] = data_.get('answer_count') 43 | item['articles_count'] = data_.get('articles_count') 44 | yield item 45 | 46 | 47 | def closed(self, reason): 48 | """ 49 | 爬虫关闭发送通知邮件 50 | """ 51 | # 爬虫完成时间 52 | fnished = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 53 | # 创建邮件发送对象 54 | mail = MailSender.from_settings(self.settings) 55 | # 邮件内容 56 | body = "爬虫名称: {}\n\n 开始时间: {}\n\n 请求成功总量:{}\n 图片下载总量:{}\n 数据库存储总量:{}\n\n 结束时间 : {}\n".format( 57 | '知乎轮子哥粉丝爬虫', 58 | str(self.start), 59 | str(self.crawler.stats.get_value("normal_response")), 60 | str(COUNT_IMAGES_NUMS['IMAGES_NUMS']), 61 | str(self.crawler.stats.get_value("success_insertdb")), 62 | str(str(fnished))) 63 | # 发送邮件 64 | mail.send(to=self.settings.get('RECEIVE_LIST'), subject=self.settings.get('SUBJECT'), body=body) 65 | 66 | def error_back(self, e): 67 | _ = self 68 | # 打印错误信息到日志 69 | self.logger.error('Error: {}'.format(e.reason)) -------------------------------------------------------------------------------- /16-vczh/vczh/utils/db_follower.png: 
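The EmailSender helper in 16-vczh/vczh/sendemail.py appears to be a standalone utility — vc.py above sends its report through scrapy.mail.MailSender instead. If you want to call the helper directly, a minimal usage sketch follows; the import path assumes it is run from the vczh/ package directory, the recipient address is a placeholder, and `smtp_authcode` inside the class must first be replaced with a real 163.com authorization code.

```python
# Illustrative use of the EmailSender helper defined in sendemail.py (addresses are placeholders).
from sendemail import EmailSender

sender = EmailSender()
sender.sendEmail(
    recipient_list=['someone@example.com'],
    email_subject='爬虫状态报告',
    body='知乎粉丝爬虫已运行结束。',
)
```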
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/db_follower.png -------------------------------------------------------------------------------- /16-vczh/vczh/utils/email.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/email.png -------------------------------------------------------------------------------- /16-vczh/vczh/utils/followers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/followers.png -------------------------------------------------------------------------------- /16-vczh/vczh/utils/huaji.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/huaji.png -------------------------------------------------------------------------------- /16-vczh/vczh/utils/log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/log.png -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__init__.py -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- 
/17-City_58/City_58/City_58/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class City58XiaoQu(scrapy.Item): 12 | """ 13 | 小区详情页数据 14 | """ 15 | id = scrapy.Field() 16 | name = scrapy.Field() 17 | location = scrapy.Field() 18 | price = scrapy.Field() 19 | address = scrapy.Field() 20 | times = scrapy.Field() 21 | 22 | class City58ItemChuZuInfo(scrapy.Item): 23 | """ 24 | 小区出租房页数据 25 | """ 26 | id = scrapy.Field() # 关联小区信息 27 | name = scrapy.Field() 28 | zu_price = scrapy.Field() 29 | mianji = scrapy.Field() 30 | type = scrapy.Field() 31 | chuzu_price_pre = scrapy.Field() # 每平米的房价 32 | url = scrapy.Field() # 出租房页面的唯一ID 33 | price_pre = scrapy.Field() # 存储每个出租房的每平米房价 -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from scrapy.cmdline import execute 4 | 5 | execute("scrapy crawl 58".split()) -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from .utils.api import get_ip_port 10 | 11 | class ProxyMiddleware(object): 12 | 13 | def process_request(self, request, spider): 14 | # 获取一个优质代理(此处请更换为自己购买的API生成的提取链接) 15 | proxy = get_ip_port('http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=***************69b51b303859ac446&orderno=*********************&returnType=2&count=1') 16 | # 设置代理 17 | request.meta['proxy'] = proxy 18 | 19 | def process_response(self, request, response, spider): 20 | return response 21 | 22 | def process_exception(self, request, exception, spider): 23 | pass 24 | -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from scrapy.exceptions import DropItem 4 | from pymongo import MongoClient 5 | from scrapy.conf import settings 6 | from pymongo.errors import DuplicateKeyError 7 | from traceback import format_exc 8 | from .items import City58XiaoQu, City58ItemChuZuInfo 9 | 10 | 11 | class City58Pipeline(object): 12 | 13 | def __init__(self, mongo_uri, mongo_db): 14 | self.mongo_uri = mongo_uri 15 | self.mongo_db = mongo_db 16 | self.client = None 17 | self.db = None 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | return cls( 22 | mongo_uri=crawler.settings.get('MONGODB_URI'), 23 | mongo_db=settings.get('MONGODB_DATABASE', 'items') 24 | ) 25 | 26 | def open_spider(self, 
spider): 27 | _ = spider 28 | self.client = MongoClient(self.mongo_uri) 29 | self.db = self.client[self.mongo_db] 30 | self.db['city58_info'].ensure_index('id', unique=True) # 在表 city58_info 中建立索引,并保证索引的唯一性 31 | self.db['city58_chuzu_info'].ensure_index('url', unique=True) # 在表 city58_chuzu_info 中建立索引,并保证索引的唯一性 32 | 33 | def close_spider(self, spider): 34 | _ = spider 35 | self.client.close() 36 | 37 | def process_item(self, item, spider): 38 | try: 39 | if isinstance(item, City58XiaoQu): # 判断是否是小区的item 40 | self.db['city58_info'].update({'id': item['id']}, {'$set': item}, upsert=True) # 通过id判断,有就更新,没有就插入 41 | elif isinstance(item, City58ItemChuZuInfo): # 判断是否是小区出租信息的item 42 | try: 43 | fangjia = HandleFangjiaPipline.price_per_square_meter_dict[item['id']] # 把HandleFangjiaPipline管道的字典price_per_square_meter_dict中每平米平均价格赋值给fangjia 44 | # del HandleFangjiaPipline.price_per_square_meter_dict[item['id']] 45 | item['price_pre'] = fangjia 46 | 47 | self.db['city58_chuzu_info'].update({'url': item['url']}, {'$set': item}, upsert=True) # 通过url判断,有就更新,没有就插入 48 | except Exception as e: 49 | print(e) 50 | 51 | except DuplicateKeyError: 52 | spider.logger.debug(' duplicate key error collection') # 唯一键冲突报错 53 | except Exception as e: 54 | _ = e 55 | spider.logger.error(format_exc()) 56 | return item 57 | 58 | 59 | class HandleZuFangPipline(object): 60 | 61 | def process_item(self, item, spider): 62 | _ = spider, self 63 | # self.db[self.collection_name].insert_one(dict(item)) 64 | # 判断进来的item是否是City58ItemXiaoChuZuQuInfo,是否含有面积参数 65 | if isinstance(item, City58ItemChuZuInfo) and 'mianji' in item: 66 | item['chuzu_price_pre'] = int(item['zu_price']) / int(item['mianji']) # 租金除以面积得到平均价格 67 | return item 68 | 69 | 70 | class HandleFangjiaPipline(object): 71 | 72 | price_per_square_meter_dict = dict() 73 | 74 | def process_item(self, item, spider): 75 | _ = spider 76 | 77 | # 判断传进来的item是否是个字典,并且是否含有price_list 78 | if isinstance(item, dict) and 'price_list' in item: 79 | item['price_list'] = [int(i) for i in item['price_list']] 80 | if item['price_list']: 81 | self.price_per_square_meter_dict[item['id']] = sum(item['price_list']) / len(item['price_list']) # 得到每个小区的平均价格 82 | else: 83 | self.price_per_square_meter_dict[item['id']] = 0 84 | raise DropItem() 85 | return item -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for City_58 project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'City_58' 13 | 14 | SPIDER_MODULES = ['City_58.spiders'] 15 | NEWSPIDER_MODULE = 'City_58.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'City_58 (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 0.3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'City_58.middlewares.City58SpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'City_58.middlewares.ProxyMiddleware': 500, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'City_58.pipelines.HandleFangjiaPipline': 300, # 租房平均每平米价格 69 | 'City_58.pipelines.HandleZuFangPipline': 310, 70 | 'City_58.pipelines.City58Pipeline': 320 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 94 | # 58全国城市站点(测试:成都站) 95 | HOST = ['cd'] 96 | 97 | # 不同行政区的编号(测试:天府新区) 98 | AREA_CODE = ['21611'] 99 | 100 | # 数据库配置 101 | MONGODB_HOST = '127.0.0.1' 102 
| MONGODB_PORT = '27017' 103 | MONGODB_URI = 'mongodb://{}:{}'.format(MONGODB_HOST, MONGODB_PORT) 104 | MONGODB_DATABASE = '58' -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/spiders/58.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from scrapy import Request 5 | from traceback import format_exc 6 | from ..items import City58XiaoQu, City58ItemChuZuInfo 7 | from ..utils.parse import parse_xiaoqu, parse_xiaoqu_detail, \ 8 | get_ershoufang_list_page, get_chuzu_detail_page_list_url, get_chuzu_house_info 9 | 10 | class A58Spider(scrapy.Spider): 11 | name = '58' 12 | allowed_domains = ['58.com'] 13 | base_url = 'https://{}.58.com/xiaoqu/{}/' 14 | 15 | def start_requests(self): 16 | # 根据HOST和CODE构造各行政区的小区页面的URL 17 | for host in self.settings.get('HOST'): 18 | for code in self.settings.get('AREA_CODE'): 19 | url = self.base_url.format(host, code) 20 | self.logger.debug(url) 21 | yield Request(url=url, callback=self.parse) 22 | 23 | def parse(self, response): 24 | # 提取小区列表页的URL 25 | xiaoqu_url_list = parse_xiaoqu(response) 26 | for xiaoqu_url in xiaoqu_url_list: 27 | yield Request(xiaoqu_url, callback=self.xiaoqu_detail_page, errback=self.error_back) 28 | 29 | def xiaoqu_detail_page(self, response): 30 | # 提取小区详情页的数据 31 | xiaoqu_detail_data = parse_xiaoqu_detail(response) 32 | item = City58XiaoQu() 33 | item.update(xiaoqu_detail_data) 34 | item['id'] = response.url 35 | self.logger.debug(item) 36 | yield item 37 | 38 | # 二手房页面 39 | ershoufang_url = self.base_url.format(self.settings.get('HOST'), item['id']) + 'ershoufang' # 二手房页面的完整请求链接 40 | yield Request(url=ershoufang_url, callback=self.ershoufang_list_page, 41 | errback=self.error_back, meta={'id': item['id']}) 42 | 43 | # 出租房页面 44 | chuzufang_url = self.base_url.format(self.settings.get('HOST'), item['id']) + 'chuzu' # 出租房页面的完整请求链接 45 | yield Request(url=chuzufang_url, callback=self.chuzufang_detail_page, 46 | errback=self.error_back, meta={'id': item['id']}) 47 | 48 | def ershoufang_list_page(self, response): 49 | # 保持编码规则,在self不使用的情况下接收它 50 | _ = self 51 | # 提取二手房页面的所有房价 52 | price_list = get_ershoufang_list_page(response) 53 | yield {'id': response.item['id'], 'price_list': price_list} # 仅计算该小区的平均房价,不做存储及其他处理 54 | 55 | # 翻页 56 | 57 | def chuzufang_detail_page_url_list(self, response): 58 | # 保持编码规则,在self不使用的情况下接收它 59 | _ = self 60 | # 提取出租房页面的所有详情页链接 61 | chuzufang_detail_url = get_chuzu_detail_page_list_url(response) 62 | for url in chuzufang_detail_url: 63 | yield Request(url=url, callback=self.chuzufang_detail_page, 64 | errback=self.error_back, meta={'id': response.item['id']}) 65 | 66 | # 翻页 67 | 68 | def chuzufang_detail_page(self, response): 69 | # 保持编码规则,在self不使用的情况下接收它 70 | _ = self 71 | # 提取出租房页面的详细数据(注意:当前时间-2018/11/24,目前了解至少从2018年9月份开始该页面已添加字体反爬, 爬取的数据已经做反反爬处理) 72 | chuzufang_data = get_chuzu_house_info(response) 73 | item = City58ItemChuZuInfo() 74 | item.update(chuzufang_data) 75 | item['id'] = response.meta['id'] 76 | item['url'] = response.url 77 | yield item 78 | 79 | def error_back(self, e): 80 | _ = e 81 | # 打印堆栈的错误信息 82 | self.logger.debug(format_exc()) 83 | pass -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the 
documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/spiders/__pycache__/58.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/spiders/__pycache__/58.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__init__.py -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/__pycache__/api.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/api.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/__pycache__/parse.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/parse.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/__pycache__/proxy.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/proxy.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/__pycache__/xdaili.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/xdaili.cpython-36.pyc -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import requests 4 | import json 5 | import time 6 | from fake_useragent import 
UserAgent 7 | import logging 8 | 9 | def get_ip_port(url): 10 | """ 11 | 获取API返回的JSON数据 12 | :param url: 代理API 13 | :return: 有效IP 14 | """ 15 | time.sleep(1) 16 | response = requests.get(url) 17 | response = json.loads(response.text) 18 | result = response['RESULT'] 19 | agent = '' 20 | for i in range(len(result)): 21 | agent = 'https://{}:{}/'.format(result[i]['ip'], result[i]['port']) 22 | logging.debug(agent) 23 | return agent 24 | 25 | if __name__ == '__main__': 26 | # 测试 - 这里我购买了讯代理的"优质代理",通过API生成提取链接来提取ip. 测试有效! 27 | url = '' 28 | agent = get_ip_port(url=url) 29 | -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 轻量级ip代理池, 以89代理网站为例, ip成活率较低, 可做测试使用. 5 | """ 6 | 7 | import requests 8 | from pyquery import PyQuery 9 | from fake_useragent import UserAgent 10 | import random 11 | 12 | def get_ip_port(page): 13 | """ 14 | 获取网页的ip和port 15 | :param page: 页码 16 | :return: 随机ip 17 | """ 18 | # 请求头(根据需要另行设置) 19 | headers = dict() 20 | # 代理池 21 | agents = list() 22 | for i in range(page): 23 | url = 'http://www.89ip.cn/index_{}.html'.format(i+1) # 格式化请求链接 24 | response = requests.get(url) # 获取网页内容 25 | 26 | if response.status_code == 200: 27 | jpy = PyQuery(response.text) 28 | tr_list = jpy('div.layui-form > table > tbody > tr').items() 29 | for tr in tr_list: 30 | ip = tr('td:nth-child(1)').text() 31 | port = tr('td:nth-child(2)').text() 32 | agent = 'http://{}:{}'.format(ip, port) # 格式化ip,port 33 | agents.append(agent) # 添加至代理池 34 | else: 35 | print('The status code is {},Try again! '.format(response.status_code)) 36 | 37 | # 检测有效ip代理,随机返回使用 38 | return random.choices(test_agent(agents))[0] 39 | 40 | def test_agent(agents): 41 | """ 42 | 针对58同城测试获取的免费代理 43 | :param agents: 代理池 44 | :return: 有效的代理 45 | """ 46 | agents_copy = agents 47 | for agent in agents_copy: 48 | try: 49 | res = requests.get('https://cd.58.com/', proxy=agent) 50 | except Exception as e: 51 | agents.remove(agent) 52 | continue 53 | return agents 54 | 55 | if __name__ == '__main__': 56 | print(get_ip_port(random.randint(2, 4))) -------------------------------------------------------------------------------- /17-City_58/City_58/City_58/utils/xdaili.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 提示:讯代理的Python3接入文档需要稍作修改,方能使用. 5 | """ 6 | 7 | import sys 8 | import time 9 | import hashlib 10 | import requests 11 | # import grequests 12 | from lxml import etree 13 | 14 | class Xdaili(object): 15 | def __init__(self): 16 | # 请将此处的订单号和个人密钥修改为你自己的. 
17 | self.orderno = 'ZF201812********************' 18 | self.secret = 'ddde303a6*******************' 19 | self.ip = "forward.xdaili.cn" 20 | self.port = '80' 21 | self.ip_port = self.ip + ":" + self.port 22 | 23 | def proxy(self): 24 | # 时间戳 25 | timestamp = str(int(time.time())) 26 | # 签名算法参数 27 | string = "orderno=" + self.orderno + "," + "secret=" + self.secret + "," + "timestamp=" + timestamp 28 | # Python3需要编码 29 | string = string.encode() 30 | # 计算sign 31 | md5_string = hashlib.md5(string).hexdigest() 32 | # 转大写 33 | sign = md5_string.upper() 34 | # auth 35 | auth = "sign=" + sign + "&" + "orderno=" + self.orderno + "&" + "timestamp=" + timestamp 36 | proxy = { 37 | "http": "http://" + self.ip_port, 38 | "https": "https://" + self.ip_port 39 | } 40 | return [auth, proxy] 41 | -------------------------------------------------------------------------------- /17-City_58/City_58/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = City_58.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = City_58 12 | -------------------------------------------------------------------------------- /17-City_58/README.md: -------------------------------------------------------------------------------- 1 | ## Scrapy 58 City 2 |   Scrapy实战项目 - 使用Scrapy框架抓取58同城的房屋信息,并将数据存储至MongoDB。 3 | 4 | ## Tip 5 |   本次实践代码的综合性较高, 建议有选择性的尝试。 对于代码中的疑惑点, 可随时提交问题或邮箱联系。Good Luck! 6 | 7 | ## Demo 8 | ![数据库截屏](https://github.com/Northxw/Python3_WebSpider/blob/master/17-City_58/screenshot/monogdb.jpg) 9 | -------------------------------------------------------------------------------- /17-City_58/screenshot/monogdb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/screenshot/monogdb.jpg -------------------------------------------------------------------------------- /17-City_58/screenshot/run_01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/screenshot/run_01.jpg -------------------------------------------------------------------------------- /17-City_58/screenshot/run_02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/screenshot/run_02.jpg -------------------------------------------------------------------------------- /18-36kr/.idea/36kr.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /18-36kr/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /18-36kr/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 18 | -------------------------------------------------------------------------------- /18-36kr/.idea/misc.xml: 
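Looking back at 17-City_58/City_58/City_58/utils/xdaili.py above: `Xdaili.proxy()` returns an auth string plus a proxies mapping for 讯代理's forwarding endpoint. A minimal usage sketch with requests follows — the order number and secret inside the class are placeholders, the target URL is only an example, and passing the signature via a `Proxy-Authorization` header mirrors how the tunnelling proxies elsewhere in this repository are authenticated.

```python
# Illustrative use of the Xdaili helper (credentials inside the class are placeholders).
# Import path assumes the script is run from the City_58/City_58/utils/ directory.
import requests

from xdaili import Xdaili

auth, proxy = Xdaili().proxy()
headers = {'Proxy-Authorization': auth}
# A plain-HTTP target keeps the sketch simple; for HTTPS the auth header has to reach the
# proxy's CONNECT request, which may need extra handling depending on the requests version.
response = requests.get('http://www.58.com/', headers=headers, proxies=proxy, timeout=10)
print(response.status_code)
```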
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /18-36kr/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /18-36kr/36kr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created at 21:04 at March 12,2019 5 | @title: 爬取36kr的最新文章信息并保存至Mysql数据库 6 | @author: Northxw 7 | """ 8 | 9 | from tqdm import tqdm 10 | from colorama import init, Fore 11 | from icon.word import show 12 | from fake_useragent import UserAgent 13 | from requests.exceptions import RequestException 14 | import requests 15 | import pymysql 16 | import time 17 | import re 18 | 19 | init(autoreset=True) 20 | 21 | def connect_db(): 22 | """ 23 | 连接Mysql数据库 24 | :return: db 25 | """ 26 | db = pymysql.connect(host='localhost', user='root', password='******', port=3306, db='36kr') 27 | # print('数据库连接成功!') 28 | return db 29 | 30 | def get_one_page(page): 31 | """ 32 | 获取一页的最新文章JSON数据 33 | :param page: 页码 34 | :return: json 35 | """ 36 | # 真实请求 37 | url = 'https://36kr.com/api/search-column/mainsite?per_page=20&page={}'.format(str(page)) 38 | # 设置Headers 39 | headers = { 40 | 'User-Agent': UserAgent().random, 41 | 'Referer': 'https://36kr.com/', 42 | 'Host': '36kr.com' 43 | } 44 | # 获取网页源代码 45 | try: 46 | response = requests.get(url, headers=headers) 47 | if response.status_code == 200: 48 | items = response.json()['data']['items'] 49 | return items 50 | return None 51 | except RequestException: 52 | return None 53 | 54 | def parse_one_page(items): 55 | """ 56 | 解析获取的JSON数据 57 | :param items: 获取的JSON数据段items 58 | :return: dict 59 | """ 60 | # 存储单页总数据 61 | datas = list() 62 | for item in items: 63 | data= { 64 | # 文章ID 65 | 'id': str(item['id']), 66 | # 标题 67 | 'title': item['title'], 68 | # 类别 69 | 'column_name': item['column_name'], 70 | # id 71 | 'column_id': item['column_id'], 72 | # 封面图片链接 73 | 'cover': item['cover'], 74 | # 发布时间 75 | 'publish_time': item['published_at'] , 76 | # 文章总结 77 | 'summary': item['summary'] 78 | } 79 | # 处理时间 80 | data['publish_time'] = re.search('(.*?)T(.*?)\+.*', data['publish_time']).group(1) + ' ' + re.search('(.*?)T(.*?)\+.*', data['publish_time']).group(2) 81 | # 存储 82 | datas.append(data) 83 | # 将标题写入文件.制作中文词云 84 | with open('./icon/36kr.txt', 'a', encoding='utf-8') as f: 85 | f.write(data['title']) 86 | return datas 87 | 88 | def save_to_mysql(datas): 89 | """ 90 | 将解析数据存储到Mysql数据库 91 | :param item: 获取的单页有效数据 92 | :return: None 93 | """ 94 | # 连接数据库 95 | db = connect_db() 96 | # 获得Mysql操作指针 97 | cursor = db.cursor() 98 | # sql 99 | sql = "INSERT INTO kr(id, article_title, colum_name, colum_id, cover, publish_time, summary) " \ 100 | "VALUES(%s, %s, %s, %s, %s, %s, %s)" 101 | for _item in datas: 102 | try: 103 | # 插入数据 104 | cursor.execute(sql, (_item['id'], _item['title'], _item['column_name'], 105 | _item['column_id'], _item['cover'], _item['publish_time'], _item['summary'])) 106 | # 提交 107 | db.commit() 108 | # print('数据插入成功!') 109 | except Exception as e: 110 | # print('数据插入失败!',e) 111 | db.rollback() 112 | # 关闭数据库连接 113 | db.close() 114 | 115 | def main(): 116 | """ 117 | 主函数 118 | :return: None 119 | """ 120 | print(Fore.RED + '提示:截止目前的总数据量是77998条, 测试仅抓取前10页的共200条数据!\n') 121 | for i in tqdm(range(10), desc='抓取进度'): 122 | 
# 获取 123 | items = get_one_page(i+1) 124 | # 解析 125 | data = parse_one_page(items) 126 | # 保存 127 | save_to_mysql(data) 128 | time.sleep(1) 129 | 130 | if __name__ == '__main__': 131 | main() 132 | -------------------------------------------------------------------------------- /18-36kr/README.md: -------------------------------------------------------------------------------- 1 | ## Spider 36kr 2 |   爬取36氪的最新文章信息并存储至Mysql、制作中文词云图, 爬取内容包含文章ID, 标题,封面图片链接,发布时间,类别名称等。 3 | 4 | ## Explain 5 |   首先,确定36氪的新闻信息是通过Js加载;然后,打开谷歌浏览器开发者工具选择NetWork寻找真实请求的URL;最后,编写Code爬取文章信息。 6 | 7 |   注意:真实请求URL最后的数字参数是时间戳,去掉后可正常获取网页内容。 8 | 9 | ## Demo 10 | ![db](https://github.com/Northxw/Python3_WebSpider/blob/master/18-36kr/utils/db.png) 11 |   12 | ![wordcloud](https://github.com/Northxw/Python3_WebSpider/blob/master/18-36kr/utils/cloud.jpg) 13 | -------------------------------------------------------------------------------- /18-36kr/utils/FZSTK.TTF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/FZSTK.TTF -------------------------------------------------------------------------------- /18-36kr/utils/__pycache__/word.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/__pycache__/word.cpython-36.pyc -------------------------------------------------------------------------------- /18-36kr/utils/cloud.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/cloud.jpg -------------------------------------------------------------------------------- /18-36kr/utils/db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/db.png -------------------------------------------------------------------------------- /18-36kr/utils/show.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/show.jpg -------------------------------------------------------------------------------- /18-36kr/utils/word.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from wordcloud import WordCloud 4 | import cv2 5 | import jieba 6 | import matplotlib.pyplot as plt 7 | 8 | def show(): 9 | """ 10 | 根据文章标题,制作中文词云 11 | :return: None 12 | """ 13 | # 文本 14 | with open('36kr.txt', 'r', encoding='utf-8') as f: 15 | text = f.read() 16 | cut_text = " ".join(jieba.cut(text)) 17 | color_mask = cv2.imread('show.jpg') 18 | cloud = WordCloud( 19 | # 设置字体,不指定就会出现乱码 20 | font_path = "./FZSTK.TTF", 21 | # 设置背景色 22 | background_color = 'white', 23 | # 词云形状 24 | mask = color_mask, 25 | # 允许最大词汇 26 | max_words = 2000, 27 | # 最大号字体 28 | max_font_size = 40 29 | ) 30 | wCloud = cloud.generate(cut_text) 31 | wCloud.to_file('cloud.jpg') 32 | 33 | plt.imshow(wCloud, interpolation='bilinear') 34 | plt.axis('off') 35 | plt.show() 36 | 37 | if __name__ == '__main__': 38 | show() -------------------------------------------------------------------------------- 
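save_to_mysql() in 18-36kr/36kr.py expects a `kr` table inside a `36kr` database, but the repository does not ship the schema. Below is a plausible CREATE TABLE sketch: the column types and lengths are assumptions, the column names keep the original spellings (`colum_name`, `colum_id`) because the INSERT statement depends on them, and the password is the same placeholder used in the script.

```python
# Assumed schema for the `kr` table used by save_to_mysql() -- types and lengths are guesses.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS kr (
    id            VARCHAR(20) PRIMARY KEY,
    article_title VARCHAR(255),
    colum_name    VARCHAR(64),
    colum_id      VARCHAR(20),
    cover         VARCHAR(512),
    publish_time  DATETIME,
    summary       TEXT
)
"""

db = pymysql.connect(host='localhost', user='root', password='******', port=3306, db='36kr')
with db.cursor() as cursor:
    cursor.execute(DDL)
db.commit()
db.close()
```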
/19-Youku_DanMu/.idea/Youku_DanMu.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /19-Youku_DanMu/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /19-Youku_DanMu/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /19-Youku_DanMu/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /19-Youku_DanMu/README.md: -------------------------------------------------------------------------------- 1 | ## Youku DanMu 2 |   **弹幕爬取01** - 网页版优酷视频《我不是药神》的弹幕数据并制作词云图。 3 | 4 | ## Explain 5 |   首先,播放影片并打开Chrome开发者工具,选择Network。逐步拖动进度条并观察本地与服务器的请求规律,如图: 6 | ![danmu_request_url_png](https://github.com/Northxw/Python3_WebSpider/blob/master/19-Youku_DanMu/utils/require/danmu_json.png) 7 | 8 |   然后,确定弹幕数据来自JS实时加载而非XHR。需要注意的是,弹幕的请求数据不是规范的JSON格式。如图: 9 | ![danmu_json_content](https://github.com/Northxw/Python3_WebSpider/blob/master/19-Youku_DanMu/utils/require/danmu_content.png) 10 | 11 | ## Other 12 | 1. 请求链接的最后一个参数类似时间戳,去掉后不会影响数据的获取。 13 | 2. 不要使用urllib.parse.urlencode()函数构造GET请求的链接,否则获取的数据为空,亲测。 14 | 15 | ## Demo 16 | ![wordcloud](https://github.com/Northxw/Python3_WebSpider/blob/master/19-Youku_DanMu/utils/cloud.jpg) 17 | 18 |   从词云图可以看出,"会员、电影票、五星力荐、王传君、癌症..."等关键字最为突出。 19 | -------------------------------------------------------------------------------- /19-Youku_DanMu/danmu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Created at 22:08 at March 13,2019 5 | @title: 爬取优酷《我不是药神仙》弹幕数据并制作词云 6 | @author: Northxw 7 | """ 8 | 9 | from fake_useragent import UserAgent 10 | from requests.exceptions import RequestException 11 | from tqdm import tqdm 12 | import requests 13 | import time 14 | import os 15 | import re 16 | 17 | def get_data(mat): 18 | """ 19 | 循环遍历爬取弹幕数据 20 | :param mat: 偏移量 21 | :return: list 22 | """ 23 | # 请求链接 24 | url = 'https://service.danmu.youku.com/list?jsoncallback=jQuery111207035726936412456_1552483671572&mat={}&mcount=1&ct=1001&iid=959955945&aid=333822&cid=96&lid=0&ouid=0'.format(mat) 25 | # headers 26 | headers = { 27 | 'Referer': 'https://v.youku.com/v_show/id_XMzgzOTgyMzc4MA==.html?spm=a2h0k.11417342.soresults.dplaybutton&s=c6c62a475a5d4a14ab48', 28 | 'User-Agent': UserAgent().random 29 | } 30 | """ 31 | # 参数 32 | params = { 33 | 'jsoncallback': 'jQuery11120003560802190473389_1552479833762', 34 | 'mat': mat, 35 | 'mcount': '1', 36 | 'ct': '1001', 37 | 'id': '959955945', 38 | 'aid': '333822', 39 | 'cid': '96', 40 | 'lid': '0', 41 | 'ouid': '0' 42 | # '_': '1552479833815' 提示:类似时间戳,去掉后不影响数据的获取 43 | } 44 | """ 45 | # 获取弹幕 46 | try: 47 | response = requests.get(url, headers=headers) 48 | print(response) 49 | if response.status_code == 200: 50 | html = response.text 51 | # 正则解析(结果为list类型) 52 | results = re.findall(',\"content\":\"(.*?)\",', html, re.S) 53 | # 文本存储 54 | save_dir = './utils/danmu.txt' 55 | if not os.path.exists(save_dir): # Determine whether storage path exists, no creation 56 | 
os.makedirs(os.path.dirname(save_dir), exist_ok=True)  # ensure the ./utils directory exists before writing the txt file 57 | with open(save_dir, 'a', encoding='utf-8') as f: 58 | f.write(str(results)) 59 | return results 60 | return None 61 | except RequestException as e: 62 | print('Error: ', e.args) 63 | return None 64 | 65 | if __name__ == '__main__': 66 | for i in tqdm(range(10), desc='Progress'): 67 | time.sleep(1) 68 | get_data(str(i)) 69 | -------------------------------------------------------------------------------- /19-Youku_DanMu/utils/FZSTK.TTF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/FZSTK.TTF -------------------------------------------------------------------------------- /19-Youku_DanMu/utils/cloud.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/cloud.jpg -------------------------------------------------------------------------------- /19-Youku_DanMu/utils/require/danmu_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/require/danmu_content.png -------------------------------------------------------------------------------- /19-Youku_DanMu/utils/require/danmu_json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/require/danmu_json.png -------------------------------------------------------------------------------- /19-Youku_DanMu/utils/show.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/show.jpg -------------------------------------------------------------------------------- /19-Youku_DanMu/utils/word.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from wordcloud import WordCloud 4 | import cv2 5 | import jieba 6 | import matplotlib.pyplot as plt 7 | 8 | def show(): 9 | # 文本 10 | with open('danmu.txt', 'r', encoding='utf-8') as f: 11 | text = f.read() 12 | cut_text = " ".join(jieba.cut(text)) 13 | color_mask = cv2.imread('show.jpg') 14 | cloud = WordCloud( 15 | # 设置字体,不指定就会出现乱码 16 | font_path = "./FZSTK.TTF", 17 | # 设置背景色 18 | background_color = 'white', 19 | # 词云形状 20 | mask = color_mask, 21 | # 允许最大词汇 22 | max_words = 2000, 23 | # 最大号字体 24 | max_font_size = 40 25 | ) 26 | wCloud = cloud.generate(cut_text) 27 | wCloud.to_file('cloud.jpg') 28 | plt.imshow(wCloud, interpolation='bilinear') 29 | plt.axis('off') 30 | plt.show() 31 | 32 | if __name__ == '__main__': 33 | show() -------------------------------------------------------------------------------- /20-Selenium_163/.idea/20-Selenium_163Email.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /20-Selenium_163/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /20-Selenium_163/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 13 | -------------------------------------------------------------------------------- /20-Selenium_163/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /20-Selenium_163/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /20-Selenium_163/README.md: -------------------------------------------------------------------------------- 1 | # Project name 2 |   模拟登录网易163邮箱并发送SOS邮件。 3 | 4 | # Sort 5 | - **iframe子页面处理** - 通过网易163邮箱的多iframe特点训练对子页面的处理。 6 | 7 | - **模拟登陆** - Selenium 8 | 9 | # Install 10 | **1. Selenium** - 建议使用低版本的Python-Selenium库,因为高版本在Chrome中不支持。 11 | ``` 12 | pip3 install selenium==2.48.0 13 | ``` 14 | **2. chromedriver.exe** - 下载地址:http://npm.taobao.org/mirrors/chromedriver/, 版本要匹配。将 .exe 程序放在"..Python\Python36\Scripts"目录下。 15 | 16 | **3. pymysql** 17 | ``` 18 | pip3 install pymysql 19 | ``` 20 | 21 | # Process analysis 22 | **1.登录界面iframe** 23 |   iframe的id值添加了时间戳,直接获取相对麻烦。可通过XPATH或CSS选择器获取该节点。如图: 24 | 25 | 26 | ![login_frame](https://github.com/Northxw/Python3_WebSpider/blob/master/20-Selenium_163/require/login_frame.png) 27 | 28 | **2. "写信"节点** 29 |   写信节点的元素定位li节点, 不要定位span子节点,否则获取不到。另外,如果是获取APP节点,可以选择小一级的。 30 | 31 | **3. 邮件主题** 32 |   主题节点不可交互,无法输入文字,这里选择不设置。 33 | 34 | **4. 邮件内容** 35 |   邮件内容的文本输入框处于iframe中,输入文本前需要切换frame,可直接通过class获取并切换。如图: 36 | 37 | ![content_frame](https://github.com/Northxw/Python3_WebSpider/blob/master/20-Selenium_163/require/content_frame.png) 38 | 39 | **5. "发送"节点** 40 |   由于输入邮件内容时切换至子页面,在点击发送前需要切换到父级Frame。 41 | 42 | **6. 
登录限制** 43 |   不要频繁使用Selenium, 否则会出现点触式验证。当然,完全可以破解。但是,网易相对友好,短时间过后便可恢复正常访问,也不会ban IP。 44 | 45 | # Other 46 |   代码注释部分为保留功能:获取所有邮件的有效信息(发件人、收件时间、邮件内容概要),并保存至数据库。由于节点采集遇到问题,所以暂时注释保留。 47 | 48 | # Demo 49 | ![demo](https://github.com/Northxw/Python3_WebSpider/blob/master/20-Selenium_163/require/demo.gif) 50 | -------------------------------------------------------------------------------- /20-Selenium_163/require/content_frame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/require/content_frame.png -------------------------------------------------------------------------------- /20-Selenium_163/require/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/require/demo.gif -------------------------------------------------------------------------------- /20-Selenium_163/require/login_frame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/require/login_frame.png -------------------------------------------------------------------------------- /20-Selenium_163/utils/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/utils/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /20-Selenium_163/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # 登陆页面URL 4 | URL = 'https://mail.163.com/' 5 | 6 | 7 | # 邮箱账号 8 | MAIL_USER = 'northxw' 9 | # 邮箱密码[更换为你的密码] 10 | MAIL_PASS = '******' 11 | 12 | 13 | # 收件人邮箱账号[更换为你想发送的收件人] 14 | RECIPIENT = '******' 15 | # 内容 16 | CONTENT = '6的二进制 !!!' 
17 | 18 | """ 19 | # localhost 20 | MYSQL_LOCALHOST = 'localhost' 21 | # 用户 22 | MYSQL_USER = 'root' 23 | # 密码 24 | MYSQL_PASS = '0513' 25 | # 端口 26 | MYSQL_PORT = 3306 27 | # 数据库 28 | MYSQL_DB = 'mail' 29 | """ 30 | 31 | # 间隔时间 32 | TIME_OUT = 10 33 | 34 | -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/.idea/DouYin.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/README.md: -------------------------------------------------------------------------------- 1 | # Project Name 2 |   Appium、Mitmdump爬取抖音短视频。 3 | 4 | # Sort 5 |   **自动化爬取APP数据** - 基于 Appium + Mitmdump 的自动化爬取程序。 6 | 7 | # Install 8 |   请移步:[Environmental_installation](https://github.com/Northxw/Python3_WebSpider/blob/master/05-Moments/Readme.md) 9 | 10 | # Explain 11 | ### 1. 不登录抖音账号 12 |   若选择登录抖音账号,第一个问题是无法自动化获取短信验证码,第二个问题是填写短信验证码后会出现点触式图形验证码,如图: 13 | 14 | ![yanzhengma](https://github.com/Northxw/Python3_WebSpider/blob/master/21-AutoCrawl_DouYin/plates/%E5%9B%BE%E5%BD%A2%E7%82%B9%E8%A7%A6%E9%AA%8C%E8%AF%81%E7%A0%81.png) 15 | 16 | ### 2.跳过"滑动查看更多" 17 |   自动化打开抖音APP后会出现"滑动查看更多", 须通过获取点击位置跳过该页面,如图: 18 | 19 | ![scroll_and_more](https://github.com/Northxw/Python3_WebSpider/blob/master/21-AutoCrawl_DouYin/plates/start.png) 20 | 21 | ### 3. 视频请求接口 22 |   抖音视频的接口较多,有的包含较多广告,有的全是短视频,这里选择全部获取,构造共16个URL,代码如下: 23 | ```Python 24 | nums = [1,3,6,9] 25 | for num in nums: 26 | url_first = 'http://v{}-dy.ixigua.com/'.format(str(num)) 27 | url_second = 'http://v{}-dy-x.ixigua.com'.format(str(num)) 28 | url_third = 'http://v{}-dy-z.ixigua.com'.format(str(num)) 29 | url_fouth = 'http://v{}-dy-y.ixigua.com'.format(str(num)) 30 | urls.extend([url_first, url_second, url_third, url_fouth]) 31 | ``` 32 | 33 | ### 4. 视频文件名称 34 |   取视频URL中的唯一值作为保存视频的名称,如图: 35 | 36 | ![file_name](https://github.com/Northxw/Python3_WebSpider/blob/master/21-AutoCrawl_DouYin/plates/video_url.png) 37 | 38 | # Other 39 |   自动化爬取抖音短视频只能下载视频,而不能获取视频的其他有效信息,就好比有些网站必须登录之后才能获取数据是一样的。 40 | 41 | # Demo 42 | #### 1. GIF-Download_Video 43 | ![download](https://github.com/Northxw/Python3_WebSpider/blob/master/21-AutoCrawl_DouYin/plates/douyin_demo.gif) 44 | 45 | #### 2. 
GIF-Crawl_Video 46 | ![crawl](https://github.com/Northxw/Python3_WebSpider/blob/master/21-AutoCrawl_DouYin/plates/demo.gif) 47 | -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/__pycache__/scripts.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/__pycache__/scripts.cpython-36.pyc -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/actions.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from appium import webdriver 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.common.by import By 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from time import sleep 9 | from config import * 10 | import time 11 | 12 | class DouYin(object): 13 | def __init__(self): 14 | """ 15 | 初始化 16 | """ 17 | # 配置启动APP的参数 18 | self.desired_caps = { 19 | 'platformName': PLATFORM, 20 | 'deviceName': DEVICE_NAME, 21 | 'appPackage': APP_PACKAGE, 22 | 'appActivity': APP_ACTICITY 23 | } 24 | self.driver = webdriver.Remote(APPIUM_SERVER, self.desired_caps) 25 | self.wait = WebDriverWait(self.driver, TIME_OUT) 26 | 27 | def open(self): 28 | """ 29 | 打开抖音APP 30 | """ 31 | time.sleep(5) 32 | # 跳过"滑动查看更多"界面 33 | unknown = self.wait.until( 34 | EC.presence_of_element_located((By.XPATH, '//*[@class="android.widget.FrameLayout"]'))) 35 | unknown.click() 36 | """ 37 | try: 38 | # 出现抖音"用户隐私政策概要"界面后,选择"仅浏览" 39 | yes = self.wait.until(EC.element_to_be_clickable((By.ID, 'com.ss.android.ugc.aweme:id/mw'))) 40 | yes.click() 41 | except NoSuchElementException as e: 42 | pass 43 | # 跳过"滑动查看更多"界面 44 | unknown = self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="android.widget.FrameLayout"]'))) 45 | unknown.click() 46 | """ 47 | 48 | def scroll(self): 49 | """ 50 | 滑动 51 | """ 52 | while True: 53 | # 上滑刷新 54 | self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y) 55 | sleep(SCROLL_SLEEP_TIME) 56 | 57 | def main(self): 58 | self.open() 59 | self.scroll() 60 | 61 | if __name__ == '__main__': 62 | douyin = DouYin() 63 | douyin.main() -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # Appium 服务接口 4 | APPIUM_SERVER = 'http://localhost:4723/wd/hub' 5 | 6 | 7 | # 设备类型 8 | DEVICE_NAME = 'vivo_X7' 9 | # 设备类型(安卓或IOS) 10 | PLATFORM = 'Android' 11 | # APP包名 12 | APP_PACKAGE = 'com.ss.android.ugc.aweme' 13 | # 入口类型 14 | APP_ACTICITY = '.main.MainActivity' 15 | 16 | 17 | # 元素加载时间 18 | TIME_OUT = 300 19 | 20 | # 滑动点 21 | FLICK_START_X = 300 22 | FLICK_START_Y = 300 23 | FLICK_DISTANCE = 900 24 | 25 | # 滑动间隔时间 26 | SCROLL_SLEEP_TIME = 5 
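# How actions.py uses the constants above: scroll() swipes upward from
# (FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE) to (FLICK_START_X, FLICK_START_Y),
# then waits SCROLL_SLEEP_TIME seconds before the next swipe, so the feed keeps
# refreshing while mitmdump saves the captured video responses.
# The coordinates are device-dependent and should be tuned to the phone's resolution.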
-------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/plates/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/demo.gif -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/plates/douyin_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/douyin_demo.gif -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/plates/start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/start.png -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/plates/video_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/video_name.png -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/plates/video_url.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/video_url.png -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/plates/图形点触验证码.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/图形点触验证码.png -------------------------------------------------------------------------------- /21-AutoCrawl_DouYin/scripts.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Crated at 07:34 at March 20,2019 5 | @title: 使用Appium + Mitmdump 自动化爬取抖音视频 6 | @author: Northxw 7 | """ 8 | 9 | import requests 10 | import os 11 | 12 | def response(flow): 13 | """ 14 | 爬取抖音短视频 15 | """ 16 | urls = list() 17 | # 抖音短视频接口 18 | nums = [1,3,6,9] 19 | for num in nums: 20 | url_first = 'http://v{}-dy.ixigua.com/'.format(str(num)) 21 | url_second = 'http://v{}-dy-x.ixigua.com'.format(str(num)) 22 | url_third = 'http://v{}-dy-z.ixigua.com'.format(str(num)) 23 | url_fouth = 'http://v{}-dy-y.ixigua.com'.format(str(num)) 24 | urls.extend([url_first, url_second, url_third, url_fouth]) 25 | 26 | for url in urls: 27 | if flow.request.url.startswith(url): 28 | # 取URL中取值唯一的部分作为文件名称 29 | video_name = flow.request.url.split('/')[3] 30 | # 获取视频的二进制内容 31 | content = requests.get(flow.request.url, stream=True).content 32 | # 判断文件路径是否存在 33 | save_dir = './video' 34 | if not os.path.exists(save_dir): 35 | os.mkdir(save_dir) 36 | # 视频存储路径 37 | save_dir = '{}/{}.mp4'.format(save_dir, video_name) 38 | 39 | # 存储 40 | with open(save_dir, 'wb') as f: 41 | f.write(content) -------------------------------------------------------------------------------- /22-Stackoverflow/.idea/encodings.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /22-Stackoverflow/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /22-Stackoverflow/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /22-Stackoverflow/.idea/stackoverflow.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /22-Stackoverflow/README.md: -------------------------------------------------------------------------------- 1 | # Spider Stackoverflow 2 |   爬取 **Stackoverflow** 前1000个问题的相关信息。 3 | 4 | # Sort 5 |   **Scrapy** - 爬取外网数据。 6 | 7 | # Explain 8 | ## 1. 设置 "ROBOTSTXT_OBEY = True" 9 |   如果你没有某墙软件,建议遵循爬虫协议,否则会被强制切断请求。在此基础上,设置 **DOWNLOAD_DELAY** 爬取时间间隔, 访问不要过于频繁。 10 | 11 | ## 2. 建议设置"佛跳墙" 12 |   经测,设置某墙后,可以在不设爬取时延的状态下,更快更高效的获取数据。如果某强是客户端软件,在 requests 超过TIMEOUT时切换节点可继续获取数据。 13 | 14 | ## 3. UAMiddleware、ProxyMiddleware 15 |   此外,添加随机UA中间件以及代理中间件(由于本机有佛跳墙的客户端软件,所以没有开启代理中间件)。 16 | ```Python 17 | from fake_useragent import UserAgent 18 | 19 | class UAMiddleware(object): 20 | def __init__(self): 21 | self.user_agent = UserAgent().random 22 | 23 | def process_request(self, request, spider): 24 | request.headers['User-Agent'] = self.user_agent 25 | ``` 26 | 27 | ## 4.爬取思路 28 | - **start_requests()** 初始化前100页链接 29 | - 爬取每页问题的详情页链接 30 | - 爬取问题详情页的标题、投票数、正文、标签等信息 31 | - 管道清洗后存入MonogoDB 32 | 33 |   注意:**Reqeust()** 过程产生的异常,由error_back()函数接收并在控制台打印错误信息;爬取问题详情页由于部分问题没有code,所以返回None。数据库管道如下: 34 | ```Python 35 | import pymongo 36 | 37 | class MongoPipeline(object): 38 | def __init__(self, mongo_url, mongo_db): 39 | self.mongo_url = mongo_url 40 | self.mongo_db = mongo_db 41 | 42 | @classmethod 43 | def from_crawler(cls, crawler): 44 | return cls( 45 | mongo_url=crawler.settings.get('MONGO_INIT_URL'), 46 | mongo_db=crawler.settings.get('MONGO_DB') 47 | ) 48 | 49 | def open_spider(self, spider): 50 | self.client = pymongo.MongoClient(self.mongo_url) 51 | self.db = self.client[self.mongo_db] 52 | 53 | def process_item(self, item, spider): 54 | self.db[item.table].insert(dict(item)) 55 | return item 56 | 57 | def close_spider(self, spider): 58 | self.client.close() 59 | ``` 60 | 61 | # Other 62 |   ??? 
63 | 64 | # Result 65 | 66 | ![db](https://github.com/Northxw/Python3_WebSpider/blob/master/22-Stackoverflow/stackoverflow/utils/db.png) 67 | -------------------------------------------------------------------------------- /22-Stackoverflow/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = stackoverflow.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = stackoverflow 12 | -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__init__.py -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | class StackoverflowItem(Item): 11 | table = 'stackoverflow' 12 | link = Field() 13 | title = Field() 14 | votes = Field() 15 | 
body = Field() 16 | tags = Field() 17 | 18 | 19 | -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/main.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from scrapy.cmdline import execute 4 | 5 | execute('scrapy crawl stack'.split()) -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from fake_useragent import UserAgent 4 | import base64 5 | 6 | class UAMiddleware(object): 7 | def __init__(self): 8 | self.user_agent = UserAgent().random 9 | 10 | def process_request(self, request, spider): 11 | request.headers['User-Agent'] = self.user_agent 12 | 13 | class ProxyMiddleware(object): 14 | def __init__(self, proxy_server, proxy_user, proxy_pass): 15 | self.proxy_server = proxy_server 16 | self.proxy_user = proxy_user 17 | self.proxy_pass = proxy_pass 18 | self.proxy_auth = "Basic " + base64.urlsafe_b64encode(bytes((self.proxy_user + ":" + self.proxy_pass), "ascii")).decode("utf8") 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls( 23 | proxy_server = crawler.settings.get('PROXY_SERVER'), 24 | proxy_user = crawler.settings.get('PROXY_USER'), 25 | proxy_pass = crawler.settings.get('PROXY_PASS') 26 | ) 27 | 28 | def process_request(self, request, spider): 29 | request.meta["proxy"] = self.proxy_server 30 | request.headers["Proxy-Authorization"] = self.proxy_auth 31 | 32 | def process_response(self, request, response, spider): 33 | # 统计状态码正常的请求总数量 34 | if response.status not in [500, 502, 503, 504, 522, 524, 408]: 35 | return response 36 | 37 | def process_exception(self, request, exception, spider): 38 | pass -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pymongo 4 | 5 | class MongoPipeline(object): 6 | def __init__(self, mongo_url, mongo_db): 7 | self.mongo_url = mongo_url 8 | self.mongo_db = mongo_db 9 | 10 | @classmethod 11 | def from_crawler(cls, crawler): 12 | return cls( 13 | mongo_url=crawler.settings.get('MONGO_INIT_URL'), 14 | mongo_db=crawler.settings.get('MONGO_DB') 15 | ) 16 | 17 | def open_spider(self, spider): 18 | self.client = pymongo.MongoClient(self.mongo_url) 19 | self.db = self.client[self.mongo_db] 20 | 21 | def process_item(self, item, spider): 22 | self.db[item.table].insert(dict(item)) 23 | return item 24 | 25 | def close_spider(self, spider): 26 | self.client.close() -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for stackoverflow project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'stackoverflow' 13 | 14 | SPIDER_MODULES = ['stackoverflow.spiders'] 15 | NEWSPIDER_MODULE = 'stackoverflow.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'stackoverflow (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'stackoverflow.middlewares.StackoverflowSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'stackoverflow.middlewares.UAMiddleware': 543, 57 | # 'stackoverflow.middlewares.ProxyMiddleware':545, 58 | } 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'stackoverflow.pipelines.MongoPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | MAX_PAGES = 100 94 | 95 | # 代理服务器 96 | PROXY_SERVER = "http://http-dyn.abuyun.com:9020" 97 | 98 | # 代理隧道验证信息(阿布云) 99 | PROXY_USER = "HEO8FRWV77C1H36D" 100 | 
PROXY_PASS = "6CF467F7135C59B6" 101 | 102 | MONGO_INIT_URL = 'localhost' 103 | MONGO_DB = 'stackoverflow' -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/spiders/__pycache__/stack.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/spiders/__pycache__/stack.cpython-36.pyc -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/spiders/stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy import Request 4 | from urllib.parse import urlencode 5 | from stackoverflow.items import StackoverflowItem 6 | 7 | class StackSpider(scrapy.Spider): 8 | name = 'stack' 9 | allowed_domains = ['stackoverflow.com/'] 10 | base_url = 'https://stackoverflow.com/questions?' 
11 | 12 | def start_requests(self): 13 | """ 14 | 构建请求链接 15 | """ 16 | for i in range(1, self.settings.get('MAX_PAGES') + 1): 17 | params = {'sort': 'votes', 'page': i} 18 | url = self.base_url + urlencode(params) 19 | yield Request(url, callback=self.parse_quetion_list, errback=self.error_back) 20 | 21 | def parse_quetion_list(self, response): 22 | """ 23 | 获取每页的问题链接 24 | """ 25 | for href in response.xpath('//*[@class="summary"]/h3/a/@href'): 26 | url = response.urljoin(href.extract()) 27 | yield Request(url, callback=self.parse_question, errback=self.error_back, dont_filter=True) 28 | 29 | def parse_question(self, response): 30 | """ 31 | 获取问题详情页的数据 32 | """ 33 | self.logger.debug('Already into Pipeline!') 34 | item = StackoverflowItem() 35 | item['link'] = response.url 36 | item['title'] = response.xpath('//*[@id="question-header"]/h1/a/text()').extract_first() 37 | item['votes'] = response.xpath('//*[@id="question"]/div/div[1]/div/div/text()').extract_first() 38 | item['body'] = response.css('.post-text').xpath('.//*[contains(@class, "prettyprint")]').extract() 39 | item['tags'] = response.css('.question .post-tag::text').extract() 40 | yield item 41 | 42 | def error_back(self, e): 43 | _ = self 44 | self.logger.debug('Error: {}'.format(e)) 45 | -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/utils/Error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/utils/Error.png -------------------------------------------------------------------------------- /22-Stackoverflow/stackoverflow/utils/db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/utils/db.png -------------------------------------------------------------------------------- /23-GithubLogin/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /23-GithubLogin/.idea/github.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /23-GithubLogin/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /23-GithubLogin/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /23-GithubLogin/README.md: -------------------------------------------------------------------------------- 1 | ## Github Login 2 |   使用 Scrapy 的 FormReqeust 模拟登陆 Github。 3 | 4 | ## Sort 5 |   **模拟登陆 - FormReqeust** 6 | 7 | ## Analysis 8 | #### 1. 清除Cookies 9 |   查找POST表单参数之前先清除待爬取站点的Cookies。 10 | 11 | #### 2. Form表单 12 |   打开Github登陆界面,F12打开开发者工具并选择All,正常登陆Github,在请求列表中可以看到session请求,然后查看POST参数。 13 | 14 | #### 3. 表单参数 - authenticity_token 15 |   该参数是在访问登陆界面时浏览器设置的,可以在登陆界面的源码中找到。 16 | 17 | #### 4. 
Cookies 18 |   利用Scrapy的FormReqeust模拟登陆时,不需要像requests模拟登陆时保存Cookies, 因为在后续的Request中会默认将前面的Cookies携带。 19 | 20 | ## Tip 21 |   截止2019/4/2 19:50代码运行无误。 22 | -------------------------------------------------------------------------------- /23-GithubLogin/github/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/__init__.py -------------------------------------------------------------------------------- /23-GithubLogin/github/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /23-GithubLogin/github/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /23-GithubLogin/github/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class GithubItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /23-GithubLogin/github/main.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from scrapy.cmdline import execute 4 | 5 | execute('scrapy crawl logingit'.split()) -------------------------------------------------------------------------------- /23-GithubLogin/github/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class GithubPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /23-GithubLogin/github/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for github project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'github' 13 | 14 | SPIDER_MODULES = ['github.spiders'] 15 | NEWSPIDER_MODULE = 'github.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'github (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Host': 'github.com', 44 | 'Referer': 'https://github.com', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 46 | } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'github.middlewares.GithubSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'github.middlewares.GithubDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'github.pipelines.GithubPipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | # 账号、密码 94 | ACCOUNT = 'northxw@163.com' 95 | PASSWORD = '123456' -------------------------------------------------------------------------------- 
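The settings above hard-code the Github account and password. As a small sketch, not part of the original settings.py, the same two values can be read from environment variables so that real credentials never end up in the repository; the variable names GITHUB_ACCOUNT and GITHUB_PASSWORD are only illustrative.

```python
# Sketch only: load the login credentials from the environment in settings.py.
# The environment variable names are placeholders; any names will do.
import os

ACCOUNT = os.getenv('GITHUB_ACCOUNT', 'your_account@example.com')
PASSWORD = os.getenv('GITHUB_PASSWORD', 'your_password')
```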
/23-GithubLogin/github/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /23-GithubLogin/github/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /23-GithubLogin/github/spiders/__pycache__/logingit.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/spiders/__pycache__/logingit.cpython-36.pyc -------------------------------------------------------------------------------- /23-GithubLogin/github/spiders/logingit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import requests 4 | 5 | class LogingitSpider(scrapy.Spider): 6 | name = 'logingit' 7 | allowed_domains = ['github.com'] 8 | # 登陆界面的URL 9 | login_url = 'https://github.com/login' 10 | # POST表单数据的URL 11 | post_url = 'https://github.com/session' 12 | # 登陆后URL 13 | logined_url = 'https://github.com/settings/profile' 14 | 15 | def start_requests(self): 16 | """ 17 | 获取登陆页面源码 18 | """ 19 | return [scrapy.Request(url=self.login_url, 20 | callback=self.login, 21 | headers=self.settings.get('DEFAULT_REQUEST_HEADERS'))] 22 | 23 | def login(self, response): 24 | """ 25 | 使用FromRequest模拟登陆Github 26 | """ 27 | # 提取POST验证参数 authenticity_token 28 | authcode = response.xpath('//*[@id="login"]/form/input[2]/@value').extract_first() 29 | if authcode: 30 | self.logger.debug("Auth Token: %s" %authcode) 31 | post_data = { 32 | 'commit': 'Sign in', 33 | 'utf8': '✓', 34 | 'authenticity_token': authcode, 35 | 'login': self.settings.get('ACCOUNT'), 36 | 'password': self.settings.get('PASSWORD') 37 | } 38 | return [scrapy.FormRequest(url=self.post_url, 39 | formdata=post_data, 40 | headers=self.settings.get('DEFAULT_REQUEST_HEADERS'), 41 | callback=self.check)] 42 | else: 43 | return [scrapy.Request(url=self.login_url, callback=self.login)] 44 | 45 | def check(self, response): 46 | """ 47 | 验证登陆是否成功 48 | """ 49 | avatar = response.css('#user-links > li:nth-child(3) > details > summary > img::attr(src)').extract_first() 50 | if avatar: 51 | content = requests.get(url=avatar.split('?')[0]).content 52 | with open('./utils/acatar.jpg', 'wb') as f: 53 | f.write(content) 54 | print('Successfully Login!') 55 | pass 56 | 57 | 58 | def parse(self, response): 59 | pass -------------------------------------------------------------------------------- /23-GithubLogin/github/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/utils/__init__.py -------------------------------------------------------------------------------- /23-GithubLogin/github/utils/acatar.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/utils/acatar.jpg -------------------------------------------------------------------------------- /23-GithubLogin/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = github.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = github 12 | -------------------------------------------------------------------------------- /24-Dianping/README.md: -------------------------------------------------------------------------------- 1 | ## 大众点评字体反爬 2 |   大众点评css定位的字体反爬解决方案 3 | 4 | ## 处理思路 5 | - 请求CSS链接获取文本内容,正则匹配class对应的坐标值 6 | - 请求SVG链接,正则匹配被除数以及偏移文本 7 | - 判断、获取、拼接数字 8 | 9 | ## 示例 10 |   网页对应文字截图(https://www.dianping.com/xian/ch0): 11 | 12 | ![prt1](https://github.com/Northxw/Python3_WebSpider/blob/master/24-Dianping/utils/prtsc1.png) 13 | 14 |   代码运行结果截图: 15 | 16 | ![prt5](https://github.com/Northxw/Python3_WebSpider/blob/master/24-Dianping/utils/prtsc5.png) 17 | -------------------------------------------------------------------------------- /24-Dianping/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import re 4 | import requests 5 | import lxml.html 6 | 7 | def get_css_text(class_): 8 | """ 9 | 获取坐标值 10 | """ 11 | css_html = requests.get('https://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/1595b8f4917c831efb53461c8d9b86cb.css').text 12 | info_css = re.findall(r'%s{background:-(\d+).0px -(\d+).0px' % class_, css_html, re.S)[0] 13 | return info_css 14 | 15 | def get_completed_nums(compelted_nums=''): 16 | """ 17 | 获取数字 18 | """ 19 | result_svgtext = requests.get('http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/7226aa7d9b89866aecb63ab0f06ca037.svg').text 20 | a, b, c = re.findall('y=.*?>(.*?)<', result_svgtext, re.S) # 示例a:56422383356911691085268889707857... 21 | y1, y2, y3 = re.findall('y="(.*?)">', result_svgtext, re.S) # 示例: 46, 83, 129 22 | divisor = eval(re.search('x="(\d{2}) ', result_svgtext, re.S).group(1)) # 示例:x = 12,...... 
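    # For every class name, the CSS offset (x, y) locates one digit in the SVG:
    # y selects which of the three text rows (a / b / c) the glyph sits on,
    # and x // divisor is the character index inside that row.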
23 | for class_ in class_list: 24 | x, y = get_css_text(class_) 25 | x, y = int(x), int(y) 26 | if y < int(y1): 27 | compelted_nums += a[x // divisor] 28 | elif y < int(y2): 29 | compelted_nums += b[x // divisor] 30 | elif y < int(y3): 31 | compelted_nums += c[x // divisor] 32 | print("总评论数:", compelted_nums) 33 | return compelted_nums 34 | 35 | if __name__ == '__main__': 36 | class_list = ['ovr2h', 'ovjpg', 'ovra6', 'ovzs7'] 37 | get_completed_nums() -------------------------------------------------------------------------------- /24-Dianping/utils/prtsc1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc1.png -------------------------------------------------------------------------------- /24-Dianping/utils/prtsc2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc2.png -------------------------------------------------------------------------------- /24-Dianping/utils/prtsc3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc3.png -------------------------------------------------------------------------------- /24-Dianping/utils/prtsc4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc4.png -------------------------------------------------------------------------------- /24-Dianping/utils/prtsc5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc5.png -------------------------------------------------------------------------------- /25-DouYin/README.md: -------------------------------------------------------------------------------- 1 | ## 抖音 2 | 3 |   根据抖音个人主页名片分享链接构造抓取链接,获取用户的昵称、抖音ID,签名,头像、关注、粉丝、获赞数量,作品及获得喜欢的数量。 4 | 5 | ## 分类 6 | 7 |   字体反爬 - 抖音 8 | 9 | ## 运行 10 | 11 | ```shell 12 | python douyin.py 13 | ``` 14 | 15 | -------------------------------------------------------------------------------- /25-DouYin/font.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # import requests 4 | # import re 5 | # import time 6 | # 7 | # headers = { 8 | # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36" 9 | # } 10 | # 11 | # def crack_font(): 12 | # """处理反爬""" 13 | # url = "https://www.iesdouyin.com/share/user/59498826860" 14 | # response = requests.get(url, headers=headers) 15 | # ttf_url = "https://%s" % re.findall("format\('woff'\),url\(//(.*?\.ttf)\)", response.text, re.S)[0] # 匹配字体文件链接 16 | # print(ttf_url) 17 | # # get_mapping_table(ttf_url) 18 | 19 | def get_mapping_table(codeNum): 20 | """处理文字""" 21 | font_code_map = { 22 | "": "num_", 23 | "": "num_1", 24 | "": "num_2", 25 | "": "num_3", 26 | "": "num_4", 27 | "": "num_5", 28 | "": "num_6", 29 | "": "num_7", 30 | "": "num_8", 31 | "": "num_9", 32 | "": "num_4", 33 | "": "num_1", 34 | "": "num_", 
35 | "": "num_5", 36 | "": "num_3", 37 | "": "num_2", 38 | "": "num_6", 39 | "": "num_8", 40 | "": "num_9", 41 | "": "num_7", 42 | "": "num_1", 43 | "": "num_3", 44 | "": "num_", 45 | "": "num_4", 46 | "": "num_2", 47 | "": "num_5", 48 | "": "num_8", 49 | "": "num_9", 50 | "": "num_7", 51 | "": "num_6", 52 | } 53 | 54 | font_num_map = { 55 | "1": "num_", 56 | "0": "num_1", 57 | "3": "num_2", 58 | "2": "num_3", 59 | "4": "num_4", 60 | "5": "num_5", 61 | "6": "num_6", 62 | "9": "num_7", 63 | "7": "num_8", 64 | "8": "num_9", 65 | } 66 | codeNumMap = font_code_map[codeNum] 67 | decodeNum = '' 68 | if codeNumMap in font_num_map.values(): 69 | decodeNum = ''.join([k for k, v in font_num_map.items() if codeNumMap == v]) 70 | return decodeNum 71 | 72 | 73 | if __name__ == '__main__': 74 | print(get_mapping_table("")) -------------------------------------------------------------------------------- /25-DouYin/shareid.txt: -------------------------------------------------------------------------------- 1 | 98524936524 2 | 96467876974 3 | 97836647912 4 | 72051219546 5 | 88445518961 6 | 59498826860 7 | 76055758243 8 | 58944980339 9 | 93584412487 10 | 62427282029 11 | 98985522288 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Python3 WebSpider 2 |   Python3网络爬虫实践代码, 仅供学习交流使用。 3 | 4 | ## Tip 5 |   除特殊说明外,编译环境为:**Python 3.6.5、Pycharm 2018.3.5**。 6 | 7 | ## Notice 8 |   短期内不再更新,后期视情况。项目代码已经很早了,很多爬虫破解逻辑或思路可能已经过时,仅供爬虫入门练习。 9 | --------------------------------------------------------------------------------