├── .DS_Store ├── .gitignore ├── CrawlYouYuan ├── .idea │ ├── CrawlYouYuan.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── CrawlYouYuan │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── youyuan.py ├── begin.py └── scrapy.cfg ├── DouBanMovie ├── .DS_Store ├── .idea │ ├── DouBanMovie.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── DouBanMovie │ ├── .DS_Store │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── .DS_Store │ │ ├── __init__.py │ │ └── douban.py ├── begin.py ├── movie.json └── scrapy.cfg ├── DouYuSpider ├── .idea │ ├── DouYuSpider.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── DouYuSpider │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── douyu.py ├── Images │ ├── Cute兔丶.jpg │ ├── MiS媛.jpg │ ├── Super超级冷.jpg │ ├── Yozi柚子妹妹.jpg │ ├── pinky水蜜桃.jpg │ ├── 一只小玲儿.jpg │ ├── 会玩的黄宝宝.jpg │ ├── 冷伊宁.jpg │ ├── 十四万岁的青丘老太婆.jpg │ ├── 可乐小十五.jpg │ ├── 吃萝卜的辛巴.jpg │ ├── 咘咘柳.jpg │ ├── 大宝SOD蜜不是润肤露.jpg │ ├── 大木头CL.jpg │ ├── 小依泽儿.jpg │ ├── 小口古小咕.jpg │ ├── 小圆脸娜娜.jpg │ ├── 小小小思齐.jpg │ ├── 小雅er.jpg │ ├── 尛小钰.jpg │ ├── 左思念.jpg │ ├── 巫女蛋.jpg │ ├── 布丁味的雯宝宝.jpg │ ├── 幼齿懵骚小安妮.jpg │ ├── 悠悠fairy.jpg │ ├── 懵G娜.jpg │ ├── 是囧囧初啊.jpg │ ├── 江沅是个小可爱.jpg │ ├── 温柔的喵小胖.jpg │ ├── 爱笑的蒙蒙.jpg │ ├── 璇璇璇儿丶Tay.jpg │ ├── 甜馨大队长.jpg │ ├── 白羊可爱多.jpg │ ├── 磨人的小柠檬.jpg │ ├── 糖炒栗子lr.jpg │ ├── 糖糖小萌主.jpg │ ├── 紫絮儿521.jpg │ ├── 苏思淳sheep.jpg │ ├── 若儿被注册了呢.jpg │ ├── 诗诗诗诗诗诗酱.jpg │ ├── 谷猫宁.jpg │ ├── 辣椒酱jiang.jpg │ ├── 迷人的小北北.jpg │ ├── 阿青Dale.jpg │ ├── 陈梓不是橙子.jpg │ └── 鲸鱼妹爱素颜.jpg ├── begin.py ├── douyu.json └── scrapy.cfg ├── HongNiangNet ├── .DS_Store ├── .idea │ ├── HongNiangNet.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── HongNiangNet │ ├── .DS_Store │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── .DS_Store │ │ ├── __init__.py │ │ └── hongniang.py ├── begin.py ├── content.json └── scrapy.cfg ├── LICENSE ├── README.md ├── duodian ├── .idea │ ├── duodian.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── db.sqlite3 ├── duodian │ ├── __init__.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── manage.py ├── myduodian │ ├── __init__.py │ ├── admin.py │ ├── migrations │ │ ├── 0001_initial.py │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ └── views.py ├── templates │ └── myduodian │ │ └── index.html └── woduodian.py ├── gongzhonghao.jpeg ├── jiekou ├── .idea │ ├── jiekou.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── db.sqlite3 ├── jiekou │ ├── __init__.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── manage.py ├── myjiekou │ ├── __init__.py │ ├── admin.py │ ├── migrations │ │ ├── 0001_initial.py │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ └── views.py └── templates │ └── myjiekou │ └── index.html ├── teacherInfo ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── teacherInfo.iml │ └── workspace.xml ├── begin.py ├── scrapy.cfg ├── teacher.json └── teacherInfo │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── myteacher.py └── 爬虫小demo ├── .DS_Store ├── 01 taobao.py ├── 02 doubanzhihu.py ├── 03 douYuUnittest.py ├── 04 fileHandler.py ├── 05 getimage.py ├── 06 jsload.py ├── 07 jsondata.py ├── 08 jsonpath和json总结.py ├── 09 zhihu_login.py ├── 10 match.py ├── 11 neihan.py ├── 12 PIL.py ├── 13 queryxpath.py ├── 14 selenium执行js.py ├── 15 
tencent.py ├── 16 xunmall.py ├── 17 zhihulogin.py ├── 18 github_login.py ├── 19 jd_login.py ├── 20 下载网易云歌词.py ├── 21 TaoBaoInfo.py ├── 22 JDPython.py ├── 23 tuchongnet.py ├── 24 pythonDuoDian.py ├── 25 PythonItChat.py ├── 26 PythonWeChat.py ├── 27 PythonWordCloud.py ├── 28 PythonCheHui.py ├── 29 PythonCeHui.py ├── 30 PythonZhuanFa.py ├── 31 下载bilibili视频.py └── 32 m3u8.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /CrawlYouYuan/.idea/CrawlYouYuan.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /CrawlYouYuan/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /CrawlYouYuan/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /CrawlYouYuan/CrawlYouYuan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/CrawlYouYuan/CrawlYouYuan/__init__.py -------------------------------------------------------------------------------- /CrawlYouYuan/CrawlYouYuan/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class CrawlyouyuanItem(scrapy.Item): 11 | # 用户名 12 | username = scrapy.Field() 13 | # 年龄 14 | age = scrapy.Field() 15 | # 头像图片的链接 16 | header_url = scrapy.Field() 17 | # 相册图片的链接 18 | images_url = scrapy.Field() 19 | # 内心独白 20 | content = scrapy.Field() 21 | # 籍贯 22 | place_from = scrapy.Field() 23 | # 学历 24 | education = scrapy.Field() 25 | # 兴趣爱好 26 | hobby = scrapy.Field() 27 | # 个人主页 28 | source_url = scrapy.Field() 29 | # 数据来源网站 30 | sourec = scrapy.Field() 31 | # utc 时间 32 | time = scrapy.Field() 33 | # 爬虫名 34 | spidername = scrapy.Field() 35 | -------------------------------------------------------------------------------- /CrawlYouYuan/CrawlYouYuan/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import codecs 9 | 10 | class CrawlyouyuanPipeline(object): 11 | 12 | def __init__(self): 13 | self.filename = codecs.open('content.json', 'w', encoding='utf-8') 14 | 15 | def process_item(self, item, spider): 16 | html = json.dumps(dict(item), ensure_ascii=False) 17 | self.filename.write(html + '\n') 18 | return item 19 | 20 | def spider_closed(self, spider): 21 | self.filename.close() 22 | 23 | -------------------------------------------------------------------------------- /CrawlYouYuan/CrawlYouYuan/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for CrawlYouYuan project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'CrawlYouYuan' 13 | 14 | SPIDER_MODULES = ['CrawlYouYuan.spiders'] 15 | NEWSPIDER_MODULE = 'CrawlYouYuan.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)' 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | #DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'CrawlYouYuan.middlewares.MyCustomSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'CrawlYouYuan.middlewares.MyCustomDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'CrawlYouYuan.pipelines.CrawlyouyuanPipeline': 300, 68 | } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | -------------------------------------------------------------------------------- /CrawlYouYuan/CrawlYouYuan/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /CrawlYouYuan/CrawlYouYuan/spiders/youyuan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from CrawlYouYuan.items import CrawlyouyuanItem 6 | import re 7 | class YouyuanSpider(CrawlSpider): 8 | name = 'youyuan' 9 | allowed_domains = ['youyuan.com'] 10 | start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/'] 11 | # 自动生成的文件不需要改东西,只需要添加rules文件里面Rule角色就可以 12 | # 每一页匹配规则 13 | page_links = LinkExtractor(allow=(r"youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")) 14 | # 每个人个人主页匹配规则 15 | profile_links = LinkExtractor(allow=(r"youyuan.com/\d+-profile/")) 16 | rules = ( 17 | # 没有回调函数,说明follow是True 18 | Rule(page_links), 19 | # 有回调函数,说明follow是False 20 | Rule(profile_links, callback='parse_item', follow=True), 21 | ) 22 | 23 | def parse_item(self, response): 24 | item = CrawlyouyuanItem() 25 | 26 | item['username'] = self.get_username(response) 27 | # 年龄 28 | item['age'] = self.get_age(response) 29 | # 头像图片的链接 30 | item['header_url'] = self.get_header_url(response) 31 | # 相册图片的链接 32 | item['images_url'] = self.get_images_url(response) 33 | # 内心独白 34 | item['content'] = self.get_content(response) 35 | # 籍贯 36 | item['place_from'] = self.get_place_from(response) 37 | # 学历 38 | item['education'] = self.get_education(response) 39 | # 兴趣爱好 40 | item['hobby'] = self.get_hobby(response) 41 | # 个人主页 42 | item['source_url'] = response.url 43 | # 数据来源网站 44 | item['sourec'] = "youyuan" 45 | 46 | yield item 47 | 48 | def get_username(self, response): 49 | username = response.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract() 50 | if len(username): 51 | username = username[0] 52 | else: 53 | username = "NULL" 54 | return username.strip() 55 | 56 | def get_age(self, response): 57 | age = response.xpath("//dl[@class='personal_cen']//dd/p/text()").extract() 58 | if len(age): 59 | age = re.findall(u"\d+岁", age[0])[0] 60 | else: 61 | age = "NULL" 62 | return age.strip() 63 | 64 | def get_header_url(self, response): 65 | header_url = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract() 66 | if len(header_url): 67 | header_url = header_url[0] 68 | else: 69 | header_url = "NULL" 70 | return header_url.strip() 71 | 72 | def get_images_url(self, response): 73 | images_url = response.xpath("//div[@class='ph_show']/ul/li/a/img/@src").extract() 74 | if len(images_url): 75 | images_url = ", ".join(images_url) 76 | else: 77 | images_url = "NULL" 78 | return images_url 79 | 80 | def get_content(self, response): 81 | content = response.xpath("//div[@class='pre_data']/ul/li/p/text()").extract() 82 | if len(content): 83 | content = content[0] 84 | else: 85 | content = "NULL" 86 | return content.strip() 87 | 88 | def get_place_from(self, response): 89 | place_from = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[1]/span/text()").extract() 90 | if len(place_from): 91 | place_from = place_from[0] 92 | else: 93 | place_from = "NULL" 94 | return place_from.strip() 95 | 96 | def get_education(self, response): 97 | 
education = response.xpath("//div[@class='pre_data']/ul/li[3]//ol[2]/li[2]/span/text()").extract() 98 | if len(education): 99 | education = education[0] 100 | else: 101 | education = "NULL" 102 | return education.strip() 103 | 104 | def get_hobby(self, response): 105 | hobby = response.xpath("//dl[@class='personal_cen']//ol/li/text()").extract() 106 | if len(hobby): 107 | hobby = ",".join(hobby).replace(" ", "") 108 | else: 109 | hobby = "NULL" 110 | return hobby.strip() 111 | -------------------------------------------------------------------------------- /CrawlYouYuan/begin.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl youyuan'.split()) -------------------------------------------------------------------------------- /CrawlYouYuan/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = CrawlYouYuan.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = CrawlYouYuan 12 | -------------------------------------------------------------------------------- /DouBanMovie/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/.DS_Store -------------------------------------------------------------------------------- /DouBanMovie/.idea/DouBanMovie.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /DouBanMovie/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /DouBanMovie/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/.DS_Store -------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/__init__.py -------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanmovieItem(scrapy.Item): 12 | # 标题 13 | title = scrapy.Field() 14 | # 信息 15 | info = scrapy.Field() 16 | # 评分 17 | star = scrapy.Field() 18 | # 简介 19 | quote = scrapy.Field() 20 | 21 | 22 | 
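Editor's note: the DoubanmovieItem above is filled in by the douban spider and handed to the MongoDB pipeline shown next. That pipeline pulls its connection parameters via "from scrapy.conf import settings", which only exists on old Scrapy releases; on current Scrapy the same wiring is normally done through the pipeline's from_crawler hook. Below is a minimal alternative sketch, not the project's code: the class name is hypothetical, the setting names match the MONGODB_* values defined in DouBanMovie/settings.py further down.

import pymongo

class MongoSettingsPipeline(object):
    """Hypothetical variant of DoubanmoviePipeline that reads settings via from_crawler."""

    def __init__(self, host, port, dbname, sheetname):
        client = pymongo.MongoClient(host=host, port=port)
        self.sheet = client[dbname][sheetname]

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings replaces the removed scrapy.conf module
        s = crawler.settings
        return cls(s.get("MONGODB_HOST"), s.getint("MONGODB_PORT"),
                   s.get("MONGODB_DBNAME"), s.get("MONGODB_SHEETNAME"))

    def process_item(self, item, spider):
        # insert_one() is the pymongo 3+ spelling of the insert() call used in the original pipeline
        self.sheet.insert_one(dict(item))
        return item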
-------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import codecs 9 | import json 10 | import pymongo 11 | from scrapy.conf import settings 12 | 13 | class DoubanmoviePipeline(object): 14 | host = settings["MONGODB_HOST"] 15 | port = settings["MONGODB_PORT"] 16 | dbname = settings["MONGODB_DBNAME"] 17 | sheetname = settings["MONGODB_SHEETNAME"] 18 | 19 | # 创建MONGODB数据库链接 20 | client = pymongo.MongoClient(host=host, port=port) 21 | # 指定数据库 22 | mydb = client[dbname] 23 | # 存放数据的数据库表名 24 | sheet = mydb[sheetname] 25 | def process_item(self, item, spider): 26 | # 1. 生成文件 27 | # self.filename = codecs.open('movie.json','a',encoding='utf-8') 28 | # html = json.dumps(dict(item),ensure_ascii=False) 29 | # self.filename.write(html + '\n') 30 | # self.filename.close() 31 | # 2. 把数据插入数据库 32 | data = dict(item) 33 | self.sheet.insert(data) 34 | 35 | return item 36 | 37 | 38 | -------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DouBanMovie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'DouBanMovie' 13 | 14 | SPIDER_MODULES = ['DouBanMovie.spiders'] 15 | NEWSPIDER_MODULE = 'DouBanMovie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'DouBanMovie (+http://www.yourdomain.com)' 20 | USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;" 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | # DEFAULT_REQUEST_HEADERS = { 44 | # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)', 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | # } 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'DouBanMovie.middlewares.MyCustomSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or 
disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'DouBanMovie.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'DouBanMovie.pipelines.DoubanmoviePipeline': 300, 71 | } 72 | # MONGODB 主机名 73 | MONGODB_HOST = "127.0.0.1" 74 | 75 | # MONGODB 端口号 76 | MONGODB_PORT = 27017 77 | 78 | # 数据库名称 79 | MONGODB_DBNAME = "Douban" 80 | 81 | # 存放数据的表名称 82 | MONGODB_SHEETNAME = "doubanmovies" 83 | # Enable and configure the AutoThrottle extension (disabled by default) 84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 85 | #AUTOTHROTTLE_ENABLED = True 86 | # The initial download delay 87 | #AUTOTHROTTLE_START_DELAY = 5 88 | # The maximum download delay to be set in case of high latencies 89 | #AUTOTHROTTLE_MAX_DELAY = 60 90 | # The average number of requests Scrapy should be sending in parallel to 91 | # each remote server 92 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 93 | # Enable showing throttling stats for every response received: 94 | #AUTOTHROTTLE_DEBUG = False 95 | 96 | # Enable and configure HTTP caching (disabled by default) 97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 98 | #HTTPCACHE_ENABLED = True 99 | #HTTPCACHE_EXPIRATION_SECS = 0 100 | #HTTPCACHE_DIR = 'httpcache' 101 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 102 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 103 | -------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/spiders/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/spiders/.DS_Store -------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /DouBanMovie/DouBanMovie/spiders/douban.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from DouBanMovie.items import DoubanmovieItem 4 | 5 | class DoubanSpider(scrapy.Spider): 6 | name = "douban" 7 | allowed_domains = ["movie.douban.com"] 8 | offset = 0 9 | url = 'https://movie.douban.com/top250?start=' 10 | start_urls = ( 11 | url + str(offset), 12 | ) 13 | 14 | def parse(self, response): 15 | item = DoubanmovieItem() 16 | # 电影全部信息 17 | movies = response.xpath("//div[@class='info']") 18 | for eachmovie in movies: 19 | 20 | titlelist = eachmovie.xpath("./div[@class='hd']/a/span[@class='title'][1]/text()") 21 | if len(titlelist) == 0: 22 | item['title'] = '' 23 | else: 24 | item['title'] = titlelist.extract()[0] 25 | info = eachmovie.xpath("./div[@class='bd']/p/text()").extract()[0] 26 | item['info'] = info.replace('\n','').strip() 27 | item['star'] = eachmovie.xpath("./div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()").extract()[0] 28 | quotelist = eachmovie.xpath("./div[@class='bd']/p[@class='quote']/span[@class='inq']/text()") 29 | if len(quotelist) == 0: 30 | item['quote'] = '' 31 | else: 32 | item['quote'] = quotelist.extract()[0] 33 | yield item 34 | 35 | 36 | if self.offset < 225: 37 | self.offset += 25 38 | yield scrapy.Request(self.url + str(self.offset),callback = self.parse) 39 | 40 | -------------------------------------------------------------------------------- /DouBanMovie/begin.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl douban'.split()) -------------------------------------------------------------------------------- /DouBanMovie/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = DouBanMovie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = DouBanMovie 12 | -------------------------------------------------------------------------------- /DouYuSpider/.idea/DouYuSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /DouYuSpider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /DouYuSpider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /DouYuSpider/DouYuSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/DouYuSpider/__init__.py -------------------------------------------------------------------------------- /DouYuSpider/DouYuSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your 
scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DouyuspiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # 房间名 14 | vertical = scrapy.Field() 15 | # 昵称 16 | name = scrapy.Field() 17 | # 房间照片 18 | room_src = scrapy.Field() 19 | # 地区 20 | anchor_city = scrapy.Field() 21 | imagesPath = scrapy.Field() 22 | 23 | 24 | -------------------------------------------------------------------------------- /DouYuSpider/DouYuSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DouyuspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /DouYuSpider/DouYuSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | # import codecs 10 | # import json 11 | import os 12 | from scrapy.pipelines.images import ImagesPipeline 13 | from scrapy.utils.project import get_project_settings 14 | 15 | # class DouyuspiderPipeline(object): 16 | # def __init__(self): 17 | # # 创建一个只写文件,指定文本编码格式为utf-8 18 | # self.filename = codecs.open('douyu.json', 'w', encoding='utf-8') 19 | # def process_item(self, item, spider): 20 | # 21 | # html = json.dumps(dict(item),ensure_ascii='utf-8') 22 | # self.filename.write(html + '\n') 23 | # return item 24 | # 25 | # # def spider_closed(self, spider): 26 | # # self.file.close() 27 | 28 | # scrapy下载图片需要安装pip install image/Pillow 29 | class DouYuImagesPipelines(ImagesPipeline): 30 | IMAGES_STORE = get_project_settings().get("IMAGES_STORE") 31 | 32 | def get_media_requests(self, item, info): 33 | image_url = item["vertical"] 34 | yield scrapy.Request(image_url) 35 | 36 | def item_completed(self, results, item, info): 37 | # 固定写法,获取图片路径,同时判断这个路径是否正确,如果正确,就放到 image_path里,ImagesPipeline源码剖析可见 38 | image_path = [x["path"] for ok, x in results if ok] 39 | 40 | os.rename(self.IMAGES_STORE + "/" + image_path[0], self.IMAGES_STORE + "/" + item["name"] + ".jpg") 41 | item["imagesPath"] = self.IMAGES_STORE + "/" + item["name"] 42 | 43 | return item -------------------------------------------------------------------------------- /DouYuSpider/DouYuSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DouYuSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'DouYuSpider' 13 | 14 | SPIDER_MODULES = ['DouYuSpider.spiders'] 15 | NEWSPIDER_MODULE = 'DouYuSpider.spiders' 16 | 17 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)' 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'DouYuSpider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'DouYuSpider.middlewares.DouyuspiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'DouYuSpider.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | # 'DouYuSpider.pipelines.DouyuspiderPipeline': 300, 70 | 'DouYuSpider.pipelines.DouYuImagesPipelines': 300, 71 | } 72 | IMAGES_STORE = "/Users/yunmei/Desktop/scrapyenv/Python-Spider/DouYuSpider/Images" 73 | # 日志 74 | # LOG_FILE = "dg.log" 75 | # LOG_LEVEL = "DEBUG" 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | #HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 
| #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 97 | -------------------------------------------------------------------------------- /DouYuSpider/DouYuSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /DouYuSpider/DouYuSpider/spiders/douyu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import json 4 | 5 | from DouYuSpider.items import DouyuspiderItem 6 | class DouyuSpider(scrapy.Spider): 7 | name = 'douyu' 8 | # 不可设置为allowed_domains = ['http://capi.douyucdn.cn'] 9 | allowed_domains = ['capi.douyucdn.cn'] 10 | 11 | offset = 0 12 | url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=" 13 | 14 | start_urls = [url + str(offset)] 15 | 16 | def parse(self, response): 17 | data = json.loads(response.text)['data'] 18 | 19 | for each in data: 20 | item = DouyuspiderItem() 21 | 22 | item["vertical"] = each["vertical_src"].encode("utf-8") 23 | item["name"] = each["nickname"].encode("utf-8") 24 | item["room_src"] = each["room_src"].encode("utf-8") 25 | item["anchor_city"] = each["anchor_city"].encode("utf-8") 26 | 27 | yield item 28 | 29 | self.offset += 20 30 | yield scrapy.Request(self.url + str(self.offset),callback = self.parse) 31 | 32 | -------------------------------------------------------------------------------- /DouYuSpider/Images/Cute兔丶.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Cute兔丶.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/MiS媛.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/MiS媛.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/Super超级冷.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Super超级冷.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/Yozi柚子妹妹.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Yozi柚子妹妹.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/pinky水蜜桃.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/pinky水蜜桃.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/一只小玲儿.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/一只小玲儿.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/会玩的黄宝宝.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/会玩的黄宝宝.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/冷伊宁.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/冷伊宁.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/十四万岁的青丘老太婆.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/十四万岁的青丘老太婆.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/可乐小十五.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/可乐小十五.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/吃萝卜的辛巴.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/吃萝卜的辛巴.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/咘咘柳.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/咘咘柳.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/大宝SOD蜜不是润肤露.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/大宝SOD蜜不是润肤露.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/大木头CL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/大木头CL.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/小依泽儿.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小依泽儿.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/小口古小咕.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小口古小咕.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/小圆脸娜娜.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小圆脸娜娜.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/小小小思齐.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小小小思齐.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/小雅er.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小雅er.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/尛小钰.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/尛小钰.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/左思念.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/左思念.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/巫女蛋.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/巫女蛋.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/布丁味的雯宝宝.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/布丁味的雯宝宝.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/幼齿懵骚小安妮.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/幼齿懵骚小安妮.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/悠悠fairy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/悠悠fairy.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/懵G娜.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/懵G娜.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/是囧囧初啊.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/是囧囧初啊.jpg -------------------------------------------------------------------------------- 
/DouYuSpider/Images/江沅是个小可爱.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/江沅是个小可爱.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/温柔的喵小胖.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/温柔的喵小胖.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/爱笑的蒙蒙.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/爱笑的蒙蒙.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/璇璇璇儿丶Tay.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/璇璇璇儿丶Tay.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/甜馨大队长.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/甜馨大队长.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/白羊可爱多.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/白羊可爱多.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/磨人的小柠檬.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/磨人的小柠檬.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/糖炒栗子lr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/糖炒栗子lr.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/糖糖小萌主.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/糖糖小萌主.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/紫絮儿521.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/紫絮儿521.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/苏思淳sheep.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/苏思淳sheep.jpg 
-------------------------------------------------------------------------------- /DouYuSpider/Images/若儿被注册了呢.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/若儿被注册了呢.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/诗诗诗诗诗诗酱.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/诗诗诗诗诗诗酱.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/谷猫宁.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/谷猫宁.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/辣椒酱jiang.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/辣椒酱jiang.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/迷人的小北北.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/迷人的小北北.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/阿青Dale.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/阿青Dale.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/陈梓不是橙子.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/陈梓不是橙子.jpg -------------------------------------------------------------------------------- /DouYuSpider/Images/鲸鱼妹爱素颜.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/鲸鱼妹爱素颜.jpg -------------------------------------------------------------------------------- /DouYuSpider/begin.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl douyu'.split()) -------------------------------------------------------------------------------- /DouYuSpider/douyu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/douyu.json -------------------------------------------------------------------------------- /DouYuSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 
| 6 | [settings] 7 | default = DouYuSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = DouYuSpider 12 | -------------------------------------------------------------------------------- /HongNiangNet/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/.DS_Store -------------------------------------------------------------------------------- /HongNiangNet/.idea/HongNiangNet.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /HongNiangNet/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /HongNiangNet/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/.DS_Store -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/__init__.py -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Field,Item 10 | 11 | class HongniangnetItem(Item): 12 | # define the fields for your item here like: 13 | # 用户名 14 | username = Field() 15 | # 年龄 16 | age = Field() 17 | # 头像图片链接 18 | header_link = Field() 19 | # 相册图片链接 20 | images_url = Field() 21 | # 内心独白 22 | content = Field() 23 | # 籍贯 24 | place_from= Field() 25 | # 学历 26 | education = Field() 27 | # 爱好 28 | hobby = Field() 29 | # 个人主页链接 30 | source_url = Field() 31 | # 数据来源网站 32 | source = Field() 33 | 34 | -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class HongniangnetSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import codecs 9 | import json 10 | class HongniangnetPipeline(object): 11 | 12 | 13 | def __init__(self): 14 | self.filename = codecs.open('content.json', 'w', encoding='utf-8') 15 | def process_item(self, item, spider): 16 | html = json.dumps(dict(item),ensure_ascii=False) 17 | # self.filename.write(html + '\n') 18 | self.filename.write(html + '\n') 19 | return item 20 | 21 | def spider_closed(self, spider): 22 | self.filename.close() 23 | -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for HongNiangNet project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'HongNiangNet' 13 | 14 | SPIDER_MODULES = ['HongNiangNet.spiders'] 15 | NEWSPIDER_MODULE = 'HongNiangNet.spiders' 16 | 17 | # 分布式爬虫设置Ip端口 18 | REDIS_HOST = '192.168.19.206' 19 | REDIS_PORT = 6379 20 | 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'HongNiangNet (+http://www.yourdomain.com)' 24 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)' 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = True 27 | 28 | 29 | # 使用了scrapy-redis里的去重组件,不使用scrapy默认的去重 30 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 31 | # 使用了scrapy-redis里的调度器组件,不实用scrapy默认的调度器 32 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 33 | # 使用队列形式 34 | SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue" 35 | # 允许暂停,redis请求记录不丢失 36 | SCHEDULER_PERSIST = True 37 | 38 | 39 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 40 | #CONCURRENT_REQUESTS = 32 41 | 42 | # Configure a delay for requests for the same website (default: 0) 43 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 44 | # See also autothrottle settings and docs 45 | # DOWNLOAD_DELAY = 3 46 | # The download delay setting will honor only one of: 47 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 48 | #CONCURRENT_REQUESTS_PER_IP = 16 49 | 50 | # Disable cookies (enabled by default) 51 | #COOKIES_ENABLED = False 52 | 53 | # Disable Telnet Console (enabled by default) 54 | #TELNETCONSOLE_ENABLED = False 55 | 56 | # Override the default request headers: 57 | #DEFAULT_REQUEST_HEADERS = { 58 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 59 | # 'Accept-Language': 'en', 60 | #} 61 | 62 | # Enable or disable spider middlewares 63 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 64 | # SPIDER_MIDDLEWARES = { 65 | # 'HongNiangNet.middlewares.HongniangnetSpiderMiddleware': 543, 66 | # } 67 | 68 | # Enable or disable downloader middlewares 69 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 70 | #DOWNLOADER_MIDDLEWARES = { 71 | # 'HongNiangNet.middlewares.MyCustomDownloaderMiddleware': 543, 72 | #} 73 | 74 | # Enable or disable extensions 75 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 76 | #EXTENSIONS = { 77 | # 'scrapy.extensions.telnet.TelnetConsole': None, 78 | #} 79 | 80 | # Configure item pipelines 81 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 82 | ITEM_PIPELINES = { 83 | 'HongNiangNet.pipelines.HongniangnetPipeline': 300, 84 | 'scrapy_redis.pipelines.RedisPipeline' : 400, 85 | } 86 | 87 | # Enable and configure the AutoThrottle extension (disabled by default) 88 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 89 | #AUTOTHROTTLE_ENABLED = True 90 | # The initial download delay 91 | #AUTOTHROTTLE_START_DELAY = 5 92 | # The maximum download delay to be set in case of high latencies 93 | #AUTOTHROTTLE_MAX_DELAY = 60 94 | # The average number of requests Scrapy should be sending in parallel to 95 | # each remote server 96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 97 | # Enable showing throttling stats for every response received: 98 | #AUTOTHROTTLE_DEBUG = False 99 
| 100 | # Enable and configure HTTP caching (disabled by default) 101 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 102 | #HTTPCACHE_ENABLED = True 103 | #HTTPCACHE_EXPIRATION_SECS = 0 104 | #HTTPCACHE_DIR = 'httpcache' 105 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 106 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 107 | -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/spiders/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/spiders/.DS_Store -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /HongNiangNet/HongNiangNet/spiders/hongniang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | # from scrapy.spiders import CrawlSpider, Rule 5 | from HongNiangNet.items import HongniangnetItem 6 | # 分布式 7 | from scrapy.spider import Rule 8 | from scrapy_redis.spiders import RedisCrawlSpider 9 | 10 | # class HongniangSpider(CrawlSpider): 11 | class HongniangSpider(RedisCrawlSpider): 12 | 13 | name = 'hongniang' 14 | allowed_domains = ['hongniang.com'] 15 | # start_urls = ['http://www.hongniang.com/match?&page=1'] 16 | redis_key = "hongniangSpider:start_urls" 17 | 18 | # 动态域范围获取 19 | def __init__(self, *args, **kwargs): 20 | # Dynamically define the allowed domains list. 
21 | domain = kwargs.pop('domain', '') 22 | self.allowed_domains = filter(None, domain.split(',')) 23 | super(HongniangSpider, self).__init__(*args, **kwargs) 24 | 25 | # 每一页匹配规则 26 | page_links = LinkExtractor(allow=(r"hongniang.com/match?&page=\d+")) 27 | # 每个人个人主页匹配规则 28 | profile_links = LinkExtractor(allow=(r"hongniang.com/user/member/id/\d+")) 29 | rules = ( 30 | # 没有回调函数,说明follow是True 31 | Rule(page_links), 32 | # 有回调函数,说明follow是False 33 | Rule(profile_links, callback='parse_item',follow=True), 34 | ) 35 | 36 | def parse_item(self, response): 37 | 38 | item = HongniangnetItem() 39 | # 注意:xpath获取位置时,不从0开始 40 | # 用户名 41 | item["username"] = self.get_username(response) 42 | # 年龄 43 | item["age"] = self.get_age(response) 44 | # 头像图片链接 45 | item["header_link"] = self.get_header_link(response) 46 | # 相册图片链接 47 | item["images_url"] = self.get_images_url(response) 48 | # 内心独白 49 | item["content"] = self.get_content(response) 50 | # 籍贯 51 | item["place_from"] = self.get_place_from(response) 52 | # 学历 53 | item["education"] = self.get_education(response) 54 | # 爱好 55 | item["hobby"] = self.get_hobby(response) 56 | # 个人主页链接 57 | item["source_url"] = response.url 58 | # 数据来源网站 59 | item["source"] = "hongniang" 60 | 61 | yield item 62 | 63 | def get_username(self,response): 64 | username = response.xpath("//div[@class='name nickname']/text()").extract() 65 | if len(username): 66 | username = username[0] 67 | else: 68 | username = "NULL" 69 | return username.strip() 70 | 71 | def get_age(self,response): 72 | age = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div[1]/ul[1]/li[1]/text()").extract() 73 | if len(age): 74 | age = age[0] 75 | print(age) 76 | else: 77 | age = "NULL" 78 | return age.strip() 79 | 80 | def get_header_link(self,response): 81 | header_link = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img[1]/@src").extract() 82 | if len(header_link): 83 | header_link = header_link[0] 84 | else: 85 | header_link = "NULL" 86 | return header_link.strip() 87 | 88 | def get_images_url(self,response): 89 | images_url = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img/@src").extract() 90 | if len(images_url): 91 | images_url = images_url 92 | else: 93 | images_url = "NULL" 94 | return images_url 95 | 96 | def get_content(self,response): 97 | ontent = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info5']/div[@class='text']/text()").extract() 98 | if len(ontent): 99 | ontent = ontent[0] 100 | else: 101 | ontent = "NULL" 102 | return ontent.strip() 103 | 104 | def get_place_from(self,response): 105 | place_from = response.xpath("//div[@class='mem_main']/div[@class='sub2']/div[@class='info1'][1]/div[@class='right']/ul[2]/li[1]/text()").extract() 106 | if len(place_from): 107 | place_from = place_from[0] 108 | else: 109 | place_from = "NULL" 110 | return place_from.strip() 111 | 112 | def get_education(self,response): 113 | education = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div/ul[2]/li[2]/text()").extract() 114 | if len(education): 115 | education = education[0] 116 | else: 117 | education = "NULL" 118 | return education.strip() 119 | def get_hobby(self,response): 120 | hobby = 
response.xpath("//div[@class='mem_main']//div[@class='sub2']/div[@class='info1'][2]/div[@class='right'][1]/ul[1]/li[4]/text()").extract() 121 | if len(hobby): 122 | hobby = hobby[0] 123 | else: 124 | hobby = "NULL" 125 | return hobby.strip() 126 | 127 | -------------------------------------------------------------------------------- /HongNiangNet/begin.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl hongniang'.split()) -------------------------------------------------------------------------------- /HongNiangNet/content.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/content.json -------------------------------------------------------------------------------- /HongNiangNet/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = HongNiangNet.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = HongNiangNet 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Spider 2 | 1、豆瓣电影top250
3 | 2. Douyu: crawl the JSON room list and download streamer photos
4 | 3. CrawlSpider scrape of basic member profiles from the HongNiang matchmaking site, plus a distributed crawl that stores results in Redis (see the Redis seeding sketch after this list)
5 | 4. Small crawler demos
6 | 5. Using Selenium
7 | 6. PIL
8 | 7. Crawl Duodian products, store them in MySQL, and display them on a Django web page
9 | 8. Building an API with Django
10 | 9. Parsing txt, csv, and xml files with Python
11 | 10. Simple spiders with the Scrapy framework
12 | 11. Scrape Taobao model info, download the images locally, and store the data in MySQL
13 | 12. Crawl user profiles from Youyuan
14 | 13. Simulated GitHub login
15 | 14. Dynamic simulated login with Selenium
16 | 15. Simulated Zhihu login
17 | 16. Crawl Tencent job postings
18 | 17. [Crawl the full Duodian product catalog](https://github.com/lb2281075105/LBDuoDian)
19 | 18. Simulated JD login
20 | 19. Download NetEase Cloud Music lyrics
21 | 20. Taobao item info
22 | 21. JD product detail page info
23 | 22. Simulated Tuchong login
24 | 23. Use itchat to collect articles shared in WeChat groups or by WeChat friends
25 | 24. Crawl the article history of a WeChat official account
26 | 25. Use itchat to monitor articles shared by a chosen WeChat official account
27 | 26. itchat anti-recall for WeChat groups and friends
28 | 27. Forward messages between WeChat groups
29 | 28. Download bilibili videos, including multi-part collections
30 | 29. Crawl m3u8 video streams
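For the distributed HongNiang crawl (item 3), the spider in HongNiangNet/spiders/hongniang.py is a scrapy-redis RedisCrawlSpider that waits on the key hongniangSpider:start_urls, so nothing is crawled until a start URL is pushed into Redis. A minimal seeding sketch, reusing the host/port from HongNiangNet/settings.py and the commented-out start URL from the spider (the redis client package, a dependency of scrapy-redis, is assumed to be installed):

```python
# Sketch: seed the start URL so the RedisCrawlSpider begins crawling.
# Host/port mirror REDIS_HOST / REDIS_PORT in HongNiangNet/settings.py;
# point them at your own Redis instance.
import redis

r = redis.Redis(host='192.168.19.206', port=6379)
r.lpush('hongniangSpider:start_urls', 'http://www.hongniang.com/match?&page=1')
```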
31 | 32 | 详细请移步简书[Python文集](http://www.jianshu.com/nb/18442681) 33 | -------------------------------------------------------------------------------- /duodian/.idea/duodian.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 21 | 22 | 24 | -------------------------------------------------------------------------------- /duodian/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /duodian/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /duodian/db.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/db.sqlite3 -------------------------------------------------------------------------------- /duodian/duodian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/duodian/__init__.py -------------------------------------------------------------------------------- /duodian/duodian/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for duodian project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.11.4. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.11/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = 'htonb%m8+_d=tsnqm)6)_q@2@m#ulx#nb!8$wbluo9&1yi$yh$' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'myduodian', 41 | ] 42 | 43 | MIDDLEWARE = [ 44 | 'django.middleware.security.SecurityMiddleware', 45 | 'django.contrib.sessions.middleware.SessionMiddleware', 46 | 'django.middleware.common.CommonMiddleware', 47 | 'django.middleware.csrf.CsrfViewMiddleware', 48 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | ] 52 | MIDDLEWARE_CLASSES = [ 53 | 'django.contrib.sessions.middleware.SessionMiddleware', 54 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 55 | 'django.contrib.messages.middleware.MessageMiddleware', 56 | ] 57 | ROOT_URLCONF = 'duodian.urls' 58 | 59 | TEMPLATES = [ 60 | { 61 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 62 | 'DIRS': [os.path.join(BASE_DIR),'templates'], 63 | 'APP_DIRS': True, 64 | 'OPTIONS': { 65 | 'context_processors': [ 66 | 'django.template.context_processors.debug', 67 | 'django.template.context_processors.request', 68 | 'django.contrib.auth.context_processors.auth', 69 | 'django.contrib.messages.context_processors.messages', 70 | ], 71 | }, 72 | }, 73 | ] 74 | 75 | WSGI_APPLICATION = 'duodian.wsgi.application' 76 | 77 | 78 | # Database 79 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases 80 | 81 | DATABASES = { 82 | 'default': { 83 | 'ENGINE': 'django.db.backends.mysql', 84 | 'HOST':'127.0.0.1', 85 | 'PORT':'3306', 86 | 'NAME': 'test', 87 | 'USER':'root', 88 | 'PASSWORD':'', 89 | } 90 | } 91 | 92 | 93 | # Password validation 94 | # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators 95 | 96 | AUTH_PASSWORD_VALIDATORS = [ 97 | { 98 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 99 | }, 100 | { 101 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 102 | }, 103 | { 104 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 105 | }, 106 | { 107 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 108 | }, 109 | ] 110 | 111 | 112 | # Internationalization 113 | # https://docs.djangoproject.com/en/1.11/topics/i18n/ 114 | 115 | LANGUAGE_CODE = 'zh-hans' 116 | 117 | TIME_ZONE = 'UTC' 118 | 119 | USE_I18N = True 120 | 121 | USE_L10N = True 122 | 123 | USE_TZ = True 124 | 125 | 126 | # Static files (CSS, JavaScript, Images) 127 | # https://docs.djangoproject.com/en/1.11/howto/static-files/ 128 | 129 | STATIC_URL = '/static/' 130 | -------------------------------------------------------------------------------- /duodian/duodian/urls.py: -------------------------------------------------------------------------------- 1 | """duodian URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.11/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. 
Import the include() function: from django.conf.urls import url, include 14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url 17 | from django.contrib import admin 18 | from myduodian import views 19 | urlpatterns = [ 20 | url(r'^admin/', admin.site.urls), 21 | url(r'^index/', views.index), 22 | ] 23 | -------------------------------------------------------------------------------- /duodian/duodian/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for duodian project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "duodian.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /duodian/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "duodian.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError: 10 | # The above import may fail for some other reason. Ensure that the 11 | # issue is really that Django is missing to avoid masking other 12 | # exceptions on Python 2. 13 | try: 14 | import django 15 | except ImportError: 16 | raise ImportError( 17 | "Couldn't import Django. Are you sure it's installed and " 18 | "available on your PYTHONPATH environment variable? Did you " 19 | "forget to activate a virtual environment?" 20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /duodian/myduodian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/myduodian/__init__.py -------------------------------------------------------------------------------- /duodian/myduodian/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 
4 | 5 | from myduodian.models import AiDuoDian 6 | 7 | class DuoDianAdmin(admin.ModelAdmin): 8 | list_display = ['goodName','price','image'] 9 | 10 | 11 | admin.site.register(AiDuoDian,DuoDianAdmin) -------------------------------------------------------------------------------- /duodian/myduodian/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='AiDuoDian', 15 | fields=[ 16 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 17 | ('image', models.CharField(max_length=1000)), 18 | ('goodName', models.CharField(max_length=200)), 19 | ('price', models.CharField(max_length=40)), 20 | ], 21 | ), 22 | ] 23 | -------------------------------------------------------------------------------- /duodian/myduodian/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/myduodian/migrations/__init__.py -------------------------------------------------------------------------------- /duodian/myduodian/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from django.db import models 3 | 4 | class AiDuoDian(models.Model): 5 | 6 | image = models.CharField(max_length=1000) 7 | goodName = models.CharField(max_length=200) 8 | price = models.CharField(max_length=40) -------------------------------------------------------------------------------- /duodian/myduodian/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /duodian/myduodian/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | from django.http import HttpResponse 3 | # Create your views here. 4 | 5 | from myduodian.models import * 6 | def index(request): 7 | context = {"list":AiDuoDian.objects.all()} 8 | return render(request,'myduodian/index.html',context) -------------------------------------------------------------------------------- /duodian/templates/myduodian/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% for item in list %} 32 | 33 | 34 | 35 | 36 | 37 | {% endfor %} 38 | 39 | 40 | 41 |
商品图片商品名价格
{{ item.goodName }}{{ item.price }}
42 | 43 | 44 | -------------------------------------------------------------------------------- /duodian/woduodian.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | import MySQLdb 4 | import json 5 | import jsonpath 6 | import urllib2 7 | import os 8 | class DuoDian(): 9 | def __init__(self): 10 | self.url = 'https://gatewx.dmall.com/customersite/searchWareByCategory?param={"pageNum":1,"pageSize":30,"venderId":"1","storeId":"108","sort":"1","categoryId":11347,"categoryLevel":3,"cateSource":1,"bizType":"1"}&token=&source=2&tempid=C7B357489E400002B1514BD01B00E270&pubParam={"utmSource":"wxmp"}&_=1511256196255' 11 | # 建立和数据库的连接 12 | self.db = MySQLdb.connect(host='127.0.0.1', user="root", passwd="", db="test") 13 | # 获取操作游标 14 | self.cursor = self.db.cursor() 15 | 16 | def get_html(self): 17 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'} 18 | request = urllib2.Request(self.url,headers=headers) 19 | response = urllib2.urlopen(request) 20 | html = response.read() 21 | return html 22 | 23 | def get_html1(self,url): 24 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'} 25 | request = urllib2.Request(url,headers=headers) 26 | response = urllib2.urlopen(request) 27 | html = response.read() 28 | return html 29 | 30 | def get_content(self): 31 | jsonobj = json.loads(self.get_html()) 32 | # 商品名称 33 | namelist = jsonpath.jsonpath(jsonobj, '$..title') 34 | # 商品价格 35 | pricelist = jsonpath.jsonpath(jsonobj, '$..promotionPrice') 36 | # 商品图片 37 | imglist = jsonpath.jsonpath(jsonobj, '$..img') 38 | listdata = zip(imglist,namelist,pricelist) 39 | 40 | 41 | 42 | for item in listdata: 43 | # print(item[1]) 44 | try: 45 | result = self.cursor.execute( 46 | "insert into myduodian_aiduodian (image,goodName,price) VALUES (%s,%s,%s)",[item[0],item[1],item[2]]) 47 | self.db.commit() 48 | print(result) 49 | except Exception as e: 50 | self.db.rollback() 51 | print('失败') 52 | 53 | # 关闭连接,释放资源 54 | self.db.close() 55 | 56 | 57 | if __name__ == "__main__": 58 | duodian = DuoDian() 59 | duodian.get_content() -------------------------------------------------------------------------------- /gongzhonghao.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/gongzhonghao.jpeg -------------------------------------------------------------------------------- /jiekou/.idea/jiekou.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 21 | 22 | 24 | -------------------------------------------------------------------------------- /jiekou/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /jiekou/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /jiekou/db.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/db.sqlite3 -------------------------------------------------------------------------------- /jiekou/jiekou/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/jiekou/__init__.py -------------------------------------------------------------------------------- /jiekou/jiekou/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for jiekou project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.8.2. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.8/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.8/ref/settings/ 11 | """ 12 | 13 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 14 | import os 15 | 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '3!2z2kqm4erg8#8y1+5n1%wl3lw32@1u&4mlnh+orzl%ns39wq' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = ( 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'myjiekou', 41 | ) 42 | 43 | MIDDLEWARE_CLASSES = ( 44 | 'django.contrib.sessions.middleware.SessionMiddleware', 45 | 'django.middleware.common.CommonMiddleware', 46 | 'django.middleware.csrf.CsrfViewMiddleware', 47 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 48 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | 'django.middleware.security.SecurityMiddleware', 52 | ) 53 | 54 | ROOT_URLCONF = 'jiekou.urls' 55 | 56 | TEMPLATES = [ 57 | { 58 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 59 | 'DIRS': [os.path.join(BASE_DIR),'templates'], 60 | 'APP_DIRS': True, 61 | 'OPTIONS': { 62 | 'context_processors': [ 63 | 'django.template.context_processors.debug', 64 | 'django.template.context_processors.request', 65 | 'django.contrib.auth.context_processors.auth', 66 | 'django.contrib.messages.context_processors.messages', 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = 'jiekou.wsgi.application' 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/1.8/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.mysql', 81 | 'HOST':'127.0.0.1', 82 | 'PORT':'3306', 83 | 'NAME': 'test', 84 | 'USER':'root', 85 | 'PASSWORD':'', 86 | } 87 | } 88 | 89 | 90 | # Internationalization 91 | # https://docs.djangoproject.com/en/1.8/topics/i18n/ 92 | 93 | LANGUAGE_CODE = 'zh-hans' 94 | 95 | TIME_ZONE = 'UTC' 96 | 97 | USE_I18N = True 98 | 99 | USE_L10N = True 100 | 101 | USE_TZ = True 102 | 103 | 104 | # Static files (CSS, JavaScript, Images) 105 | # https://docs.djangoproject.com/en/1.8/howto/static-files/ 106 | 107 | STATIC_URL = '/static/' 108 | -------------------------------------------------------------------------------- /jiekou/jiekou/urls.py: 
-------------------------------------------------------------------------------- 1 | """jiekou URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.8/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Add an import: from blog import urls as blog_urls 14 | 2. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls)) 15 | """ 16 | from django.conf.urls import include, url 17 | from django.contrib import admin 18 | from myjiekou import views 19 | urlpatterns = [ 20 | url(r'^admin/', include(admin.site.urls)), 21 | url(r'^index/', views.index), 22 | url(r'^api/', views.api), 23 | ] 24 | -------------------------------------------------------------------------------- /jiekou/jiekou/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for jiekou project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiekou.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /jiekou/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiekou.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /jiekou/myjiekou/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/myjiekou/__init__.py -------------------------------------------------------------------------------- /jiekou/myjiekou/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | from myjiekou.models import MyModel 4 | # Register your models here. 
5 | 6 | class MyAdmin(admin.ModelAdmin): 7 | list_display = ["name","age","hobby"] 8 | 9 | admin.site.register(MyModel,MyAdmin) 10 | -------------------------------------------------------------------------------- /jiekou/myjiekou/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='MyModel', 15 | fields=[ 16 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 17 | ('name', models.CharField(max_length=20)), 18 | ('age', models.CharField(max_length=100)), 19 | ('hobby', models.CharField(max_length=300)), 20 | ], 21 | ), 22 | ] 23 | -------------------------------------------------------------------------------- /jiekou/myjiekou/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/myjiekou/migrations/__init__.py -------------------------------------------------------------------------------- /jiekou/myjiekou/models.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from django.db import models 3 | 4 | # Create your models here. 5 | 6 | class MyModel(models.Model): 7 | # 姓名 8 | name = models.CharField(max_length=20) 9 | # 年龄 10 | age = models.CharField(max_length=100) 11 | # 爱好 12 | hobby = models.CharField(max_length=300) 13 | 14 | 15 | -------------------------------------------------------------------------------- /jiekou/myjiekou/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
4 | -------------------------------------------------------------------------------- /jiekou/myjiekou/views.py: -------------------------------------------------------------------------------- 1 | 2 | #encoding=utf-8 3 | from django.shortcuts import render 4 | from django.http import HttpResponse,JsonResponse 5 | from models import MyModel 6 | import json 7 | def index(request): 8 | content = MyModel.objects.all() 9 | list = {"content":content} 10 | return render(request,"myjiekou/index.html",list) 11 | 12 | def api(request): 13 | list = [] 14 | item = {} 15 | content = MyModel.objects.all() 16 | for one in content: 17 | item["name"] = one.name 18 | item["age"] = one.age 19 | item["hobby"] = one.hobby 20 | list.append(item) 21 | 22 | return JsonResponse({"status":200,"date":list}) -------------------------------------------------------------------------------- /jiekou/templates/myjiekou/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /teacherInfo/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /teacherInfo/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /teacherInfo/.idea/teacherInfo.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /teacherInfo/begin.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute('scrapy crawl myteacher'.split()) -------------------------------------------------------------------------------- /teacherInfo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = teacherInfo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = teacherInfo 12 | -------------------------------------------------------------------------------- /teacherInfo/teacherInfo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/teacherInfo/teacherInfo/__init__.py -------------------------------------------------------------------------------- /teacherInfo/teacherInfo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | # Item 定义结构化数据字段,用来保存爬取到的数据 11 | class TeacherinfoItem(scrapy.Item): 12 | 13 | # 获取名字 14 | name = scrapy.Field() 15 | # 职称 16 | position = scrapy.Field() 17 | # 个人信息 18 | info = scrapy.Field() 19 | 20 | 21 | -------------------------------------------------------------------------------- 
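A note on the api view in jiekou/myjiekou/views.py above: it mutates a single item dict inside the loop and appends that same object each time, so every element of the returned list ends up holding the last record's values. A hedged corrected sketch, assuming the same MyModel fields and that the "date" key was meant to be "data":

```python
# Sketch only: build one dict per row instead of reusing a single dict.
from django.http import JsonResponse

from myjiekou.models import MyModel


def api(request):
    rows = []
    for one in MyModel.objects.all():
        rows.append({
            "name": one.name,
            "age": one.age,
            "hobby": one.hobby,
        })
    # The original returns {"status": 200, "date": list}; "data" is assumed here.
    return JsonResponse({"status": 200, "data": rows})
```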
/teacherInfo/teacherInfo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TeacherinfoSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /teacherInfo/teacherInfo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import json 9 | import codecs 10 | class TeacherinfoPipeline(object): 11 | def __init__(self): 12 | self.filename = codecs.open('teacher.json','wb','utf-8') 13 | def process_item(self, item, spider): 14 | print(item) 15 | html = json.dumps(dict(item),ensure_ascii=False) 16 | self.filename.write(html + '\n') 17 | return item 18 | 19 | def open_spider(self, spider): 20 | pass 21 | # self.filename.close() -------------------------------------------------------------------------------- /teacherInfo/teacherInfo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for teacherInfo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'teacherInfo' 13 | 14 | SPIDER_MODULES = ['teacherInfo.spiders'] 15 | NEWSPIDER_MODULE = 'teacherInfo.spiders' 16 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)' 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'teacherInfo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | SPIDER_MIDDLEWARES = { 50 | 'teacherInfo.middlewares.TeacherinfoSpiderMiddleware': 543, 51 | } 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'teacherInfo.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'teacherInfo.pipelines.TeacherinfoPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- 
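Neither of the JSON-writing pipelines in this repo closes its output file reliably: HongniangnetPipeline defines spider_closed, which Scrapy does not call automatically on an item pipeline, and the TeacherinfoPipeline above leaves the close call commented out under open_spider. A hedged sketch of the conventional open_spider/close_spider pairing, keeping the same teacher.json output:

```python
# Sketch only: Scrapy calls open_spider/close_spider on item pipelines.
import codecs
import json


class TeacherinfoPipeline(object):

    def open_spider(self, spider):
        # Open the output file once, when the spider starts.
        self.filename = codecs.open('teacher.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line, keeping non-ASCII characters readable.
        line = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(line + '\n')
        return item

    def close_spider(self, spider):
        # Flush and close the file when the spider finishes.
        self.filename.close()
```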
/teacherInfo/teacherInfo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /teacherInfo/teacherInfo/spiders/myteacher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from teacherInfo.items import TeacherinfoItem 4 | 5 | class MyteacherSpider(scrapy.Spider): 6 | name = 'myteacher' 7 | allowed_domains = ['itcast.cn'] 8 | # start_urls = ("http://www.itcast.cn/channel/teacher.shtml",) 元组也可以 9 | start_urls = ['http://www.itcast.cn/channel/teacher.shtml#ac', 10 | 'http://www.itcast.cn/channel/teacher.shtml#acloud', 11 | 'http://www.itcast.cn/channel/teacher.shtml#adesign', 12 | 'http://www.itcast.cn/channel/teacher.shtml#ads', 13 | 'http://www.itcast.cn/channel/teacher.shtml#ajavaee', 14 | 'http://www.itcast.cn/channel/teacher.shtml#anetmarket', 15 | 'http://www.itcast.cn/channel/teacher.shtml#aphp', 16 | 'http://www.itcast.cn/channel/teacher.shtml#apm', 17 | 'http://www.itcast.cn/channel/teacher.shtml#apython', 18 | 'http://www.itcast.cn/channel/teacher.shtml#astack', 19 | 'http://www.itcast.cn/channel/teacher.shtml#atest', 20 | 'http://www.itcast.cn/channel/teacher.shtml#aui', 21 | 'http://www.itcast.cn/channel/teacher.shtml#auijp', 22 | 'http://www.itcast.cn/channel/teacher.shtml#aweb'] 23 | # 爬虫的约束区域 24 | def parse(self, response): 25 | # 存放老师信息的集合 26 | items = [] 27 | print(response.body) 28 | for each in response.xpath("//div[@class='li_txt']"): 29 | # 将我们得到的数据封装到一个 `ItcastItem` 对象 30 | item = TeacherinfoItem() 31 | # extract()方法返回的都是unicode字符串 32 | name = each.xpath("h3/text()").extract() 33 | position = each.xpath("h4/text()").extract() 34 | info = each.xpath("p/text()").extract() 35 | 36 | # xpath返回的是包含一个元素的列表 37 | item['name'] = name[0] 38 | item['position'] = position[0] 39 | item['info'] = info[0] 40 | 41 | items.append(item) 42 | yield item 43 | # 直接返回最后数据 44 | # return items 45 | -------------------------------------------------------------------------------- /爬虫小demo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/爬虫小demo/.DS_Store -------------------------------------------------------------------------------- /爬虫小demo/01 taobao.py: -------------------------------------------------------------------------------- 1 | from urllib import request, parse, error 2 | import json 3 | import os 4 | import pymysql 5 | import ssl 6 | # 请求链接需要设置ssl认证 7 | ssl._create_default_https_context = ssl._create_unverified_context 8 | 9 | 10 | class TaoBao(): 11 | 12 | def __init__(self): 13 | # 设置头部 14 | self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'} 15 | # 设置get参数 16 | self.params = {'_input_charset': 'utf-8', 17 | 'q': '', 18 | 'viewFlag': 'A', 19 | 'sortType': 'default', 20 | 'searchStyle': '', 21 | 'searchRegion': 'city', 22 | 'searchFansNum': '', 23 | 'currentPage': '', 24 | 'pageSize': '20' 25 | } 26 | self.url = 'https://mm.taobao.com/tstar/search/tstar_model.do' 27 | 28 | 29 | def get_connect(self): 30 | 31 | self.tablename = 'taobao' 32 | self.db = 
pymysql.connect(host='127.0.0.1', user='root', passwd='', db='test', charset='utf8') 33 | self.cur = self.db.cursor() 34 | self.cur.execute('USE test') 35 | try: 36 | # 创建表 37 | self.cur.execute('CREATE TABLE '+self.tablename+' (id BIGINT(7) NOT NULL AUTO_INCREMENT, name VARCHAR(100), city VARCHAR(20), height VARCHAR(10), weight VARCHAR(10), homepage VARCHAR(100), profile VARCHAR(100), pic VARCHAR(100), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))') 38 | except pymysql.err.InternalError as e: 39 | print(e) 40 | # 修改表字段 41 | self.cur.execute('ALTER DATABASE test CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci') 42 | self.cur.execute('ALTER TABLE '+self.tablename+' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 43 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE name name VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 44 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE city city VARCHAR(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 45 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE height height VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 46 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE weight weight VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 47 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE homepage homepage VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 48 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE profile profile VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 49 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE pic pic VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 50 | 51 | def insert_table(self,name, city, height, weight, hompage, profile, pic): 52 | self.cur.execute('INSERT INTO '+self.tablename+' (name, city, height, weight, homepage, profile, pic) VALUES (\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\")', (name, city, height, weight, hompage, profile, pic)) 53 | self.cur.connection.commit() 54 | 55 | 56 | def get_html(self,page): 57 | self.params['currentPage'] = str(page) 58 | # urlencode可以把字典=键值对编码成url地址中get参数 59 | self.param = parse.urlencode(self.params).encode('utf-8') 60 | # data=self.param 上传参数 61 | req = request.Request(self.url, data=self.param, headers=self.headers) 62 | content = request.urlopen(req) 63 | content = json.loads(content.read().decode('gbk')) 64 | if content['status'] == -1: 65 | return -1 66 | 67 | return content 68 | 69 | def parser_json(self,content, page): 70 | meinvist = [] 71 | # 解析json数据 72 | data = content['data']['searchDOList'] 73 | for list in data: 74 | temp = {} 75 | temp['id'] = str(list['userId']) 76 | temp['name'] = list['realName'] 77 | temp['city'] = list['city'] 78 | temp['height'] = str(list['height']) 79 | temp['weight'] = str(list['weight']) 80 | temp['favornum'] = str(list['totalFavorNum']) 81 | temp['profile'] = 'http:'+list['avatarUrl'] 82 | temp['pic'] = 'http:'+list['cardUrl'] 83 | 84 | # meinvist.append(temp) 85 | self.mkdir(temp['name']) 86 | print('%s正在抓取%s'%(page, temp['name'])) 87 | self.get_img(temp['profile'], temp['name'], 'profile') 88 | self.get_img(temp['pic'], temp['name'], 'pic') 89 | if not os.path.exists('./'+temp['name']+'/info.txt'): 90 | with open('./'+temp['name']+'/info.txt', 'w') as f: 91 | f.write(temp['name']+'\n') 92 | f.write(temp['city']+'\n') 93 | f.write(temp['height']+'\n') 94 | f.write(temp['weight']+'\n') 95 | # 插入数据库 96 | self.insert_table(temp['name'], temp['city'], 
temp['height'], temp['weight'], 'https://mm.taobao.com/self/aiShow.htm?userId='+temp['id'], temp['profile'], temp['pic']) 97 | # return meinvist 98 | 99 | # 判断文件夹是否存在 100 | def mkdir(self,path): 101 | if not os.path.exists(path): 102 | os.makedirs(path) 103 | else: 104 | print('目录已存在!') 105 | 106 | # 判断文件是否存在 107 | def get_img(self,url, path, name): 108 | if os.path.exists('./' + path + '/' + name + '.jpg'): 109 | print('文件已存在!') 110 | return 0 111 | try: 112 | req = request.Request(url, headers=self.headers) 113 | reponse = request.urlopen(req) 114 | get_img = reponse.read() 115 | with open('./' + path + '/' + name + '.jpg', 'wb') as fp: 116 | fp.write(get_img) 117 | # 也可以用一下代码实现图片的下载 118 | # request.urlretrieve(img, './' + path + '/' + name + '.jpg') 119 | except error.URLError as e: 120 | print(e.reason) 121 | 122 | 123 | 124 | if __name__ == '__main__': 125 | page = 1 126 | taobao = TaoBao() 127 | taobao.get_connect() 128 | while True: 129 | content = taobao.get_html(page) 130 | if content == -1: 131 | print('抓取完毕!') 132 | exit() 133 | # 解析json 134 | taobao.parser_json(content, page) 135 | page += 1 136 | -------------------------------------------------------------------------------- /爬虫小demo/02 doubanzhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | 4 | # from selenium import webdriver 5 | # from selenium.webdriver.common.keys import Keys 6 | # import time 7 | # 8 | # driver = webdriver.PhantomJS(executable_path="/Users/yunmei/phantomjs-2.1.1-macosx/bin/phantomjs") 9 | # driver.get("https://www.douban.com/") 10 | # 11 | # # 输入账号密码 12 | # driver.find_element_by_name("form_email").send_keys("2334497007@qq.com") 13 | # driver.find_element_by_name("form_password").send_keys("lbaiwb1314") 14 | # 15 | # # 模拟点击登录 16 | # driver.find_element_by_xpath("//input[@class='bn-submit']").click() 17 | # 18 | # # 等待3秒 19 | # time.sleep(3) 20 | # 21 | # # 生成登陆后快照 22 | # driver.save_screenshot("douban.png") 23 | # 24 | # with open("douban.html", "w") as file: 25 | # file.write(driver.page_source.encode('utf-8')) 26 | # 27 | # driver.quit() 28 | 29 | 30 | from selenium import webdriver 31 | import time 32 | # 创建浏览器对象 33 | browser=webdriver.PhantomJS(executable_path="/Users/yunmei/phantomjs-2.1.1-macosx/bin/phantomjs") 34 | # 请求加载登录链接 35 | browser.get('https://www.zhihu.com/#signin') 36 | time.sleep(3) 37 | # 模拟点击使用密码登录 38 | browser.find_element_by_css_selector(".signin-switch-password").click() 39 | # 输入账号 40 | browser.find_element_by_css_selector(".account input[name='account']").send_keys('17078075655') 41 | # 输入密码 42 | browser.find_element_by_css_selector(".verification input[name='password']").send_keys('19910825580lb') 43 | # 模拟点击登录 44 | browser.find_element_by_css_selector(".sign-button").click() 45 | time.sleep(3) 46 | # 截图 47 | browser.save_screenshot("zhihu.png") 48 | browser.quit() -------------------------------------------------------------------------------- /爬虫小demo/03 douYuUnittest.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | # python的测试模块 4 | import unittest 5 | from selenium import webdriver 6 | from bs4 import BeautifulSoup 7 | 8 | class douyuSelenium(unittest.TestCase): 9 | # 初始化方法 10 | def setUp(self): 11 | self.driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs") 12 | 13 | #具体的测试用例方法,一定要以test开头 14 | def testDouyu(self): 15 | self.driver.get('http://www.douyu.com/directory/all') 16 | while True: 17 | # 
指定xml解析 18 | soup = BeautifulSoup(self.driver.page_source, 'lxml') 19 | # 返回当前页面所有房间标题列表 和 观众人数列表 20 | titles = soup.find_all('h3', attrs={'class': 'ellipsis'}) 21 | nums = soup.find_all('span', attrs={'class': 'dy-num fr'}) 22 | 23 | # 使用zip()函数来可以把列表合并,并创建一个元组对的列表[(1,2), (3,4)] 24 | for title, num in zip(nums, titles): 25 | print u"房间标题: " + num.get_text().strip(), u"\t观众人数:" + title.get_text().strip() 26 | # page_source.find()未找到内容则返回-1 27 | if self.driver.page_source.find('shark-pager-disable-next') != -1: 28 | break 29 | # 模拟下一页点击 30 | self.driver.find_element_by_class_name('shark-pager-next').click() 31 | 32 | # 退出时的清理方法 33 | def tearDown(self): 34 | print '加载完成...' 35 | self.driver.quit() 36 | 37 | if __name__ == "__main__": 38 | unittest.main() -------------------------------------------------------------------------------- /爬虫小demo/04 fileHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import csv 3 | 4 | # 1、txt文件 5 | file = open('file.txt','r') 6 | # 获取所有的信息 7 | print file.read() 8 | file.write("你好") 9 | # 获取所有并且在所有行存在一个数组 10 | print file.readlines() 11 | # 获取第一行 12 | print file.readline() 13 | 14 | # 2、读取csv文件 15 | 16 | writer = csv.writer(open('test.csv','wb')) 17 | writer.writerow(['col1','col2','col3']) 18 | data = [range(3) for i in range(3)] 19 | for item in data: 20 | writer.writerow(item) 21 | 22 | filelist = csv.reader(open('./test.csv','r')) 23 | for item in filelist: 24 | print item 25 | 26 | 27 | # 3、读取xml文件 28 | 29 | from xml.dom import minidom 30 | # parse打开xml文件 31 | dom = minidom.parse("info.xml") 32 | # 获取根节点 33 | root = dom.documentElement 34 | print root.nodeName 35 | print root.nodeValue 36 | print root.nodeType 37 | print root.ELEMENT_NODE 38 | print "--" * 8 39 | province = root.getElementsByTagName("province") 40 | print province[0].tagName 41 | print province[0].getAttribute("username") 42 | print province[0].firstChild.data 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /爬虫小demo/05 getimage.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import urllib2 4 | import lxml.etree 5 | 6 | class GetImage(): 7 | 8 | def __init__(self): 9 | self.tieba = "https://tieba.baidu.com" 10 | self.count = 50 11 | 12 | def get_html(self,url): 13 | request = urllib2.Request(url) 14 | response = urllib2.urlopen(request) 15 | html = response.read() 16 | return html 17 | 18 | def get_xpath(self): 19 | # 起始页 20 | baginPage = int(raw_input("请输入起始页:")) 21 | # 结束页 22 | endPage = int(raw_input("请输入结束页:")) 23 | for pagecount in range(baginPage,endPage + 1): 24 | pn = (pagecount - 1) * self.count 25 | urllink = self.tieba + "/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=" + str(pn) 26 | xmlcontent = lxml.etree.HTML(self.get_html(urllink)) 27 | # content = xmlcontent.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href') 28 | # content = xmlcontent.xpath('//div[@class="threadlist_title pull_left j_th_tit "]//a[@class="j_th_tit "]/@href') 29 | content = xmlcontent.xpath('//a[@class="j_th_tit "]/@href') 30 | 31 | for item in content: 32 | itemcontent = lxml.etree.HTML(self.get_html(self.tieba + item)) 33 | print self.tieba + item 34 | itemlist = itemcontent.xpath('//img[@class="BDE_Image"]//@src') 35 | for imageitem in itemlist: 36 | get_image = self.get_html(imageitem) 37 | with open("images/" + imageitem[-10:],'a') as file: 38 | file.write(get_image) 39 | file.close 40 | 41 | if 
__name__ == "__main__": 42 | getImages = GetImage() 43 | getImages.get_xpath() -------------------------------------------------------------------------------- /爬虫小demo/06 jsload.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from selenium import webdriver 3 | from time import sleep 4 | from selenium.webdriver.common.keys import Keys 5 | 6 | driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs") 7 | driver.get("http://baidu.com/") 8 | 9 | driver.find_element_by_id("kw").send_keys(u"长城") 10 | sleep(10) 11 | driver.find_element_by_id("su").click() 12 | 13 | driver.save_screenshot("长城.png") 14 | 15 | -------------------------------------------------------------------------------- /爬虫小demo/07 jsondata.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import json 4 | import jsonpath 5 | import urllib2 6 | 7 | class Json(): 8 | def __init__(self): 9 | self.url = "http://www.lagou.com/lbs/getAllCitySearchLabels.json" 10 | 11 | def get_json(self): 12 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'} 13 | request = urllib2.Request(self.url,headers=headers) 14 | response = urllib2.urlopen(request) 15 | html = response.read() 16 | jsonobj = json.loads(html) 17 | # 获取城市名称 18 | namelist = jsonpath.jsonpath(jsonobj,'$..name') 19 | for name in namelist: 20 | print(name) 21 | 22 | # 把列表存储为字符串 23 | nametext = json.dumps(namelist,ensure_ascii=False) 24 | with open('name.txt','a') as file: 25 | file.write(nametext.encode("utf-8")) 26 | file.close 27 | 28 | 29 | if __name__ == "__main__": 30 | jsono = Json() 31 | jsono.get_json() 32 | -------------------------------------------------------------------------------- /爬虫小demo/08 jsonpath和json总结.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | 4 | import json 5 | import jsonpath 6 | import time 7 | 8 | # 1、第一种存储字典和数组 9 | 10 | listDict = [{"city": "北京"},{"name": "小明"}] 11 | strlist = json.dumps(listDict, ensure_ascii=False) 12 | print type(strlist) # 13 | # 写数据 14 | with open("listDict.json",'w') as file: 15 | file.write(strlist) 16 | 17 | # 2、第二种存储字典和数组 18 | listStr = [{"city": "北京"}, {"name": "大刘"}] 19 | json.dump(listStr, open("listStr.json","w"), ensure_ascii=False) 20 | 21 | dictStr = {"city": "北京", "name": "大刘"} 22 | json.dump(dictStr, open("dictStr.json","w"), ensure_ascii=False) 23 | time.sleep(1) 24 | 25 | # ------------ 从文件里面取数据 --------- 26 | 27 | dictList = json.load(open("listDict.json",'r')) 28 | # 输出北京 29 | print dictList[0]["city"] 30 | # ------------ 读出字典loads ---------- 31 | strDict = '{"city": "北京", "name": "大猫"}' 32 | # 33 | print type(json.loads(strDict)) 34 | 35 | jsonobj = json.loads(strDict) 36 | 37 | # 从根节点开始,匹配name节点 38 | citylist = jsonpath.jsonpath(jsonobj,'$..name') 39 | 40 | print citylist[0].encode('utf-8') -------------------------------------------------------------------------------- /爬虫小demo/09 zhihu_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | import requests 4 | import time 5 | 6 | 7 | class Login(): 8 | # 模拟登录一般步骤:(1)首先抓包,根据webForm来分析需要传那些data 9 | # (2)分析_xsrf获取 10 | # (3)分析验证码获取方式 11 | # (4)post登录 12 | 13 | def get_login(self): 14 | sess=requests.Session() 15 | # 头部headers需要注意,如果头部没有设置好,下面的步骤就会不能执行成功 16 | headers={'User-Agent':'Mozilla/5.0 
(Macintosh; Intel Mac OS X 10.13; rv:56.0)'} 17 | # 首先获取登录页面,找到需要get的数据,同时记录cookie的值 18 | html=sess.get('https://www.zhihu.com/#signin',headers=headers).text 19 | # 调用xml解析库 20 | bs=BeautifulSoup(html,'lxml') 21 | # _xsrf作用是跨站请求伪造(或者叫跨域攻击) 22 | _xsrf=bs.find('input',attrs={'name':'_xsrf'}).get('value') 23 | # 通过时间戳拼接验证码链接 24 | captcha_url='https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000) 25 | # 发送验证码请求,获取图片数据流。 26 | captchadata = sess.get(captcha_url, headers=headers).content 27 | text = self.captcha(captchadata) 28 | 29 | data={ 30 | '_xsrf':_xsrf, 31 | 'phone_num':'17078075655',# 换成邮箱登录也可 32 | 'password':'lbaiwb1314', 33 | 'captcha':text 34 | } 35 | response=sess.post('https://www.zhihu.com/login/phone_num',data=data,headers=headers) 36 | # print type(response.text) 37 | # 在个人中心请求一下是否真正登录成功 38 | response=sess.get('https://www.zhihu.com/people/liu-tao-98-32/activities',headers=headers) 39 | with open("mylogin.txt", "w") as file: 40 | file.write(response.text.encode("utf-8")) 41 | 42 | def captcha(self,captcha_data): 43 | # 将二进制数据写入到文件中 44 | with open('captcha.jpg','wb')as f: 45 | f.write(captcha_data) 46 | text=raw_input('请输入登录验证码') 47 | return text 48 | 49 | if __name__=='__main__': 50 | 51 | login = Login() 52 | login.get_login() 53 | -------------------------------------------------------------------------------- /爬虫小demo/10 match.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import re 4 | import urllib2 5 | 6 | class Content: 7 | 8 | def __init__(self): 9 | self.page = 1 10 | 11 | def get_html(self): 12 | # 获取整个网页的html内容 13 | headers = { 14 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"} 15 | url = "http://www.neihan8.com/article/list_5_"+str(self.page)+".html" 16 | request = urllib2.Request(url=url, headers=headers) 17 | response = urllib2.urlopen(request) 18 | html = response.read() 19 | return html 20 | 21 | def get_content(self): 22 | pattern = re.compile(r'(.*?)', re.S) 23 | content_list = pattern.findall(self.get_html()) 24 | for content in content_list: 25 | result_content = content.decode('gbk').replace("
<p>", "").replace("</p>", "") \ 26 | .replace("“", "").replace("<br />
", "") \ 27 | .replace("”", "").replace("&hellip", "") 28 | 29 | with open("content.txt", "a") as file: 30 | file.write(result_content.encode("utf-8")) 31 | file.close 32 | 33 | if __name__ == "__main__": 34 | 35 | content = Content() 36 | while True: 37 | content.page+=1 38 | print content.page 39 | content.get_content() 40 | 41 | """ 42 | r 打开只读文件,该文件必须存在。 43 | r+ 打开可读写的文件,该文件必须存在。 44 | w 打开只写文件,若文件存在则文件长度清为0,即该文件内容会消失。若文件不存在则建立该文件。 45 | w+ 打开可读写文件,若文件存在则文件长度清为零,即该文件内容会消失。若文件不存在则建立该文件。 46 | a 以附加的方式打开只写文件。若文件不存在,则会建立该文件,如果文件存在,写入的数据会被加到文件尾,即文件原先的内容会被保留。 47 | a+ 以附加方式打开可读写的文件。若文件不存在,则会建立该文件,如果文件存在,写入的数据会被加到文件尾后,即文件原先的内容会被保留。 48 | """ -------------------------------------------------------------------------------- /爬虫小demo/11 neihan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import urllib2 5 | import re 6 | 7 | class Spider: 8 | def __init__(self): 9 | # 初始化起始页位置 10 | self.page = 1 11 | # 爬取开关,如果为True继续爬取 12 | self.switch = True 13 | 14 | def loadPage(self): 15 | """ 16 | 作用:下载页面 17 | """ 18 | print "正在下载数据...." 19 | url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html" 20 | headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} 21 | request = urllib2.Request(url, headers = headers) 22 | response = urllib2.urlopen(request) 23 | 24 | # 获取每页的HTML源码字符串 25 | html = response.read() 26 | #print html 27 | 28 | # 创建正则表达式规则对象,匹配每页里的段子内容,re.S 表示匹配全部字符串内容 29 | pattern = re.compile('(.*?)', re.S) 30 | 31 | # 将正则匹配对象应用到html源码字符串里,返回这个页面里的所有段子的列表 32 | content_list = pattern.findall(html) 33 | 34 | # 调用dealPage() 处理段子里的杂七杂八 35 | self.dealPage(content_list) 36 | 37 | def dealPage(self, content_list): 38 | """ 39 | 处理每页的段子 40 | content_list : 每页的段子列表集合 41 | """ 42 | for item in content_list: 43 | # 将集合里的每个段子按个处理,替换掉无用数据 44 | item = item.replace("
<p>","").replace("</p>", "").replace("<br />
", "") 45 | #print item.decode("gbk") 46 | # 处理完后调用writePage() 将每个段子写入文件内 47 | self.writePage(item) 48 | 49 | def writePage(self, item): 50 | """ 51 | 把每条段子逐个写入文件里 52 | item: 处理后的每条段子 53 | """ 54 | # 写入文件内 55 | print "正在写入数据...." 56 | with open("duanzi.txt", "a") as f: 57 | f.write(item) 58 | 59 | def startWork(self): 60 | """ 61 | 控制爬虫运行 62 | """ 63 | # 循环执行,直到 self.switch == False 64 | while self.switch: 65 | # 用户确定爬取的次数 66 | self.loadPage() 67 | command = raw_input("如果继续爬取,请按回车(退出输入quit)") 68 | if command == "quit": 69 | # 如果停止爬取,则输入 quit 70 | self.switch = False 71 | # 每次循环,page页码自增1 72 | self.page += 1 73 | print "谢谢使用!" 74 | 75 | 76 | if __name__ == "__main__": 77 | duanziSpider = Spider() 78 | # duanziSpider.loadPage() 79 | duanziSpider.startWork() 80 | 81 | -------------------------------------------------------------------------------- /爬虫小demo/12 PIL.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding:utf-8 -*- 3 | import pytesseract 4 | from PIL import Image 5 | 6 | # PIL读取与存储图像 7 | 8 | # 1、PIL识别图片上面文字 9 | images = Image.open('test.png') 10 | text = pytesseract.image_to_string(images) 11 | print text 12 | 13 | # 2、PIL保存成灰色图片 14 | # -*- coding: utf-8 -*- 15 | from PIL import Image 16 | 17 | # 打开图像得到一个PIL图像对象 18 | img = Image.open("test.png") 19 | # 将其转为一张灰度图 20 | img = img.convert('L') 21 | # 存储该张图片 22 | try: 23 | img.save("test.png") 24 | except IOError: 25 | print "cannot convert" 26 | 27 | 28 | # 3、PIL生成缩略图 29 | # -*- coding: utf-8 -*- 30 | from PIL import Image 31 | 32 | # 打开图像得到一个PIL图像对象 33 | img = Image.open("test.png") 34 | # 创建最长边为128的缩略图 35 | img.thumbnail((128,128)) 36 | # 存储该张图片 37 | try: 38 | img.save("test.png") 39 | except IOError: 40 | print "cannot convert" 41 | 42 | 43 | # 4、PIL调整尺寸与旋转 44 | # -*- coding: utf-8 -*- 45 | from PIL import Image 46 | 47 | # 打开图像得到一个PIL图像对象 48 | img = Image.open("test.png") 49 | # 修改图片大小,参数为一元组 50 | img = img.resize((100,200)) 51 | # 使图片逆时针选择45度 52 | img = img.rotate(45) 53 | # 存储该张图片 54 | try: 55 | img.save("test.png") 56 | except IOError: 57 | print "cannot convert" 58 | 59 | 60 | -------------------------------------------------------------------------------- /爬虫小demo/13 queryxpath.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import urllib2 4 | import json 5 | import lxml.etree 6 | # xpath 模糊查询 7 | 8 | class XpathQuery(): 9 | def __init__(self): 10 | self.url = "https://www.qiushibaike.com/" 11 | 12 | 13 | def get_html(self): 14 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'} 15 | request = urllib2.Request(self.url,headers=headers) 16 | response = urllib2.urlopen(request) 17 | html = response.read() 18 | return html 19 | 20 | def get_xpath(self): 21 | xmlcontent = lxml.etree.HTML(self.get_html()) 22 | xmllist = xmlcontent.xpath('//div[contains(@id,"qiushi_tag_")]') 23 | print len(xmllist) 24 | # 分享的地方 25 | sharelist = xmlcontent.xpath('//div[@class="article block untagged mb15 typs_recent"]//div[@class="single-share"]/a/@title') 26 | for item in range(0,4): 27 | print sharelist[item] 28 | 29 | for item in xmllist: 30 | # 用户名 31 | username = item.xpath('.//div[@class="author clearfix"]/a/h2/text()') 32 | # 标题 33 | title = item.xpath('.//a/div[@class="content"]/span/text()')[0] 34 | 35 | with open('title.txt','a') as file: 36 | file.write(title.encode("utf-8")) 37 | file.close 38 | with open('username.txt','a') as file: 39 | if len(username) == 0: 40 | file.write("匿名用户") 
41 | else: 42 | file.write(username[0].encode("utf-8")) 43 | 44 | # 好笑数 45 | votecount = item.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')[0] 46 | print "好笑数:" + votecount 47 | # 评论数 48 | commentcount = item.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')[0] 49 | print "评论数:" + commentcount 50 | # 放在一个字典里进行存储 51 | dic = { 52 | "username":username, 53 | "votecount":votecount, 54 | "commentcount":commentcount, 55 | "title": title, 56 | } 57 | with open('qiushi.json','a') as file: 58 | file.write(json.dumps(dic,ensure_ascii=False).encode("utf-8") + '\n') 59 | file.close 60 | 61 | 62 | if __name__ == "__main__": 63 | xpathq = XpathQuery() 64 | xpathq.get_xpath() -------------------------------------------------------------------------------- /爬虫小demo/14 selenium执行js.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from selenium import webdriver 4 | import time 5 | driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs") 6 | driver.get("https://www.baidu.com/") 7 | 8 | # 给搜索输入框标红的javascript脚本 9 | js = "var q=document.getElementById(\"kw\");q.style.border=\"2px solid red\";" 10 | 11 | # 调用给搜索输入框标红js脚本 12 | driver.execute_script(js) 13 | 14 | # 查看页面快照 15 | driver.save_screenshot("redbaidu.png") 16 | 17 | # js隐藏元素,将获取的图片元素隐藏 18 | img = driver.find_element_by_xpath("//div[@id='lg']/img") 19 | driver.execute_script('$(arguments[0]).fadeOut()',img) 20 | 21 | # 向下滚动到页面底部 22 | # driver.execute_script("$('.scroll_top').click(function(){$('html,body').animate({scrollTop: '0px'}, 800);});") 23 | time.sleep(1) 24 | # 查看页面快照 25 | driver.save_screenshot("wubaidu.png") 26 | 27 | driver.quit() -------------------------------------------------------------------------------- /爬虫小demo/15 tencent.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from bs4 import BeautifulSoup 4 | import urllib2 5 | 6 | class Tencent(): 7 | def __init__(self): 8 | self.url = 'http://hr.tencent.com/position.php?&start=10#a' 9 | 10 | def get_html(self): 11 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'} 12 | request = urllib2.Request(self.url,headers=headers) 13 | html = urllib2.urlopen(request) 14 | return html 15 | 16 | def get_content(self): 17 | techlist = [] 18 | soup = BeautifulSoup(self.get_html(),'lxml') 19 | positionlist = soup.select('.l > a') 20 | even = soup.select('.even') 21 | odd = soup.select('.odd') 22 | even + odd 23 | 24 | for position in positionlist: 25 | with open("position.txt",'a') as file: 26 | file.write(position.string.encode("utf-8") + "\n") 27 | file.close 28 | 29 | for technology in even: 30 | with open("technology.txt",'a') as file: 31 | file.write("" + technology.select('td')[1].string.encode("utf-8")) 32 | file.write(" 人数:" + technology.select('td')[2].string.encode("utf-8")) 33 | file.write(" 地点:" + technology.select('td')[3].string.encode("utf-8")) 34 | file.write(" 时间:" + technology.select('td')[4].string.encode("utf-8") + "\n") 35 | file.close 36 | 37 | for technology in odd: 38 | with open("technology.txt",'a') as file: 39 | file.write("" + technology.select('td')[1].string.encode("utf-8")) 40 | file.write(" 人数:" + technology.select('td')[2].string.encode("utf-8")) 41 | file.write(" 地点:" + technology.select('td')[3].string.encode("utf-8")) 42 | file.write(" 时间:" + technology.select('td')[4].string.encode("utf-8") + "\n") 43 | file.close 44 | 45 | # items 
= {} 也可以这么存储数据到文件 46 | # items["name"] = name 47 | # str = json.dumps(items, ensure_ascii=False) 48 | # output.write(line.encode('utf-8')) 49 | # output.close() 50 | if __name__ == "__main__": 51 | tencent = Tencent() 52 | tencent.get_content() -------------------------------------------------------------------------------- /爬虫小demo/16 xunmall.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import urllib2,os 4 | import lxml.etree 5 | 6 | class Xunmall(): 7 | def __init__(self): 8 | self.url = "http://www.xunmall.com" 9 | 10 | def get_html(self,p1 = ""): 11 | # headers = { 12 | # "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"} 13 | request = urllib2.Request(self.url + p1) 14 | response = urllib2.urlopen(request) 15 | html = response.read() 16 | return html 17 | 18 | def get_xpath(self): 19 | xmlcontent = lxml.etree.HTML(self.get_html()) 20 | xmllist = xmlcontent.xpath('//h2[@class="floor_name"]/text()') 21 | 22 | for item in xmllist: 23 | with open('title.txt','a') as file: 24 | file.write(item.encode('utf-8') + '\n') 25 | file.close 26 | 27 | 28 | def get_image(self): 29 | xmlimage = lxml.etree.HTML(self.get_html()) 30 | imagelist = xmlimage.xpath('//div[@class="color_top"]/img/@src') 31 | if os.path.isdir('./imgs'): 32 | pass 33 | else: 34 | os.mkdir("./imgs") 35 | for item in imagelist: 36 | # print self.url + item 37 | with open('imgs/' + (self.url + item)[-8:],'a+') as file: 38 | file.write(self.get_html(item)) 39 | file.close 40 | 41 | def get_theme(self): 42 | xmltheme = lxml.etree.HTML(self.get_html()) 43 | themelist = xmltheme.xpath('//h3[@class="floor_theme"]/text()') 44 | 45 | for item in themelist: 46 | with open('theme.txt','a') as file: 47 | file.write(item.encode('utf-8') + '\n') 48 | file.close 49 | 50 | sloganlist = xmltheme.xpath('//p[@class="slogan"]/text()') 51 | for item in sloganlist: 52 | with open('theme.txt','a') as file: 53 | file.write(item.encode('utf-8') + '\n') 54 | file.close 55 | 56 | give_outlist = xmltheme.xpath('//p[@class="give_out"]/text()') 57 | for item in give_outlist: 58 | with open('theme.txt', 'a') as file: 59 | file.write(item.encode('utf-8') + '\n') 60 | file.close 61 | 62 | def get_html1(self,p2): 63 | request = urllib2.Request(p2) 64 | response = urllib2.urlopen(request) 65 | html = response.read() 66 | return html 67 | 68 | # 食品标题和图片 69 | def foodImageTitle(self): 70 | foodImage = lxml.etree.HTML(self.get_html()) 71 | foodImageList = foodImage.xpath('//div[@class="pro_image"]/img/@src') 72 | 73 | if os.path.isdir('./foodimage'): 74 | pass 75 | else: 76 | os.mkdir("./foodimage") 77 | for item in foodImageList: 78 | # print item 79 | with open('foodimage/' + item[-20:],'a+') as file: 80 | file.write(self.get_html1(item)) 81 | file.close 82 | 83 | # 每个零食的详细信息(标题、图片、副标题) 84 | def detail(self): 85 | detailLink = lxml.etree.HTML(self.get_html()) 86 | detailLinkList = detailLink.xpath('//div[@class="nth_floor first_floor"]/div[@class="goods_box"]/ul[@class="item_list"]//a/@href') 87 | for item in detailLinkList: 88 | # print item[-18:] 89 | detailUrl = lxml.etree.HTML(self.get_html("/" + item[-18:])) 90 | detailImageList = detailUrl.xpath( 91 | '//div[@class="info-panel panel1"]/img/@src') 92 | 93 | for detailitem in detailImageList: 94 | # print '正在下载详情图片' 95 | 96 | if os.path.isdir('./' + item[-18:-5]): 97 | pass 98 | else: 99 | os.mkdir("./" + item[-18:-5]) 100 | 101 | with 
open(item[-18:-5] + '/' + detailitem[-9:], 'a+') as file: 102 | file.write(self.get_html1(detailitem)) 103 | file.close 104 | # 商品标题 105 | detailtitleList = detailUrl.xpath( 106 | '//div[@class="col-lg-7 item-inner"]//h1[@class="fl"]/text()') 107 | 108 | for title in detailtitleList: 109 | with open('foodtitle.txt', 'a+') as file: 110 | file.write(title.encode('utf-8') + '\n') 111 | file.close 112 | # 商品编号 113 | goodnumberList = detailUrl.xpath( 114 | '//div[@class="col-lg-7 item-inner"]//li[@class="col-lg-5 col-md-5"]/text()') 115 | for number in goodnumberList: 116 | # print number 117 | if os.path.isdir('./qrcoder'): 118 | pass 119 | else: 120 | os.mkdir("./qrcoder") 121 | 122 | with open('qrcoder', 'a+') as file: 123 | file.write(number.encode('utf-8') + '\n') 124 | file.close 125 | 126 | 127 | # 商品二维码:data_code 128 | coderImageList = detailUrl.xpath('//div[@class="clearfixed"]//div[@class="barcode fr"]/img/@data_code') 129 | 130 | for item in coderImageList: 131 | # print item 132 | with open('goodnumber.txt', 'a+') as file: 133 | file.write(item + '\n') 134 | file.close 135 | 136 | 137 | 138 | 139 | if __name__ == "__main__": 140 | # 获取分类标题 141 | xunmall = Xunmall() 142 | # xunmall.get_xpath() 143 | # 获取图片 144 | # xunmall.get_image() 145 | # 图片上面的标题 146 | # xunmall.get_theme() 147 | # 休闲食品标题和图片 148 | # xunmall.foodImageTitle() 149 | xunmall.detail() -------------------------------------------------------------------------------- /爬虫小demo/17 zhihulogin.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import urllib2 4 | import lxml.etree 5 | class Login(): 6 | def __init__(self): 7 | self.url = "https://www.zhihu.com/#signin" 8 | 9 | def get_html(self): 10 | # headers = { 11 | # "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"} 12 | request = urllib2.Request(self.url) 13 | response = urllib2.urlopen(request) 14 | html = response.read() 15 | return html 16 | 17 | def get_xpath(self): 18 | # print self.get_html() 19 | xmlcontent = lxml.etree.HTML(self.get_html()) 20 | xmllist = xmlcontent.xpath('//div[@class="view view-signin"]/form/input/@value') 21 | 22 | for item in xmllist: 23 | print item 24 | with open('title.txt','a') as file: 25 | file.write(item.encode('utf-8') + '\n') 26 | file.close 27 | 28 | 29 | if __name__ == "__main__": 30 | login = Login() 31 | login.get_xpath() -------------------------------------------------------------------------------- /爬虫小demo/18 github_login.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | 模拟Github登陆步骤: 4 | 1、请求头:self.headers,请求url 5 | 2、设置session,保存登陆信息cookies,生成github_cookie文件 6 | 3、POST表单提交,请求数据格式post_data 7 | 4、authenticity_token获取 8 | 5、在个人中心验证判断是否登陆成功,输出个人中心信息即登陆成功 9 | 10 | ''' 11 | 12 | import requests 13 | from lxml import etree 14 | try: 15 | import cookielib 16 | except: 17 | import http.cookiejar as cookielib 18 | 19 | class GithubLogin(): 20 | 21 | def __init__(self): 22 | # url 23 | self.loginUrl = 'https://github.com/login' 24 | self.postUrl = 'https://github.com/session' 25 | self.profileUrl = 'https://github.com/settings/profile' 26 | 27 | # 设置请求头 28 | self.headers = { 29 | 'Referer': 'https://github.com/', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 31 | 'Host': 'github.com' 32 | } 33 | 34 | # 设置session 
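# Note: requests.session() is just a backwards-compatible alias for requests.Session(). Pairing the
# session with cookielib.LWPCookieJar below, plus cookies.save() after login and cookies.load() in
# load_cookie(), is what lets a later run reuse the saved GitHub login instead of re-posting the
# password. A minimal standalone sketch of that reuse (same 'github_cookie' filename as this class):
#
#   session = requests.session()
#   session.cookies = cookielib.LWPCookieJar(filename='github_cookie')
#   session.cookies.load(ignore_discard=True)      # cookies written by a previous post_account()
#   profile = session.get('https://github.com/settings/profile')  # works while the cookies are still valid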
35 | self.session = requests.session() 36 | # 生成github_cookie文件 37 | self.session.cookies = cookielib.LWPCookieJar(filename='github_cookie') 38 | 39 | ''' 40 | 登陆时表单提交参数 41 | Form Data: 42 | commit:Sign in 43 | utf8:✓ 44 | authenticity_token:yyZprIm4aghZ0u7r25ymZjisfTjGdUAdDowD9fKHM0oUvHD1WjUHbn2sW0Cz1VglZWdGno543jod2M8+jwLv6w== 45 | login:***** 46 | password:****** 47 | 48 | ''' 49 | def post_account(self, email, password): 50 | post_data = { 51 | 'commit': 'Sign in', 52 | 'utf8': '✓', 53 | 'authenticity_token': self.get_token()[0], 54 | 'login': email, 55 | 'password': password 56 | } 57 | response = self.session.post(self.postUrl, data=post_data, headers=self.headers) 58 | # 保存cookies 59 | self.session.cookies.save() 60 | 61 | def load_cookie(self): 62 | try: 63 | self.session.cookies.load(ignore_discard=True) 64 | except: 65 | print('cookie 获取不成功') 66 | 67 | # 获取authenticity_token 68 | def get_token(self): 69 | response = self.session.get(self.loginUrl, headers=self.headers) 70 | html = etree.HTML(response.text) 71 | authenticity_token = html.xpath('//div/input[2]/@value') 72 | print(authenticity_token) 73 | return authenticity_token 74 | 75 | # 判断是否登陆成功 76 | def isLogin(self): 77 | self.load_cookie() 78 | response = self.session.get(self.profileUrl, headers=self.headers) 79 | selector = etree.HTML(response.text) 80 | flag = selector.xpath('//div[@class="column two-thirds"]/dl/dt/label/text()') 81 | info = selector.xpath('//div[@class="column two-thirds"]/dl/dd/input/@value') 82 | textarea = selector.xpath('//div[@class="column two-thirds"]/dl/dd/textarea/text()') 83 | # 登陆成功返回来的个人设置信息 84 | print(u'个人设置Profile标题: %s'%flag) 85 | print(u'个人设置Profile内容: %s'%info) 86 | print(u'个人设置Profile内容: %s'%textarea) 87 | 88 | if __name__ == "__main__": 89 | github = GithubLogin() 90 | # 输入自己email账号和密码 91 | github.post_account(email='******', password='******') 92 | # 验证是否登陆成功 93 | github.isLogin() -------------------------------------------------------------------------------- /爬虫小demo/19 jd_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | 7 | class JDlogin(): 8 | def __init__(self, username, password): 9 | self.session = requests.session() 10 | self.loginUrl = "http://passport.jd.com/uc/login" 11 | self.postUrl = "http://passport.jd.com/uc/loginService" 12 | self.authUrl = "https://passport.jd.com/uc/showAuthCode" 13 | self.username = username 14 | self.password = password 15 | 16 | # 设置请求头 17 | self.headers = { 18 | 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" 19 | } 20 | 21 | def get_authcode(self, url): 22 | self.headers['Host'] = 'authcode.jd.com' 23 | self.headers['Referer'] = 'https://passport.jd.com/uc/login' 24 | response = self.session.get(url, headers=self.headers) 25 | with open('codeimage.jpg', 'wb') as f: 26 | f.write(response.content) 27 | authcode = input("请输入验证码:") 28 | return authcode 29 | 30 | def get_info(self): 31 | 32 | try: 33 | # 登陆请求 34 | html = self.session.get(self.loginUrl, headers=self.headers) 35 | soup = BeautifulSoup(html.text,"lxml") 36 | inputList = soup.select('.form input') 37 | print(inputList) 38 | data = {} 39 | data['uuid'] = inputList[0]['value'] 40 | data['eid'] = inputList[4]['value'] 41 | data['fp'] = inputList[5]['value'] 42 | data['_t'] = inputList[6]['value'] 43 | rstr = inputList[7]['name'] 44 | data[rstr] = 
inputList[7]['value'] 45 | acRequired = self.session.post(self.authUrl, data={ 46 | 'loginName': self.username}).text 47 | 48 | if 'true' in acRequired: 49 | 50 | acUrl = soup.select('.form img')[0]['src2'] 51 | acUrl = 'http:{}&yys={}'.format(acUrl, str(int(time.time() * 1000))) 52 | authcode = self.get_authcode(acUrl) 53 | data['authcode'] = authcode 54 | else: 55 | data['authcode'] = '' 56 | 57 | except Exception as e: 58 | print(e) 59 | finally: 60 | return data 61 | 62 | def jd_login(self): 63 | 64 | data = self.get_info() 65 | # Form表单提交数据 66 | # 1、loginname、nloginpwd、loginpwd是在网页中input属性值name,作为表单值提交到登陆请求 67 | # 2、在此处也可以用selenium来进行给输入框(登陆账号、登陆密码)进行赋值 68 | 69 | data['loginname'] = self.username 70 | data['nloginpwd'] = self.password 71 | data['loginpwd'] = self.password 72 | try: 73 | self.headers['Host'] = 'passport.jd.com' 74 | html = self.session.post(self.postUrl, data=data, headers=self.headers) 75 | # 在这里可以判断请求是否判断成功不成功 76 | print(html.text) 77 | except Exception as e: 78 | print(e) 79 | 80 | 81 | if __name__ == "__main__": 82 | # 在下面输入账号名、密码 83 | jdlogin = JDlogin("******", "******") 84 | jdlogin.jd_login() 85 | -------------------------------------------------------------------------------- /爬虫小demo/20 下载网易云歌词.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import json 7 | import re 8 | from urllib import request 9 | 10 | # 1、获取网页 11 | def get_html(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47', 14 | 'Referer': 'http://music.163.com/', 15 | 'Host': 'music.163.com' 16 | } 17 | 18 | try: 19 | response = requests.get(url, headers=headers) 20 | html = response.text 21 | return html 22 | except: 23 | print('request error') 24 | 25 | def get_text(song_id): 26 | url = 'http://music.163.com/api/song/lyric?' 
+ 'id=' + str(song_id) + '&lv=1&kv=1&tv=-1' 27 | html = get_html(url) 28 | json_obj = json.loads(html) 29 | text = json_obj['lrc']['lyric'] 30 | regex = re.compile(r'\[.*\]') 31 | finalLyric = re.sub(regex, '', text).strip() 32 | return finalLyric 33 | 34 | def write_text(song_name,text): 35 | print("正在写入歌曲:{}".format(song_name)) 36 | with open("{}.txt".format(song_name),'a',encoding='utf-8') as fp: 37 | fp.write(text) 38 | 39 | def getSingerInfo(html): 40 | soup = BeautifulSoup(html, 'lxml') 41 | links = soup.find('ul', class_='f-hide').find_all('a') 42 | song_IDs = [] 43 | song_names = [] 44 | for link in links: 45 | song_ID = link.get('href').split('=')[-1] 46 | song_name = link.get_text() 47 | song_IDs.append(song_ID) 48 | song_names.append(song_name) 49 | return zip(song_names, song_IDs) 50 | 51 | def downloadSong(songName,songId): 52 | singer_url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songId) 53 | print('正在下载歌曲:{}'.format(songName)) 54 | request.urlretrieve(singer_url,'{}.mp3'.format(songName)) 55 | 56 | 57 | 58 | if __name__ == "__main__": 59 | singerId = input("请输入歌手的ID:") 60 | startUrl = "http://music.163.com/artist?id={}".format(singerId) 61 | html = get_html(startUrl) 62 | singerInfos = getSingerInfo(html) 63 | 64 | for singerInfo in singerInfos: 65 | print(singerInfo[1],singerInfo[0]) 66 | text = get_text(singerInfo[1]) 67 | # 下载歌曲文本 68 | write_text(singerInfo[0],text) 69 | # 下载歌曲mp3 70 | downloadSong(singerInfo[0],singerInfo[1]) 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /爬虫小demo/21 TaoBaoInfo.py: -------------------------------------------------------------------------------- 1 | from urllib import request 2 | import re, os, datetime 3 | from selenium import webdriver 4 | import ssl 5 | 6 | ssl._create_default_https_context = ssl._create_unverified_context 7 | 8 | 9 | class TaoBaoInfo: 10 | def __init__(self): 11 | self.dirName = 'MyTaoBaoInfo' 12 | self.driver = webdriver.PhantomJS(executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs') 13 | self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'} 14 | 15 | # 获取页面内容提取 16 | def getPageContent(self, page): 17 | 18 | url = "https://mm.taobao.com/json/request_top_list.htm?page=" + str(page) 19 | response = request.Request(url, headers = self.headers) 20 | response = request.urlopen(response) 21 | 22 | # 正则获取 23 | pattern_link = re.compile(r'.*?(.*?).*?' 25 | r'.*?(.*?).*?' 
26 | r'(.*?)' 27 | , re.S) 28 | items = re.findall(pattern_link, response.read().decode('gbk')) 29 | 30 | for item in items: 31 | # 详情页面:头像,个人详情,名字,年龄,地区 32 | 33 | detailPage = item[1] 34 | name = item[2] 35 | self.getDetailPage(detailPage, name) 36 | 37 | def getDetailPage(self, url, name): 38 | url = 'http:' + url 39 | self.driver.get(url) 40 | base_msg = self.driver.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li') 41 | brief = '' 42 | for item in base_msg: 43 | print(item.text) 44 | brief += item.text + '\n' 45 | 46 | icon_url = self.driver.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]//img') 47 | icon_url = icon_url.get_attribute('src') 48 | dir = self.dirName + '/' + name 49 | self.mkdir(dir) 50 | # 保存头像 51 | try: 52 | self.saveIcon(icon_url, dir, name) 53 | except Exception as e: 54 | print(u'保存头像失败 %s' % (e)) 55 | 56 | # 开始跳转相册列表 57 | images_url = self.driver.find_element_by_xpath('//ul[@class="mm-p-menu"]//a') 58 | images_url = images_url.get_attribute('href') 59 | try: 60 | self.getAllImage(images_url, name) 61 | except Exception as e: 62 | print(u'获取所有相册异常 %s' % e) 63 | 64 | try: 65 | self.saveBrief(brief,dir, name) 66 | 67 | except Exception as e: 68 | print(u'保存个人信息失败 %s' % e) 69 | 70 | # 保存个人信息 71 | def saveBrief(self, content,dir, name): 72 | fileName = dir + '/' + name + '.txt' 73 | with open(fileName,'w+') as file: 74 | file.write(content) 75 | print(u'下载完成' + '\n' + '\n') 76 | # 获取所有图片 77 | def getAllImage(self, images_url, name): 78 | self.driver.get(images_url) 79 | # 只获取第一个相册 80 | photos = self.driver.find_element_by_xpath('//div[@class="mm-photo-cell-middle"]//h4/a') 81 | photos_url = photos.get_attribute('href') 82 | # 进入相册页面获取相册内容 83 | self.driver.get(photos_url) 84 | images_all = self.driver.find_elements_by_xpath('//div[@id="mm-photoimg-area"]/a/img') 85 | 86 | self.saveImgs(images_all, name) 87 | 88 | def saveImgs(self, images, name): 89 | index = 1 90 | 91 | for imageUrl in images: 92 | splitPath = imageUrl.get_attribute('src').split('.') 93 | fTail = splitPath.pop() 94 | if len(fTail) > 3: 95 | fTail = "jpg" 96 | fileName = self.dirName + '/' + name + '/' + name + str(index) + "." + fTail 97 | self.saveImg(imageUrl.get_attribute('src'), fileName) 98 | index += 1 99 | 100 | def saveIcon(self, url, dir, name): 101 | splitPath = url.split('.') 102 | fTail = splitPath.pop() 103 | fileName = dir + '/' + name + '.' 
+ fTail 104 | print(fileName) 105 | self.saveImg(url, fileName) 106 | 107 | # 写入图片 108 | def saveImg(self, imageUrl, fileName): 109 | print(imageUrl) 110 | u = request.urlopen(imageUrl) 111 | data = u.read() 112 | f = open(fileName, 'wb') 113 | f.write(data) 114 | f.close() 115 | 116 | 117 | # 创建目录 118 | def mkdir(self, path): 119 | path = path.strip() 120 | print(u'正在下载 %s 个人信息' % path) 121 | if os.path.exists(path): 122 | return False 123 | else: 124 | os.makedirs(path) 125 | return True 126 | 127 | if __name__ == "__main__": 128 | taoBaoInfo = TaoBaoInfo() 129 | # 输入需要下载的页数 130 | page = input("请输入要下载的页数:") 131 | for index in range(1, int(page) + 1): 132 | taoBaoInfo.getPageContent(index) 133 | -------------------------------------------------------------------------------- /爬虫小demo/22 JDPython.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from lxml import etree 4 | 5 | driver = webdriver.PhantomJS(executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs') 6 | 7 | 8 | # 获取第一页的数据 9 | def get_html(): 10 | url = "https://detail.tmall.com/item.htm?id=531993957001&skuId=3609796167425&user_id=268451883&cat_id=2&is_b=1&rn=71b9b0aeb233411c4f59fe8c610bc34b" 11 | driver.get(url) 12 | time.sleep(5) 13 | driver.execute_script('window.scrollBy(0,3000)') 14 | time.sleep(2) 15 | driver.execute_script('window.scrollBy(0,5000)') 16 | time.sleep(2) 17 | 18 | # 累计评价 19 | btnNext = driver.find_element_by_xpath('//*[@id="J_TabBar"]/li[3]/a') 20 | btnNext.click() 21 | html = driver.page_source 22 | return html 23 | 24 | 25 | def get_comments(html): 26 | source = etree.HTML(html) 27 | commens = source.xpath("//*[@id='J_TabBar']/li[3]/a/em/text()") 28 | print('评论数:', commens) 29 | # 将评论转为int类型 30 | commens = (int(commens[0]) / 20) + 1 31 | # 获取到总评论 32 | print('评论数:', int(commens)) 33 | return int(commens) 34 | 35 | 36 | def parse_html(html): 37 | html = etree.HTML(html) 38 | commentlist = html.xpath("//*[@class='rate-grid']/table/tbody") 39 | for comment in commentlist: 40 | # 评论 41 | vercomment = comment.xpath( 42 | "./tr/td[@class='tm-col-master']/div[@class='tm-rate-content']/div[@class='tm-rate-fulltxt']/text()") 43 | # 机器类型 44 | verphone = comment.xpath("./tr/td[@class='col-meta']/div[@class='rate-sku']/p[@title]/text()") 45 | print(vercomment) 46 | print(verphone) 47 | # 用户(头尾各一个字,中间用****代替) 48 | veruser = comment.xpath("./tr/td[@class='col-author']/div[@class='rate-user-info']/text()") 49 | print(veruser) 50 | 51 | 52 | def next_button_work(num): 53 | if num != 0: 54 | driver.execute_script('window.scrollBy(0,3000)') 55 | time.sleep(2) 56 | try: 57 | driver.find_element_by_css_selector('#J_Reviews > div > div.rate-page > div > a:last-child').click() 58 | except Exception as e: 59 | print(e) 60 | 61 | time.sleep(2) 62 | driver.execute_script('window.scrollBy(0,3000)') 63 | time.sleep(2) 64 | driver.execute_script('window.scrollBy(0,5000)') 65 | time.sleep(2) 66 | html = driver.page_source 67 | parse_html(html) 68 | 69 | 70 | def selenuim_work(html): 71 | parse_html(html) 72 | next_button_work(1) 73 | pass 74 | 75 | 76 | def gettotalpagecomments(comments): 77 | html = get_html() 78 | for i in range(0, comments): 79 | selenuim_work(html) 80 | 81 | 82 | data = get_html() 83 | # 得到评论 84 | commens = get_comments(data) 85 | # 根据评论内容进行遍历 86 | gettotalpagecomments(commens) 87 | -------------------------------------------------------------------------------- /爬虫小demo/23 tuchongnet.py: 
-------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import rsa 4 | import binascii 5 | import requests 6 | from base64 import b64decode 7 | import sys 8 | reload(sys) 9 | sys.setdefaultencoding('utf8') 10 | 11 | class LBTuChongNet(object): 12 | def __init__(self): 13 | self.loginUrl = "https://tuchong.com/rest/accounts/login" 14 | self.userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36" 15 | self.headers = { 16 | 'user-agent': self.userAgent 17 | } 18 | #pubkey 在页面的js中: http://static.tuchong.net/js/pc/page/welcome_6e7f1cd.js 19 | 20 | self.key = "D8CC0180AFCC72C9F5981BDB90A27928672F1D6EA8A57AF44EFFA7DAF6EFB17DAD9F643B9F9F7A1F05ACC2FEA8DE19F023200EFEE9224104627F1E680CE8F025AF44824A45EA4DDC321672D2DEAA91DB27418CFDD776848F27A76E747D53966683EFB00F7485F3ECF68365F5C10C69969AE3D665162D2EE3A5BA109D7DF6C7A5" 21 | self.session = requests.session() 22 | 23 | def get_crypt_password(self,message): 24 | rsaPublickey = int(self.key, 16) 25 | key = rsa.PublicKey(rsaPublickey, 65537) 26 | password = rsa.encrypt(message, key) 27 | password = binascii.b2a_hex(password) 28 | return password 29 | 30 | def get_captcha(self): 31 | captchaUrl="https://tuchong.com/rest/captcha/image" 32 | 33 | rsp = self.session.post(captchaUrl, data = None, headers = self.headers).json() 34 | captcha_id = rsp['captchaId'] 35 | captcha_base64 = rsp['captchaBase64'] 36 | captcha_base64 = captcha_base64.replace("data:image/png;base64,","") 37 | with open("lbcaptcha.png",'w') as f: 38 | f.write(b64decode(captcha_base64)) 39 | captcha = input(u'输入当前目录下 lbcaptcha.png 上的验证码:') 40 | return captcha_id,captcha 41 | 42 | def login(self,username,password): 43 | 44 | passwd_crypt = self.get_crypt_password(password) 45 | postdata = { 46 | 'account': username, 47 | 'password': passwd_crypt, 48 | } 49 | rsp = self.session.post(self.loginUrl, data = postdata, headers = self.headers) 50 | rsp = rsp.json() 51 | print(rsp) 52 | #登录成功 53 | if rsp.has_key('result') and rsp['result'] == "SUCCESS": 54 | print(rsp['message']) 55 | return 56 | 57 | #登录失败 58 | if rsp.has_key('code') and rsp.has_key('message'): 59 | print("response code:%d, message:%s"%(rsp['code'],rsp['message'])) 60 | if rsp['message'].find("验证码") >= 0: 61 | print(rsp['message']) 62 | captcha = self.get_captcha() 63 | postdata = { 64 | 'account': username, 65 | 'password': passwd_crypt, 66 | 'captcha_id': captcha[0], 67 | 'captcha_token': int(captcha[1]) 68 | } 69 | rsp = self.session.post(self.loginUrl, data = postdata, headers = self.headers) 70 | if str(rsp).find('200'): 71 | print("登陆成功!") 72 | 73 | 74 | if __name__ == '__main__': 75 | # 图虫网验证 76 | lbtuchongnet = LBTuChongNet() 77 | username = raw_input(u'请输入图虫网用户名:') 78 | password = raw_input(u'请输入图虫网密码:') 79 | lbtuchongnet.login(username,password) 80 | -------------------------------------------------------------------------------- /爬虫小demo/25 PythonItChat.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | itchat:获取分享给群或者个人的技术文章 4 | (0) 熟悉itchat,(https://www.cnblogs.com/Chenjiabing/p/6907397.html) 5 | (1) itchat 扫码次数太多会被限制扫码登录微信。 6 | (2) itchat:获取分享给群或者个人的技术文章,提取出文章链接、文章标题、文章首页图片、文章内容 7 | (3) 通过获取到的文章链接,爬虫文章内容。 8 | (4) 判断是接收方(ToUserName)是谁、发送方(FromUserName)是谁就是通过唯一的ID来判别的。 9 | (5) python itchat 热登陆(itchat.auto_login(hotReload=True)) 10 | (6) xpath模块爬取文章标题、文章内图片 11 | (7) 搭建web服务器环境(Mac使用XAMPP) 12 | (8) pymysql模块自动创建数据库、创建字段、保存内容到字段 13 
| (9) navicat 的使用 14 | (10) python 相关模块的使用 15 | ''' 16 | 17 | # 爬取微信群或者是好友分享的文章 18 | # 监听微信公众号分享的文章 19 | 20 | import itchat 21 | # import全部消息类型 22 | from itchat.content import * 23 | import urllib2 24 | import lxml.etree 25 | import os 26 | import pymysql 27 | import uuid 28 | import json 29 | # 连接数据库 30 | table_cms_news = 'cms_news' 31 | table_cms_news_pic = 'cms_news_pic' 32 | # db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8') 33 | db = pymysql.connect(host='127.0.0.1', user='root', passwd='djs@12316', db='fz_afmcms', charset='utf8') 34 | cur = db.cursor() 35 | 36 | # 处理个人分享消息 37 | # 包括文本、位置、名片、通知、分享(49重点) 38 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING]) 39 | def text_reply(msg): 40 | print msg 41 | # 微信里,每个用户和群聊,都使用很长的ID来区分 42 | if msg["MsgType"] == 49: 43 | print "个人分享文章地址链接Url:" + "---------------------------" 44 | 45 | xmlcontent = lxml.etree.HTML(get_html(msg["Url"])) 46 | print xmlcontent 47 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()') 48 | 49 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src') 50 | # 下载图片 51 | source = xmlcontent.xpath('//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()') 52 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()') 53 | print "来源" 54 | print source, time 55 | # 下载图片 56 | print "下载图片" 57 | # print imgArray 58 | # print title[0] 59 | get_image(title, imgArray, source, time,msg["Url"]) 60 | 61 | print msg["Url"] 62 | print "个人分享文章类型编号MsgType:" + "---------------------------" 63 | print msg["MsgType"] 64 | print "个人分享Content:" + "---------------------------" 65 | print msg["Content"] 66 | print "个人分享FromUserName:" + "---------------------------" 67 | print msg["FromUserName"] 68 | print "个人分享ToUserName:" + "---------------------------" 69 | print msg["ToUserName"] 70 | print "个人分享链接标题FileName:" + "---------------------------" 71 | print msg["FileName"] 72 | 73 | print "------------个人" 74 | # 获取到的信息是某某人和登录者之间的通讯,如果不是和登录这通讯就获取不到 75 | print itchat.search_friends(userName=msg['FromUserName'])['NickName'] 76 | print itchat.search_friends(userName=msg['ToUserName'])['NickName'] 77 | 78 | else: 79 | print "不是个人分享的文章" 80 | 81 | 82 | # 处理群聊消息 83 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING], isGroupChat=True) 84 | def text_reply(msg): 85 | print msg 86 | if msg["MsgType"] == 49: 87 | print "群聊分享文章地址链接Url:" + "---------------------------" 88 | print msg["Url"] 89 | 90 | xmlcontent = lxml.etree.HTML(get_html(msg["Url"])) 91 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()') 92 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src') 93 | # 来源 94 | source = xmlcontent.xpath('//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()') 95 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()') 96 | print "来源" 97 | print source,time 98 | # 下载图片 99 | print "下载图片" 100 | # print imgArray 101 | # print title[0] 102 | get_image(title,imgArray,source,time,msg["Url"]) 103 | 104 | # print "群聊分享文章类型编号MsgType:" + "---------------------------" 105 | # print msg["MsgType"] 106 | # print "群聊分享Content:" + "---------------------------" 107 | # print msg["Content"] 108 | # print "群聊分享FromUserName:" + "---------------------------" 109 | # print msg["FromUserName"] 110 | # print "群聊分享ToUserName:" + "---------------------------" 111 | # print msg["ToUserName"] 112 | # print "群聊分享链接标题FileName:" + "---------------------------" 113 | # 
print msg["FileName"] 114 | print "-------------群--------" 115 | # itchat.send('%s: %s : %s' % (msg['Type'], msg['Text'], msg['Url']), msg['FromUserName']) 116 | 117 | print msg['FromUserName'] 118 | print msg['ToUserName'] 119 | # 这个是需要每次扫码登录都改变的receiver 120 | receiver = "@4603e5cb2e47b710bba6fd15dfa3ace9ef3be0f3c80b812e0cc97cd7a71b7c96" 121 | if msg['FromUserName'] == receiver: 122 | print "----------- 自己在群里发的文章 ------------" 123 | # 自己在群里发的文章 124 | print "昵称:" 125 | print itchat.search_friends(userName=msg['FromUserName'])['NickName'] 126 | print " ----------- " 127 | print "群名称:" 128 | print itchat.search_chatrooms(userName=msg['ToUserName'])['NickName'] 129 | chatRoomName = "呵呵各地" 130 | # if itchat.search_chatrooms(userName=msg['ToUserName'])['NickName'] == chatRoomName: 131 | # pass 132 | # else: 133 | # pass 134 | 135 | else: 136 | # 群友发的文章 137 | print "----------- 群友发的文章 -----------" 138 | print "昵称:" 139 | print msg['ActualNickName'] 140 | print " ----------- " 141 | print "群名称:" 142 | print itchat.search_chatrooms(userName=msg['FromUserName'])['NickName'] 143 | chatRoomName = "呵呵各地" 144 | # if itchat.search_chatrooms(userName=msg['FromUserName'])['NickName'] == chatRoomName: 145 | # pass 146 | # else: 147 | # pass 148 | else: 149 | print "不是群聊分享的文章" 150 | # return msg['Text'] 151 | 152 | 153 | # 处理微信公众号消息 154 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING], isMpChat=True) 155 | def text_reply(msg): 156 | print msg 157 | print itchat.search_mps(name='PythonCoder')[0]["NickName"] 158 | if msg["MsgType"] == 49: 159 | print "监听到制定微信公众号分享的文章链接:" 160 | print msg["Url"] 161 | else: 162 | print "微信公众号分享的不是文章" 163 | 164 | # 获取网页内容 165 | def get_html(url): 166 | request = urllib2.Request(url) 167 | response = urllib2.urlopen(request) 168 | html = response.read() 169 | return html 170 | 171 | # 下载图片 172 | def get_image(title,imgArray,source,time,linkurl): 173 | print "标题" 174 | result = cur.execute("select news_url from cms_news WHERE news_url='"+ linkurl + "'") 175 | print(str(result) + '------------url-----------') 176 | 177 | if result: 178 | print("数据库里面存在此数据") 179 | else: 180 | if os.path.isdir('./imgs'): 181 | pass 182 | else: 183 | os.mkdir("./imgs") 184 | for item in imgArray: 185 | with open('imgs/' + (item)[-30:].replace('/','-') + ".png", 'a+') as file: 186 | file.write(get_html(item)) 187 | file.close 188 | ima_dic = {} 189 | news_pic = "" 190 | news_pic_s = "" 191 | news_pic_t = "" 192 | 193 | if len(imgArray) == 0: 194 | pass 195 | else: 196 | # 文章图片 197 | for index, item in enumerate(imgArray): 198 | ima_dic[index] = item 199 | if len(imgArray) == 0: 200 | pass 201 | elif len(imgArray) == 1: 202 | news_pic = imgArray[0] 203 | elif len(imgArray) == 2: 204 | news_pic = imgArray[0] 205 | news_pic_s = imgArray[1] 206 | elif len(imgArray) == 3: 207 | news_pic = imgArray[0] 208 | news_pic_s = imgArray[1] 209 | news_pic_t = imgArray[2] 210 | new_id = str(uuid.uuid1()).strip().replace("-", "") 211 | titleString = "" 212 | if len(title) == 0: 213 | pass 214 | else: 215 | titleString = title[0].strip().replace("\n", "") 216 | cur.execute( 217 | 'INSERT INTO ' + table_cms_news_pic + ' (news_id,pic_url,pic_desc) VALUES (%s,%s,%s)', 218 | (new_id, json.dumps(ima_dic,ensure_ascii=False),"")) 219 | cur.execute( 220 | 'INSERT INTO ' + table_cms_news + ' (news_open_type,news_id,news_title,news_type,com_id,'\ 221 | 'news_column_code1,news_column_name1,'\ 222 | 'news_column_code2,news_column_name2,news_desc,news_pic,'\ 223 | 'news_pic_s,news_pic_t,news_pic_is_show,'\ 224 | 
'news_content,news_source,news_cuser_name,'\ 225 | 'news_ctime,news_url,news_status,view_count,platid) '\ 226 | 'VALUES (%s,%s, %s,%s,%s, %s,%s,%s,%s, %s,%s, %s,%s,%s,'\ 227 | ' %s,%s, %s,%s,%s,%s,%s,%s)', 228 | ('1',new_id,titleString,'1','1','1','微信转发','1','分类1','news_desc',news_pic,news_pic_s, 229 | news_pic_t,'1','news_content',source[0].strip().replace("\n", ""),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", ""),linkurl, 230 | '1',200,'weixin')) 231 | 232 | # cur.execute( 233 | # 'INSERT INTO ' + table_cms_news + ' (title,url, img,source,time) VALUES (%s, %s,%s,%s, %s)', 234 | # (title[0].strip().replace("\n", ""),linkurl, json.dumps(imgArray, ensure_ascii=False),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", ""))) 235 | cur.connection.commit() 236 | print("------------------------ 插入成功 ----------------------------------") 237 | 238 | # 连接数据库 239 | def get_connect(): 240 | 241 | try: 242 | # 创建表 243 | cur.execute( 244 | 'CREATE TABLE ' + table_cms_news + ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000),url VARCHAR(10000), img VARCHAR(1000), source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))') 245 | except pymysql.err.InternalError as e: 246 | print(e) 247 | # 修改表字段 248 | cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci') 249 | cur.execute( 250 | 'ALTER TABLE ' + table_cms_news + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 251 | cur.execute( 252 | 'ALTER TABLE ' + table_cms_news + ' CHANGE title title VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 253 | cur.execute( 254 | 'ALTER TABLE ' + table_cms_news + ' CHANGE url url VARCHAR(10000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 255 | cur.execute( 256 | 'ALTER TABLE ' + table_cms_news + ' CHANGE img img VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 257 | cur.execute( 258 | 'ALTER TABLE ' + table_cms_news + ' CHANGE source source VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 259 | cur.execute( 260 | 'ALTER TABLE ' + table_cms_news + ' CHANGE time time VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 261 | 262 | 263 | # 热登录(在一段时间内不用扫码登录还能保持登录状态) 264 | get_connect() 265 | print "哈哈" 266 | itchat.auto_login(hotReload=True) 267 | # 绑定消息响应事件后,让itchat运行起来,监听消息 268 | itchat.run() 269 | 270 | -------------------------------------------------------------------------------- /爬虫小demo/26 PythonWeChat.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import pickle 3 | import wechatsogou 4 | import urllib2 5 | import lxml.etree 6 | import os 7 | import pymysql 8 | import json 9 | 10 | # 添加一个文件,将已经发送成功的文章标题序列化到文件,防止多次运行导致重复发送邮件 11 | file_path = 'sent_articles_file' 12 | 13 | ws_api = wechatsogou.WechatSogouAPI() 14 | 15 | # 连接数据库 16 | tablename = 'pythonwechat' 17 | db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8') 18 | cur = db.cursor() 19 | cur.execute('USE itchat') 20 | 21 | # 获取公众号文章信息 22 | def get_article(gzh): 23 | articles = ws_api.get_gzh_article_by_history(gzh) 24 | print(len(articles['article'])) 25 | return articles['article'] 26 | 27 | # 获取网页内容 28 | def get_html(url): 29 | request = urllib2.Request(url) 30 | response = urllib2.urlopen(request) 31 | html = response.read() 32 | return html 33 | 34 | # 下载图片 35 | def get_image(title,imgArray,source,time): 36 | if os.path.isdir('./imgs'): 37 | pass 38 | else: 
39 | os.mkdir("./imgs") 40 | for item in imgArray: 41 | with open('imgs/' + (item)[-30:].replace('/','-') + ".png", 'a+') as file: 42 | file.write(get_html(item)) 43 | file.close 44 | 45 | cur.execute( 46 | 'INSERT INTO ' + tablename + ' (title, img,source,time) VALUES (%s, %s,%s, %s)', 47 | (title[0].strip().replace("\n", ""), json.dumps(imgArray, ensure_ascii=False),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", ""))) 48 | cur.connection.commit() 49 | print title[0] 50 | print("------------------------ 插入成功 ----------------------------------") 51 | 52 | # 连接数据库 53 | def get_connect(): 54 | 55 | try: 56 | # 创建表 57 | cur.execute( 58 | 'CREATE TABLE ' + tablename + ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000), img VARCHAR(1000), source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))') 59 | except pymysql.err.InternalError as e: 60 | print(e) 61 | # 修改表字段 62 | cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci') 63 | cur.execute( 64 | 'ALTER TABLE ' + tablename + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 65 | cur.execute( 66 | 'ALTER TABLE ' + tablename + ' CHANGE title title VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 67 | cur.execute( 68 | 'ALTER TABLE ' + tablename + ' CHANGE img img VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 69 | cur.execute( 70 | 'ALTER TABLE ' + tablename + ' CHANGE source source VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 71 | cur.execute( 72 | 'ALTER TABLE ' + tablename + ' CHANGE time time VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci') 73 | 74 | 75 | if '__main__' == __name__: 76 | 77 | get_connect() 78 | 79 | # 定义一个公众号列表 80 | gzh_list = ['技术最前线', 'python', '全民独立经纪人', '程序视界', '非著名程序员'] 81 | 82 | for gzh in gzh_list: 83 | # 查找公众号之前,先从文件中反序列化出已经成功发送的文章列表 84 | if os.path.exists(file_path): 85 | f = open(file_path, 'rb') 86 | sent_list = pickle.load(f) 87 | f.close() 88 | articles = get_article(gzh) 89 | for article in articles: 90 | print(article['title'],'\n\t' ,article['content_url']) 91 | 92 | xmlcontent = lxml.etree.HTML(get_html(article['content_url'])) 93 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()') 94 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src') 95 | # 来源 96 | source = xmlcontent.xpath( 97 | '//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()') 98 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()') 99 | print "来源、时间" 100 | print source, time 101 | # 下载图片 102 | print "下载图片" 103 | get_image(title, imgArray, source, time) 104 | 105 | -------------------------------------------------------------------------------- /爬虫小demo/27 PythonWordCloud.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | from pyecharts import WordCloud 4 | # 词云 5 | def pythonWordCloud(x,y,label): 6 | wordcloud = WordCloud(width=1300, height=620) 7 | wordcloud.add("", x, y, word_size_range=[20, 100],shape="triangle-forward") 8 | wordcloud.render() 9 | os.system(r"render.html") 10 | x = [ 11 | 'PythonCoder', '爬虫', '人工智能', '大数据', 'Django', 12 | 'Flask', '机器学习', '数据分析', '深度学习', '运维测试', 'TensorFlow', 13 | '真实面试经历', '真实面试题', '自然语言处理', 'NLP',"数据处理", 14 | '500GB资料免费送', '开放源码', '免费学习群', '面试简历', 'JCSON'] 15 | y = [ 16 | 10000, 6181, 4386, 4055, 2467, 2244, 1898, 1484, 1112, 17 | 965, 847, 582, 
555, 550, 462, 366, 360, 282, 273, 265,5000] 18 | 19 | pythonWordCloud(x,y,"词云") 20 | 21 | -------------------------------------------------------------------------------- /爬虫小demo/28 PythonCheHui.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | # 微信,找回好友、群聊用户撤回的消息 4 | # 说明:可以撤回的有文本文字、语音、视频、图片、位置、名片、分享、附件 5 | 6 | import itchat 7 | from itchat.content import * 8 | import sys 9 | import time 10 | import re 11 | import os 12 | 13 | reload(sys) 14 | sys.setdefaultencoding('utf8') 15 | 16 | msg_information = {} 17 | # 针对表情包的内容 18 | face_bug = None 19 | 20 | @itchat.msg_register([TEXT,PICTURE,FRIENDS,CARD,MAP,SHARING,RECORDING,ATTACHMENT,VIDEO],isFriendChat=True,isGroupChat=True) 21 | def receive_msg(msg): 22 | global face_bug 23 | # 接收消息的时间 24 | msg_time_rec = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 25 | if msg.has_key('ActualNickName'): 26 | # 群消息的发送者,用户的唯一标识 27 | from_user = msg['ActualUserName'] 28 | # 发送者群内的昵称 29 | msg_from = msg['ActualNickName'] 30 | # 获取所有好友 31 | friends = itchat.get_friends(update=True) 32 | for f in friends: 33 | # 如果群消息是好友发的 34 | if from_user == f['UserName']: 35 | # 优先使用好友的备注名称,没有则使用昵称 36 | if f['RemarkName']: 37 | msg_from = f['RemarkName'] 38 | else: 39 | msg_from = f['NickName'] 40 | break 41 | # 获取所有的群 42 | groups = itchat.get_chatrooms(update=True) 43 | for g in groups: 44 | # 根据群消息的FromUserName匹配是哪个群 45 | if msg['FromUserName'] == g['UserName']: 46 | group_name = g['NickName'] 47 | group_members = g['MemberCount'] 48 | break 49 | group_name = group_name + "(" + str(group_members) +")" 50 | else: 51 | # 优先使用备注名称 52 | if itchat.search_friends(userName=msg['FromUserName'])['RemarkName']: 53 | msg_from = itchat.search_friends(userName=msg['FromUserName'])['RemarkName'] 54 | else: 55 | # 在好友列表中查询发送信息的好友昵称 56 | msg_from = itchat.search_friends(userName=msg['FromUserName'])['NickName'] 57 | group_name = "" 58 | # 信息发送的时间 59 | msg_time = msg['CreateTime'] 60 | # 每条信息的id 61 | msg_id = msg['MsgId'] 62 | # 储存信息的内容 63 | msg_content = None 64 | # 储存分享的链接,比如分享的文章和音乐 65 | msg_share_url = None 66 | # 如果发送的消息是文本或者好友推荐 67 | if msg['Type'] == 'Text' or msg['Type'] == 'Friends': 68 | msg_content = msg['Text'] 69 | 70 | # 如果发送的消息是附件、视频、图片、语音 71 | elif msg['Type'] == "Attachment" or msg['Type'] == "Video" \ 72 | or msg['Type'] == 'Picture' \ 73 | or msg['Type'] == 'Recording': 74 | # 内容就是他们的文件名 75 | msg_content = msg['FileName'] 76 | # 下载文件 77 | msg['Text'](str(msg_content)) 78 | # 如果消息为分享的位置信息 79 | elif msg['Type'] == 'Map': 80 | x, y, location = re.search( 81 | "\<location x=\"(.*?)\" y=\"(.*?)\".*label=\"(.*?)\".*", 82 | msg['OriContent']).group(1, 2, 3) 83 | if location is None: 84 | msg_content = r"纬度->" + x.__str__() + " 经度->" + y.__str__() 85 | else: 86 | msg_content = r"" + location 87 | # 如果消息为分享的音乐或者文章,详细的内容为文章的标题或者是分享的名字 88 | elif msg['Type'] == 'Sharing': 89 | msg_content = msg['Text'] 90 | # 记录分享的url 91 | msg_share_url = msg['Url'] 92 | face_bug = msg_content 93 | # 将信息存储在字典中,每一个msg_id对应一条信息 94 | msg_information.update( 95 | { 96 | msg_id: { 97 | "msg_from": msg_from, 98 | "msg_time": msg_time, 99 | "msg_time_rec": msg_time_rec, 100 | "msg_type": msg["Type"], 101 | "msg_content": msg_content, 102 | "msg_share_url": msg_share_url, 103 | "group_name":group_name 104 | } 105 | } 106 | ) 107 | 108 | # 监听是否有消息撤回 109 | # 使用下面的装饰器监听,会发送4条消息 110 | # @itchat.msg_register(NOTE,isFriendChat=True,isGroupChat=True,isMpChat=True) 111 | 112 | # 监听是否有消息撤回 113 | # 使用下面的装饰器监听,会发送1条消息 114 | @itchat.msg_register(NOTE) 115 | def information(msg): 116 | # 如果这里的msg['Content']中包含消息撤回和id,就执行下面的语句 117 | if '撤回了一条消息' in msg['Content']: 118 | # 在返回的content查找撤回的消息的id
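# Added note (illustrative): the revoke notification's msg['Content'] is a small XML payload;
# the regex on the next line pulls the recalled message's id out of the <msgid> element, roughly:
#   <sysmsg type="revokemsg"><revokemsg><msgid>1234567890123456789</msgid>
#   <replacemsg><![CDATA["xxx" 撤回了一条消息]]></replacemsg></revokemsg></sysmsg>
# (element order and the msgid value above are only an example)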
119 | old_msg_id = re.search("\<msgid\>(.*?)\<\/msgid\>", msg['Content']).group(1) 120 | # 获取到消息原文 121 | old_msg = msg_information.get(old_msg_id) 122 | # 如果发送的是表情包 123 | if len(old_msg_id)<11: 124 | # 发送撤回的提示给文件助手 125 | itchat.send_file(face_bug,toUserName='filehelper') 126 | # 把暂时存储的信息可以删除掉,也可以选择不删除 127 | # os.remove(face_bug) 128 | else: 129 | msg_body = old_msg.get('group_name') + old_msg.get('msg_from') +"\n" + old_msg.get('msg_time_rec') \ 130 | + "撤回了:" + "\n" + r"" + old_msg.get('msg_content') 131 | 132 | # 如果是分享的文件被撤回了,那么就将分享的url加在msg_body中发送给文件助手 133 | if old_msg['msg_type'] == "Sharing": 134 | msg_body += "\n链接是:" + old_msg.get('msg_share_url') 135 | print msg_body 136 | # 将撤回消息发给文件助手 137 | itchat.send_msg(msg_body, toUserName='filehelper') 138 | 139 | # 有文件的话也要将文件发送回去 140 | if old_msg["msg_type"] == "Picture" \ 141 | or old_msg["msg_type"] == "Recording" \ 142 | or old_msg["msg_type"] == "Video" \ 143 | or old_msg["msg_type"] == "Attachment": 144 | file = '@fil@%s' % (old_msg['msg_content']) 145 | itchat.send(msg=file, toUserName='filehelper') 146 | # 把暂时存储的信息可以删除掉,也可以选择不删除 147 | os.remove(old_msg['msg_content']) 148 | # 删除字典旧消息 149 | msg_information.pop(old_msg_id) 150 | 151 | itchat.auto_login(hotReload=True) 152 | itchat.run() 153 | -------------------------------------------------------------------------------- /爬虫小demo/29 PythonCeHui.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os, re, shutil, time, collections, json 3 | 4 | from html.parser import HTMLParser 5 | from xml.etree import ElementTree as ETree 6 | 7 | import itchat 8 | from itchat.content import * 9 | 10 | msg_store = collections.OrderedDict() 11 | timeout = 600 12 | sending_type = {'Picture': 'img', 'Video': 'vid'} 13 | data_path = 'data' 14 | nickname = '' 15 | bot = None 16 | 17 | if __name__ == '__main__': 18 | if not os.path.exists(data_path): 19 | os.mkdir(data_path) 20 | # if the QR code doesn't show correctly, you can try to change the value 21 | # of enableCmdQR to 1 or -1 or -2. If nothing works, you can change it to 22 | # enableCmdQR=True and a picture will show up.
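# Added note: auto_login(hotReload=True) below caches the login session in a local file
# (itchat's default is itchat.pkl), so restarting the script shortly afterwards usually
# does not require scanning the QR code again.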
23 | bot = itchat.new_instance() 24 | bot.auto_login(hotReload=True, enableCmdQR=2) 25 | nickname = bot.loginInfo['User']['NickName'] 26 | 27 | def clear_timeouted_message(): 28 | now = time.time() 29 | count = 0 30 | for k, v in list(msg_store.items()): 31 | if now - v['ReceivedTime'] > timeout: 32 | count += 1 33 | else: 34 | break 35 | for i in range(count): 36 | item = msg_store.popitem(last=False) 37 | 38 | def get_sender_receiver(msg): 39 | sender = nickname 40 | receiver = nickname 41 | if msg['FromUserName'][0:2] == '@@': # group chat 42 | sender = msg['ActualNickName'] 43 | m = bot.search_chatrooms(userName=msg['FromUserName']) 44 | if m is not None: 45 | receiver = m['NickName'] 46 | elif msg['ToUserName'][0:2] == '@@': # group chat by myself 47 | if 'ActualNickName' in msg: 48 | sender = msg['ActualNickName'] 49 | else: 50 | m = bot.search_friends(userName=msg['FromUserName']) 51 | if m is not None: 52 | sender = m['NickName'] 53 | m = bot.search_chatrooms(userName=msg['ToUserName']) 54 | if m is not None: 55 | receiver = m['NickName'] 56 | else: # personal chat 57 | m = bot.search_friends(userName=msg['FromUserName']) 58 | if m is not None: 59 | sender = m['NickName'] 60 | m = bot.search_friends(userName=msg['ToUserName']) 61 | if m is not None: 62 | receiver = m['NickName'] 63 | return HTMLParser().unescape(sender), HTMLParser().unescape(receiver) 64 | 65 | def print_msg(msg): 66 | msg_str = ' '.join(msg) 67 | print(msg_str) 68 | return msg_str 69 | 70 | def get_whole_msg(msg, download=False): 71 | sender, receiver = get_sender_receiver(msg) 72 | if len(msg['FileName']) > 0 and len(msg['Url']) == 0: 73 | if download: # download the file into data_path directory 74 | fn = os.path.join(data_path, msg['FileName']) 75 | msg['Text'](fn) 76 | if os.path.getsize(fn) == 0: 77 | return [] 78 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), fn) 79 | else: 80 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), msg['FileName']) 81 | return ['[%s]->[%s]:' % (sender, receiver), c] 82 | c = msg['Text'] 83 | if len(msg['Url']) > 0: 84 | try: # handle map label 85 | content_tree = ETree.fromstring(msg['OriContent']) 86 | if content_tree is not None: 87 | map_label = content_tree.find('location') 88 | if map_label is not None: 89 | c += ' ' + map_label.attrib['poiname'] 90 | c += ' ' + map_label.attrib['label'] 91 | except: 92 | pass 93 | url = HTMLParser().unescape(msg['Url']) 94 | c += ' ' + url 95 | return ['[%s]->[%s]: %s' % (sender, receiver, c)] 96 | 97 | @bot.msg_register([TEXT, PICTURE, MAP, CARD, SHARING, RECORDING, 98 | ATTACHMENT, VIDEO, FRIENDS], isFriendChat=True, isGroupChat=True) 99 | def normal_msg(msg): 100 | print_msg(get_whole_msg(msg)) 101 | now = time.time() 102 | msg['ReceivedTime'] = now 103 | msg_id = msg['MsgId'] 104 | msg_store[msg_id] = msg 105 | clear_timeouted_message() 106 | 107 | @bot.msg_register([NOTE], isFriendChat=True, isGroupChat=True) 108 | def note_msg(msg): 109 | print_msg(get_whole_msg(msg)) 110 | content = HTMLParser().unescape(msg['Content']) 111 | try: 112 | content_tree = ETree.fromstring(content) 113 | except Exception: 114 | # invent/remove to chatroom 115 | return 116 | if content_tree is None: 117 | return 118 | revoked = content_tree.find('revokemsg') 119 | if revoked is None: 120 | return 121 | old_msg_id = revoked.find('msgid').text 122 | old_msg = msg_store.get(old_msg_id) 123 | if old_msg is None: 124 | return 125 | msg_send = get_whole_msg(old_msg, download=True) 126 | for m in msg_send: 127 | bot.send(m, toUserName='filehelper') 
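# Added note: get_whole_msg() returns either plain text or the itchat upload shorthand built
# above ('@img@<path>' / '@vid@<path>' / '@fil@<path>', paths illustrative), so this
# bot.send() loop re-uploads recalled pictures, videos and files to the file helper.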
128 | clear_timeouted_message() 129 | 130 | if __name__ == '__main__': 131 | bot.run() -------------------------------------------------------------------------------- /爬虫小demo/30 PythonZhuanFa.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('UTF8') 5 | 6 | import os, re, shutil, time, collections, json 7 | import requests 8 | from HTMLParser import HTMLParser 9 | from xml.etree import ElementTree as ETree 10 | import hashlib 11 | 12 | import itchat 13 | from itchat.content import * 14 | 15 | sending_type = {'Picture': 'img', 'Video': 'vid'} 16 | data_path = 'data' 17 | group_uin = {u'技术群1': '42235582@chatroom', 18 | u'技术群2': '2424504406@chatroom', 19 | u'技术群3': '6203978346@chatroom'} 20 | publishers = {u'技术群1': u'[阴险]', 21 | u'技术群2': u'[菜刀]', 22 | u'技术群3': u'[月亮]'} 23 | subscribers = [u'技术群1', u'技术群2', u'技术群3'] 24 | nickname = '' 25 | bot = None 26 | as_chat_bot = True 27 | 28 | if __name__ == '__main__': 29 | if not os.path.exists(data_path): 30 | os.mkdir(data_path) 31 | # if the QR code doesn't show correctly, you can try to change the value 32 | # of enableCdmQR to 1 or -1 or -2. It nothing works, you can change it to 33 | # enableCmdQR=True and a picture will show up. 34 | bot = itchat.new_instance() 35 | bot.auto_login(hotReload=True, enableCmdQR=2) 36 | nickname = bot.loginInfo['User']['NickName'] 37 | 38 | # tuling chat bot 39 | def talks_robot(info): 40 | api_url = 'http://www.tuling123.com/openapi/api' 41 | apikey = '' 42 | data = {'key': apikey, 'info': info.lower()} 43 | req = requests.post(api_url, data=data, timeout=10).text 44 | replys = json.loads(req)['text'] 45 | return replys 46 | 47 | def get_sender_receiver(msg): 48 | sender = nickname 49 | receiver = nickname 50 | if msg['FromUserName'][0:2] == '@@': # group chat 51 | sender = msg['ActualNickName'] 52 | m = bot.search_chatrooms(userName=msg['FromUserName']) 53 | if m is not None: 54 | receiver = m['NickName'] 55 | elif msg['ToUserName'][0:2] == '@@': # group chat by myself 56 | if 'ActualNickName' in msg: 57 | sender = msg['ActualNickName'] 58 | else: 59 | m = bot.search_friends(userName=msg['FromUserName']) 60 | if m is not None: 61 | sender = m['NickName'] 62 | m = bot.search_chatrooms(userName=msg['ToUserName']) 63 | if m is not None: 64 | receiver = m['NickName'] 65 | else: # personal chat 66 | m = bot.search_friends(userName=msg['FromUserName']) 67 | if m is not None: 68 | sender = m['NickName'] 69 | m = bot.search_friends(userName=msg['ToUserName']) 70 | if m is not None: 71 | receiver = m['NickName'] 72 | return HTMLParser().unescape(sender), HTMLParser().unescape(receiver) 73 | 74 | def print_msg(msg): 75 | msg_str = ' '.join(msg) 76 | print msg_str 77 | return msg_str 78 | 79 | def get_whole_msg(msg, prefix, download=False): 80 | if len(msg['FileName']) > 0 and len(msg['Url']) == 0: 81 | if download: # download the file into data_path directory 82 | fn = os.path.join(data_path, msg['FileName']) 83 | msg['Text'](fn) 84 | if os.path.getsize(fn) == 0: 85 | return [] 86 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), fn) 87 | else: 88 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), msg['FileName']) 89 | return ['%s:' % (prefix), c] 90 | c = msg['Text'] 91 | if len(msg['Url']) > 0: 92 | if len(msg['OriContent']) > 0: 93 | try: # handle map label 94 | content_tree = ETree.fromstring(msg['OriContent']) 95 | if content_tree is not None: 96 | map_label = content_tree.find('location') 97 | if map_label is 
not None: 98 | c += ' ' + map_label.attrib['poiname'] 99 | c += ' ' + map_label.attrib['label'] 100 | except: 101 | pass 102 | url = HTMLParser().unescape(msg['Url']) 103 | c += ' ' + url 104 | return ['%s: %s' % (prefix, c)] 105 | 106 | @bot.msg_register([TEXT], isFriendChat=True, isGroupChat=False) 107 | def personal_msg(msg): 108 | global as_chat_bot 109 | text = msg['Text'].strip() 110 | if text == u'闭嘴': 111 | as_chat_bot = False 112 | if text == u'张嘴吃药': 113 | as_chat_bot = True 114 | return talks_robot(text) 115 | 116 | @bot.msg_register([FRIENDS]) 117 | def accept_friend(msg): 118 | bot.add_friend(msg['RecommendInfo']['UserName'], 3) 119 | 120 | @bot.msg_register([TEXT, PICTURE, MAP, SHARING, RECORDING, ATTACHMENT, VIDEO], 121 | isFriendChat=False, isGroupChat=True) 122 | def group_msg(msg): 123 | # chat bot functionality 124 | global as_chat_bot 125 | if 'IsAt' in msg and msg['IsAt'] == True and \ 126 | msg['Type'] == 'Text' and \ 127 | msg['ToUserName'][0:2] != '@@' and \ 128 | msg['Text'].find(u'@' + nickname) >= 0: 129 | text = msg['Text'].replace(u'@' + nickname, '').strip() 130 | if text == u'shit': 131 | as_chat_bot = False 132 | return 133 | if as_chat_bot: 134 | info = talks_robot(text) 135 | if info.find('No Know') >= 0: 136 | return 137 | if info.find('No Can') >= 0: 138 | return 139 | if info.find('Sorry') >= 0: 140 | return 141 | return info 142 | return 143 | # forwarding functionality 144 | group = msg['FromUserName'] 145 | if msg['ToUserName'][0:2] == '@@': # message sent by myself 146 | group = msg['ToUserName'] 147 | sender, receiver = get_sender_receiver(msg) 148 | if sender == '': 149 | sender = nickname 150 | # check if the message is from the publisher groups 151 | if receiver not in publishers: # if not in the publishers, do nothing 152 | return 153 | # turn on the chat bot if this magic happens 154 | if msg['Type'] == 'Text' and \ 155 | hashlib.sha256(msg['Text']).hexdigest()[-2:] == '23': 156 | as_chat_bot = True 157 | # process message and send it to all the subscribed groups 158 | prefix = '%s[%s]' % (publishers[receiver], sender) 159 | msg_send = get_whole_msg(msg, prefix=prefix, download=True) 160 | if len(msg_send) == 0: 161 | return 162 | print_msg(msg_send) 163 | for tosend in subscribers: 164 | room = bot.search_chatrooms(name=tosend) 165 | for r in room: 166 | if r['UserName'] == group: # don't send back to the source 167 | continue 168 | if r['NickName'] != tosend: # check group name exact match 169 | continue 170 | for m in msg_send: # iterate messages (for images, videos, and files) 171 | bot.send(m, toUserName=r['UserName']) 172 | 173 | if __name__ == '__main__': 174 | bot.run() 175 | -------------------------------------------------------------------------------- /爬虫小demo/31 下载bilibili视频.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import html 3 | import re 4 | import urllib3 5 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 6 | 7 | def star(url): 8 | url2 = "https://api.bilibili.com/x/player/playurl?avid={avid}&cid={cid}&qn=32&type=&otype=json" 9 | headers2 = { 10 | "host": "", 11 | "Referer": "https://www.bilibili.com", 12 | "User-Agent": "Mozilla/5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML,likeGecko)Chrome/63.0.3239.132Safari/537.36" 13 | } 14 | 15 | avid = re.findall("video/av(.+)\?", url) 16 | print(avid) 17 | cid ,name = get_cid(avid[0]) 18 | print(cid,name) 19 | flv_url , size = get_flvurl(url2.format(avid=avid[0],cid=cid)) 20 | shuju 
= size / 1024 / 1024 21 | print("本视频大小为:%.2fM" % shuju) 22 | 23 | h = re.findall("https://(.+)com",flv_url) 24 | host = h[0]+"com" 25 | 26 | headers2["host"] = host 27 | res = requests.get(flv_url,headers=headers2,stream=True, verify=False) 28 | print(res.status_code) 29 | save_movie(res,name) 30 | 31 | def get_cid(aid):#获得cid 32 | header = { 33 | 'host': 'api.bilibili.com', 34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0' 35 | } 36 | url = "https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp".format(aid=aid) 37 | response = requests.get(url,headers=header).json() 38 | # print(response["data"]) 39 | # 这个地方设置index是因为下载集合里面的视频,顺序,0代表下载第一个视频,1代表下载集合里面第二个视频,2,3,4...依次类推 40 | index = 0 41 | return response["data"][index]["cid"] ,response["data"][index]["part"] 42 | def get_flvurl(url):#获得视频真实flv地址 43 | header = {'host': 'api.bilibili.com', 44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'} 45 | 46 | response = requests.get(url,headers=header).json() 47 | return response["data"]["durl"][0]["url"],response["data"]["durl"][0]["size"] 48 | def save_movie(res,name):#保存视频 49 | chunk_size = 1024 50 | with open("{name}.flv".format(name = name),"wb") as f: 51 | for data in res.iter_content(1024): 52 | f.write(data) 53 | 54 | 55 | if __name__ == "__main__": 56 | # 把下面的av后面的'583959574'在要下载的视频集合里面找到就可以下载视频了 57 | url = "https://www.bilibili.com/video/av583959574?spm_id_from=333.334.b_62696c695f646f756761.5" 58 | star(url) 59 | 60 | 61 | -------------------------------------------------------------------------------- /爬虫小demo/32 m3u8.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | from Crypto.Cipher import AES 4 | 5 | def m3u8(url): 6 | header = { 7 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36' 8 | } 9 | # requests得到m3u8文件内容 10 | content = requests.get(url, headers=header).text 11 | if "#EXTM3U" not in content: 12 | print("这不是一个m3u8的视频链接!") 13 | return False 14 | if "EXT-X-KEY" not in content: 15 | print("没有加密") 16 | return False 17 | 18 | # 使用re正则得到key和视频地址 19 | jiami = re.findall('#EXT-X-KEY:(.*)',content) 20 | key = re.findall('URI="(.*)"', jiami[0]) 21 | vi = re.findall('IV=(.*)', jiami[0])[0] 22 | 23 | # 得到每一个ts视频链接 24 | 25 | # tslist = re.findall('EXTINF:(.*), (. 
*)',content.replace(' ', '').replace(r'\n', '')) 26 | tslist = re.findall('v.f240.ts(.*)',content) 27 | 28 | newlist = [] 29 | for i in tslist: 30 | newlist.append("v.f240.ts" + i) 31 | # print(newlist) 32 | # 得到key的链接并请求得到加密的key值 33 | keyurl = key[0] 34 | keycontent = requests.get(keyurl, headers=header).content 35 | 36 | # 得到每一个完整视频的链接地址 37 | base_url = url.replace(url.split('/')[-1], '') 38 | # print(base_url) 39 | tslisturl = [] 40 | for i in newlist: 41 | tsurl = base_url + i 42 | tslisturl.append(tsurl) 43 | 44 | # 得到解密方法,这里要导入第三方库 pycrypto 45 | # 这里有一个问题,安装pycrypto成功后,导入from Crypto.Cipher import AES报错 46 | # 找到使用python环境的文件夹,在Lib文件夹下有一个 site-packages 文件夹,里面是我们环境安装的包。 47 | # 找到一个crypto文件夹,打开可以看到 Cipher文件夹,此时我们将 crypto文件夹改为 Crypto 即可使用了 48 | # 必须添加b'0000000000000000',防止报错ValueError: IV must be 16 bytes long 49 | cryptor = AES.new(keycontent, AES.MODE_CBC, b'0000000000000000') 50 | 51 | # for循环获取视频文件 52 | for i in tslisturl: 53 | print(i) 54 | res = requests.get(i, headers=header) 55 | # 使用解密方法解密得到的视频文件 56 | cont = cryptor.decrypt(res.content) 57 | # 以追加的形式保存为mp4文件,mp4可以随意命名,这里命名为小鹅通视频下载测试 58 | with open('14-搜索组件界面实现.mp4', 'ab+') as f: 59 | f.write(cont) 60 | return True 61 | 62 | if __name__ == '__main__': 63 | # 这个是网页上查到的小鹅通的m3u8地址 64 | # url = "https://1252524126.vod2.myqcloud.com/9764a7a5vodtransgzp1252524126/91c29aad5285890807164109582/drm/v.f146750.m3u8" 65 | # url = "https://1258102968.vod2.myqcloud.com/ed7d8254vodtranscq1258102968/a61912e43701925923160746329/drm/v.f240.m3u8?t=62dfad73&us=DYws6oOg3A&sign=1d4381d06b276e87eae478a23f3d6375" 66 | url = "https://1258102968.vod2.myqcloud.com/ed7d8254vodtranscq1258102968/a3ae8ff93701925923160630524/drm/v.f240.m3u8?t=62dfaf5a&us=RquNSsL6XT&sign=8bec9ca974f9413c9bad7a9e8d620ae2" 67 | pd = m3u8(url) 68 | if pd: 69 | print('视频下载完成!') --------------------------------------------------------------------------------
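Added note on 32 m3u8.py: the script parses the playlist's IV attribute into `vi` but then decrypts every segment with a single AES object and a hard-coded ASCII-zero IV. In HLS, each TS segment is normally decrypted independently, and when the playlist carries an IV= attribute that hex value is the one to use. A minimal sketch of that variant, assuming pycryptodome (Crypto.Cipher.AES) and a hypothetical helper name:

from Crypto.Cipher import AES

def new_segment_cipher(key_bytes, iv_attr=None):
    # Build a fresh AES-128-CBC cipher for each TS segment.
    # iv_attr is the raw IV= value from the playlist, e.g. "0x00000000000000000000000000000000".
    if iv_attr:
        iv = bytes.fromhex(iv_attr.strip().replace('0x', '').replace('0X', ''))
    else:
        iv = b'0000000000000000'  # fall back to the script's hard-coded 16-byte IV
    return AES.new(key_bytes, AES.MODE_CBC, iv)

# usage inside the download loop (sketch):
#   cont = new_segment_cipher(keycontent, vi).decrypt(res.content)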