├── .DS_Store
├── .gitignore
├── CrawlYouYuan
│   ├── .idea
│   │   ├── CrawlYouYuan.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── CrawlYouYuan
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── youyuan.py
│   ├── begin.py
│   └── scrapy.cfg
├── DouBanMovie
│   ├── .DS_Store
│   ├── .idea
│   │   ├── DouBanMovie.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── DouBanMovie
│   │   ├── .DS_Store
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── .DS_Store
│   │       ├── __init__.py
│   │       └── douban.py
│   ├── begin.py
│   ├── movie.json
│   └── scrapy.cfg
├── DouYuSpider
│   ├── .idea
│   │   ├── DouYuSpider.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── DouYuSpider
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── douyu.py
│   ├── Images
│   │   ├── Cute兔丶.jpg
│   │   ├── MiS媛.jpg
│   │   ├── Super超级冷.jpg
│   │   ├── Yozi柚子妹妹.jpg
│   │   ├── pinky水蜜桃.jpg
│   │   ├── 一只小玲儿.jpg
│   │   ├── 会玩的黄宝宝.jpg
│   │   ├── 冷伊宁.jpg
│   │   ├── 十四万岁的青丘老太婆.jpg
│   │   ├── 可乐小十五.jpg
│   │   ├── 吃萝卜的辛巴.jpg
│   │   ├── 咘咘柳.jpg
│   │   ├── 大宝SOD蜜不是润肤露.jpg
│   │   ├── 大木头CL.jpg
│   │   ├── 小依泽儿.jpg
│   │   ├── 小口古小咕.jpg
│   │   ├── 小圆脸娜娜.jpg
│   │   ├── 小小小思齐.jpg
│   │   ├── 小雅er.jpg
│   │   ├── 尛小钰.jpg
│   │   ├── 左思念.jpg
│   │   ├── 巫女蛋.jpg
│   │   ├── 布丁味的雯宝宝.jpg
│   │   ├── 幼齿懵骚小安妮.jpg
│   │   ├── 悠悠fairy.jpg
│   │   ├── 懵G娜.jpg
│   │   ├── 是囧囧初啊.jpg
│   │   ├── 江沅是个小可爱.jpg
│   │   ├── 温柔的喵小胖.jpg
│   │   ├── 爱笑的蒙蒙.jpg
│   │   ├── 璇璇璇儿丶Tay.jpg
│   │   ├── 甜馨大队长.jpg
│   │   ├── 白羊可爱多.jpg
│   │   ├── 磨人的小柠檬.jpg
│   │   ├── 糖炒栗子lr.jpg
│   │   ├── 糖糖小萌主.jpg
│   │   ├── 紫絮儿521.jpg
│   │   ├── 苏思淳sheep.jpg
│   │   ├── 若儿被注册了呢.jpg
│   │   ├── 诗诗诗诗诗诗酱.jpg
│   │   ├── 谷猫宁.jpg
│   │   ├── 辣椒酱jiang.jpg
│   │   ├── 迷人的小北北.jpg
│   │   ├── 阿青Dale.jpg
│   │   ├── 陈梓不是橙子.jpg
│   │   └── 鲸鱼妹爱素颜.jpg
│   ├── begin.py
│   ├── douyu.json
│   └── scrapy.cfg
├── HongNiangNet
│   ├── .DS_Store
│   ├── .idea
│   │   ├── HongNiangNet.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── HongNiangNet
│   │   ├── .DS_Store
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── .DS_Store
│   │       ├── __init__.py
│   │       └── hongniang.py
│   ├── begin.py
│   ├── content.json
│   └── scrapy.cfg
├── LICENSE
├── README.md
├── duodian
│   ├── .idea
│   │   ├── duodian.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── db.sqlite3
│   ├── duodian
│   │   ├── __init__.py
│   │   ├── settings.py
│   │   ├── urls.py
│   │   └── wsgi.py
│   ├── manage.py
│   ├── myduodian
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   └── views.py
│   ├── templates
│   │   └── myduodian
│   │       └── index.html
│   └── woduodian.py
├── gongzhonghao.jpeg
├── jiekou
│   ├── .idea
│   │   ├── jiekou.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── db.sqlite3
│   ├── jiekou
│   │   ├── __init__.py
│   │   ├── settings.py
│   │   ├── urls.py
│   │   └── wsgi.py
│   ├── manage.py
│   ├── myjiekou
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   └── views.py
│   └── templates
│       └── myjiekou
│           └── index.html
├── teacherInfo
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── teacherInfo.iml
│   │   └── workspace.xml
│   ├── begin.py
│   ├── scrapy.cfg
│   ├── teacher.json
│   └── teacherInfo
│       ├── __init__.py
│       ├── items.py
│       ├── middlewares.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py
│           └── myteacher.py
└── 爬虫小demo
    ├── .DS_Store
    ├── 01 taobao.py
    ├── 02 doubanzhihu.py
    ├── 03 douYuUnittest.py
    ├── 04 fileHandler.py
    ├── 05 getimage.py
    ├── 06 jsload.py
    ├── 07 jsondata.py
    ├── 08 jsonpath和json总结.py
    ├── 09 zhihu_login.py
    ├── 10 match.py
    ├── 11 neihan.py
    ├── 12 PIL.py
    ├── 13 queryxpath.py
    ├── 14 selenium执行js.py
    ├── 15 tencent.py
    ├── 16 xunmall.py
    ├── 17 zhihulogin.py
    ├── 18 github_login.py
    ├── 19 jd_login.py
    ├── 20 下载网易云歌词.py
    ├── 21 TaoBaoInfo.py
    ├── 22 JDPython.py
    ├── 23 tuchongnet.py
    ├── 24 pythonDuoDian.py
    ├── 25 PythonItChat.py
    ├── 26 PythonWeChat.py
    ├── 27 PythonWordCloud.py
    ├── 28 PythonCheHui.py
    ├── 29 PythonCeHui.py
    ├── 30 PythonZhuanFa.py
    ├── 31 下载bilibili视频.py
    └── 32 m3u8.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/CrawlYouYuan/.idea/CrawlYouYuan.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/CrawlYouYuan/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/CrawlYouYuan/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/CrawlYouYuan/CrawlYouYuan/__init__.py
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | class CrawlyouyuanItem(scrapy.Item):
11 |     # username
12 |     username = scrapy.Field()
13 |     # age
14 |     age = scrapy.Field()
15 |     # avatar image URL
16 |     header_url = scrapy.Field()
17 |     # album image URLs
18 |     images_url = scrapy.Field()
19 |     # personal monologue
20 |     content = scrapy.Field()
21 |     # hometown
22 |     place_from = scrapy.Field()
23 |     # education
24 |     education = scrapy.Field()
25 |     # hobbies
26 |     hobby = scrapy.Field()
27 |     # profile page URL
28 |     source_url = scrapy.Field()
29 |     # source website
30 |     source = scrapy.Field()
31 |     # UTC time
32 |     time = scrapy.Field()
33 |     # spider name
34 |     spidername = scrapy.Field()
35 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import json
8 | import codecs
9 |
10 | class CrawlyouyuanPipeline(object):
11 |
12 | def __init__(self):
13 | self.filename = codecs.open('content.json', 'w', encoding='utf-8')
14 |
15 | def process_item(self, item, spider):
16 | html = json.dumps(dict(item), ensure_ascii=False)
17 | self.filename.write(html + '\n')
18 | return item
19 |
20 |     def close_spider(self, spider):  # called automatically by Scrapy when the spider finishes
21 | self.filename.close()
22 |
23 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for CrawlYouYuan project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'CrawlYouYuan'
13 |
14 | SPIDER_MODULES = ['CrawlYouYuan.spiders']
15 | NEWSPIDER_MODULE = 'CrawlYouYuan.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = True
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | #CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | #DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'CrawlYouYuan.middlewares.MyCustomSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
54 | #DOWNLOADER_MIDDLEWARES = {
55 | # 'CrawlYouYuan.middlewares.MyCustomDownloaderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable extensions
59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | 'CrawlYouYuan.pipelines.CrawlyouyuanPipeline': 300,
68 | }
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/spiders/youyuan.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from CrawlYouYuan.items import CrawlyouyuanItem
6 | import re
7 | class YouyuanSpider(CrawlSpider):
8 | name = 'youyuan'
9 | allowed_domains = ['youyuan.com']
10 | start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
11 |     # The auto-generated spider needs no other changes; only the Rule entries in rules have to be added
12 |     # Matches every listing page
13 |     page_links = LinkExtractor(allow=(r"youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/"))
14 |     # Matches each personal profile page
15 |     profile_links = LinkExtractor(allow=(r"youyuan.com/\d+-profile/"))
16 |     rules = (
17 |         # No callback, so follow defaults to True
18 |         Rule(page_links),
19 |         # With a callback, follow would default to False, so it is enabled explicitly
20 |         Rule(profile_links, callback='parse_item', follow=True),
21 | )
22 |
23 | def parse_item(self, response):
24 | item = CrawlyouyuanItem()
25 |
26 | item['username'] = self.get_username(response)
27 |         # age
28 |         item['age'] = self.get_age(response)
29 |         # avatar image URL
30 |         item['header_url'] = self.get_header_url(response)
31 |         # album image URLs
32 |         item['images_url'] = self.get_images_url(response)
33 |         # personal monologue
34 |         item['content'] = self.get_content(response)
35 |         # hometown
36 |         item['place_from'] = self.get_place_from(response)
37 |         # education
38 |         item['education'] = self.get_education(response)
39 |         # hobbies
40 |         item['hobby'] = self.get_hobby(response)
41 |         # profile page URL
42 |         item['source_url'] = response.url
43 |         # source website
44 |         item['source'] = "youyuan"
45 |
46 | yield item
47 |
48 | def get_username(self, response):
49 | username = response.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract()
50 | if len(username):
51 | username = username[0]
52 | else:
53 | username = "NULL"
54 | return username.strip()
55 |
56 | def get_age(self, response):
57 | age = response.xpath("//dl[@class='personal_cen']//dd/p/text()").extract()
58 | if len(age):
59 | age = re.findall(u"\d+岁", age[0])[0]
60 | else:
61 | age = "NULL"
62 | return age.strip()
63 |
64 | def get_header_url(self, response):
65 | header_url = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
66 | if len(header_url):
67 | header_url = header_url[0]
68 | else:
69 | header_url = "NULL"
70 | return header_url.strip()
71 |
72 | def get_images_url(self, response):
73 | images_url = response.xpath("//div[@class='ph_show']/ul/li/a/img/@src").extract()
74 | if len(images_url):
75 | images_url = ", ".join(images_url)
76 | else:
77 | images_url = "NULL"
78 | return images_url
79 |
80 | def get_content(self, response):
81 | content = response.xpath("//div[@class='pre_data']/ul/li/p/text()").extract()
82 | if len(content):
83 | content = content[0]
84 | else:
85 | content = "NULL"
86 | return content.strip()
87 |
88 | def get_place_from(self, response):
89 | place_from = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[1]/span/text()").extract()
90 | if len(place_from):
91 | place_from = place_from[0]
92 | else:
93 | place_from = "NULL"
94 | return place_from.strip()
95 |
96 | def get_education(self, response):
97 | education = response.xpath("//div[@class='pre_data']/ul/li[3]//ol[2]/li[2]/span/text()").extract()
98 | if len(education):
99 | education = education[0]
100 | else:
101 | education = "NULL"
102 | return education.strip()
103 |
104 | def get_hobby(self, response):
105 | hobby = response.xpath("//dl[@class='personal_cen']//ol/li/text()").extract()
106 | if len(hobby):
107 | hobby = ",".join(hobby).replace(" ", "")
108 | else:
109 | hobby = "NULL"
110 | return hobby.strip()
111 |
--------------------------------------------------------------------------------
/CrawlYouYuan/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl youyuan'.split())
--------------------------------------------------------------------------------
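Note on begin.py above: it starts the crawl by re-parsing a shell command through cmdline.execute. An equivalent in-process launcher uses Scrapy's CrawlerProcess API; a minimal sketch (nothing beyond the spider name 'youyuan' is taken from the project):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings and run the 'youyuan' spider in the current process.
process = CrawlerProcess(get_project_settings())
process.crawl('youyuan')
process.start()  # blocks until the crawl finishes
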
/CrawlYouYuan/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = CrawlYouYuan.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = CrawlYouYuan
12 |
--------------------------------------------------------------------------------
/DouBanMovie/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/.DS_Store
--------------------------------------------------------------------------------
/DouBanMovie/.idea/DouBanMovie.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/DouBanMovie/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/DouBanMovie/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/.DS_Store
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/__init__.py
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DoubanmovieItem(scrapy.Item):
12 |     # title
13 |     title = scrapy.Field()
14 |     # info line
15 |     info = scrapy.Field()
16 |     # rating
17 |     star = scrapy.Field()
18 |     # short blurb
19 |     quote = scrapy.Field()
20 |
21 |
22 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import codecs
9 | import json
10 | import pymongo
11 | from scrapy.conf import settings
12 |
13 | class DoubanmoviePipeline(object):
14 | host = settings["MONGODB_HOST"]
15 | port = settings["MONGODB_PORT"]
16 | dbname = settings["MONGODB_DBNAME"]
17 | sheetname = settings["MONGODB_SHEETNAME"]
18 |
19 |     # Create the MongoDB client connection
20 |     client = pymongo.MongoClient(host=host, port=port)
21 |     # Select the database
22 |     mydb = client[dbname]
23 |     # Collection that stores the scraped data
24 |     sheet = mydb[sheetname]
25 |     def process_item(self, item, spider):
26 |         # Option 1: append each item to a JSON-lines file
27 |         # self.filename = codecs.open('movie.json','a',encoding='utf-8')
28 |         # html = json.dumps(dict(item),ensure_ascii=False)
29 |         # self.filename.write(html + '\n')
30 |         # self.filename.close()
31 |         # Option 2: insert the item into MongoDB
32 | data = dict(item)
33 | self.sheet.insert(data)
34 |
35 | return item
36 |
37 |
38 |
--------------------------------------------------------------------------------
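The pipeline above pulls its MongoDB settings from scrapy.conf, which only exists in older Scrapy releases; current versions expose settings through the crawler instead. A minimal sketch of the same idea under that API (the class name MongoDBPipeline is hypothetical; the MONGODB_* names are the settings defined in settings.py):

import pymongo

class MongoDBPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the crawler, whose settings replace scrapy.conf.
        return cls(
            host=crawler.settings.get("MONGODB_HOST"),
            port=crawler.settings.getint("MONGODB_PORT"),
            dbname=crawler.settings.get("MONGODB_DBNAME"),
            sheetname=crawler.settings.get("MONGODB_SHEETNAME"),
        )

    def __init__(self, host, port, dbname, sheetname):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.sheet = self.client[dbname][sheetname]

    def process_item(self, item, spider):
        # insert_one() is the current PyMongo call; insert() is deprecated.
        self.sheet.insert_one(dict(item))
        return item
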
/DouBanMovie/DouBanMovie/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for DouBanMovie project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'DouBanMovie'
13 |
14 | SPIDER_MODULES = ['DouBanMovie.spiders']
15 | NEWSPIDER_MODULE = 'DouBanMovie.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'DouBanMovie (+http://www.yourdomain.com)'
20 | USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = True
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | #DOWNLOAD_DELAY = 3
32 | # The download delay setting will honor only one of:
33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | #CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | #COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | #TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | # DEFAULT_REQUEST_HEADERS = {
44 | # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)',
45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | # 'Accept-Language': 'en',
47 | # }
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'DouBanMovie.middlewares.MyCustomSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'DouBanMovie.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'DouBanMovie.pipelines.DoubanmoviePipeline': 300,
71 | }
72 | # MongoDB host
73 | MONGODB_HOST = "127.0.0.1"
74 | 
75 | # MongoDB port
76 | MONGODB_PORT = 27017
77 | 
78 | # Database name
79 | MONGODB_DBNAME = "Douban"
80 | 
81 | # Collection that stores the data
82 | MONGODB_SHEETNAME = "doubanmovies"
83 | # Enable and configure the AutoThrottle extension (disabled by default)
84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
85 | #AUTOTHROTTLE_ENABLED = True
86 | # The initial download delay
87 | #AUTOTHROTTLE_START_DELAY = 5
88 | # The maximum download delay to be set in case of high latencies
89 | #AUTOTHROTTLE_MAX_DELAY = 60
90 | # The average number of requests Scrapy should be sending in parallel to
91 | # each remote server
92 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
93 | # Enable showing throttling stats for every response received:
94 | #AUTOTHROTTLE_DEBUG = False
95 |
96 | # Enable and configure HTTP caching (disabled by default)
97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
98 | #HTTPCACHE_ENABLED = True
99 | #HTTPCACHE_EXPIRATION_SECS = 0
100 | #HTTPCACHE_DIR = 'httpcache'
101 | #HTTPCACHE_IGNORE_HTTP_CODES = []
102 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
103 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/spiders/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/spiders/.DS_Store
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/spiders/douban.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from DouBanMovie.items import DoubanmovieItem
4 |
5 | class DoubanSpider(scrapy.Spider):
6 | name = "douban"
7 | allowed_domains = ["movie.douban.com"]
8 | offset = 0
9 | url = 'https://movie.douban.com/top250?start='
10 | start_urls = (
11 | url + str(offset),
12 | )
13 |
14 | def parse(self, response):
15 |         # Each movie's info block on the page
16 |         movies = response.xpath("//div[@class='info']")
17 |         for eachmovie in movies:
18 |             # Build a fresh item for every movie instead of reusing a single instance
19 |             item = DoubanmovieItem()
20 |             titlelist = eachmovie.xpath("./div[@class='hd']/a/span[@class='title'][1]/text()")
21 | if len(titlelist) == 0:
22 | item['title'] = ''
23 | else:
24 | item['title'] = titlelist.extract()[0]
25 | info = eachmovie.xpath("./div[@class='bd']/p/text()").extract()[0]
26 | item['info'] = info.replace('\n','').strip()
27 | item['star'] = eachmovie.xpath("./div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()").extract()[0]
28 | quotelist = eachmovie.xpath("./div[@class='bd']/p[@class='quote']/span[@class='inq']/text()")
29 | if len(quotelist) == 0:
30 | item['quote'] = ''
31 | else:
32 | item['quote'] = quotelist.extract()[0]
33 | yield item
34 |
35 |
36 | if self.offset < 225:
37 | self.offset += 25
38 | yield scrapy.Request(self.url + str(self.offset),callback = self.parse)
39 |
40 |
--------------------------------------------------------------------------------
/DouBanMovie/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl douban'.split())
--------------------------------------------------------------------------------
/DouBanMovie/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = DouBanMovie.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = DouBanMovie
12 |
--------------------------------------------------------------------------------
/DouYuSpider/.idea/DouYuSpider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/DouYuSpider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/DouYuSpider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/DouYuSpider/__init__.py
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DouyuspiderItem(scrapy.Item):
12 | # define the fields for your item here like:
13 |     # vertical room cover image URL
14 |     vertical = scrapy.Field()
15 |     # anchor nickname
16 |     name = scrapy.Field()
17 |     # room snapshot image
18 |     room_src = scrapy.Field()
19 |     # anchor's city
20 |     anchor_city = scrapy.Field()
21 |     imagesPath = scrapy.Field()  # local path of the downloaded image
22 |
23 |
24 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class DouyuspiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import scrapy
9 | # import codecs
10 | # import json
11 | import os
12 | from scrapy.pipelines.images import ImagesPipeline
13 | from scrapy.utils.project import get_project_settings
14 |
15 | # class DouyuspiderPipeline(object):
16 | # def __init__(self):
17 | #         # Open a write-only file with utf-8 text encoding
18 | # self.filename = codecs.open('douyu.json', 'w', encoding='utf-8')
19 | # def process_item(self, item, spider):
20 | #
21 | # html = json.dumps(dict(item),ensure_ascii='utf-8')
22 | # self.filename.write(html + '\n')
23 | # return item
24 | #
25 | # # def spider_closed(self, spider):
26 | # # self.file.close()
27 |
28 | # Downloading images with Scrapy requires Pillow: pip install Pillow
29 | class DouYuImagesPipelines(ImagesPipeline):
30 | IMAGES_STORE = get_project_settings().get("IMAGES_STORE")
31 |
32 | def get_media_requests(self, item, info):
33 | image_url = item["vertical"]
34 | yield scrapy.Request(image_url)
35 |
36 | def item_completed(self, results, item, info):
37 |         # Standard pattern: collect the stored paths of the successfully downloaded images from results (see the ImagesPipeline source)
38 |         image_path = [x["path"] for ok, x in results if ok]
39 | 
40 |         os.rename(self.IMAGES_STORE + "/" + image_path[0], self.IMAGES_STORE + "/" + item["name"] + ".jpg")
41 |         item["imagesPath"] = self.IMAGES_STORE + "/" + item["name"] + ".jpg"
42 |
43 | return item
--------------------------------------------------------------------------------
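The pipeline above renames the downloaded file in item_completed. ImagesPipeline also lets the target filename be chosen up front by overriding file_path(); a rough sketch (the class name is hypothetical, and the nickname is passed through request.meta so the override does not depend on a particular file_path signature):

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class NamedImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Carry the anchor's nickname along with the image download request.
        yield scrapy.Request(item["vertical"], meta={"name": item["name"]})

    def file_path(self, request, response=None, info=None, **kwargs):
        # Save the image directly as <nickname>.jpg under IMAGES_STORE, no rename needed.
        return "%s.jpg" % request.meta["name"]
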
/DouYuSpider/DouYuSpider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for DouYuSpider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'DouYuSpider'
13 |
14 | SPIDER_MODULES = ['DouYuSpider.spiders']
15 | NEWSPIDER_MODULE = 'DouYuSpider.spiders'
16 |
17 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
18 |
19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
20 | #USER_AGENT = 'DouYuSpider (+http://www.yourdomain.com)'
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = True
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | DOWNLOAD_DELAY = 3
32 | # The download delay setting will honor only one of:
33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | #CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | #COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | #TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | #DEFAULT_REQUEST_HEADERS = {
44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 | # 'Accept-Language': 'en',
46 | #}
47 |
48 | # Enable or disable spider middlewares
49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
50 | #SPIDER_MIDDLEWARES = {
51 | # 'DouYuSpider.middlewares.DouyuspiderSpiderMiddleware': 543,
52 | #}
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | #DOWNLOADER_MIDDLEWARES = {
57 | # 'DouYuSpider.middlewares.MyCustomDownloaderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | # 'DouYuSpider.pipelines.DouyuspiderPipeline': 300,
70 | 'DouYuSpider.pipelines.DouYuImagesPipelines': 300,
71 | }
72 | IMAGES_STORE = "/Users/yunmei/Desktop/scrapyenv/Python-Spider/DouYuSpider/Images"
73 | # Logging
74 | # LOG_FILE = "dg.log"
75 | # LOG_LEVEL = "DEBUG"
76 |
77 | # Enable and configure the AutoThrottle extension (disabled by default)
78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
79 | #AUTOTHROTTLE_ENABLED = True
80 | # The initial download delay
81 | #AUTOTHROTTLE_START_DELAY = 5
82 | # The maximum download delay to be set in case of high latencies
83 | #AUTOTHROTTLE_MAX_DELAY = 60
84 | # The average number of requests Scrapy should be sending in parallel to
85 | # each remote server
86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
87 | # Enable showing throttling stats for every response received:
88 | #AUTOTHROTTLE_DEBUG = False
89 |
90 | # Enable and configure HTTP caching (disabled by default)
91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
92 | #HTTPCACHE_ENABLED = True
93 | #HTTPCACHE_EXPIRATION_SECS = 0
94 | #HTTPCACHE_DIR = 'httpcache'
95 | #HTTPCACHE_IGNORE_HTTP_CODES = []
96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
97 |
--------------------------------------------------------------------------------
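IMAGES_STORE in the settings above is an absolute path tied to one machine. One way to keep the download directory relative to the project, sketched here (it assumes the Images folder sits one level above the inner DouYuSpider package, as in this repository), is to derive it from settings.py itself:

import os
# Resolve .../DouYuSpider/Images relative to this settings.py file.
IMAGES_STORE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "Images")
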
/DouYuSpider/DouYuSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/spiders/douyu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 |
5 | from DouYuSpider.items import DouyuspiderItem
6 | class DouyuSpider(scrapy.Spider):
7 | name = 'douyu'
8 |     # allowed_domains must not include the scheme, i.e. not ['http://capi.douyucdn.cn']
9 | allowed_domains = ['capi.douyucdn.cn']
10 |
11 | offset = 0
12 | url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
13 |
14 | start_urls = [url + str(offset)]
15 |
16 | def parse(self, response):
17 | data = json.loads(response.text)['data']
18 |
19 | for each in data:
20 | item = DouyuspiderItem()
21 |
22 | item["vertical"] = each["vertical_src"].encode("utf-8")
23 | item["name"] = each["nickname"].encode("utf-8")
24 | item["room_src"] = each["room_src"].encode("utf-8")
25 | item["anchor_city"] = each["anchor_city"].encode("utf-8")
26 |
27 | yield item
28 |
29 | self.offset += 20
30 | yield scrapy.Request(self.url + str(self.offset),callback = self.parse)
31 |
32 |
--------------------------------------------------------------------------------
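parse() above always schedules the next page, so the spider never stops on its own. A small guard, assuming the API returns an empty data list once there are no more rooms, could replace the last two lines of parse():

        # Only request the next page while the API is still returning rooms.
        if data:
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
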
/DouYuSpider/Images/Cute兔丶.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Cute兔丶.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/MiS媛.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/MiS媛.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/Super超级冷.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Super超级冷.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/Yozi柚子妹妹.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Yozi柚子妹妹.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/pinky水蜜桃.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/pinky水蜜桃.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/一只小玲儿.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/一只小玲儿.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/会玩的黄宝宝.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/会玩的黄宝宝.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/冷伊宁.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/冷伊宁.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/十四万岁的青丘老太婆.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/十四万岁的青丘老太婆.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/可乐小十五.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/可乐小十五.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/吃萝卜的辛巴.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/吃萝卜的辛巴.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/咘咘柳.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/咘咘柳.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/大宝SOD蜜不是润肤露.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/大宝SOD蜜不是润肤露.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/大木头CL.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/大木头CL.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小依泽儿.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小依泽儿.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小口古小咕.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小口古小咕.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小圆脸娜娜.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小圆脸娜娜.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小小小思齐.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小小小思齐.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小雅er.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小雅er.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/尛小钰.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/尛小钰.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/左思念.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/左思念.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/巫女蛋.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/巫女蛋.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/布丁味的雯宝宝.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/布丁味的雯宝宝.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/幼齿懵骚小安妮.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/幼齿懵骚小安妮.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/悠悠fairy.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/悠悠fairy.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/懵G娜.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/懵G娜.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/是囧囧初啊.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/是囧囧初啊.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/江沅是个小可爱.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/江沅是个小可爱.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/温柔的喵小胖.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/温柔的喵小胖.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/爱笑的蒙蒙.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/爱笑的蒙蒙.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/璇璇璇儿丶Tay.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/璇璇璇儿丶Tay.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/甜馨大队长.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/甜馨大队长.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/白羊可爱多.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/白羊可爱多.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/磨人的小柠檬.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/磨人的小柠檬.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/糖炒栗子lr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/糖炒栗子lr.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/糖糖小萌主.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/糖糖小萌主.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/紫絮儿521.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/紫絮儿521.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/苏思淳sheep.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/苏思淳sheep.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/若儿被注册了呢.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/若儿被注册了呢.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/诗诗诗诗诗诗酱.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/诗诗诗诗诗诗酱.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/谷猫宁.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/谷猫宁.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/辣椒酱jiang.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/辣椒酱jiang.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/迷人的小北北.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/迷人的小北北.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/阿青Dale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/阿青Dale.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/陈梓不是橙子.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/陈梓不是橙子.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/鲸鱼妹爱素颜.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/鲸鱼妹爱素颜.jpg
--------------------------------------------------------------------------------
/DouYuSpider/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl douyu'.split())
--------------------------------------------------------------------------------
/DouYuSpider/douyu.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/douyu.json
--------------------------------------------------------------------------------
/DouYuSpider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = DouYuSpider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = DouYuSpider
12 |
--------------------------------------------------------------------------------
/HongNiangNet/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/.DS_Store
--------------------------------------------------------------------------------
/HongNiangNet/.idea/HongNiangNet.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/HongNiangNet/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/HongNiangNet/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/.DS_Store
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/__init__.py
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy import Field, Item
10 |
11 | class HongniangnetItem(Item):
12 |     # define the fields for your item here like:
13 |     # username
14 |     username = Field()
15 |     # age
16 |     age = Field()
17 |     # avatar image link
18 |     header_link = Field()
19 |     # photo album image links
20 |     images_url = Field()
21 |     # personal monologue
22 |     content = Field()
23 |     # place of origin
24 |     place_from = Field()
25 |     # education
26 |     education = Field()
27 |     # hobbies
28 |     hobby = Field()
29 |     # profile page link
30 |     source_url = Field()
31 |     # source website
32 |     source = Field()
33 |
34 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class HongniangnetSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import codecs
9 | import json
10 |
11 | class HongniangnetPipeline(object):
12 |
13 |     def __init__(self):
14 |         # one JSON object per line, UTF-8 encoded
15 |         self.filename = codecs.open('content.json', 'w', encoding='utf-8')
16 |
17 |     def process_item(self, item, spider):
18 |         line = json.dumps(dict(item), ensure_ascii=False)
19 |         self.filename.write(line + '\n')
20 |         return item
21 |
22 |     def close_spider(self, spider):
23 |         self.filename.close()
23 |
--------------------------------------------------------------------------------
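
HongniangnetPipeline writes one JSON object per line to content.json, so the output can be read back line by line. A minimal sketch (not part of the repository), assuming content.json sits in the directory the crawl was started from:

    import json
    import codecs

    # each line of content.json is one serialized HongniangnetItem
    with codecs.open('content.json', 'r', encoding='utf-8') as f:
        for line in f:
            profile = json.loads(line)
            print(profile['username'], profile['age'])
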
/HongNiangNet/HongNiangNet/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for HongNiangNet project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'HongNiangNet'
13 |
14 | SPIDER_MODULES = ['HongNiangNet.spiders']
15 | NEWSPIDER_MODULE = 'HongNiangNet.spiders'
16 |
17 | # Redis host and port for the distributed (scrapy-redis) crawl
18 | REDIS_HOST = '192.168.19.206'
19 | REDIS_PORT = 6379
20 |
21 |
22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
23 | #USER_AGENT = 'HongNiangNet (+http://www.yourdomain.com)'
24 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
25 | # Obey robots.txt rules
26 | ROBOTSTXT_OBEY = True
27 |
28 |
29 | # Use the scrapy-redis dedupe filter instead of Scrapy's default
30 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
31 | # Use the scrapy-redis scheduler instead of Scrapy's default
32 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
33 | # Schedule requests in FIFO (queue) order
34 | SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
35 | # Allow pausing/resuming: keep request records in Redis
36 | SCHEDULER_PERSIST = True
37 |
38 |
39 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
40 | #CONCURRENT_REQUESTS = 32
41 |
42 | # Configure a delay for requests for the same website (default: 0)
43 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
44 | # See also autothrottle settings and docs
45 | # DOWNLOAD_DELAY = 3
46 | # The download delay setting will honor only one of:
47 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
48 | #CONCURRENT_REQUESTS_PER_IP = 16
49 |
50 | # Disable cookies (enabled by default)
51 | #COOKIES_ENABLED = False
52 |
53 | # Disable Telnet Console (enabled by default)
54 | #TELNETCONSOLE_ENABLED = False
55 |
56 | # Override the default request headers:
57 | #DEFAULT_REQUEST_HEADERS = {
58 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59 | # 'Accept-Language': 'en',
60 | #}
61 |
62 | # Enable or disable spider middlewares
63 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
64 | # SPIDER_MIDDLEWARES = {
65 | # 'HongNiangNet.middlewares.HongniangnetSpiderMiddleware': 543,
66 | # }
67 |
68 | # Enable or disable downloader middlewares
69 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
70 | #DOWNLOADER_MIDDLEWARES = {
71 | # 'HongNiangNet.middlewares.MyCustomDownloaderMiddleware': 543,
72 | #}
73 |
74 | # Enable or disable extensions
75 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
76 | #EXTENSIONS = {
77 | # 'scrapy.extensions.telnet.TelnetConsole': None,
78 | #}
79 |
80 | # Configure item pipelines
81 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
82 | ITEM_PIPELINES = {
83 | 'HongNiangNet.pipelines.HongniangnetPipeline': 300,
84 | 'scrapy_redis.pipelines.RedisPipeline' : 400,
85 | }
86 |
87 | # Enable and configure the AutoThrottle extension (disabled by default)
88 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
89 | #AUTOTHROTTLE_ENABLED = True
90 | # The initial download delay
91 | #AUTOTHROTTLE_START_DELAY = 5
92 | # The maximum download delay to be set in case of high latencies
93 | #AUTOTHROTTLE_MAX_DELAY = 60
94 | # The average number of requests Scrapy should be sending in parallel to
95 | # each remote server
96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
97 | # Enable showing throttling stats for every response received:
98 | #AUTOTHROTTLE_DEBUG = False
99 |
100 | # Enable and configure HTTP caching (disabled by default)
101 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
102 | #HTTPCACHE_ENABLED = True
103 | #HTTPCACHE_EXPIRATION_SECS = 0
104 | #HTTPCACHE_DIR = 'httpcache'
105 | #HTTPCACHE_IGNORE_HTTP_CODES = []
106 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
107 |
--------------------------------------------------------------------------------
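
With scrapy_redis.pipelines.RedisPipeline enabled above, scraped items are also pushed to Redis in addition to the JSON file written by HongniangnetPipeline. A minimal sketch (not part of the repository) for inspecting them, assuming scrapy-redis's default item key layout '<spider name>:items' and the REDIS_HOST/REDIS_PORT configured above:

    import json
    import redis

    r = redis.StrictRedis(host='192.168.19.206', port=6379)
    # the hongniang spider's items land in the 'hongniang:items' list by default
    for raw in r.lrange('hongniang:items', 0, 9):
        print(json.loads(raw).get('username'))
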
/HongNiangNet/HongNiangNet/spiders/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/spiders/.DS_Store
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/spiders/hongniang.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | # from scrapy.spiders import CrawlSpider, Rule
5 | from HongNiangNet.items import HongniangnetItem
6 | # distributed crawling
7 | from scrapy.spiders import Rule
8 | from scrapy_redis.spiders import RedisCrawlSpider
9 |
10 | # class HongniangSpider(CrawlSpider):
11 | class HongniangSpider(RedisCrawlSpider):
12 |
13 |     name = 'hongniang'
14 |     allowed_domains = ['hongniang.com']
15 |     # start_urls = ['http://www.hongniang.com/match?&page=1']
16 |     redis_key = "hongniangSpider:start_urls"
17 |
18 |     # build the allowed-domains list dynamically
19 |     def __init__(self, *args, **kwargs):
20 |         # Dynamically define the allowed domains list.
21 |         domain = kwargs.pop('domain', '')
22 |         self.allowed_domains = list(filter(None, domain.split(',')))
23 |         super(HongniangSpider, self).__init__(*args, **kwargs)
24 |
25 |     # rule for the paginated list pages (the literal '?' must be escaped in the regex)
26 |     page_links = LinkExtractor(allow=(r"hongniang.com/match\?&page=\d+"))
27 |     # rule for each member's profile page
28 |     profile_links = LinkExtractor(allow=(r"hongniang.com/user/member/id/\d+"))
29 |     rules = (
30 |         # no callback, so follow defaults to True
31 |         Rule(page_links),
32 |         # with a callback, follow defaults to False, so it is set explicitly
33 |         Rule(profile_links, callback='parse_item', follow=True),
34 |     )
35 |
36 |     def parse_item(self, response):
37 |         item = HongniangnetItem()
38 |         # note: XPath positions are 1-based, not 0-based
39 |         # username
40 |         item["username"] = self.get_username(response)
41 |         # age
42 |         item["age"] = self.get_age(response)
43 |         # avatar image link
44 |         item["header_link"] = self.get_header_link(response)
45 |         # photo album image links
46 |         item["images_url"] = self.get_images_url(response)
47 |         # personal monologue
48 |         item["content"] = self.get_content(response)
49 |         # place of origin
50 |         item["place_from"] = self.get_place_from(response)
51 |         # education
52 |         item["education"] = self.get_education(response)
53 |         # hobbies
54 |         item["hobby"] = self.get_hobby(response)
55 |         # profile page URL
56 |         item["source_url"] = response.url
57 |         # source website
58 |         item["source"] = "hongniang"
59 |
60 |         yield item
61 |
62 |     def get_username(self, response):
63 |         username = response.xpath("//div[@class='name nickname']/text()").extract()
64 |         if len(username):
65 |             username = username[0]
66 |         else:
67 |             username = "NULL"
68 |         return username.strip()
69 |
70 |     def get_age(self, response):
71 |         age = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div[1]/ul[1]/li[1]/text()").extract()
72 |         if len(age):
73 |             age = age[0]
74 |             print(age)
75 |         else:
76 |             age = "NULL"
77 |         return age.strip()
78 |
79 |     def get_header_link(self, response):
80 |         header_link = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img[1]/@src").extract()
81 |         if len(header_link):
82 |             header_link = header_link[0]
83 |         else:
84 |             header_link = "NULL"
85 |         return header_link.strip()
86 |
87 |     def get_images_url(self, response):
88 |         images_url = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img/@src").extract()
89 |         if not len(images_url):
90 |             images_url = "NULL"
91 |         return images_url
92 |
93 |     def get_content(self, response):
94 |         content = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info5']/div[@class='text']/text()").extract()
95 |         if len(content):
96 |             content = content[0]
97 |         else:
98 |             content = "NULL"
99 |         return content.strip()
100 |
101 |     def get_place_from(self, response):
102 |         place_from = response.xpath("//div[@class='mem_main']/div[@class='sub2']/div[@class='info1'][1]/div[@class='right']/ul[2]/li[1]/text()").extract()
103 |         if len(place_from):
104 |             place_from = place_from[0]
105 |         else:
106 |             place_from = "NULL"
107 |         return place_from.strip()
108 |
109 |     def get_education(self, response):
110 |         education = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div/ul[2]/li[2]/text()").extract()
111 |         if len(education):
112 |             education = education[0]
113 |         else:
114 |             education = "NULL"
115 |         return education.strip()
116 |
117 |     def get_hobby(self, response):
118 |         hobby = response.xpath("//div[@class='mem_main']//div[@class='sub2']/div[@class='info1'][2]/div[@class='right'][1]/ul[1]/li[4]/text()").extract()
119 |         if len(hobby):
120 |             hobby = hobby[0]
121 |         else:
122 |             hobby = "NULL"
123 |         return hobby.strip()
124 |
--------------------------------------------------------------------------------
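
Because HongniangSpider is a RedisCrawlSpider, it idles after startup until its redis_key list receives a URL. A minimal sketch (not part of the repository) for seeding the crawl, using the redis_key defined above, the REDIS_HOST/REDIS_PORT from settings.py, and the URL left commented out as start_urls:

    import redis

    r = redis.StrictRedis(host='192.168.19.206', port=6379)
    # push the first list page; the spider's link-extraction rules take over from there
    r.lpush('hongniangSpider:start_urls', 'http://www.hongniang.com/match?&page=1')
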
/HongNiangNet/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl hongniang'.split())
--------------------------------------------------------------------------------
/HongNiangNet/content.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/content.json
--------------------------------------------------------------------------------
/HongNiangNet/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = HongNiangNet.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = HongNiangNet
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python-Spider
2 | 1. Douban Movie Top 250
3 | 2. Douyu: crawl JSON data and download streamer photos
4 | 3. CrawlSpider crawl of basic profile info from the HongNiang matchmaking site; distributed crawl with results stored in Redis
5 | 4. Small crawler demos
6 | 5. Using Selenium
7 | 6. PIL
8 | 7. Crawl Duodian products, store them in MySQL, and display them on a Django web page
9 | 8. Building an API with Django
10 | 9. Parsing txt, csv, and xml files with Python
11 | 10. Simple spiders with the Scrapy framework
12 | 11. Scrape Taobao model info, download it locally, and store it in MySQL
13 | 12. Crawl Youyuan user info
14 | 13. Simulated GitHub login
15 | 14. Dynamic simulated login with Selenium
16 | 15. Simulated Zhihu login
17 | 16. Crawl Tencent job postings
18 | 17. [Crawl all product info from the Duodian site](https://github.com/lb2281075105/LBDuoDian)
19 | 18. Simulated JD.com login
20 | 19. Download NetEase Cloud Music lyrics
21 | 20. Taobao info
22 | 21. JD.com product detail page info
23 | 22. Simulated Tuchong login
24 | 23. itchat: fetch articles shared in WeChat groups or by WeChat friends
25 | 24. Crawl the article history of a WeChat official account
26 | 25. itchat: monitor articles shared by a specified WeChat official account
27 | 26. itchat: anti-recall for WeChat groups and friends
28 | 27. Forward messages between WeChat groups
29 | 28. Download Bilibili videos, including multi-part collections
30 | 29. Crawl m3u8 videos
31 |
32 | For details, see the Jianshu [Python collection](http://www.jianshu.com/nb/18442681)
33 |
--------------------------------------------------------------------------------
/duodian/.idea/duodian.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/duodian/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/duodian/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/duodian/db.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/db.sqlite3
--------------------------------------------------------------------------------
/duodian/duodian/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/duodian/__init__.py
--------------------------------------------------------------------------------
/duodian/duodian/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for duodian project.
3 |
4 | Generated by 'django-admin startproject' using Django 1.11.4.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.11/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/1.11/ref/settings/
11 | """
12 |
13 | import os
14 |
15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 |
19 | # Quick-start development settings - unsuitable for production
20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/
21 |
22 | # SECURITY WARNING: keep the secret key used in production secret!
23 | SECRET_KEY = 'htonb%m8+_d=tsnqm)6)_q@2@m#ulx#nb!8$wbluo9&1yi$yh$'
24 |
25 | # SECURITY WARNING: don't run with debug turned on in production!
26 | DEBUG = True
27 |
28 | ALLOWED_HOSTS = []
29 |
30 |
31 | # Application definition
32 |
33 | INSTALLED_APPS = [
34 | 'django.contrib.admin',
35 | 'django.contrib.auth',
36 | 'django.contrib.contenttypes',
37 | 'django.contrib.sessions',
38 | 'django.contrib.messages',
39 | 'django.contrib.staticfiles',
40 | 'myduodian',
41 | ]
42 |
43 | MIDDLEWARE = [
44 | 'django.middleware.security.SecurityMiddleware',
45 | 'django.contrib.sessions.middleware.SessionMiddleware',
46 | 'django.middleware.common.CommonMiddleware',
47 | 'django.middleware.csrf.CsrfViewMiddleware',
48 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
49 | 'django.contrib.messages.middleware.MessageMiddleware',
50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
51 | ]
52 | MIDDLEWARE_CLASSES = [
53 | 'django.contrib.sessions.middleware.SessionMiddleware',
54 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
55 | 'django.contrib.messages.middleware.MessageMiddleware',
56 | ]
57 | ROOT_URLCONF = 'duodian.urls'
58 |
59 | TEMPLATES = [
60 | {
61 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
62 | 'DIRS': [os.path.join(BASE_DIR, 'templates')],
63 | 'APP_DIRS': True,
64 | 'OPTIONS': {
65 | 'context_processors': [
66 | 'django.template.context_processors.debug',
67 | 'django.template.context_processors.request',
68 | 'django.contrib.auth.context_processors.auth',
69 | 'django.contrib.messages.context_processors.messages',
70 | ],
71 | },
72 | },
73 | ]
74 |
75 | WSGI_APPLICATION = 'duodian.wsgi.application'
76 |
77 |
78 | # Database
79 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases
80 |
81 | DATABASES = {
82 | 'default': {
83 | 'ENGINE': 'django.db.backends.mysql',
84 | 'HOST':'127.0.0.1',
85 | 'PORT':'3306',
86 | 'NAME': 'test',
87 | 'USER':'root',
88 | 'PASSWORD':'',
89 | }
90 | }
91 |
92 |
93 | # Password validation
94 | # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators
95 |
96 | AUTH_PASSWORD_VALIDATORS = [
97 | {
98 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
99 | },
100 | {
101 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
102 | },
103 | {
104 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
105 | },
106 | {
107 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
108 | },
109 | ]
110 |
111 |
112 | # Internationalization
113 | # https://docs.djangoproject.com/en/1.11/topics/i18n/
114 |
115 | LANGUAGE_CODE = 'zh-hans'
116 |
117 | TIME_ZONE = 'UTC'
118 |
119 | USE_I18N = True
120 |
121 | USE_L10N = True
122 |
123 | USE_TZ = True
124 |
125 |
126 | # Static files (CSS, JavaScript, Images)
127 | # https://docs.djangoproject.com/en/1.11/howto/static-files/
128 |
129 | STATIC_URL = '/static/'
130 |
--------------------------------------------------------------------------------
/duodian/duodian/urls.py:
--------------------------------------------------------------------------------
1 | """duodian URL Configuration
2 |
3 | The `urlpatterns` list routes URLs to views. For more information please see:
4 | https://docs.djangoproject.com/en/1.11/topics/http/urls/
5 | Examples:
6 | Function views
7 | 1. Add an import: from my_app import views
8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
9 | Class-based views
10 | 1. Add an import: from other_app.views import Home
11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
12 | Including another URLconf
13 | 1. Import the include() function: from django.conf.urls import url, include
14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls'))
15 | """
16 | from django.conf.urls import url
17 | from django.contrib import admin
18 | from myduodian import views
19 | urlpatterns = [
20 | url(r'^admin/', admin.site.urls),
21 | url(r'^index/', views.index),
22 | ]
23 |
--------------------------------------------------------------------------------
/duodian/duodian/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for duodian project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "duodian.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/duodian/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "duodian.settings")
7 | try:
8 | from django.core.management import execute_from_command_line
9 | except ImportError:
10 | # The above import may fail for some other reason. Ensure that the
11 | # issue is really that Django is missing to avoid masking other
12 | # exceptions on Python 2.
13 | try:
14 | import django
15 | except ImportError:
16 | raise ImportError(
17 | "Couldn't import Django. Are you sure it's installed and "
18 | "available on your PYTHONPATH environment variable? Did you "
19 | "forget to activate a virtual environment?"
20 | )
21 | raise
22 | execute_from_command_line(sys.argv)
23 |
--------------------------------------------------------------------------------
/duodian/myduodian/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/myduodian/__init__.py
--------------------------------------------------------------------------------
/duodian/myduodian/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
5 | from myduodian.models import AiDuoDian
6 |
7 | class DuoDianAdmin(admin.ModelAdmin):
8 |     list_display = ['goodName', 'price', 'image']
9 |
10 |
11 | admin.site.register(AiDuoDian, DuoDianAdmin)
--------------------------------------------------------------------------------
/duodian/myduodian/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models, migrations
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='AiDuoDian',
15 | fields=[
16 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
17 | ('image', models.CharField(max_length=1000)),
18 | ('goodName', models.CharField(max_length=200)),
19 | ('price', models.CharField(max_length=40)),
20 | ],
21 | ),
22 | ]
23 |
--------------------------------------------------------------------------------
/duodian/myduodian/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/myduodian/migrations/__init__.py
--------------------------------------------------------------------------------
/duodian/myduodian/models.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from django.db import models
3 |
4 | class AiDuoDian(models.Model):
5 |
6 |     image = models.CharField(max_length=1000)
7 |     goodName = models.CharField(max_length=200)
8 |     price = models.CharField(max_length=40)
--------------------------------------------------------------------------------
/duodian/myduodian/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/duodian/myduodian/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 | from django.http import HttpResponse
3 | # Create your views here.
4 |
5 | from myduodian.models import *
6 |
7 | def index(request):
8 |     context = {"list": AiDuoDian.objects.all()}
9 |     return render(request, 'myduodian/index.html', context)
--------------------------------------------------------------------------------
/duodian/templates/myduodian/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="UTF-8">
5 |     <title>Title</title>
6 | </head>
7 | <body>
8 | <table>
9 |     <thead>
10 |     <tr>
11 |         <th>Product image</th><th>Product name</th><th>Price</th>
12 |     </tr>
13 |     </thead>
14 |     <tbody>
15 |     {% for item in list %}
16 |     <tr>
17 |         <td><img src="{{ item.image }}"></td>
18 |         <td>{{ item.goodName }}</td>
19 |         <td>{{ item.price }}</td>
20 |     </tr>
21 |     {% endfor %}
22 |     </tbody>
23 | </table>
24 | </body>
25 | </html>
--------------------------------------------------------------------------------
/duodian/woduodian.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 |
3 | import MySQLdb
4 | import json
5 | import jsonpath
6 | import urllib2
7 | import os
8 |
9 | class DuoDian():
10 |     def __init__(self):
11 |         self.url = 'https://gatewx.dmall.com/customersite/searchWareByCategory?param={"pageNum":1,"pageSize":30,"venderId":"1","storeId":"108","sort":"1","categoryId":11347,"categoryLevel":3,"cateSource":1,"bizType":"1"}&token=&source=2&tempid=C7B357489E400002B1514BD01B00E270&pubParam={"utmSource":"wxmp"}&_=1511256196255'
12 |         # connect to the database
13 |         self.db = MySQLdb.connect(host='127.0.0.1', user="root", passwd="", db="test")
14 |         # get a cursor for executing statements
15 |         self.cursor = self.db.cursor()
16 |
17 |     def get_html(self):
18 |         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
19 |         request = urllib2.Request(self.url, headers=headers)
20 |         response = urllib2.urlopen(request)
21 |         html = response.read()
22 |         return html
23 |
24 |     def get_html1(self, url):
25 |         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
26 |         request = urllib2.Request(url, headers=headers)
27 |         response = urllib2.urlopen(request)
28 |         html = response.read()
29 |         return html
30 |
31 |     def get_content(self):
32 |         jsonobj = json.loads(self.get_html())
33 |         # product names
34 |         namelist = jsonpath.jsonpath(jsonobj, '$..title')
35 |         # product prices
36 |         pricelist = jsonpath.jsonpath(jsonobj, '$..promotionPrice')
37 |         # product images
38 |         imglist = jsonpath.jsonpath(jsonobj, '$..img')
39 |         listdata = zip(imglist, namelist, pricelist)
40 |
41 |         for item in listdata:
42 |             try:
43 |                 result = self.cursor.execute(
44 |                     "insert into myduodian_aiduodian (image,goodName,price) VALUES (%s,%s,%s)",
45 |                     [item[0], item[1], item[2]])
46 |                 self.db.commit()
47 |                 print(result)
48 |             except Exception as e:
49 |                 self.db.rollback()
50 |                 print('insert failed')
51 |
52 |         # close the connection and release resources
53 |         self.db.close()
54 |
55 |
56 | if __name__ == "__main__":
57 |     duodian = DuoDian()
58 |     duodian.get_content()
--------------------------------------------------------------------------------
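
woduodian.py inserts straight into the myduodian_aiduodian table that the 0001_initial migration creates, so the rows can be checked back through the Django ORM. A minimal sketch (not part of the repository), assuming the duodian project has been migrated against the same MySQL database and the script has already run; execute it inside `python manage.py shell`:

    # verify the inserted rows via the AiDuoDian model
    from myduodian.models import AiDuoDian

    print(AiDuoDian.objects.count())
    for good in AiDuoDian.objects.all()[:5]:
        print(good.goodName, good.price, good.image)
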
/gongzhonghao.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/gongzhonghao.jpeg
--------------------------------------------------------------------------------
/jiekou/.idea/jiekou.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/jiekou/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/jiekou/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/jiekou/db.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/db.sqlite3
--------------------------------------------------------------------------------
/jiekou/jiekou/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/jiekou/__init__.py
--------------------------------------------------------------------------------
/jiekou/jiekou/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for jiekou project.
3 |
4 | Generated by 'django-admin startproject' using Django 1.8.2.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.8/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/1.8/ref/settings/
11 | """
12 |
13 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
14 | import os
15 |
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 |
19 | # Quick-start development settings - unsuitable for production
20 | # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/
21 |
22 | # SECURITY WARNING: keep the secret key used in production secret!
23 | SECRET_KEY = '3!2z2kqm4erg8#8y1+5n1%wl3lw32@1u&4mlnh+orzl%ns39wq'
24 |
25 | # SECURITY WARNING: don't run with debug turned on in production!
26 | DEBUG = True
27 |
28 | ALLOWED_HOSTS = []
29 |
30 |
31 | # Application definition
32 |
33 | INSTALLED_APPS = (
34 | 'django.contrib.admin',
35 | 'django.contrib.auth',
36 | 'django.contrib.contenttypes',
37 | 'django.contrib.sessions',
38 | 'django.contrib.messages',
39 | 'django.contrib.staticfiles',
40 | 'myjiekou',
41 | )
42 |
43 | MIDDLEWARE_CLASSES = (
44 | 'django.contrib.sessions.middleware.SessionMiddleware',
45 | 'django.middleware.common.CommonMiddleware',
46 | 'django.middleware.csrf.CsrfViewMiddleware',
47 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
48 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
49 | 'django.contrib.messages.middleware.MessageMiddleware',
50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
51 | 'django.middleware.security.SecurityMiddleware',
52 | )
53 |
54 | ROOT_URLCONF = 'jiekou.urls'
55 |
56 | TEMPLATES = [
57 | {
58 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
59 | 'DIRS': [os.path.join(BASE_DIR, 'templates')],
60 | 'APP_DIRS': True,
61 | 'OPTIONS': {
62 | 'context_processors': [
63 | 'django.template.context_processors.debug',
64 | 'django.template.context_processors.request',
65 | 'django.contrib.auth.context_processors.auth',
66 | 'django.contrib.messages.context_processors.messages',
67 | ],
68 | },
69 | },
70 | ]
71 |
72 | WSGI_APPLICATION = 'jiekou.wsgi.application'
73 |
74 |
75 | # Database
76 | # https://docs.djangoproject.com/en/1.8/ref/settings/#databases
77 |
78 | DATABASES = {
79 | 'default': {
80 | 'ENGINE': 'django.db.backends.mysql',
81 | 'HOST':'127.0.0.1',
82 | 'PORT':'3306',
83 | 'NAME': 'test',
84 | 'USER':'root',
85 | 'PASSWORD':'',
86 | }
87 | }
88 |
89 |
90 | # Internationalization
91 | # https://docs.djangoproject.com/en/1.8/topics/i18n/
92 |
93 | LANGUAGE_CODE = 'zh-hans'
94 |
95 | TIME_ZONE = 'UTC'
96 |
97 | USE_I18N = True
98 |
99 | USE_L10N = True
100 |
101 | USE_TZ = True
102 |
103 |
104 | # Static files (CSS, JavaScript, Images)
105 | # https://docs.djangoproject.com/en/1.8/howto/static-files/
106 |
107 | STATIC_URL = '/static/'
108 |
--------------------------------------------------------------------------------
/jiekou/jiekou/urls.py:
--------------------------------------------------------------------------------
1 | """jiekou URL Configuration
2 |
3 | The `urlpatterns` list routes URLs to views. For more information please see:
4 | https://docs.djangoproject.com/en/1.8/topics/http/urls/
5 | Examples:
6 | Function views
7 | 1. Add an import: from my_app import views
8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
9 | Class-based views
10 | 1. Add an import: from other_app.views import Home
11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
12 | Including another URLconf
13 | 1. Add an import: from blog import urls as blog_urls
14 | 2. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
15 | """
16 | from django.conf.urls import include, url
17 | from django.contrib import admin
18 | from myjiekou import views
19 | urlpatterns = [
20 | url(r'^admin/', include(admin.site.urls)),
21 | url(r'^index/', views.index),
22 | url(r'^api/', views.api),
23 | ]
24 |
--------------------------------------------------------------------------------
/jiekou/jiekou/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for jiekou project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiekou.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/jiekou/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiekou.settings")
7 |
8 | from django.core.management import execute_from_command_line
9 |
10 | execute_from_command_line(sys.argv)
11 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/myjiekou/__init__.py
--------------------------------------------------------------------------------
/jiekou/myjiekou/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | from myjiekou.models import MyModel
4 | # Register your models here.
5 |
6 | class MyAdmin(admin.ModelAdmin):
7 |     list_display = ["name", "age", "hobby"]
8 |
9 | admin.site.register(MyModel, MyAdmin)
10 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models, migrations
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='MyModel',
15 | fields=[
16 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
17 | ('name', models.CharField(max_length=20)),
18 | ('age', models.CharField(max_length=100)),
19 | ('hobby', models.CharField(max_length=300)),
20 | ],
21 | ),
22 | ]
23 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/myjiekou/migrations/__init__.py
--------------------------------------------------------------------------------
/jiekou/myjiekou/models.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | from django.db import models
3 |
4 | # Create your models here.
5 |
6 | class MyModel(models.Model):
7 |     # name
8 |     name = models.CharField(max_length=20)
9 |     # age
10 |     age = models.CharField(max_length=100)
11 |     # hobby
12 |     hobby = models.CharField(max_length=300)
13 |
14 |
15 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/views.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | from django.shortcuts import render
3 | from django.http import HttpResponse, JsonResponse
4 | from myjiekou.models import MyModel
5 | import json
6 |
7 | def index(request):
8 |     content = MyModel.objects.all()
9 |     context = {"content": content}
10 |     return render(request, "myjiekou/index.html", context)
11 |
12 | def api(request):
13 |     results = []
14 |     content = MyModel.objects.all()
15 |     for one in content:
16 |         # build a fresh dict per record; reusing one dict would repeat the last row
17 |         item = {"name": one.name, "age": one.age, "hobby": one.hobby}
18 |         results.append(item)
19 |     return JsonResponse({"status": 200, "data": results})
--------------------------------------------------------------------------------
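
The api view returns a plain JSON payload, so the endpoint can be exercised with any HTTP client once the development server is up. A minimal sketch (not part of the repository) using the requests library, assuming the default `python manage.py runserver` address and the /api/ route from jiekou/urls.py:

    import requests

    # fetch the JSON produced by myjiekou.views.api
    payload = requests.get('http://127.0.0.1:8000/api/').json()
    print(payload['status'])
    for person in payload['data']:
        print(person['name'], person['age'], person['hobby'])
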
/jiekou/templates/myjiekou/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="UTF-8">
5 |     <title>Title</title>
6 | </head>
7 | <body>
8 | {% for item in content %}
9 | <ul>
10 |     <li>{{ item.name }}</li>
11 |     <li>{{ item.age }}</li>
12 |     <li>{{ item.hobby }}</li>
13 | </ul>
14 | {% endfor %}
15 | </body>
16 | </html>
--------------------------------------------------------------------------------
/teacherInfo/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/teacherInfo/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/teacherInfo/.idea/teacherInfo.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/teacherInfo/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl myteacher'.split())
--------------------------------------------------------------------------------
/teacherInfo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = teacherInfo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = teacherInfo
12 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/teacherInfo/teacherInfo/__init__.py
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | # Item defines structured fields used to hold the scraped data
11 | class TeacherinfoItem(scrapy.Item):
12 |
13 |     # teacher name
14 |     name = scrapy.Field()
15 |     # title / position
16 |     position = scrapy.Field()
17 |     # personal profile
18 |     info = scrapy.Field()
19 |
20 |
21 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class TeacherinfoSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import json
9 | import codecs
10 | class TeacherinfoPipeline(object):
11 | def __init__(self):
12 | self.filename = codecs.open('teacher.json','wb','utf-8')
13 | def process_item(self, item, spider):
14 | print(item)
15 | html = json.dumps(dict(item),ensure_ascii=False)
16 | self.filename.write(html + '\n')
17 | return item
18 |
19 | def close_spider(self, spider):
20 | # 爬虫结束时关闭文件
21 | self.filename.close()
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for teacherInfo project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'teacherInfo'
13 |
14 | SPIDER_MODULES = ['teacherInfo.spiders']
15 | NEWSPIDER_MODULE = 'teacherInfo.spiders'
16 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'teacherInfo (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | SPIDER_MIDDLEWARES = {
50 | 'teacherInfo.middlewares.TeacherinfoSpiderMiddleware': 543,
51 | }
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'teacherInfo.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'teacherInfo.pipelines.TeacherinfoPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/spiders/myteacher.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from teacherInfo.items import TeacherinfoItem
4 |
5 | class MyteacherSpider(scrapy.Spider):
6 | name = 'myteacher'
7 | allowed_domains = ['itcast.cn']
8 | # start_urls = ("http://www.itcast.cn/channel/teacher.shtml",) 元组也可以
9 | start_urls = ['http://www.itcast.cn/channel/teacher.shtml#ac',
10 | 'http://www.itcast.cn/channel/teacher.shtml#acloud',
11 | 'http://www.itcast.cn/channel/teacher.shtml#adesign',
12 | 'http://www.itcast.cn/channel/teacher.shtml#ads',
13 | 'http://www.itcast.cn/channel/teacher.shtml#ajavaee',
14 | 'http://www.itcast.cn/channel/teacher.shtml#anetmarket',
15 | 'http://www.itcast.cn/channel/teacher.shtml#aphp',
16 | 'http://www.itcast.cn/channel/teacher.shtml#apm',
17 | 'http://www.itcast.cn/channel/teacher.shtml#apython',
18 | 'http://www.itcast.cn/channel/teacher.shtml#astack',
19 | 'http://www.itcast.cn/channel/teacher.shtml#atest',
20 | 'http://www.itcast.cn/channel/teacher.shtml#aui',
21 | 'http://www.itcast.cn/channel/teacher.shtml#auijp',
22 | 'http://www.itcast.cn/channel/teacher.shtml#aweb']
23 | # 爬虫的约束区域
24 | def parse(self, response):
25 | # 存放老师信息的集合
26 | items = []
27 | print(response.body)
28 | for each in response.xpath("//div[@class='li_txt']"):
29 | # 将我们得到的数据封装到一个 `ItcastItem` 对象
30 | item = TeacherinfoItem()
31 | # extract()方法返回的都是unicode字符串
32 | name = each.xpath("h3/text()").extract()
33 | position = each.xpath("h4/text()").extract()
34 | info = each.xpath("p/text()").extract()
35 |
36 | # xpath返回的是包含一个元素的列表
37 | item['name'] = name[0]
38 | item['position'] = position[0]
39 | item['info'] = info[0]
40 |
41 | items.append(item)
42 | yield item
43 | # 直接返回最后数据
44 | # return items
45 |
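A minimal sketch of how a spider such as `myteacher` is usually launched from a small helper script placed next to `scrapy.cfg` (assuming Scrapy is installed); it is equivalent to running `scrapy crawl myteacher` on the command line, with scraped items flowing through `TeacherinfoPipeline` into `teacher.json`:

from scrapy import cmdline

# Equivalent to typing `scrapy crawl myteacher` in the project directory
cmdline.execute("scrapy crawl myteacher".split())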
--------------------------------------------------------------------------------
/爬虫小demo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/爬虫小demo/.DS_Store
--------------------------------------------------------------------------------
/爬虫小demo/01 taobao.py:
--------------------------------------------------------------------------------
1 | from urllib import request, parse, error
2 | import json
3 | import os
4 | import pymysql
5 | import ssl
6 | # 请求链接需要设置ssl认证
7 | ssl._create_default_https_context = ssl._create_unverified_context
8 |
9 |
10 | class TaoBao():
11 |
12 | def __init__(self):
13 | # 设置头部
14 | self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
15 | # 设置get参数
16 | self.params = {'_input_charset': 'utf-8',
17 | 'q': '',
18 | 'viewFlag': 'A',
19 | 'sortType': 'default',
20 | 'searchStyle': '',
21 | 'searchRegion': 'city',
22 | 'searchFansNum': '',
23 | 'currentPage': '',
24 | 'pageSize': '20'
25 | }
26 | self.url = 'https://mm.taobao.com/tstar/search/tstar_model.do'
27 |
28 |
29 | def get_connect(self):
30 |
31 | self.tablename = 'taobao'
32 | self.db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='test', charset='utf8')
33 | self.cur = self.db.cursor()
34 | self.cur.execute('USE test')
35 | try:
36 | # 创建表
37 | self.cur.execute('CREATE TABLE '+self.tablename+' (id BIGINT(7) NOT NULL AUTO_INCREMENT, name VARCHAR(100), city VARCHAR(20), height VARCHAR(10), weight VARCHAR(10), homepage VARCHAR(100), profile VARCHAR(100), pic VARCHAR(100), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))')
38 | except pymysql.err.InternalError as e:
39 | print(e)
40 | # 修改表字段
41 | self.cur.execute('ALTER DATABASE test CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
42 | self.cur.execute('ALTER TABLE '+self.tablename+' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
43 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE name name VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
44 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE city city VARCHAR(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
45 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE height height VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
46 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE weight weight VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
47 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE homepage homepage VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
48 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE profile profile VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
49 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE pic pic VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
50 |
51 | def insert_table(self,name, city, height, weight, homepage, profile, pic):
52 | self.cur.execute('INSERT INTO '+self.tablename+' (name, city, height, weight, homepage, profile, pic) VALUES (%s, %s, %s, %s, %s, %s, %s)', (name, city, height, weight, homepage, profile, pic))
53 | self.cur.connection.commit()
54 |
55 |
56 | def get_html(self,page):
57 | self.params['currentPage'] = str(page)
58 | # urlencode可以把字典=键值对编码成url地址中get参数
59 | self.param = parse.urlencode(self.params).encode('utf-8')
60 | # data=self.param 上传参数
61 | req = request.Request(self.url, data=self.param, headers=self.headers)
62 | content = request.urlopen(req)
63 | content = json.loads(content.read().decode('gbk'))
64 | if content['status'] == -1:
65 | return -1
66 |
67 | return content
68 |
69 | def parser_json(self,content, page):
70 | meinvist = []
71 | # 解析json数据
72 | data = content['data']['searchDOList']
73 | for list in data:
74 | temp = {}
75 | temp['id'] = str(list['userId'])
76 | temp['name'] = list['realName']
77 | temp['city'] = list['city']
78 | temp['height'] = str(list['height'])
79 | temp['weight'] = str(list['weight'])
80 | temp['favornum'] = str(list['totalFavorNum'])
81 | temp['profile'] = 'http:'+list['avatarUrl']
82 | temp['pic'] = 'http:'+list['cardUrl']
83 |
84 | # meinvist.append(temp)
85 | self.mkdir(temp['name'])
86 | print('%s正在抓取%s'%(page, temp['name']))
87 | self.get_img(temp['profile'], temp['name'], 'profile')
88 | self.get_img(temp['pic'], temp['name'], 'pic')
89 | if not os.path.exists('./'+temp['name']+'/info.txt'):
90 | with open('./'+temp['name']+'/info.txt', 'w') as f:
91 | f.write(temp['name']+'\n')
92 | f.write(temp['city']+'\n')
93 | f.write(temp['height']+'\n')
94 | f.write(temp['weight']+'\n')
95 | # 插入数据库
96 | self.insert_table(temp['name'], temp['city'], temp['height'], temp['weight'], 'https://mm.taobao.com/self/aiShow.htm?userId='+temp['id'], temp['profile'], temp['pic'])
97 | # return meinvist
98 |
99 | # 判断文件夹是否存在
100 | def mkdir(self,path):
101 | if not os.path.exists(path):
102 | os.makedirs(path)
103 | else:
104 | print('目录已存在!')
105 |
106 | # 判断文件是否存在
107 | def get_img(self,url, path, name):
108 | if os.path.exists('./' + path + '/' + name + '.jpg'):
109 | print('文件已存在!')
110 | return 0
111 | try:
112 | req = request.Request(url, headers=self.headers)
113 | reponse = request.urlopen(req)
114 | get_img = reponse.read()
115 | with open('./' + path + '/' + name + '.jpg', 'wb') as fp:
116 | fp.write(get_img)
117 | # 也可以用一下代码实现图片的下载
118 | # request.urlretrieve(img, './' + path + '/' + name + '.jpg')
119 | except error.URLError as e:
120 | print(e.reason)
121 |
122 |
123 |
124 | if __name__ == '__main__':
125 | page = 1
126 | taobao = TaoBao()
127 | taobao.get_connect()
128 | while True:
129 | content = taobao.get_html(page)
130 | if content == -1:
131 | print('抓取完毕!')
132 | exit()
133 | # 解析json
134 | taobao.parser_json(content, page)
135 | page += 1
136 |
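The insert_table method above relies on pymysql's parameter binding: placeholders are written as bare %s (no surrounding quotes) and the driver escapes and quotes each value itself. A minimal, self-contained sketch of the same idea, assuming a local test database is available:

import pymysql

# Assumed local test database; adjust credentials as needed
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='test', charset='utf8mb4')
cur = conn.cursor()
# Bare %s placeholders: pymysql quotes and escapes the values, which also guards against SQL injection
cur.execute('INSERT INTO taobao (name, city) VALUES (%s, %s)', ('小明', '北京'))
conn.commit()
cur.close()
conn.close()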
--------------------------------------------------------------------------------
/爬虫小demo/02 doubanzhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
4 | # from selenium import webdriver
5 | # from selenium.webdriver.common.keys import Keys
6 | # import time
7 | #
8 | # driver = webdriver.PhantomJS(executable_path="/Users/yunmei/phantomjs-2.1.1-macosx/bin/phantomjs")
9 | # driver.get("https://www.douban.com/")
10 | #
11 | # # 输入账号密码
12 | # driver.find_element_by_name("form_email").send_keys("2334497007@qq.com")
13 | # driver.find_element_by_name("form_password").send_keys("lbaiwb1314")
14 | #
15 | # # 模拟点击登录
16 | # driver.find_element_by_xpath("//input[@class='bn-submit']").click()
17 | #
18 | # # 等待3秒
19 | # time.sleep(3)
20 | #
21 | # # 生成登陆后快照
22 | # driver.save_screenshot("douban.png")
23 | #
24 | # with open("douban.html", "w") as file:
25 | # file.write(driver.page_source.encode('utf-8'))
26 | #
27 | # driver.quit()
28 |
29 |
30 | from selenium import webdriver
31 | import time
32 | # 创建浏览器对象
33 | browser=webdriver.PhantomJS(executable_path="/Users/yunmei/phantomjs-2.1.1-macosx/bin/phantomjs")
34 | # 请求加载登录链接
35 | browser.get('https://www.zhihu.com/#signin')
36 | time.sleep(3)
37 | # 模拟点击使用密码登录
38 | browser.find_element_by_css_selector(".signin-switch-password").click()
39 | # 输入账号
40 | browser.find_element_by_css_selector(".account input[name='account']").send_keys('17078075655')
41 | # 输入密码
42 | browser.find_element_by_css_selector(".verification input[name='password']").send_keys('19910825580lb')
43 | # 模拟点击登录
44 | browser.find_element_by_css_selector(".sign-button").click()
45 | time.sleep(3)
46 | # 截图
47 | browser.save_screenshot("zhihu.png")
48 | browser.quit()
--------------------------------------------------------------------------------
/爬虫小demo/03 douYuUnittest.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # python的测试模块
4 | import unittest
5 | from selenium import webdriver
6 | from bs4 import BeautifulSoup
7 |
8 | class douyuSelenium(unittest.TestCase):
9 | # 初始化方法
10 | def setUp(self):
11 | self.driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs")
12 |
13 | #具体的测试用例方法,一定要以test开头
14 | def testDouyu(self):
15 | self.driver.get('http://www.douyu.com/directory/all')
16 | while True:
17 | # 指定xml解析
18 | soup = BeautifulSoup(self.driver.page_source, 'lxml')
19 | # 返回当前页面所有房间标题列表 和 观众人数列表
20 | titles = soup.find_all('h3', attrs={'class': 'ellipsis'})
21 | nums = soup.find_all('span', attrs={'class': 'dy-num fr'})
22 |
23 | # 使用zip()函数来可以把列表合并,并创建一个元组对的列表[(1,2), (3,4)]
24 | for title, num in zip(titles, nums):
25 | print u"房间标题: " + title.get_text().strip(), u"\t观众人数:" + num.get_text().strip()
26 | # page_source.find()未找到内容则返回-1
27 | if self.driver.page_source.find('shark-pager-disable-next') != -1:
28 | break
29 | # 模拟下一页点击
30 | self.driver.find_element_by_class_name('shark-pager-next').click()
31 |
32 | # 退出时的清理方法
33 | def tearDown(self):
34 | print '加载完成...'
35 | self.driver.quit()
36 |
37 | if __name__ == "__main__":
38 | unittest.main()
--------------------------------------------------------------------------------
/爬虫小demo/04 fileHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import csv
3 |
4 | # 1、txt文件
5 | file = open('file.txt','r+')
6 | # 获取所有的信息
7 | print file.read()
8 | file.write("你好")
9 | # 读写后指针在文件末尾,先 seek(0) 回到开头,再获取所有行组成的列表
10 | file.seek(0); print file.readlines()
11 | # 获取第一行(同样先回到文件开头)
12 | file.seek(0); print file.readline()
13 |
14 | # 2、读取csv文件
15 |
16 | writer = csv.writer(open('test.csv','wb'))
17 | writer.writerow(['col1','col2','col3'])
18 | data = [range(3) for i in range(3)]
19 | for item in data:
20 | writer.writerow(item)
21 |
22 | filelist = csv.reader(open('./test.csv','r'))
23 | for item in filelist:
24 | print item
25 |
26 |
27 | # 3、读取xml文件
28 |
29 | from xml.dom import minidom
30 | # parse打开xml文件
31 | dom = minidom.parse("info.xml")
32 | # 获取根节点
33 | root = dom.documentElement
34 | print root.nodeName
35 | print root.nodeValue
36 | print root.nodeType
37 | print root.ELEMENT_NODE
38 | print "--" * 8
39 | province = root.getElementsByTagName("province")
40 | print province[0].tagName
41 | print province[0].getAttribute("username")
42 | print province[0].firstChild.data
43 |
44 |
45 |
46 |
47 |
48 |
49 |
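The minidom section above reads an info.xml file that is not included in this listing; a hypothetical minimal document with the same shape (a root element containing province elements that carry a username attribute and text) makes the attribute and text access clearer:

from xml.dom import minidom

# Hypothetical sample with the structure the code above expects
sample = '''<?xml version="1.0" encoding="utf-8"?>
<country>
    <province username="xiaoming">广东</province>
    <province username="xiaohong">湖南</province>
</country>'''

dom = minidom.parseString(sample.encode('utf-8'))
root = dom.documentElement
province = root.getElementsByTagName("province")
print(province[0].tagName)                   # province
print(province[0].getAttribute("username"))  # xiaoming
print(province[0].firstChild.data)           # 广东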
--------------------------------------------------------------------------------
/爬虫小demo/05 getimage.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2
4 | import lxml.etree
5 |
6 | class GetImage():
7 |
8 | def __init__(self):
9 | self.tieba = "https://tieba.baidu.com"
10 | self.count = 50
11 |
12 | def get_html(self,url):
13 | request = urllib2.Request(url)
14 | response = urllib2.urlopen(request)
15 | html = response.read()
16 | return html
17 |
18 | def get_xpath(self):
19 | # 起始页
20 | beginPage = int(raw_input("请输入起始页:"))
21 | # 结束页
22 | endPage = int(raw_input("请输入结束页:"))
23 | for pagecount in range(beginPage,endPage + 1):
24 | pn = (pagecount - 1) * self.count
25 | urllink = self.tieba + "/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=" + str(pn)
26 | xmlcontent = lxml.etree.HTML(self.get_html(urllink))
27 | # content = xmlcontent.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
28 | # content = xmlcontent.xpath('//div[@class="threadlist_title pull_left j_th_tit "]//a[@class="j_th_tit "]/@href')
29 | content = xmlcontent.xpath('//a[@class="j_th_tit "]/@href')
30 |
31 | for item in content:
32 | itemcontent = lxml.etree.HTML(self.get_html(self.tieba + item))
33 | print self.tieba + item
34 | itemlist = itemcontent.xpath('//img[@class="BDE_Image"]//@src')
35 | for imageitem in itemlist:
36 | get_image = self.get_html(imageitem)
37 | with open("images/" + imageitem[-10:],'a') as file:
38 | file.write(get_image)
39 | file.close
40 |
41 | if __name__ == "__main__":
42 | getImages = GetImage()
43 | getImages.get_xpath()
--------------------------------------------------------------------------------
/爬虫小demo/06 jsload.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from selenium import webdriver
3 | from time import sleep
4 | from selenium.webdriver.common.keys import Keys
5 |
6 | driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs")
7 | driver.get("http://baidu.com/")
8 |
9 | driver.find_element_by_id("kw").send_keys(u"长城")
10 | sleep(10)
11 | driver.find_element_by_id("su").click()
12 |
13 | driver.save_screenshot("长城.png")
14 |
15 |
--------------------------------------------------------------------------------
/爬虫小demo/07 jsondata.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import json
4 | import jsonpath
5 | import urllib2
6 |
7 | class Json():
8 | def __init__(self):
9 | self.url = "http://www.lagou.com/lbs/getAllCitySearchLabels.json"
10 |
11 | def get_json(self):
12 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
13 | request = urllib2.Request(self.url,headers=headers)
14 | response = urllib2.urlopen(request)
15 | html = response.read()
16 | jsonobj = json.loads(html)
17 | # 获取城市名称
18 | namelist = jsonpath.jsonpath(jsonobj,'$..name')
19 | for name in namelist:
20 | print(name)
21 |
22 | # 把列表存储为字符串
23 | nametext = json.dumps(namelist,ensure_ascii=False)
24 | with open('name.txt','a') as file:
25 | file.write(nametext.encode("utf-8"))
26 | file.close
27 |
28 |
29 | if __name__ == "__main__":
30 | jsono = Json()
31 | jsono.get_json()
32 |
--------------------------------------------------------------------------------
/爬虫小demo/08 jsonpath和json总结.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
4 | import json
5 | import jsonpath
6 | import time
7 |
8 | # 1、第一种存储字典和数组
9 |
10 | listDict = [{"city": "北京"},{"name": "小明"}]
11 | strlist = json.dumps(listDict, ensure_ascii=False)
12 | print type(strlist) # <type 'str'>
13 | # 写数据
14 | with open("listDict.json",'w') as file:
15 | file.write(strlist)
16 |
17 | # 2、第二种存储字典和数组
18 | listStr = [{"city": "北京"}, {"name": "大刘"}]
19 | json.dump(listStr, open("listStr.json","w"), ensure_ascii=False)
20 |
21 | dictStr = {"city": "北京", "name": "大刘"}
22 | json.dump(dictStr, open("dictStr.json","w"), ensure_ascii=False)
23 | time.sleep(1)
24 |
25 | # ------------ 从文件里面取数据 ---------
26 |
27 | dictList = json.load(open("listDict.json",'r'))
28 | # 输出北京
29 | print dictList[0]["city"]
30 | # ------------ 读出字典loads ----------
31 | strDict = '{"city": "北京", "name": "大猫"}'
32 | # loads 把字符串转成字典,输出 <type 'dict'>
33 | print type(json.loads(strDict))
34 |
35 | jsonobj = json.loads(strDict)
36 |
37 | # 从根节点开始,匹配name节点
38 | citylist = jsonpath.jsonpath(jsonobj,'$..name')
39 |
40 | print citylist[0].encode('utf-8')
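To make the $..name expression above concrete, a small sketch of a few common JSONPath patterns with the same jsonpath module, run against a made-up nested structure:

import jsonpath

data = {"store": {"book": [{"title": "python", "price": 10},
                           {"title": "scrapy", "price": 20}]}}

print(jsonpath.jsonpath(data, '$.store.book[0].title'))  # ['python']  absolute path from the root
print(jsonpath.jsonpath(data, '$.store.book[*].title'))  # ['python', 'scrapy']  wildcard over the list
print(jsonpath.jsonpath(data, '$..price'))               # [10, 20]  recursive descent, like $..name above
# Note: jsonpath.jsonpath() returns False when nothing matches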
--------------------------------------------------------------------------------
/爬虫小demo/09 zhihu_login.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from bs4 import BeautifulSoup
3 | import requests
4 | import time
5 |
6 |
7 | class Login():
8 | # 模拟登录一般步骤:(1)首先抓包,根据webForm来分析需要传那些data
9 | # (2)分析_xsrf获取
10 | # (3)分析验证码获取方式
11 | # (4)post登录
12 |
13 | def get_login(self):
14 | sess=requests.Session()
15 | # 头部headers需要注意,如果头部没有设置好,下面的步骤就会不能执行成功
16 | headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
17 | # 首先获取登录页面,找到需要get的数据,同时记录cookie的值
18 | html=sess.get('https://www.zhihu.com/#signin',headers=headers).text
19 | # 调用xml解析库
20 | bs=BeautifulSoup(html,'lxml')
21 | # _xsrf作用是跨站请求伪造(或者叫跨域攻击)
22 | _xsrf=bs.find('input',attrs={'name':'_xsrf'}).get('value')
23 | # 通过时间戳拼接验证码链接
24 | captcha_url='https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000)
25 | # 发送验证码请求,获取图片数据流。
26 | captchadata = sess.get(captcha_url, headers=headers).content
27 | text = self.captcha(captchadata)
28 |
29 | data={
30 | '_xsrf':_xsrf,
31 | 'phone_num':'17078075655',# 换成邮箱登录也可
32 | 'password':'lbaiwb1314',
33 | 'captcha':text
34 | }
35 | response=sess.post('https://www.zhihu.com/login/phone_num',data=data,headers=headers)
36 | # print type(response.text)
37 | # 在个人中心请求一下是否真正登录成功
38 | response=sess.get('https://www.zhihu.com/people/liu-tao-98-32/activities',headers=headers)
39 | with open("mylogin.txt", "w") as file:
40 | file.write(response.text.encode("utf-8"))
41 |
42 | def captcha(self,captcha_data):
43 | # 将二进制数据写入到文件中
44 | with open('captcha.jpg','wb')as f:
45 | f.write(captcha_data)
46 | text=raw_input('请输入登录验证码')
47 | return text
48 |
49 | if __name__=='__main__':
50 |
51 | login = Login()
52 | login.get_login()
53 |
--------------------------------------------------------------------------------
/爬虫小demo/10 match.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import re
4 | import urllib2
5 |
6 | class Content:
7 |
8 | def __init__(self):
9 | self.page = 1
10 |
11 | def get_html(self):
12 | # 获取整个网页的html内容
13 | headers = {
14 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"}
15 | url = "http://www.neihan8.com/article/list_5_"+str(self.page)+".html"
16 | request = urllib2.Request(url=url, headers=headers)
17 | response = urllib2.urlopen(request)
18 | html = response.read()
19 | return html
20 |
21 | def get_content(self):
22 | pattern = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)
23 | content_list = pattern.findall(self.get_html())
24 | for content in content_list:
25 | result_content = content.decode('gbk').replace("<p>", "").replace("</p>", "") \
26 | .replace("“", "").replace("<br />", "") \
27 | .replace("”", "").replace("&hellip", "")
28 |
29 | with open("content.txt", "a") as file:
30 | file.write(result_content.encode("utf-8"))
31 | file.close
32 |
33 | if __name__ == "__main__":
34 |
35 | content = Content()
36 | while True:
37 | content.page+=1
38 | print content.page
39 | content.get_content()
40 |
41 | """
42 | r 打开只读文件,该文件必须存在。
43 | r+ 打开可读写的文件,该文件必须存在。
44 | w 打开只写文件,若文件存在则文件长度清为0,即该文件内容会消失。若文件不存在则建立该文件。
45 | w+ 打开可读写文件,若文件存在则文件长度清为零,即该文件内容会消失。若文件不存在则建立该文件。
46 | a 以附加的方式打开只写文件。若文件不存在,则会建立该文件,如果文件存在,写入的数据会被加到文件尾,即文件原先的内容会被保留。
47 | a+ 以附加方式打开可读写的文件。若文件不存在,则会建立该文件,如果文件存在,写入的数据会被加到文件尾后,即文件原先的内容会被保留。
48 | """
--------------------------------------------------------------------------------
/爬虫小demo/11 neihan.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 |
4 | import urllib2
5 | import re
6 |
7 | class Spider:
8 | def __init__(self):
9 | # 初始化起始页位置
10 | self.page = 1
11 | # 爬取开关,如果为True继续爬取
12 | self.switch = True
13 |
14 | def loadPage(self):
15 | """
16 | 作用:下载页面
17 | """
18 | print "正在下载数据...."
19 | url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
20 | headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
21 | request = urllib2.Request(url, headers = headers)
22 | response = urllib2.urlopen(request)
23 |
24 | # 获取每页的HTML源码字符串
25 | html = response.read()
26 | #print html
27 |
28 | # 创建正则表达式规则对象,匹配每页里的段子内容,re.S 表示匹配全部字符串内容
29 | pattern = re.compile('<div class="f18 mb20">(.*?)</div>', re.S)
30 |
31 | # 将正则匹配对象应用到html源码字符串里,返回这个页面里的所有段子的列表
32 | content_list = pattern.findall(html)
33 |
34 | # 调用dealPage() 处理段子里的杂七杂八
35 | self.dealPage(content_list)
36 |
37 | def dealPage(self, content_list):
38 | """
39 | 处理每页的段子
40 | content_list : 每页的段子列表集合
41 | """
42 | for item in content_list:
43 | # 将集合里的每个段子按个处理,替换掉无用数据
44 | item = item.replace("<p>","").replace("</p>", "").replace("<br />", "")
45 | #print item.decode("gbk")
46 | # 处理完后调用writePage() 将每个段子写入文件内
47 | self.writePage(item)
48 |
49 | def writePage(self, item):
50 | """
51 | 把每条段子逐个写入文件里
52 | item: 处理后的每条段子
53 | """
54 | # 写入文件内
55 | print "正在写入数据...."
56 | with open("duanzi.txt", "a") as f:
57 | f.write(item)
58 |
59 | def startWork(self):
60 | """
61 | 控制爬虫运行
62 | """
63 | # 循环执行,直到 self.switch == False
64 | while self.switch:
65 | # 用户确定爬取的次数
66 | self.loadPage()
67 | command = raw_input("如果继续爬取,请按回车(退出输入quit)")
68 | if command == "quit":
69 | # 如果停止爬取,则输入 quit
70 | self.switch = False
71 | # 每次循环,page页码自增1
72 | self.page += 1
73 | print "谢谢使用!"
74 |
75 |
76 | if __name__ == "__main__":
77 | duanziSpider = Spider()
78 | # duanziSpider.loadPage()
79 | duanziSpider.startWork()
80 |
81 |
--------------------------------------------------------------------------------
/爬虫小demo/12 PIL.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding:utf-8 -*-
3 | import pytesseract
4 | from PIL import Image
5 |
6 | # PIL读取与存储图像
7 |
8 | # 1、PIL识别图片上面文字
9 | images = Image.open('test.png')
10 | text = pytesseract.image_to_string(images)
11 | print text
12 |
13 | # 2、PIL保存成灰色图片
14 | # -*- coding: utf-8 -*-
15 | from PIL import Image
16 |
17 | # 打开图像得到一个PIL图像对象
18 | img = Image.open("test.png")
19 | # 将其转为一张灰度图
20 | img = img.convert('L')
21 | # 存储该张图片
22 | try:
23 | img.save("test.png")
24 | except IOError:
25 | print "cannot convert"
26 |
27 |
28 | # 3、PIL生成缩略图
29 | # -*- coding: utf-8 -*-
30 | from PIL import Image
31 |
32 | # 打开图像得到一个PIL图像对象
33 | img = Image.open("test.png")
34 | # 创建最长边为128的缩略图
35 | img.thumbnail((128,128))
36 | # 存储该张图片
37 | try:
38 | img.save("test.png")
39 | except IOError:
40 | print "cannot convert"
41 |
42 |
43 | # 4、PIL调整尺寸与旋转
44 | # -*- coding: utf-8 -*-
45 | from PIL import Image
46 |
47 | # 打开图像得到一个PIL图像对象
48 | img = Image.open("test.png")
49 | # 修改图片大小,参数为一元组
50 | img = img.resize((100,200))
51 | # 使图片逆时针旋转45度
52 | img = img.rotate(45)
53 | # 存储该张图片
54 | try:
55 | img.save("test.png")
56 | except IOError:
57 | print "cannot convert"
58 |
59 |
60 |
--------------------------------------------------------------------------------
/爬虫小demo/13 queryxpath.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2
4 | import json
5 | import lxml.etree
6 | # xpath 模糊查询
7 |
8 | class XpathQuery():
9 | def __init__(self):
10 | self.url = "https://www.qiushibaike.com/"
11 |
12 |
13 | def get_html(self):
14 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
15 | request = urllib2.Request(self.url,headers=headers)
16 | response = urllib2.urlopen(request)
17 | html = response.read()
18 | return html
19 |
20 | def get_xpath(self):
21 | xmlcontent = lxml.etree.HTML(self.get_html())
22 | xmllist = xmlcontent.xpath('//div[contains(@id,"qiushi_tag_")]')
23 | print len(xmllist)
24 | # 分享的地方
25 | sharelist = xmlcontent.xpath('//div[@class="article block untagged mb15 typs_recent"]//div[@class="single-share"]/a/@title')
26 | for item in range(0,4):
27 | print sharelist[item]
28 |
29 | for item in xmllist:
30 | # 用户名
31 | username = item.xpath('.//div[@class="author clearfix"]/a/h2/text()')
32 | # 标题
33 | title = item.xpath('.//a/div[@class="content"]/span/text()')[0]
34 |
35 | with open('title.txt','a') as file:
36 | file.write(title.encode("utf-8"))
37 | file.close
38 | with open('username.txt','a') as file:
39 | if len(username) == 0:
40 | file.write("匿名用户")
41 | else:
42 | file.write(username[0].encode("utf-8"))
43 |
44 | # 好笑数
45 | votecount = item.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')[0]
46 | print "好笑数:" + votecount
47 | # 评论数
48 | commentcount = item.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')[0]
49 | print "评论数:" + commentcount
50 | # 放在一个字典里进行存储
51 | dic = {
52 | "username":username,
53 | "votecount":votecount,
54 | "commentcount":commentcount,
55 | "title": title,
56 | }
57 | with open('qiushi.json','a') as file:
58 | file.write(json.dumps(dic,ensure_ascii=False).encode("utf-8") + '\n')
59 | file.close
60 |
61 |
62 | if __name__ == "__main__":
63 | xpathq = XpathQuery()
64 | xpathq.get_xpath()
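The contains(@id,"qiushi_tag_") predicate above is XPath's substring matching; a minimal lxml sketch on a made-up snippet showing contains() and the related starts-with():

import lxml.etree

html = lxml.etree.HTML('<div id="qiushi_tag_1">a</div><div id="qiushi_tag_2">b</div><div id="other">c</div>')
# contains(): the attribute value contains the given substring
print(html.xpath('//div[contains(@id, "qiushi_tag_")]/text()'))   # ['a', 'b']
# starts-with(): the attribute value begins with the given prefix
print(html.xpath('//div[starts-with(@id, "qiushi")]/text()'))     # ['a', 'b']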
--------------------------------------------------------------------------------
/爬虫小demo/14 selenium执行js.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from selenium import webdriver
4 | import time
5 | driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs")
6 | driver.get("https://www.baidu.com/")
7 |
8 | # 给搜索输入框标红的javascript脚本
9 | js = "var q=document.getElementById(\"kw\");q.style.border=\"2px solid red\";"
10 |
11 | # 调用给搜索输入框标红js脚本
12 | driver.execute_script(js)
13 |
14 | # 查看页面快照
15 | driver.save_screenshot("redbaidu.png")
16 |
17 | # js隐藏元素,将获取的图片元素隐藏
18 | img = driver.find_element_by_xpath("//div[@id='lg']/img")
19 | driver.execute_script('$(arguments[0]).fadeOut()',img)
20 |
21 | # 向下滚动到页面底部
22 | # driver.execute_script("$('.scroll_top').click(function(){$('html,body').animate({scrollTop: '0px'}, 800);});")
23 | time.sleep(1)
24 | # 查看页面快照
25 | driver.save_screenshot("wubaidu.png")
26 |
27 | driver.quit()
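execute_script can also return a value computed in the page and scroll without relying on jQuery; a short sketch reusing the driver above (it would have to run before driver.quit()):

# Return values from the page back to Python
title = driver.execute_script("return document.title;")
height = driver.execute_script("return document.body.scrollHeight;")
print(title, height)
# Scroll to the bottom using plain DOM APIs instead of jQuery
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")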
--------------------------------------------------------------------------------
/爬虫小demo/15 tencent.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from bs4 import BeautifulSoup
4 | import urllib2
5 |
6 | class Tencent():
7 | def __init__(self):
8 | self.url = 'http://hr.tencent.com/position.php?&start=10#a'
9 |
10 | def get_html(self):
11 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
12 | request = urllib2.Request(self.url,headers=headers)
13 | html = urllib2.urlopen(request)
14 | return html
15 |
16 | def get_content(self):
17 | techlist = []
18 | soup = BeautifulSoup(self.get_html(),'lxml')
19 | positionlist = soup.select('.l > a')
20 | even = soup.select('.even')
21 | odd = soup.select('.odd')
22 | techlist = even + odd  # 合并偶数行和奇数行(下面分别遍历 even 和 odd)
23 |
24 | for position in positionlist:
25 | with open("position.txt",'a') as file:
26 | file.write(position.string.encode("utf-8") + "\n")
27 | file.close
28 |
29 | for technology in even:
30 | with open("technology.txt",'a') as file:
31 | file.write("" + technology.select('td')[1].string.encode("utf-8"))
32 | file.write(" 人数:" + technology.select('td')[2].string.encode("utf-8"))
33 | file.write(" 地点:" + technology.select('td')[3].string.encode("utf-8"))
34 | file.write(" 时间:" + technology.select('td')[4].string.encode("utf-8") + "\n")
35 | file.close
36 |
37 | for technology in odd:
38 | with open("technology.txt",'a') as file:
39 | file.write("" + technology.select('td')[1].string.encode("utf-8"))
40 | file.write(" 人数:" + technology.select('td')[2].string.encode("utf-8"))
41 | file.write(" 地点:" + technology.select('td')[3].string.encode("utf-8"))
42 | file.write(" 时间:" + technology.select('td')[4].string.encode("utf-8") + "\n")
43 | file.close
44 |
45 | # items = {} 也可以这么存储数据到文件
46 | # items["name"] = name
47 | # str = json.dumps(items, ensure_ascii=False)
48 | # output.write(line.encode('utf-8'))
49 | # output.close()
50 | if __name__ == "__main__":
51 | tencent = Tencent()
52 | tencent.get_content()
--------------------------------------------------------------------------------
/爬虫小demo/16 xunmall.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2,os
4 | import lxml.etree
5 |
6 | class Xunmall():
7 | def __init__(self):
8 | self.url = "http://www.xunmall.com"
9 |
10 | def get_html(self,p1 = ""):
11 | # headers = {
12 | # "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"}
13 | request = urllib2.Request(self.url + p1)
14 | response = urllib2.urlopen(request)
15 | html = response.read()
16 | return html
17 |
18 | def get_xpath(self):
19 | xmlcontent = lxml.etree.HTML(self.get_html())
20 | xmllist = xmlcontent.xpath('//h2[@class="floor_name"]/text()')
21 |
22 | for item in xmllist:
23 | with open('title.txt','a') as file:
24 | file.write(item.encode('utf-8') + '\n')
25 | file.close
26 |
27 |
28 | def get_image(self):
29 | xmlimage = lxml.etree.HTML(self.get_html())
30 | imagelist = xmlimage.xpath('//div[@class="color_top"]/img/@src')
31 | if os.path.isdir('./imgs'):
32 | pass
33 | else:
34 | os.mkdir("./imgs")
35 | for item in imagelist:
36 | # print self.url + item
37 | with open('imgs/' + (self.url + item)[-8:],'a+') as file:
38 | file.write(self.get_html(item))
39 | file.close
40 |
41 | def get_theme(self):
42 | xmltheme = lxml.etree.HTML(self.get_html())
43 | themelist = xmltheme.xpath('//h3[@class="floor_theme"]/text()')
44 |
45 | for item in themelist:
46 | with open('theme.txt','a') as file:
47 | file.write(item.encode('utf-8') + '\n')
48 | file.close
49 |
50 | sloganlist = xmltheme.xpath('//p[@class="slogan"]/text()')
51 | for item in sloganlist:
52 | with open('theme.txt','a') as file:
53 | file.write(item.encode('utf-8') + '\n')
54 | file.close
55 |
56 | give_outlist = xmltheme.xpath('//p[@class="give_out"]/text()')
57 | for item in give_outlist:
58 | with open('theme.txt', 'a') as file:
59 | file.write(item.encode('utf-8') + '\n')
60 | file.close
61 |
62 | def get_html1(self,p2):
63 | request = urllib2.Request(p2)
64 | response = urllib2.urlopen(request)
65 | html = response.read()
66 | return html
67 |
68 | # 食品标题和图片
69 | def foodImageTitle(self):
70 | foodImage = lxml.etree.HTML(self.get_html())
71 | foodImageList = foodImage.xpath('//div[@class="pro_image"]/img/@src')
72 |
73 | if os.path.isdir('./foodimage'):
74 | pass
75 | else:
76 | os.mkdir("./foodimage")
77 | for item in foodImageList:
78 | # print item
79 | with open('foodimage/' + item[-20:],'a+') as file:
80 | file.write(self.get_html1(item))
81 | file.close
82 |
83 | # 每个零食的详细信息(标题、图片、副标题)
84 | def detail(self):
85 | detailLink = lxml.etree.HTML(self.get_html())
86 | detailLinkList = detailLink.xpath('//div[@class="nth_floor first_floor"]/div[@class="goods_box"]/ul[@class="item_list"]//a/@href')
87 | for item in detailLinkList:
88 | # print item[-18:]
89 | detailUrl = lxml.etree.HTML(self.get_html("/" + item[-18:]))
90 | detailImageList = detailUrl.xpath(
91 | '//div[@class="info-panel panel1"]/img/@src')
92 |
93 | for detailitem in detailImageList:
94 | # print '正在下载详情图片'
95 |
96 | if os.path.isdir('./' + item[-18:-5]):
97 | pass
98 | else:
99 | os.mkdir("./" + item[-18:-5])
100 |
101 | with open(item[-18:-5] + '/' + detailitem[-9:], 'a+') as file:
102 | file.write(self.get_html1(detailitem))
103 | file.close
104 | # 商品标题
105 | detailtitleList = detailUrl.xpath(
106 | '//div[@class="col-lg-7 item-inner"]//h1[@class="fl"]/text()')
107 |
108 | for title in detailtitleList:
109 | with open('foodtitle.txt', 'a+') as file:
110 | file.write(title.encode('utf-8') + '\n')
111 | file.close
112 | # 商品编号
113 | goodnumberList = detailUrl.xpath(
114 | '//div[@class="col-lg-7 item-inner"]//li[@class="col-lg-5 col-md-5"]/text()')
115 | for number in goodnumberList:
116 | # print number
117 | if os.path.isdir('./qrcoder'):
118 | pass
119 | else:
120 | os.mkdir("./qrcoder")
121 |
122 | with open('qrcoder', 'a+') as file:
123 | file.write(number.encode('utf-8') + '\n')
124 | file.close
125 |
126 |
127 | # 商品二维码:data_code
128 | coderImageList = detailUrl.xpath('//div[@class="clearfixed"]//div[@class="barcode fr"]/img/@data_code')
129 |
130 | for item in coderImageList:
131 | # print item
132 | with open('goodnumber.txt', 'a+') as file:
133 | file.write(item + '\n')
134 | file.close
135 |
136 |
137 |
138 |
139 | if __name__ == "__main__":
140 | # 获取分类标题
141 | xunmall = Xunmall()
142 | # xunmall.get_xpath()
143 | # 获取图片
144 | # xunmall.get_image()
145 | # 图片上面的标题
146 | # xunmall.get_theme()
147 | # 休闲食品标题和图片
148 | # xunmall.foodImageTitle()
149 | xunmall.detail()
--------------------------------------------------------------------------------
/爬虫小demo/17 zhihulogin.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2
4 | import lxml.etree
5 | class Login():
6 | def __init__(self):
7 | self.url = "https://www.zhihu.com/#signin"
8 |
9 | def get_html(self):
10 | # headers = {
11 | # "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"}
12 | request = urllib2.Request(self.url)
13 | response = urllib2.urlopen(request)
14 | html = response.read()
15 | return html
16 |
17 | def get_xpath(self):
18 | # print self.get_html()
19 | xmlcontent = lxml.etree.HTML(self.get_html())
20 | xmllist = xmlcontent.xpath('//div[@class="view view-signin"]/form/input/@value')
21 |
22 | for item in xmllist:
23 | print item
24 | with open('title.txt','a') as file:
25 | file.write(item.encode('utf-8') + '\n')
26 | file.close
27 |
28 |
29 | if __name__ == "__main__":
30 | login = Login()
31 | login.get_xpath()
--------------------------------------------------------------------------------
/爬虫小demo/18 github_login.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | '''
3 | 模拟Github登陆步骤:
4 | 1、请求头:self.headers,请求url
5 | 2、设置session,保存登陆信息cookies,生成github_cookie文件
6 | 3、POST表单提交,请求数据格式post_data
7 | 4、authenticity_token获取
8 | 5、在个人中心验证判断是否登陆成功,输出个人中心信息即登陆成功
9 |
10 | '''
11 |
12 | import requests
13 | from lxml import etree
14 | try:
15 | import cookielib
16 | except:
17 | import http.cookiejar as cookielib
18 |
19 | class GithubLogin():
20 |
21 | def __init__(self):
22 | # url
23 | self.loginUrl = 'https://github.com/login'
24 | self.postUrl = 'https://github.com/session'
25 | self.profileUrl = 'https://github.com/settings/profile'
26 |
27 | # 设置请求头
28 | self.headers = {
29 | 'Referer': 'https://github.com/',
30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
31 | 'Host': 'github.com'
32 | }
33 |
34 | # 设置session
35 | self.session = requests.session()
36 | # 生成github_cookie文件
37 | self.session.cookies = cookielib.LWPCookieJar(filename='github_cookie')
38 |
39 | '''
40 | 登陆时表单提交参数
41 | Form Data:
42 | commit:Sign in
43 | utf8:✓
44 | authenticity_token:yyZprIm4aghZ0u7r25ymZjisfTjGdUAdDowD9fKHM0oUvHD1WjUHbn2sW0Cz1VglZWdGno543jod2M8+jwLv6w==
45 | login:*****
46 | password:******
47 |
48 | '''
49 | def post_account(self, email, password):
50 | post_data = {
51 | 'commit': 'Sign in',
52 | 'utf8': '✓',
53 | 'authenticity_token': self.get_token()[0],
54 | 'login': email,
55 | 'password': password
56 | }
57 | response = self.session.post(self.postUrl, data=post_data, headers=self.headers)
58 | # 保存cookies
59 | self.session.cookies.save()
60 |
61 | def load_cookie(self):
62 | try:
63 | self.session.cookies.load(ignore_discard=True)
64 | except:
65 | print('cookie 获取不成功')
66 |
67 | # 获取authenticity_token
68 | def get_token(self):
69 | response = self.session.get(self.loginUrl, headers=self.headers)
70 | html = etree.HTML(response.text)
71 | authenticity_token = html.xpath('//div/input[2]/@value')
72 | print(authenticity_token)
73 | return authenticity_token
74 |
75 | # 判断是否登陆成功
76 | def isLogin(self):
77 | self.load_cookie()
78 | response = self.session.get(self.profileUrl, headers=self.headers)
79 | selector = etree.HTML(response.text)
80 | flag = selector.xpath('//div[@class="column two-thirds"]/dl/dt/label/text()')
81 | info = selector.xpath('//div[@class="column two-thirds"]/dl/dd/input/@value')
82 | textarea = selector.xpath('//div[@class="column two-thirds"]/dl/dd/textarea/text()')
83 | # 登陆成功返回来的个人设置信息
84 | print(u'个人设置Profile标题: %s'%flag)
85 | print(u'个人设置Profile内容: %s'%info)
86 | print(u'个人设置Profile内容: %s'%textarea)
87 |
88 | if __name__ == "__main__":
89 | github = GithubLogin()
90 | # 输入自己email账号和密码
91 | github.post_account(email='******', password='******')
92 | # 验证是否登陆成功
93 | github.isLogin()
--------------------------------------------------------------------------------
/爬虫小demo/19 jd_login.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 |
7 | class JDlogin():
8 | def __init__(self, username, password):
9 | self.session = requests.session()
10 | self.loginUrl = "http://passport.jd.com/uc/login"
11 | self.postUrl = "http://passport.jd.com/uc/loginService"
12 | self.authUrl = "https://passport.jd.com/uc/showAuthCode"
13 | self.username = username
14 | self.password = password
15 |
16 | # 设置请求头
17 | self.headers = {
18 | 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
19 | }
20 |
21 | def get_authcode(self, url):
22 | self.headers['Host'] = 'authcode.jd.com'
23 | self.headers['Referer'] = 'https://passport.jd.com/uc/login'
24 | response = self.session.get(url, headers=self.headers)
25 | with open('codeimage.jpg', 'wb') as f:
26 | f.write(response.content)
27 | authcode = input("请输入验证码:")
28 | return authcode
29 |
30 | def get_info(self):
31 | data = {}  # 先初始化,防止请求异常时 finally 返回未定义的变量
32 | try:
33 | # 登陆请求
34 | html = self.session.get(self.loginUrl, headers=self.headers)
35 | soup = BeautifulSoup(html.text,"lxml")
36 | inputList = soup.select('.form input')
37 | print(inputList)
38 | data = {}
39 | data['uuid'] = inputList[0]['value']
40 | data['eid'] = inputList[4]['value']
41 | data['fp'] = inputList[5]['value']
42 | data['_t'] = inputList[6]['value']
43 | rstr = inputList[7]['name']
44 | data[rstr] = inputList[7]['value']
45 | acRequired = self.session.post(self.authUrl, data={
46 | 'loginName': self.username}).text
47 |
48 | if 'true' in acRequired:
49 |
50 | acUrl = soup.select('.form img')[0]['src2']
51 | acUrl = 'http:{}&yys={}'.format(acUrl, str(int(time.time() * 1000)))
52 | authcode = self.get_authcode(acUrl)
53 | data['authcode'] = authcode
54 | else:
55 | data['authcode'] = ''
56 |
57 | except Exception as e:
58 | print(e)
59 | finally:
60 | return data
61 |
62 | def jd_login(self):
63 |
64 | data = self.get_info()
65 | # Form表单提交数据
66 | # 1、loginname、nloginpwd、loginpwd是在网页中input属性值name,作为表单值提交到登陆请求
67 | # 2、在此处也可以用selenium来进行给输入框(登陆账号、登陆密码)进行赋值
68 |
69 | data['loginname'] = self.username
70 | data['nloginpwd'] = self.password
71 | data['loginpwd'] = self.password
72 | try:
73 | self.headers['Host'] = 'passport.jd.com'
74 | html = self.session.post(self.postUrl, data=data, headers=self.headers)
75 | # 在这里可以判断请求是否判断成功不成功
76 | print(html.text)
77 | except Exception as e:
78 | print(e)
79 |
80 |
81 | if __name__ == "__main__":
82 | # 在下面输入账号名、密码
83 | jdlogin = JDlogin("******", "******")
84 | jdlogin.jd_login()
85 |
--------------------------------------------------------------------------------
/爬虫小demo/20 下载网易云歌词.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import json
7 | import re
8 | from urllib import request
9 |
10 | # 1、获取网页
11 | def get_html(url):
12 | headers = {
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47',
14 | 'Referer': 'http://music.163.com/',
15 | 'Host': 'music.163.com'
16 | }
17 |
18 | try:
19 | response = requests.get(url, headers=headers)
20 | html = response.text
21 | return html
22 | except:
23 | print('request error')
24 |
25 | def get_text(song_id):
26 | url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(song_id) + '&lv=1&kv=1&tv=-1'
27 | html = get_html(url)
28 | json_obj = json.loads(html)
29 | text = json_obj['lrc']['lyric']
30 | regex = re.compile(r'\[.*\]')
31 | finalLyric = re.sub(regex, '', text).strip()
32 | return finalLyric
33 |
34 | def write_text(song_name,text):
35 | print("正在写入歌曲:{}".format(song_name))
36 | with open("{}.txt".format(song_name),'a',encoding='utf-8') as fp:
37 | fp.write(text)
38 |
39 | def getSingerInfo(html):
40 | soup = BeautifulSoup(html, 'lxml')
41 | links = soup.find('ul', class_='f-hide').find_all('a')
42 | song_IDs = []
43 | song_names = []
44 | for link in links:
45 | song_ID = link.get('href').split('=')[-1]
46 | song_name = link.get_text()
47 | song_IDs.append(song_ID)
48 | song_names.append(song_name)
49 | return zip(song_names, song_IDs)
50 |
51 | def downloadSong(songName,songId):
52 | singer_url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songId)
53 | print('正在下载歌曲:{}'.format(songName))
54 | request.urlretrieve(singer_url,'{}.mp3'.format(songName))
55 |
56 |
57 |
58 | if __name__ == "__main__":
59 | singerId = input("请输入歌手的ID:")
60 | startUrl = "http://music.163.com/artist?id={}".format(singerId)
61 | html = get_html(startUrl)
62 | singerInfos = getSingerInfo(html)
63 |
64 | for singerInfo in singerInfos:
65 | print(singerInfo[1],singerInfo[0])
66 | text = get_text(singerInfo[1])
67 | # 下载歌曲文本
68 | write_text(singerInfo[0],text)
69 | # 下载歌曲mp3
70 | downloadSong(singerInfo[0],singerInfo[1])
71 |
72 |
73 |
74 |
75 |
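A worked example of the timestamp-stripping regex used in get_text(); the sample lyric lines are made up, and each [mm:ss.xxx] prefix is removed by re.sub:

import re

sample = "[00:12.570]第一句歌词\n[00:17.340]第二句歌词"
print(re.sub(r'\[.*\]', '', sample).strip())
# 第一句歌词
# 第二句歌词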
--------------------------------------------------------------------------------
/爬虫小demo/21 TaoBaoInfo.py:
--------------------------------------------------------------------------------
1 | from urllib import request
2 | import re, os, datetime
3 | from selenium import webdriver
4 | import ssl
5 |
6 | ssl._create_default_https_context = ssl._create_unverified_context
7 |
8 |
9 | class TaoBaoInfo:
10 | def __init__(self):
11 | self.dirName = 'MyTaoBaoInfo'
12 | self.driver = webdriver.PhantomJS(executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs')
13 | self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
14 |
15 | # 获取页面内容提取
16 | def getPageContent(self, page):
17 |
18 | url = "https://mm.taobao.com/json/request_top_list.htm?page=" + str(page)
19 | response = request.Request(url, headers = self.headers)
20 | response = request.urlopen(response)
21 |
22 | # 正则获取
23 | pattern_link = re.compile(r'<div class="list-item">.*?'
24 | r'<img src="(.*?)".*?'
25 | r'<a class="lady-name" href="(.*?)".*?>'
26 | r'(.*?)</a>'
27 | , re.S)
28 | items = re.findall(pattern_link, response.read().decode('gbk'))
29 |
30 | for item in items:
31 | # 详情页面:头像,个人详情,名字,年龄,地区
32 |
33 | detailPage = item[1]
34 | name = item[2]
35 | self.getDetailPage(detailPage, name)
36 |
37 | def getDetailPage(self, url, name):
38 | url = 'http:' + url
39 | self.driver.get(url)
40 | base_msg = self.driver.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li')
41 | brief = ''
42 | for item in base_msg:
43 | print(item.text)
44 | brief += item.text + '\n'
45 |
46 | icon_url = self.driver.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]//img')
47 | icon_url = icon_url.get_attribute('src')
48 | dir = self.dirName + '/' + name
49 | self.mkdir(dir)
50 | # 保存头像
51 | try:
52 | self.saveIcon(icon_url, dir, name)
53 | except Exception as e:
54 | print(u'保存头像失败 %s' % (e))
55 |
56 | # 开始跳转相册列表
57 | images_url = self.driver.find_element_by_xpath('//ul[@class="mm-p-menu"]//a')
58 | images_url = images_url.get_attribute('href')
59 | try:
60 | self.getAllImage(images_url, name)
61 | except Exception as e:
62 | print(u'获取所有相册异常 %s' % e)
63 |
64 | try:
65 | self.saveBrief(brief,dir, name)
66 |
67 | except Exception as e:
68 | print(u'保存个人信息失败 %s' % e)
69 |
70 | # 保存个人信息
71 | def saveBrief(self, content,dir, name):
72 | fileName = dir + '/' + name + '.txt'
73 | with open(fileName,'w+') as file:
74 | file.write(content)
75 | print(u'下载完成' + '\n' + '\n')
76 | # 获取所有图片
77 | def getAllImage(self, images_url, name):
78 | self.driver.get(images_url)
79 | # 只获取第一个相册
80 | photos = self.driver.find_element_by_xpath('//div[@class="mm-photo-cell-middle"]//h4/a')
81 | photos_url = photos.get_attribute('href')
82 | # 进入相册页面获取相册内容
83 | self.driver.get(photos_url)
84 | images_all = self.driver.find_elements_by_xpath('//div[@id="mm-photoimg-area"]/a/img')
85 |
86 | self.saveImgs(images_all, name)
87 |
88 | def saveImgs(self, images, name):
89 | index = 1
90 |
91 | for imageUrl in images:
92 | splitPath = imageUrl.get_attribute('src').split('.')
93 | fTail = splitPath.pop()
94 | if len(fTail) > 3:
95 | fTail = "jpg"
96 | fileName = self.dirName + '/' + name + '/' + name + str(index) + "." + fTail
97 | self.saveImg(imageUrl.get_attribute('src'), fileName)
98 | index += 1
99 |
100 | def saveIcon(self, url, dir, name):
101 | splitPath = url.split('.')
102 | fTail = splitPath.pop()
103 | fileName = dir + '/' + name + '.' + fTail
104 | print(fileName)
105 | self.saveImg(url, fileName)
106 |
107 | # 写入图片
108 | def saveImg(self, imageUrl, fileName):
109 | print(imageUrl)
110 | u = request.urlopen(imageUrl)
111 | data = u.read()
112 | f = open(fileName, 'wb')
113 | f.write(data)
114 | f.close()
115 |
116 |
117 | # 创建目录
118 | def mkdir(self, path):
119 | path = path.strip()
120 | print(u'正在下载 %s 个人信息' % path)
121 | if os.path.exists(path):
122 | return False
123 | else:
124 | os.makedirs(path)
125 | return True
126 |
127 | if __name__ == "__main__":
128 | taoBaoInfo = TaoBaoInfo()
129 | # 输入需要下载的页数
130 | page = input("请输入要下载的页数:")
131 | for index in range(1, int(page) + 1):
132 | taoBaoInfo.getPageContent(index)
133 |
--------------------------------------------------------------------------------
/爬虫小demo/22 JDPython.py:
--------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from lxml import etree
4 |
5 | driver = webdriver.PhantomJS(executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs')
6 |
7 |
8 | # 获取第一页的数据
9 | def get_html():
10 | url = "https://detail.tmall.com/item.htm?id=531993957001&skuId=3609796167425&user_id=268451883&cat_id=2&is_b=1&rn=71b9b0aeb233411c4f59fe8c610bc34b"
11 | driver.get(url)
12 | time.sleep(5)
13 | driver.execute_script('window.scrollBy(0,3000)')
14 | time.sleep(2)
15 | driver.execute_script('window.scrollBy(0,5000)')
16 | time.sleep(2)
17 |
18 | # 累计评价
19 | btnNext = driver.find_element_by_xpath('//*[@id="J_TabBar"]/li[3]/a')
20 | btnNext.click()
21 | html = driver.page_source
22 | return html
23 |
24 |
25 | def get_comments(html):
26 | source = etree.HTML(html)
27 | commens = source.xpath("//*[@id='J_TabBar']/li[3]/a/em/text()")
28 | print('评论数:', commens)
29 | # 每页20条评论,将评论总数换算成需要翻页的页数
30 | commens = (int(commens[0]) / 20) + 1
31 | # 获取到总页数
32 | print('总页数:', int(commens))
33 | return int(commens)
34 |
35 |
36 | def parse_html(html):
37 | html = etree.HTML(html)
38 | commentlist = html.xpath("//*[@class='rate-grid']/table/tbody")
39 | for comment in commentlist:
40 | # 评论
41 | vercomment = comment.xpath(
42 | "./tr/td[@class='tm-col-master']/div[@class='tm-rate-content']/div[@class='tm-rate-fulltxt']/text()")
43 | # 机器类型
44 | verphone = comment.xpath("./tr/td[@class='col-meta']/div[@class='rate-sku']/p[@title]/text()")
45 | print(vercomment)
46 | print(verphone)
47 | # 用户(头尾各一个字,中间用****代替)
48 | veruser = comment.xpath("./tr/td[@class='col-author']/div[@class='rate-user-info']/text()")
49 | print(veruser)
50 |
51 |
52 | def next_button_work(num):
53 | if num != 0:
54 | driver.execute_script('window.scrollBy(0,3000)')
55 | time.sleep(2)
56 | try:
57 | driver.find_element_by_css_selector('#J_Reviews > div > div.rate-page > div > a:last-child').click()
58 | except Exception as e:
59 | print(e)
60 |
61 | time.sleep(2)
62 | driver.execute_script('window.scrollBy(0,3000)')
63 | time.sleep(2)
64 | driver.execute_script('window.scrollBy(0,5000)')
65 | time.sleep(2)
66 | html = driver.page_source
67 | parse_html(html)
68 |
69 |
70 | def selenuim_work(html):
71 | parse_html(html)
72 | next_button_work(1)
73 | pass
74 |
75 |
76 | def gettotalpagecomments(comments):
77 | html = get_html()
78 | for i in range(0, comments):
79 | selenuim_work(html)
80 |
81 |
82 | data = get_html()
83 | # 得到评论
84 | commens = get_comments(data)
85 | # 根据评论内容进行遍历
86 | gettotalpagecomments(commens)
87 |
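The fixed time.sleep() calls above can be replaced by explicit waits, which return as soon as the element is present; a sketch using Selenium's WebDriverWait with the same driver and the J_TabBar element id used above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)  # wait at most 10 seconds
tab = wait.until(EC.presence_of_element_located((By.ID, "J_TabBar")))
print(tab.text)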
--------------------------------------------------------------------------------
/爬虫小demo/23 tuchongnet.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import rsa
4 | import binascii
5 | import requests
6 | from base64 import b64decode
7 | import sys
8 | reload(sys)
9 | sys.setdefaultencoding('utf8')
10 |
11 | class LBTuChongNet(object):
12 | def __init__(self):
13 | self.loginUrl = "https://tuchong.com/rest/accounts/login"
14 | self.userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
15 | self.headers = {
16 | 'user-agent': self.userAgent
17 | }
18 | #pubkey 在页面的js中: http://static.tuchong.net/js/pc/page/welcome_6e7f1cd.js
19 |
20 | self.key = "D8CC0180AFCC72C9F5981BDB90A27928672F1D6EA8A57AF44EFFA7DAF6EFB17DAD9F643B9F9F7A1F05ACC2FEA8DE19F023200EFEE9224104627F1E680CE8F025AF44824A45EA4DDC321672D2DEAA91DB27418CFDD776848F27A76E747D53966683EFB00F7485F3ECF68365F5C10C69969AE3D665162D2EE3A5BA109D7DF6C7A5"
21 | self.session = requests.session()
22 |
23 | def get_crypt_password(self,message):
24 | rsaPublickey = int(self.key, 16)
25 | key = rsa.PublicKey(rsaPublickey, 65537)
26 | password = rsa.encrypt(message, key)
27 | password = binascii.b2a_hex(password)
28 | return password
29 |
30 | def get_captcha(self):
31 | captchaUrl="https://tuchong.com/rest/captcha/image"
32 |
33 | rsp = self.session.post(captchaUrl, data = None, headers = self.headers).json()
34 | captcha_id = rsp['captchaId']
35 | captcha_base64 = rsp['captchaBase64']
36 | captcha_base64 = captcha_base64.replace("data:image/png;base64,","")
37 | with open("lbcaptcha.png",'w') as f:
38 | f.write(b64decode(captcha_base64))
39 | captcha = raw_input(u'输入当前目录下 lbcaptcha.png 上的验证码:')
40 | return captcha_id,captcha
41 |
42 | def login(self,username,password):
43 |
44 | passwd_crypt = self.get_crypt_password(password)
45 | postdata = {
46 | 'account': username,
47 | 'password': passwd_crypt,
48 | }
49 | rsp = self.session.post(self.loginUrl, data = postdata, headers = self.headers)
50 | rsp = rsp.json()
51 | print(rsp)
52 | #登录成功
53 | if rsp.has_key('result') and rsp['result'] == "SUCCESS":
54 | print(rsp['message'])
55 | return
56 |
57 | #登录失败
58 | if rsp.has_key('code') and rsp.has_key('message'):
59 | print("response code:%d, message:%s"%(rsp['code'],rsp['message']))
60 | if rsp['message'].find("验证码") >= 0:
61 | print(rsp['message'])
62 | captcha = self.get_captcha()
63 | postdata = {
64 | 'account': username,
65 | 'password': passwd_crypt,
66 | 'captcha_id': captcha[0],
67 | 'captcha_token': int(captcha[1])
68 | }
69 | rsp = self.session.post(self.loginUrl, data = postdata, headers = self.headers)
70 | if rsp.status_code == 200:
71 | print("登陆成功!")
72 |
73 |
74 | if __name__ == '__main__':
75 | # 图虫网验证
76 | lbtuchongnet = LBTuChongNet()
77 | username = raw_input(u'请输入图虫网用户名:')
78 | password = raw_input(u'请输入图虫网密码:')
79 | lbtuchongnet.login(username,password)
80 |
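The script above is written for Python 2 (raw_input, dict.has_key). For reference, a minimal Python 3 sketch of the same password-encryption step, assuming the same hex modulus string and public exponent 65537:

# Python 3 sketch of get_crypt_password: rsa.encrypt needs bytes, and the
# hex-encoded ciphertext is returned as a str.
import binascii
import rsa

def get_crypt_password_py3(message, key_hex):
    pub_key = rsa.PublicKey(int(key_hex, 16), 65537)
    ciphertext = rsa.encrypt(message.encode('utf-8'), pub_key)
    return binascii.b2a_hex(ciphertext).decode('ascii')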
--------------------------------------------------------------------------------
/爬虫小demo/25 PythonItChat.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | '''
3 | itchat: capture technical articles shared to a group chat or to an individual
4 | (0) Get familiar with itchat (https://www.cnblogs.com/Chenjiabing/p/6907397.html)
5 | (1) Scanning the itchat QR code too often gets the account blocked from QR-code login for a while.
6 | (2) itchat: for an article shared to a group or a friend, extract the article link, title, cover image and body.
7 | (3) Crawl the article body through the extracted link.
8 | (4) The receiver (ToUserName) and the sender (FromUserName) are told apart by their unique IDs.
9 | (5) python itchat hot login (itchat.auto_login(hotReload=True))
10 | (6) Use xpath to extract the article title and the images inside the article.
11 | (7) Set up a web server environment (XAMPP on the Mac).
12 | (8) Use pymysql to create the database and columns automatically and save the content into them.
13 | (9) How to use navicat.
14 | (10) How to use the related Python modules.
15 | '''
16 |
17 | # Crawl articles shared in WeChat groups or by friends
18 | # Listen for articles shared by WeChat official accounts
19 |
20 | import itchat
21 | # import全部消息类型
22 | from itchat.content import *
23 | import urllib2
24 | import lxml.etree
25 | import os
26 | import pymysql
27 | import uuid
28 | import json
29 | # 连接数据库
30 | table_cms_news = 'cms_news'
31 | table_cms_news_pic = 'cms_news_pic'
32 | # db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8')
33 | db = pymysql.connect(host='127.0.0.1', user='root', passwd='djs@12316', db='fz_afmcms', charset='utf8')
34 | cur = db.cursor()
35 |
36 | # Handle messages shared by an individual
37 | # Covers text, location, card, note and sharing (MsgType 49, sharing, is the important one)
38 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING])
39 | def text_reply(msg):
40 | print msg
41 | # In WeChat, every user and group chat is identified by a long unique ID
42 | if msg["MsgType"] == 49:
43 | print "个人分享文章地址链接Url:" + "---------------------------"
44 |
45 | xmlcontent = lxml.etree.HTML(get_html(msg["Url"]))
46 | print xmlcontent
47 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()')
48 |
49 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src')
50 | # 下载图片
51 | source = xmlcontent.xpath('//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
52 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')
53 | print "来源"
54 | print source, time
55 | # 下载图片
56 | print "下载图片"
57 | # print imgArray
58 | # print title[0]
59 | get_image(title, imgArray, source, time,msg["Url"])
60 |
61 | print msg["Url"]
62 | print "个人分享文章类型编号MsgType:" + "---------------------------"
63 | print msg["MsgType"]
64 | print "个人分享Content:" + "---------------------------"
65 | print msg["Content"]
66 | print "个人分享FromUserName:" + "---------------------------"
67 | print msg["FromUserName"]
68 | print "个人分享ToUserName:" + "---------------------------"
69 | print msg["ToUserName"]
70 | print "个人分享链接标题FileName:" + "---------------------------"
71 | print msg["FileName"]
72 |
73 | print "------------个人"
74 | # The lookup only works for messages exchanged with the logged-in account; anything else returns nothing
75 | print itchat.search_friends(userName=msg['FromUserName'])['NickName']
76 | print itchat.search_friends(userName=msg['ToUserName'])['NickName']
77 |
78 | else:
79 | print "不是个人分享的文章"
80 |
81 |
82 | # 处理群聊消息
83 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING], isGroupChat=True)
84 | def text_reply(msg):
85 | print msg
86 | if msg["MsgType"] == 49:
87 | print "群聊分享文章地址链接Url:" + "---------------------------"
88 | print msg["Url"]
89 |
90 | xmlcontent = lxml.etree.HTML(get_html(msg["Url"]))
91 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()')
92 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src')
93 | # 来源
94 | source = xmlcontent.xpath('//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
95 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')
96 | print "来源"
97 | print source,time
98 | # 下载图片
99 | print "下载图片"
100 | # print imgArray
101 | # print title[0]
102 | get_image(title,imgArray,source,time,msg["Url"])
103 |
104 | # print "群聊分享文章类型编号MsgType:" + "---------------------------"
105 | # print msg["MsgType"]
106 | # print "群聊分享Content:" + "---------------------------"
107 | # print msg["Content"]
108 | # print "群聊分享FromUserName:" + "---------------------------"
109 | # print msg["FromUserName"]
110 | # print "群聊分享ToUserName:" + "---------------------------"
111 | # print msg["ToUserName"]
112 | # print "群聊分享链接标题FileName:" + "---------------------------"
113 | # print msg["FileName"]
114 | print "-------------群--------"
115 | # itchat.send('%s: %s : %s' % (msg['Type'], msg['Text'], msg['Url']), msg['FromUserName'])
116 |
117 | print msg['FromUserName']
118 | print msg['ToUserName']
119 | # this receiver ID changes on every QR-code login
120 | receiver = "@4603e5cb2e47b710bba6fd15dfa3ace9ef3be0f3c80b812e0cc97cd7a71b7c96"
121 | if msg['FromUserName'] == receiver:
122 | print "----------- 自己在群里发的文章 ------------"
123 | # 自己在群里发的文章
124 | print "昵称:"
125 | print itchat.search_friends(userName=msg['FromUserName'])['NickName']
126 | print " ----------- "
127 | print "群名称:"
128 | print itchat.search_chatrooms(userName=msg['ToUserName'])['NickName']
129 | chatRoomName = "呵呵各地"
130 | # if itchat.search_chatrooms(userName=msg['ToUserName'])['NickName'] == chatRoomName:
131 | # pass
132 | # else:
133 | # pass
134 |
135 | else:
136 | # 群友发的文章
137 | print "----------- 群友发的文章 -----------"
138 | print "昵称:"
139 | print msg['ActualNickName']
140 | print " ----------- "
141 | print "群名称:"
142 | print itchat.search_chatrooms(userName=msg['FromUserName'])['NickName']
143 | chatRoomName = "呵呵各地"
144 | # if itchat.search_chatrooms(userName=msg['FromUserName'])['NickName'] == chatRoomName:
145 | # pass
146 | # else:
147 | # pass
148 | else:
149 | print "不是群聊分享的文章"
150 | # return msg['Text']
151 |
152 |
153 | # 处理微信公众号消息
154 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING], isMpChat=True)
155 | def text_reply(msg):
156 | print msg
157 | print itchat.search_mps(name='PythonCoder')[0]["NickName"]
158 | if msg["MsgType"] == 49:
159 | print "监听到制定微信公众号分享的文章链接:"
160 | print msg["Url"]
161 | else:
162 | print "微信公众号分享的不是文章"
163 |
164 | # 获取网页内容
165 | def get_html(url):
166 | request = urllib2.Request(url)
167 | response = urllib2.urlopen(request)
168 | html = response.read()
169 | return html
170 |
171 | # 下载图片
172 | def get_image(title,imgArray,source,time,linkurl):
173 | print "标题"
174 | result = cur.execute("SELECT news_url FROM " + table_cms_news + " WHERE news_url=%s", (linkurl,))
175 | print(str(result) + '------------url-----------')
176 |
177 | if result:
178 | print("数据库里面存在此数据")
179 | else:
180 | if os.path.isdir('./imgs'):
181 | pass
182 | else:
183 | os.mkdir("./imgs")
184 | for item in imgArray:
185 | with open('imgs/' + (item)[-30:].replace('/','-') + ".png", 'wb') as file:
186 | file.write(get_html(item))
187 |
188 | ima_dic = {}
189 | news_pic = ""
190 | news_pic_s = ""
191 | news_pic_t = ""
192 |
193 | if len(imgArray) == 0:
194 | pass
195 | else:
196 | # 文章图片
197 | for index, item in enumerate(imgArray):
198 | ima_dic[index] = item
199 | if len(imgArray) == 0:
200 | pass
201 | elif len(imgArray) == 1:
202 | news_pic = imgArray[0]
203 | elif len(imgArray) == 2:
204 | news_pic = imgArray[0]
205 | news_pic_s = imgArray[1]
206 | elif len(imgArray) == 3:
207 | news_pic = imgArray[0]
208 | news_pic_s = imgArray[1]
209 | news_pic_t = imgArray[2]
210 | new_id = str(uuid.uuid1()).strip().replace("-", "")
211 | titleString = ""
212 | if len(title) == 0:
213 | pass
214 | else:
215 | titleString = title[0].strip().replace("\n", "")
216 | cur.execute(
217 | 'INSERT INTO ' + table_cms_news_pic + ' (news_id,pic_url,pic_desc) VALUES (%s,%s,%s)',
218 | (new_id, json.dumps(ima_dic,ensure_ascii=False),""))
219 | cur.execute(
220 | 'INSERT INTO ' + table_cms_news + ' (news_open_type,news_id,news_title,news_type,com_id,'\
221 | 'news_column_code1,news_column_name1,'\
222 | 'news_column_code2,news_column_name2,news_desc,news_pic,'\
223 | 'news_pic_s,news_pic_t,news_pic_is_show,'\
224 | 'news_content,news_source,news_cuser_name,'\
225 | 'news_ctime,news_url,news_status,view_count,platid) '\
226 | 'VALUES (%s,%s, %s,%s,%s, %s,%s,%s,%s, %s,%s, %s,%s,%s,'\
227 | ' %s,%s, %s,%s,%s,%s,%s,%s)',
228 | ('1',new_id,titleString,'1','1','1','微信转发','1','分类1','news_desc',news_pic,news_pic_s,
229 | news_pic_t,'1','news_content',source[0].strip().replace("\n", ""),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", ""),linkurl,
230 | '1',200,'weixin'))
231 |
232 | # cur.execute(
233 | # 'INSERT INTO ' + table_cms_news + ' (title,url, img,source,time) VALUES (%s, %s,%s,%s, %s)',
234 | # (title[0].strip().replace("\n", ""),linkurl, json.dumps(imgArray, ensure_ascii=False),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", "")))
235 | cur.connection.commit()
236 | print("------------------------ 插入成功 ----------------------------------")
237 |
238 | # 连接数据库
239 | def get_connect():
240 |
241 | try:
242 | # 创建表
243 | cur.execute(
244 | 'CREATE TABLE ' + table_cms_news + ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000),url VARCHAR(10000), img VARCHAR(1000), source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))')
245 | except pymysql.err.InternalError as e:
246 | print(e)
247 | # 修改表字段
248 | cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
249 | cur.execute(
250 | 'ALTER TABLE ' + table_cms_news + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
251 | cur.execute(
252 | 'ALTER TABLE ' + table_cms_news + ' CHANGE title title VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
253 | cur.execute(
254 | 'ALTER TABLE ' + table_cms_news + ' CHANGE url url VARCHAR(10000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
255 | cur.execute(
256 | 'ALTER TABLE ' + table_cms_news + ' CHANGE img img VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
257 | cur.execute(
258 | 'ALTER TABLE ' + table_cms_news + ' CHANGE source source VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
259 | cur.execute(
260 | 'ALTER TABLE ' + table_cms_news + ' CHANGE time time VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
261 |
262 |
263 | # Hot login (keeps the session for a while without having to rescan the QR code)
264 | get_connect()
265 | print "哈哈"
266 | itchat.auto_login(hotReload=True)
267 | # With the handlers registered, start itchat and listen for messages
268 | itchat.run()
269 |
270 |
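Item (6) of the header docstring covers the xpath extraction of a shared article. A standalone sketch of that step (Python 3 with requests for brevity; the selectors are the ones used in the handlers above and may change on mp.weixin.qq.com):

# Sketch of the xpath extraction used by the handlers above.
import requests
import lxml.etree

def extract_article(url):
    html = requests.get(url, timeout=10).text
    tree = lxml.etree.HTML(html)
    titles = tree.xpath('//h2[@class="rich_media_title"]/text()')
    images = tree.xpath('//img[@data-type="png"]/@data-src')
    title = titles[0].strip() if titles else ""
    return title, images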
--------------------------------------------------------------------------------
/爬虫小demo/26 PythonWeChat.py:
--------------------------------------------------------------------------------
1 | #coding=utf8
2 | import pickle
3 | import wechatsogou
4 | import urllib2
5 | import lxml.etree
6 | import os
7 | import pymysql
8 | import json
9 |
10 | # A file into which the titles of successfully sent articles are pickled, so repeated runs do not send duplicates
11 | file_path = 'sent_articles_file'
12 |
13 | ws_api = wechatsogou.WechatSogouAPI()
14 |
15 | # 连接数据库
16 | tablename = 'pythonwechat'
17 | db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8')
18 | cur = db.cursor()
19 | cur.execute('USE itchat')
20 |
21 | # 获取公众号文章信息
22 | def get_article(gzh):
23 | articles = ws_api.get_gzh_article_by_history(gzh)
24 | print(len(articles['article']))
25 | return articles['article']
26 |
27 | # 获取网页内容
28 | def get_html(url):
29 | request = urllib2.Request(url)
30 | response = urllib2.urlopen(request)
31 | html = response.read()
32 | return html
33 |
34 | # 下载图片
35 | def get_image(title,imgArray,source,time):
36 | if os.path.isdir('./imgs'):
37 | pass
38 | else:
39 | os.mkdir("./imgs")
40 | for item in imgArray:
41 | with open('imgs/' + (item)[-30:].replace('/','-') + ".png", 'wb') as file:
42 | file.write(get_html(item))
43 |
44 |
45 | cur.execute(
46 | 'INSERT INTO ' + tablename + ' (title, img,source,time) VALUES (%s, %s,%s, %s)',
47 | (title[0].strip().replace("\n", ""), json.dumps(imgArray, ensure_ascii=False),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", "")))
48 | cur.connection.commit()
49 | print title[0]
50 | print("------------------------ 插入成功 ----------------------------------")
51 |
52 | # 连接数据库
53 | def get_connect():
54 |
55 | try:
56 | # 创建表
57 | cur.execute(
58 | 'CREATE TABLE ' + tablename + ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000), img VARCHAR(1000), source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))')
59 | except pymysql.err.InternalError as e:
60 | print(e)
61 | # 修改表字段
62 | cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
63 | cur.execute(
64 | 'ALTER TABLE ' + tablename + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
65 | cur.execute(
66 | 'ALTER TABLE ' + tablename + ' CHANGE title title VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
67 | cur.execute(
68 | 'ALTER TABLE ' + tablename + ' CHANGE img img VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
69 | cur.execute(
70 | 'ALTER TABLE ' + tablename + ' CHANGE source source VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
71 | cur.execute(
72 | 'ALTER TABLE ' + tablename + ' CHANGE time time VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
73 |
74 |
75 | if '__main__' == __name__:
76 |
77 | get_connect()
78 |
79 | # 定义一个公众号列表
80 | gzh_list = ['技术最前线', 'python', '全民独立经纪人', '程序视界', '非著名程序员']
81 |
82 | for gzh in gzh_list:
83 | # Before querying the official account, deserialize the list of already-sent articles from the file
84 | if os.path.exists(file_path):
85 | f = open(file_path, 'rb')
86 | sent_list = pickle.load(f)
87 | f.close()
88 | articles = get_article(gzh)
89 | for article in articles:
90 | print(article['title'],'\n\t' ,article['content_url'])
91 |
92 | xmlcontent = lxml.etree.HTML(get_html(article['content_url']))
93 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()')
94 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src')
95 | # 来源
96 | source = xmlcontent.xpath(
97 | '//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
98 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')
99 | print "来源、时间"
100 | print source, time
101 | # 下载图片
102 | print "下载图片"
103 | get_image(title, imgArray, source, time)
104 |
105 |
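The comment at the top says already-sent article titles are pickled to a file so repeated runs skip them, but the loop above only loads the list and never checks or updates it. A minimal sketch of the intended round trip, under that assumption (load_sent/save_sent are hypothetical helpers):

# Hedged sketch of the dedup described by the file_path comment above.
import os
import pickle

def load_sent(path='sent_articles_file'):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return set(pickle.load(f))
    return set()

def save_sent(titles, path='sent_articles_file'):
    with open(path, 'wb') as f:
        pickle.dump(sorted(titles), f)

# usage inside the article loop:
#   sent = load_sent()
#   if article['title'] in sent: continue
#   ... handle the article ...
#   sent.add(article['title']); save_sent(sent)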
--------------------------------------------------------------------------------
/爬虫小demo/27 PythonWordCloud.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | import os
3 | from pyecharts import WordCloud
4 | # 词云
5 | def pythonWordCloud(x,y,label):
6 | wordcloud = WordCloud(width=1300, height=620)
7 | wordcloud.add("", x, y, word_size_range=[20, 100],shape="triangle-forward")
8 | wordcloud.render()
9 | os.system(r"render.html")
10 | x = [
11 | 'PythonCoder', '爬虫', '人工智能', '大数据', 'Django',
12 | 'Flask', '机器学习', '数据分析', '深度学习', '运维测试', 'TensorFlow',
13 | '真实面试经历', '真实面试题', '自然语言处理', 'NLP',"数据处理",
14 | '500GB资料免费送', '开放源码', '免费学习群', '面试简历', 'JCSON']
15 | y = [
16 | 10000, 6181, 4386, 4055, 2467, 2244, 1898, 1484, 1112,
17 | 965, 847, 582, 555, 550, 462, 366, 360, 282, 273, 265,5000]
18 |
19 | pythonWordCloud(x,y,"词云")
20 |
21 |
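os.system(r"render.html") opens the rendered page only where the shell resolves the file through its association (typically Windows). A portable sketch of the same step using the standard library:

# Sketch: open the pyecharts output in the default browser on any platform.
import os
import webbrowser

webbrowser.open('file://' + os.path.realpath('render.html'))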
--------------------------------------------------------------------------------
/爬虫小demo/28 PythonCheHui.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | # WeChat: recover messages revoked by friends or by group-chat members
4 | # Note: recoverable types include text, voice, video, pictures, locations, cards, shares and attachments
5 |
6 | import itchat
7 | from itchat.content import *
8 | import sys
9 | import time
10 | import re
11 | import os
12 |
13 | reload(sys)
14 | sys.setdefaultencoding('utf8')
15 |
16 | msg_information = {}
17 | # 针对表情包的内容
18 | face_bug = None
19 |
20 | @itchat.msg_register([TEXT,PICTURE,FRIENDS,CARD,MAP,SHARING,RECORDING,ATTACHMENT,VIDEO],isFriendChat=True,isGroupChat=True)
21 | def receive_msg(msg):
22 | global face_bug
23 | # 接收消息的时间
24 | msg_time_rec = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
25 | if msg.has_key('ActualNickName'):
26 | # 群消息的发送者,用户的唯一标识
27 | from_user = msg['ActualUserName']
28 | # 发送者群内的昵称
29 | msg_from = msg['ActualNickName']
30 | # 获取所有好友
31 | friends = itchat.get_friends(update=True)
32 | for f in friends:
33 | # 如果群消息是好友发的
34 | if from_user == f['UserName']:
35 | # 优先使用好友的备注名称,没有则使用昵称
36 | if f['RemarkName']:
37 | msg_from = f['RemarkName']
38 | else:
39 | msg_from = f['NickName']
40 | break
41 | # 获取所有的群
42 | groups = itchat.get_chatrooms(update=True)
43 | for g in groups:
44 | # 根据群消息的FromUserName匹配是哪个群
45 | if msg['FromUserName'] == g['UserName']:
46 | group_name = g['NickName']
47 | group_menbers = g['MemberCount']
48 | break
49 | group_name = group_name + "(" + str(group_menbers) +")"
50 | else:
51 | # 优先使用备注名称
52 | if itchat.search_friends(userName=msg['FromUserName'])['RemarkName']:
53 | msg_from = itchat.search_friends(userName=msg['FromUserName'])['RemarkName']
54 | else:
55 | # 在好友列表中查询发送信息的好友昵称
56 | msg_from = itchat.search_friends(userName=msg['FromUserName'])['NickName']
57 | group_name = ""
58 | # 信息发送的时间
59 | msg_time = msg['CreateTime']
60 | # 每条信息的id
61 | msg_id = msg['MsgId']
62 | # 储存信息的内容
63 | msg_content = None
64 | # 储存分享的链接,比如分享的文章和音乐
65 | msg_share_url = None
66 | # 如果发送的消息是文本或者好友推荐
67 | if msg['Type'] == 'Text' or msg['Type'] == 'Friends':
68 | msg_content = msg['Text']
69 |
70 | # 如果发送的消息是附件、视频、图片、语音
71 | elif msg['Type'] == "Attachment" or msg['Type'] == "Video" \
72 | or msg['Type'] == 'Picture' \
73 | or msg['Type'] == 'Recording':
74 | # 内容就是他们的文件名
75 | msg_content = msg['FileName']
76 | # 下载文件
77 | msg['Text'](str(msg_content))
78 | # 如果消息为分享的位置信息
79 | elif msg['Type'] == 'Map':
80 | x, y, location = re.search(
81 | "<location x=\"(.*?)\" y=\"(.*?)\".*label=\"(.*?)\".*",
82 | msg['OriContent']).group(1, 2, 3)
83 | if location is None:
84 | msg_content = r"纬度->" + x.__str__() + " 经度->" + y.__str__()
85 | else:
86 | msg_content = r"" + location
87 | # 如果消息为分享的音乐或者文章,详细的内容为文章的标题或者是分享的名字
88 | elif msg['Type'] == 'Sharing':
89 | msg_content = msg['Text']
90 | # 记录分享的url
91 | msg_share_url = msg['Url']
92 | face_bug = msg_content
93 | # 将信息存储在字典中,每一个msg_id对应一条信息
94 | msg_information.update(
95 | {
96 | msg_id: {
97 | "msg_from": msg_from,
98 | "msg_time": msg_time,
99 | "msg_time_rec": msg_time_rec,
100 | "msg_type": msg["Type"],
101 | "msg_content": msg_content,
102 | "msg_share_url": msg_share_url,
103 | "group_name":group_name
104 | }
105 | }
106 | )
107 |
108 | # Listen for message-revoke notifications
109 | # Registering with the decorator below would deliver the notice 4 times
110 | # @itchat.msg_register(NOTE,isFriendChat=True,isGroupChat=True,isMpChat=True)
111 |
112 | # Listen for message-revoke notifications
113 | # Registering with the decorator below delivers the notice once
114 | @itchat.msg_register(NOTE)
115 | def information(msg):
116 | # If msg['Content'] contains a revoke notice with a message id, handle it below
117 | if '撤回了一条消息' in msg['Content']:
118 | # find the id of the revoked message in the notification content
119 | old_msg_id = re.search("\<msgid\>(.*?)\<\/msgid\>", msg['Content']).group(1)
120 | # 获取到消息原文
121 | old_msg = msg_information.get(old_msg_id)
122 | # 如果发送的是表情包
123 | if len(old_msg_id)<11:
124 | # 发送撤回的提示给文件助手
125 | itchat.send_file(face_bug,toUserName='filehelper')
126 | # 把暂时存储的信息可以删除掉,也可以选择不删除
127 | # os.remove(face_bug)
128 | else:
129 | msg_body = old_msg.get('group_name') + old_msg.get('msg_from') +"\n" + old_msg.get('msg_time_rec') \
130 | + "撤回了:" + "\n" + r"" + old_msg.get('msg_content')
131 |
132 | # 如果是分享的文件被撤回了,那么就将分享的url加在msg_body中发送给文件助手
133 | if old_msg['msg_type'] == "Sharing":
134 | msg_body += "\n链接是:" + old_msg.get('msg_share_url')
135 | print msg_body
136 | # 将撤回消息发给文件助手
137 | itchat.send_msg(msg_body, toUserName='filehelper')
138 |
139 | # 有文件的话也要将文件发送回去
140 | if old_msg["msg_type"] == "Picture" \
141 | or old_msg["msg_type"] == "Recording" \
142 | or old_msg["msg_type"] == "Video" \
143 | or old_msg["msg_type"] == "Attachment":
144 | file = '@fil@%s' % (old_msg['msg_content'])
145 | itchat.send(msg=file, toUserName='filehelper')
146 | # 把暂时存储的信息可以删除掉,也可以选择不删除
147 | os.remove(old_msg['msg_content'])
148 | # 删除字典旧消息
149 | msg_information.pop(old_msg_id)
150 |
151 | itchat.auto_login(hotReload=True)
152 | itchat.run()
153 |
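The regex at line 119 above pulls the revoked message id out of the notification XML. The next script (29 PythonCeHui.py) does the same with ElementTree; a minimal sketch of that approach for comparison:

# Sketch: parse the revoke notification XML instead of matching it with a regex.
# The content may need html-unescaping first, as 29 PythonCeHui.py does.
from xml.etree import ElementTree as ETree

def revoked_msg_id(content_xml):
    tree = ETree.fromstring(content_xml)
    revoked = tree.find('revokemsg')
    if revoked is None:
        return None
    return revoked.find('msgid').text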
--------------------------------------------------------------------------------
/爬虫小demo/29 PythonCeHui.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os, re, shutil, time, collections, json
3 |
4 | from html.parser import HTMLParser
5 | from xml.etree import ElementTree as ETree
6 |
7 | import itchat
8 | from itchat.content import *
9 |
10 | msg_store = collections.OrderedDict()
11 | timeout = 600
12 | sending_type = {'Picture': 'img', 'Video': 'vid'}
13 | data_path = 'data'
14 | nickname = ''
15 | bot = None
16 |
17 | if __name__ == '__main__':
18 | if not os.path.exists(data_path):
19 | os.mkdir(data_path)
20 | # if the QR code doesn't show correctly, you can try to change the value
21 | # of enableCmdQR to 1 or -1 or -2. If nothing works, you can change it to
22 | # enableCmdQR=True and a picture will show up.
23 | bot = itchat.new_instance()
24 | bot.auto_login(hotReload=True, enableCmdQR=2)
25 | nickname = bot.loginInfo['User']['NickName']
26 |
27 | def clear_timeouted_message():
28 | now = time.time()
29 | count = 0
30 | for k, v in list(msg_store.items()):
31 | if now - v['ReceivedTime'] > timeout:
32 | count += 1
33 | else:
34 | break
35 | for i in range(count):
36 | item = msg_store.popitem(last=False)
37 |
38 | def get_sender_receiver(msg):
39 | sender = nickname
40 | receiver = nickname
41 | if msg['FromUserName'][0:2] == '@@': # group chat
42 | sender = msg['ActualNickName']
43 | m = bot.search_chatrooms(userName=msg['FromUserName'])
44 | if m is not None:
45 | receiver = m['NickName']
46 | elif msg['ToUserName'][0:2] == '@@': # group chat by myself
47 | if 'ActualNickName' in msg:
48 | sender = msg['ActualNickName']
49 | else:
50 | m = bot.search_friends(userName=msg['FromUserName'])
51 | if m is not None:
52 | sender = m['NickName']
53 | m = bot.search_chatrooms(userName=msg['ToUserName'])
54 | if m is not None:
55 | receiver = m['NickName']
56 | else: # personal chat
57 | m = bot.search_friends(userName=msg['FromUserName'])
58 | if m is not None:
59 | sender = m['NickName']
60 | m = bot.search_friends(userName=msg['ToUserName'])
61 | if m is not None:
62 | receiver = m['NickName']
63 | return HTMLParser().unescape(sender), HTMLParser().unescape(receiver)
64 |
65 | def print_msg(msg):
66 | msg_str = ' '.join(msg)
67 | print(msg_str)
68 | return msg_str
69 |
70 | def get_whole_msg(msg, download=False):
71 | sender, receiver = get_sender_receiver(msg)
72 | if len(msg['FileName']) > 0 and len(msg['Url']) == 0:
73 | if download: # download the file into data_path directory
74 | fn = os.path.join(data_path, msg['FileName'])
75 | msg['Text'](fn)
76 | if os.path.getsize(fn) == 0:
77 | return []
78 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), fn)
79 | else:
80 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), msg['FileName'])
81 | return ['[%s]->[%s]:' % (sender, receiver), c]
82 | c = msg['Text']
83 | if len(msg['Url']) > 0:
84 | try: # handle map label
85 | content_tree = ETree.fromstring(msg['OriContent'])
86 | if content_tree is not None:
87 | map_label = content_tree.find('location')
88 | if map_label is not None:
89 | c += ' ' + map_label.attrib['poiname']
90 | c += ' ' + map_label.attrib['label']
91 | except:
92 | pass
93 | url = HTMLParser().unescape(msg['Url'])
94 | c += ' ' + url
95 | return ['[%s]->[%s]: %s' % (sender, receiver, c)]
96 |
97 | @bot.msg_register([TEXT, PICTURE, MAP, CARD, SHARING, RECORDING,
98 | ATTACHMENT, VIDEO, FRIENDS], isFriendChat=True, isGroupChat=True)
99 | def normal_msg(msg):
100 | print_msg(get_whole_msg(msg))
101 | now = time.time()
102 | msg['ReceivedTime'] = now
103 | msg_id = msg['MsgId']
104 | msg_store[msg_id] = msg
105 | clear_timeouted_message()
106 |
107 | @bot.msg_register([NOTE], isFriendChat=True, isGroupChat=True)
108 | def note_msg(msg):
109 | print_msg(get_whole_msg(msg))
110 | content = HTMLParser().unescape(msg['Content'])
111 | try:
112 | content_tree = ETree.fromstring(content)
113 | except Exception:
114 | # invite/remove to chatroom
115 | return
116 | if content_tree is None:
117 | return
118 | revoked = content_tree.find('revokemsg')
119 | if revoked is None:
120 | return
121 | old_msg_id = revoked.find('msgid').text
122 | old_msg = msg_store.get(old_msg_id)
123 | if old_msg is None:
124 | return
125 | msg_send = get_whole_msg(old_msg, download=True)
126 | for m in msg_send:
127 | bot.send(m, toUserName='filehelper')
128 | clear_timeouted_message()
129 |
130 | if __name__ == '__main__':
131 | bot.run()
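
HTMLParser().unescape() used above was deprecated and removed in Python 3.9; on newer interpreters the same calls can go through html.unescape. A minimal sketch:

# Sketch: drop-in replacement for HTMLParser().unescape on Python 3.9+.
from html import unescape

sender = unescape('&lt;nickname&gt;')  # '<nickname>'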
--------------------------------------------------------------------------------
/爬虫小demo/30 PythonZhuanFa.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | reload(sys)
4 | sys.setdefaultencoding('UTF8')
5 |
6 | import os, re, shutil, time, collections, json
7 | import requests
8 | from HTMLParser import HTMLParser
9 | from xml.etree import ElementTree as ETree
10 | import hashlib
11 |
12 | import itchat
13 | from itchat.content import *
14 |
15 | sending_type = {'Picture': 'img', 'Video': 'vid'}
16 | data_path = 'data'
17 | group_uin = {u'技术群1': '42235582@chatroom',
18 | u'技术群2': '2424504406@chatroom',
19 | u'技术群3': '6203978346@chatroom'}
20 | publishers = {u'技术群1': u'[阴险]',
21 | u'技术群2': u'[菜刀]',
22 | u'技术群3': u'[月亮]'}
23 | subscribers = [u'技术群1', u'技术群2', u'技术群3']
24 | nickname = ''
25 | bot = None
26 | as_chat_bot = True
27 |
28 | if __name__ == '__main__':
29 | if not os.path.exists(data_path):
30 | os.mkdir(data_path)
31 | # if the QR code doesn't show correctly, you can try to change the value
32 | # of enableCmdQR to 1 or -1 or -2. If nothing works, you can change it to
33 | # enableCmdQR=True and a picture will show up.
34 | bot = itchat.new_instance()
35 | bot.auto_login(hotReload=True, enableCmdQR=2)
36 | nickname = bot.loginInfo['User']['NickName']
37 |
38 | # tuling chat bot
39 | def talks_robot(info):
40 | api_url = 'http://www.tuling123.com/openapi/api'
41 | apikey = ''
42 | data = {'key': apikey, 'info': info.lower()}
43 | req = requests.post(api_url, data=data, timeout=10).text
44 | replys = json.loads(req)['text']
45 | return replys
46 |
47 | def get_sender_receiver(msg):
48 | sender = nickname
49 | receiver = nickname
50 | if msg['FromUserName'][0:2] == '@@': # group chat
51 | sender = msg['ActualNickName']
52 | m = bot.search_chatrooms(userName=msg['FromUserName'])
53 | if m is not None:
54 | receiver = m['NickName']
55 | elif msg['ToUserName'][0:2] == '@@': # group chat by myself
56 | if 'ActualNickName' in msg:
57 | sender = msg['ActualNickName']
58 | else:
59 | m = bot.search_friends(userName=msg['FromUserName'])
60 | if m is not None:
61 | sender = m['NickName']
62 | m = bot.search_chatrooms(userName=msg['ToUserName'])
63 | if m is not None:
64 | receiver = m['NickName']
65 | else: # personal chat
66 | m = bot.search_friends(userName=msg['FromUserName'])
67 | if m is not None:
68 | sender = m['NickName']
69 | m = bot.search_friends(userName=msg['ToUserName'])
70 | if m is not None:
71 | receiver = m['NickName']
72 | return HTMLParser().unescape(sender), HTMLParser().unescape(receiver)
73 |
74 | def print_msg(msg):
75 | msg_str = ' '.join(msg)
76 | print msg_str
77 | return msg_str
78 |
79 | def get_whole_msg(msg, prefix, download=False):
80 | if len(msg['FileName']) > 0 and len(msg['Url']) == 0:
81 | if download: # download the file into data_path directory
82 | fn = os.path.join(data_path, msg['FileName'])
83 | msg['Text'](fn)
84 | if os.path.getsize(fn) == 0:
85 | return []
86 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), fn)
87 | else:
88 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), msg['FileName'])
89 | return ['%s:' % (prefix), c]
90 | c = msg['Text']
91 | if len(msg['Url']) > 0:
92 | if len(msg['OriContent']) > 0:
93 | try: # handle map label
94 | content_tree = ETree.fromstring(msg['OriContent'])
95 | if content_tree is not None:
96 | map_label = content_tree.find('location')
97 | if map_label is not None:
98 | c += ' ' + map_label.attrib['poiname']
99 | c += ' ' + map_label.attrib['label']
100 | except:
101 | pass
102 | url = HTMLParser().unescape(msg['Url'])
103 | c += ' ' + url
104 | return ['%s: %s' % (prefix, c)]
105 |
106 | @bot.msg_register([TEXT], isFriendChat=True, isGroupChat=False)
107 | def personal_msg(msg):
108 | global as_chat_bot
109 | text = msg['Text'].strip()
110 | if text == u'闭嘴':
111 | as_chat_bot = False
112 | if text == u'张嘴吃药':
113 | as_chat_bot = True
114 | return talks_robot(text)
115 |
116 | @bot.msg_register([FRIENDS])
117 | def accept_friend(msg):
118 | bot.add_friend(msg['RecommendInfo']['UserName'], 3)
119 |
120 | @bot.msg_register([TEXT, PICTURE, MAP, SHARING, RECORDING, ATTACHMENT, VIDEO],
121 | isFriendChat=False, isGroupChat=True)
122 | def group_msg(msg):
123 | # chat bot functionality
124 | global as_chat_bot
125 | if 'IsAt' in msg and msg['IsAt'] == True and \
126 | msg['Type'] == 'Text' and \
127 | msg['ToUserName'][0:2] != '@@' and \
128 | msg['Text'].find(u'@' + nickname) >= 0:
129 | text = msg['Text'].replace(u'@' + nickname, '').strip()
130 | if text == u'shit':
131 | as_chat_bot = False
132 | return
133 | if as_chat_bot:
134 | info = talks_robot(text)
135 | if info.find('No Know') >= 0:
136 | return
137 | if info.find('No Can') >= 0:
138 | return
139 | if info.find('Sorry') >= 0:
140 | return
141 | return info
142 | return
143 | # forwarding functionality
144 | group = msg['FromUserName']
145 | if msg['ToUserName'][0:2] == '@@': # message sent by myself
146 | group = msg['ToUserName']
147 | sender, receiver = get_sender_receiver(msg)
148 | if sender == '':
149 | sender = nickname
150 | # check if the message is from the publisher groups
151 | if receiver not in publishers: # if not in the publishers, do nothing
152 | return
153 | # turn on the chat bot if this magic happens
154 | if msg['Type'] == 'Text' and \
155 | hashlib.sha256(msg['Text']).hexdigest()[-2:] == '23':
156 | as_chat_bot = True
157 | # process message and send it to all the subscribed groups
158 | prefix = '%s[%s]' % (publishers[receiver], sender)
159 | msg_send = get_whole_msg(msg, prefix=prefix, download=True)
160 | if len(msg_send) == 0:
161 | return
162 | print_msg(msg_send)
163 | for tosend in subscribers:
164 | room = bot.search_chatrooms(name=tosend)
165 | for r in room:
166 | if r['UserName'] == group: # don't send back to the source
167 | continue
168 | if r['NickName'] != tosend: # check group name exact match
169 | continue
170 | for m in msg_send: # iterate messages (for images, videos, and files)
171 | bot.send(m, toUserName=r['UserName'])
172 |
173 | if __name__ == '__main__':
174 | bot.run()
175 |
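The "magic suffix" check above, hashlib.sha256(msg['Text']).hexdigest()[-2:] == '23', works because Python 2 strings are bytes; on Python 3 the text must be encoded first. A minimal sketch (is_magic is a hypothetical helper name):

# Sketch: the same trigger written for Python 3 (hashlib wants bytes).
import hashlib

def is_magic(text):
    return hashlib.sha256(text.encode('utf-8')).hexdigest()[-2:] == '23'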
--------------------------------------------------------------------------------
/爬虫小demo/31 下载bilibili视频.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import html
3 | import re
4 | import urllib3
5 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
6 |
7 | def star(url):
8 | url2 = "https://api.bilibili.com/x/player/playurl?avid={avid}&cid={cid}&qn=32&type=&otype=json"
9 | headers2 = {
10 | "host": "",
11 | "Referer": "https://www.bilibili.com",
12 | "User-Agent": "Mozilla/5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML,likeGecko)Chrome/63.0.3239.132Safari/537.36"
13 | }
14 |
15 | avid = re.findall("video/av(.+)\?", url)
16 | print(avid)
17 | cid ,name = get_cid(avid[0])
18 | print(cid,name)
19 | flv_url , size = get_flvurl(url2.format(avid=avid[0],cid=cid))
20 | shuju = size / 1024 / 1024
21 | print("本视频大小为:%.2fM" % shuju)
22 |
23 | h = re.findall("https://(.+)com",flv_url)
24 | host = h[0]+"com"
25 |
26 | headers2["host"] = host
27 | res = requests.get(flv_url,headers=headers2,stream=True, verify=False)
28 | print(res.status_code)
29 | save_movie(res,name)
30 |
31 | def get_cid(aid):#获得cid
32 | header = {
33 | 'host': 'api.bilibili.com',
34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
35 | }
36 | url = "https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp".format(aid=aid)
37 | response = requests.get(url,headers=header).json()
38 | # print(response["data"])
39 | # index picks which part of a multi-part collection to download: 0 is the first video, 1 the second, 2, 3, 4 ... and so on
40 | index = 0
41 | return response["data"][index]["cid"] ,response["data"][index]["part"]
42 | def get_flvurl(url):#获得视频真实flv地址
43 | header = {'host': 'api.bilibili.com',
44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'}
45 |
46 | response = requests.get(url,headers=header).json()
47 | return response["data"]["durl"][0]["url"],response["data"]["durl"][0]["size"]
48 | def save_movie(res,name):#保存视频
49 | chunk_size = 1024
50 | with open("{name}.flv".format(name = name),"wb") as f:
51 | for data in res.iter_content(chunk_size=chunk_size):
52 | f.write(data)
53 |
54 |
55 | if __name__ == "__main__":
56 | # Replace '583959574' after 'av' below with the id of the video you want to download
57 | url = "https://www.bilibili.com/video/av583959574?spm_id_from=333.334.b_62696c695f646f756761.5"
58 | star(url)
59 |
60 |
61 |
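star() already prints the total size returned by get_flvurl, but save_movie discards it. A hedged variant that streams the response and prints rough progress against that size (save_movie_with_progress is a hypothetical name, following the script above):

# Sketch: stream the flv to disk and report progress against the known size.
def save_movie_with_progress(res, name, total_size):
    downloaded = 0
    with open("{name}.flv".format(name=name), "wb") as f:
        for data in res.iter_content(chunk_size=1024):
            f.write(data)
            downloaded += len(data)
            print("\rdownloaded %.2f%%" % (downloaded * 100.0 / total_size), end="")
    print()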
--------------------------------------------------------------------------------
/爬虫小demo/32 m3u8.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | from Crypto.Cipher import AES
4 |
5 | def m3u8(url):
6 | header = {
7 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
8 | }
9 | # requests得到m3u8文件内容
10 | content = requests.get(url, headers=header).text
11 | if "#EXTM3U" not in content:
12 | print("这不是一个m3u8的视频链接!")
13 | return False
14 | if "EXT-X-KEY" not in content:
15 | print("没有加密")
16 | return False
17 |
18 | # 使用re正则得到key和视频地址
19 | jiami = re.findall('#EXT-X-KEY:(.*)',content)
20 | key = re.findall('URI="(.*)"', jiami[0])
21 | vi = re.findall('IV=(.*)', jiami[0])[0]
22 |
23 | # 得到每一个ts视频链接
24 |
25 | # tslist = re.findall('EXTINF:(.*), (. *)',content.replace(' ', '').replace(r'\n', ''))
26 | tslist = re.findall('v.f240.ts(.*)',content)
27 |
28 | newlist = []
29 | for i in tslist:
30 | newlist.append("v.f240.ts" + i)
31 | # print(newlist)
32 | # 得到key的链接并请求得到加密的key值
33 | keyurl = key[0]
34 | keycontent = requests.get(keyurl, headers=header).content
35 |
36 | # 得到每一个完整视频的链接地址
37 | base_url = url.replace(url.split('/')[-1], '')
38 | # print(base_url)
39 | tslisturl = []
40 | for i in newlist:
41 | tsurl = base_url + i
42 | tslisturl.append(tsurl)
43 |
44 | # Build the decryptor; this needs the third-party package pycrypto (imported as Crypto)
45 | # Note: after installing pycrypto, `from Crypto.Cipher import AES` may still fail.
46 | # Open the Python environment's Lib/site-packages folder, find the `crypto`
47 | # folder (it contains the Cipher package) and rename it to `Crypto`; the import then works.
48 | # The IV must be 16 bytes, e.g. b'0000000000000000', otherwise AES raises "ValueError: IV must be 16 bytes long"
49 | cryptor = AES.new(keycontent, AES.MODE_CBC, b'0000000000000000')
50 |
51 | # fetch each ts segment
52 | for i in tslisturl:
53 | print(i)
54 | res = requests.get(i, headers=header)
55 | # decrypt the downloaded segment
56 | cont = cryptor.decrypt(res.content)
57 | # append the decrypted data to one mp4 file; the name is arbitrary (here a Xiaoe-tech download test)
58 | with open('14-搜索组件界面实现.mp4', 'ab+') as f:
59 | f.write(cont)
60 | return True
61 |
62 | if __name__ == '__main__':
63 | # m3u8 addresses of the Xiaoe-tech videos, found in the page source
64 | # url = "https://1252524126.vod2.myqcloud.com/9764a7a5vodtransgzp1252524126/91c29aad5285890807164109582/drm/v.f146750.m3u8"
65 | # url = "https://1258102968.vod2.myqcloud.com/ed7d8254vodtranscq1258102968/a61912e43701925923160746329/drm/v.f240.m3u8?t=62dfad73&us=DYws6oOg3A&sign=1d4381d06b276e87eae478a23f3d6375"
66 | url = "https://1258102968.vod2.myqcloud.com/ed7d8254vodtranscq1258102968/a3ae8ff93701925923160630524/drm/v.f240.m3u8?t=62dfaf5a&us=RquNSsL6XT&sign=8bec9ca974f9413c9bad7a9e8d620ae2"
67 | pd = m3u8(url)
68 | if pd:
69 | print('视频下载完成!')
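
The script extracts IV=... into vi but then hard-codes a 16-byte zero IV. A hedged sketch of feeding the extracted value to AES instead, assuming the playlist writes the IV as a 0x-prefixed hex string (the usual m3u8 form); build_decryptor is a hypothetical helper:

# Sketch: build the AES decryptor from the IV found in the playlist, falling
# back to the hard-coded zero IV used above when none is present.
import binascii
from Crypto.Cipher import AES

def build_decryptor(key_bytes, iv_field=None):
    if iv_field:
        iv = binascii.unhexlify(iv_field.strip().replace('0x', '').replace('0X', ''))
    else:
        iv = b'0000000000000000'
    return AES.new(key_bytes, AES.MODE_CBC, iv)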
--------------------------------------------------------------------------------