├── .DS_Store
├── .gitignore
├── CrawlYouYuan
│   ├── .idea
│   │   ├── CrawlYouYuan.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── CrawlYouYuan
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── youyuan.py
│   ├── begin.py
│   └── scrapy.cfg
├── DouBanMovie
│   ├── .DS_Store
│   ├── .idea
│   │   ├── DouBanMovie.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── DouBanMovie
│   │   ├── .DS_Store
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── .DS_Store
│   │       ├── __init__.py
│   │       └── douban.py
│   ├── begin.py
│   ├── movie.json
│   └── scrapy.cfg
├── DouYuSpider
│   ├── .idea
│   │   ├── DouYuSpider.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── DouYuSpider
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── douyu.py
│   ├── Images
│   │   ├── Cute兔丶.jpg
│   │   ├── MiS媛.jpg
│   │   ├── Super超级冷.jpg
│   │   ├── Yozi柚子妹妹.jpg
│   │   ├── pinky水蜜桃.jpg
│   │   ├── 一只小玲儿.jpg
│   │   ├── 会玩的黄宝宝.jpg
│   │   ├── 冷伊宁.jpg
│   │   ├── 十四万岁的青丘老太婆.jpg
│   │   ├── 可乐小十五.jpg
│   │   ├── 吃萝卜的辛巴.jpg
│   │   ├── 咘咘柳.jpg
│   │   ├── 大宝SOD蜜不是润肤露.jpg
│   │   ├── 大木头CL.jpg
│   │   ├── 小依泽儿.jpg
│   │   ├── 小口古小咕.jpg
│   │   ├── 小圆脸娜娜.jpg
│   │   ├── 小小小思齐.jpg
│   │   ├── 小雅er.jpg
│   │   ├── 尛小钰.jpg
│   │   ├── 左思念.jpg
│   │   ├── 巫女蛋.jpg
│   │   ├── 布丁味的雯宝宝.jpg
│   │   ├── 幼齿懵骚小安妮.jpg
│   │   ├── 悠悠fairy.jpg
│   │   ├── 懵G娜.jpg
│   │   ├── 是囧囧初啊.jpg
│   │   ├── 江沅是个小可爱.jpg
│   │   ├── 温柔的喵小胖.jpg
│   │   ├── 爱笑的蒙蒙.jpg
│   │   ├── 璇璇璇儿丶Tay.jpg
│   │   ├── 甜馨大队长.jpg
│   │   ├── 白羊可爱多.jpg
│   │   ├── 磨人的小柠檬.jpg
│   │   ├── 糖炒栗子lr.jpg
│   │   ├── 糖糖小萌主.jpg
│   │   ├── 紫絮儿521.jpg
│   │   ├── 苏思淳sheep.jpg
│   │   ├── 若儿被注册了呢.jpg
│   │   ├── 诗诗诗诗诗诗酱.jpg
│   │   ├── 谷猫宁.jpg
│   │   ├── 辣椒酱jiang.jpg
│   │   ├── 迷人的小北北.jpg
│   │   ├── 阿青Dale.jpg
│   │   ├── 陈梓不是橙子.jpg
│   │   └── 鲸鱼妹爱素颜.jpg
│   ├── begin.py
│   ├── douyu.json
│   └── scrapy.cfg
├── HongNiangNet
│   ├── .DS_Store
│   ├── .idea
│   │   ├── HongNiangNet.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── HongNiangNet
│   │   ├── .DS_Store
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── .DS_Store
│   │       ├── __init__.py
│   │       └── hongniang.py
│   ├── begin.py
│   ├── content.json
│   └── scrapy.cfg
├── LICENSE
├── README.md
├── duodian
│   ├── .idea
│   │   ├── duodian.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── db.sqlite3
│   ├── duodian
│   │   ├── __init__.py
│   │   ├── settings.py
│   │   ├── urls.py
│   │   └── wsgi.py
│   ├── manage.py
│   ├── myduodian
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   └── views.py
│   ├── templates
│   │   └── myduodian
│   │       └── index.html
│   └── woduodian.py
├── gongzhonghao.jpeg
├── jiekou
│   ├── .idea
│   │   ├── jiekou.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── db.sqlite3
│   ├── jiekou
│   │   ├── __init__.py
│   │   ├── settings.py
│   │   ├── urls.py
│   │   └── wsgi.py
│   ├── manage.py
│   ├── myjiekou
│   │   ├── __init__.py
│   │   ├── admin.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   └── views.py
│   └── templates
│       └── myjiekou
│           └── index.html
├── teacherInfo
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── teacherInfo.iml
│   │   └── workspace.xml
│   ├── begin.py
│   ├── scrapy.cfg
│   ├── teacher.json
│   └── teacherInfo
│       ├── __init__.py
│       ├── items.py
│       ├── middlewares.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py
│           └── myteacher.py
└── 爬虫小demo
    ├── .DS_Store
    ├── 01 taobao.py
    ├── 02 doubanzhihu.py
    ├── 03 douYuUnittest.py
    ├── 04 fileHandler.py
    ├── 05 getimage.py
    ├── 06 jsload.py
    ├── 07 jsondata.py
    ├── 08 jsonpath和json总结.py
    ├── 09 zhihu_login.py
    ├── 10 match.py
    ├── 11 neihan.py
    ├── 12 PIL.py
    ├── 13 queryxpath.py
    ├── 14 selenium执行js.py
    ├── 15 tencent.py
    ├── 16 xunmall.py
    ├── 17 zhihulogin.py
    ├── 18 github_login.py
    ├── 19 jd_login.py
    ├── 20 下载网易云歌词.py
    ├── 21 TaoBaoInfo.py
    ├── 22 JDPython.py
    ├── 23 tuchongnet.py
    ├── 24 pythonDuoDian.py
    ├── 25 PythonItChat.py
    ├── 26 PythonWeChat.py
    ├── 27 PythonWordCloud.py
    ├── 28 PythonCheHui.py
    ├── 29 PythonCeHui.py
    ├── 30 PythonZhuanFa.py
    ├── 31 下载bilibili视频.py
    └── 32 m3u8.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/CrawlYouYuan/.idea/CrawlYouYuan.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/CrawlYouYuan/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/CrawlYouYuan/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/CrawlYouYuan/CrawlYouYuan/__init__.py
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | class CrawlyouyuanItem(scrapy.Item):
11 |     # username
12 |     username = scrapy.Field()
13 |     # age
14 |     age = scrapy.Field()
15 |     # avatar image URL
16 |     header_url = scrapy.Field()
17 |     # album image URLs
18 |     images_url = scrapy.Field()
19 |     # personal monologue
20 |     content = scrapy.Field()
21 |     # hometown
22 |     place_from = scrapy.Field()
23 |     # education
24 |     education = scrapy.Field()
25 |     # hobbies
26 |     hobby = scrapy.Field()
27 |     # profile page URL
28 |     source_url = scrapy.Field()
29 |     # source website
30 |     source = scrapy.Field()
31 |     # UTC time
32 |     time = scrapy.Field()
33 |     # spider name
34 |     spidername = scrapy.Field()
35 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import json
8 | import codecs
9 |
10 | class CrawlyouyuanPipeline(object):
11 |
12 | def __init__(self):
13 | self.filename = codecs.open('content.json', 'w', encoding='utf-8')
14 |
15 | def process_item(self, item, spider):
16 | html = json.dumps(dict(item), ensure_ascii=False)
17 | self.filename.write(html + '\n')
18 | return item
19 |
20 |     def close_spider(self, spider):  # called automatically by Scrapy when the spider finishes
21 | self.filename.close()
22 |
23 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for CrawlYouYuan project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'CrawlYouYuan'
13 |
14 | SPIDER_MODULES = ['CrawlYouYuan.spiders']
15 | NEWSPIDER_MODULE = 'CrawlYouYuan.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = True
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | #CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | #DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'CrawlYouYuan.middlewares.MyCustomSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
54 | #DOWNLOADER_MIDDLEWARES = {
55 | # 'CrawlYouYuan.middlewares.MyCustomDownloaderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable extensions
59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | 'CrawlYouYuan.pipelines.CrawlyouyuanPipeline': 300,
68 | }
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/CrawlYouYuan/CrawlYouYuan/spiders/youyuan.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from CrawlYouYuan.items import CrawlyouyuanItem
6 | import re
7 | class YouyuanSpider(CrawlSpider):
8 | name = 'youyuan'
9 | allowed_domains = ['youyuan.com']
10 | start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
11 |     # The auto-generated spider needs no other changes; only the Rule entries in rules have to be added
12 |     # Matches every listing page
13 |     page_links = LinkExtractor(allow=(r"youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/"))
14 |     # Matches each personal profile page
15 |     profile_links = LinkExtractor(allow=(r"youyuan.com/\d+-profile/"))
16 |     rules = (
17 |         # No callback, so follow defaults to True
18 |         Rule(page_links),
19 |         # With a callback, follow would default to False, so it is enabled explicitly
20 |         Rule(profile_links, callback='parse_item', follow=True),
21 | )
22 |
23 | def parse_item(self, response):
24 | item = CrawlyouyuanItem()
25 |
26 | item['username'] = self.get_username(response)
27 |         # age
28 |         item['age'] = self.get_age(response)
29 |         # avatar image URL
30 |         item['header_url'] = self.get_header_url(response)
31 |         # album image URLs
32 |         item['images_url'] = self.get_images_url(response)
33 |         # personal monologue
34 |         item['content'] = self.get_content(response)
35 |         # hometown
36 |         item['place_from'] = self.get_place_from(response)
37 |         # education
38 |         item['education'] = self.get_education(response)
39 |         # hobbies
40 |         item['hobby'] = self.get_hobby(response)
41 |         # profile page URL
42 |         item['source_url'] = response.url
43 |         # source website
44 |         item['source'] = "youyuan"
45 |
46 | yield item
47 |
48 | def get_username(self, response):
49 | username = response.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract()
50 | if len(username):
51 | username = username[0]
52 | else:
53 | username = "NULL"
54 | return username.strip()
55 |
56 | def get_age(self, response):
57 | age = response.xpath("//dl[@class='personal_cen']//dd/p/text()").extract()
58 | if len(age):
59 | age = re.findall(u"\d+岁", age[0])[0]
60 | else:
61 | age = "NULL"
62 | return age.strip()
63 |
64 | def get_header_url(self, response):
65 | header_url = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
66 | if len(header_url):
67 | header_url = header_url[0]
68 | else:
69 | header_url = "NULL"
70 | return header_url.strip()
71 |
72 | def get_images_url(self, response):
73 | images_url = response.xpath("//div[@class='ph_show']/ul/li/a/img/@src").extract()
74 | if len(images_url):
75 | images_url = ", ".join(images_url)
76 | else:
77 | images_url = "NULL"
78 | return images_url
79 |
80 | def get_content(self, response):
81 | content = response.xpath("//div[@class='pre_data']/ul/li/p/text()").extract()
82 | if len(content):
83 | content = content[0]
84 | else:
85 | content = "NULL"
86 | return content.strip()
87 |
88 | def get_place_from(self, response):
89 | place_from = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[1]/span/text()").extract()
90 | if len(place_from):
91 | place_from = place_from[0]
92 | else:
93 | place_from = "NULL"
94 | return place_from.strip()
95 |
96 | def get_education(self, response):
97 | education = response.xpath("//div[@class='pre_data']/ul/li[3]//ol[2]/li[2]/span/text()").extract()
98 | if len(education):
99 | education = education[0]
100 | else:
101 | education = "NULL"
102 | return education.strip()
103 |
104 | def get_hobby(self, response):
105 | hobby = response.xpath("//dl[@class='personal_cen']//ol/li/text()").extract()
106 | if len(hobby):
107 | hobby = ",".join(hobby).replace(" ", "")
108 | else:
109 | hobby = "NULL"
110 | return hobby.strip()
111 |
--------------------------------------------------------------------------------
/CrawlYouYuan/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl youyuan'.split())
--------------------------------------------------------------------------------
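Note on begin.py above: it starts the crawl by re-parsing a shell command through cmdline.execute. An equivalent in-process launcher uses Scrapy's CrawlerProcess API; a minimal sketch (nothing beyond the spider name 'youyuan' is taken from the project):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings and run the 'youyuan' spider in the current process.
process = CrawlerProcess(get_project_settings())
process.crawl('youyuan')
process.start()  # blocks until the crawl finishes
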
/CrawlYouYuan/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = CrawlYouYuan.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = CrawlYouYuan
12 |
--------------------------------------------------------------------------------
/DouBanMovie/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/.DS_Store
--------------------------------------------------------------------------------
/DouBanMovie/.idea/DouBanMovie.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/DouBanMovie/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/DouBanMovie/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/.DS_Store
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/__init__.py
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DoubanmovieItem(scrapy.Item):
12 |     # title
13 |     title = scrapy.Field()
14 |     # info line
15 |     info = scrapy.Field()
16 |     # rating
17 |     star = scrapy.Field()
18 |     # short blurb
19 |     quote = scrapy.Field()
20 |
21 |
22 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import codecs
9 | import json
10 | import pymongo
11 | from scrapy.conf import settings
12 |
13 | class DoubanmoviePipeline(object):
14 | host = settings["MONGODB_HOST"]
15 | port = settings["MONGODB_PORT"]
16 | dbname = settings["MONGODB_DBNAME"]
17 | sheetname = settings["MONGODB_SHEETNAME"]
18 |
19 |     # Create the MongoDB client connection
20 |     client = pymongo.MongoClient(host=host, port=port)
21 |     # Select the database
22 |     mydb = client[dbname]
23 |     # Collection that stores the scraped data
24 |     sheet = mydb[sheetname]
25 |     def process_item(self, item, spider):
26 |         # Option 1: append each item to a JSON-lines file
27 |         # self.filename = codecs.open('movie.json','a',encoding='utf-8')
28 |         # html = json.dumps(dict(item),ensure_ascii=False)
29 |         # self.filename.write(html + '\n')
30 |         # self.filename.close()
31 |         # Option 2: insert the item into MongoDB
32 | data = dict(item)
33 | self.sheet.insert(data)
34 |
35 | return item
36 |
37 |
38 |
--------------------------------------------------------------------------------
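The pipeline above pulls its MongoDB settings from scrapy.conf, which only exists in older Scrapy releases; current versions expose settings through the crawler instead. A minimal sketch of the same idea under that API (the class name MongoDBPipeline is hypothetical; the MONGODB_* names are the settings defined in settings.py):

import pymongo

class MongoDBPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the crawler, whose settings replace scrapy.conf.
        return cls(
            host=crawler.settings.get("MONGODB_HOST"),
            port=crawler.settings.getint("MONGODB_PORT"),
            dbname=crawler.settings.get("MONGODB_DBNAME"),
            sheetname=crawler.settings.get("MONGODB_SHEETNAME"),
        )

    def __init__(self, host, port, dbname, sheetname):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.sheet = self.client[dbname][sheetname]

    def process_item(self, item, spider):
        # insert_one() is the current PyMongo call; insert() is deprecated.
        self.sheet.insert_one(dict(item))
        return item
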
/DouBanMovie/DouBanMovie/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for DouBanMovie project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'DouBanMovie'
13 |
14 | SPIDER_MODULES = ['DouBanMovie.spiders']
15 | NEWSPIDER_MODULE = 'DouBanMovie.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'DouBanMovie (+http://www.yourdomain.com)'
20 | USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = True
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | #DOWNLOAD_DELAY = 3
32 | # The download delay setting will honor only one of:
33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | #CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | #COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | #TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | # DEFAULT_REQUEST_HEADERS = {
44 | # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)',
45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | # 'Accept-Language': 'en',
47 | # }
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'DouBanMovie.middlewares.MyCustomSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'DouBanMovie.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'DouBanMovie.pipelines.DoubanmoviePipeline': 300,
71 | }
72 | # MongoDB host
73 | MONGODB_HOST = "127.0.0.1"
74 | 
75 | # MongoDB port
76 | MONGODB_PORT = 27017
77 | 
78 | # Database name
79 | MONGODB_DBNAME = "Douban"
80 | 
81 | # Collection that stores the data
82 | MONGODB_SHEETNAME = "doubanmovies"
83 | # Enable and configure the AutoThrottle extension (disabled by default)
84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
85 | #AUTOTHROTTLE_ENABLED = True
86 | # The initial download delay
87 | #AUTOTHROTTLE_START_DELAY = 5
88 | # The maximum download delay to be set in case of high latencies
89 | #AUTOTHROTTLE_MAX_DELAY = 60
90 | # The average number of requests Scrapy should be sending in parallel to
91 | # each remote server
92 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
93 | # Enable showing throttling stats for every response received:
94 | #AUTOTHROTTLE_DEBUG = False
95 |
96 | # Enable and configure HTTP caching (disabled by default)
97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
98 | #HTTPCACHE_ENABLED = True
99 | #HTTPCACHE_EXPIRATION_SECS = 0
100 | #HTTPCACHE_DIR = 'httpcache'
101 | #HTTPCACHE_IGNORE_HTTP_CODES = []
102 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
103 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/spiders/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouBanMovie/DouBanMovie/spiders/.DS_Store
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/DouBanMovie/DouBanMovie/spiders/douban.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from DouBanMovie.items import DoubanmovieItem
4 |
5 | class DoubanSpider(scrapy.Spider):
6 | name = "douban"
7 | allowed_domains = ["movie.douban.com"]
8 | offset = 0
9 | url = 'https://movie.douban.com/top250?start='
10 | start_urls = (
11 | url + str(offset),
12 | )
13 |
14 | def parse(self, response):
15 |         # Each movie's info block on the page
16 |         movies = response.xpath("//div[@class='info']")
17 |         for eachmovie in movies:
18 |             # Build a fresh item for every movie instead of reusing a single instance
19 |             item = DoubanmovieItem()
20 |             titlelist = eachmovie.xpath("./div[@class='hd']/a/span[@class='title'][1]/text()")
21 | if len(titlelist) == 0:
22 | item['title'] = ''
23 | else:
24 | item['title'] = titlelist.extract()[0]
25 | info = eachmovie.xpath("./div[@class='bd']/p/text()").extract()[0]
26 | item['info'] = info.replace('\n','').strip()
27 | item['star'] = eachmovie.xpath("./div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()").extract()[0]
28 | quotelist = eachmovie.xpath("./div[@class='bd']/p[@class='quote']/span[@class='inq']/text()")
29 | if len(quotelist) == 0:
30 | item['quote'] = ''
31 | else:
32 | item['quote'] = quotelist.extract()[0]
33 | yield item
34 |
35 |
36 | if self.offset < 225:
37 | self.offset += 25
38 | yield scrapy.Request(self.url + str(self.offset),callback = self.parse)
39 |
40 |
--------------------------------------------------------------------------------
/DouBanMovie/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl douban'.split())
--------------------------------------------------------------------------------
/DouBanMovie/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = DouBanMovie.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = DouBanMovie
12 |
--------------------------------------------------------------------------------
/DouYuSpider/.idea/DouYuSpider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/DouYuSpider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/DouYuSpider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/DouYuSpider/__init__.py
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DouyuspiderItem(scrapy.Item):
12 | # define the fields for your item here like:
13 |     # vertical room cover image URL
14 |     vertical = scrapy.Field()
15 |     # anchor nickname
16 |     name = scrapy.Field()
17 |     # room snapshot image
18 |     room_src = scrapy.Field()
19 |     # anchor's city
20 |     anchor_city = scrapy.Field()
21 |     imagesPath = scrapy.Field()  # local path of the downloaded image
22 |
23 |
24 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class DouyuspiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import scrapy
9 | # import codecs
10 | # import json
11 | import os
12 | from scrapy.pipelines.images import ImagesPipeline
13 | from scrapy.utils.project import get_project_settings
14 |
15 | # class DouyuspiderPipeline(object):
16 | # def __init__(self):
17 | #         # Open a write-only file with utf-8 text encoding
18 | # self.filename = codecs.open('douyu.json', 'w', encoding='utf-8')
19 | # def process_item(self, item, spider):
20 | #
21 | # html = json.dumps(dict(item),ensure_ascii='utf-8')
22 | # self.filename.write(html + '\n')
23 | # return item
24 | #
25 | # # def spider_closed(self, spider):
26 | # # self.file.close()
27 |
28 | # Downloading images with Scrapy requires Pillow: pip install Pillow
29 | class DouYuImagesPipelines(ImagesPipeline):
30 | IMAGES_STORE = get_project_settings().get("IMAGES_STORE")
31 |
32 | def get_media_requests(self, item, info):
33 | image_url = item["vertical"]
34 | yield scrapy.Request(image_url)
35 |
36 | def item_completed(self, results, item, info):
37 |         # Standard pattern: collect the stored paths of the successfully downloaded images from results (see the ImagesPipeline source)
38 |         image_path = [x["path"] for ok, x in results if ok]
39 | 
40 |         os.rename(self.IMAGES_STORE + "/" + image_path[0], self.IMAGES_STORE + "/" + item["name"] + ".jpg")
41 |         item["imagesPath"] = self.IMAGES_STORE + "/" + item["name"] + ".jpg"
42 |
43 | return item
--------------------------------------------------------------------------------
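The pipeline above renames the downloaded file in item_completed. ImagesPipeline also lets the target filename be chosen up front by overriding file_path(); a rough sketch (the class name is hypothetical, and the nickname is passed through request.meta so the override does not depend on a particular file_path signature):

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class NamedImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Carry the anchor's nickname along with the image download request.
        yield scrapy.Request(item["vertical"], meta={"name": item["name"]})

    def file_path(self, request, response=None, info=None, **kwargs):
        # Save the image directly as <nickname>.jpg under IMAGES_STORE, no rename needed.
        return "%s.jpg" % request.meta["name"]
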
/DouYuSpider/DouYuSpider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for DouYuSpider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'DouYuSpider'
13 |
14 | SPIDER_MODULES = ['DouYuSpider.spiders']
15 | NEWSPIDER_MODULE = 'DouYuSpider.spiders'
16 |
17 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
18 |
19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
20 | #USER_AGENT = 'DouYuSpider (+http://www.yourdomain.com)'
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = True
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | DOWNLOAD_DELAY = 3
32 | # The download delay setting will honor only one of:
33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | #CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | #COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | #TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | #DEFAULT_REQUEST_HEADERS = {
44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 | # 'Accept-Language': 'en',
46 | #}
47 |
48 | # Enable or disable spider middlewares
49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
50 | #SPIDER_MIDDLEWARES = {
51 | # 'DouYuSpider.middlewares.DouyuspiderSpiderMiddleware': 543,
52 | #}
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | #DOWNLOADER_MIDDLEWARES = {
57 | # 'DouYuSpider.middlewares.MyCustomDownloaderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | # 'DouYuSpider.pipelines.DouyuspiderPipeline': 300,
70 | 'DouYuSpider.pipelines.DouYuImagesPipelines': 300,
71 | }
72 | IMAGES_STORE = "/Users/yunmei/Desktop/scrapyenv/Python-Spider/DouYuSpider/Images"
73 | # Logging
74 | # LOG_FILE = "dg.log"
75 | # LOG_LEVEL = "DEBUG"
76 |
77 | # Enable and configure the AutoThrottle extension (disabled by default)
78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
79 | #AUTOTHROTTLE_ENABLED = True
80 | # The initial download delay
81 | #AUTOTHROTTLE_START_DELAY = 5
82 | # The maximum download delay to be set in case of high latencies
83 | #AUTOTHROTTLE_MAX_DELAY = 60
84 | # The average number of requests Scrapy should be sending in parallel to
85 | # each remote server
86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
87 | # Enable showing throttling stats for every response received:
88 | #AUTOTHROTTLE_DEBUG = False
89 |
90 | # Enable and configure HTTP caching (disabled by default)
91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
92 | #HTTPCACHE_ENABLED = True
93 | #HTTPCACHE_EXPIRATION_SECS = 0
94 | #HTTPCACHE_DIR = 'httpcache'
95 | #HTTPCACHE_IGNORE_HTTP_CODES = []
96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
97 |
--------------------------------------------------------------------------------
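IMAGES_STORE in the settings above is an absolute path tied to one machine. One way to keep the download directory relative to the project, sketched here (it assumes the Images folder sits one level above the inner DouYuSpider package, as in this repository), is to derive it from settings.py itself:

import os
# Resolve .../DouYuSpider/Images relative to this settings.py file.
IMAGES_STORE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "Images")
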
/DouYuSpider/DouYuSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/DouYuSpider/DouYuSpider/spiders/douyu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 |
5 | from DouYuSpider.items import DouyuspiderItem
6 | class DouyuSpider(scrapy.Spider):
7 | name = 'douyu'
8 |     # allowed_domains must not include the scheme, i.e. not ['http://capi.douyucdn.cn']
9 | allowed_domains = ['capi.douyucdn.cn']
10 |
11 | offset = 0
12 | url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
13 |
14 | start_urls = [url + str(offset)]
15 |
16 | def parse(self, response):
17 | data = json.loads(response.text)['data']
18 |
19 | for each in data:
20 | item = DouyuspiderItem()
21 |
22 | item["vertical"] = each["vertical_src"].encode("utf-8")
23 | item["name"] = each["nickname"].encode("utf-8")
24 | item["room_src"] = each["room_src"].encode("utf-8")
25 | item["anchor_city"] = each["anchor_city"].encode("utf-8")
26 |
27 | yield item
28 |
29 | self.offset += 20
30 | yield scrapy.Request(self.url + str(self.offset),callback = self.parse)
31 |
32 |
--------------------------------------------------------------------------------
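parse() above always schedules the next page, so the spider never stops on its own. A small guard, assuming the API returns an empty data list once there are no more rooms, could replace the last two lines of parse():

        # Only request the next page while the API is still returning rooms.
        if data:
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
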
/DouYuSpider/Images/Cute兔丶.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Cute兔丶.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/MiS媛.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/MiS媛.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/Super超级冷.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Super超级冷.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/Yozi柚子妹妹.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/Yozi柚子妹妹.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/pinky水蜜桃.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/pinky水蜜桃.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/一只小玲儿.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/一只小玲儿.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/会玩的黄宝宝.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/会玩的黄宝宝.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/冷伊宁.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/冷伊宁.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/十四万岁的青丘老太婆.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/十四万岁的青丘老太婆.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/可乐小十五.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/可乐小十五.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/吃萝卜的辛巴.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/吃萝卜的辛巴.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/咘咘柳.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/咘咘柳.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/大宝SOD蜜不是润肤露.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/大宝SOD蜜不是润肤露.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/大木头CL.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/大木头CL.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小依泽儿.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小依泽儿.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小口古小咕.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小口古小咕.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小圆脸娜娜.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小圆脸娜娜.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小小小思齐.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小小小思齐.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/小雅er.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/小雅er.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/尛小钰.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/尛小钰.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/左思念.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/左思念.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/巫女蛋.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/巫女蛋.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/布丁味的雯宝宝.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/布丁味的雯宝宝.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/幼齿懵骚小安妮.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/幼齿懵骚小安妮.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/悠悠fairy.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/悠悠fairy.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/懵G娜.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/懵G娜.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/是囧囧初啊.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/是囧囧初啊.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/江沅是个小可爱.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/江沅是个小可爱.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/温柔的喵小胖.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/温柔的喵小胖.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/爱笑的蒙蒙.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/爱笑的蒙蒙.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/璇璇璇儿丶Tay.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/璇璇璇儿丶Tay.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/甜馨大队长.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/甜馨大队长.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/白羊可爱多.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/白羊可爱多.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/磨人的小柠檬.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/磨人的小柠檬.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/糖炒栗子lr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/糖炒栗子lr.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/糖糖小萌主.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/糖糖小萌主.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/紫絮儿521.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/紫絮儿521.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/苏思淳sheep.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/苏思淳sheep.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/若儿被注册了呢.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/若儿被注册了呢.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/诗诗诗诗诗诗酱.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/诗诗诗诗诗诗酱.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/谷猫宁.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/谷猫宁.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/辣椒酱jiang.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/辣椒酱jiang.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/迷人的小北北.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/迷人的小北北.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/阿青Dale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/阿青Dale.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/陈梓不是橙子.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/陈梓不是橙子.jpg
--------------------------------------------------------------------------------
/DouYuSpider/Images/鲸鱼妹爱素颜.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/Images/鲸鱼妹爱素颜.jpg
--------------------------------------------------------------------------------
/DouYuSpider/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl douyu'.split())
--------------------------------------------------------------------------------
/DouYuSpider/douyu.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/DouYuSpider/douyu.json
--------------------------------------------------------------------------------
/DouYuSpider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = DouYuSpider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = DouYuSpider
12 |
--------------------------------------------------------------------------------
/HongNiangNet/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/.DS_Store
--------------------------------------------------------------------------------
/HongNiangNet/.idea/HongNiangNet.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/HongNiangNet/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/HongNiangNet/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/.DS_Store
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/__init__.py
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy import Field, Item
10 |
11 | class HongniangnetItem(Item):
12 |     # define the fields for your item here like:
13 |     # username
14 |     username = Field()
15 |     # age
16 |     age = Field()
17 |     # avatar image link
18 |     header_link = Field()
19 |     # photo album image links
20 |     images_url = Field()
21 |     # personal monologue
22 |     content = Field()
23 |     # place of origin
24 |     place_from = Field()
25 |     # education
26 |     education = Field()
27 |     # hobbies
28 |     hobby = Field()
29 |     # profile page link
30 |     source_url = Field()
31 |     # source website
32 |     source = Field()
33 |
34 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class HongniangnetSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import codecs
9 | import json
10 |
11 | class HongniangnetPipeline(object):
12 |
13 |     def __init__(self):
14 |         # one JSON object per line, UTF-8 encoded
15 |         self.filename = codecs.open('content.json', 'w', encoding='utf-8')
16 |
17 |     def process_item(self, item, spider):
18 |         line = json.dumps(dict(item), ensure_ascii=False)
19 |         self.filename.write(line + '\n')
20 |         return item
21 |
22 |     def close_spider(self, spider):
23 |         self.filename.close()
23 |
--------------------------------------------------------------------------------
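
HongniangnetPipeline writes one JSON object per line to content.json, so the output can be read back line by line. A minimal sketch (not part of the repository), assuming content.json sits in the directory the crawl was started from:

    import json
    import codecs

    # each line of content.json is one serialized HongniangnetItem
    with codecs.open('content.json', 'r', encoding='utf-8') as f:
        for line in f:
            profile = json.loads(line)
            print(profile['username'], profile['age'])
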
/HongNiangNet/HongNiangNet/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for HongNiangNet project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'HongNiangNet'
13 |
14 | SPIDER_MODULES = ['HongNiangNet.spiders']
15 | NEWSPIDER_MODULE = 'HongNiangNet.spiders'
16 |
17 | # Redis host and port for the distributed (scrapy-redis) crawl
18 | REDIS_HOST = '192.168.19.206'
19 | REDIS_PORT = 6379
20 |
21 |
22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
23 | #USER_AGENT = 'HongNiangNet (+http://www.yourdomain.com)'
24 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
25 | # Obey robots.txt rules
26 | ROBOTSTXT_OBEY = True
27 |
28 |
29 | # Use the scrapy-redis dedupe filter instead of Scrapy's default
30 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
31 | # Use the scrapy-redis scheduler instead of Scrapy's default
32 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
33 | # Schedule requests in FIFO (queue) order
34 | SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
35 | # Allow pausing/resuming: keep request records in Redis
36 | SCHEDULER_PERSIST = True
37 |
38 |
39 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
40 | #CONCURRENT_REQUESTS = 32
41 |
42 | # Configure a delay for requests for the same website (default: 0)
43 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
44 | # See also autothrottle settings and docs
45 | # DOWNLOAD_DELAY = 3
46 | # The download delay setting will honor only one of:
47 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
48 | #CONCURRENT_REQUESTS_PER_IP = 16
49 |
50 | # Disable cookies (enabled by default)
51 | #COOKIES_ENABLED = False
52 |
53 | # Disable Telnet Console (enabled by default)
54 | #TELNETCONSOLE_ENABLED = False
55 |
56 | # Override the default request headers:
57 | #DEFAULT_REQUEST_HEADERS = {
58 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59 | # 'Accept-Language': 'en',
60 | #}
61 |
62 | # Enable or disable spider middlewares
63 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
64 | # SPIDER_MIDDLEWARES = {
65 | # 'HongNiangNet.middlewares.HongniangnetSpiderMiddleware': 543,
66 | # }
67 |
68 | # Enable or disable downloader middlewares
69 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
70 | #DOWNLOADER_MIDDLEWARES = {
71 | # 'HongNiangNet.middlewares.MyCustomDownloaderMiddleware': 543,
72 | #}
73 |
74 | # Enable or disable extensions
75 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
76 | #EXTENSIONS = {
77 | # 'scrapy.extensions.telnet.TelnetConsole': None,
78 | #}
79 |
80 | # Configure item pipelines
81 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
82 | ITEM_PIPELINES = {
83 | 'HongNiangNet.pipelines.HongniangnetPipeline': 300,
84 | 'scrapy_redis.pipelines.RedisPipeline' : 400,
85 | }
86 |
87 | # Enable and configure the AutoThrottle extension (disabled by default)
88 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
89 | #AUTOTHROTTLE_ENABLED = True
90 | # The initial download delay
91 | #AUTOTHROTTLE_START_DELAY = 5
92 | # The maximum download delay to be set in case of high latencies
93 | #AUTOTHROTTLE_MAX_DELAY = 60
94 | # The average number of requests Scrapy should be sending in parallel to
95 | # each remote server
96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
97 | # Enable showing throttling stats for every response received:
98 | #AUTOTHROTTLE_DEBUG = False
99 |
100 | # Enable and configure HTTP caching (disabled by default)
101 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
102 | #HTTPCACHE_ENABLED = True
103 | #HTTPCACHE_EXPIRATION_SECS = 0
104 | #HTTPCACHE_DIR = 'httpcache'
105 | #HTTPCACHE_IGNORE_HTTP_CODES = []
106 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
107 |
--------------------------------------------------------------------------------
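
With scrapy_redis.pipelines.RedisPipeline enabled above, scraped items are also pushed to Redis in addition to the JSON file written by HongniangnetPipeline. A minimal sketch (not part of the repository) for inspecting them, assuming scrapy-redis's default item key layout '<spider name>:items' and the REDIS_HOST/REDIS_PORT configured above:

    import json
    import redis

    r = redis.StrictRedis(host='192.168.19.206', port=6379)
    # the hongniang spider's items land in the 'hongniang:items' list by default
    for raw in r.lrange('hongniang:items', 0, 9):
        print(json.loads(raw).get('username'))
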
/HongNiangNet/HongNiangNet/spiders/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/HongNiangNet/spiders/.DS_Store
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/HongNiangNet/HongNiangNet/spiders/hongniang.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | # from scrapy.spiders import CrawlSpider, Rule
5 | from HongNiangNet.items import HongniangnetItem
6 | # distributed crawling
7 | from scrapy.spiders import Rule
8 | from scrapy_redis.spiders import RedisCrawlSpider
9 |
10 | # class HongniangSpider(CrawlSpider):
11 | class HongniangSpider(RedisCrawlSpider):
12 |
13 |     name = 'hongniang'
14 |     allowed_domains = ['hongniang.com']
15 |     # start_urls = ['http://www.hongniang.com/match?&page=1']
16 |     redis_key = "hongniangSpider:start_urls"
17 |
18 |     # build the allowed-domains list dynamically
19 |     def __init__(self, *args, **kwargs):
20 |         # Dynamically define the allowed domains list.
21 |         domain = kwargs.pop('domain', '')
22 |         self.allowed_domains = list(filter(None, domain.split(',')))
23 |         super(HongniangSpider, self).__init__(*args, **kwargs)
24 |
25 |     # rule for the paginated list pages (the literal '?' must be escaped in the regex)
26 |     page_links = LinkExtractor(allow=(r"hongniang.com/match\?&page=\d+"))
27 |     # rule for each member's profile page
28 |     profile_links = LinkExtractor(allow=(r"hongniang.com/user/member/id/\d+"))
29 |     rules = (
30 |         # no callback, so follow defaults to True
31 |         Rule(page_links),
32 |         # with a callback, follow defaults to False, so it is set explicitly
33 |         Rule(profile_links, callback='parse_item', follow=True),
34 |     )
35 |
36 |     def parse_item(self, response):
37 |         item = HongniangnetItem()
38 |         # note: XPath positions are 1-based, not 0-based
39 |         # username
40 |         item["username"] = self.get_username(response)
41 |         # age
42 |         item["age"] = self.get_age(response)
43 |         # avatar image link
44 |         item["header_link"] = self.get_header_link(response)
45 |         # photo album image links
46 |         item["images_url"] = self.get_images_url(response)
47 |         # personal monologue
48 |         item["content"] = self.get_content(response)
49 |         # place of origin
50 |         item["place_from"] = self.get_place_from(response)
51 |         # education
52 |         item["education"] = self.get_education(response)
53 |         # hobbies
54 |         item["hobby"] = self.get_hobby(response)
55 |         # profile page URL
56 |         item["source_url"] = response.url
57 |         # source website
58 |         item["source"] = "hongniang"
59 |
60 |         yield item
61 |
62 |     def get_username(self, response):
63 |         username = response.xpath("//div[@class='name nickname']/text()").extract()
64 |         if len(username):
65 |             username = username[0]
66 |         else:
67 |             username = "NULL"
68 |         return username.strip()
69 |
70 |     def get_age(self, response):
71 |         age = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div[1]/ul[1]/li[1]/text()").extract()
72 |         if len(age):
73 |             age = age[0]
74 |             print(age)
75 |         else:
76 |             age = "NULL"
77 |         return age.strip()
78 |
79 |     def get_header_link(self, response):
80 |         header_link = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img[1]/@src").extract()
81 |         if len(header_link):
82 |             header_link = header_link[0]
83 |         else:
84 |             header_link = "NULL"
85 |         return header_link.strip()
86 |
87 |     def get_images_url(self, response):
88 |         images_url = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img/@src").extract()
89 |         if not len(images_url):
90 |             images_url = "NULL"
91 |         return images_url
92 |
93 |     def get_content(self, response):
94 |         content = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info5']/div[@class='text']/text()").extract()
95 |         if len(content):
96 |             content = content[0]
97 |         else:
98 |             content = "NULL"
99 |         return content.strip()
100 |
101 |     def get_place_from(self, response):
102 |         place_from = response.xpath("//div[@class='mem_main']/div[@class='sub2']/div[@class='info1'][1]/div[@class='right']/ul[2]/li[1]/text()").extract()
103 |         if len(place_from):
104 |             place_from = place_from[0]
105 |         else:
106 |             place_from = "NULL"
107 |         return place_from.strip()
108 |
109 |     def get_education(self, response):
110 |         education = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div/ul[2]/li[2]/text()").extract()
111 |         if len(education):
112 |             education = education[0]
113 |         else:
114 |             education = "NULL"
115 |         return education.strip()
116 |
117 |     def get_hobby(self, response):
118 |         hobby = response.xpath("//div[@class='mem_main']//div[@class='sub2']/div[@class='info1'][2]/div[@class='right'][1]/ul[1]/li[4]/text()").extract()
119 |         if len(hobby):
120 |             hobby = hobby[0]
121 |         else:
122 |             hobby = "NULL"
123 |         return hobby.strip()
124 |
--------------------------------------------------------------------------------
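
Because HongniangSpider is a RedisCrawlSpider, it idles after startup until its redis_key list receives a URL. A minimal sketch (not part of the repository) for seeding the crawl, using the redis_key defined above, the REDIS_HOST/REDIS_PORT from settings.py, and the URL left commented out as start_urls:

    import redis

    r = redis.StrictRedis(host='192.168.19.206', port=6379)
    # push the first list page; the spider's link-extraction rules take over from there
    r.lpush('hongniangSpider:start_urls', 'http://www.hongniang.com/match?&page=1')
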
/HongNiangNet/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl hongniang'.split())
--------------------------------------------------------------------------------
/HongNiangNet/content.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/HongNiangNet/content.json
--------------------------------------------------------------------------------
/HongNiangNet/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = HongNiangNet.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = HongNiangNet
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python-Spider
2 | 1. Douban Movie Top 250
3 | 2. Douyu: crawl JSON data and download streamer photos
4 | 3. CrawlSpider crawl of basic profile info from the HongNiang matchmaking site; distributed crawl with results stored in Redis
5 | 4. Small crawler demos
6 | 5. Using Selenium
7 | 6. PIL
8 | 7. Crawl Duodian products, store them in MySQL, and display them on a Django web page
9 | 8. Building an API with Django
10 | 9. Parsing txt, csv, and xml files with Python
11 | 10. Simple spiders with the Scrapy framework
12 | 11. Scrape Taobao model info, download it locally, and store it in MySQL
13 | 12. Crawl Youyuan user info
14 | 13. Simulated GitHub login
15 | 14. Dynamic simulated login with Selenium
16 | 15. Simulated Zhihu login
17 | 16. Crawl Tencent job postings
18 | 17. [Crawl all product info from the Duodian site](https://github.com/lb2281075105/LBDuoDian)
19 | 18. Simulated JD.com login
20 | 19. Download NetEase Cloud Music lyrics
21 | 20. Taobao info
22 | 21. JD.com product detail page info
23 | 22. Simulated Tuchong login
24 | 23. itchat: fetch articles shared in WeChat groups or by WeChat friends
25 | 24. Crawl the article history of a WeChat official account
26 | 25. itchat: monitor articles shared by a specified WeChat official account
27 | 26. itchat: anti-recall for WeChat groups and friends
28 | 27. Forward messages between WeChat groups
29 | 28. Download Bilibili videos, including multi-part collections
30 | 29. Crawl m3u8 videos
31 |
32 | For details, see the Jianshu [Python collection](http://www.jianshu.com/nb/18442681)
33 |
--------------------------------------------------------------------------------
/duodian/.idea/duodian.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/duodian/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/duodian/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/duodian/db.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/db.sqlite3
--------------------------------------------------------------------------------
/duodian/duodian/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/duodian/__init__.py
--------------------------------------------------------------------------------
/duodian/duodian/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for duodian project.
3 |
4 | Generated by 'django-admin startproject' using Django 1.11.4.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.11/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/1.11/ref/settings/
11 | """
12 |
13 | import os
14 |
15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 |
19 | # Quick-start development settings - unsuitable for production
20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/
21 |
22 | # SECURITY WARNING: keep the secret key used in production secret!
23 | SECRET_KEY = 'htonb%m8+_d=tsnqm)6)_q@2@m#ulx#nb!8$wbluo9&1yi$yh$'
24 |
25 | # SECURITY WARNING: don't run with debug turned on in production!
26 | DEBUG = True
27 |
28 | ALLOWED_HOSTS = []
29 |
30 |
31 | # Application definition
32 |
33 | INSTALLED_APPS = [
34 | 'django.contrib.admin',
35 | 'django.contrib.auth',
36 | 'django.contrib.contenttypes',
37 | 'django.contrib.sessions',
38 | 'django.contrib.messages',
39 | 'django.contrib.staticfiles',
40 | 'myduodian',
41 | ]
42 |
43 | MIDDLEWARE = [
44 | 'django.middleware.security.SecurityMiddleware',
45 | 'django.contrib.sessions.middleware.SessionMiddleware',
46 | 'django.middleware.common.CommonMiddleware',
47 | 'django.middleware.csrf.CsrfViewMiddleware',
48 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
49 | 'django.contrib.messages.middleware.MessageMiddleware',
50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
51 | ]
52 | MIDDLEWARE_CLASSES = [
53 | 'django.contrib.sessions.middleware.SessionMiddleware',
54 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
55 | 'django.contrib.messages.middleware.MessageMiddleware',
56 | ]
57 | ROOT_URLCONF = 'duodian.urls'
58 |
59 | TEMPLATES = [
60 | {
61 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
62 | 'DIRS': [os.path.join(BASE_DIR, 'templates')],
63 | 'APP_DIRS': True,
64 | 'OPTIONS': {
65 | 'context_processors': [
66 | 'django.template.context_processors.debug',
67 | 'django.template.context_processors.request',
68 | 'django.contrib.auth.context_processors.auth',
69 | 'django.contrib.messages.context_processors.messages',
70 | ],
71 | },
72 | },
73 | ]
74 |
75 | WSGI_APPLICATION = 'duodian.wsgi.application'
76 |
77 |
78 | # Database
79 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases
80 |
81 | DATABASES = {
82 | 'default': {
83 | 'ENGINE': 'django.db.backends.mysql',
84 | 'HOST':'127.0.0.1',
85 | 'PORT':'3306',
86 | 'NAME': 'test',
87 | 'USER':'root',
88 | 'PASSWORD':'',
89 | }
90 | }
91 |
92 |
93 | # Password validation
94 | # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators
95 |
96 | AUTH_PASSWORD_VALIDATORS = [
97 | {
98 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
99 | },
100 | {
101 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
102 | },
103 | {
104 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
105 | },
106 | {
107 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
108 | },
109 | ]
110 |
111 |
112 | # Internationalization
113 | # https://docs.djangoproject.com/en/1.11/topics/i18n/
114 |
115 | LANGUAGE_CODE = 'zh-hans'
116 |
117 | TIME_ZONE = 'UTC'
118 |
119 | USE_I18N = True
120 |
121 | USE_L10N = True
122 |
123 | USE_TZ = True
124 |
125 |
126 | # Static files (CSS, JavaScript, Images)
127 | # https://docs.djangoproject.com/en/1.11/howto/static-files/
128 |
129 | STATIC_URL = '/static/'
130 |
--------------------------------------------------------------------------------
/duodian/duodian/urls.py:
--------------------------------------------------------------------------------
1 | """duodian URL Configuration
2 |
3 | The `urlpatterns` list routes URLs to views. For more information please see:
4 | https://docs.djangoproject.com/en/1.11/topics/http/urls/
5 | Examples:
6 | Function views
7 | 1. Add an import: from my_app import views
8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
9 | Class-based views
10 | 1. Add an import: from other_app.views import Home
11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
12 | Including another URLconf
13 | 1. Import the include() function: from django.conf.urls import url, include
14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls'))
15 | """
16 | from django.conf.urls import url
17 | from django.contrib import admin
18 | from myduodian import views
19 | urlpatterns = [
20 | url(r'^admin/', admin.site.urls),
21 | url(r'^index/', views.index),
22 | ]
23 |
--------------------------------------------------------------------------------
/duodian/duodian/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for duodian project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "duodian.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/duodian/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "duodian.settings")
7 | try:
8 | from django.core.management import execute_from_command_line
9 | except ImportError:
10 | # The above import may fail for some other reason. Ensure that the
11 | # issue is really that Django is missing to avoid masking other
12 | # exceptions on Python 2.
13 | try:
14 | import django
15 | except ImportError:
16 | raise ImportError(
17 | "Couldn't import Django. Are you sure it's installed and "
18 | "available on your PYTHONPATH environment variable? Did you "
19 | "forget to activate a virtual environment?"
20 | )
21 | raise
22 | execute_from_command_line(sys.argv)
23 |
--------------------------------------------------------------------------------
/duodian/myduodian/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/myduodian/__init__.py
--------------------------------------------------------------------------------
/duodian/myduodian/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
5 | from myduodian.models import AiDuoDian
6 |
7 | class DuoDianAdmin(admin.ModelAdmin):
8 |     list_display = ['goodName', 'price', 'image']
9 |
10 |
11 | admin.site.register(AiDuoDian, DuoDianAdmin)
--------------------------------------------------------------------------------
/duodian/myduodian/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models, migrations
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='AiDuoDian',
15 | fields=[
16 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
17 | ('image', models.CharField(max_length=1000)),
18 | ('goodName', models.CharField(max_length=200)),
19 | ('price', models.CharField(max_length=40)),
20 | ],
21 | ),
22 | ]
23 |
--------------------------------------------------------------------------------
/duodian/myduodian/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/duodian/myduodian/migrations/__init__.py
--------------------------------------------------------------------------------
/duodian/myduodian/models.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from django.db import models
3 |
4 | class AiDuoDian(models.Model):
5 |
6 |     image = models.CharField(max_length=1000)
7 |     goodName = models.CharField(max_length=200)
8 |     price = models.CharField(max_length=40)
--------------------------------------------------------------------------------
/duodian/myduodian/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/duodian/myduodian/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 | from django.http import HttpResponse
3 | # Create your views here.
4 |
5 | from myduodian.models import *
6 |
7 | def index(request):
8 |     context = {"list": AiDuoDian.objects.all()}
9 |     return render(request, 'myduodian/index.html', context)
--------------------------------------------------------------------------------
/duodian/templates/myduodian/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="UTF-8">
5 |     <title>Title</title>
6 | </head>
7 | <body>
8 | <table>
9 |     <thead>
10 |     <tr>
11 |         <th>Product image</th><th>Product name</th><th>Price</th>
12 |     </tr>
13 |     </thead>
14 |     <tbody>
15 |     {% for item in list %}
16 |     <tr>
17 |         <td><img src="{{ item.image }}"></td>
18 |         <td>{{ item.goodName }}</td>
19 |         <td>{{ item.price }}</td>
20 |     </tr>
21 |     {% endfor %}
22 |     </tbody>
23 | </table>
24 | </body>
25 | </html>
--------------------------------------------------------------------------------
/duodian/woduodian.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 |
3 | import MySQLdb
4 | import json
5 | import jsonpath
6 | import urllib2
7 | import os
8 |
9 | class DuoDian():
10 |     def __init__(self):
11 |         self.url = 'https://gatewx.dmall.com/customersite/searchWareByCategory?param={"pageNum":1,"pageSize":30,"venderId":"1","storeId":"108","sort":"1","categoryId":11347,"categoryLevel":3,"cateSource":1,"bizType":"1"}&token=&source=2&tempid=C7B357489E400002B1514BD01B00E270&pubParam={"utmSource":"wxmp"}&_=1511256196255'
12 |         # connect to the database
13 |         self.db = MySQLdb.connect(host='127.0.0.1', user="root", passwd="", db="test")
14 |         # get a cursor for executing statements
15 |         self.cursor = self.db.cursor()
16 |
17 |     def get_html(self):
18 |         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
19 |         request = urllib2.Request(self.url, headers=headers)
20 |         response = urllib2.urlopen(request)
21 |         html = response.read()
22 |         return html
23 |
24 |     def get_html1(self, url):
25 |         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
26 |         request = urllib2.Request(url, headers=headers)
27 |         response = urllib2.urlopen(request)
28 |         html = response.read()
29 |         return html
30 |
31 |     def get_content(self):
32 |         jsonobj = json.loads(self.get_html())
33 |         # product names
34 |         namelist = jsonpath.jsonpath(jsonobj, '$..title')
35 |         # product prices
36 |         pricelist = jsonpath.jsonpath(jsonobj, '$..promotionPrice')
37 |         # product images
38 |         imglist = jsonpath.jsonpath(jsonobj, '$..img')
39 |         listdata = zip(imglist, namelist, pricelist)
40 |
41 |         for item in listdata:
42 |             try:
43 |                 result = self.cursor.execute(
44 |                     "insert into myduodian_aiduodian (image,goodName,price) VALUES (%s,%s,%s)",
45 |                     [item[0], item[1], item[2]])
46 |                 self.db.commit()
47 |                 print(result)
48 |             except Exception as e:
49 |                 self.db.rollback()
50 |                 print('insert failed')
51 |
52 |         # close the connection and release resources
53 |         self.db.close()
54 |
55 |
56 | if __name__ == "__main__":
57 |     duodian = DuoDian()
58 |     duodian.get_content()
--------------------------------------------------------------------------------
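
woduodian.py inserts straight into the myduodian_aiduodian table that the 0001_initial migration creates, so the rows can be checked back through the Django ORM. A minimal sketch (not part of the repository), assuming the duodian project has been migrated against the same MySQL database and the script has already run; execute it inside `python manage.py shell`:

    # verify the inserted rows via the AiDuoDian model
    from myduodian.models import AiDuoDian

    print(AiDuoDian.objects.count())
    for good in AiDuoDian.objects.all()[:5]:
        print(good.goodName, good.price, good.image)
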
/gongzhonghao.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/gongzhonghao.jpeg
--------------------------------------------------------------------------------
/jiekou/.idea/jiekou.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/jiekou/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/jiekou/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/jiekou/db.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/db.sqlite3
--------------------------------------------------------------------------------
/jiekou/jiekou/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/jiekou/__init__.py
--------------------------------------------------------------------------------
/jiekou/jiekou/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for jiekou project.
3 |
4 | Generated by 'django-admin startproject' using Django 1.8.2.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.8/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/1.8/ref/settings/
11 | """
12 |
13 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
14 | import os
15 |
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 |
19 | # Quick-start development settings - unsuitable for production
20 | # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/
21 |
22 | # SECURITY WARNING: keep the secret key used in production secret!
23 | SECRET_KEY = '3!2z2kqm4erg8#8y1+5n1%wl3lw32@1u&4mlnh+orzl%ns39wq'
24 |
25 | # SECURITY WARNING: don't run with debug turned on in production!
26 | DEBUG = True
27 |
28 | ALLOWED_HOSTS = []
29 |
30 |
31 | # Application definition
32 |
33 | INSTALLED_APPS = (
34 | 'django.contrib.admin',
35 | 'django.contrib.auth',
36 | 'django.contrib.contenttypes',
37 | 'django.contrib.sessions',
38 | 'django.contrib.messages',
39 | 'django.contrib.staticfiles',
40 | 'myjiekou',
41 | )
42 |
43 | MIDDLEWARE_CLASSES = (
44 | 'django.contrib.sessions.middleware.SessionMiddleware',
45 | 'django.middleware.common.CommonMiddleware',
46 | 'django.middleware.csrf.CsrfViewMiddleware',
47 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
48 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
49 | 'django.contrib.messages.middleware.MessageMiddleware',
50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
51 | 'django.middleware.security.SecurityMiddleware',
52 | )
53 |
54 | ROOT_URLCONF = 'jiekou.urls'
55 |
56 | TEMPLATES = [
57 | {
58 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
59 | 'DIRS': [os.path.join(BASE_DIR, 'templates')],
60 | 'APP_DIRS': True,
61 | 'OPTIONS': {
62 | 'context_processors': [
63 | 'django.template.context_processors.debug',
64 | 'django.template.context_processors.request',
65 | 'django.contrib.auth.context_processors.auth',
66 | 'django.contrib.messages.context_processors.messages',
67 | ],
68 | },
69 | },
70 | ]
71 |
72 | WSGI_APPLICATION = 'jiekou.wsgi.application'
73 |
74 |
75 | # Database
76 | # https://docs.djangoproject.com/en/1.8/ref/settings/#databases
77 |
78 | DATABASES = {
79 | 'default': {
80 | 'ENGINE': 'django.db.backends.mysql',
81 | 'HOST':'127.0.0.1',
82 | 'PORT':'3306',
83 | 'NAME': 'test',
84 | 'USER':'root',
85 | 'PASSWORD':'',
86 | }
87 | }
88 |
89 |
90 | # Internationalization
91 | # https://docs.djangoproject.com/en/1.8/topics/i18n/
92 |
93 | LANGUAGE_CODE = 'zh-hans'
94 |
95 | TIME_ZONE = 'UTC'
96 |
97 | USE_I18N = True
98 |
99 | USE_L10N = True
100 |
101 | USE_TZ = True
102 |
103 |
104 | # Static files (CSS, JavaScript, Images)
105 | # https://docs.djangoproject.com/en/1.8/howto/static-files/
106 |
107 | STATIC_URL = '/static/'
108 |
--------------------------------------------------------------------------------
/jiekou/jiekou/urls.py:
--------------------------------------------------------------------------------
1 | """jiekou URL Configuration
2 |
3 | The `urlpatterns` list routes URLs to views. For more information please see:
4 | https://docs.djangoproject.com/en/1.8/topics/http/urls/
5 | Examples:
6 | Function views
7 | 1. Add an import: from my_app import views
8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
9 | Class-based views
10 | 1. Add an import: from other_app.views import Home
11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
12 | Including another URLconf
13 | 1. Add an import: from blog import urls as blog_urls
14 | 2. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
15 | """
16 | from django.conf.urls import include, url
17 | from django.contrib import admin
18 | from myjiekou import views
19 | urlpatterns = [
20 | url(r'^admin/', include(admin.site.urls)),
21 | url(r'^index/', views.index),
22 | url(r'^api/', views.api),
23 | ]
24 |
--------------------------------------------------------------------------------
/jiekou/jiekou/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for jiekou project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiekou.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/jiekou/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiekou.settings")
7 |
8 | from django.core.management import execute_from_command_line
9 |
10 | execute_from_command_line(sys.argv)
11 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/myjiekou/__init__.py
--------------------------------------------------------------------------------
/jiekou/myjiekou/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | from myjiekou.models import MyModel
4 | # Register your models here.
5 |
6 | class MyAdmin(admin.ModelAdmin):
7 |     list_display = ["name", "age", "hobby"]
8 |
9 | admin.site.register(MyModel, MyAdmin)
10 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models, migrations
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ]
11 |
12 | operations = [
13 | migrations.CreateModel(
14 | name='MyModel',
15 | fields=[
16 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
17 | ('name', models.CharField(max_length=20)),
18 | ('age', models.CharField(max_length=100)),
19 | ('hobby', models.CharField(max_length=300)),
20 | ],
21 | ),
22 | ]
23 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/jiekou/myjiekou/migrations/__init__.py
--------------------------------------------------------------------------------
/jiekou/myjiekou/models.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | from django.db import models
3 |
4 | # Create your models here.
5 |
6 | class MyModel(models.Model):
7 |     # name
8 |     name = models.CharField(max_length=20)
9 |     # age
10 |     age = models.CharField(max_length=100)
11 |     # hobby
12 |     hobby = models.CharField(max_length=300)
13 |
14 |
15 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/jiekou/myjiekou/views.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | from django.shortcuts import render
3 | from django.http import HttpResponse, JsonResponse
4 | from myjiekou.models import MyModel
5 | import json
6 |
7 | def index(request):
8 |     content = MyModel.objects.all()
9 |     context = {"content": content}
10 |     return render(request, "myjiekou/index.html", context)
11 |
12 | def api(request):
13 |     results = []
14 |     content = MyModel.objects.all()
15 |     for one in content:
16 |         # build a fresh dict per record; reusing one dict would repeat the last row
17 |         item = {"name": one.name, "age": one.age, "hobby": one.hobby}
18 |         results.append(item)
19 |     return JsonResponse({"status": 200, "data": results})
--------------------------------------------------------------------------------
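
The api view returns a plain JSON payload, so the endpoint can be exercised with any HTTP client once the development server is up. A minimal sketch (not part of the repository) using the requests library, assuming the default `python manage.py runserver` address and the /api/ route from jiekou/urls.py:

    import requests

    # fetch the JSON produced by myjiekou.views.api
    payload = requests.get('http://127.0.0.1:8000/api/').json()
    print(payload['status'])
    for person in payload['data']:
        print(person['name'], person['age'], person['hobby'])
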
/jiekou/templates/myjiekou/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="UTF-8">
5 |     <title>Title</title>
6 | </head>
7 | <body>
8 | {% for item in content %}
9 | <ul>
10 |     <li>{{ item.name }}</li>
11 |     <li>{{ item.age }}</li>
12 |     <li>{{ item.hobby }}</li>
13 | </ul>
14 | {% endfor %}
15 | </body>
16 | </html>
--------------------------------------------------------------------------------
/teacherInfo/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/teacherInfo/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/teacherInfo/.idea/teacherInfo.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/teacherInfo/begin.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute('scrapy crawl myteacher'.split())
--------------------------------------------------------------------------------
/teacherInfo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = teacherInfo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = teacherInfo
12 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/teacherInfo/teacherInfo/__init__.py
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | # Item defines structured fields used to hold the scraped data
11 | class TeacherinfoItem(scrapy.Item):
12 |
13 |     # teacher name
14 |     name = scrapy.Field()
15 |     # title / position
16 |     position = scrapy.Field()
17 |     # personal profile
18 |     info = scrapy.Field()
19 |
20 |
21 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class TeacherinfoSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import json
9 | import codecs
10 | class TeacherinfoPipeline(object):
11 | def __init__(self):
12 | self.filename = codecs.open('teacher.json','wb','utf-8')
13 | def process_item(self, item, spider):
14 | print(item)
15 | html = json.dumps(dict(item),ensure_ascii=False)
16 | self.filename.write(html + '\n')
17 | return item
18 |
19 | def close_spider(self, spider):
20 | # 爬虫结束时关闭文件
21 | self.filename.close()
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for teacherInfo project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'teacherInfo'
13 |
14 | SPIDER_MODULES = ['teacherInfo.spiders']
15 | NEWSPIDER_MODULE = 'teacherInfo.spiders'
16 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'teacherInfo (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | SPIDER_MIDDLEWARES = {
50 | 'teacherInfo.middlewares.TeacherinfoSpiderMiddleware': 543,
51 | }
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'teacherInfo.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'teacherInfo.pipelines.TeacherinfoPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/teacherInfo/teacherInfo/spiders/myteacher.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from teacherInfo.items import TeacherinfoItem
4 |
5 | class MyteacherSpider(scrapy.Spider):
6 | name = 'myteacher'
7 | allowed_domains = ['itcast.cn']
8 | # start_urls = ("http://www.itcast.cn/channel/teacher.shtml",) 元组也可以
9 | start_urls = ['http://www.itcast.cn/channel/teacher.shtml#ac',
10 | 'http://www.itcast.cn/channel/teacher.shtml#acloud',
11 | 'http://www.itcast.cn/channel/teacher.shtml#adesign',
12 | 'http://www.itcast.cn/channel/teacher.shtml#ads',
13 | 'http://www.itcast.cn/channel/teacher.shtml#ajavaee',
14 | 'http://www.itcast.cn/channel/teacher.shtml#anetmarket',
15 | 'http://www.itcast.cn/channel/teacher.shtml#aphp',
16 | 'http://www.itcast.cn/channel/teacher.shtml#apm',
17 | 'http://www.itcast.cn/channel/teacher.shtml#apython',
18 | 'http://www.itcast.cn/channel/teacher.shtml#astack',
19 | 'http://www.itcast.cn/channel/teacher.shtml#atest',
20 | 'http://www.itcast.cn/channel/teacher.shtml#aui',
21 | 'http://www.itcast.cn/channel/teacher.shtml#auijp',
22 | 'http://www.itcast.cn/channel/teacher.shtml#aweb']
23 | # 爬虫的约束区域
24 | def parse(self, response):
25 | # 存放老师信息的集合
26 | items = []
27 | print(response.body)
28 | for each in response.xpath("//div[@class='li_txt']"):
29 | # 将我们得到的数据封装到一个 `ItcastItem` 对象
30 | item = TeacherinfoItem()
31 | # extract()方法返回的都是unicode字符串
32 | name = each.xpath("h3/text()").extract()
33 | position = each.xpath("h4/text()").extract()
34 | info = each.xpath("p/text()").extract()
35 |
36 | # xpath返回的是包含一个元素的列表
37 | item['name'] = name[0]
38 | item['position'] = position[0]
39 | item['info'] = info[0]
40 |
41 | items.append(item)
42 | yield item
43 | # 直接返回最后数据
44 | # return items
45 |
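A minimal sketch of how a spider such as `myteacher` is usually launched from a small helper script placed next to `scrapy.cfg` (assuming Scrapy is installed); it is equivalent to running `scrapy crawl myteacher` on the command line, with scraped items flowing through `TeacherinfoPipeline` into `teacher.json`:

from scrapy import cmdline

# Equivalent to typing `scrapy crawl myteacher` in the project directory
cmdline.execute("scrapy crawl myteacher".split())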
--------------------------------------------------------------------------------
/爬虫小demo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lb2281075105/Python-Spider/c53822e7b3bd96cd785361cacebe4edbb612ef4f/爬虫小demo/.DS_Store
--------------------------------------------------------------------------------
/爬虫小demo/01 taobao.py:
--------------------------------------------------------------------------------
1 | from urllib import request, parse, error
2 | import json
3 | import os
4 | import pymysql
5 | import ssl
6 | # 请求链接需要设置ssl认证
7 | ssl._create_default_https_context = ssl._create_unverified_context
8 |
9 |
10 | class TaoBao():
11 |
12 | def __init__(self):
13 | # 设置头部
14 | self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
15 | # 设置get参数
16 | self.params = {'_input_charset': 'utf-8',
17 | 'q': '',
18 | 'viewFlag': 'A',
19 | 'sortType': 'default',
20 | 'searchStyle': '',
21 | 'searchRegion': 'city',
22 | 'searchFansNum': '',
23 | 'currentPage': '',
24 | 'pageSize': '20'
25 | }
26 | self.url = 'https://mm.taobao.com/tstar/search/tstar_model.do'
27 |
28 |
29 | def get_connect(self):
30 |
31 | self.tablename = 'taobao'
32 | self.db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='test', charset='utf8')
33 | self.cur = self.db.cursor()
34 | self.cur.execute('USE test')
35 | try:
36 | # 创建表
37 | self.cur.execute('CREATE TABLE '+self.tablename+' (id BIGINT(7) NOT NULL AUTO_INCREMENT, name VARCHAR(100), city VARCHAR(20), height VARCHAR(10), weight VARCHAR(10), homepage VARCHAR(100), profile VARCHAR(100), pic VARCHAR(100), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))')
38 | except pymysql.err.InternalError as e:
39 | print(e)
40 | # 修改表字段
41 | self.cur.execute('ALTER DATABASE test CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
42 | self.cur.execute('ALTER TABLE '+self.tablename+' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
43 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE name name VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
44 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE city city VARCHAR(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
45 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE height height VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
46 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE weight weight VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
47 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE homepage homepage VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
48 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE profile profile VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
49 | self.cur.execute('ALTER TABLE '+self.tablename+' CHANGE pic pic VARCHAR(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
50 |
51 | def insert_table(self,name, city, height, weight, homepage, profile, pic):
52 | self.cur.execute('INSERT INTO '+self.tablename+' (name, city, height, weight, homepage, profile, pic) VALUES (%s, %s, %s, %s, %s, %s, %s)', (name, city, height, weight, homepage, profile, pic))
53 | self.cur.connection.commit()
54 |
55 |
56 | def get_html(self,page):
57 | self.params['currentPage'] = str(page)
58 | # urlencode可以把字典=键值对编码成url地址中get参数
59 | self.param = parse.urlencode(self.params).encode('utf-8')
60 | # data=self.param 上传参数
61 | req = request.Request(self.url, data=self.param, headers=self.headers)
62 | content = request.urlopen(req)
63 | content = json.loads(content.read().decode('gbk'))
64 | if content['status'] == -1:
65 | return -1
66 |
67 | return content
68 |
69 | def parser_json(self,content, page):
70 | meinvist = []
71 | # 解析json数据
72 | data = content['data']['searchDOList']
73 | for list in data:
74 | temp = {}
75 | temp['id'] = str(list['userId'])
76 | temp['name'] = list['realName']
77 | temp['city'] = list['city']
78 | temp['height'] = str(list['height'])
79 | temp['weight'] = str(list['weight'])
80 | temp['favornum'] = str(list['totalFavorNum'])
81 | temp['profile'] = 'http:'+list['avatarUrl']
82 | temp['pic'] = 'http:'+list['cardUrl']
83 |
84 | # meinvist.append(temp)
85 | self.mkdir(temp['name'])
86 | print('%s正在抓取%s'%(page, temp['name']))
87 | self.get_img(temp['profile'], temp['name'], 'profile')
88 | self.get_img(temp['pic'], temp['name'], 'pic')
89 | if not os.path.exists('./'+temp['name']+'/info.txt'):
90 | with open('./'+temp['name']+'/info.txt', 'w') as f:
91 | f.write(temp['name']+'\n')
92 | f.write(temp['city']+'\n')
93 | f.write(temp['height']+'\n')
94 | f.write(temp['weight']+'\n')
95 | # 插入数据库
96 | self.insert_table(temp['name'], temp['city'], temp['height'], temp['weight'], 'https://mm.taobao.com/self/aiShow.htm?userId='+temp['id'], temp['profile'], temp['pic'])
97 | # return meinvist
98 |
99 | # 判断文件夹是否存在
100 | def mkdir(self,path):
101 | if not os.path.exists(path):
102 | os.makedirs(path)
103 | else:
104 | print('目录已存在!')
105 |
106 | # 判断文件是否存在
107 | def get_img(self,url, path, name):
108 | if os.path.exists('./' + path + '/' + name + '.jpg'):
109 | print('文件已存在!')
110 | return 0
111 | try:
112 | req = request.Request(url, headers=self.headers)
113 | reponse = request.urlopen(req)
114 | get_img = reponse.read()
115 | with open('./' + path + '/' + name + '.jpg', 'wb') as fp:
116 | fp.write(get_img)
117 | # 也可以用一下代码实现图片的下载
118 | # request.urlretrieve(img, './' + path + '/' + name + '.jpg')
119 | except error.URLError as e:
120 | print(e.reason)
121 |
122 |
123 |
124 | if __name__ == '__main__':
125 | page = 1
126 | taobao = TaoBao()
127 | taobao.get_connect()
128 | while True:
129 | content = taobao.get_html(page)
130 | if content == -1:
131 | print('抓取完毕!')
132 | exit()
133 | # 解析json
134 | taobao.parser_json(content, page)
135 | page += 1
136 |
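The insert_table method above relies on pymysql's parameter binding: placeholders are written as bare %s (no surrounding quotes) and the driver escapes and quotes each value itself. A minimal, self-contained sketch of the same idea, assuming a local test database is available:

import pymysql

# Assumed local test database; adjust credentials as needed
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='test', charset='utf8mb4')
cur = conn.cursor()
# Bare %s placeholders: pymysql quotes and escapes the values, which also guards against SQL injection
cur.execute('INSERT INTO taobao (name, city) VALUES (%s, %s)', ('小明', '北京'))
conn.commit()
cur.close()
conn.close()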
--------------------------------------------------------------------------------
/爬虫小demo/02 doubanzhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
4 | # from selenium import webdriver
5 | # from selenium.webdriver.common.keys import Keys
6 | # import time
7 | #
8 | # driver = webdriver.PhantomJS(executable_path="/Users/yunmei/phantomjs-2.1.1-macosx/bin/phantomjs")
9 | # driver.get("https://www.douban.com/")
10 | #
11 | # # 输入账号密码
12 | # driver.find_element_by_name("form_email").send_keys("2334497007@qq.com")
13 | # driver.find_element_by_name("form_password").send_keys("lbaiwb1314")
14 | #
15 | # # 模拟点击登录
16 | # driver.find_element_by_xpath("//input[@class='bn-submit']").click()
17 | #
18 | # # 等待3秒
19 | # time.sleep(3)
20 | #
21 | # # 生成登陆后快照
22 | # driver.save_screenshot("douban.png")
23 | #
24 | # with open("douban.html", "w") as file:
25 | # file.write(driver.page_source.encode('utf-8'))
26 | #
27 | # driver.quit()
28 |
29 |
30 | from selenium import webdriver
31 | import time
32 | # 创建浏览器对象
33 | browser=webdriver.PhantomJS(executable_path="/Users/yunmei/phantomjs-2.1.1-macosx/bin/phantomjs")
34 | # 请求加载登录链接
35 | browser.get('https://www.zhihu.com/#signin')
36 | time.sleep(3)
37 | # 模拟点击使用密码登录
38 | browser.find_element_by_css_selector(".signin-switch-password").click()
39 | # 输入账号
40 | browser.find_element_by_css_selector(".account input[name='account']").send_keys('17078075655')
41 | # 输入密码
42 | browser.find_element_by_css_selector(".verification input[name='password']").send_keys('19910825580lb')
43 | # 模拟点击登录
44 | browser.find_element_by_css_selector(".sign-button").click()
45 | time.sleep(3)
46 | # 截图
47 | browser.save_screenshot("zhihu.png")
48 | browser.quit()
--------------------------------------------------------------------------------
/爬虫小demo/03 douYuUnittest.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # python的测试模块
4 | import unittest
5 | from selenium import webdriver
6 | from bs4 import BeautifulSoup
7 |
8 | class douyuSelenium(unittest.TestCase):
9 | # 初始化方法
10 | def setUp(self):
11 | self.driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs")
12 |
13 | #具体的测试用例方法,一定要以test开头
14 | def testDouyu(self):
15 | self.driver.get('http://www.douyu.com/directory/all')
16 | while True:
17 | # 指定xml解析
18 | soup = BeautifulSoup(self.driver.page_source, 'lxml')
19 | # 返回当前页面所有房间标题列表 和 观众人数列表
20 | titles = soup.find_all('h3', attrs={'class': 'ellipsis'})
21 | nums = soup.find_all('span', attrs={'class': 'dy-num fr'})
22 |
23 | # 使用zip()函数来可以把列表合并,并创建一个元组对的列表[(1,2), (3,4)]
24 | for title, num in zip(titles, nums):
25 | print u"房间标题: " + title.get_text().strip(), u"\t观众人数:" + num.get_text().strip()
26 | # page_source.find()未找到内容则返回-1
27 | if self.driver.page_source.find('shark-pager-disable-next') != -1:
28 | break
29 | # 模拟下一页点击
30 | self.driver.find_element_by_class_name('shark-pager-next').click()
31 |
32 | # 退出时的清理方法
33 | def tearDown(self):
34 | print '加载完成...'
35 | self.driver.quit()
36 |
37 | if __name__ == "__main__":
38 | unittest.main()
--------------------------------------------------------------------------------
/爬虫小demo/04 fileHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import csv
3 |
4 | # 1、txt文件
5 | file = open('file.txt','r+')
6 | # 获取所有的信息
7 | print file.read()
8 | file.write("你好")
9 | # 读写后指针在文件末尾,先 seek(0) 回到开头,再获取所有行组成的列表
10 | file.seek(0); print file.readlines()
11 | # 获取第一行(同样先回到文件开头)
12 | file.seek(0); print file.readline()
13 |
14 | # 2、读取csv文件
15 |
16 | writer = csv.writer(open('test.csv','wb'))
17 | writer.writerow(['col1','col2','col3'])
18 | data = [range(3) for i in range(3)]
19 | for item in data:
20 | writer.writerow(item)
21 |
22 | filelist = csv.reader(open('./test.csv','r'))
23 | for item in filelist:
24 | print item
25 |
26 |
27 | # 3、读取xml文件
28 |
29 | from xml.dom import minidom
30 | # parse打开xml文件
31 | dom = minidom.parse("info.xml")
32 | # 获取根节点
33 | root = dom.documentElement
34 | print root.nodeName
35 | print root.nodeValue
36 | print root.nodeType
37 | print root.ELEMENT_NODE
38 | print "--" * 8
39 | province = root.getElementsByTagName("province")
40 | print province[0].tagName
41 | print province[0].getAttribute("username")
42 | print province[0].firstChild.data
43 |
44 |
45 |
46 |
47 |
48 |
49 |
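The minidom section above reads an info.xml file that is not included in this listing; a hypothetical minimal document with the same shape (a root element containing province elements that carry a username attribute and text) makes the attribute and text access clearer:

from xml.dom import minidom

# Hypothetical sample with the structure the code above expects
sample = '''<?xml version="1.0" encoding="utf-8"?>
<country>
    <province username="xiaoming">广东</province>
    <province username="xiaohong">湖南</province>
</country>'''

dom = minidom.parseString(sample.encode('utf-8'))
root = dom.documentElement
province = root.getElementsByTagName("province")
print(province[0].tagName)                   # province
print(province[0].getAttribute("username"))  # xiaoming
print(province[0].firstChild.data)           # 广东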
--------------------------------------------------------------------------------
/爬虫小demo/05 getimage.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2
4 | import lxml.etree
5 |
6 | class GetImage():
7 |
8 | def __init__(self):
9 | self.tieba = "https://tieba.baidu.com"
10 | self.count = 50
11 |
12 | def get_html(self,url):
13 | request = urllib2.Request(url)
14 | response = urllib2.urlopen(request)
15 | html = response.read()
16 | return html
17 |
18 | def get_xpath(self):
19 | # 起始页
20 | beginPage = int(raw_input("请输入起始页:"))
21 | # 结束页
22 | endPage = int(raw_input("请输入结束页:"))
23 | for pagecount in range(beginPage,endPage + 1):
24 | pn = (pagecount - 1) * self.count
25 | urllink = self.tieba + "/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=" + str(pn)
26 | xmlcontent = lxml.etree.HTML(self.get_html(urllink))
27 | # content = xmlcontent.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
28 | # content = xmlcontent.xpath('//div[@class="threadlist_title pull_left j_th_tit "]//a[@class="j_th_tit "]/@href')
29 | content = xmlcontent.xpath('//a[@class="j_th_tit "]/@href')
30 |
31 | for item in content:
32 | itemcontent = lxml.etree.HTML(self.get_html(self.tieba + item))
33 | print self.tieba + item
34 | itemlist = itemcontent.xpath('//img[@class="BDE_Image"]//@src')
35 | for imageitem in itemlist:
36 | get_image = self.get_html(imageitem)
37 | with open("images/" + imageitem[-10:],'a') as file:
38 | file.write(get_image)
39 | file.close
40 |
41 | if __name__ == "__main__":
42 | getImages = GetImage()
43 | getImages.get_xpath()
--------------------------------------------------------------------------------
/爬虫小demo/06 jsload.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from selenium import webdriver
3 | from time import sleep
4 | from selenium.webdriver.common.keys import Keys
5 |
6 | driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs")
7 | driver.get("http://baidu.com/")
8 |
9 | driver.find_element_by_id("kw").send_keys(u"长城")
10 | sleep(10)
11 | driver.find_element_by_id("su").click()
12 |
13 | driver.save_screenshot("长城.png")
14 |
15 |
--------------------------------------------------------------------------------
/爬虫小demo/07 jsondata.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import json
4 | import jsonpath
5 | import urllib2
6 |
7 | class Json():
8 | def __init__(self):
9 | self.url = "http://www.lagou.com/lbs/getAllCitySearchLabels.json"
10 |
11 | def get_json(self):
12 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
13 | request = urllib2.Request(self.url,headers=headers)
14 | response = urllib2.urlopen(request)
15 | html = response.read()
16 | jsonobj = json.loads(html)
17 | # 获取城市名称
18 | namelist = jsonpath.jsonpath(jsonobj,'$..name')
19 | for name in namelist:
20 | print(name)
21 |
22 | # 把列表存储为字符串
23 | nametext = json.dumps(namelist,ensure_ascii=False)
24 | with open('name.txt','a') as file:
25 | file.write(nametext.encode("utf-8"))
26 | file.close
27 |
28 |
29 | if __name__ == "__main__":
30 | jsono = Json()
31 | jsono.get_json()
32 |
--------------------------------------------------------------------------------
/爬虫小demo/08 jsonpath和json总结.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
4 | import json
5 | import jsonpath
6 | import time
7 |
8 | # 1、第一种存储字典和数组
9 |
10 | listDict = [{"city": "北京"},{"name": "小明"}]
11 | strlist = json.dumps(listDict, ensure_ascii=False)
12 | print type(strlist) # <type 'str'>
13 | # 写数据
14 | with open("listDict.json",'w') as file:
15 | file.write(strlist)
16 |
17 | # 2、第二种存储字典和数组
18 | listStr = [{"city": "北京"}, {"name": "大刘"}]
19 | json.dump(listStr, open("listStr.json","w"), ensure_ascii=False)
20 |
21 | dictStr = {"city": "北京", "name": "大刘"}
22 | json.dump(dictStr, open("dictStr.json","w"), ensure_ascii=False)
23 | time.sleep(1)
24 |
25 | # ------------ 从文件里面取数据 ---------
26 |
27 | dictList = json.load(open("listDict.json",'r'))
28 | # 输出北京
29 | print dictList[0]["city"]
30 | # ------------ 读出字典loads ----------
31 | strDict = '{"city": "北京", "name": "大猫"}'
32 | # loads 把字符串转成字典,输出 <type 'dict'>
33 | print type(json.loads(strDict))
34 |
35 | jsonobj = json.loads(strDict)
36 |
37 | # 从根节点开始,匹配name节点
38 | citylist = jsonpath.jsonpath(jsonobj,'$..name')
39 |
40 | print citylist[0].encode('utf-8')
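To make the $..name expression above concrete, a small sketch of a few common JSONPath patterns with the same jsonpath module, run against a made-up nested structure:

import jsonpath

data = {"store": {"book": [{"title": "python", "price": 10},
                           {"title": "scrapy", "price": 20}]}}

print(jsonpath.jsonpath(data, '$.store.book[0].title'))  # ['python']  absolute path from the root
print(jsonpath.jsonpath(data, '$.store.book[*].title'))  # ['python', 'scrapy']  wildcard over the list
print(jsonpath.jsonpath(data, '$..price'))               # [10, 20]  recursive descent, like $..name above
# Note: jsonpath.jsonpath() returns False when nothing matches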
--------------------------------------------------------------------------------
/爬虫小demo/09 zhihu_login.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from bs4 import BeautifulSoup
3 | import requests
4 | import time
5 |
6 |
7 | class Login():
8 | # 模拟登录一般步骤:(1)首先抓包,根据webForm来分析需要传那些data
9 | # (2)分析_xsrf获取
10 | # (3)分析验证码获取方式
11 | # (4)post登录
12 |
13 | def get_login(self):
14 | sess=requests.Session()
15 | # 头部headers需要注意,如果头部没有设置好,下面的步骤就会不能执行成功
16 | headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
17 | # 首先获取登录页面,找到需要get的数据,同时记录cookie的值
18 | html=sess.get('https://www.zhihu.com/#signin',headers=headers).text
19 | # 调用xml解析库
20 | bs=BeautifulSoup(html,'lxml')
21 | # _xsrf作用是跨站请求伪造(或者叫跨域攻击)
22 | _xsrf=bs.find('input',attrs={'name':'_xsrf'}).get('value')
23 | # 通过时间戳拼接验证码链接
24 | captcha_url='https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000)
25 | # 发送验证码请求,获取图片数据流。
26 | captchadata = sess.get(captcha_url, headers=headers).content
27 | text = self.captcha(captchadata)
28 |
29 | data={
30 | '_xsrf':_xsrf,
31 | 'phone_num':'17078075655',# 换成邮箱登录也可
32 | 'password':'lbaiwb1314',
33 | 'captcha':text
34 | }
35 | response=sess.post('https://www.zhihu.com/login/phone_num',data=data,headers=headers)
36 | # print type(response.text)
37 | # 在个人中心请求一下是否真正登录成功
38 | response=sess.get('https://www.zhihu.com/people/liu-tao-98-32/activities',headers=headers)
39 | with open("mylogin.txt", "w") as file:
40 | file.write(response.text.encode("utf-8"))
41 |
42 | def captcha(self,captcha_data):
43 | # 将二进制数据写入到文件中
44 | with open('captcha.jpg','wb')as f:
45 | f.write(captcha_data)
46 | text=raw_input('请输入登录验证码')
47 | return text
48 |
49 | if __name__=='__main__':
50 |
51 | login = Login()
52 | login.get_login()
53 |
--------------------------------------------------------------------------------
/爬虫小demo/10 match.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import re
4 | import urllib2
5 |
6 | class Content:
7 |
8 | def __init__(self):
9 | self.page = 1
10 |
11 | def get_html(self):
12 | # 获取整个网页的html内容
13 | headers = {
14 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"}
15 | url = "http://www.neihan8.com/article/list_5_"+str(self.page)+".html"
16 | request = urllib2.Request(url=url, headers=headers)
17 | response = urllib2.urlopen(request)
18 | html = response.read()
19 | return html
20 |
21 | def get_content(self):
22 | pattern = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)
23 | content_list = pattern.findall(self.get_html())
24 | for content in content_list:
25 | result_content = content.decode('gbk').replace("<p>", "").replace("</p>", "") \
26 | .replace("“", "").replace("<br />", "") \
27 | .replace("”", "").replace("&hellip", "")
28 |
29 | with open("content.txt", "a") as file:
30 | file.write(result_content.encode("utf-8"))
31 | file.close
32 |
33 | if __name__ == "__main__":
34 |
35 | content = Content()
36 | while True:
37 | content.page+=1
38 | print content.page
39 | content.get_content()
40 |
41 | """
42 | r 打开只读文件,该文件必须存在。
43 | r+ 打开可读写的文件,该文件必须存在。
44 | w 打开只写文件,若文件存在则文件长度清为0,即该文件内容会消失。若文件不存在则建立该文件。
45 | w+ 打开可读写文件,若文件存在则文件长度清为零,即该文件内容会消失。若文件不存在则建立该文件。
46 | a 以附加的方式打开只写文件。若文件不存在,则会建立该文件,如果文件存在,写入的数据会被加到文件尾,即文件原先的内容会被保留。
47 | a+ 以附加方式打开可读写的文件。若文件不存在,则会建立该文件,如果文件存在,写入的数据会被加到文件尾后,即文件原先的内容会被保留。
48 | """
--------------------------------------------------------------------------------
/爬虫小demo/11 neihan.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 |
4 | import urllib2
5 | import re
6 |
7 | class Spider:
8 | def __init__(self):
9 | # 初始化起始页位置
10 | self.page = 1
11 | # 爬取开关,如果为True继续爬取
12 | self.switch = True
13 |
14 | def loadPage(self):
15 | """
16 | 作用:下载页面
17 | """
18 | print "正在下载数据...."
19 | url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
20 | headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
21 | request = urllib2.Request(url, headers = headers)
22 | response = urllib2.urlopen(request)
23 |
24 | # 获取每页的HTML源码字符串
25 | html = response.read()
26 | #print html
27 |
28 | # 创建正则表达式规则对象,匹配每页里的段子内容,re.S 表示匹配全部字符串内容
29 | pattern = re.compile('<div class="f18 mb20">(.*?)</div>', re.S)
30 |
31 | # 将正则匹配对象应用到html源码字符串里,返回这个页面里的所有段子的列表
32 | content_list = pattern.findall(html)
33 |
34 | # 调用dealPage() 处理段子里的杂七杂八
35 | self.dealPage(content_list)
36 |
37 | def dealPage(self, content_list):
38 | """
39 | 处理每页的段子
40 | content_list : 每页的段子列表集合
41 | """
42 | for item in content_list:
43 | # 将集合里的每个段子按个处理,替换掉无用数据
44 | item = item.replace("<p>","").replace("</p>", "").replace("<br />", "")
45 | #print item.decode("gbk")
46 | # 处理完后调用writePage() 将每个段子写入文件内
47 | self.writePage(item)
48 |
49 | def writePage(self, item):
50 | """
51 | 把每条段子逐个写入文件里
52 | item: 处理后的每条段子
53 | """
54 | # 写入文件内
55 | print "正在写入数据...."
56 | with open("duanzi.txt", "a") as f:
57 | f.write(item)
58 |
59 | def startWork(self):
60 | """
61 | 控制爬虫运行
62 | """
63 | # 循环执行,直到 self.switch == False
64 | while self.switch:
65 | # 用户确定爬取的次数
66 | self.loadPage()
67 | command = raw_input("如果继续爬取,请按回车(退出输入quit)")
68 | if command == "quit":
69 | # 如果停止爬取,则输入 quit
70 | self.switch = False
71 | # 每次循环,page页码自增1
72 | self.page += 1
73 | print "谢谢使用!"
74 |
75 |
76 | if __name__ == "__main__":
77 | duanziSpider = Spider()
78 | # duanziSpider.loadPage()
79 | duanziSpider.startWork()
80 |
81 |
--------------------------------------------------------------------------------
/爬虫小demo/12 PIL.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding:utf-8 -*-
3 | import pytesseract
4 | from PIL import Image
5 |
6 | # PIL读取与存储图像
7 |
8 | # 1、PIL识别图片上面文字
9 | images = Image.open('test.png')
10 | text = pytesseract.image_to_string(images)
11 | print text
12 |
13 | # 2、PIL保存成灰色图片
14 | # -*- coding: utf-8 -*-
15 | from PIL import Image
16 |
17 | # 打开图像得到一个PIL图像对象
18 | img = Image.open("test.png")
19 | # 将其转为一张灰度图
20 | img = img.convert('L')
21 | # 存储该张图片
22 | try:
23 | img.save("test.png")
24 | except IOError:
25 | print "cannot convert"
26 |
27 |
28 | # 3、PIL生成缩略图
29 | # -*- coding: utf-8 -*-
30 | from PIL import Image
31 |
32 | # 打开图像得到一个PIL图像对象
33 | img = Image.open("test.png")
34 | # 创建最长边为128的缩略图
35 | img.thumbnail((128,128))
36 | # 存储该张图片
37 | try:
38 | img.save("test.png")
39 | except IOError:
40 | print "cannot convert"
41 |
42 |
43 | # 4、PIL调整尺寸与旋转
44 | # -*- coding: utf-8 -*-
45 | from PIL import Image
46 |
47 | # 打开图像得到一个PIL图像对象
48 | img = Image.open("test.png")
49 | # 修改图片大小,参数为一元组
50 | img = img.resize((100,200))
51 | # 使图片逆时针旋转45度
52 | img = img.rotate(45)
53 | # 存储该张图片
54 | try:
55 | img.save("test.png")
56 | except IOError:
57 | print "cannot convert"
58 |
59 |
60 |
--------------------------------------------------------------------------------
/爬虫小demo/13 queryxpath.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2
4 | import json
5 | import lxml.etree
6 | # xpath 模糊查询
7 |
8 | class XpathQuery():
9 | def __init__(self):
10 | self.url = "https://www.qiushibaike.com/"
11 |
12 |
13 | def get_html(self):
14 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
15 | request = urllib2.Request(self.url,headers=headers)
16 | response = urllib2.urlopen(request)
17 | html = response.read()
18 | return html
19 |
20 | def get_xpath(self):
21 | xmlcontent = lxml.etree.HTML(self.get_html())
22 | xmllist = xmlcontent.xpath('//div[contains(@id,"qiushi_tag_")]')
23 | print len(xmllist)
24 | # 分享的地方
25 | sharelist = xmlcontent.xpath('//div[@class="article block untagged mb15 typs_recent"]//div[@class="single-share"]/a/@title')
26 | for item in range(0,4):
27 | print sharelist[item]
28 |
29 | for item in xmllist:
30 | # 用户名
31 | username = item.xpath('.//div[@class="author clearfix"]/a/h2/text()')
32 | # 标题
33 | title = item.xpath('.//a/div[@class="content"]/span/text()')[0]
34 |
35 | with open('title.txt','a') as file:
36 | file.write(title.encode("utf-8"))
37 | file.close
38 | with open('username.txt','a') as file:
39 | if len(username) == 0:
40 | file.write("匿名用户")
41 | else:
42 | file.write(username[0].encode("utf-8"))
43 |
44 | # 好笑数
45 | votecount = item.xpath('.//span[@class="stats-vote"]/i[@class="number"]/text()')[0]
46 | print "好笑数:" + votecount
47 | # 评论数
48 | commentcount = item.xpath('.//span[@class="stats-comments"]//i[@class="number"]/text()')[0]
49 | print "评论数:" + commentcount
50 | # 放在一个字典里进行存储
51 | dic = {
52 | "username":username,
53 | "votecount":votecount,
54 | "commentcount":commentcount,
55 | "title": title,
56 | }
57 | with open('qiushi.json','a') as file:
58 | file.write(json.dumps(dic,ensure_ascii=False).encode("utf-8") + '\n')
59 | file.close
60 |
61 |
62 | if __name__ == "__main__":
63 | xpathq = XpathQuery()
64 | xpathq.get_xpath()
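The contains(@id,"qiushi_tag_") predicate above is XPath's substring matching; a minimal lxml sketch on a made-up snippet showing contains() and the related starts-with():

import lxml.etree

html = lxml.etree.HTML('<div id="qiushi_tag_1">a</div><div id="qiushi_tag_2">b</div><div id="other">c</div>')
# contains(): the attribute value contains the given substring
print(html.xpath('//div[contains(@id, "qiushi_tag_")]/text()'))   # ['a', 'b']
# starts-with(): the attribute value begins with the given prefix
print(html.xpath('//div[starts-with(@id, "qiushi")]/text()'))     # ['a', 'b']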
--------------------------------------------------------------------------------
/爬虫小demo/14 selenium执行js.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from selenium import webdriver
4 | import time
5 | driver = webdriver.PhantomJS(executable_path="./phantomjs-2.1.1-macosx/bin/phantomjs")
6 | driver.get("https://www.baidu.com/")
7 |
8 | # 给搜索输入框标红的javascript脚本
9 | js = "var q=document.getElementById(\"kw\");q.style.border=\"2px solid red\";"
10 |
11 | # 调用给搜索输入框标红js脚本
12 | driver.execute_script(js)
13 |
14 | # 查看页面快照
15 | driver.save_screenshot("redbaidu.png")
16 |
17 | # js隐藏元素,将获取的图片元素隐藏
18 | img = driver.find_element_by_xpath("//div[@id='lg']/img")
19 | driver.execute_script('$(arguments[0]).fadeOut()',img)
20 |
21 | # 向下滚动到页面底部
22 | # driver.execute_script("$('.scroll_top').click(function(){$('html,body').animate({scrollTop: '0px'}, 800);});")
23 | time.sleep(1)
24 | # 查看页面快照
25 | driver.save_screenshot("wubaidu.png")
26 |
27 | driver.quit()
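execute_script can also return a value computed in the page and scroll without relying on jQuery; a short sketch reusing the driver above (it would have to run before driver.quit()):

# Return values from the page back to Python
title = driver.execute_script("return document.title;")
height = driver.execute_script("return document.body.scrollHeight;")
print(title, height)
# Scroll to the bottom using plain DOM APIs instead of jQuery
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")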
--------------------------------------------------------------------------------
/爬虫小demo/15 tencent.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from bs4 import BeautifulSoup
4 | import urllib2
5 |
6 | class Tencent():
7 | def __init__(self):
8 | self.url = 'http://hr.tencent.com/position.php?&start=10#a'
9 |
10 | def get_html(self):
11 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
12 | request = urllib2.Request(self.url,headers=headers)
13 | html = urllib2.urlopen(request)
14 | return html
15 |
16 | def get_content(self):
17 | techlist = []
18 | soup = BeautifulSoup(self.get_html(),'lxml')
19 | positionlist = soup.select('.l > a')
20 | even = soup.select('.even')
21 | odd = soup.select('.odd')
22 | techlist = even + odd  # 合并偶数行和奇数行(下面分别遍历 even 和 odd)
23 |
24 | for position in positionlist:
25 | with open("position.txt",'a') as file:
26 | file.write(position.string.encode("utf-8") + "\n")
27 | file.close
28 |
29 | for technology in even:
30 | with open("technology.txt",'a') as file:
31 | file.write("" + technology.select('td')[1].string.encode("utf-8"))
32 | file.write(" 人数:" + technology.select('td')[2].string.encode("utf-8"))
33 | file.write(" 地点:" + technology.select('td')[3].string.encode("utf-8"))
34 | file.write(" 时间:" + technology.select('td')[4].string.encode("utf-8") + "\n")
35 | file.close
36 |
37 | for technology in odd:
38 | with open("technology.txt",'a') as file:
39 | file.write("" + technology.select('td')[1].string.encode("utf-8"))
40 | file.write(" 人数:" + technology.select('td')[2].string.encode("utf-8"))
41 | file.write(" 地点:" + technology.select('td')[3].string.encode("utf-8"))
42 | file.write(" 时间:" + technology.select('td')[4].string.encode("utf-8") + "\n")
43 | file.close
44 |
45 | # items = {} 也可以这么存储数据到文件
46 | # items["name"] = name
47 | # str = json.dumps(items, ensure_ascii=False)
48 | # output.write(line.encode('utf-8'))
49 | # output.close()
50 | if __name__ == "__main__":
51 | tencent = Tencent()
52 | tencent.get_content()
--------------------------------------------------------------------------------
/爬虫小demo/16 xunmall.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2,os
4 | import lxml.etree
5 |
6 | class Xunmall():
7 | def __init__(self):
8 | self.url = "http://www.xunmall.com"
9 |
10 | def get_html(self,p1 = ""):
11 | # headers = {
12 | # "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"}
13 | request = urllib2.Request(self.url + p1)
14 | response = urllib2.urlopen(request)
15 | html = response.read()
16 | return html
17 |
18 | def get_xpath(self):
19 | xmlcontent = lxml.etree.HTML(self.get_html())
20 | xmllist = xmlcontent.xpath('//h2[@class="floor_name"]/text()')
21 |
22 | for item in xmllist:
23 | with open('title.txt','a') as file:
24 | file.write(item.encode('utf-8') + '\n')
25 | file.close
26 |
27 |
28 | def get_image(self):
29 | xmlimage = lxml.etree.HTML(self.get_html())
30 | imagelist = xmlimage.xpath('//div[@class="color_top"]/img/@src')
31 | if os.path.isdir('./imgs'):
32 | pass
33 | else:
34 | os.mkdir("./imgs")
35 | for item in imagelist:
36 | # print self.url + item
37 | with open('imgs/' + (self.url + item)[-8:],'a+') as file:
38 | file.write(self.get_html(item))
39 | file.close
40 |
41 | def get_theme(self):
42 | xmltheme = lxml.etree.HTML(self.get_html())
43 | themelist = xmltheme.xpath('//h3[@class="floor_theme"]/text()')
44 |
45 | for item in themelist:
46 | with open('theme.txt','a') as file:
47 | file.write(item.encode('utf-8') + '\n')
48 | file.close
49 |
50 | sloganlist = xmltheme.xpath('//p[@class="slogan"]/text()')
51 | for item in sloganlist:
52 | with open('theme.txt','a') as file:
53 | file.write(item.encode('utf-8') + '\n')
54 | file.close
55 |
56 | give_outlist = xmltheme.xpath('//p[@class="give_out"]/text()')
57 | for item in give_outlist:
58 | with open('theme.txt', 'a') as file:
59 | file.write(item.encode('utf-8') + '\n')
60 | file.close
61 |
62 | def get_html1(self,p2):
63 | request = urllib2.Request(p2)
64 | response = urllib2.urlopen(request)
65 | html = response.read()
66 | return html
67 |
68 | # 食品标题和图片
69 | def foodImageTitle(self):
70 | foodImage = lxml.etree.HTML(self.get_html())
71 | foodImageList = foodImage.xpath('//div[@class="pro_image"]/img/@src')
72 |
73 | if os.path.isdir('./foodimage'):
74 | pass
75 | else:
76 | os.mkdir("./foodimage")
77 | for item in foodImageList:
78 | # print item
79 | with open('foodimage/' + item[-20:],'a+') as file:
80 | file.write(self.get_html1(item))
81 | file.close
82 |
83 | # 每个零食的详细信息(标题、图片、副标题)
84 | def detail(self):
85 | detailLink = lxml.etree.HTML(self.get_html())
86 | detailLinkList = detailLink.xpath('//div[@class="nth_floor first_floor"]/div[@class="goods_box"]/ul[@class="item_list"]//a/@href')
87 | for item in detailLinkList:
88 | # print item[-18:]
89 | detailUrl = lxml.etree.HTML(self.get_html("/" + item[-18:]))
90 | detailImageList = detailUrl.xpath(
91 | '//div[@class="info-panel panel1"]/img/@src')
92 |
93 | for detailitem in detailImageList:
94 | # print '正在下载详情图片'
95 |
96 | if os.path.isdir('./' + item[-18:-5]):
97 | pass
98 | else:
99 | os.mkdir("./" + item[-18:-5])
100 |
101 | with open(item[-18:-5] + '/' + detailitem[-9:], 'a+') as file:
102 | file.write(self.get_html1(detailitem))
103 | file.close
104 | # 商品标题
105 | detailtitleList = detailUrl.xpath(
106 | '//div[@class="col-lg-7 item-inner"]//h1[@class="fl"]/text()')
107 |
108 | for title in detailtitleList:
109 | with open('foodtitle.txt', 'a+') as file:
110 | file.write(title.encode('utf-8') + '\n')
111 | file.close
112 | # 商品编号
113 | goodnumberList = detailUrl.xpath(
114 | '//div[@class="col-lg-7 item-inner"]//li[@class="col-lg-5 col-md-5"]/text()')
115 | for number in goodnumberList:
116 | # print number
117 | if os.path.isdir('./qrcoder'):
118 | pass
119 | else:
120 | os.mkdir("./qrcoder")
121 |
122 | with open('qrcoder', 'a+') as file:
123 | file.write(number.encode('utf-8') + '\n')
124 | file.close
125 |
126 |
127 | # 商品二维码:data_code
128 | coderImageList = detailUrl.xpath('//div[@class="clearfixed"]//div[@class="barcode fr"]/img/@data_code')
129 |
130 | for item in coderImageList:
131 | # print item
132 | with open('goodnumber.txt', 'a+') as file:
133 | file.write(item + '\n')
134 | file.close
135 |
136 |
137 |
138 |
139 | if __name__ == "__main__":
140 | # 获取分类标题
141 | xunmall = Xunmall()
142 | # xunmall.get_xpath()
143 | # 获取图片
144 | # xunmall.get_image()
145 | # 图片上面的标题
146 | # xunmall.get_theme()
147 | # 休闲食品标题和图片
148 | # xunmall.foodImageTitle()
149 | xunmall.detail()
--------------------------------------------------------------------------------
/爬虫小demo/17 zhihulogin.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import urllib2
4 | import lxml.etree
5 | class Login():
6 | def __init__(self):
7 | self.url = "https://www.zhihu.com/#signin"
8 |
9 | def get_html(self):
10 | # headers = {
11 | # "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36"}
12 | request = urllib2.Request(self.url)
13 | response = urllib2.urlopen(request)
14 | html = response.read()
15 | return html
16 |
17 | def get_xpath(self):
18 | # print self.get_html()
19 | xmlcontent = lxml.etree.HTML(self.get_html())
20 | xmllist = xmlcontent.xpath('//div[@class="view view-signin"]/form/input/@value')
21 |
22 | for item in xmllist:
23 | print item
24 | with open('title.txt','a') as file:
25 | file.write(item.encode('utf-8') + '\n')
26 | file.close
27 |
28 |
29 | if __name__ == "__main__":
30 | login = Login()
31 | login.get_xpath()
--------------------------------------------------------------------------------
/爬虫小demo/18 github_login.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | '''
3 | 模拟Github登陆步骤:
4 | 1、请求头:self.headers,请求url
5 | 2、设置session,保存登陆信息cookies,生成github_cookie文件
6 | 3、POST表单提交,请求数据格式post_data
7 | 4、authenticity_token获取
8 | 5、在个人中心验证判断是否登陆成功,输出个人中心信息即登陆成功
9 |
10 | '''
11 |
12 | import requests
13 | from lxml import etree
14 | try:
15 | import cookielib
16 | except:
17 | import http.cookiejar as cookielib
18 |
19 | class GithubLogin():
20 |
21 | def __init__(self):
22 | # url
23 | self.loginUrl = 'https://github.com/login'
24 | self.postUrl = 'https://github.com/session'
25 | self.profileUrl = 'https://github.com/settings/profile'
26 |
27 | # 设置请求头
28 | self.headers = {
29 | 'Referer': 'https://github.com/',
30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
31 | 'Host': 'github.com'
32 | }
33 |
34 | # 设置session
35 | self.session = requests.session()
36 | # 生成github_cookie文件
37 | self.session.cookies = cookielib.LWPCookieJar(filename='github_cookie')
38 |
39 | '''
40 | 登陆时表单提交参数
41 | Form Data:
42 | commit:Sign in
43 | utf8:✓
44 | authenticity_token:yyZprIm4aghZ0u7r25ymZjisfTjGdUAdDowD9fKHM0oUvHD1WjUHbn2sW0Cz1VglZWdGno543jod2M8+jwLv6w==
45 | login:*****
46 | password:******
47 |
48 | '''
49 | def post_account(self, email, password):
50 | post_data = {
51 | 'commit': 'Sign in',
52 | 'utf8': '✓',
53 | 'authenticity_token': self.get_token()[0],
54 | 'login': email,
55 | 'password': password
56 | }
57 | response = self.session.post(self.postUrl, data=post_data, headers=self.headers)
58 | # 保存cookies
59 | self.session.cookies.save()
60 |
61 | def load_cookie(self):
62 | try:
63 | self.session.cookies.load(ignore_discard=True)
64 | except:
65 | print('cookie 获取不成功')
66 |
67 | # 获取authenticity_token
68 | def get_token(self):
69 | response = self.session.get(self.loginUrl, headers=self.headers)
70 | html = etree.HTML(response.text)
71 | authenticity_token = html.xpath('//div/input[2]/@value')
72 | print(authenticity_token)
73 | return authenticity_token
74 |
75 | # 判断是否登陆成功
76 | def isLogin(self):
77 | self.load_cookie()
78 | response = self.session.get(self.profileUrl, headers=self.headers)
79 | selector = etree.HTML(response.text)
80 | flag = selector.xpath('//div[@class="column two-thirds"]/dl/dt/label/text()')
81 | info = selector.xpath('//div[@class="column two-thirds"]/dl/dd/input/@value')
82 | textarea = selector.xpath('//div[@class="column two-thirds"]/dl/dd/textarea/text()')
83 | # 登陆成功返回来的个人设置信息
84 | print(u'个人设置Profile标题: %s'%flag)
85 | print(u'个人设置Profile内容: %s'%info)
86 | print(u'个人设置Profile内容: %s'%textarea)
87 |
88 | if __name__ == "__main__":
89 | github = GithubLogin()
90 | # 输入自己email账号和密码
91 | github.post_account(email='******', password='******')
92 | # 验证是否登陆成功
93 | github.isLogin()
--------------------------------------------------------------------------------
/爬虫小demo/19 jd_login.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 |
7 | class JDlogin():
8 | def __init__(self, username, password):
9 | self.session = requests.session()
10 | self.loginUrl = "http://passport.jd.com/uc/login"
11 | self.postUrl = "http://passport.jd.com/uc/loginService"
12 | self.authUrl = "https://passport.jd.com/uc/showAuthCode"
13 | self.username = username
14 | self.password = password
15 |
16 | # 设置请求头
17 | self.headers = {
18 | 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
19 | }
20 |
21 | def get_authcode(self, url):
22 | self.headers['Host'] = 'authcode.jd.com'
23 | self.headers['Referer'] = 'https://passport.jd.com/uc/login'
24 | response = self.session.get(url, headers=self.headers)
25 | with open('codeimage.jpg', 'wb') as f:
26 | f.write(response.content)
27 | authcode = input("请输入验证码:")
28 | return authcode
29 |
30 | def get_info(self):
31 | data = {}  # 先初始化,防止请求异常时 finally 返回未定义的变量
32 | try:
33 | # 登陆请求
34 | html = self.session.get(self.loginUrl, headers=self.headers)
35 | soup = BeautifulSoup(html.text,"lxml")
36 | inputList = soup.select('.form input')
37 | print(inputList)
38 | data = {}
39 | data['uuid'] = inputList[0]['value']
40 | data['eid'] = inputList[4]['value']
41 | data['fp'] = inputList[5]['value']
42 | data['_t'] = inputList[6]['value']
43 | rstr = inputList[7]['name']
44 | data[rstr] = inputList[7]['value']
45 | acRequired = self.session.post(self.authUrl, data={
46 | 'loginName': self.username}).text
47 |
48 | if 'true' in acRequired:
49 |
50 | acUrl = soup.select('.form img')[0]['src2']
51 | acUrl = 'http:{}&yys={}'.format(acUrl, str(int(time.time() * 1000)))
52 | authcode = self.get_authcode(acUrl)
53 | data['authcode'] = authcode
54 | else:
55 | data['authcode'] = ''
56 |
57 | except Exception as e:
58 | print(e)
59 | finally:
60 | return data
61 |
62 | def jd_login(self):
63 |
64 | data = self.get_info()
65 | # Form表单提交数据
66 | # 1、loginname、nloginpwd、loginpwd是在网页中input属性值name,作为表单值提交到登陆请求
67 | # 2、在此处也可以用selenium来进行给输入框(登陆账号、登陆密码)进行赋值
68 |
69 | data['loginname'] = self.username
70 | data['nloginpwd'] = self.password
71 | data['loginpwd'] = self.password
72 | try:
73 | self.headers['Host'] = 'passport.jd.com'
74 | html = self.session.post(self.postUrl, data=data, headers=self.headers)
75 | # 在这里可以判断请求是否判断成功不成功
76 | print(html.text)
77 | except Exception as e:
78 | print(e)
79 |
80 |
81 | if __name__ == "__main__":
82 | # 在下面输入账号名、密码
83 | jdlogin = JDlogin("******", "******")
84 | jdlogin.jd_login()
85 |
--------------------------------------------------------------------------------
/爬虫小demo/20 下载网易云歌词.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import json
7 | import re
8 | from urllib import request
9 |
10 | # 1、获取网页
11 | def get_html(url):
12 | headers = {
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47',
14 | 'Referer': 'http://music.163.com/',
15 | 'Host': 'music.163.com'
16 | }
17 |
18 | try:
19 | response = requests.get(url, headers=headers)
20 | html = response.text
21 | return html
22 | except:
23 | print('request error')
24 |
25 | def get_text(song_id):
26 | url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(song_id) + '&lv=1&kv=1&tv=-1'
27 | html = get_html(url)
28 | json_obj = json.loads(html)
29 | text = json_obj['lrc']['lyric']
30 | regex = re.compile(r'\[.*\]')
31 | finalLyric = re.sub(regex, '', text).strip()
32 | return finalLyric
33 |
34 | def write_text(song_name,text):
35 | print("正在写入歌曲:{}".format(song_name))
36 | with open("{}.txt".format(song_name),'a',encoding='utf-8') as fp:
37 | fp.write(text)
38 |
39 | def getSingerInfo(html):
40 | soup = BeautifulSoup(html, 'lxml')
41 | links = soup.find('ul', class_='f-hide').find_all('a')
42 | song_IDs = []
43 | song_names = []
44 | for link in links:
45 | song_ID = link.get('href').split('=')[-1]
46 | song_name = link.get_text()
47 | song_IDs.append(song_ID)
48 | song_names.append(song_name)
49 | return zip(song_names, song_IDs)
50 |
51 | def downloadSong(songName,songId):
52 | singer_url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songId)
53 | print('正在下载歌曲:{}'.format(songName))
54 | request.urlretrieve(singer_url,'{}.mp3'.format(songName))
55 |
56 |
57 |
58 | if __name__ == "__main__":
59 | singerId = input("请输入歌手的ID:")
60 | startUrl = "http://music.163.com/artist?id={}".format(singerId)
61 | html = get_html(startUrl)
62 | singerInfos = getSingerInfo(html)
63 |
64 | for singerInfo in singerInfos:
65 | print(singerInfo[1],singerInfo[0])
66 | text = get_text(singerInfo[1])
67 | # 下载歌曲文本
68 | write_text(singerInfo[0],text)
69 | # 下载歌曲mp3
70 | downloadSong(singerInfo[0],singerInfo[1])
71 |
72 |
73 |
74 |
75 |
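A worked example of the timestamp-stripping regex used in get_text(); the sample lyric lines are made up, and each [mm:ss.xxx] prefix is removed by re.sub:

import re

sample = "[00:12.570]第一句歌词\n[00:17.340]第二句歌词"
print(re.sub(r'\[.*\]', '', sample).strip())
# 第一句歌词
# 第二句歌词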
--------------------------------------------------------------------------------
/爬虫小demo/21 TaoBaoInfo.py:
--------------------------------------------------------------------------------
1 | from urllib import request
2 | import re, os, datetime
3 | from selenium import webdriver
4 | import ssl
5 |
6 | ssl._create_default_https_context = ssl._create_unverified_context
7 |
8 |
9 | class TaoBaoInfo:
10 | def __init__(self):
11 | self.dirName = 'MyTaoBaoInfo'
12 | self.driver = webdriver.PhantomJS(executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs')
13 | self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
14 |
15 | # 获取页面内容提取
16 | def getPageContent(self, page):
17 |
18 | url = "https://mm.taobao.com/json/request_top_list.htm?page=" + str(page)
19 | response = request.Request(url, headers = self.headers)
20 | response = request.urlopen(response)
21 |
22 | # 正则获取
23 | pattern_link = re.compile(r'<div class="list-item">.*?'
24 | r'<img src="(.*?)".*?'
25 | r'<a class="lady-name" href="(.*?)".*?>'
26 | r'(.*?)</a>'
27 | , re.S)
28 | items = re.findall(pattern_link, response.read().decode('gbk'))
29 |
30 | for item in items:
31 | # 详情页面:头像,个人详情,名字,年龄,地区
32 |
33 | detailPage = item[1]
34 | name = item[2]
35 | self.getDetailPage(detailPage, name)
36 |
37 | def getDetailPage(self, url, name):
38 | url = 'http:' + url
39 | self.driver.get(url)
40 | base_msg = self.driver.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li')
41 | brief = ''
42 | for item in base_msg:
43 | print(item.text)
44 | brief += item.text + '\n'
45 |
46 | icon_url = self.driver.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]//img')
47 | icon_url = icon_url.get_attribute('src')
48 | dir = self.dirName + '/' + name
49 | self.mkdir(dir)
50 | # 保存头像
51 | try:
52 | self.saveIcon(icon_url, dir, name)
53 | except Exception as e:
54 | print(u'保存头像失败 %s' % (e))
55 |
56 | # 开始跳转相册列表
57 | images_url = self.driver.find_element_by_xpath('//ul[@class="mm-p-menu"]//a')
58 | images_url = images_url.get_attribute('href')
59 | try:
60 | self.getAllImage(images_url, name)
61 | except Exception as e:
62 | print(u'获取所有相册异常 %s' % e)
63 |
64 | try:
65 | self.saveBrief(brief,dir, name)
66 |
67 | except Exception as e:
68 | print(u'保存个人信息失败 %s' % e)
69 |
70 | # 保存个人信息
71 | def saveBrief(self, content,dir, name):
72 | fileName = dir + '/' + name + '.txt'
73 | with open(fileName,'w+') as file:
74 | file.write(content)
75 | print(u'下载完成' + '\n' + '\n')
76 | # 获取所有图片
77 | def getAllImage(self, images_url, name):
78 | self.driver.get(images_url)
79 | # 只获取第一个相册
80 | photos = self.driver.find_element_by_xpath('//div[@class="mm-photo-cell-middle"]//h4/a')
81 | photos_url = photos.get_attribute('href')
82 | # 进入相册页面获取相册内容
83 | self.driver.get(photos_url)
84 | images_all = self.driver.find_elements_by_xpath('//div[@id="mm-photoimg-area"]/a/img')
85 |
86 | self.saveImgs(images_all, name)
87 |
88 | def saveImgs(self, images, name):
89 | index = 1
90 |
91 | for imageUrl in images:
92 | splitPath = imageUrl.get_attribute('src').split('.')
93 | fTail = splitPath.pop()
94 | if len(fTail) > 3:
95 | fTail = "jpg"
96 | fileName = self.dirName + '/' + name + '/' + name + str(index) + "." + fTail
97 | self.saveImg(imageUrl.get_attribute('src'), fileName)
98 | index += 1
99 |
100 | def saveIcon(self, url, dir, name):
101 | splitPath = url.split('.')
102 | fTail = splitPath.pop()
103 | fileName = dir + '/' + name + '.' + fTail
104 | print(fileName)
105 | self.saveImg(url, fileName)
106 |
107 | # 写入图片
108 | def saveImg(self, imageUrl, fileName):
109 | print(imageUrl)
110 | u = request.urlopen(imageUrl)
111 | data = u.read()
112 | f = open(fileName, 'wb')
113 | f.write(data)
114 | f.close()
115 |
116 |
117 | # 创建目录
118 | def mkdir(self, path):
119 | path = path.strip()
120 | print(u'正在下载 %s 个人信息' % path)
121 | if os.path.exists(path):
122 | return False
123 | else:
124 | os.makedirs(path)
125 | return True
126 |
127 | if __name__ == "__main__":
128 | taoBaoInfo = TaoBaoInfo()
129 | # 输入需要下载的页数
130 | page = input("请输入要下载的页数:")
131 | for index in range(1, int(page) + 1):
132 | taoBaoInfo.getPageContent(index)
133 |
--------------------------------------------------------------------------------
/爬虫小demo/22 JDPython.py:
--------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from lxml import etree
4 |
5 | driver = webdriver.PhantomJS(executable_path='./phantomjs-2.1.1-macosx/bin/phantomjs')
6 |
7 |
8 | # 获取第一页的数据
9 | def get_html():
10 | url = "https://detail.tmall.com/item.htm?id=531993957001&skuId=3609796167425&user_id=268451883&cat_id=2&is_b=1&rn=71b9b0aeb233411c4f59fe8c610bc34b"
11 | driver.get(url)
12 | time.sleep(5)
13 | driver.execute_script('window.scrollBy(0,3000)')
14 | time.sleep(2)
15 | driver.execute_script('window.scrollBy(0,5000)')
16 | time.sleep(2)
17 |
18 | # 累计评价
19 | btnNext = driver.find_element_by_xpath('//*[@id="J_TabBar"]/li[3]/a')
20 | btnNext.click()
21 | html = driver.page_source
22 | return html
23 |
24 |
25 | def get_comments(html):
26 | source = etree.HTML(html)
27 | commens = source.xpath("//*[@id='J_TabBar']/li[3]/a/em/text()")
28 | print('评论数:', commens)
29 | # 每页20条评论,将评论总数换算成需要翻页的页数
30 | commens = (int(commens[0]) / 20) + 1
31 | # 获取到总页数
32 | print('总页数:', int(commens))
33 | return int(commens)
34 |
35 |
36 | def parse_html(html):
37 | html = etree.HTML(html)
38 | commentlist = html.xpath("//*[@class='rate-grid']/table/tbody")
39 | for comment in commentlist:
40 | # 评论
41 | vercomment = comment.xpath(
42 | "./tr/td[@class='tm-col-master']/div[@class='tm-rate-content']/div[@class='tm-rate-fulltxt']/text()")
43 | # 机器类型
44 | verphone = comment.xpath("./tr/td[@class='col-meta']/div[@class='rate-sku']/p[@title]/text()")
45 | print(vercomment)
46 | print(verphone)
47 | # 用户(头尾各一个字,中间用****代替)
48 | veruser = comment.xpath("./tr/td[@class='col-author']/div[@class='rate-user-info']/text()")
49 | print(veruser)
50 |
51 |
52 | def next_button_work(num):
53 | if num != 0:
54 | driver.execute_script('window.scrollBy(0,3000)')
55 | time.sleep(2)
56 | try:
57 | driver.find_element_by_css_selector('#J_Reviews > div > div.rate-page > div > a:last-child').click()
58 | except Exception as e:
59 | print(e)
60 |
61 | time.sleep(2)
62 | driver.execute_script('window.scrollBy(0,3000)')
63 | time.sleep(2)
64 | driver.execute_script('window.scrollBy(0,5000)')
65 | time.sleep(2)
66 | html = driver.page_source
67 | parse_html(html)
68 |
69 |
70 | def selenuim_work(html):
71 | parse_html(html)
72 | next_button_work(1)
73 | pass
74 |
75 |
76 | def gettotalpagecomments(comments):
77 | html = get_html()
78 | for i in range(0, comments):
79 | selenuim_work(html)
80 |
81 |
82 | data = get_html()
83 | # 得到评论
84 | commens = get_comments(data)
85 | # 根据评论内容进行遍历
86 | gettotalpagecomments(commens)
87 |
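The fixed time.sleep() calls above can be replaced by explicit waits, which return as soon as the element is present; a sketch using Selenium's WebDriverWait with the same driver and the J_TabBar element id used above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)  # wait at most 10 seconds
tab = wait.until(EC.presence_of_element_located((By.ID, "J_TabBar")))
print(tab.text)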
--------------------------------------------------------------------------------
/爬虫小demo/23 tuchongnet.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import rsa
4 | import binascii
5 | import requests
6 | from base64 import b64decode
7 | import sys
8 | reload(sys)
9 | sys.setdefaultencoding('utf8')
10 |
11 | class LBTuChongNet(object):
12 | def __init__(self):
13 | self.loginUrl = "https://tuchong.com/rest/accounts/login"
14 | self.userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
15 | self.headers = {
16 | 'user-agent': self.userAgent
17 | }
18 | #pubkey 在页面的js中: http://static.tuchong.net/js/pc/page/welcome_6e7f1cd.js
19 |
20 | self.key = "D8CC0180AFCC72C9F5981BDB90A27928672F1D6EA8A57AF44EFFA7DAF6EFB17DAD9F643B9F9F7A1F05ACC2FEA8DE19F023200EFEE9224104627F1E680CE8F025AF44824A45EA4DDC321672D2DEAA91DB27418CFDD776848F27A76E747D53966683EFB00F7485F3ECF68365F5C10C69969AE3D665162D2EE3A5BA109D7DF6C7A5"
21 | self.session = requests.session()
22 |
23 | def get_crypt_password(self,message):
24 | rsaPublickey = int(self.key, 16)
25 | key = rsa.PublicKey(rsaPublickey, 65537)
26 | password = rsa.encrypt(message, key)
27 | password = binascii.b2a_hex(password)
28 | return password
29 |
30 | def get_captcha(self):
31 | captchaUrl="https://tuchong.com/rest/captcha/image"
32 |
33 | rsp = self.session.post(captchaUrl, data = None, headers = self.headers).json()
34 | captcha_id = rsp['captchaId']
35 | captcha_base64 = rsp['captchaBase64']
36 | captcha_base64 = captcha_base64.replace("data:image/png;base64,","")
37 | with open("lbcaptcha.png",'w') as f:
38 | f.write(b64decode(captcha_base64))
39 | captcha = raw_input(u'输入当前目录下 lbcaptcha.png 上的验证码:')
40 | return captcha_id,captcha
41 |
42 | def login(self,username,password):
43 |
44 | passwd_crypt = self.get_crypt_password(password)
45 | postdata = {
46 | 'account': username,
47 | 'password': passwd_crypt,
48 | }
49 | rsp = self.session.post(self.loginUrl, data = postdata, headers = self.headers)
50 | rsp = rsp.json()
51 | print(rsp)
52 | #登录成功
53 | if rsp.has_key('result') and rsp['result'] == "SUCCESS":
54 | print(rsp['message'])
55 | return
56 |
57 | #登录失败
58 | if rsp.has_key('code') and rsp.has_key('message'):
59 | print("response code:%d, message:%s"%(rsp['code'],rsp['message']))
60 | if rsp['message'].find("验证码") >= 0:
61 | print(rsp['message'])
62 | captcha = self.get_captcha()
63 | postdata = {
64 | 'account': username,
65 | 'password': passwd_crypt,
66 | 'captcha_id': captcha[0],
67 | 'captcha_token': int(captcha[1])
68 | }
69 | rsp = self.session.post(self.loginUrl, data = postdata, headers = self.headers)
70 | if rsp.status_code == 200:
71 | print("登陆成功!")
72 |
73 |
74 | if __name__ == '__main__':
75 | # 图虫网验证
76 | lbtuchongnet = LBTuChongNet()
77 | username = raw_input(u'请输入图虫网用户名:')
78 | password = raw_input(u'请输入图虫网密码:')
79 | lbtuchongnet.login(username,password)
80 |
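The script above is written for Python 2 (raw_input, dict.has_key). For reference, a minimal Python 3 sketch of the same password-encryption step, assuming the same hex modulus string and public exponent 65537:

# Python 3 sketch of get_crypt_password: rsa.encrypt needs bytes, and the
# hex-encoded ciphertext is returned as a str.
import binascii
import rsa

def get_crypt_password_py3(message, key_hex):
    pub_key = rsa.PublicKey(int(key_hex, 16), 65537)
    ciphertext = rsa.encrypt(message.encode('utf-8'), pub_key)
    return binascii.b2a_hex(ciphertext).decode('ascii')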
--------------------------------------------------------------------------------
/爬虫小demo/25 PythonItChat.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | '''
3 | itchat: capture technical articles shared to a group chat or to an individual
4 | (0) Get familiar with itchat (https://www.cnblogs.com/Chenjiabing/p/6907397.html)
5 | (1) Scanning the itchat QR code too often gets the account blocked from QR-code login for a while.
6 | (2) itchat: for an article shared to a group or a friend, extract the article link, title, cover image and body.
7 | (3) Crawl the article body through the extracted link.
8 | (4) The receiver (ToUserName) and the sender (FromUserName) are told apart by their unique IDs.
9 | (5) python itchat hot login (itchat.auto_login(hotReload=True))
10 | (6) Use xpath to extract the article title and the images inside the article.
11 | (7) Set up a web server environment (XAMPP on the Mac).
12 | (8) Use pymysql to create the database and columns automatically and save the content into them.
13 | (9) How to use navicat.
14 | (10) How to use the related Python modules.
15 | '''
16 |
17 | # Crawl articles shared in WeChat groups or by friends
18 | # Listen for articles shared by WeChat official accounts
19 |
20 | import itchat
21 | # import全部消息类型
22 | from itchat.content import *
23 | import urllib2
24 | import lxml.etree
25 | import os
26 | import pymysql
27 | import uuid
28 | import json
29 | # 连接数据库
30 | table_cms_news = 'cms_news'
31 | table_cms_news_pic = 'cms_news_pic'
32 | # db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8')
33 | db = pymysql.connect(host='127.0.0.1', user='root', passwd='djs@12316', db='fz_afmcms', charset='utf8')
34 | cur = db.cursor()
35 |
36 | # Handle messages shared by an individual
37 | # Covers text, location, card, note and sharing (MsgType 49, sharing, is the important one)
38 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING])
39 | def text_reply(msg):
40 | print msg
41 | # In WeChat, every user and group chat is identified by a long unique ID
42 | if msg["MsgType"] == 49:
43 | print "个人分享文章地址链接Url:" + "---------------------------"
44 |
45 | xmlcontent = lxml.etree.HTML(get_html(msg["Url"]))
46 | print xmlcontent
47 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()')
48 |
49 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src')
50 | # 下载图片
51 | source = xmlcontent.xpath('//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
52 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')
53 | print "来源"
54 | print source, time
55 | # 下载图片
56 | print "下载图片"
57 | # print imgArray
58 | # print title[0]
59 | get_image(title, imgArray, source, time,msg["Url"])
60 |
61 | print msg["Url"]
62 | print "个人分享文章类型编号MsgType:" + "---------------------------"
63 | print msg["MsgType"]
64 | print "个人分享Content:" + "---------------------------"
65 | print msg["Content"]
66 | print "个人分享FromUserName:" + "---------------------------"
67 | print msg["FromUserName"]
68 | print "个人分享ToUserName:" + "---------------------------"
69 | print msg["ToUserName"]
70 | print "个人分享链接标题FileName:" + "---------------------------"
71 | print msg["FileName"]
72 |
73 | print "------------个人"
74 | # The lookup only works for messages exchanged with the logged-in account; anything else returns nothing
75 | print itchat.search_friends(userName=msg['FromUserName'])['NickName']
76 | print itchat.search_friends(userName=msg['ToUserName'])['NickName']
77 |
78 | else:
79 | print "不是个人分享的文章"
80 |
81 |
82 | # 处理群聊消息
83 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING], isGroupChat=True)
84 | def text_reply(msg):
85 | print msg
86 | if msg["MsgType"] == 49:
87 | print "群聊分享文章地址链接Url:" + "---------------------------"
88 | print msg["Url"]
89 |
90 | xmlcontent = lxml.etree.HTML(get_html(msg["Url"]))
91 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()')
92 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src')
93 | # 来源
94 | source = xmlcontent.xpath('//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
95 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')
96 | print "来源"
97 | print source,time
98 | # 下载图片
99 | print "下载图片"
100 | # print imgArray
101 | # print title[0]
102 | get_image(title,imgArray,source,time,msg["Url"])
103 |
104 | # print "群聊分享文章类型编号MsgType:" + "---------------------------"
105 | # print msg["MsgType"]
106 | # print "群聊分享Content:" + "---------------------------"
107 | # print msg["Content"]
108 | # print "群聊分享FromUserName:" + "---------------------------"
109 | # print msg["FromUserName"]
110 | # print "群聊分享ToUserName:" + "---------------------------"
111 | # print msg["ToUserName"]
112 | # print "群聊分享链接标题FileName:" + "---------------------------"
113 | # print msg["FileName"]
114 | print "-------------群--------"
115 | # itchat.send('%s: %s : %s' % (msg['Type'], msg['Text'], msg['Url']), msg['FromUserName'])
116 |
117 | print msg['FromUserName']
118 | print msg['ToUserName']
119 | # this receiver ID changes on every QR-code login
120 | receiver = "@4603e5cb2e47b710bba6fd15dfa3ace9ef3be0f3c80b812e0cc97cd7a71b7c96"
121 | if msg['FromUserName'] == receiver:
122 | print "----------- 自己在群里发的文章 ------------"
123 | # 自己在群里发的文章
124 | print "昵称:"
125 | print itchat.search_friends(userName=msg['FromUserName'])['NickName']
126 | print " ----------- "
127 | print "群名称:"
128 | print itchat.search_chatrooms(userName=msg['ToUserName'])['NickName']
129 | chatRoomName = "呵呵各地"
130 | # if itchat.search_chatrooms(userName=msg['ToUserName'])['NickName'] == chatRoomName:
131 | # pass
132 | # else:
133 | # pass
134 |
135 | else:
136 | # 群友发的文章
137 | print "----------- 群友发的文章 -----------"
138 | print "昵称:"
139 | print msg['ActualNickName']
140 | print " ----------- "
141 | print "群名称:"
142 | print itchat.search_chatrooms(userName=msg['FromUserName'])['NickName']
143 | chatRoomName = "呵呵各地"
144 | # if itchat.search_chatrooms(userName=msg['FromUserName'])['NickName'] == chatRoomName:
145 | # pass
146 | # else:
147 | # pass
148 | else:
149 | print "不是群聊分享的文章"
150 | # return msg['Text']
151 |
152 |
153 | # 处理微信公众号消息
154 | @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING], isMpChat=True)
155 | def text_reply(msg):
156 | print msg
157 | print itchat.search_mps(name='PythonCoder')[0]["NickName"]
158 | if msg["MsgType"] == 49:
159 | print "监听到制定微信公众号分享的文章链接:"
160 | print msg["Url"]
161 | else:
162 | print "微信公众号分享的不是文章"
163 |
164 | # 获取网页内容
165 | def get_html(url):
166 | request = urllib2.Request(url)
167 | response = urllib2.urlopen(request)
168 | html = response.read()
169 | return html
170 |
171 | # 下载图片
172 | def get_image(title,imgArray,source,time,linkurl):
173 | print "标题"
174 | result = cur.execute("SELECT news_url FROM " + table_cms_news + " WHERE news_url=%s", (linkurl,))
175 | print(str(result) + '------------url-----------')
176 |
177 | if result:
178 | print("数据库里面存在此数据")
179 | else:
180 | if os.path.isdir('./imgs'):
181 | pass
182 | else:
183 | os.mkdir("./imgs")
184 | for item in imgArray:
185 | with open('imgs/' + (item)[-30:].replace('/','-') + ".png", 'wb') as file:
186 | file.write(get_html(item))
187 |
188 | ima_dic = {}
189 | news_pic = ""
190 | news_pic_s = ""
191 | news_pic_t = ""
192 |
193 | if len(imgArray) == 0:
194 | pass
195 | else:
196 | # 文章图片
197 | for index, item in enumerate(imgArray):
198 | ima_dic[index] = item
199 | if len(imgArray) == 0:
200 | pass
201 | elif len(imgArray) == 1:
202 | news_pic = imgArray[0]
203 | elif len(imgArray) == 2:
204 | news_pic = imgArray[0]
205 | news_pic_s = imgArray[1]
206 | elif len(imgArray) == 3:
207 | news_pic = imgArray[0]
208 | news_pic_s = imgArray[1]
209 | news_pic_t = imgArray[2]
210 | new_id = str(uuid.uuid1()).strip().replace("-", "")
211 | titleString = ""
212 | if len(title) == 0:
213 | pass
214 | else:
215 | titleString = title[0].strip().replace("\n", "")
216 | cur.execute(
217 | 'INSERT INTO ' + table_cms_news_pic + ' (news_id,pic_url,pic_desc) VALUES (%s,%s,%s)',
218 | (new_id, json.dumps(ima_dic,ensure_ascii=False),""))
219 | cur.execute(
220 | 'INSERT INTO ' + table_cms_news + ' (news_open_type,news_id,news_title,news_type,com_id,'\
221 | 'news_column_code1,news_column_name1,'\
222 | 'news_column_code2,news_column_name2,news_desc,news_pic,'\
223 | 'news_pic_s,news_pic_t,news_pic_is_show,'\
224 | 'news_content,news_source,news_cuser_name,'\
225 | 'news_ctime,news_url,news_status,view_count,platid) '\
226 | 'VALUES (%s,%s, %s,%s,%s, %s,%s,%s,%s, %s,%s, %s,%s,%s,'\
227 | ' %s,%s, %s,%s,%s,%s,%s,%s)',
228 | ('1',new_id,titleString,'1','1','1','微信转发','1','分类1','news_desc',news_pic,news_pic_s,
229 | news_pic_t,'1','news_content',source[0].strip().replace("\n", ""),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", ""),linkurl,
230 | '1',200,'weixin'))
231 |
232 | # cur.execute(
233 | # 'INSERT INTO ' + table_cms_news + ' (title,url, img,source,time) VALUES (%s, %s,%s,%s, %s)',
234 | # (title[0].strip().replace("\n", ""),linkurl, json.dumps(imgArray, ensure_ascii=False),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", "")))
235 | cur.connection.commit()
236 | print("------------------------ 插入成功 ----------------------------------")
237 |
238 | # 连接数据库
239 | def get_connect():
240 |
241 | try:
242 | # 创建表
243 | cur.execute(
244 | 'CREATE TABLE ' + table_cms_news + ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000),url VARCHAR(10000), img VARCHAR(1000), source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))')
245 | except pymysql.err.InternalError as e:
246 | print(e)
247 | # 修改表字段
248 | cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
249 | cur.execute(
250 | 'ALTER TABLE ' + table_cms_news + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
251 | cur.execute(
252 | 'ALTER TABLE ' + table_cms_news + ' CHANGE title title VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
253 | cur.execute(
254 | 'ALTER TABLE ' + table_cms_news + ' CHANGE url url VARCHAR(10000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
255 | cur.execute(
256 | 'ALTER TABLE ' + table_cms_news + ' CHANGE img img VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
257 | cur.execute(
258 | 'ALTER TABLE ' + table_cms_news + ' CHANGE source source VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
259 | cur.execute(
260 | 'ALTER TABLE ' + table_cms_news + ' CHANGE time time VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
261 |
262 |
263 | # Hot login (keeps the session for a while without having to rescan the QR code)
264 | get_connect()
265 | print "哈哈"
266 | itchat.auto_login(hotReload=True)
267 | # With the handlers registered, start itchat and listen for messages
268 | itchat.run()
269 |
270 |
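Item (6) of the header docstring covers the xpath extraction of a shared article. A standalone sketch of that step (Python 3 with requests for brevity; the selectors are the ones used in the handlers above and may change on mp.weixin.qq.com):

# Sketch of the xpath extraction used by the handlers above.
import requests
import lxml.etree

def extract_article(url):
    html = requests.get(url, timeout=10).text
    tree = lxml.etree.HTML(html)
    titles = tree.xpath('//h2[@class="rich_media_title"]/text()')
    images = tree.xpath('//img[@data-type="png"]/@data-src')
    title = titles[0].strip() if titles else ""
    return title, images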
--------------------------------------------------------------------------------
/爬虫小demo/26 PythonWeChat.py:
--------------------------------------------------------------------------------
1 | #coding=utf8
2 | import pickle
3 | import wechatsogou
4 | import urllib2
5 | import lxml.etree
6 | import os
7 | import pymysql
8 | import json
9 |
10 | # A file into which the titles of successfully sent articles are pickled, so repeated runs do not send duplicates
11 | file_path = 'sent_articles_file'
12 |
13 | ws_api = wechatsogou.WechatSogouAPI()
14 |
15 | # 连接数据库
16 | tablename = 'pythonwechat'
17 | db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8')
18 | cur = db.cursor()
19 | cur.execute('USE itchat')
20 |
21 | # 获取公众号文章信息
22 | def get_article(gzh):
23 | articles = ws_api.get_gzh_article_by_history(gzh)
24 | print(len(articles['article']))
25 | return articles['article']
26 |
27 | # 获取网页内容
28 | def get_html(url):
29 | request = urllib2.Request(url)
30 | response = urllib2.urlopen(request)
31 | html = response.read()
32 | return html
33 |
34 | # 下载图片
35 | def get_image(title,imgArray,source,time):
36 | if os.path.isdir('./imgs'):
37 | pass
38 | else:
39 | os.mkdir("./imgs")
40 | for item in imgArray:
41 | with open('imgs/' + (item)[-30:].replace('/','-') + ".png", 'wb') as file:
42 | file.write(get_html(item))
43 |
44 |
45 | cur.execute(
46 | 'INSERT INTO ' + tablename + ' (title, img,source,time) VALUES (%s, %s,%s, %s)',
47 | (title[0].strip().replace("\n", ""), json.dumps(imgArray, ensure_ascii=False),source[0].strip().replace("\n", ""),time[0].strip().replace("\n", "")))
48 | cur.connection.commit()
49 | print title[0]
50 | print("------------------------ 插入成功 ----------------------------------")
51 |
52 | # 连接数据库
53 | def get_connect():
54 |
55 | try:
56 | # 创建表
57 | cur.execute(
58 | 'CREATE TABLE ' + tablename + ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000), img VARCHAR(1000), source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))')
59 | except pymysql.err.InternalError as e:
60 | print(e)
61 | # 修改表字段
62 | cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
63 | cur.execute(
64 | 'ALTER TABLE ' + tablename + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
65 | cur.execute(
66 | 'ALTER TABLE ' + tablename + ' CHANGE title title VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
67 | cur.execute(
68 | 'ALTER TABLE ' + tablename + ' CHANGE img img VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
69 | cur.execute(
70 | 'ALTER TABLE ' + tablename + ' CHANGE source source VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
71 | cur.execute(
72 | 'ALTER TABLE ' + tablename + ' CHANGE time time VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
73 |
74 |
75 | if '__main__' == __name__:
76 |
77 | get_connect()
78 |
79 | # 定义一个公众号列表
80 | gzh_list = ['技术最前线', 'python', '全民独立经纪人', '程序视界', '非著名程序员']
81 |
82 | for gzh in gzh_list:
83 | # Before querying the official account, deserialize the list of already-sent articles from the file
84 | if os.path.exists(file_path):
85 | f = open(file_path, 'rb')
86 | sent_list = pickle.load(f)
87 | f.close()
88 | articles = get_article(gzh)
89 | for article in articles:
90 | print(article['title'],'\n\t' ,article['content_url'])
91 |
92 | xmlcontent = lxml.etree.HTML(get_html(article['content_url']))
93 | title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()')
94 | imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src')
95 | # 来源
96 | source = xmlcontent.xpath(
97 | '//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
98 | time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')
99 | print "来源、时间"
100 | print source, time
101 | # 下载图片
102 | print "下载图片"
103 | get_image(title, imgArray, source, time)
104 |
105 |
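The comment at the top says already-sent article titles are pickled to a file so repeated runs skip them, but the loop above only loads the list and never checks or updates it. A minimal sketch of the intended round trip, under that assumption (load_sent/save_sent are hypothetical helpers):

# Hedged sketch of the dedup described by the file_path comment above.
import os
import pickle

def load_sent(path='sent_articles_file'):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return set(pickle.load(f))
    return set()

def save_sent(titles, path='sent_articles_file'):
    with open(path, 'wb') as f:
        pickle.dump(sorted(titles), f)

# usage inside the article loop:
#   sent = load_sent()
#   if article['title'] in sent: continue
#   ... handle the article ...
#   sent.add(article['title']); save_sent(sent)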
--------------------------------------------------------------------------------
/爬虫小demo/27 PythonWordCloud.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | import os
3 | from pyecharts import WordCloud
4 | # 词云
5 | def pythonWordCloud(x,y,label):
6 | wordcloud = WordCloud(width=1300, height=620)
7 | wordcloud.add("", x, y, word_size_range=[20, 100],shape="triangle-forward")
8 | wordcloud.render()
9 | os.system(r"render.html")
10 | x = [
11 | 'PythonCoder', '爬虫', '人工智能', '大数据', 'Django',
12 | 'Flask', '机器学习', '数据分析', '深度学习', '运维测试', 'TensorFlow',
13 | '真实面试经历', '真实面试题', '自然语言处理', 'NLP',"数据处理",
14 | '500GB资料免费送', '开放源码', '免费学习群', '面试简历', 'JCSON']
15 | y = [
16 | 10000, 6181, 4386, 4055, 2467, 2244, 1898, 1484, 1112,
17 | 965, 847, 582, 555, 550, 462, 366, 360, 282, 273, 265,5000]
18 |
19 | pythonWordCloud(x,y,"词云")
20 |
21 |
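os.system(r"render.html") opens the rendered page only where the shell resolves the file through its association (typically Windows). A portable sketch of the same step using the standard library:

# Sketch: open the pyecharts output in the default browser on any platform.
import os
import webbrowser

webbrowser.open('file://' + os.path.realpath('render.html'))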
--------------------------------------------------------------------------------
/爬虫小demo/28 PythonCheHui.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | # WeChat: recover messages revoked by friends or by group-chat members
4 | # Note: recoverable types include text, voice, video, pictures, locations, cards, shares and attachments
5 |
6 | import itchat
7 | from itchat.content import *
8 | import sys
9 | import time
10 | import re
11 | import os
12 |
13 | reload(sys)
14 | sys.setdefaultencoding('utf8')
15 |
16 | msg_information = {}
17 | # 针对表情包的内容
18 | face_bug = None
19 |
20 | @itchat.msg_register([TEXT,PICTURE,FRIENDS,CARD,MAP,SHARING,RECORDING,ATTACHMENT,VIDEO],isFriendChat=True,isGroupChat=True)
21 | def receive_msg(msg):
22 | global face_bug
23 | # 接收消息的时间
24 | msg_time_rec = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
25 | if msg.has_key('ActualNickName'):
26 | # 群消息的发送者,用户的唯一标识
27 | from_user = msg['ActualUserName']
28 | # 发送者群内的昵称
29 | msg_from = msg['ActualNickName']
30 | # 获取所有好友
31 | friends = itchat.get_friends(update=True)
32 | for f in friends:
33 | # 如果群消息是好友发的
34 | if from_user == f['UserName']:
35 | # 优先使用好友的备注名称,没有则使用昵称
36 | if f['RemarkName']:
37 | msg_from = f['RemarkName']
38 | else:
39 | msg_from = f['NickName']
40 | break
41 | # 获取所有的群
42 | groups = itchat.get_chatrooms(update=True)
43 | for g in groups:
44 | # 根据群消息的FromUserName匹配是哪个群
45 | if msg['FromUserName'] == g['UserName']:
46 | group_name = g['NickName']
47 | group_menbers = g['MemberCount']
48 | break
49 | group_name = group_name + "(" + str(group_menbers) +")"
50 | else:
51 | # 优先使用备注名称
52 | if itchat.search_friends(userName=msg['FromUserName'])['RemarkName']:
53 | msg_from = itchat.search_friends(userName=msg['FromUserName'])['RemarkName']
54 | else:
55 | # 在好友列表中查询发送信息的好友昵称
56 | msg_from = itchat.search_friends(userName=msg['FromUserName'])['NickName']
57 | group_name = ""
58 | # 信息发送的时间
59 | msg_time = msg['CreateTime']
60 | # 每条信息的id
61 | msg_id = msg['MsgId']
62 | # 储存信息的内容
63 | msg_content = None
64 | # 储存分享的链接,比如分享的文章和音乐
65 | msg_share_url = None
66 | # 如果发送的消息是文本或者好友推荐
67 | if msg['Type'] == 'Text' or msg['Type'] == 'Friends':
68 | msg_content = msg['Text']
69 |
70 | # 如果发送的消息是附件、视频、图片、语音
71 | elif msg['Type'] == "Attachment" or msg['Type'] == "Video" \
72 | or msg['Type'] == 'Picture' \
73 | or msg['Type'] == 'Recording':
74 | # 内容就是他们的文件名
75 | msg_content = msg['FileName']
76 | # 下载文件
77 | msg['Text'](str(msg_content))
78 | # 如果消息为分享的位置信息
79 | elif msg['Type'] == 'Map':
80 | x, y, location = re.search(
81 | "<location x=\"(.*?)\" y=\"(.*?)\".*label=\"(.*?)\".*",
82 | msg['OriContent']).group(1, 2, 3)
83 | if location is None:
84 | msg_content = r"纬度->" + x.__str__() + " 经度->" + y.__str__()
85 | else:
86 | msg_content = r"" + location
87 | # 如果消息为分享的音乐或者文章,详细的内容为文章的标题或者是分享的名字
88 | elif msg['Type'] == 'Sharing':
89 | msg_content = msg['Text']
90 | # 记录分享的url
91 | msg_share_url = msg['Url']
92 | face_bug = msg_content
93 | # 将信息存储在字典中,每一个msg_id对应一条信息
94 | msg_information.update(
95 | {
96 | msg_id: {
97 | "msg_from": msg_from,
98 | "msg_time": msg_time,
99 | "msg_time_rec": msg_time_rec,
100 | "msg_type": msg["Type"],
101 | "msg_content": msg_content,
102 | "msg_share_url": msg_share_url,
103 | "group_name":group_name
104 | }
105 | }
106 | )
107 |
108 | # Listen for message-revoke notifications
109 | # Registering with the decorator below would deliver the notice 4 times
110 | # @itchat.msg_register(NOTE,isFriendChat=True,isGroupChat=True,isMpChat=True)
111 |
112 | # Listen for message-revoke notifications
113 | # Registering with the decorator below delivers the notice once
114 | @itchat.msg_register(NOTE)
115 | def information(msg):
116 | # If msg['Content'] contains a revoke notice with a message id, handle it below
117 | if '撤回了一条消息' in msg['Content']:
118 | # find the id of the revoked message in the notification content
119 | old_msg_id = re.search("\<msgid\>(.*?)\<\/msgid\>", msg['Content']).group(1)
120 | # 获取到消息原文
121 | old_msg = msg_information.get(old_msg_id)
122 | # 如果发送的是表情包
123 | if len(old_msg_id)<11:
124 | # 发送撤回的提示给文件助手
125 | itchat.send_file(face_bug,toUserName='filehelper')
126 | # 把暂时存储的信息可以删除掉,也可以选择不删除
127 | # os.remove(face_bug)
128 | else:
129 | msg_body = old_msg.get('group_name') + old_msg.get('msg_from') +"\n" + old_msg.get('msg_time_rec') \
130 | + "撤回了:" + "\n" + r"" + old_msg.get('msg_content')
131 |
132 | # 如果是分享的文件被撤回了,那么就将分享的url加在msg_body中发送给文件助手
133 | if old_msg['msg_type'] == "Sharing":
134 | msg_body += "\n链接是:" + old_msg.get('msg_share_url')
135 | print msg_body
136 | # 将撤回消息发给文件助手
137 | itchat.send_msg(msg_body, toUserName='filehelper')
138 |
139 | # 有文件的话也要将文件发送回去
140 | if old_msg["msg_type"] == "Picture" \
141 | or old_msg["msg_type"] == "Recording" \
142 | or old_msg["msg_type"] == "Video" \
143 | or old_msg["msg_type"] == "Attachment":
144 | file = '@fil@%s' % (old_msg['msg_content'])
145 | itchat.send(msg=file, toUserName='filehelper')
146 | # 把暂时存储的信息可以删除掉,也可以选择不删除
147 | os.remove(old_msg['msg_content'])
148 | # 删除字典旧消息
149 | msg_information.pop(old_msg_id)
150 |
151 | itchat.auto_login(hotReload=True)
152 | itchat.run()
153 |
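The regex at line 119 above pulls the revoked message id out of the notification XML. The next script (29 PythonCeHui.py) does the same with ElementTree; a minimal sketch of that approach for comparison:

# Sketch: parse the revoke notification XML instead of matching it with a regex.
# The content may need html-unescaping first, as 29 PythonCeHui.py does.
from xml.etree import ElementTree as ETree

def revoked_msg_id(content_xml):
    tree = ETree.fromstring(content_xml)
    revoked = tree.find('revokemsg')
    if revoked is None:
        return None
    return revoked.find('msgid').text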
--------------------------------------------------------------------------------
/爬虫小demo/29 PythonCeHui.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os, re, shutil, time, collections, json
3 |
4 | from html.parser import HTMLParser
5 | from xml.etree import ElementTree as ETree
6 |
7 | import itchat
8 | from itchat.content import *
9 |
10 | msg_store = collections.OrderedDict()
11 | timeout = 600
12 | sending_type = {'Picture': 'img', 'Video': 'vid'}
13 | data_path = 'data'
14 | nickname = ''
15 | bot = None
16 |
17 | if __name__ == '__main__':
18 | if not os.path.exists(data_path):
19 | os.mkdir(data_path)
20 | # if the QR code doesn't show correctly, you can try to change the value
21 | # of enableCmdQR to 1 or -1 or -2. If nothing works, you can change it to
22 | # enableCmdQR=True and a picture will show up.
23 | bot = itchat.new_instance()
24 | bot.auto_login(hotReload=True, enableCmdQR=2)
25 | nickname = bot.loginInfo['User']['NickName']
26 |
27 | def clear_timeouted_message():
28 | now = time.time()
29 | count = 0
30 | for k, v in list(msg_store.items()):
31 | if now - v['ReceivedTime'] > timeout:
32 | count += 1
33 | else:
34 | break
35 | for i in range(count):
36 | item = msg_store.popitem(last=False)
37 |
38 | def get_sender_receiver(msg):
39 | sender = nickname
40 | receiver = nickname
41 | if msg['FromUserName'][0:2] == '@@': # group chat
42 | sender = msg['ActualNickName']
43 | m = bot.search_chatrooms(userName=msg['FromUserName'])
44 | if m is not None:
45 | receiver = m['NickName']
46 | elif msg['ToUserName'][0:2] == '@@': # group chat by myself
47 | if 'ActualNickName' in msg:
48 | sender = msg['ActualNickName']
49 | else:
50 | m = bot.search_friends(userName=msg['FromUserName'])
51 | if m is not None:
52 | sender = m['NickName']
53 | m = bot.search_chatrooms(userName=msg['ToUserName'])
54 | if m is not None:
55 | receiver = m['NickName']
56 | else: # personal chat
57 | m = bot.search_friends(userName=msg['FromUserName'])
58 | if m is not None:
59 | sender = m['NickName']
60 | m = bot.search_friends(userName=msg['ToUserName'])
61 | if m is not None:
62 | receiver = m['NickName']
63 | return HTMLParser().unescape(sender), HTMLParser().unescape(receiver)
64 |
65 | def print_msg(msg):
66 | msg_str = ' '.join(msg)
67 | print(msg_str)
68 | return msg_str
69 |
70 | def get_whole_msg(msg, download=False):
71 | sender, receiver = get_sender_receiver(msg)
72 | if len(msg['FileName']) > 0 and len(msg['Url']) == 0:
73 | if download: # download the file into data_path directory
74 | fn = os.path.join(data_path, msg['FileName'])
75 | msg['Text'](fn)
76 | if os.path.getsize(fn) == 0:
77 | return []
78 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), fn)
79 | else:
80 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), msg['FileName'])
81 | return ['[%s]->[%s]:' % (sender, receiver), c]
82 | c = msg['Text']
83 | if len(msg['Url']) > 0:
84 | try: # handle map label
85 | content_tree = ETree.fromstring(msg['OriContent'])
86 | if content_tree is not None:
87 | map_label = content_tree.find('location')
88 | if map_label is not None:
89 | c += ' ' + map_label.attrib['poiname']
90 | c += ' ' + map_label.attrib['label']
91 | except:
92 | pass
93 | url = HTMLParser().unescape(msg['Url'])
94 | c += ' ' + url
95 | return ['[%s]->[%s]: %s' % (sender, receiver, c)]
96 |
97 | @bot.msg_register([TEXT, PICTURE, MAP, CARD, SHARING, RECORDING,
98 | ATTACHMENT, VIDEO, FRIENDS], isFriendChat=True, isGroupChat=True)
99 | def normal_msg(msg):
100 | print_msg(get_whole_msg(msg))
101 | now = time.time()
102 | msg['ReceivedTime'] = now
103 | msg_id = msg['MsgId']
104 | msg_store[msg_id] = msg
105 | clear_timeouted_message()
106 |
107 | @bot.msg_register([NOTE], isFriendChat=True, isGroupChat=True)
108 | def note_msg(msg):
109 | print_msg(get_whole_msg(msg))
110 | content = HTMLParser().unescape(msg['Content'])
111 | try:
112 | content_tree = ETree.fromstring(content)
113 | except Exception:
114 | # invite/remove to chatroom
115 | return
116 | if content_tree is None:
117 | return
118 | revoked = content_tree.find('revokemsg')
119 | if revoked is None:
120 | return
121 | old_msg_id = revoked.find('msgid').text
122 | old_msg = msg_store.get(old_msg_id)
123 | if old_msg is None:
124 | return
125 | msg_send = get_whole_msg(old_msg, download=True)
126 | for m in msg_send:
127 | bot.send(m, toUserName='filehelper')
128 | clear_timeouted_message()
129 |
130 | if __name__ == '__main__':
131 | bot.run()
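
HTMLParser().unescape() used above was deprecated and removed in Python 3.9; on newer interpreters the same calls can go through html.unescape. A minimal sketch:

# Sketch: drop-in replacement for HTMLParser().unescape on Python 3.9+.
from html import unescape

sender = unescape('&lt;nickname&gt;')  # '<nickname>'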
--------------------------------------------------------------------------------
/爬虫小demo/30 PythonZhuanFa.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | reload(sys)
4 | sys.setdefaultencoding('UTF8')
5 |
6 | import os, re, shutil, time, collections, json
7 | import requests
8 | from HTMLParser import HTMLParser
9 | from xml.etree import ElementTree as ETree
10 | import hashlib
11 |
12 | import itchat
13 | from itchat.content import *
14 |
15 | sending_type = {'Picture': 'img', 'Video': 'vid'}
16 | data_path = 'data'
17 | group_uin = {u'技术群1': '42235582@chatroom',
18 | u'技术群2': '2424504406@chatroom',
19 | u'技术群3': '6203978346@chatroom'}
20 | publishers = {u'技术群1': u'[阴险]',
21 | u'技术群2': u'[菜刀]',
22 | u'技术群3': u'[月亮]'}
23 | subscribers = [u'技术群1', u'技术群2', u'技术群3']
24 | nickname = ''
25 | bot = None
26 | as_chat_bot = True
27 |
28 | if __name__ == '__main__':
29 | if not os.path.exists(data_path):
30 | os.mkdir(data_path)
31 | # if the QR code doesn't show correctly, you can try to change the value
32 | # of enableCmdQR to 1 or -1 or -2. If nothing works, you can change it to
33 | # enableCmdQR=True and a picture will show up.
34 | bot = itchat.new_instance()
35 | bot.auto_login(hotReload=True, enableCmdQR=2)
36 | nickname = bot.loginInfo['User']['NickName']
37 |
38 | # tuling chat bot
39 | def talks_robot(info):
40 | api_url = 'http://www.tuling123.com/openapi/api'
41 | apikey = ''
42 | data = {'key': apikey, 'info': info.lower()}
43 | req = requests.post(api_url, data=data, timeout=10).text
44 | replys = json.loads(req)['text']
45 | return replys
46 |
47 | def get_sender_receiver(msg):
48 | sender = nickname
49 | receiver = nickname
50 | if msg['FromUserName'][0:2] == '@@': # group chat
51 | sender = msg['ActualNickName']
52 | m = bot.search_chatrooms(userName=msg['FromUserName'])
53 | if m is not None:
54 | receiver = m['NickName']
55 | elif msg['ToUserName'][0:2] == '@@': # group chat by myself
56 | if 'ActualNickName' in msg:
57 | sender = msg['ActualNickName']
58 | else:
59 | m = bot.search_friends(userName=msg['FromUserName'])
60 | if m is not None:
61 | sender = m['NickName']
62 | m = bot.search_chatrooms(userName=msg['ToUserName'])
63 | if m is not None:
64 | receiver = m['NickName']
65 | else: # personal chat
66 | m = bot.search_friends(userName=msg['FromUserName'])
67 | if m is not None:
68 | sender = m['NickName']
69 | m = bot.search_friends(userName=msg['ToUserName'])
70 | if m is not None:
71 | receiver = m['NickName']
72 | return HTMLParser().unescape(sender), HTMLParser().unescape(receiver)
73 |
74 | def print_msg(msg):
75 | msg_str = ' '.join(msg)
76 | print msg_str
77 | return msg_str
78 |
79 | def get_whole_msg(msg, prefix, download=False):
80 | if len(msg['FileName']) > 0 and len(msg['Url']) == 0:
81 | if download: # download the file into data_path directory
82 | fn = os.path.join(data_path, msg['FileName'])
83 | msg['Text'](fn)
84 | if os.path.getsize(fn) == 0:
85 | return []
86 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), fn)
87 | else:
88 | c = '@%s@%s' % (sending_type.get(msg['Type'], 'fil'), msg['FileName'])
89 | return ['%s:' % (prefix), c]
90 | c = msg['Text']
91 | if len(msg['Url']) > 0:
92 | if len(msg['OriContent']) > 0:
93 | try: # handle map label
94 | content_tree = ETree.fromstring(msg['OriContent'])
95 | if content_tree is not None:
96 | map_label = content_tree.find('location')
97 | if map_label is not None:
98 | c += ' ' + map_label.attrib['poiname']
99 | c += ' ' + map_label.attrib['label']
100 | except:
101 | pass
102 | url = HTMLParser().unescape(msg['Url'])
103 | c += ' ' + url
104 | return ['%s: %s' % (prefix, c)]
105 |
106 | @bot.msg_register([TEXT], isFriendChat=True, isGroupChat=False)
107 | def personal_msg(msg):
108 | global as_chat_bot
109 | text = msg['Text'].strip()
110 | if text == u'闭嘴':
111 | as_chat_bot = False
112 | if text == u'张嘴吃药':
113 | as_chat_bot = True
114 | return talks_robot(text)
115 |
116 | @bot.msg_register([FRIENDS])
117 | def accept_friend(msg):
118 | bot.add_friend(msg['RecommendInfo']['UserName'], 3)
119 |
120 | @bot.msg_register([TEXT, PICTURE, MAP, SHARING, RECORDING, ATTACHMENT, VIDEO],
121 | isFriendChat=False, isGroupChat=True)
122 | def group_msg(msg):
123 | # chat bot functionality
124 | global as_chat_bot
125 | if 'IsAt' in msg and msg['IsAt'] == True and \
126 | msg['Type'] == 'Text' and \
127 | msg['ToUserName'][0:2] != '@@' and \
128 | msg['Text'].find(u'@' + nickname) >= 0:
129 | text = msg['Text'].replace(u'@' + nickname, '').strip()
130 | if text == u'shit':
131 | as_chat_bot = False
132 | return
133 | if as_chat_bot:
134 | info = talks_robot(text)
135 | if info.find('No Know') >= 0:
136 | return
137 | if info.find('No Can') >= 0:
138 | return
139 | if info.find('Sorry') >= 0:
140 | return
141 | return info
142 | return
143 | # forwarding functionality
144 | group = msg['FromUserName']
145 | if msg['ToUserName'][0:2] == '@@': # message sent by myself
146 | group = msg['ToUserName']
147 | sender, receiver = get_sender_receiver(msg)
148 | if sender == '':
149 | sender = nickname
150 | # check if the message is from the publisher groups
151 | if receiver not in publishers: # if not in the publishers, do nothing
152 | return
153 | # turn on the chat bot if this magic happens
154 | if msg['Type'] == 'Text' and \
155 | hashlib.sha256(msg['Text']).hexdigest()[-2:] == '23':
156 | as_chat_bot = True
157 | # process message and send it to all the subscribed groups
158 | prefix = '%s[%s]' % (publishers[receiver], sender)
159 | msg_send = get_whole_msg(msg, prefix=prefix, download=True)
160 | if len(msg_send) == 0:
161 | return
162 | print_msg(msg_send)
163 | for tosend in subscribers:
164 | room = bot.search_chatrooms(name=tosend)
165 | for r in room:
166 | if r['UserName'] == group: # don't send back to the source
167 | continue
168 | if r['NickName'] != tosend: # check group name exact match
169 | continue
170 | for m in msg_send: # iterate messages (for images, videos, and files)
171 | bot.send(m, toUserName=r['UserName'])
172 |
173 | if __name__ == '__main__':
174 | bot.run()
175 |
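The "magic suffix" check above, hashlib.sha256(msg['Text']).hexdigest()[-2:] == '23', works because Python 2 strings are bytes; on Python 3 the text must be encoded first. A minimal sketch (is_magic is a hypothetical helper name):

# Sketch: the same trigger written for Python 3 (hashlib wants bytes).
import hashlib

def is_magic(text):
    return hashlib.sha256(text.encode('utf-8')).hexdigest()[-2:] == '23'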
--------------------------------------------------------------------------------
/爬虫小demo/31 下载bilibili视频.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import html
3 | import re
4 | import urllib3
5 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
6 |
7 | def star(url):
8 | url2 = "https://api.bilibili.com/x/player/playurl?avid={avid}&cid={cid}&qn=32&type=&otype=json"
9 | headers2 = {
10 | "host": "",
11 | "Referer": "https://www.bilibili.com",
12 | "User-Agent": "Mozilla/5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML,likeGecko)Chrome/63.0.3239.132Safari/537.36"
13 | }
14 |
15 | avid = re.findall("video/av(.+)\?", url)
16 | print(avid)
17 | cid ,name = get_cid(avid[0])
18 | print(cid,name)
19 | flv_url , size = get_flvurl(url2.format(avid=avid[0],cid=cid))
20 | shuju = size / 1024 / 1024
21 | print("本视频大小为:%.2fM" % shuju)
22 |
23 | h = re.findall("https://(.+)com",flv_url)
24 | host = h[0]+"com"
25 |
26 | headers2["host"] = host
27 | res = requests.get(flv_url,headers=headers2,stream=True, verify=False)
28 | print(res.status_code)
29 | save_movie(res,name)
30 |
31 | def get_cid(aid):#获得cid
32 | header = {
33 | 'host': 'api.bilibili.com',
34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
35 | }
36 | url = "https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp".format(aid=aid)
37 | response = requests.get(url,headers=header).json()
38 | # print(response["data"])
39 | # index picks which part of a multi-part collection to download: 0 is the first video, 1 the second, 2, 3, 4 ... and so on
40 | index = 0
41 | return response["data"][index]["cid"] ,response["data"][index]["part"]
42 | def get_flvurl(url):#获得视频真实flv地址
43 | header = {'host': 'api.bilibili.com',
44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'}
45 |
46 | response = requests.get(url,headers=header).json()
47 | return response["data"]["durl"][0]["url"],response["data"]["durl"][0]["size"]
48 | def save_movie(res,name):#保存视频
49 | chunk_size = 1024
50 | with open("{name}.flv".format(name = name),"wb") as f:
51 | for data in res.iter_content(chunk_size=chunk_size):
52 | f.write(data)
53 |
54 |
55 | if __name__ == "__main__":
56 | # Replace '583959574' after 'av' below with the id of the video you want to download
57 | url = "https://www.bilibili.com/video/av583959574?spm_id_from=333.334.b_62696c695f646f756761.5"
58 | star(url)
59 |
60 |
61 |
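star() already prints the total size returned by get_flvurl, but save_movie discards it. A hedged variant that streams the response and prints rough progress against that size (save_movie_with_progress is a hypothetical name, following the script above):

# Sketch: stream the flv to disk and report progress against the known size.
def save_movie_with_progress(res, name, total_size):
    downloaded = 0
    with open("{name}.flv".format(name=name), "wb") as f:
        for data in res.iter_content(chunk_size=1024):
            f.write(data)
            downloaded += len(data)
            print("\rdownloaded %.2f%%" % (downloaded * 100.0 / total_size), end="")
    print()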
--------------------------------------------------------------------------------
/爬虫小demo/32 m3u8.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | from Crypto.Cipher import AES
4 |
5 | def m3u8(url):
6 | header = {
7 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
8 | }
9 | # requests得到m3u8文件内容
10 | content = requests.get(url, headers=header).text
11 | if "#EXTM3U" not in content:
12 | print("这不是一个m3u8的视频链接!")
13 | return False
14 | if "EXT-X-KEY" not in content:
15 | print("没有加密")
16 | return False
17 |
18 | # 使用re正则得到key和视频地址
19 | jiami = re.findall('#EXT-X-KEY:(.*)',content)
20 | key = re.findall('URI="(.*)"', jiami[0])
21 | vi = re.findall('IV=(.*)', jiami[0])[0]
22 |
23 | # 得到每一个ts视频链接
24 |
25 | # tslist = re.findall('EXTINF:(.*), (. *)',content.replace(' ', '').replace(r'\n', ''))
26 | tslist = re.findall('v.f240.ts(.*)',content)
27 |
28 | newlist = []
29 | for i in tslist:
30 | newlist.append("v.f240.ts" + i)
31 | # print(newlist)
32 | # 得到key的链接并请求得到加密的key值
33 | keyurl = key[0]
34 | keycontent = requests.get(keyurl, headers=header).content
35 |
36 | # 得到每一个完整视频的链接地址
37 | base_url = url.replace(url.split('/')[-1], '')
38 | # print(base_url)
39 | tslisturl = []
40 | for i in newlist:
41 | tsurl = base_url + i
42 | tslisturl.append(tsurl)
43 |
44 | # Build the decryptor; this needs the third-party package pycrypto (imported as Crypto)
45 | # Note: after installing pycrypto, `from Crypto.Cipher import AES` may still fail.
46 | # Open the Python environment's Lib/site-packages folder, find the `crypto`
47 | # folder (it contains the Cipher package) and rename it to `Crypto`; the import then works.
48 | # The IV must be 16 bytes, e.g. b'0000000000000000', otherwise AES raises "ValueError: IV must be 16 bytes long"
49 | cryptor = AES.new(keycontent, AES.MODE_CBC, b'0000000000000000')
50 |
51 | # fetch each ts segment
52 | for i in tslisturl:
53 | print(i)
54 | res = requests.get(i, headers=header)
55 | # decrypt the downloaded segment
56 | cont = cryptor.decrypt(res.content)
57 | # append the decrypted data to one mp4 file; the name is arbitrary (here a Xiaoe-tech download test)
58 | with open('14-搜索组件界面实现.mp4', 'ab+') as f:
59 | f.write(cont)
60 | return True
61 |
62 | if __name__ == '__main__':
63 | # m3u8 addresses of the Xiaoe-tech videos, found in the page source
64 | # url = "https://1252524126.vod2.myqcloud.com/9764a7a5vodtransgzp1252524126/91c29aad5285890807164109582/drm/v.f146750.m3u8"
65 | # url = "https://1258102968.vod2.myqcloud.com/ed7d8254vodtranscq1258102968/a61912e43701925923160746329/drm/v.f240.m3u8?t=62dfad73&us=DYws6oOg3A&sign=1d4381d06b276e87eae478a23f3d6375"
66 | url = "https://1258102968.vod2.myqcloud.com/ed7d8254vodtranscq1258102968/a3ae8ff93701925923160630524/drm/v.f240.m3u8?t=62dfaf5a&us=RquNSsL6XT&sign=8bec9ca974f9413c9bad7a9e8d620ae2"
67 | pd = m3u8(url)
68 | if pd:
69 | print('视频下载完成!')
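
The script extracts IV=... into vi but then hard-codes a 16-byte zero IV. A hedged sketch of feeding the extracted value to AES instead, assuming the playlist writes the IV as a 0x-prefixed hex string (the usual m3u8 form); build_decryptor is a hypothetical helper:

# Sketch: build the AES decryptor from the IV found in the playlist, falling
# back to the hard-coded zero IV used above when none is present.
import binascii
from Crypto.Cipher import AES

def build_decryptor(key_bytes, iv_field=None):
    if iv_field:
        iv = binascii.unhexlify(iv_field.strip().replace('0x', '').replace('0X', ''))
    else:
        iv = b'0000000000000000'
    return AES.new(key_bytes, AES.MODE_CBC, iv)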
--------------------------------------------------------------------------------