├── .gitignore ├── README.md ├── baidunews ├── baidunews │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── news.py ├── main.py └── scrapy.cfg ├── dangdang ├── dangdang │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── dd.py ├── main.py └── scrapy.cfg ├── douban ├── douban │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── dou.py ├── main.py ├── scrapy.cfg └── ydm │ ├── YDMHTTP.py │ ├── YDMPython3.py │ ├── __init__.py │ ├── yundamaAPI-x64.dll │ └── yundamaAPI.dll ├── examples ├── __init__.py ├── example-1.py ├── example-10.py ├── example-11.py ├── example-12.py ├── example-13.py ├── example-14.py ├── example-15.py ├── example-16.py ├── example-17.py ├── example-18.py ├── example-19.py ├── example-2.py ├── example-20.py ├── example-21.py ├── example-22.py ├── example-23.py ├── example-24.py ├── example-25.py ├── example-26.py ├── example-3.py ├── example-4.py ├── example-5.py ├── example-6.py ├── example-7.py ├── example-8.py └── example-9.py └── jdgoods ├── jdgoods ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── good.py ├── main.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.pyc 3 | *.log 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 概览 2 | 3 | * 零基础学习python及爬虫, python版本为3.5 4 | * 代码中为了便于调试都有print输出部分,如果需要调试的可以帮注释去掉 5 | 6 | # 目录 7 | 8 | ### examples 9 | 10 | 本目录中主要是python基础和爬虫需要用到的常用扩展库的使用 11 | 12 | 1. [example-1.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-1.py) python语法基础 13 | 2. [example-2.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-2.py) python控制流与小实例 14 | 3. [example-3.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-3.py) python函数详解 15 | 4. [example-4.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-4.py) python模块实战 16 | 5. [example-5.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-5.py) python文件操作实战 17 | 6. [example-6.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-6.py) python异常处理实战 18 | 7. [example-7.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-7.py) 面向对象编程 19 | 8. [example-8.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-8.py) 正则表达式-原子 20 | 9. [example-9.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-9.py) 正则表达式-元字符 21 | 10. [example-10.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-10.py) 正则表达式-模式修正符 22 | 11. [example-11.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-11.py) 正则表达式-贪婪模式和懒惰模式 23 | 12. [example-12.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-12.py) 简单爬虫的编写(urllib学习) 24 | 13. [example-13.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-13.py) 超时设置 25 | 14. [example-14.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-14.py) 自动模拟HTTP请求与百度信息自动搜索爬虫实战 26 | 15. 
[example-15.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-15.py) 自动模拟HTTP请求之自动POST实战 27 | 16. [example-16.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-16.py) 爬虫的异常处理实战 28 | 17. [example-17.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-17.py) 爬虫的浏览器伪装技术实战 29 | 18. [example-18.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-18.py) CSDN博文爬虫实战 30 | 19. [example-19.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-19.py) 糗事百科段子爬虫实战 31 | 20. [example-20.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-20.py) 用户代理池构建实战 32 | 21. [example-21.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-21.py) IP代理池构建实战 33 | 22. [example-22.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-22.py) 淘宝商品图片爬虫实战 34 | 23. [example-23.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-23.py) 如何同时使用用户代理池和IP代理池 35 | 24. [example-24.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-24.py) 在Urllib中使用XPath表达式 36 | 25. [example-25.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-25.py) BeautifulSoup基础实战 37 | 26. [example-26.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-26.py) PhantomJS基础实战 38 | 39 | ### dangdang 40 | scrapy实现当当网商品爬虫实战 41 | ### baidunews 42 | scrapy百度新闻爬虫实战 43 | ### douban 44 | scrapy豆瓣网登陆爬虫与验证码自动识别实战 45 | ### jdgoods 46 | scrapy与urllib的整合使用(爬取京东图书商品) 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /baidunews/baidunews/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/baidunews/baidunews/__init__.py -------------------------------------------------------------------------------- /baidunews/baidunews/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaidunewsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | link = scrapy.Field() 14 | title = scrapy.Field() 15 | content = scrapy.Field() 16 | print(link) 17 | -------------------------------------------------------------------------------- /baidunews/baidunews/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaidunewsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /baidunews/baidunews/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BaidunewsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /baidunews/baidunews/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidunews project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidunews' 13 | 14 | SPIDER_MODULES = ['baidunews.spiders'] 15 | NEWSPIDER_MODULE = 'baidunews.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'baidunews.middlewares.BaidunewsSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'baidunews.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'baidunews.pipelines.BaidunewsPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- 
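A note on the baidunews project shown above: `BaidunewsPipeline` only returns each item unchanged and the `ITEM_PIPELINES` block in settings.py is commented out, so the crawled news items are never persisted. A minimal sketch of how the pipeline could be extended to write items to a JSON-lines file follows; the output filename and the decision to decode `content` are illustrative assumptions, not part of the original project.

```python
# Hypothetical extension of baidunews/pipelines.py; to activate it, uncomment
# ITEM_PIPELINES = {'baidunews.pipelines.BaidunewsPipeline': 300} in settings.py.
import json


class BaidunewsPipeline(object):
    def open_spider(self, spider):
        # one JSON object per line; the path is an arbitrary choice
        self.file = open('baidunews_items.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        record = {
            'link': item['link'],
            'title': item['title'],
            # item['content'] holds response.body (bytes), so decode it first
            'content': item['content'].decode('utf-8', 'ignore'),
        }
        self.file.write(json.dumps(record, ensure_ascii=False) + '\n')
        return item
```

With a pipeline like this enabled, each article yielded by the news spider would end up as one JSON record on disk.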
/baidunews/baidunews/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /baidunews/baidunews/spiders/news.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from baidunews.items import BaidunewsItem 5 | import re 6 | from scrapy.http import Request 7 | 8 | class NewsSpider(scrapy.Spider): 9 | name = "news" 10 | #allowed_domains = ["baidu.com"] 11 | start_urls = ['http://baidu.com/'] 12 | # 网站分类 13 | allid = ['LocalHouseNews', 'LocalNews'] 14 | # 构造请求地址 15 | allurl = [] 16 | for i in range(0, len(allid)): 17 | thisurl = "http://news.baidu.com/widget?id=" + allid[i] + "&ajax=json" 18 | allurl.append(thisurl) 19 | def parse(self, response): 20 | for j in range(0, len(self.allurl)): 21 | print("正在爬取第" + str(j) + "个栏目") 22 | yield Request(self.allurl[j], callback = self.getData1) 23 | # 处理爬取到的数据 24 | def getData1(self, response): 25 | data = response.body.decode('utf-8', 'ignore') 26 | pat1 = '"m_relate_url":"(.*?)"' 27 | pat2 = '"url":"(.*?)"' 28 | # 提取json串中的文章url地址 29 | url1 = re.compile(pat1, re.S).findall(data) 30 | url2 = re.compile(pat2, re.S).findall(data) 31 | if(len(url1) != 0): 32 | url = url1 33 | else: 34 | url = url2 35 | for k in range(0, len(url)): 36 | articleurl = url[k] 37 | # 处理url中转义符号\/\/ 38 | articleurl = re.sub('\\\/', '/', articleurl) 39 | yield Request(articleurl, callback = self.getData2) 40 | def getData2(self, response): 41 | item = BaidunewsItem() 42 | item['link'] = response.url 43 | item['title'] = response.xpath("/html/head/title/text()").extract() 44 | item['content'] = response.body 45 | yield item -------------------------------------------------------------------------------- /baidunews/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl news".split()) -------------------------------------------------------------------------------- /baidunews/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidunews.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidunews 12 | -------------------------------------------------------------------------------- /dangdang/dangdang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/dangdang/dangdang/__init__.py -------------------------------------------------------------------------------- /dangdang/dangdang/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class DangdangItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | title 
= scrapy.Field() 14 | link = scrapy.Field() 15 | comment = scrapy.Field() 16 | -------------------------------------------------------------------------------- /dangdang/dangdang/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DangdangSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /dangdang/dangdang/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymysql 9 | 10 | class DangdangPipeline(object): 11 | def process_item(self, item, spider): 12 | conn = pymysql.connect(host="127.0.0.1", user="root", passwd="ok", db="dangdang", charset="utf8") 13 | for i in range(0, len(item["title"])): 14 | title = item["title"][i] 15 | link = item["link"][i] 16 | comment = item["comment"][i] 17 | sql = "insert into goods(title, link, comment) values('" + title + "','" + link + "','" + comment + "')" 18 | try: 19 | conn.query(sql) 20 | except Exception as e: 21 | print(e) 22 | conn.commit() 23 | conn.close() 24 | return item 25 | -------------------------------------------------------------------------------- /dangdang/dangdang/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dangdang project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dangdang' 13 | 14 | SPIDER_MODULES = ['dangdang.spiders'] 15 | NEWSPIDER_MODULE = 'dangdang.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'dangdang.middlewares.DangdangSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'dangdang.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'dangdang.pipelines.DangdangPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- 
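One caveat about `DangdangPipeline` above: it opens a new MySQL connection for every item and assembles the INSERT statement by string concatenation, so any title or comment containing a quote character breaks the query, and the pattern is injection-prone. A hedged alternative sketch, keeping one connection per crawl and letting pymysql bind the parameters, could look like this; the connection credentials simply mirror the originals.

```python
# Alternative sketch of dangdang/pipelines.py, not the original implementation.
import pymysql


class DangdangPipeline(object):
    def open_spider(self, spider):
        # reuse a single connection for the whole crawl instead of one per item
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="ok",
                                    db="dangdang", charset="utf8")

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        sql = "insert into goods(title, link, comment) values (%s, %s, %s)"
        with self.conn.cursor() as cursor:
            for row in zip(item["title"], item["link"], item["comment"]):
                # parameter binding escapes quotes in the scraped text for us
                cursor.execute(sql, row)
        self.conn.commit()
        return item
```

The `ITEM_PIPELINES` entry already registered in the settings file above would pick this version up without further changes.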
/dangdang/dangdang/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dangdang/dangdang/spiders/dd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Script Name : dd.py 3 | # Author : gaoyaqiu(球哥) 4 | # Created : 2017-03-19 5 | # Description : Scrapy实现当当网商品爬虫实战 6 | 7 | 8 | import scrapy 9 | from dangdang.items import DangdangItem 10 | from scrapy.http import Request 11 | 12 | class DdSpider(scrapy.Spider): 13 | name = "dd" 14 | allowed_domains = ["dangdang.com"] 15 | start_urls = ['http://category.dangdang.com/pg1-cid4008154.html'] 16 | 17 | def parse(self, response): 18 | for i in range(1, 3): 19 | url = "http://category.dangdang.com/pg" + str(i) + "-cid4008154.html" 20 | yield Request(url, callback = self.handle_items) 21 | def handle_items(self, response): 22 | item = DangdangItem() 23 | item["title"] = response.xpath("//a[@name='sort-big-pic']/@title").extract() 24 | item["link"] = response.xpath("//a[@name='sort-big-pic']/@href").extract() 25 | item["comment"] = response.xpath("//a[@name='sort-evaluate']/text()").extract() 26 | yield item 27 | -------------------------------------------------------------------------------- /dangdang/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl dd".split()) -------------------------------------------------------------------------------- /dangdang/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dangdang.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dangdang 12 | -------------------------------------------------------------------------------- /douban/douban/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/douban/__init__.py -------------------------------------------------------------------------------- /douban/douban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /douban/douban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /douban/douban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /douban/douban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for douban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'douban' 13 | 14 | SPIDER_MODULES = ['douban.spiders'] 15 | NEWSPIDER_MODULE = 'douban.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'douban.middlewares.DoubanSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'douban.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'douban.pipelines.DoubanPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /douban/douban/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /douban/douban/spiders/dou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.http import Request, FormRequest 4 | import urllib.request 5 | import os 6 | 7 | 8 | class DouSpider(scrapy.Spider): 9 | name = "dou" 10 | allowed_domains = ["douban.com"] 11 | # start_urls = ['http://douban.com/'] 12 | 13 | header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} 14 | 15 | def start_requests(self): 16 | # 请求登陆页 17 | return [Request('https://accounts.douban.com/login', meta = {'cookiejar': 1}, callback = self.parse)] 18 | def parse(self, response): 19 | # 判断是否存在验证码 20 | captcha_image = response.xpath("//img[@id='captcha_image']/@src").extract() 21 | print(captcha_image) 22 | if len(captcha_image) > 0: 23 | print('有验证码, 等待识别...') 24 | # 将验证码下载到本地 25 | local_path = '/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 26 | urllib.request.urlretrieve(captcha_image[0], filename = local_path) 27 | # 方法1: 通过半自动人工处理 28 | #captcha_value = input('请输入/Users/gaoyaqiu/Downloads/python-test/test/中captcha.png的验证码内容! ') 29 | # 方法2: 通过接口实现全自动处理-1 (这里使用的是云打码的api,因他们接口不提供mac版本,所以这里例子只能在windows中使用) 30 | # 使用注意: 需要将YDMPython3.py 中的账号信息替换成自己的账号 31 | ''' 32 | cmd = 'python3.5 /Users/gaoyaqiu/git/python-spider/douban/ydm/YDMPython3.py' 33 | r = os.popen(cmd) 34 | captcha_value = r.read() 35 | ''' 36 | 37 | # 方法3: 通过接口实现全自动处理-2 (这里使用的是云打码的http api python2.7) 38 | cmd = 'python2.7 /Users/gaoyaqiu/git/python-spider/douban/ydm/YDMHTTP.py' 39 | r = os.popen(cmd) 40 | read_result = r.read().split() 41 | cid = read_result[0] 42 | if int(cid) > 0: 43 | captcha_value = str(read_result[1]) 44 | print('当前验证码识别结果为: cid: %s, result: %s' % (cid, captcha_value)) 45 | params = { 46 | 'captcha-solution': captcha_value, 47 | 'redir': 'https://www.douban.com/people/156127818/', # 登陆成功之后的重定向地址,可是自己主页地址 48 | 'form_email': '账号', 49 | 'form_password': '密码' 50 | } 51 | else: 52 | print('识别验证码出错cid: ' + cid) 53 | else: 54 | params = { 55 | 'redir': 'https://www.douban.com/people/156127818/', 56 | 'form_email': '账号', 57 | 'form_password': '密码' 58 | } 59 | 60 | print('登陆中。。。') 61 | # 开始登陆 62 | return [FormRequest.from_response(response, 63 | # 设置cookie 64 | meta = {'cookiejar': response.meta['cookiejar']}, 65 | # 设置header信息 66 | headers = self.header, 67 | # 设置post表单提交数据 68 | formdata = params, 69 | callback = self.next 70 | )] 71 | def next(self, response): 72 | # 登陆成功之后查看title信息,确认是跳转成功 73 | title = response.xpath("/html/head/title/text()").extract() 74 | print(title) 75 | 76 | -------------------------------------------------------------------------------- /douban/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl dou".split()) 3 | #cmdline.execute("scrapy crawl dou --nolog".split()) -------------------------------------------------------------------------------- /douban/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | 
# 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = douban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douban 12 | -------------------------------------------------------------------------------- /douban/ydm/YDMHTTP.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import httplib, mimetypes, urlparse, json, time 3 | 4 | 5 | class YDMHttp: 6 | apiurl = 'http://api.yundama.com/api.php' 7 | 8 | username = '' 9 | password = '' 10 | appid = '' 11 | appkey = '' 12 | 13 | def __init__(self, username, password, appid, appkey): 14 | self.username = username 15 | self.password = password 16 | self.appid = str(appid) 17 | self.appkey = appkey 18 | 19 | def request(self, fields, files=[]): 20 | try: 21 | response = post_url(self.apiurl, fields, files) 22 | response = json.loads(response) 23 | except Exception as e: 24 | response = None 25 | return response 26 | 27 | def balance(self): 28 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 29 | 'appkey': self.appkey} 30 | response = self.request(data) 31 | if (response): 32 | if (response['ret'] and response['ret'] < 0): 33 | return response['ret'] 34 | else: 35 | return response['balance'] 36 | else: 37 | return -9001 38 | 39 | def login(self): 40 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 41 | 'appkey': self.appkey} 42 | response = self.request(data) 43 | if (response): 44 | if (response['ret'] and response['ret'] < 0): 45 | return response['ret'] 46 | else: 47 | return response['uid'] 48 | else: 49 | return -9001 50 | 51 | def upload(self, filename, codetype, timeout): 52 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 53 | 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 54 | file = {'file': filename} 55 | response = self.request(data, file) 56 | if (response): 57 | if (response['ret'] and response['ret'] < 0): 58 | return response['ret'] 59 | else: 60 | return response['cid'] 61 | else: 62 | return -9001 63 | 64 | def result(self, cid): 65 | data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 66 | 'appkey': self.appkey, 'cid': str(cid)} 67 | response = self.request(data) 68 | return response and response['text'] or '' 69 | 70 | def decode(self, filename, codetype, timeout): 71 | cid = self.upload(filename, codetype, timeout) 72 | if (cid > 0): 73 | for i in range(0, timeout): 74 | result = self.result(cid) 75 | if (result != ''): 76 | return cid, result 77 | else: 78 | time.sleep(1) 79 | return -3003, '' 80 | else: 81 | return cid, '' 82 | 83 | 84 | ###################################################################### 85 | 86 | def post_url(url, fields, files=[]): 87 | urlparts = urlparse.urlsplit(url) 88 | return post_multipart(urlparts[1], urlparts[2], fields, files) 89 | 90 | 91 | def post_multipart(host, selector, fields, files): 92 | content_type, body = encode_multipart_formdata(fields, files) 93 | h = httplib.HTTP(host) 94 | h.putrequest('POST', selector) 95 | h.putheader('Host', host) 96 | h.putheader('Content-Type', content_type) 97 | h.putheader('Content-Length', str(len(body))) 98 | h.endheaders() 99 | h.send(body) 100 | errcode, errmsg, headers = h.getreply() 101 | return h.file.read() 102 | 
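# Note: this module targets Python 2 — it imports httplib and urlparse and ends
# with a bare "print cid, result" statement — which is why douban/spiders/dou.py
# shells out to it with python2.7 rather than importing it directly.
# encode_multipart_formdata() below hand-builds a multipart/form-data body around
# a fixed WebKit-style boundary so the captcha image file can be uploaded together
# with the account and appkey form fields.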
103 | 104 | def encode_multipart_formdata(fields, files=[]): 105 | BOUNDARY = 'WebKitFormBoundaryJKrptX8yPbuAJLBQ' 106 | CRLF = '\r\n' 107 | L = [] 108 | for field in fields: 109 | key = field 110 | value = fields[key] 111 | L.append('--' + BOUNDARY) 112 | L.append('Content-Disposition: form-data; name="%s"' % key) 113 | L.append('') 114 | L.append(value) 115 | for field in files: 116 | key = field 117 | filepath = files[key] 118 | L.append('--' + BOUNDARY) 119 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filepath)) 120 | L.append('Content-Type: %s' % get_content_type(filepath)) 121 | L.append('') 122 | L.append(open(filepath, 'rb').read()) 123 | L.append('--' + BOUNDARY + '--') 124 | L.append('') 125 | body = CRLF.join(L) 126 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 127 | return content_type, body 128 | 129 | 130 | def get_content_type(filename): 131 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 132 | 133 | 134 | ###################################################################### 135 | 136 | # 用户名 137 | username = 'username' 138 | 139 | # 密码 140 | password = 'password' 141 | 142 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 143 | appid = 1 144 | 145 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 146 | appkey = '3495daa179e863b03a0483ad99e6cadd' 147 | 148 | # 图片文件 149 | filename = '/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 150 | 151 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 152 | codetype = 3000 153 | 154 | # 超时时间,秒 155 | timeout = 60 156 | 157 | # 检查 158 | if (username == ''): 159 | print('请设置好相关参数再测试') 160 | else: 161 | # 初始化 162 | yundama = YDMHttp(username, password, appid, appkey) 163 | 164 | # 登陆云打码 165 | # uid = yundama.login(); 166 | #print('uid: %s' % uid) 167 | 168 | # 查询余额 169 | # balance = yundama.balance(); 170 | # print('balance: %s' % balance) 171 | 172 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 173 | cid, result = yundama.decode(filename, codetype, timeout); 174 | #print('cid: %s, result: %s' % (cid, result)) 175 | print cid, result 176 | 177 | ###################################################################### 178 | -------------------------------------------------------------------------------- /douban/ydm/YDMPython3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | from ctypes import * 6 | 7 | # 下载接口放目录 http://www.yundama.com/apidoc/YDM_SDK.html 8 | # 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html 9 | # 所有函数请查询 http://www.yundama.com/apidoc 10 | 11 | print('>>>正在初始化...') 12 | 13 | YDMApi = windll.LoadLibrary('/Users/gaoyaqiu/git/python-spider/douban/ydm/yundamaAPI-x64.dll') 14 | 15 | # 1. http://www.yundama.com/index/reg/developer 注册开发者账号 16 | # 2. http://www.yundama.com/developer/myapp 添加新软件 17 | # 3. 使用添加的软件ID和密钥进行开发,享受丰厚分成 18 | 19 | appId = 1 # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 20 | appKey = b'1' # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 
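# Note: appId and appKey here, like the username and password below, are placeholder
# values; as douban/spiders/dou.py points out, they must be replaced with a real
# yundama account before this script can return a recognition result (the b'test'
# guard below does not catch the b'1' placeholders).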
21 | 22 | print('软件ID:%d\r\n软件密钥:%s' % (appId, appKey)) 23 | 24 | # 注意这里是普通会员账号,不是开发者账号,注册地址 http://www.yundama.com/index/reg/user 25 | # 开发者可以联系客服领取免费调试题分 26 | 27 | username = b'1' 28 | password = b'1' 29 | 30 | if username == b'test': 31 | exit('\r\n>>>请先设置用户名密码') 32 | 33 | ####################### 一键识别函数 YDM_EasyDecodeByPath ####################### 34 | 35 | print('\r\n>>>正在一键识别...') 36 | 37 | # 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 38 | codetype = 3000 39 | 40 | # 分配30个字节存放识别结果 41 | result = c_char_p(b" ") 42 | 43 | # 识别超时时间 单位:秒 44 | timeout = 60 45 | 46 | # 验证码文件路径 47 | filename = b'/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 48 | 49 | # 一键识别函数,无需调用 YDM_SetAppInfo 和 YDM_Login,适合脚本调用 50 | captchaId = YDMApi.YDM_EasyDecodeByPath(username, password, appId, appKey, filename, codetype, timeout, result) 51 | 52 | print("一键识别:验证码ID:%d,识别结果:%s" % (captchaId, result.value)) 53 | 54 | ################################################################################ 55 | 56 | 57 | ########################## 普通识别函数 YDM_DecodeByPath ######################### 58 | 59 | print('\r\n>>>正在登陆...') 60 | 61 | # 第一步:初始化云打码,只需调用一次即可 62 | YDMApi.YDM_SetAppInfo(appId, appKey) 63 | 64 | # 第二步:登陆云打码账号,只需调用一次即可 65 | uid = YDMApi.YDM_Login(username, password) 66 | 67 | if uid > 0: 68 | 69 | print('>>>正在获取余额...') 70 | 71 | # 查询账号余额,按需要调用 72 | balance = YDMApi.YDM_GetBalance(username, password) 73 | 74 | print('登陆成功,用户名:%s,剩余题分:%d' % (username, balance)) 75 | 76 | print('\r\n>>>正在普通识别...') 77 | 78 | # 第三步:开始识别 79 | 80 | # 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 81 | codetype = 3000 82 | 83 | # 分配30个字节存放识别结果 84 | result = c_char_p(b" ") 85 | 86 | # 验证码文件路径 87 | filename = b'/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 88 | 89 | # 普通识别函数,需先调用 YDM_SetAppInfo 和 YDM_Login 初始化 90 | captchaId = YDMApi.YDM_DecodeByPath(filename, codetype, result) 91 | 92 | print("普通识别:验证码ID:%d,识别结果:%s" % (captchaId, result.value)) 93 | 94 | else: 95 | print('登陆失败,错误代码:%d' % uid) 96 | 97 | ################################################################################ 98 | 99 | print('\r\n>>>错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html') 100 | 101 | input('\r\n测试完成,按回车键结束...') 102 | -------------------------------------------------------------------------------- /douban/ydm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/ydm/__init__.py -------------------------------------------------------------------------------- /douban/ydm/yundamaAPI-x64.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/ydm/yundamaAPI-x64.dll -------------------------------------------------------------------------------- /douban/ydm/yundamaAPI.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/ydm/yundamaAPI.dll -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/examples/__init__.py 
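Related to the douban project above: dou.py invokes these yundama scripts with `os.popen` and parses their stdout into a captcha id and a recognition result. A small hedged sketch of the same call using `subprocess.run` (available from Python 3.5) is shown below; the helper name is hypothetical, while the script path and the two-token output format come from dou.py and YDMHTTP.py.

```python
# Hypothetical wrapper around the os.popen call used in douban/spiders/dou.py.
import subprocess


def recognize_captcha(script_path, python_bin="python2.7", timeout=60):
    """Run the yundama HTTP script and return (cid, captcha_text).

    YDMHTTP.py prints "cid result" on success; a non-positive cid means failure.
    """
    proc = subprocess.run([python_bin, script_path], stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE, universal_newlines=True,
                          timeout=timeout)
    parts = proc.stdout.split()
    if len(parts) < 2 or int(parts[0]) <= 0:
        raise RuntimeError("captcha recognition failed: " + proc.stdout.strip())
    return int(parts[0]), parts[1]


# usage, with the path hard-coded in dou.py:
# cid, value = recognize_captcha('/Users/gaoyaqiu/git/python-spider/douban/ydm/YDMHTTP.py')
```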
-------------------------------------------------------------------------------- /examples/example-1.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-1.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-04 4 | # Description : No.1 python语法基础 5 | 6 | # 在控制台打印hello world,使用print方法 7 | # 学编程的一般刚认识的就是hello world, ^_^ 8 | print("hello world!") 9 | 10 | # 1.1 常见的两种注释用法(当我们想让某些区域的代码不起作用时就会用到注释) 11 | ''' 12 | 1、#号表示: 单行注释 13 | 2、三引号表示: 多行注释 14 | ''' 15 | 16 | # 单行注释 17 | #print("hello world!") 18 | 19 | # 多行注释 20 | ''' 21 | print("hello world!") 22 | ''' 23 | 24 | # 1.2 标识符(标识某个东西的符号) 25 | ''' 26 | python 中标识符命名规则为: 27 | 第一个字符为字母或下划线 28 | 除第一个字符以外的其他字符可以是字母、下划线或数字 29 | ''' 30 | # 标识符 31 | # nihao、_nihao、_nihao_nihao 32 | 33 | # 1.3 变量(简单来说可以变化的量叫做变量) 34 | # 变量 35 | nihao = 1 36 | _nihao = 2 37 | _nihao_nihao = 3 38 | 39 | # 左面的nihao、_nihao、_nihao_nihao 就是变量, 右面的 1、2、3就是变量对应的值 40 | 41 | # 1.4 数据类型(世界上有很多数据类型,为了更方便处理这些数据,我们给这些数据进行分类,进而形成了数据类型) 42 | # python 中常见的数据类型有 43 | # 数、字符串、列表(list)、元组(tuple)、集合(set)、字典(dictinoary) 44 | 45 | # 数 46 | # abc 这个变量的数据类型就是数 47 | abc = 8 48 | 49 | # 字符串 50 | # 单引号、双引号、三引号都是字符串 51 | a1 = "abc" 52 | a2 = 'abc' 53 | a3 = '''abc''' 54 | 55 | # 列表 (存储多个元素, 列表中的元素可以重新赋值, 最外围用[]中括号表示) 56 | # 列表可以直接通过下标取值,下标是从0开始编号,0就表示第一个值,不管是数组、还是列表、元组都会从0开始编号 57 | # 取第1个值的方式为list[0]、第2个为b[1]、依次类推。。。 58 | # 还可以修改列表中某个元素的值,如想修改8的值为100,那么可以使用 b[1] = 100 59 | b = [7, 8, "abc", 9] 60 | #print(b[0]) 61 | b[1] = 100 62 | #print(b[1]) 63 | 64 | # 元组 (存储多个元素, 元组中的元素不可以重新赋值, 最外围用()小括号表示) 65 | c = (7, 8, "abc", 9) 66 | #print(c) 67 | 68 | # 字典 {键: 值, 键: 值, ...}, 69 | # 取值格式: 字典名["对应的键名"] 70 | d = {"name": "gaoyaqiu", "sex": "boy", "job": "程序猿"} 71 | #print(d["job"]) 72 | 73 | # 集合 (用的不多,最好的用处就是去重,) 74 | # 去重 (比如下面的例子中a比较多,使用集合之后,重复的a都会过滤掉只保留一个) 75 | e1 = set("abcsdfsfaaaaa") 76 | #print(e1) 77 | e2 = set("abcdfg") 78 | #print(e2) 79 | 80 | # 交集 81 | r1 = e1 & e2 82 | #print(r1) 83 | 84 | # 并集 85 | r2 = e1 | e2 86 | #print(r2) 87 | 88 | # 差集 89 | r3 = e1 - e2 90 | #print(r3) 91 | 92 | # 对称差集 93 | r4 = e1 ^ e2 94 | #print(r4) 95 | 96 | 97 | # 1.5 运算符 (常见的有+、-、*、/、%号等) 98 | # 对于运算符优先级的使用,常见的技巧 99 | # 1. 基本的数学运算符的优先级规律在python中基本上适用 100 | # 2. 
实在不清楚优先级规律, 为需要先执行的运算加上括号 101 | 102 | # +、-、*、/、% 运算 103 | h1 = 1 + 8 * 2 - 1 104 | #print(h1) 105 | 106 | h2 = (1 + 8) * 2 - 1 107 | #print(h2) 108 | 109 | h3 = 21 % 2 110 | #print(h3) 111 | 112 | # + 号也可以做为字符串连接使用 113 | o1 = "hello " 114 | o2 = o1 + "python" 115 | #print(o2) 116 | 117 | # 1.6 缩进 118 | # python 是一门强制缩进的语言,有些朋友可能比较方案这一点,但是,这一点的存在,让 119 | # python代码变得更加美观。python中缩进规律: 120 | # 同一层的代码,处于同一个缩进幅度上,下一层代码,需要相对于上一个层次的代码进行缩进 121 | # 建议使用tab键进行缩进 122 | 123 | b = "10" 124 | if(b == "10"): 125 | print("abc") 126 | 127 | 128 | -------------------------------------------------------------------------------- /examples/example-10.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-10.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.10 正则表达式-模式修正符 5 | 6 | # 模式修正符 7 | ''' 8 | I 匹配时忽略大小写 * 9 | M 多行匹配 * 10 | L 本地化识别匹配 11 | U unicode 12 | S 让.匹配包括换行符 * 13 | ''' 14 | import re 15 | 16 | string = 'Python' 17 | 18 | pat = "pyt" 19 | rst = re.search(pat, string, re.I) 20 | print(rst) 21 | 22 | -------------------------------------------------------------------------------- /examples/example-11.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-11.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.11 正则表达式-贪婪模式和懒惰模式 5 | 6 | import re 7 | 8 | # 贪婪模式和懒惰模式 9 | string = 'pythony' 10 | # 默认就是贪婪模式 11 | pat = "p.*y" 12 | 13 | # 懒惰模式 14 | pat = "p.*?y" 15 | rst = re.search(pat, string, re.I) 16 | print(rst) 17 | 18 | # 正则函数 re.match()、 re.search()、全局匹配、re.sub() 19 | # match 从头开始匹配 20 | string = 'pythonyjkjkjssa' 21 | pat = "p.*?y" 22 | rst = re.match(pat, string) 23 | print(rst) 24 | 25 | # search 从任意地方匹配 26 | 27 | # 全局匹配函数 28 | string = 'sdpythpnyonyjkjkjptyssa' 29 | pat = "p.*?y" 30 | rst = re.compile(pat).findall(string) 31 | print(rst) 32 | 33 | 34 | # 正则表达式 实例 35 | import re 36 | 37 | # 匹配.com 和 .cn 38 | 39 | string = "百度" 40 | 41 | pat = "[a-zA-z]+://[^\s]*[.com|.cn]" 42 | 43 | ret = re.compile(pat).findall(string) 44 | 45 | print(ret) 46 | 47 | # 匹配电话号码 48 | string = "sdfsdfs021-123132432432fsfdwfds0773-23424324234sdfsdfsd" 49 | pat = "\d{3}-\d{8}|\d{4}-\d{7}" 50 | ret = re.compile(pat).findall(string) 51 | 52 | print(ret) 53 | 54 | -------------------------------------------------------------------------------- /examples/example-12.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-12.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.12 简单爬虫的编写(urllib学习) 5 | 6 | import urllib.request 7 | import re 8 | 9 | data = urllib.request.urlopen("http://edu.csdn.net").read() 10 | 11 | #print(data) 12 | 13 | # 自动提取课程页面的QQ群号码 14 | data = urllib.request.urlopen("http://edu.csdn.net/huiyiCourse/detail/253").read().decode("utf-8") 15 | pat = "

(\d*?)

" 16 | ret = re.compile(pat).findall(data) 17 | print(ret[0]) 18 | 19 | # 豆瓣网址出版爬取 20 | 21 | ''' 22 | import urllib.request 23 | import re 24 | 25 | data = urllib.request.urlopen("https://read.douban.com/provider/all").read().decode("utf-8") 26 | pat = '
(.*?)
' 27 | ret = re.compile(pat).findall(data) 28 | 29 | fh = open("/Users/gaoyaqiu/git/python-demo/douban", "w") 30 | for i in range(0, len(ret)): 31 | print(ret[i]) 32 | fh.write(ret[i] + "\n") 33 | fh.close() 34 | 35 | ''' 36 | 37 | 38 | import urllib.request 39 | 40 | # urlretrieve(网址, 存储位置) 直接下载网页到本地 41 | 42 | #urllib.request.urlretrieve("http://www.baidu.com", "/Users/gaoyaqiu/git/python-demo/baidu.html") 43 | 44 | # urlcleanup() 清除由于urllib.urlretrieve()所产生的缓存 45 | # urllib.request.urlcleanup() 46 | 47 | # info() 查看网页简介信息 48 | ''' 49 | Date: Sat, 04 Mar 2017 12:09:02 GMT 50 | Content-Type: text/html; charset=utf-8 51 | Content-Length: 61320 52 | Connection: close 53 | Vary: Accept-Encoding 54 | Expires: Sun, 1 Jan 2006 01:00:00 GMT 55 | Pragma: no-cache 56 | Cache-Control: must-revalidate, no-cache, private 57 | Set-Cookie: profile="deleted"; max-age=0; domain=read.douban.com; expires=Thu, 01-Jan-1970 00:00:00 GMT; path=/ 58 | Set-Cookie: bid=bKnd76QHTVg; Expires=Sun, 04-Mar-18 12:09:02 GMT; Domain=.douban.com; Path=/ 59 | X-DOUBAN-NEWBID: bKnd76QHTVg 60 | X-DAE-Node: sindar20d 61 | X-DAE-App: ark 62 | Server: dae 63 | Strict-Transport-Security: max-age=15552000; 64 | X-Content-Type-Options: nosniff 65 | ''' 66 | file = urllib.request.urlopen("https://read.douban.com/provider/all") 67 | print(file.info()) 68 | 69 | # getcode() 返回网站状态码 200 70 | print(file.getcode()) 71 | 72 | # geturl() 获取当前访问的url 73 | print(file.geturl()) 74 | 75 | -------------------------------------------------------------------------------- /examples/example-13.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-13.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.13 超时设置 5 | 6 | # 由于网络速度或对方服务器问题, 爬取一个网页的时候,都需要时间。如果该网页长时间未响应, 7 | # 那么系统就会判断该网页超时,有时我们就需要根据需要来设置超时时间值 8 | # 比如我们希望2秒没有反应,则判断为超时,那么timeout值就是2,再比如,有些网站服务器比较慢,那么我们希望100秒没有响应时, 9 | # 才判断为超时, 那么此时timeout值设置为100. 
10 | 11 | # 超时设置 12 | import urllib.request 13 | 14 | for i in range(0, 100): 15 | try: 16 | # 测试可以选用一个网站响应比较慢的来测试效果 17 | file = urllib.request.urlopen("http://tool.360java.com", timeout=1) 18 | print(file.read().decode("utf-8")) 19 | except Exception as err: 20 | print("出现异常: " + str(err)) 21 | 22 | -------------------------------------------------------------------------------- /examples/example-14.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-14.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-06 4 | # Description : No.14 自动模拟HTTP请求与百度信息自动搜索爬虫实战 5 | 6 | 7 | # 客户端如果要与服务器进行通讯, 需要通过http请求进行, http请求有很多种, 这里主要会对post和get两种方式进行学习 8 | # get请求实战-- 实现百度信息自动搜索 9 | import urllib.request 10 | import re 11 | 12 | keyword = "python" 13 | keyword = urllib.request.quote(keyword) 14 | 15 | # 分页获取10页数据 page = (num - 1) * 10 16 | for i in range(1, 11): 17 | url = "https://www.baidu.com/s?wd=" + keyword + "&pn=" + str((i - 1) * 10) 18 | req = urllib.request.Request(url) 19 | # 必须要设置user-agent,不然获取不到数据 20 | req.add_header('User-Agent', 21 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36') 22 | data = urllib.request.urlopen(req).read().decode("utf-8") 23 | pat1 = "title:'(.*?)'," 24 | pat2 = 'title:"(.*?)",' 25 | rst1 = re.compile(pat1).findall(data) 26 | rst2 = re.compile(pat2).findall(data) 27 | 28 | for j in range(0, len(rst1)): 29 | print(rst1[j]) 30 | 31 | for z in range(0, len(rst2)): 32 | print(rst2[z]) 33 | 34 | -------------------------------------------------------------------------------- /examples/example-15.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-15.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-07 4 | # Description : No.15 自动模拟HTTP请求之自动POST实战 5 | 6 | 7 | # post请求实战 8 | import urllib.request 9 | import urllib.parse 10 | 11 | # 这里使用了apache的一个post测试地址 12 | posturl = "http://httpbin.org/post" 13 | # post请求的参数, 当参数中有中文时,记得需要做encode处理 14 | postdata = urllib.parse.urlencode({ 15 | "name": "张三", 16 | "age": 18 17 | }).encode("utf-8") 18 | 19 | req = urllib.request.Request(posturl, postdata) 20 | rst = urllib.request.urlopen(req).read().decode("utf-8") 21 | print(rst) 22 | 23 | ''' 24 | 返回的结果如下: 其中form对应的值就是在我请求时传入的参数 25 | { 26 | "args": {}, 27 | "data": "", 28 | "files": {}, 29 | "form": { 30 | "age": "18", 31 | "name": "\u5f20\u4e09" 32 | }, 33 | "headers": { 34 | "Accept-Encoding": "identity", 35 | "Content-Length": "30", 36 | "Content-Type": "application/x-www-form-urlencoded", 37 | "Host": "httpbin.org", 38 | "User-Agent": "Python-urllib/3.5" 39 | }, 40 | "json": null, 41 | "origin": "116.226.186.125", 42 | "url": "http://httpbin.org/post" 43 | } 44 | 45 | ''' -------------------------------------------------------------------------------- /examples/example-16.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-16.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-07 4 | # Description : No.16 爬虫的异常处理实战 5 | 6 | ''' 7 | 爬虫在运行的过程中, 很多时候会遇到这样或那样的异常。如果没有异常处理,爬虫遇到异常时就会直接崩溃停止运行, 8 | 当下次再运行爬虫时,又会重头开始,所以要开发一个具有顽强生命力的爬虫,必须要进行异常处理 9 | ''' 10 | 11 | # 16.1 认识URLError与HTTPError 12 | ''' 13 | 两者都是异常处理的类, HTTPError是URLError的子类,HTTPError有异常状态码与异常原因,URLError没有异常状态码,所以 14 | 在处理时,不能使用URLError代替HTTPError。如果要代替,必须要判断是否有状态码属性 15 | ''' 16 | 17 | # 16.2 异常处理 18 | ''' 19 | 异常出现的原因: 20 | 1. 连不上服务器 21 | 2. 远程url不存在 22 | 3. 无网络 23 | 4. 
触发HTTPError 24 | ''' 25 | 26 | import urllib.request 27 | import urllib.error 28 | 29 | try: 30 | # 当直接请求csdn的博客网站时,会出现403异常, 31 | # 因为csdn会对爬虫访问进行屏蔽,那么就需要伪装成浏览器才能爬取(后面会补充) 32 | urllib.request.urlopen("http://blog.csdn.net") 33 | except urllib.error.URLError as e: 34 | # 捕捉异常,并获取异常code及异常信息 35 | if hasattr(e, "code"): 36 | print(e.code) 37 | if hasattr(e, "reason"): 38 | print(e.reason) -------------------------------------------------------------------------------- /examples/example-17.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-17.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-07 4 | # Description : No.17 爬虫的浏览器伪装技术实战 5 | 6 | ''' 7 | 由于urlopen()对于一些HTTP的高级功能不支持,所以,我们如果要修改报头,可以使用urllib.request.build_opener()进行,还有一种方法之前有写过,就是 8 | urllib.request.Request()下的add_header()方法来实现浏览器的模拟,这里主要学习前者build_opener的方式 9 | ''' 10 | 11 | # 17.1 浏览器伪装 12 | import urllib.request 13 | 14 | url = "http://blog.csdn.net" 15 | 16 | # 设置头部信息, 头部信息可以通过浏览器查看获得,这里主要以谷歌浏览器为例,按F12,查看Network,选中一个网络请求,复制User-Agent对应的值 17 | headers = ("User-Agen","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36") 18 | opener = urllib.request.build_opener() 19 | opener.addheaders = [headers] 20 | 21 | data = opener.open(url).read().decode("utf-8") 22 | print(data) -------------------------------------------------------------------------------- /examples/example-18.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-18.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-08 4 | # Description : No.18 CSDN博文爬虫 5 | 6 | 7 | import urllib.request 8 | import re 9 | 10 | url = "http://blog.csdn.net/" 11 | headers = ("User-Agen","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36") 12 | opener = urllib.request.build_opener() 13 | opener.addheaders = [headers] 14 | # 设定为全局 15 | urllib.request.install_opener(opener) 16 | data = opener.open(url).read().decode("utf-8") 17 | pat = '

90): 16 | print(a) 17 | if(b < 90): 18 | print(b) 19 | elif(a > 90 and a <= 190): 20 | print("a > 90 and a <= 190") 21 | elif(a < 90): 22 | print("a < 90") 23 | else: 24 | print("ok") 25 | 26 | ''' 27 | 28 | 29 | # 循环结构语句 - while 语句 30 | ''' 31 | i = 0 32 | while(i < 5): 33 | print("hello python") 34 | i += 1 35 | ''' 36 | 37 | 38 | 39 | # 循环结构语句 - for 语句, 遍历列表 40 | ''' 41 | a = ["a1", "a2", "a3", "a4"] 42 | for b in a: 43 | print(b) 44 | 45 | ''' 46 | 47 | # for 进行常规循环 48 | ''' 49 | for i in range(0, 5): 50 | print("hello world!") 51 | ''' 52 | 53 | 54 | # 2.2 中断结构 55 | # 指的是中途退出的一种结构, 常有break语句与continue语句 56 | 57 | # break 全部退出 58 | ''' 59 | a = ["a1", "a2", "a3", "a4"] 60 | for i in a: 61 | # 当i等于 a2 退出循环 62 | if(i == "a2"): 63 | break 64 | print(i) 65 | ''' 66 | 67 | # continue 中断一次循环,继续下一次循环 68 | ''' 69 | a = ["a1", "a2", "a3", "a4"] 70 | for i in a: 71 | # 当i等于 a2时, 中断这次循环进入下一次循环 72 | if (i == "a2"): 73 | continue 74 | print(i) 75 | ''' 76 | 77 | # 输出乘法口诀 78 | ''' 79 | 1*1=1 80 | 2*1=2 2*2=4 81 | 3*1=3 3*2=6 3*3=9 82 | 4*1=4 4*2=8 4*3=12 4*4=16 83 | 5*1=5 5*2=10 5*3=15 5*4=20 5*5=25 84 | 6*1=6 6*2=12 6*3=18 6*4=24 6*5=30 6*6=36 85 | 7*1=7 7*2=14 7*3=21 7*4=28 7*5=35 7*6=42 7*7=49 86 | 8*1=8 8*2=16 8*3=24 8*4=32 8*5=40 8*6=48 8*7=56 8*8=64 87 | 9*1=9 9*2=18 9*3=27 9*4=36 9*5=45 9*6=54 9*7=63 9*8=72 9*9=81 88 | ''' 89 | 90 | ''' 91 | for i in range(1, 10): 92 | for j in range(1, i+1): 93 | print(str(i) + "*" + str(j) + "=" + str(i*j), end=" ") 94 | 95 | print() 96 | ''' 97 | 98 | # 输出逆向乘法口诀表 99 | ''' 100 | 9*9=81 9*8=72 9*7=63 9*6=54 9*5=45 9*4=36 9*3=27 9*2=18 9*1=9 101 | 8*8=64 8*7=56 8*6=48 8*5=40 8*4=32 8*3=24 8*2=16 8*1=8 102 | 7*7=49 7*6=42 7*5=35 7*4=28 7*3=21 7*2=14 7*1=7 103 | 6*6=36 6*5=30 6*4=24 6*3=18 6*2=12 6*1=6 104 | 5*5=25 5*4=20 5*3=15 5*2=10 5*1=5 105 | 4*4=16 4*3=12 4*2=8 4*1=4 106 | 3*3=9 3*2=6 3*1=3 107 | 2*2=4 2*1=2 108 | 1*1=1 109 | ''' 110 | for i in range(9, 0, -1): 111 | for j in range(i, 0, -1): 112 | print(str(i) + "*" + str(j) + "=" + str(i*j), end=" ") 113 | 114 | print() -------------------------------------------------------------------------------- /examples/example-20.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-20.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-08 4 | # Description : No.20 用户代理池构建实战 5 | 6 | import urllib.request 7 | import re 8 | import random 9 | 10 | proxy = [ 11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 13 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11" 14 | ] 15 | 16 | # 随机切换用户代理池 17 | def ua(proxy): 18 | thisua = random.choice(proxy) 19 | print("当前使用的用户代理: " + thisua) 20 | headers = ("User-Agen", thisua) 21 | opener = urllib.request.build_opener() 22 | opener.addheaders = [headers] 23 | # 设定为全局 24 | urllib.request.install_opener(opener) 25 | 26 | 27 | for i in range(0, 5): 28 | ua(proxy) 29 | thisurl = "http://www.qiushibaike.com/8hr/page/" + str(i+1) + "/" 30 | data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore") 31 | pat = '
<div class="content">.*?<span>(.*?)</span>.*?</div>
' 32 | rst = re.compile(pat, re.S).findall(data) 33 | for i in range(0, len(rst)): 34 | print(rst[i]) 35 | print("-----------------------------") 36 | -------------------------------------------------------------------------------- /examples/example-21.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-21.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-08 4 | # Description : No.21 IP代理池构建实战 5 | 6 | import urllib.request 7 | import re 8 | import random 9 | 10 | ippools = [ 11 | "121.135.146.184:8080", 12 | "61.223.154.215:8998", 13 | "103.39.107.38:808" 14 | ] 15 | 16 | 17 | 18 | def ip(ippools): 19 | # 此处还可以通过接口获取代理,适合代理不稳定的情况 20 | thisip = random.choice(ippools) 21 | print("当前使用的代理: " + thisip) 22 | proxy = urllib.request.ProxyHandler({"http": thisip}) 23 | opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler) 24 | # 设定为全局 25 | urllib.request.install_opener(opener) 26 | 27 | for i in range(0, 3): 28 | ip(ippools) 29 | url = "http://www.baidu.com" 30 | data = urllib.request.urlopen(url).read().decode("utf-8", "ignore") 31 | print(data) 32 | 33 | 34 | -------------------------------------------------------------------------------- /examples/example-22.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-22.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-09 4 | # Description : No.22 淘宝商品图片爬虫实战 5 | 6 | import urllib.request 7 | import re 8 | import random 9 | 10 | proxy = [ 11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 13 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11" 14 | ] 15 | 16 | # 随机切换用户代理池 17 | def ua(proxy): 18 | thisua = random.choice(proxy) 19 | print("当前使用的用户代理: " + thisua) 20 | headers = ("User-Agen", thisua) 21 | opener = urllib.request.build_opener() 22 | opener.addheaders = [headers] 23 | # 设定为全局 24 | urllib.request.install_opener(opener) 25 | 26 | keyword = "iphone" 27 | keyword = urllib.request.quote(keyword) 28 | for i in range(1, 2): 29 | ua(proxy) 30 | 31 | thisurl = "https://s.taobao.com/search?q=" + keyword + "&s=" + str((i-1) * 44) 32 | data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore") 33 | pat = '"pic_url":"//(.*?)"' 34 | imglist = re.compile(pat).findall(data) 35 | #print(imglist) 36 | for j in range(0, len(imglist)): 37 | thisimg = imglist[j] 38 | thisimgurl = "http://" + thisimg 39 | localfile = "/Users/gaoyaqiu/Downloads/python-test/22/" + str(i) + str(j) + ".jpg" 40 | urllib.request.urlretrieve(thisimgurl, localfile) -------------------------------------------------------------------------------- /examples/example-23.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-23.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-13 4 | # Description : No.23 如何同时使用用户代理池和IP代理池 5 | 6 | import urllib.request 7 | import re 8 | import random 9 | def ua_ip(url): 10 | proxy = [ 11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 13 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) 
Chrome/17.0.963.56 Safari/535.11" 14 | ] 15 | 16 | # 网上找的代理 17 | ippools = [ 18 | "222.128.80.28:8081", 19 | "218.241.181.202:8080", 20 | "61.182.253.72:8081" 21 | ] 22 | 23 | # 随机切换用户代理及IP 24 | def ip(ippools, proxy): 25 | thisua = random.choice(proxy) 26 | print("当前使用的用户代理: " + thisua) 27 | thisip = random.choice(ippools) 28 | print("当前使用的IP: " + thisip) 29 | headers = ("User-Agen", thisua) 30 | proxy = urllib.request.ProxyHandler({"http": thisip}) 31 | opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler) 32 | opener.addheaders = [headers] 33 | # 设定为全局 34 | urllib.request.install_opener(opener) 35 | 36 | for i in range(0, 5): 37 | try: 38 | ip(ippools, proxy) 39 | 40 | thisurl = url 41 | data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore") 42 | if data: 43 | return data 44 | except Exception as e: 45 | print(e) 46 | 47 | url = "http://www.baidu.com" 48 | data = ua_ip(url) 49 | print(data) -------------------------------------------------------------------------------- /examples/example-24.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-24.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-04-06 4 | # Description : No.24 在Urllib中使用XPath表达式(需要安装lxml模块) 5 | 6 | import urllib.request 7 | from lxml import etree 8 | 9 | data = urllib.request.urlopen('http://www.baidu.com').read().decode('utf-8', 'ignore') 10 | tree_data = etree.HTML(data) 11 | title = tree_data.xpath('//title/text()') 12 | if(str(type(title) == "")): 13 | pass 14 | else: 15 | # 如果不是列表则转换成列表,防止出错 16 | title = [i for i in title] 17 | 18 | print(title[0]) 19 | -------------------------------------------------------------------------------- /examples/example-25.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-25.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-04-17 4 | # Description : No.25 BeautifulSoup基础实战 5 | ''' 6 | BeautifulSoup 是 Python 非常好用的一个库,可以用它来方便地解析网页内容,获取我们需要的数据,几乎是 Python 爬虫居家旅行必备的库, 7 | 和正则、xpath作用是一样的,主要作用是简化开发。 8 | 使用之前需要安装beautifulsoup库, 选择自己对应的python版本 9 | 下载地址: http://www.lfd.uci.edu/~gohlke/pythonlibs/ 10 | ''' 11 | 12 | import urllib.request 13 | from bs4 import BeautifulSoup as bs 14 | 15 | data = urllib.request.urlopen("https://www.douban.com/").read().decode("utf-8", "ignore") 16 | # 解析数据(常用的解析器有: html.parser、lxml、["lxml", "xml"]、html5lib), 这里我们用 html.parser 17 | bs_data = bs(data, "html.parser") 18 | #print(bs_data) 19 | 20 | # 格式化输出html (比较美观) 21 | #print(bs_data.prettify()) 22 | 23 | # 获取标签 24 | print(bs_data.title) 25 | 26 | # 获取标签中的内容 27 | print(bs_data.title.string) 28 | 29 | # 获取标签名 30 | print(bs_data.title.name) 31 | 32 | # 获取属性列表(默认获取的是第一个标签) 33 | print(bs_data.a.attrs) 34 | 35 | # 获取属性对应的值 (下面两种方式都可以) 36 | print(bs_data.a['class']) 37 | print(bs_data.a.get('class')) 38 | 39 | # 获取某个节点的所有内容(多个标签可以传入数组) 40 | #print(bs_data.find_all('a')) 41 | #print(bs_data.find_all(['a', 'ul'])) 42 | 43 | # 获取所有子节点 (contents返回的是list, children返回的是iterator) 44 | print(bs_data.ul.contents) 45 | #print(bs_data.ul.children) 46 | t = bs_data.ul.children 47 | #for i in t: 48 | # print(i) 49 | 50 | # iterator转为list 51 | t2 = [i for i in t] 52 | print(t2) 53 | 54 | 55 | # 更多使用帮助请参阅官方文档: http://beautifulsoup.readthedocs.io/zh_CN/latest/ 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/example-26.py: -------------------------------------------------------------------------------- 1 | # Script Name : 
example-26.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-04-18 4 | # Description : No.26 PhantomJS基础实战 5 | ''' 6 | PhantomJS 是一个脚本化的无界面 WebKit,以 JavaScript 为脚本语言实现各项功能, 7 | 官方列举的使用场景包括:无界面测试,页面自动化,屏幕截图和网络监控。 8 | 9 | 需要安装selenium、phantomjs 10 | 1. selenium 直接通过pip安装 11 | 2. mac下可以使用 brew 来安装 phantomjs 12 | ''' 13 | import time 14 | import re 15 | from selenium import webdriver 16 | 17 | browser = webdriver.PhantomJS() 18 | # 通过get方式来访问百度 19 | browser.get("http://www.baidu.com/") 20 | # 截屏保存 打开百度后的状态 21 | browser.get_screenshot_as_file("/Users/gaoyaqiu/Downloads/bak/test/baidu.jpg") 22 | 23 | # 模拟在百度中搜索 24 | # 先清除文本框 25 | input_xpath = '//*[@id="kw"]' 26 | browser.find_element_by_xpath(input_xpath).clear() 27 | browser.find_element_by_xpath(input_xpath).send_keys('python') 28 | # 截屏保存 在百度中输入 python后的状态 29 | browser.get_screenshot_as_file("/Users/gaoyaqiu/Downloads/bak/test/baidu2.jpg") 30 | 31 | # 32 | sub_xpath = '//*[@id="su"]' 33 | browser.find_element_by_xpath(sub_xpath).click() 34 | # 延迟2秒 35 | time.sleep(3) 36 | # 截屏保存 在百度中输入搜索python后的结果 37 | browser.get_screenshot_as_file("/Users/gaoyaqiu/Downloads/bak/test/baidu3.jpg") 38 | 39 | # 获取网页源码 40 | data = browser.page_source 41 | print(data) 42 | # 关闭浏览器 43 | browser.quit() 44 | 45 | # 获取title 46 | pat = '(.*?)' 47 | title = re.compile(pat).findall(data) 48 | print(title) 49 | -------------------------------------------------------------------------------- /examples/example-3.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-3.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-04 4 | # Description : No.3 python函数详解 5 | 6 | # 3.1 局部变量与全局变量 7 | # 变量是有生效范围的, 也叫作用域 8 | # 全局变量: 作用域从变量出现的地方开始,到这个变量作用范围结束的地方结束 9 | # 局部变量: 作用域只在局部的变量 10 | 11 | # 作用域 12 | # i是全局变量 13 | ''' 14 | i = 1000 15 | def func(): 16 | # j是局部变量 17 | j = 100 18 | print(j) 19 | print(i) 20 | func() 21 | ''' 22 | # 执行下面代码会报错,因为j是局部变量,不是全局变量 23 | #print(j) 24 | 25 | # 3.2 认识python函数 26 | # 函数的本质就是功能的封装。使用函数可以大大提高编程的效率与程序的可读性 27 | ''' 28 | 函数定义的格式: 29 | def 函数名(参数): 30 | 函数体 31 | ''' 32 | def abc(): 33 | print("abc") 34 | # 调用函数: 函数名(参数) 35 | #abc() 36 | 37 | # 参数: 分为形参和实参 38 | # 一般在函数定义的时候使用的参数是形参 39 | # 一般在函数调用的时候使用的参数是实参 40 | def func2(a, b): 41 | if(a > b): 42 | print("a 大于 b") 43 | elif( a == b): 44 | print("a 等于 b") 45 | else: 46 | print("a 小于 b") 47 | # a=1, b=10 48 | func2(1, 10) -------------------------------------------------------------------------------- /examples/example-4.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-4.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.4 python模块实战 5 | 6 | 7 | # 4.1 什么是模块? 8 | # 为了让python程序实现起来更方便,我们可以按需求类别将一些常见的功能(函数)组合在一起,形成模块。以后我们要实现这一类功能的时候,直接 9 | # 导入该模块即可。模块里面的函数叫做模块的方法 10 | 11 | # 4.2 python 模块的导入 12 | # 使用以下两种方式导入模块: 13 | # a. import 模块名 14 | # b. from ... import ... 15 | 16 | # 例子(import 模块名) 17 | import cgi 18 | # 执行cgi中得方法 19 | cgi.closelog() 20 | 21 | # 例子(from ... import ...) 22 | 23 | from cgi import closelog 24 | # 直接执行closelog方法 25 | closelog() 26 | 27 | # 4.3 第三方模块的安装 28 | # a. pip方式 29 | # b. whl下载安装的方式 30 | # c. 直接复制的方式 31 | # d. 
anaconda 32 | 33 | # 4.4 自定义python模块 34 | # 创建mymd.py文件,将以下代码拷贝进去 35 | def hello(): 36 | print("hello python!") 37 | 38 | # 将mymd.py文件放入python安装目录的Lib下(系统的内置的模块也在这) 39 | 40 | # 使用自定义模块 41 | # import mymd 42 | 43 | # 调用mymd下的hello方法 44 | # mymd.hello() 45 | # 或者用 from mymd import hello 46 | -------------------------------------------------------------------------------- /examples/example-5.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-5.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.5 python文件操作实战 5 | 6 | 7 | # 5.1 读取文件 8 | # 打开文件 9 | # open(文件名, 操作形式) 10 | ''' 11 | w: 写入 12 | r: 读取 13 | b: 二进制 14 | a: 追加 15 | ''' 16 | fh = open("/Users/gaoyaqiu/Downloads/t1", "r") 17 | # 文件读取可以用read(读取文件所有内容)和readline(按行读取)方法 18 | # data = fh.read() 19 | 20 | while True: 21 | line = fh.readline() 22 | print(line) 23 | if not line: 24 | break 25 | pass 26 | 27 | # 关闭文件 28 | fh.close() 29 | 30 | # 5.2 文件写入 31 | data = "hello world" 32 | fh2 = open("/Users/gaoyaqiu/Downloads/t2", "w") 33 | fh2.write(data) 34 | fh2.close() 35 | 36 | data2 = "gyq" 37 | fh2 = open("/Users/gaoyaqiu/Downloads/t2", "a+") 38 | fh2.write(data2) 39 | fh2.close() 40 | 41 | -------------------------------------------------------------------------------- /examples/example-6.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-6.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.6 python异常处理实战 5 | 6 | # 6.1 异常处理概述 7 | # python程序在执行的时候,经常会遇到异常,如果中间异常不处理,经常会导致程序崩溃。 8 | # 比如后面我们写爬虫的时候,如果不进行异常处理,很可能虫爬了一半,直接崩溃了。 9 | 10 | # 6.2 异常处理格式 11 | ''' 12 | try: 13 | 程序 14 | except Exception as 异常名称: 15 | 异常处理部分 16 | ''' 17 | ''' 18 | try: 19 | for i in range(0, 10): 20 | print(i) 21 | if(i == 4): 22 | print(nihao) 23 | except Exception as e: 24 | print(e) 25 | # 让异常后的程序继续执行 26 | 27 | for i in range(0, 10): 28 | try: 29 | print(i) 30 | if(i == 4): 31 | print(nihao) 32 | except Exception as e2: 33 | print(e2) 34 | print("ok") 35 | 36 | ''' 37 | 38 | for i in range(0, 10): 39 | try: 40 | print(i) 41 | if(i == 4): 42 | print(nihao) 43 | except Exception as e2: 44 | print(e2) 45 | -------------------------------------------------------------------------------- /examples/example-7.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-7.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.7 面向对象编程 5 | 6 | # 7.1 类和对象 7 | ''' 8 | 类: 具有某种特征的事物的集合(群体)。 9 | 对象: 群体(类)里面的个体。 10 | 类是抽象的,对象是具体的。 11 | 12 | 定义类格式: 13 | class 类名: 14 | 类里面的内容 15 | ''' 16 | class cl1: 17 | pass 18 | 19 | # 实例化 20 | a = cl1() 21 | 22 | # 构造函数(构造方法) 23 | # self 在类中的方法必须加上self参数 24 | # __init__(self, 参数) 25 | class c1: 26 | def __init__(self): 27 | print("hello world!") 28 | 29 | # 给构造方法加参数 30 | class c2: 31 | def __init__(self, name, job): 32 | print("my name is " + name + " my job is " + job) 33 | 34 | # 7.2 属性和方法 35 | # 属性: 类里面的变量 self, 属性名 36 | class c3: 37 | def __init__(self, name): 38 | self.myname = name 39 | 40 | # 方法: 类里面的函数 def 方法名(self, 参数) 41 | class c4: 42 | def show(self, name): 43 | print("hello " + name) 44 | 45 | class c5: 46 | def __init__(self, name): 47 | self.myname = name 48 | def show(self, name): 49 | print("hello " + self.name) 50 | 51 | 52 | # 继承(单继承、多继承) 53 | # 父亲-父类(基类) 54 | class father(): 55 | def speak(self): 56 | print("i can speak") 57 | 58 | # 单继承,子类继承父类 59 | class son(father): 60 | 
pass 61 | 62 | # 子类可以调用父类中得方法 63 | s = son() 64 | s.speak() 65 | 66 | # 母亲-父类 67 | class mother(): 68 | def write(self): 69 | print("i can write") 70 | # 多继承 71 | class daughter(father, mother): 72 | def listen(self): 73 | print("i can listen") 74 | # 重载 75 | class son2(father): 76 | def speak(self): 77 | print("i can speak to") 78 | 79 | d = daughter() 80 | d.speak() 81 | d.write() 82 | d.listen() 83 | 84 | s2 = son2() 85 | s2.speak('sssss') 86 | 87 | -------------------------------------------------------------------------------- /examples/example-8.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-8.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.8 正则表达式-原子 5 | 6 | 7 | # 8.1 原子 8 | ''' 9 | 原子是正则表达式中最基本的组成单位,每个正则表达式中至少要包含一个原子。 10 | 常见的原子类型有: 11 | a. 普通字符作为原子 12 | b. 非打印字符作为原子 13 | c. 通用字符作为原子 14 | d. 原子表 15 | ''' 16 | 17 | import re 18 | 19 | string = "gaoyaqiu" 20 | # 普通字符作为原子 21 | pat = "gao" 22 | 23 | rst = re.search(pat, string) 24 | print(rst) 25 | 26 | # 非打印字符作为原子 27 | # \n 换行符 \t 制表符 28 | 29 | string = '''gaoyaqiuubaidu''' 30 | 31 | pat = "\n" 32 | rst = re.search(pat, string) 33 | print(rst) 34 | 35 | # 通用字符作为原子 36 | ''' 37 | \w 匹配任意一个字母、数字、下划线 38 | \W 匹配除字母、数字、下划线之外的任意字符 39 | \d 十进制数 40 | \D 除十进制以外的任意字符 41 | \s 匹配空白字符 42 | \S 匹配除空白以外的任意字符 43 | ''' 44 | string = '''gaoyaqiu8 76522gaoyaqiubaidu''' 45 | 46 | pat = "\w\d\s\d\d" 47 | rst = re.search(pat, string) 48 | print(rst) 49 | 50 | # 原子表 从原子表中任意选一个原子来匹配 51 | string = '''gaoyaqiu876522jiaoyubaidu''' 52 | 53 | pat = "ga[o]ya" 54 | #pat = "tao[^a]un" 55 | rst = re.search(pat, string) 56 | print(rst) -------------------------------------------------------------------------------- /examples/example-9.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-9.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.9 正则表达式-元字符 5 | 6 | # 8.1 元字符 7 | ''' 8 | 所谓元字符, 就是正则表达式中具有一些特殊含义的字符, 比如重复N次前面的字符等 9 | ''' 10 | 11 | import re 12 | 13 | ''' 14 | . 除换行外任意一个字符 15 | ^ 开始位置 16 | $ 结束位置 17 | * 前面原子重复出现0次、1次或多次 18 | ? 前面原子重复出现0次、1次 19 | + 前面原子出现1次或多次 20 | {n} 前面原子恰好出现n次 21 | {n,} 前面原子至少n次 22 | {n, m} 最少出现n次,最多出现m次 23 | | 模式选择符 或 24 | () 模式单元 25 | ''' 26 | string = '''taogaoyaqiu876522gaobaidu''' 27 | 28 | pat = "tao..." 29 | pat = "^tao..." 
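# (补充示例, 非原文件内容, 仅作演示) 上面注释中列出的模式选择符 | 和模式单元 () 可以这样组合使用:
# () 把 gao|bai 当作一个整体, 因此既能匹配 taogaoyaqiu 也能匹配 taobaiyaqiu;
# 这个 pat 和原文件中前后几行一样会被后面的赋值覆盖, 真正参与 re.search 的仍是最后一次赋值
pat = "tao(gao|bai)yaqiu"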
30 | pat = "ba...$" 31 | pat = "tao.*" 32 | pat = "taogao+" 33 | pat = "gao{3,5}" 34 | rst = re.search(pat, string) 35 | print(rst) 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /jdgoods/jdgoods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/jdgoods/jdgoods/__init__.py -------------------------------------------------------------------------------- /jdgoods/jdgoods/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JdgoodsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # 频道1, 2 15 | sub1 = scrapy.Field() 16 | 17 | sub2 = scrapy.Field() 18 | # 图书名 19 | name = scrapy.Field() 20 | # 价格 21 | price = scrapy.Field() 22 | # 店家 23 | seller = scrapy.Field() 24 | -------------------------------------------------------------------------------- /jdgoods/jdgoods/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JdgoodsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /jdgoods/jdgoods/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | 9 | class JdgoodsPipeline(object): 10 | 11 | def __init__(self): 12 | self.conn = pymysql.connect(host = "127.0.0.1", user = "root", password = "ok", db = "jdgoods") 13 | 14 | def process_item(self, item, spider): 15 | 16 | try: 17 | name = item["name"] 18 | price = item["price"] 19 | seller = item["seller"] 20 | sub1 = item["sub1"] 21 | sub2 = item["sub2"] 22 | 23 | sql = "insert into goods (name, sub1, sub2, seller, price) values('" + name + "', '" + sub1 + "', '" + sub2 + "', '" + seller + "', '" + price + "')" 24 | self.conn.query(sql) 25 | 26 | return item 27 | except Exception as e: 28 | pass 29 | 30 | def close_spider(self): 31 | self.conn.close() 32 | 33 | 34 | -------------------------------------------------------------------------------- /jdgoods/jdgoods/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jdgoods project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jdgoods' 13 | 14 | SPIDER_MODULES = ['jdgoods.spiders'] 15 | NEWSPIDER_MODULE = 'jdgoods.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'jdgoods.middlewares.JdgoodsSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'jdgoods.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'jdgoods.pipelines.JdgoodsPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /jdgoods/jdgoods/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /jdgoods/jdgoods/spiders/good.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Scrapy与Urllib的整合使用(爬取京东图书商品) 3 | import scrapy 4 | import urllib.request 5 | import re 6 | import random 7 | from jdgoods.items import JdgoodsItem 8 | from scrapy.http import Request 9 | 10 | 11 | class GoodSpider(scrapy.Spider): 12 | name = "good" 13 | allowed_domains = ["jd.com"] 14 | #start_urls = ['http://jd.com/'] 15 | 16 | def start_requests(self): 17 | proxy = [ 18 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50" 19 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)" 20 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)" 21 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)" 22 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 23 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 24 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11" 25 | ] 26 | 27 | req1 = urllib.request.Request("http://book.jd.com/") 28 | # 浏览器伪装 29 | req1.add_header("User-Agent", random.choice(proxy)) 30 | p_data = urllib.request.urlopen(req1).read().decode("utf-8", "ignore") 31 | # 匹配渠道的正则 32 | pat1 = '

0): 61 | pass 62 | else: 63 | # 只有1页 64 | page = [1] 65 | all_page.append({k: page[0]}) 66 | if(n > 1): 67 | break 68 | n += 1 69 | n = 0 70 | for p1 in cat_all_data: 71 | this_page = all_page[n][p1] 72 | for p2 in range(1, int(this_page) + 1): 73 | this_page_url = "https://list.jd.com/list.html?cat=" + str(p1) + "&page=" + str(p2) 74 | #print(this_page_url) 75 | yield Request(this_page_url, callback = self.parse) 76 | n += 1 77 | def parse(self, response): 78 | item = JdgoodsItem() 79 | list_data = response.body.decode("utf-8", "ignore") 80 | # 频道1,2 81 | pd = response.xpath("//span[@class='curr']/text()").extract() 82 | if(len(pd) == 0): 83 | pd = ["缺省", "缺省"] 84 | if(len(pd) == 1): 85 | pda = pd[0] 86 | pd = [pda, "缺省"] 87 | pd1 = pd[0] 88 | pd2 = pd[1] 89 | #print(pd1) 90 | # 图书名 (从下标3的地方开始获取) 91 | book_name = response.xpath("//div[@class='p-name']/a/em/text()").extract() 92 | #print(book_name) 93 | # 价格 94 | all_skupat = '