├── .gitignore ├── README.md ├── baidunews ├── baidunews │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── news.py ├── main.py └── scrapy.cfg ├── dangdang ├── dangdang │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── dd.py ├── main.py └── scrapy.cfg ├── douban ├── douban │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── dou.py ├── main.py ├── scrapy.cfg └── ydm │ ├── YDMHTTP.py │ ├── YDMPython3.py │ ├── __init__.py │ ├── yundamaAPI-x64.dll │ └── yundamaAPI.dll ├── examples ├── __init__.py ├── example-1.py ├── example-10.py ├── example-11.py ├── example-12.py ├── example-13.py ├── example-14.py ├── example-15.py ├── example-16.py ├── example-17.py ├── example-18.py ├── example-19.py ├── example-2.py ├── example-20.py ├── example-21.py ├── example-22.py ├── example-23.py ├── example-24.py ├── example-25.py ├── example-26.py ├── example-3.py ├── example-4.py ├── example-5.py ├── example-6.py ├── example-7.py ├── example-8.py └── example-9.py └── jdgoods ├── jdgoods ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── good.py ├── main.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.pyc 3 | *.log 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 概览 2 | 3 | * 零基础学习python及爬虫, python版本为3.5 4 | * 代码中为了便于调试都有print输出部分,如果需要调试的可以帮注释去掉 5 | 6 | # 目录 7 | 8 | ### examples 9 | 10 | 本目录中主要是python基础和爬虫需要用到的常用扩展库的使用 11 | 12 | 1. [example-1.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-1.py) python语法基础 13 | 2. [example-2.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-2.py) python控制流与小实例 14 | 3. [example-3.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-3.py) python函数详解 15 | 4. [example-4.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-4.py) python模块实战 16 | 5. [example-5.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-5.py) python文件操作实战 17 | 6. [example-6.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-6.py) python异常处理实战 18 | 7. [example-7.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-7.py) 面向对象编程 19 | 8. [example-8.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-8.py) 正则表达式-原子 20 | 9. [example-9.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-9.py) 正则表达式-元字符 21 | 10. [example-10.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-10.py) 正则表达式-模式修正符 22 | 11. [example-11.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-11.py) 正则表达式-贪婪模式和懒惰模式 23 | 12. [example-12.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-12.py) 简单爬虫的编写(urllib学习) 24 | 13. [example-13.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-13.py) 超时设置 25 | 14. [example-14.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-14.py) 自动模拟HTTP请求与百度信息自动搜索爬虫实战 26 | 15. 
[example-15.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-15.py) 自动模拟HTTP请求之自动POST实战 27 | 16. [example-16.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-16.py) 爬虫的异常处理实战 28 | 17. [example-17.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-17.py) 爬虫的浏览器伪装技术实战 29 | 18. [example-18.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-18.py) CSDN博文爬虫实战 30 | 19. [example-19.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-19.py) 糗事百科段子爬虫实战 31 | 20. [example-20.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-20.py) 用户代理池构建实战 32 | 21. [example-21.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-21.py) IP代理池构建实战 33 | 22. [example-22.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-22.py) 淘宝商品图片爬虫实战 34 | 23. [example-23.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-23.py) 如何同时使用用户代理池和IP代理池 35 | 24. [example-24.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-24.py) 在Urllib中使用XPath表达式 36 | 25. [example-25.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-25.py) BeautifulSoup基础实战 37 | 26. [example-26.py](https://github.com/gaoyaqiu/python-spider/blob/master/examples/example-26.py) PhantomJS基础实战 38 | 39 | ### dangdang 40 | scrapy实现当当网商品爬虫实战 41 | ### baidunews 42 | scrapy百度新闻爬虫实战 43 | ### douban 44 | scrapy豆瓣网登陆爬虫与验证码自动识别实战 45 | ### jdgoods 46 | scrapy与urllib的整合使用(爬取京东图书商品) 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /baidunews/baidunews/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/baidunews/baidunews/__init__.py -------------------------------------------------------------------------------- /baidunews/baidunews/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaidunewsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | link = scrapy.Field() 14 | title = scrapy.Field() 15 | content = scrapy.Field() 16 | print(link) 17 | -------------------------------------------------------------------------------- /baidunews/baidunews/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaidunewsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /baidunews/baidunews/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BaidunewsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /baidunews/baidunews/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidunews project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidunews' 13 | 14 | SPIDER_MODULES = ['baidunews.spiders'] 15 | NEWSPIDER_MODULE = 'baidunews.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'baidunews.middlewares.BaidunewsSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'baidunews.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'baidunews.pipelines.BaidunewsPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- 
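Note: in the baidunews settings above, ITEM_PIPELINES is left commented out, so the BaidunewsPipeline declared in pipelines.py is never invoked. If scraped news items should flow through that pipeline (for printing, cleaning or storage), the block would need to be enabled; a minimal sketch, using only the class name already defined in this project:

# baidunews/settings.py (sketch): route items through the pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'baidunews.pipelines.BaidunewsPipeline': 300,
}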
/baidunews/baidunews/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /baidunews/baidunews/spiders/news.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from baidunews.items import BaidunewsItem 5 | import re 6 | from scrapy.http import Request 7 | 8 | class NewsSpider(scrapy.Spider): 9 | name = "news" 10 | #allowed_domains = ["baidu.com"] 11 | start_urls = ['http://baidu.com/'] 12 | # 网站分类 13 | allid = ['LocalHouseNews', 'LocalNews'] 14 | # 构造请求地址 15 | allurl = [] 16 | for i in range(0, len(allid)): 17 | thisurl = "http://news.baidu.com/widget?id=" + allid[i] + "&ajax=json" 18 | allurl.append(thisurl) 19 | def parse(self, response): 20 | for j in range(0, len(self.allurl)): 21 | print("正在爬取第" + str(j) + "个栏目") 22 | yield Request(self.allurl[j], callback = self.getData1) 23 | # 处理爬取到的数据 24 | def getData1(self, response): 25 | data = response.body.decode('utf-8', 'ignore') 26 | pat1 = '"m_relate_url":"(.*?)"' 27 | pat2 = '"url":"(.*?)"' 28 | # 提取json串中的文章url地址 29 | url1 = re.compile(pat1, re.S).findall(data) 30 | url2 = re.compile(pat2, re.S).findall(data) 31 | if(len(url1) != 0): 32 | url = url1 33 | else: 34 | url = url2 35 | for k in range(0, len(url)): 36 | articleurl = url[k] 37 | # 处理url中转义符号\/\/ 38 | articleurl = re.sub('\\\/', '/', articleurl) 39 | yield Request(articleurl, callback = self.getData2) 40 | def getData2(self, response): 41 | item = BaidunewsItem() 42 | item['link'] = response.url 43 | item['title'] = response.xpath("/html/head/title/text()").extract() 44 | item['content'] = response.body 45 | yield item -------------------------------------------------------------------------------- /baidunews/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl news".split()) -------------------------------------------------------------------------------- /baidunews/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidunews.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidunews 12 | -------------------------------------------------------------------------------- /dangdang/dangdang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/dangdang/dangdang/__init__.py -------------------------------------------------------------------------------- /dangdang/dangdang/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class DangdangItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | title 
= scrapy.Field() 14 | link = scrapy.Field() 15 | comment = scrapy.Field() 16 | -------------------------------------------------------------------------------- /dangdang/dangdang/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DangdangSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /dangdang/dangdang/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymysql 9 | 10 | class DangdangPipeline(object): 11 | def process_item(self, item, spider): 12 | conn = pymysql.connect(host="127.0.0.1", user="root", passwd="ok", db="dangdang", charset="utf8") 13 | for i in range(0, len(item["title"])): 14 | title = item["title"][i] 15 | link = item["link"][i] 16 | comment = item["comment"][i] 17 | sql = "insert into goods(title, link, comment) values('" + title + "','" + link + "','" + comment + "')" 18 | try: 19 | conn.query(sql) 20 | except Exception as e: 21 | print(e) 22 | conn.commit() 23 | conn.close() 24 | return item 25 | -------------------------------------------------------------------------------- /dangdang/dangdang/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dangdang project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dangdang' 13 | 14 | SPIDER_MODULES = ['dangdang.spiders'] 15 | NEWSPIDER_MODULE = 'dangdang.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'dangdang.middlewares.DangdangSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'dangdang.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'dangdang.pipelines.DangdangPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- 
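The DangdangPipeline above assembles its INSERT statement by string concatenation, which breaks as soon as a title or comment contains a quote character and leaves the query open to SQL injection. A minimal sketch of the same insert using pymysql's parameterized cursor interface (reusing the goods table, the title/link/comment fields and the connection parameters from the pipeline above) could look like this:

# dangdang/pipelines.py (sketch): parameterized insert with pymysql
import pymysql

class DangdangPipeline(object):
    def process_item(self, item, spider):
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="ok",
                               db="dangdang", charset="utf8")
        try:
            with conn.cursor() as cursor:
                sql = "insert into goods(title, link, comment) values(%s, %s, %s)"
                for title, link, comment in zip(item["title"], item["link"], item["comment"]):
                    # pymysql escapes each value, so quotes in titles no longer break the statement
                    cursor.execute(sql, (title, link, comment))
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            conn.close()
        return item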
/dangdang/dangdang/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dangdang/dangdang/spiders/dd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Script Name : dd.py 3 | # Author : gaoyaqiu(球哥) 4 | # Created : 2017-03-19 5 | # Description : Scrapy实现当当网商品爬虫实战 6 | 7 | 8 | import scrapy 9 | from dangdang.items import DangdangItem 10 | from scrapy.http import Request 11 | 12 | class DdSpider(scrapy.Spider): 13 | name = "dd" 14 | allowed_domains = ["dangdang.com"] 15 | start_urls = ['http://category.dangdang.com/pg1-cid4008154.html'] 16 | 17 | def parse(self, response): 18 | for i in range(1, 3): 19 | url = "http://category.dangdang.com/pg" + str(i) + "-cid4008154.html" 20 | yield Request(url, callback = self.handle_items) 21 | def handle_items(self, response): 22 | item = DangdangItem() 23 | item["title"] = response.xpath("//a[@name='sort-big-pic']/@title").extract() 24 | item["link"] = response.xpath("//a[@name='sort-big-pic']/@href").extract() 25 | item["comment"] = response.xpath("//a[@name='sort-evaluate']/text()").extract() 26 | yield item 27 | -------------------------------------------------------------------------------- /dangdang/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl dd".split()) -------------------------------------------------------------------------------- /dangdang/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dangdang.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dangdang 12 | -------------------------------------------------------------------------------- /douban/douban/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/douban/__init__.py -------------------------------------------------------------------------------- /douban/douban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /douban/douban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /douban/douban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /douban/douban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for douban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'douban' 13 | 14 | SPIDER_MODULES = ['douban.spiders'] 15 | NEWSPIDER_MODULE = 'douban.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'douban.middlewares.DoubanSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'douban.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'douban.pipelines.DoubanPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /douban/douban/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /douban/douban/spiders/dou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.http import Request, FormRequest 4 | import urllib.request 5 | import os 6 | 7 | 8 | class DouSpider(scrapy.Spider): 9 | name = "dou" 10 | allowed_domains = ["douban.com"] 11 | # start_urls = ['http://douban.com/'] 12 | 13 | header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} 14 | 15 | def start_requests(self): 16 | # 请求登陆页 17 | return [Request('https://accounts.douban.com/login', meta = {'cookiejar': 1}, callback = self.parse)] 18 | def parse(self, response): 19 | # 判断是否存在验证码 20 | captcha_image = response.xpath("//img[@id='captcha_image']/@src").extract() 21 | print(captcha_image) 22 | if len(captcha_image) > 0: 23 | print('有验证码, 等待识别...') 24 | # 将验证码下载到本地 25 | local_path = '/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 26 | urllib.request.urlretrieve(captcha_image[0], filename = local_path) 27 | # 方法1: 通过半自动人工处理 28 | #captcha_value = input('请输入/Users/gaoyaqiu/Downloads/python-test/test/中captcha.png的验证码内容! ') 29 | # 方法2: 通过接口实现全自动处理-1 (这里使用的是云打码的api,因他们接口不提供mac版本,所以这里例子只能在windows中使用) 30 | # 使用注意: 需要将YDMPython3.py 中的账号信息替换成自己的账号 31 | ''' 32 | cmd = 'python3.5 /Users/gaoyaqiu/git/python-spider/douban/ydm/YDMPython3.py' 33 | r = os.popen(cmd) 34 | captcha_value = r.read() 35 | ''' 36 | 37 | # 方法3: 通过接口实现全自动处理-2 (这里使用的是云打码的http api python2.7) 38 | cmd = 'python2.7 /Users/gaoyaqiu/git/python-spider/douban/ydm/YDMHTTP.py' 39 | r = os.popen(cmd) 40 | read_result = r.read().split() 41 | cid = read_result[0] 42 | if int(cid) > 0: 43 | captcha_value = str(read_result[1]) 44 | print('当前验证码识别结果为: cid: %s, result: %s' % (cid, captcha_value)) 45 | params = { 46 | 'captcha-solution': captcha_value, 47 | 'redir': 'https://www.douban.com/people/156127818/', # 登陆成功之后的重定向地址,可是自己主页地址 48 | 'form_email': '账号', 49 | 'form_password': '密码' 50 | } 51 | else: 52 | print('识别验证码出错cid: ' + cid) 53 | else: 54 | params = { 55 | 'redir': 'https://www.douban.com/people/156127818/', 56 | 'form_email': '账号', 57 | 'form_password': '密码' 58 | } 59 | 60 | print('登陆中。。。') 61 | # 开始登陆 62 | return [FormRequest.from_response(response, 63 | # 设置cookie 64 | meta = {'cookiejar': response.meta['cookiejar']}, 65 | # 设置header信息 66 | headers = self.header, 67 | # 设置post表单提交数据 68 | formdata = params, 69 | callback = self.next 70 | )] 71 | def next(self, response): 72 | # 登陆成功之后查看title信息,确认是跳转成功 73 | title = response.xpath("/html/head/title/text()").extract() 74 | print(title) 75 | 76 | -------------------------------------------------------------------------------- /douban/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl dou".split()) 3 | #cmdline.execute("scrapy crawl dou --nolog".split()) -------------------------------------------------------------------------------- /douban/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | 
# 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = douban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douban 12 | -------------------------------------------------------------------------------- /douban/ydm/YDMHTTP.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import httplib, mimetypes, urlparse, json, time 3 | 4 | 5 | class YDMHttp: 6 | apiurl = 'http://api.yundama.com/api.php' 7 | 8 | username = '' 9 | password = '' 10 | appid = '' 11 | appkey = '' 12 | 13 | def __init__(self, username, password, appid, appkey): 14 | self.username = username 15 | self.password = password 16 | self.appid = str(appid) 17 | self.appkey = appkey 18 | 19 | def request(self, fields, files=[]): 20 | try: 21 | response = post_url(self.apiurl, fields, files) 22 | response = json.loads(response) 23 | except Exception as e: 24 | response = None 25 | return response 26 | 27 | def balance(self): 28 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 29 | 'appkey': self.appkey} 30 | response = self.request(data) 31 | if (response): 32 | if (response['ret'] and response['ret'] < 0): 33 | return response['ret'] 34 | else: 35 | return response['balance'] 36 | else: 37 | return -9001 38 | 39 | def login(self): 40 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 41 | 'appkey': self.appkey} 42 | response = self.request(data) 43 | if (response): 44 | if (response['ret'] and response['ret'] < 0): 45 | return response['ret'] 46 | else: 47 | return response['uid'] 48 | else: 49 | return -9001 50 | 51 | def upload(self, filename, codetype, timeout): 52 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 53 | 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 54 | file = {'file': filename} 55 | response = self.request(data, file) 56 | if (response): 57 | if (response['ret'] and response['ret'] < 0): 58 | return response['ret'] 59 | else: 60 | return response['cid'] 61 | else: 62 | return -9001 63 | 64 | def result(self, cid): 65 | data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 66 | 'appkey': self.appkey, 'cid': str(cid)} 67 | response = self.request(data) 68 | return response and response['text'] or '' 69 | 70 | def decode(self, filename, codetype, timeout): 71 | cid = self.upload(filename, codetype, timeout) 72 | if (cid > 0): 73 | for i in range(0, timeout): 74 | result = self.result(cid) 75 | if (result != ''): 76 | return cid, result 77 | else: 78 | time.sleep(1) 79 | return -3003, '' 80 | else: 81 | return cid, '' 82 | 83 | 84 | ###################################################################### 85 | 86 | def post_url(url, fields, files=[]): 87 | urlparts = urlparse.urlsplit(url) 88 | return post_multipart(urlparts[1], urlparts[2], fields, files) 89 | 90 | 91 | def post_multipart(host, selector, fields, files): 92 | content_type, body = encode_multipart_formdata(fields, files) 93 | h = httplib.HTTP(host) 94 | h.putrequest('POST', selector) 95 | h.putheader('Host', host) 96 | h.putheader('Content-Type', content_type) 97 | h.putheader('Content-Length', str(len(body))) 98 | h.endheaders() 99 | h.send(body) 100 | errcode, errmsg, headers = h.getreply() 101 | return h.file.read() 102 | 
103 | 104 | def encode_multipart_formdata(fields, files=[]): 105 | BOUNDARY = 'WebKitFormBoundaryJKrptX8yPbuAJLBQ' 106 | CRLF = '\r\n' 107 | L = [] 108 | for field in fields: 109 | key = field 110 | value = fields[key] 111 | L.append('--' + BOUNDARY) 112 | L.append('Content-Disposition: form-data; name="%s"' % key) 113 | L.append('') 114 | L.append(value) 115 | for field in files: 116 | key = field 117 | filepath = files[key] 118 | L.append('--' + BOUNDARY) 119 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filepath)) 120 | L.append('Content-Type: %s' % get_content_type(filepath)) 121 | L.append('') 122 | L.append(open(filepath, 'rb').read()) 123 | L.append('--' + BOUNDARY + '--') 124 | L.append('') 125 | body = CRLF.join(L) 126 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 127 | return content_type, body 128 | 129 | 130 | def get_content_type(filename): 131 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 132 | 133 | 134 | ###################################################################### 135 | 136 | # 用户名 137 | username = 'username' 138 | 139 | # 密码 140 | password = 'password' 141 | 142 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 143 | appid = 1 144 | 145 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 146 | appkey = '3495daa179e863b03a0483ad99e6cadd' 147 | 148 | # 图片文件 149 | filename = '/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 150 | 151 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 152 | codetype = 3000 153 | 154 | # 超时时间,秒 155 | timeout = 60 156 | 157 | # 检查 158 | if (username == ''): 159 | print('请设置好相关参数再测试') 160 | else: 161 | # 初始化 162 | yundama = YDMHttp(username, password, appid, appkey) 163 | 164 | # 登陆云打码 165 | # uid = yundama.login(); 166 | #print('uid: %s' % uid) 167 | 168 | # 查询余额 169 | # balance = yundama.balance(); 170 | # print('balance: %s' % balance) 171 | 172 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 173 | cid, result = yundama.decode(filename, codetype, timeout); 174 | #print('cid: %s, result: %s' % (cid, result)) 175 | print cid, result 176 | 177 | ###################################################################### 178 | -------------------------------------------------------------------------------- /douban/ydm/YDMPython3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | from ctypes import * 6 | 7 | # 下载接口放目录 http://www.yundama.com/apidoc/YDM_SDK.html 8 | # 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html 9 | # 所有函数请查询 http://www.yundama.com/apidoc 10 | 11 | print('>>>正在初始化...') 12 | 13 | YDMApi = windll.LoadLibrary('/Users/gaoyaqiu/git/python-spider/douban/ydm/yundamaAPI-x64.dll') 14 | 15 | # 1. http://www.yundama.com/index/reg/developer 注册开发者账号 16 | # 2. http://www.yundama.com/developer/myapp 添加新软件 17 | # 3. 使用添加的软件ID和密钥进行开发,享受丰厚分成 18 | 19 | appId = 1 # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 20 | appKey = b'1' # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 
21 | 22 | print('软件ID:%d\r\n软件密钥:%s' % (appId, appKey)) 23 | 24 | # 注意这里是普通会员账号,不是开发者账号,注册地址 http://www.yundama.com/index/reg/user 25 | # 开发者可以联系客服领取免费调试题分 26 | 27 | username = b'1' 28 | password = b'1' 29 | 30 | if username == b'test': 31 | exit('\r\n>>>请先设置用户名密码') 32 | 33 | ####################### 一键识别函数 YDM_EasyDecodeByPath ####################### 34 | 35 | print('\r\n>>>正在一键识别...') 36 | 37 | # 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 38 | codetype = 3000 39 | 40 | # 分配30个字节存放识别结果 41 | result = c_char_p(b" ") 42 | 43 | # 识别超时时间 单位:秒 44 | timeout = 60 45 | 46 | # 验证码文件路径 47 | filename = b'/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 48 | 49 | # 一键识别函数,无需调用 YDM_SetAppInfo 和 YDM_Login,适合脚本调用 50 | captchaId = YDMApi.YDM_EasyDecodeByPath(username, password, appId, appKey, filename, codetype, timeout, result) 51 | 52 | print("一键识别:验证码ID:%d,识别结果:%s" % (captchaId, result.value)) 53 | 54 | ################################################################################ 55 | 56 | 57 | ########################## 普通识别函数 YDM_DecodeByPath ######################### 58 | 59 | print('\r\n>>>正在登陆...') 60 | 61 | # 第一步:初始化云打码,只需调用一次即可 62 | YDMApi.YDM_SetAppInfo(appId, appKey) 63 | 64 | # 第二步:登陆云打码账号,只需调用一次即可 65 | uid = YDMApi.YDM_Login(username, password) 66 | 67 | if uid > 0: 68 | 69 | print('>>>正在获取余额...') 70 | 71 | # 查询账号余额,按需要调用 72 | balance = YDMApi.YDM_GetBalance(username, password) 73 | 74 | print('登陆成功,用户名:%s,剩余题分:%d' % (username, balance)) 75 | 76 | print('\r\n>>>正在普通识别...') 77 | 78 | # 第三步:开始识别 79 | 80 | # 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 81 | codetype = 3000 82 | 83 | # 分配30个字节存放识别结果 84 | result = c_char_p(b" ") 85 | 86 | # 验证码文件路径 87 | filename = b'/Users/gaoyaqiu/Downloads/python-test/test/captcha.png' 88 | 89 | # 普通识别函数,需先调用 YDM_SetAppInfo 和 YDM_Login 初始化 90 | captchaId = YDMApi.YDM_DecodeByPath(filename, codetype, result) 91 | 92 | print("普通识别:验证码ID:%d,识别结果:%s" % (captchaId, result.value)) 93 | 94 | else: 95 | print('登陆失败,错误代码:%d' % uid) 96 | 97 | ################################################################################ 98 | 99 | print('\r\n>>>错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html') 100 | 101 | input('\r\n测试完成,按回车键结束...') 102 | -------------------------------------------------------------------------------- /douban/ydm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/ydm/__init__.py -------------------------------------------------------------------------------- /douban/ydm/yundamaAPI-x64.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/ydm/yundamaAPI-x64.dll -------------------------------------------------------------------------------- /douban/ydm/yundamaAPI.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/douban/ydm/yundamaAPI.dll -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gaoyaqiu/python-spider/86e2c7df9694a45261d4799254f34762074b9997/examples/__init__.py 
-------------------------------------------------------------------------------- /examples/example-1.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-1.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-04 4 | # Description : No.1 python语法基础 5 | 6 | # 在控制台打印hello world,使用print方法 7 | # 学编程的一般刚认识的就是hello world, ^_^ 8 | print("hello world!") 9 | 10 | # 1.1 常见的两种注释用法(当我们想让某些区域的代码不起作用时就会用到注释) 11 | ''' 12 | 1、#号表示: 单行注释 13 | 2、三引号表示: 多行注释 14 | ''' 15 | 16 | # 单行注释 17 | #print("hello world!") 18 | 19 | # 多行注释 20 | ''' 21 | print("hello world!") 22 | ''' 23 | 24 | # 1.2 标识符(标识某个东西的符号) 25 | ''' 26 | python 中标识符命名规则为: 27 | 第一个字符为字母或下划线 28 | 除第一个字符以外的其他字符可以是字母、下划线或数字 29 | ''' 30 | # 标识符 31 | # nihao、_nihao、_nihao_nihao 32 | 33 | # 1.3 变量(简单来说可以变化的量叫做变量) 34 | # 变量 35 | nihao = 1 36 | _nihao = 2 37 | _nihao_nihao = 3 38 | 39 | # 左面的nihao、_nihao、_nihao_nihao 就是变量, 右面的 1、2、3就是变量对应的值 40 | 41 | # 1.4 数据类型(世界上有很多数据类型,为了更方便处理这些数据,我们给这些数据进行分类,进而形成了数据类型) 42 | # python 中常见的数据类型有 43 | # 数、字符串、列表(list)、元组(tuple)、集合(set)、字典(dictinoary) 44 | 45 | # 数 46 | # abc 这个变量的数据类型就是数 47 | abc = 8 48 | 49 | # 字符串 50 | # 单引号、双引号、三引号都是字符串 51 | a1 = "abc" 52 | a2 = 'abc' 53 | a3 = '''abc''' 54 | 55 | # 列表 (存储多个元素, 列表中的元素可以重新赋值, 最外围用[]中括号表示) 56 | # 列表可以直接通过下标取值,下标是从0开始编号,0就表示第一个值,不管是数组、还是列表、元组都会从0开始编号 57 | # 取第1个值的方式为list[0]、第2个为b[1]、依次类推。。。 58 | # 还可以修改列表中某个元素的值,如想修改8的值为100,那么可以使用 b[1] = 100 59 | b = [7, 8, "abc", 9] 60 | #print(b[0]) 61 | b[1] = 100 62 | #print(b[1]) 63 | 64 | # 元组 (存储多个元素, 元组中的元素不可以重新赋值, 最外围用()小括号表示) 65 | c = (7, 8, "abc", 9) 66 | #print(c) 67 | 68 | # 字典 {键: 值, 键: 值, ...}, 69 | # 取值格式: 字典名["对应的键名"] 70 | d = {"name": "gaoyaqiu", "sex": "boy", "job": "程序猿"} 71 | #print(d["job"]) 72 | 73 | # 集合 (用的不多,最好的用处就是去重,) 74 | # 去重 (比如下面的例子中a比较多,使用集合之后,重复的a都会过滤掉只保留一个) 75 | e1 = set("abcsdfsfaaaaa") 76 | #print(e1) 77 | e2 = set("abcdfg") 78 | #print(e2) 79 | 80 | # 交集 81 | r1 = e1 & e2 82 | #print(r1) 83 | 84 | # 并集 85 | r2 = e1 | e2 86 | #print(r2) 87 | 88 | # 差集 89 | r3 = e1 - e2 90 | #print(r3) 91 | 92 | # 对称差集 93 | r4 = e1 ^ e2 94 | #print(r4) 95 | 96 | 97 | # 1.5 运算符 (常见的有+、-、*、/、%号等) 98 | # 对于运算符优先级的使用,常见的技巧 99 | # 1. 基本的数学运算符的优先级规律在python中基本上适用 100 | # 2. 
实在不清楚优先级规律, 为需要先执行的运算加上括号 101 | 102 | # +、-、*、/、% 运算 103 | h1 = 1 + 8 * 2 - 1 104 | #print(h1) 105 | 106 | h2 = (1 + 8) * 2 - 1 107 | #print(h2) 108 | 109 | h3 = 21 % 2 110 | #print(h3) 111 | 112 | # + 号也可以做为字符串连接使用 113 | o1 = "hello " 114 | o2 = o1 + "python" 115 | #print(o2) 116 | 117 | # 1.6 缩进 118 | # python 是一门强制缩进的语言,有些朋友可能比较方案这一点,但是,这一点的存在,让 119 | # python代码变得更加美观。python中缩进规律: 120 | # 同一层的代码,处于同一个缩进幅度上,下一层代码,需要相对于上一个层次的代码进行缩进 121 | # 建议使用tab键进行缩进 122 | 123 | b = "10" 124 | if(b == "10"): 125 | print("abc") 126 | 127 | 128 | -------------------------------------------------------------------------------- /examples/example-10.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-10.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.10 正则表达式-模式修正符 5 | 6 | # 模式修正符 7 | ''' 8 | I 匹配时忽略大小写 * 9 | M 多行匹配 * 10 | L 本地化识别匹配 11 | U unicode 12 | S 让.匹配包括换行符 * 13 | ''' 14 | import re 15 | 16 | string = 'Python' 17 | 18 | pat = "pyt" 19 | rst = re.search(pat, string, re.I) 20 | print(rst) 21 | 22 | -------------------------------------------------------------------------------- /examples/example-11.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-11.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.11 正则表达式-贪婪模式和懒惰模式 5 | 6 | import re 7 | 8 | # 贪婪模式和懒惰模式 9 | string = 'pythony' 10 | # 默认就是贪婪模式 11 | pat = "p.*y" 12 | 13 | # 懒惰模式 14 | pat = "p.*?y" 15 | rst = re.search(pat, string, re.I) 16 | print(rst) 17 | 18 | # 正则函数 re.match()、 re.search()、全局匹配、re.sub() 19 | # match 从头开始匹配 20 | string = 'pythonyjkjkjssa' 21 | pat = "p.*?y" 22 | rst = re.match(pat, string) 23 | print(rst) 24 | 25 | # search 从任意地方匹配 26 | 27 | # 全局匹配函数 28 | string = 'sdpythpnyonyjkjkjptyssa' 29 | pat = "p.*?y" 30 | rst = re.compile(pat).findall(string) 31 | print(rst) 32 | 33 | 34 | # 正则表达式 实例 35 | import re 36 | 37 | # 匹配.com 和 .cn 38 | 39 | string = "百度" 40 | 41 | pat = "[a-zA-z]+://[^\s]*[.com|.cn]" 42 | 43 | ret = re.compile(pat).findall(string) 44 | 45 | print(ret) 46 | 47 | # 匹配电话号码 48 | string = "sdfsdfs021-123132432432fsfdwfds0773-23424324234sdfsdfsd" 49 | pat = "\d{3}-\d{8}|\d{4}-\d{7}" 50 | ret = re.compile(pat).findall(string) 51 | 52 | print(ret) 53 | 54 | -------------------------------------------------------------------------------- /examples/example-12.py: -------------------------------------------------------------------------------- 1 | # Script Name : example-12.py 2 | # Author : gaoyaqiu(球哥) 3 | # Created : 2017-03-05 4 | # Description : No.12 简单爬虫的编写(urllib学习) 5 | 6 | import urllib.request 7 | import re 8 | 9 | data = urllib.request.urlopen("http://edu.csdn.net").read() 10 | 11 | #print(data) 12 | 13 | # 自动提取课程页面的QQ群号码 14 | data = urllib.request.urlopen("http://edu.csdn.net/huiyiCourse/detail/253").read().decode("utf-8") 15 | pat = "
(\d*?)
" 16 | ret = re.compile(pat).findall(data) 17 | print(ret[0]) 18 | 19 | # 豆瓣网址出版爬取 20 | 21 | ''' 22 | import urllib.request 23 | import re 24 | 25 | data = urllib.request.urlopen("https://read.douban.com/provider/all").read().decode("utf-8") 26 | pat = '