├── .gitignore ├── DouTu ├── DouTu │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── doutu.py ├── README.md ├── scrapy.cfg └── screenshot.png ├── Dytt ├── Dytt │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── IpProxy ├── IpProxy │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── proxy.py ├── ip.txt └── scrapy.cfg ├── JiKeXueYuan ├── JiKeXueYuan │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── mySpider.py ├── data.json └── scrapy.cfg ├── OneSpider ├── OneSpider │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── myspider.py ├── data.json └── scrapy.cfg ├── QiuShiBaiKe ├── QiuShiBaiKe │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── MySpdier.py │ │ └── __init__.py ├── data.json ├── readme.md └── scrapy.cfg ├── README.md ├── ShiFuTu ├── ShiFuTu │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── myspider.py │ │ └── myspider2.py └── scrapy.cfg ├── ZhiHuUser ├── README.md ├── ZhiHuUser │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── mySpider.py └── scrapy.cfg ├── dbMoviesTop250 ├── data.json ├── dbMoviesTop250 │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── mySpider.py └── scrapy.cfg └── litterlove ├── data.csv ├── data.json ├── data.xml ├── litterlove ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── myspider.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | *.jpg 4 | *.jpeg 5 | *.png 6 | *~ 7 | -------------------------------------------------------------------------------- /DouTu/DouTu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/DouTu/DouTu/__init__.py -------------------------------------------------------------------------------- /DouTu/DouTu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoutuItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | img_url = scrapy.Field() 14 | name = scrapy.Field() 15 | -------------------------------------------------------------------------------- /DouTu/DouTu/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | import random 11 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 12 | 13 | 14 | class DoutuSpiderMiddleware(object): 15 | # Not all methods need to be defined. 
If a method is not defined, 16 | # scrapy acts as if the spider middleware does not modify the 17 | # passed objects. 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # This method is used by Scrapy to create your spiders. 22 | s = cls() 23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 24 | return s 25 | 26 | def process_spider_input(response, spider): 27 | # Called for each response that goes through the spider 28 | # middleware and into the spider. 29 | 30 | # Should return None or raise an exception. 31 | return None 32 | 33 | def process_spider_output(response, result, spider): 34 | # Called with the results returned from the Spider, after 35 | # it has processed the response. 36 | 37 | # Must return an iterable of Request, dict or Item objects. 38 | for i in result: 39 | yield i 40 | 41 | def process_spider_exception(response, exception, spider): 42 | # Called when a spider or process_spider_input() method 43 | # (from other spider middleware) raises an exception. 44 | 45 | # Should return either None or an iterable of Response, dict 46 | # or Item objects. 47 | pass 48 | 49 | def process_start_requests(start_requests, spider): 50 | # Called with the start requests of the spider, and works 51 | # similarly to the process_spider_output() method, except 52 | # that it doesn’t have a response associated. 53 | 54 | # Must return only requests (not items). 55 | for r in start_requests: 56 | yield r 57 | 58 | def spider_opened(self, spider): 59 | spider.logger.info('Spider opened: %s' % spider.name) 60 | 61 | 62 | class RotateUserAgentMiddleware(UserAgentMiddleware): 63 | def __init__(self, user_agent=''): 64 | self.user_agent = user_agent 65 | 66 | def process_request(self, request, spider): 67 | ua = random.choice(self.user_agent_list) 68 | if ua: 69 | print(ua) 70 | request.headers.setdefault('User-Agent', ua) 71 | 72 | user_agent_list = [ 73 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" 74 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 75 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 76 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 77 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 78 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 79 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 80 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 81 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 82 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 83 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 84 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 85 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 86 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 87 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, 
like Gecko) Chrome/19.0.1061.1 Safari/536.3", 88 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 89 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 90 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 91 | ] 92 | -------------------------------------------------------------------------------- /DouTu/DouTu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoutuPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /DouTu/DouTu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DouTu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'DouTu' 13 | 14 | SPIDER_MODULES = ['DouTu.spiders'] 15 | NEWSPIDER_MODULE = 'DouTu.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 16 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 0.2 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | SPIDER_MIDDLEWARES = { 50 | 'DouTu.middlewares.DoutuSpiderMiddleware': None, 51 | 'DouTu.middlewares.RotateUserAgentMiddleware': 400, 52 | } 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'DouTu.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item 
pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'DouTu.pipelines.DoutuPipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /DouTu/DouTu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /DouTu/DouTu/spiders/doutu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import requests 4 | import scrapy 5 | from DouTu.items import DoutuItem 6 | 7 | 8 | class DoutuSpider(scrapy.Spider): 9 | name = "doutu" 10 | allowed_domains = ["doutula.com", "sinaimg.cn"] 11 | start_urls = ['https://www.doutula.com/photo/list/?page={}'.format(i) for i in range(1, 40)] 12 | 13 | def parse(self, response): 14 | i = 0 15 | for content in response.xpath('//li[@class="list-group-item"]/div/div/a'): 16 | i += 1 17 | item = DoutuItem() 18 | item['img_url'] = content.xpath('//img/@data-original').extract()[i] 19 | item['name'] = content.xpath('//p/text()').extract()[i] 20 | 21 | try: 22 | if not os.path.exists('doutu'): 23 | os.makedirs('doutu') 24 | r = requests.get(item['img_url']) 25 | filename = 'doutu/{}'.format(item['name']) + item['img_url'][-4:] 26 | with open(filename, 'wb') as fo: 27 | fo.write(r.content) 28 | except Exception as e: 29 | raise e 30 | 31 | yield item 32 | -------------------------------------------------------------------------------- /DouTu/README.md: -------------------------------------------------------------------------------- 1 | 斗图 2 | 3 | ![image](./screenshot.png) 4 | -------------------------------------------------------------------------------- /DouTu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = DouTu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = DouTu 12 | -------------------------------------------------------------------------------- /DouTu/screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/DouTu/screenshot.png -------------------------------------------------------------------------------- /Dytt/Dytt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/Dytt/Dytt/__init__.py -------------------------------------------------------------------------------- /Dytt/Dytt/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class DyttItem(scrapy.Item): 11 | title = scrapy.Field() 12 | date = scrapy.Field() 13 | content = scrapy.Field() 14 | image = scrapy.Field() 15 | download_url = scrapy.Field() 16 | -------------------------------------------------------------------------------- /Dytt/Dytt/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DyttSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class DyttDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
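# The crawler argument gives access to the project settings and the signal
# manager; the connect() call below registers the spider_opened logging
# callback defined at the end of this class, so it runs once for every
# spider that is started.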
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /Dytt/Dytt/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DyttPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Dytt/Dytt/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Dytt project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Dytt' 13 | 14 | SPIDER_MODULES = ['Dytt.spiders'] 15 | NEWSPIDER_MODULE = 'Dytt.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Dytt (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Dytt.middlewares.DyttSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Dytt.middlewares.DyttDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Dytt.pipelines.DyttPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Dytt/Dytt/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please 
refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Dytt/Dytt/spiders/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from Dytt.items import DyttItem 4 | 5 | 6 | class MovieSpider(scrapy.Spider): 7 | name = 'movie' 8 | allowed_domains = ['dytt8.net'] 9 | start_urls = ['http://dytt8.net/'] 10 | 11 | def parse(self, response): 12 | dyttItem = DyttItem() 13 | item = response.xpath("//div[@class='co_area2']/div[@class='co_content8']/ul/table/tr")[1] 14 | title = item.xpath('td/a[2]/text()').extract()[0] 15 | href = item.xpath('td/a[2]/@href').extract()[0] 16 | date = item.xpath('td[2]/font/text()').extract()[0] 17 | # print(title) 18 | # print(date) 19 | dyttItem['title'] = title 20 | dyttItem['date'] = date 21 | 22 | yield scrapy.Request( 23 | url = 'http://dytt8.net' + href, 24 | callback = self.parse_item, 25 | meta = { 26 | 'item': dyttItem, 27 | } 28 | ) 29 | 30 | def parse_item(self, response): 31 | dyttItem = response.meta['item'] 32 | image = response.xpath("//div[@id='Zoom']//img[1]/@src").extract()[0] 33 | content = [i for i in response.xpath("//div[@id='Zoom']//text()[preceding-sibling::br]").extract() if i.strip() != ""] 34 | download_url = response.xpath("//div[@id='Zoom']//a/@href").extract() 35 | # print (image) 36 | # print ("\n".join(content)) 37 | # print ("\n".join(download_url)) 38 | dyttItem['image'] = image 39 | dyttItem['content'] = "\n".join(content) 40 | dyttItem['download_url'] = "\n".join(download_url) 41 | 42 | return dyttItem 43 | -------------------------------------------------------------------------------- /Dytt/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Dytt.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Dytt 12 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/IpProxy/IpProxy/__init__.py -------------------------------------------------------------------------------- /IpProxy/IpProxy/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #actor 3 | 4 | import scrapy 5 | 6 | class IpproxyItem(scrapy.Item): 7 | # define the fields for your item here like: 8 | # name = scrapy.Field() 9 | IP = scrapy.Field() 10 | port = scrapy.Field() 11 | #status = scrapy.Field() 12 | #types = scrapy.Field() 13 | #support = scrapy.Field() 14 | #address = scrapy.Field() 15 | #speed = scrapy.Field() 16 | #testtime = scrapy.Field() 17 | #grab_time = scrapy.Field() 18 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class IpproxyPipeline(object): 4 | 5 | def process_item(self, item, spider): 6 | print '---------write------------------' 7 | f=file("ip.txt","a+") 8 | content=item['IP'] + ':' +item['port'] 
+'\n' 9 | f.write(content) 10 | f.close() 11 | return item 12 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for IpProxy project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'IpProxy' 13 | 14 | SPIDER_MODULES = ['IpProxy.spiders'] 15 | NEWSPIDER_MODULE = 'IpProxy.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'IpProxy (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | DOWNLOAD_DELAY=5 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'IpProxy.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'IpProxy.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | USER_AGENT = USER_AGENTS = [ 62 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)" 63 | ] 64 | # Configure item pipelines 65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'IpProxy.pipelines.IpproxyPipeline': 300, 68 | } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 73 | #AUTOTHROTTLE_ENABLED=True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY=5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY=60 78 | # Enable showing throttling stats for every response received: 79 | #AUTOTHROTTLE_DEBUG=False 80 | 81 | # Enable and configure HTTP caching (disabled by default) 82 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 83 | #HTTPCACHE_ENABLED=True 84 | #HTTPCACHE_EXPIRATION_SECS=0 85 | #HTTPCACHE_DIR='httpcache' 86 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 87 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 88 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /IpProxy/IpProxy/spiders/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request 5 | from IpProxy.items import IpproxyItem 6 | import time 7 | class MyspiderSpider(scrapy.Spider): 8 | name = "kuaidaili" 9 | allowed_domains = ["http://www.kuaidaili.com/"] 10 | start_urls = ( 11 | 'http://www.kuaidaili.com/proxylist/%d/' % i for i in xrange(1,21) 12 | ) 13 | 14 | def parse(self, response): 15 | item = IpproxyItem() 16 | IP = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[1]/text()').extract() 17 | port = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[2]/text()').extract() 18 | #status = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[3]/text()').extract() 19 | #types = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[4]/text()').extract() 20 | #support = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[5]/text()').extract() 21 | #address = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[6]/text()').extract() 22 | #speed= Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[7]/text()').extract() 23 | #testtime= Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[8]/text()').extract() 24 | 25 | for i in range(len(IP)): 26 | item['IP'] = IP[i] 27 | item['port'] = port[i] 28 | #item['status'] = status[i] 29 | #item['types'] = types[i] 30 | #item['support'] = support[i] 31 | #item['address'] = address[i] 32 | #item['speed'] = speed[i] 33 | #item['testtime'] = testtime[i] 34 | #item['grab_time'] = time.strftime('%Y-%m-%d') 35 | yield item 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /IpProxy/ip.txt: -------------------------------------------------------------------------------- 1 | 58.61.75.134:9999 2 | 121.69.25.58:8118 3 | 180.110.78.206:8998 4 | 101.201.52.37:80 5 | 218.202.137.102:81 6 | 183.57.82.74:8081 7 | 183.230.53.12:8123 8 | 115.173.230.31:8123 9 | 49.85.250.139:8998 10 | 120.83.227.162:9999 11 | 117.64.22.71:8998 12 | 58.251.199.23:8118 13 | 222.45.196.46:8118 14 | 116.7.103.168:3128 15 | 218.241.153.211:81 16 | 14.154.200.44:8118 17 | 123.54.207.170:8998 18 | 113.122.117.223:8998 19 | 114.232.19.132:8998 20 | 125.39.226.122:20000 21 | 175.13.128.45:81 22 | 218.205.76.131:80 23 | 42.123.89.120:8888 24 | 60.164.152.183:8998 25 | 60.191.153.12:3128 26 | 117.67.239.97:8998 27 | 123.121.104.126:9797 28 | 180.124.161.7:8998 29 | 221.237.154.57:9999 30 | 115.28.33.176:1080 31 | 119.147.115.6:8088 32 | 
122.96.59.105:81 33 | 125.87.249.205:8123 34 | 221.237.154.58:9999 35 | 113.105.80.61:3128 36 | 115.218.143.148:3128 37 | 175.30.122.81:8998 38 | 27.46.52.10:8118 39 | 111.73.154.57:8998 40 | 183.140.40.102:8998 41 | 183.140.40.102:8998 42 | 121.69.16.146:9999 43 | 60.185.209.157:8998 44 | 112.255.39.176:80 45 | 183.141.125.140:3128 46 | 183.230.53.166:8123 47 | 222.59.161.12:8118 48 | 171.217.113.209:9797 49 | 219.141.225.149:80 50 | 112.92.207.122:9999 51 | 60.13.74.143:82 52 | 121.40.37.25:6666 53 | 222.192.171.131:8998 54 | 113.66.62.137:9999 55 | 115.236.166.125:8080 56 | 114.217.23.141:8998 57 | 221.179.195.78:80 58 | 183.57.82.71:8081 59 | 120.204.85.29:3128 60 | 61.163.55.3:9000 61 | 183.230.53.177:8123 62 | 60.186.80.44:3128 63 | 42.95.231.27:8998 64 | 183.230.53.181:8123 65 | 110.73.55.188:8123 66 | 175.8.27.151:808 67 | 210.14.134.37:80 68 | 121.33.35.191:9797 69 | 223.82.208.56:8123 70 | 116.237.99.243:8118 71 | 220.249.21.222:8118 72 | 222.221.46.219:8998 73 | 60.13.74.143:80 74 | 111.195.87.63:8123 75 | 183.141.153.66:3128 76 | 113.251.235.243:8998 77 | 115.203.28.241:8888 78 | 222.78.248.10:3128 79 | 115.220.205.19:8998 80 | 121.69.22.250:8118 81 | 121.69.22.250:8118 82 | 61.160.254.23:23 83 | 61.143.158.238:808 84 | 114.215.192.135:8118 85 | 182.61.22.179:80 86 | 60.161.23.243:8998 87 | 27.46.52.37:8118 88 | 42.51.13.103:8118 89 | 61.134.25.106:3128 90 | 119.130.194.147:9999 91 | 124.239.238.201:9999 92 | 113.122.214.253:8998 93 | 116.10.176.39:9999 94 | 27.36.94.82:8080 95 | 124.47.7.45:80 96 | 14.155.83.183:9999 97 | 121.56.7.9:8080 98 | 27.42.239.174:9797 99 | 39.150.97.8:8998 100 | 120.194.107.149:9999 101 | 211.149.155.136:8118 102 | 183.230.53.122:8123 103 | 58.61.75.134:9999 104 | 121.69.25.58:8118 105 | 180.110.78.206:8998 106 | 101.201.52.37:80 107 | 218.202.137.102:81 108 | 183.57.82.74:8081 109 | 183.230.53.12:8123 110 | 115.173.230.31:8123 111 | -------------------------------------------------------------------------------- /IpProxy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = IpProxy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = IpProxy 12 | -------------------------------------------------------------------------------- /JiKeXueYuan/JiKeXueYuan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/JiKeXueYuan/JiKeXueYuan/__init__.py -------------------------------------------------------------------------------- /JiKeXueYuan/JiKeXueYuan/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | class JiKeXueYuanItem(Item): 11 | course_id = Field() 12 | course_name = Field() 13 | course_url = Field() 14 | course_path = Field() 15 | pass 16 | 17 | -------------------------------------------------------------------------------- /JiKeXueYuan/JiKeXueYuan/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your 
item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class JikexueyuanPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /JiKeXueYuan/JiKeXueYuan/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for JiKeXueYuan project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'JiKeXueYuan' 13 | 14 | SPIDER_MODULES = ['JiKeXueYuan.spiders'] 15 | NEWSPIDER_MODULE = 'JiKeXueYuan.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'JiKeXueYuan (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'JiKeXueYuan.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'JiKeXueYuan.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | #ITEM_PIPELINES = { 65 | # 'JiKeXueYuan.pipelines.SomePipeline': 300, 66 | #} 67 | 68 | # Enable and configure the AutoThrottle extension (disabled by default) 69 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 70 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 71 | #AUTOTHROTTLE_ENABLED=True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY=5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY=60 76 | # Enable showing throttling stats for every response received: 77 | #AUTOTHROTTLE_DEBUG=False 78 | 79 | # Enable and configure HTTP caching (disabled by default) 
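# Uncommenting the HTTPCACHE_* lines below turns on Scrapy's built-in
# on-disk response cache (stored under the project's .scrapy/httpcache
# directory by default), which avoids re-downloading pages while the
# parsing code is still being tweaked.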
80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 81 | #HTTPCACHE_ENABLED=True 82 | #HTTPCACHE_EXPIRATION_SECS=0 83 | #HTTPCACHE_DIR='httpcache' 84 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 85 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 86 | -------------------------------------------------------------------------------- /JiKeXueYuan/JiKeXueYuan/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /JiKeXueYuan/JiKeXueYuan/spiders/mySpider.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | from scrapy import Spider 4 | from scrapy.http import Request 5 | from scrapy.selector import Selector 6 | from scrapy.spiders import CrawlSpider 7 | from JiKeXueYuan.items import JiKeXueYuanItem 8 | 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding("utf-8") 12 | 13 | class CourseSpider(Spider): 14 | name = "course" 15 | baseurl = "http://www.jikexueyuan.com/course/" 16 | allowed_domains = ["http://www.jikexueyuan.com/", "search.jikexueyuan.com", "jikexueyuan.com"] 17 | start_urls = [ 18 | # 修改这里以完成整站的爬取 19 | # 'http://www.jikexueyuan.com/course/?pageNum=%d' % i for i in xrange(1, 86) 20 | 'http://www.jikexueyuan.com/course/?pageNum=1' 21 | ] 22 | 23 | def __init__(self): 24 | self.cookies = {your-cookie} 25 | 26 | def parse(self, response): 27 | s_total = Selector(text=response.body).xpath("//ul[@class='cf']/li/div[@class='lessonimg-box']/a/@href").extract() 28 | 29 | if len(s_total) > 0: 30 | for page in s_total: 31 | yield Request(page, callback=self.get_course_page, cookies=self.cookies) 32 | else: 33 | pass 34 | 35 | def get_course_page(self, response): 36 | x_course = Selector(text=response.body).xpath("//ul/li/div[@class='text-box']/h2/a") 37 | for x in x_course: 38 | try: 39 | href = x.xpath('@href').extract()[0] 40 | title = x.xpath('text()').extract()[0] 41 | 42 | meta = {} 43 | meta['href'] = href 44 | meta['title'] = title 45 | yield Request(href, callback=self.get_down_urls, meta={'meta': meta}, cookies=self.cookies) 46 | except: 47 | pass 48 | 49 | def get_down_urls(self, response): 50 | meta = response.meta['meta'] 51 | path = Selector(text=response.body).xpath("//div[@class='crumbs']/div[@class='w-1000']/a/text()").extract() 52 | course_down = re.findall(r'source src="(.*?)"', response.body, re.S) 53 | item = JiKeXueYuanItem() 54 | if course_down: 55 | item['course_id'] = meta['href'] 56 | item['course_name'] = meta['title'] 57 | item['course_url'] = course_down[0] 58 | item['course_path'] = path 59 | yield item -------------------------------------------------------------------------------- /JiKeXueYuan/data.json: -------------------------------------------------------------------------------- 1 | [{"course_id": "http://www.jikexueyuan.com/course/2826_3.html?ss=1", "course_name": " \u5176\u4ed6\u7eb9\u7406\u4f18\u5316\u65b9\u5f0f", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684\u7eb9\u7406\u4f18\u5316"], "course_url": 
"http://cv4.jikexueyuan.com/d3dfec23f93e1a12b6ea3107c3dc577f/201606211645/course/3301-3400/3383/video/9697_b_h264_sd_960_540.mp4"}, 2 | {"course_id": "http://www.jikexueyuan.com/course/2826_1.html?ss=1", "course_name": " \u538b\u7f29\u7eb9\u7406", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684\u7eb9\u7406\u4f18\u5316"], "course_url": "http://cv4.jikexueyuan.com/218a7e326a8144045e835d9850f312ae/201606211645/course/3301-3400/3383/video/9695_b_h264_sd_960_540.mp4"}, 3 | {"course_id": "http://www.jikexueyuan.com/course/2718_4.html?ss=1", "course_name": " iOS \u5c01\u88c5 IAP \u652f\u4ed8 SDK", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "iOS", "iOS \u5c01\u88c5\u7b2c\u4e09\u65b9\u652f\u4ed8 SDK"], "course_url": "http://cv4.jikexueyuan.com/3b03f23e36bde02bd02f46a511f9ce25/201606211645/course/2901-3000/2995/video/8774_b_h264_sd_960_540.mp4"}, 4 | {"course_id": "http://www.jikexueyuan.com/course/2718_3.html?ss=1", "course_name": " iOS \u5c01\u88c5\u7b2c\u4e09\u65b9\u652f\u4ed8 SDK", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "iOS", "iOS \u5c01\u88c5\u7b2c\u4e09\u65b9\u652f\u4ed8 SDK"], "course_url": "http://cv4.jikexueyuan.com/bbff298059ed8a691dfe61ae7bf41c11/201606211645/course/2901-3000/2995/video/8773_b_h264_sd_960_540.mp4"}, 5 | {"course_id": "http://www.jikexueyuan.com/course/2718_2.html?ss=1", "course_name": " iOS \u521b\u5efa Framework \u7c7b\u5e93", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "iOS", "iOS \u5c01\u88c5\u7b2c\u4e09\u65b9\u652f\u4ed8 SDK"], "course_url": "http://cv4.jikexueyuan.com/80a06e0054da5b5ca3921b2f8da39467/201606211645/course/2901-3000/2995/video/8772_b_h264_sd_960_540.mp4"}, 6 | {"course_id": "http://www.jikexueyuan.com/course/2720_1.html?ss=1", "course_name": " \u7f16\u7a0b\u4e4b\u7f8e\u201c\u51cc\u4e91\u4e4b\u667a\u201c\u7ebf\u4e0a\u5206\u4eab\u4e4b Azure \u7ebf\u4e0a\u57f9\u8bad\u53ca\u5de5\u7a0b\u5e08\u7b54\u7591", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Windows Azure", "\u7f16\u7a0b\u4e4b\u7f8e\u201c\u51cc\u4e91\u4e4b\u667a\u201c\u7ebf\u4e0a\u5206\u4eab\u4e4b Azure \u7ebf\u4e0a\u57f9\u8bad\u53ca\u5de5\u7a0b\u5e08\u7b54\u7591"], "course_url": "http://cv4.jikexueyuan.com/92768d911fcd8b3361c4e5a8e70eedc7/201606211645/course/3301-3400/3325/video/c3325b_01_h264_sd_960_540.mp4"}, 7 | {"course_id": "http://www.jikexueyuan.com/course/2715_5.html?ss=1", "course_name": " \u5220\u9664\u5b66\u751f\u4fe1\u606f\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "C", "C \u8bed\u8a00\u7f16\u7a0b\u5b9e\u6218\uff1a\u5b66\u751f\u4fe1\u606f\u7ba1\u7406\u7cfb\u7edf\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/4c4fc2551d0434982a2230194c4e2e73/201606211645/course/3001-3100/3095/video/8922_b_h264_sd_960_540.mp4"}, 8 | {"course_id": "http://www.jikexueyuan.com/course/2721_3.html?ss=1", "course_name": " \u6267\u884c\u7ed8\u5236\u547d\u4ee4", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684 OpenGL ES\u77e5\u8bc6\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/5961b826c2d8ed374ffeb21705370b21/201606211645/course/3101-3200/3151/video/9044_b_h264_sd_960_540.mp4"}, 9 | {"course_id": "http://www.jikexueyuan.com/course/2721_1.html?ss=1", "course_name": " \u7ed1\u5b9a Shader", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x 
\u5f15\u64ce\u6e90\u7801\u4e2d\u7684 OpenGL ES\u77e5\u8bc6\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/59cf9dcfa396dfa62f6d9f987ab7c295/201606211645/course/3101-3200/3151/video/9042_b_h264_sd_960_540.mp4"}, 10 | {"course_id": "http://www.jikexueyuan.com/course/2721_2.html?ss=1", "course_name": " \u4f20\u5165\u7ed8\u5236\u4fe1\u606f", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684 OpenGL ES\u77e5\u8bc6\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/785b1dd9008eb94430482106b34e3e2a/201606211645/course/3101-3200/3151/video/9043_b_h264_sd_960_540.mp4"}, 11 | {"course_id": "http://www.jikexueyuan.com/course/2722_2.html?ss=1", "course_name": " \u4e00\u5143\u7ebf\u6027\u56de\u5f52\u7684\u539f\u7406", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6570\u636e\u6316\u6398", "\u4e00\u5143\u7ebf\u6027\u56de\u5f52"], "course_url": "http://cv4.jikexueyuan.com/49ed7043e4954dabd9f05b688b5051e5/201606211645/course/2801-2900/2842/video/7930_b_h264_sd_960_540.mp4"}, 12 | {"course_id": "http://www.jikexueyuan.com/course/2722_3.html?ss=1", "course_name": " \u4e00\u5143\u7ebf\u6027\u56de\u5f52\u5b9e\u4f8b\u89e3\u6790", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6570\u636e\u6316\u6398", "\u4e00\u5143\u7ebf\u6027\u56de\u5f52"], "course_url": "http://cv4.jikexueyuan.com/29cfe249131416b020921d1387ba0d3e/201606211645/course/2801-2900/2842/video/7931_b_h264_sd_960_540.mp4"}, 13 | {"course_id": "http://www.jikexueyuan.com/course/2826_2.html?ss=1", "course_name": " \u7eb9\u7406\u7f13\u5b58", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684\u7eb9\u7406\u4f18\u5316"], "course_url": "http://cv4.jikexueyuan.com/b539e4fbe320047ac3aad905fe76fe48/201606211645/course/3301-3400/3383/video/9696_b_h264_sd_960_540.mp4"}, 14 | {"course_id": "http://www.jikexueyuan.com/course/2723_4.html?ss=1", "course_name": " Flume Sink \u7ec4\u4ef6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Flume", "Flume \u57fa\u7840\u67b6\u6784"], "course_url": "http://cv4.jikexueyuan.com/d9a4f38b45d4a11fb3373349d33a5228/201606211645/course/3101-3200/3139/video/9274_b_h264_sd_960_540.mp4"}, 15 | {"course_id": "http://www.jikexueyuan.com/course/2723_3.html?ss=1", "course_name": " Flume Channel \u7ec4\u4ef6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Flume", "Flume \u57fa\u7840\u67b6\u6784"], "course_url": "http://cv4.jikexueyuan.com/6b2f23a347802af1d90d726e4f156a86/201606211645/course/3101-3200/3139/video/9273_b_h264_sd_960_540.mp4"}, 16 | {"course_id": "http://www.jikexueyuan.com/course/2780_4.html?ss=1", "course_name": " ES6 Generator \u7f16\u7a0b", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "JavaScript", "JavaScript \u5f02\u6b65\u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/ca133a2a216a8a9af901e9ad49af063f/201606211645/course/2801-2900/2860/video/8003_b_h264_sd_960_540.mp4"}, 17 | {"course_id": "http://www.jikexueyuan.com/course/2780_3.html?ss=1", "course_name": " JavaScript \u5f02\u6b65\u7f16\u7a0b\u5f00\u6e90\u5e93", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "JavaScript", "JavaScript \u5f02\u6b65\u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/2082393a4c129c64a51e973283c14ffe/201606211645/course/2801-2900/2860/video/8002_b_h264_sd_960_540.mp4"}, 18 | {"course_id": 
"http://www.jikexueyuan.com/course/2780_2.html?ss=1", "course_name": " JavaScript \u5f02\u6b65\u7f16\u7a0b\u7684\u4e3b\u8981\u65b9\u6cd5", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "JavaScript", "JavaScript \u5f02\u6b65\u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/df2feeba68d5c5a869197bf1ce471f5c/201606211645/course/2801-2900/2860/video/8001_b_h264_sd_960_540.mp4"}, 19 | {"course_id": "http://www.jikexueyuan.com/course/2780_1.html?ss=1", "course_name": " JavaScript \u540c\u6b65\u548c\u5f02\u6b65\u7f16\u7a0b\u7684\u533a\u522b", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "JavaScript", "JavaScript \u5f02\u6b65\u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/039e85901d5ef51d58d2d373ca5dc9dd/201606211645/course/2801-2900/2860/video/8000_b_h264_sd_960_540.mp4"}, 20 | {"course_id": "http://www.jikexueyuan.com/course/2779_4.html?ss=1", "course_name": " \u5f00\u53d1\u8005\u670d\u52a1\u7bc7-IDE\u3001\u4e91\u670d\u52a1\u3001\u5f00\u53d1\u6807\u51c6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Google I/O 2016 \u6280\u672f\u63ed\u79d8\u4e0e\u524d\u77bb"], "course_url": "http://cv4.jikexueyuan.com/9df761ed1de3460813bda6d0cbb83f91/201606211645/course/3301-3400/3364/video/c3364b_04_h264_sd_960_540.mp4"}, 21 | {"course_id": "http://www.jikexueyuan.com/course/2779_2.html?ss=1", "course_name": " \u65b0\u8bbe\u5907\u4ea7\u54c1\u7bc7-\u672a\u6765\u7684\u4eba\u5de5\u667a\u80fd\u539f\u578b", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Google I/O 2016 \u6280\u672f\u63ed\u79d8\u4e0e\u524d\u77bb"], "course_url": "http://cv4.jikexueyuan.com/8049313386daba3e8265fb5cb451705a/201606211645/course/3301-3400/3364/video/c3364b_02_h264_sd_960_540.mp4"}, 22 | {"course_id": "http://www.jikexueyuan.com/course/2779_1.html?ss=1", "course_name": " Google I/O 2016 \u6982\u8ff0", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Google I/O 2016 \u6280\u672f\u63ed\u79d8\u4e0e\u524d\u77bb"], "course_url": "http://cv4.jikexueyuan.com/f64b5743aa6d178e84e1b3bcd63300b6/201606211645/course/3301-3400/3364/video/c3364b_01_h264_sd_960_540.mp4"}, 23 | {"course_id": "http://www.jikexueyuan.com/course/2781_3.html?ss=1", "course_name": " Dubbo \u505a Http \u670d\u52a1\u5668", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u5e38\u7528\u6846\u67b6", "Dubbo \u4f7f\u7528\u5165\u95e8\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/801f2312f50d8d4d204617b02f597a81/201606211645/course/2601-2700/2621/video/7302_b_h264_sd_960_540.mp4"}, 24 | {"course_id": "http://www.jikexueyuan.com/course/2781_1.html?ss=1", "course_name": " Web \u5e94\u7528\u914d\u7f6e", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u5e38\u7528\u6846\u67b6", "Dubbo \u4f7f\u7528\u5165\u95e8\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/12c405b0b3a33cc65ce20b662f5c71b9/201606211645/course/2601-2700/2621/video/7300_b_h264_sd_960_540.mp4"}, 25 | {"course_id": "http://www.jikexueyuan.com/course/2781_2.html?ss=1", "course_name": " Web \u5e94\u7528\u6df7\u5408\u914d\u7f6e", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u5e38\u7528\u6846\u67b6", "Dubbo \u4f7f\u7528\u5165\u95e8\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/b98ca0c15c0cab5572be671d98b598bb/201606211645/course/2601-2700/2621/video/7301_b_h264_sd_960_540.mp4"}, 26 | {"course_id": "http://www.jikexueyuan.com/course/2743_3.html?ss=1", "course_name": " 
\u4e91\u8ba1\u7b97\u57fa\u7840\u77e5\u8bc6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Docker", "Docker \u57fa\u7840\u77e5\u8bc6"], "course_url": "http://cv4.jikexueyuan.com/1c9e5d096e975a455a2627adb195c8d1/201606211645/course/3301-3400/3332/video/9591_b_h264_sd_960_540.mp4"}, 27 | {"course_id": "http://www.jikexueyuan.com/course/2743_2.html?ss=1", "course_name": " \u865a\u62df\u5316\u6280\u672f", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Docker", "Docker \u57fa\u7840\u77e5\u8bc6"], "course_url": "http://cv4.jikexueyuan.com/7be60d4c0f861f0762ded174be05e768/201606211645/course/3301-3400/3332/video/9590_b_h264_sd_960_540.mp4"}, 28 | {"course_id": "http://www.jikexueyuan.com/course/2743_1.html?ss=1", "course_name": " Docker hello world", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Docker", "Docker \u57fa\u7840\u77e5\u8bc6"], "course_url": "http://cv4.jikexueyuan.com/f06983fecd424ce0f7ba94d6fa1762a7/201606211645/course/3301-3400/3332/video/9589_b_h264_sd_960_540.mp4"}, 29 | {"course_id": "http://www.jikexueyuan.com/course/2779_3.html?ss=1", "course_name": " \u7cfb\u7edf\u66f4\u65b0\u7bc7-Android N\u3001Daydream\u3001Wear 2.0\u3001Auto", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Google I/O 2016 \u6280\u672f\u63ed\u79d8\u4e0e\u524d\u77bb"], "course_url": "http://cv4.jikexueyuan.com/a074307fbac4bca6225030a6285be20b/201606211645/course/3301-3400/3364/video/c3364b_03_h264_sd_960_540.mp4"}, 30 | {"course_id": "http://www.jikexueyuan.com/course/2741_2.html?ss=1", "course_name": " \u573a\u666f\u4ea4\u4e92", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "OpenGL/GLES", "OSG\u4eba\u673a\u4ea4\u4e92"], "course_url": "http://cv4.jikexueyuan.com/9bc9e7b6910408ea3597831afdbb081d/201606211645/course/3101-3200/3192/video/c3192b_02_h264_sd_960_540.mp4"}, 31 | {"course_id": "http://www.jikexueyuan.com/course/2741_3.html?ss=1", "course_name": " \u5bf9\u8c61\u62fe\u53d6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "OpenGL/GLES", "OSG\u4eba\u673a\u4ea4\u4e92"], "course_url": "http://cv4.jikexueyuan.com/42f87fa738d48207d30f2f806a9017ab/201606211645/course/3101-3200/3192/video/c3192b_03_h264_sd_960_540.mp4"}, 32 | {"course_id": "http://www.jikexueyuan.com/course/2722_1.html?ss=1", "course_name": " \u4e00\u5143\u7ebf\u6027\u56de\u5f52\u6982\u8ff0", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6570\u636e\u6316\u6398", "\u4e00\u5143\u7ebf\u6027\u56de\u5f52"], "course_url": "http://cv4.jikexueyuan.com/1558ad48e834d0e248311b351dffb89d/201606211645/course/2801-2900/2842/video/7929_b_h264_sd_960_540.mp4"}, 33 | {"course_id": "http://www.jikexueyuan.com/course/2723_2.html?ss=1", "course_name": " Flume Source \u7ec4\u4ef6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Flume", "Flume \u57fa\u7840\u67b6\u6784"], "course_url": "http://cv4.jikexueyuan.com/2cdd979197ab5976e8819dea442a7b28/201606211645/course/3101-3200/3139/video/9270_b_h264_sd_960_540.mp4"}, 34 | {"course_id": "http://www.jikexueyuan.com/course/2730_3.html?ss=1", "course_name": " \u4f7f\u7528\u6447\u6746\u6a21\u62df\u9f20\u6807", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Arduino", "Arduino \u6a21\u62df\u9f20\u6807\u952e\u76d8\uff08\u4e8c\uff09"], "course_url": "http://cv4.jikexueyuan.com/5c65285543bb5a0ba4dab802ff38b71f/201606211645/course/3101-3200/3163/video/9110_b_h264_sd_960_540.mp4"}, 35 | {"course_id": 
"http://www.jikexueyuan.com/course/2723_1.html?ss=1", "course_name": " Flume \u6574\u4f53\u67b6\u6784", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Flume", "Flume \u57fa\u7840\u67b6\u6784"], "course_url": "http://cv4.jikexueyuan.com/f803ce945ad316c1ec7e3e153fa10a6f/201606211645/course/3101-3200/3139/video/9269_b_h264_sd_960_540.mp4"}, 36 | {"course_id": "http://www.jikexueyuan.com/course/2741_1.html?ss=1", "course_name": " \u573a\u666f\u89c6\u56fe", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "OpenGL/GLES", "OSG\u4eba\u673a\u4ea4\u4e92"], "course_url": "http://cv4.jikexueyuan.com/0ac16ae614f85f8611249517c340f82f/201606211645/course/3101-3200/3192/video/c3192b_01_h264_sd_960_540.mp4"}, 37 | {"course_id": "http://www.jikexueyuan.com/course/2730_2.html?ss=1", "course_name": " \u6a21\u62df\u9f20\u6807\u4e0e Arduino \u95f4\u7684\u8fde\u63a5", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Arduino", "Arduino \u6a21\u62df\u9f20\u6807\u952e\u76d8\uff08\u4e8c\uff09"], "course_url": "http://cv4.jikexueyuan.com/aba9dfbe4b457fc65b82241542eab160/201606211645/course/3101-3200/3163/video/9109_b_h264_sd_960_540.mp4"}, 38 | {"course_id": "http://www.jikexueyuan.com/course/2730_1.html?ss=1", "course_name": " Leonardo \u6a21\u62df\u9f20\u6807\u539f\u7406\u4ecb\u7ecd", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Arduino", "Arduino \u6a21\u62df\u9f20\u6807\u952e\u76d8\uff08\u4e8c\uff09"], "course_url": "http://cv4.jikexueyuan.com/0914379bbc0a039d56cab855e2520544/201606211645/course/3101-3200/3163/video/9108_b_h264_sd_960_540.mp4"}, 39 | {"course_id": "http://www.jikexueyuan.com/course/2715_3.html?ss=1", "course_name": " \u663e\u793a\u5b66\u751f\u4fe1\u606f\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "C", "C \u8bed\u8a00\u7f16\u7a0b\u5b9e\u6218\uff1a\u5b66\u751f\u4fe1\u606f\u7ba1\u7406\u7cfb\u7edf\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/a3357e1fa803b92b6c716c07ced56906/201606211645/course/3001-3100/3095/video/8920_b_h264_sd_960_540.mp4"}, 40 | {"course_id": "http://www.jikexueyuan.com/course/2715_4.html?ss=1", "course_name": " \u67e5\u627e\u5b66\u751f\u4fe1\u606f\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "C", "C \u8bed\u8a00\u7f16\u7a0b\u5b9e\u6218\uff1a\u5b66\u751f\u4fe1\u606f\u7ba1\u7406\u7cfb\u7edf\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/8f5683000bf6faed26ccd8f169e3f2ca/201606211645/course/3001-3100/3095/video/8921_b_h264_sd_960_540.mp4"}, 41 | {"course_id": "http://www.jikexueyuan.com/course/2715_2.html?ss=1", "course_name": " \u52a0\u8f7d\u5b66\u751f\u4fe1\u606f\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "C", "C \u8bed\u8a00\u7f16\u7a0b\u5b9e\u6218\uff1a\u5b66\u751f\u4fe1\u606f\u7ba1\u7406\u7cfb\u7edf\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/bd7e09666a0c523cab750825043ae0ce/201606211645/course/3001-3100/3095/video/8919_b_h264_sd_960_540.mp4"}, 42 | {"course_id": "http://www.jikexueyuan.com/course/2715_1.html?ss=1", "course_name": " \u4fdd\u5b58\u5b66\u751f\u4fe1\u606f\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "C", "C \u8bed\u8a00\u7f16\u7a0b\u5b9e\u6218\uff1a\u5b66\u751f\u4fe1\u606f\u7ba1\u7406\u7cfb\u7edf\uff08\u4e0b\uff09"], "course_url": "http://cv4.jikexueyuan.com/dc83d3667350df81517e33028ecee766/201606211645/course/3001-3100/3095/video/8918_b_h264_sd_960_540.mp4"}, 43 | {"course_id": 
"http://www.jikexueyuan.com/course/2713_5.html?ss=1", "course_name": " twisted \u7684\u4ecb\u7ecd\u4ee5\u53ca\u4e0e tornado \u7684\u6bd4\u8f83", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Python \u57fa\u7840", "Tornado \u5f00\u53d1--TCP \u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/0a31b9e7204e1ed64a498617dbd22ed5/201606211645/course/3101-3200/3190/video/9225_b_h264_sd_960_540.mp4"}, 44 | {"course_id": "http://www.jikexueyuan.com/course/2713_3.html?ss=1", "course_name": " iostream \u76f8\u5173 API \u4ecb\u7ecd", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Python \u57fa\u7840", "Tornado \u5f00\u53d1--TCP \u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/38246f4cad3ea5f7a26369f35c8d9096/201606211645/course/3101-3200/3190/video/9223_b_h264_sd_960_540.mp4"}, 45 | {"course_id": "http://www.jikexueyuan.com/course/2713_1.html?ss=1", "course_name": " tornado \u5728 TCP \u5c42\u7684\u5de5\u4f5c", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Python \u57fa\u7840", "Tornado \u5f00\u53d1--TCP \u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/2456c5e2eee51805fee3bad96f4332d9/201606211645/course/3101-3200/3190/video/9221_b_h264_sd_960_540.mp4"}, 46 | {"course_id": "http://www.jikexueyuan.com/course/2713_2.html?ss=1", "course_name": " ioloop \u76f8\u5173 API \u4ecb\u7ecd", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Python \u57fa\u7840", "Tornado \u5f00\u53d1--TCP \u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/3472082935bdeaf5c62bfe24948949ea/201606211645/course/3101-3200/3190/video/9222_b_h264_sd_960_540.mp4"}, 47 | {"course_id": "http://www.jikexueyuan.com/course/2714_3.html?ss=1", "course_name": " \u6700\u5c0f\u6df1\u5ea6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6570\u636e\u7ed3\u6784", "\u540d\u4f01\u6570\u636e\u7ed3\u6784\u9762\u8bd5\u9898\u4e4b DFS\uff08\u4e0a\uff09"], "course_url": "http://cv4.jikexueyuan.com/1c48839126e7729158c25eee0ae0e4cc/201606211645/course/3201-3300/3265/video/9385_b_h264_sd_960_540.mp4"}, 48 | {"course_id": "http://www.jikexueyuan.com/course/2714_4.html?ss=1", "course_name": " \u53cd\u8f6c\u4e8c\u53c9\u6811", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6570\u636e\u7ed3\u6784", "\u540d\u4f01\u6570\u636e\u7ed3\u6784\u9762\u8bd5\u9898\u4e4b DFS\uff08\u4e0a\uff09"], "course_url": "http://cv4.jikexueyuan.com/6973f75dcab5d4291c0c6625286033c8/201606211645/course/3201-3300/3265/video/9386_b_h264_sd_960_540.mp4"}, 49 | {"course_id": "http://www.jikexueyuan.com/course/2714_1.html?ss=1", "course_name": " \u8282\u70b9\u6240\u5728\u5c42\u6570", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6570\u636e\u7ed3\u6784", "\u540d\u4f01\u6570\u636e\u7ed3\u6784\u9762\u8bd5\u9898\u4e4b DFS\uff08\u4e0a\uff09"], "course_url": "http://cv4.jikexueyuan.com/fbab5f5645a270fbcb444b3ded48aba9/201606211645/course/3201-3300/3265/video/9383_b_h264_sd_960_540.mp4"}, 50 | {"course_id": "http://www.jikexueyuan.com/course/2714_2.html?ss=1", "course_name": " \u6700\u5927\u6df1\u5ea6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6570\u636e\u7ed3\u6784", "\u540d\u4f01\u6570\u636e\u7ed3\u6784\u9762\u8bd5\u9898\u4e4b DFS\uff08\u4e0a\uff09"], "course_url": "http://cv4.jikexueyuan.com/1ebe43a6e151877d1b107e731419b067/201606211645/course/3201-3300/3265/video/9384_b_h264_sd_960_540.mp4"}, 51 | {"course_id": "http://www.jikexueyuan.com/course/2713_4.html?ss=1", "course_name": " RPC 
\u4ecb\u7ecd\u4ee5\u53ca\u4f7f\u7528", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Python \u57fa\u7840", "Tornado \u5f00\u53d1--TCP \u7f16\u7a0b"], "course_url": "http://cv4.jikexueyuan.com/6d1ee6aaf2a33ac7c138b6bc4640c3e0/201606211645/course/3101-3200/3190/video/9224_b_h264_sd_960_540.mp4"}, 52 | {"course_id": "http://www.jikexueyuan.com/course/2719_3.html?ss=1", "course_name": " \u6301\u4e45\u5316\u7c7b\u578b\u5339\u914d\u7684\u5e94\u7528", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Launcher3\uff0d\u6587\u4ef6\u5939\u81ea\u52a8\u6574\u7406"], "course_url": "http://cv4.jikexueyuan.com/9eb18f658fd0f0fcb90eb5955e03b07b/201606211645/course/3201-3300/3244/video/c3244b_03_h264_sd_960_540.mp4"}, 53 | {"course_id": "http://www.jikexueyuan.com/course/2719_2.html?ss=1", "course_name": " \u6587\u4ef6\u5939\u7c7b\u578b", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Launcher3\uff0d\u6587\u4ef6\u5939\u81ea\u52a8\u6574\u7406"], "course_url": "http://cv4.jikexueyuan.com/f8c38ab6570b834b771233d9ccfe8fab/201606211645/course/3201-3300/3244/video/c3244b_02_h264_sd_960_540.mp4"}, 54 | {"course_id": "http://www.jikexueyuan.com/course/2719_4.html?ss=1", "course_name": " \u5728 Workspace \u663e\u793a\u5339\u914d\u7684\u5e94\u7528", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Launcher3\uff0d\u6587\u4ef6\u5939\u81ea\u52a8\u6574\u7406"], "course_url": "http://cv4.jikexueyuan.com/2561ecfe198900b987ead239ae30d3b0/201606211645/course/3201-3300/3244/video/c3244b_04_h264_sd_960_540.mp4"}, 55 | {"course_id": "http://www.jikexueyuan.com/course/2716_3.html?ss=1", "course_name": " \u4f4d\u7f6e\u4fe1\u606f\u4e0e\u4e2a\u4eba\u9690\u79c1", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u79fb\u52a8\u901a\u4fe1", "\u7269\u8054\u7f51\u6280\u672f\u5bfc\u8bba\uff08\u5341\u4e00\uff09\u7269\u8054\u7f51\u4e2d\u7684\u4fe1\u606f\u5b89\u5168\u4e0e\u9690\u79c1\u4fdd\u62a4"], "course_url": "http://cv4.jikexueyuan.com/94df973416af74f8a19677424cb24212/201606211645/course/3101-3200/3182/video/9155_b_h264_sd_960_540.mp4"}, 56 | {"course_id": "http://www.jikexueyuan.com/course/2716_2.html?ss=1", "course_name": " RFID \u5b89\u5168\u548c\u9690\u79c1\u4fdd\u62a4\u673a\u5236", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u79fb\u52a8\u901a\u4fe1", "\u7269\u8054\u7f51\u6280\u672f\u5bfc\u8bba\uff08\u5341\u4e00\uff09\u7269\u8054\u7f51\u4e2d\u7684\u4fe1\u606f\u5b89\u5168\u4e0e\u9690\u79c1\u4fdd\u62a4"], "course_url": "http://cv4.jikexueyuan.com/dbb1d55e605b6ba89496f8160b24914a/201606211645/course/3101-3200/3182/video/9154_b_h264_sd_960_540.mp4"}, 57 | {"course_id": "http://www.jikexueyuan.com/course/2718_1.html?ss=1", "course_name": " iOS \u521b\u5efa\u9759\u6001\u5e93", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "iOS", "iOS \u5c01\u88c5\u7b2c\u4e09\u65b9\u652f\u4ed8 SDK"], "course_url": "http://cv4.jikexueyuan.com/d63945736af6c22de0a55a68adb910c9/201606211645/course/2901-3000/2995/video/8699_b_h264_sd_960_540.mp4"}, 58 | {"course_id": "http://www.jikexueyuan.com/course/2716_1.html?ss=1", "course_name": " RFID \u5b89\u5168\u548c\u9690\u79c1", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u79fb\u52a8\u901a\u4fe1", "\u7269\u8054\u7f51\u6280\u672f\u5bfc\u8bba\uff08\u5341\u4e00\uff09\u7269\u8054\u7f51\u4e2d\u7684\u4fe1\u606f\u5b89\u5168\u4e0e\u9690\u79c1\u4fdd\u62a4"], "course_url": 
"http://cv4.jikexueyuan.com/9b5aba05602a732251c2047a2b06e95f/201606211645/course/3101-3200/3182/video/9153_b_h264_sd_960_540.mp4"}, 59 | {"course_id": "http://www.jikexueyuan.com/course/2719_1.html?ss=1", "course_name": " \u9879\u76ee\u6f14\u793a\u4ee5\u53ca\u9700\u6c42\u5206\u6790", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Launcher3\uff0d\u6587\u4ef6\u5939\u81ea\u52a8\u6574\u7406"], "course_url": "http://cv4.jikexueyuan.com/1a528930a24bf6203c3b137f5c7ccc36/201606211645/course/3201-3300/3244/video/c3244b_01_h264_sd_960_540.mp4"}, 60 | {"course_id": "http://www.jikexueyuan.com/course/2782_3.html?ss=1", "course_name": " \u8bbe\u7f6e\u7eb9\u7406\u5c5e\u6027\u5e76\u4f7f\u7528\u7eb9\u7406", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684\u7eb9\u7406"], "course_url": "http://cv4.jikexueyuan.com/e3cb5e2378011a46d485d48de0e2241f/201606211645/course/3301-3400/3305/video/9510_b_h264_sd_960_540.mp4"}, 61 | {"course_id": "http://www.jikexueyuan.com/course/2782_1.html?ss=1", "course_name": " \u751f\u6210\u7eb9\u7406\u524d\u7684\u51c6\u5907\u5de5\u4f5c", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684\u7eb9\u7406"], "course_url": "http://cv4.jikexueyuan.com/b04de0f87c3d59faf12947e3477abfb0/201606211645/course/3301-3400/3305/video/9508_b_h264_sd_960_540.mp4"}, 62 | {"course_id": "http://www.jikexueyuan.com/course/2782_2.html?ss=1", "course_name": " \u751f\u6210\u7eb9\u7406", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Cocos", "Cocos2d-x \u5f15\u64ce\u6e90\u7801\u4e2d\u7684\u7eb9\u7406"], "course_url": "http://cv4.jikexueyuan.com/abc989581401e5e75f3655ceb13effc8/201606211645/course/3301-3400/3305/video/9509_b_h264_sd_960_540.mp4"}, 63 | {"course_id": "http://www.jikexueyuan.com/course/2790_2.html?ss=1", "course_name": " \u516c\u6709\u73af\u5883\u5b89\u88c5 Docker", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Docker", "Docker \u5b89\u88c5"], "course_url": "http://cv4.jikexueyuan.com/aa425b111fa2c1add0dd9a4098e69d9e/201606211645/course/3301-3400/3356/video/9621_b_h264_sd_960_540.mp4"}, 64 | {"course_id": "http://www.jikexueyuan.com/course/2790_1.html?ss=1", "course_name": " \u79c1\u6709\u73af\u5883\u5b89\u88c5 Docker", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Docker", "Docker \u5b89\u88c5"], "course_url": "http://cv4.jikexueyuan.com/3160a3fdede0ef473bb83fa5f1d77ec1/201606211645/course/3301-3400/3356/video/9620_b_h264_sd_960_540.mp4"}, 65 | {"course_id": "http://www.jikexueyuan.com/course/2790_3.html?ss=1", "course_name": " \u914d\u7f6e\u56fd\u5185\u955c\u50cf\u4ed3\u5e93", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Docker", "Docker \u5b89\u88c5"], "course_url": "http://cv4.jikexueyuan.com/0b77638c50cdf1abb7a3edab236fa34a/201606211645/course/3301-3400/3356/video/9622_b_h264_sd_960_540.mp4"}, 66 | {"course_id": "http://www.jikexueyuan.com/course/2791_4.html?ss=1", "course_name": " \u524d\u540e\u7aef\u7a0b\u5e8f\u5b9e\u73b0", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6811\u8393\u6d3e", "\u6811\u8393\u6d3e\u9879\u76ee\u5b9e\u8df5\uff08\u4e00\uff09\u2014\u2014\u53ef\u7528 web \u63a7\u5236\u7684\u4eba\u4f53\u611f\u5e94\u5c0f\u591c\u706f"], "course_url": "http://cv4.jikexueyuan.com/7f4880db61dc883d96aaa3fc610bee0b/201606211645/course/2801-2900/2817/video/9089_b_h264_sd_960_540.mp4"}, 67 | {"course_id": 
"http://www.jikexueyuan.com/course/2791_3.html?ss=1", "course_name": " \u8ba9\u6811\u8393\u6d3e\u7cfb\u7edf\u6b63\u786e\u8d70\u65f6", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6811\u8393\u6d3e", "\u6811\u8393\u6d3e\u9879\u76ee\u5b9e\u8df5\uff08\u4e00\uff09\u2014\u2014\u53ef\u7528 web \u63a7\u5236\u7684\u4eba\u4f53\u611f\u5e94\u5c0f\u591c\u706f"], "course_url": "http://cv4.jikexueyuan.com/d3b293a67827d5572047574a38186807/201606211645/course/2801-2900/2817/video/9088_b_h264_sd_960_540.mp4"}, 68 | {"course_id": "http://www.jikexueyuan.com/course/2791_1.html?ss=1", "course_name": " \u9700\u6c42\u5206\u6790", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6811\u8393\u6d3e", "\u6811\u8393\u6d3e\u9879\u76ee\u5b9e\u8df5\uff08\u4e00\uff09\u2014\u2014\u53ef\u7528 web \u63a7\u5236\u7684\u4eba\u4f53\u611f\u5e94\u5c0f\u591c\u706f"], "course_url": "http://cv4.jikexueyuan.com/5f00cd92cf7916917256c35fa68a8c5a/201606211645/course/2801-2900/2817/video/7835_b_h264_sd_960_540.mp4"}, 69 | {"course_id": "http://www.jikexueyuan.com/course/2791_2.html?ss=1", "course_name": " \u4eba\u4f53\u63a5\u8fd1\u611f\u5e94\u529f\u80fd\u7684\u5b9e\u73b0", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u6811\u8393\u6d3e", "\u6811\u8393\u6d3e\u9879\u76ee\u5b9e\u8df5\uff08\u4e00\uff09\u2014\u2014\u53ef\u7528 web \u63a7\u5236\u7684\u4eba\u4f53\u611f\u5e94\u5c0f\u591c\u706f"], "course_url": "http://cv4.jikexueyuan.com/2127f29d528ee2f4f932a82f15684d39/201606211645/course/2801-2900/2817/video/9087_b_h264_sd_960_540.mp4"}, 70 | {"course_id": "http://www.jikexueyuan.com/course/2792_1.html?ss=1", "course_name": " SQL Server \u6570\u636e\u5e93\u7684\u67e5\u8be2\u4f18\u5316\u5668\u7684\u8fd0\u884c\u6982\u8ff0", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "SQL Server", "Microsoft SQL Server \u6570\u636e\u5e93\u7684\u67e5\u8be2\u4f18\u5316\u5668\u7684\u8fd0\u884c\u65b9\u5f0f"], "course_url": "http://cv4.jikexueyuan.com/7a467684aecdafe875c4bdf2cf405be2/201606211645/course/1901-2000/1909/video/5160_b_h264_sd_960_540.mp4"}, 71 | {"course_id": "http://www.jikexueyuan.com/course/2792_2.html?ss=1", "course_name": " SQL Server \u6570\u636e\u5e93\u7684\u67e5\u8be2\u4f18\u5316\u5668\u7684\u7b5b\u9009\u6761\u4ef6\u5206\u6790", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "SQL Server", "Microsoft SQL Server \u6570\u636e\u5e93\u7684\u67e5\u8be2\u4f18\u5316\u5668\u7684\u8fd0\u884c\u65b9\u5f0f"], "course_url": "http://cv4.jikexueyuan.com/85bed3899eb78973546c0f6451bac768/201606211645/course/1901-2000/1909/video/5161_b_h264_sd_960_540.mp4"}, 72 | {"course_id": "http://www.jikexueyuan.com/course/2815_6.html?ss=1", "course_name": " \u6570\u636e\u5e93\u6a21\u5757\uff08\u4e0b\uff09", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Android \u5feb\u901f\u5f00\u53d1\u6846\u67b6\u4e4b xUtils3"], "course_url": "http://cv4.jikexueyuan.com/dc91f33f613594700f510ff6333aff53/201606211645/course/3201-3300/3261/video/9371_b_h264_sd_960_540.mp4"}, 73 | {"course_id": "http://www.jikexueyuan.com/course/2815_5.html?ss=1", "course_name": " \u6570\u636e\u5e93\u6a21\u5757\uff08\u4e0a\uff09", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Android \u5feb\u901f\u5f00\u53d1\u6846\u67b6\u4e4b xUtils3"], "course_url": "http://cv4.jikexueyuan.com/d2ca22039c319d6091ed1181933804f0/201606211645/course/3201-3300/3261/video/9370_b_h264_sd_960_540.mp4"}, 74 | {"course_id": 
"http://www.jikexueyuan.com/course/2792_3.html?ss=1", "course_name": " SQL Server \u6570\u636e\u5e93\u7684\u67e5\u8be2\u4f18\u5316\u5668\u7684\u7d22\u5f15\u4f18\u5316\u6280\u5de7", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "SQL Server", "Microsoft SQL Server \u6570\u636e\u5e93\u7684\u67e5\u8be2\u4f18\u5316\u5668\u7684\u8fd0\u884c\u65b9\u5f0f"], "course_url": "http://cv4.jikexueyuan.com/575009f824e909744bfbe07a1265b7a6/201606211645/course/1901-2000/1909/video/5162_b_h264_sd_960_540.mp4"}, 75 | {"course_id": "http://www.jikexueyuan.com/course/2825_1.html?ss=1", "course_name": " \u82f9\u679c WWDC 2016 \u89e3\u8bfb", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "iOS", "\u82f9\u679c WWDC 2016 \u89e3\u8bfb"], "course_url": "http://cv4.jikexueyuan.com/368a1d20fb6c82650cf2c4f9fec783c3/201606211645/course/3401-3500/3407/video/c3407b_01_h264_sd_960_540.mp4"}, 76 | {"course_id": "http://www.jikexueyuan.com/course/2821_4.html?ss=1", "course_name": " \u9879\u76ee\u8fdb\u9636\u4e0e\u5206\u6790", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u9879\u76ee\u5b9e\u6218", "Gulp + Sass + Bootstrap \u7f51\u9875\u5f00\u53d1\u5b9e\u6218"], "course_url": "http://cv4.jikexueyuan.com/ec89a9c20f4da60df740a215ff40ce6d/201606211645/course/3201-3300/3206/video/9246_b_h264_sd_960_540.mp4"}, 77 | {"course_id": "http://www.jikexueyuan.com/course/2821_1.html?ss=1", "course_name": " \u6280\u672f\u6982\u8ff0", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u9879\u76ee\u5b9e\u6218", "Gulp + Sass + Bootstrap \u7f51\u9875\u5f00\u53d1\u5b9e\u6218"], "course_url": "http://cv4.jikexueyuan.com/461ea5feb762d5d6426fa5affddf9545/201606211645/course/3201-3300/3206/video/9243_b_h264_sd_960_540.mp4"}, 78 | {"course_id": "http://www.jikexueyuan.com/course/2821_3.html?ss=1", "course_name": " \u5355\u9875\u7f51\u7ad9\u9879\u76ee\u5b9e\u6218", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u9879\u76ee\u5b9e\u6218", "Gulp + Sass + Bootstrap \u7f51\u9875\u5f00\u53d1\u5b9e\u6218"], "course_url": "http://cv4.jikexueyuan.com/4f897ced575a987d10245c47d98d6a13/201606211645/course/3201-3300/3206/video/9245_b_h264_sd_960_540.mp4"}, 79 | {"course_id": "http://www.jikexueyuan.com/course/2815_4.html?ss=1", "course_name": " \u56fe\u7247\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Android \u5feb\u901f\u5f00\u53d1\u6846\u67b6\u4e4b xUtils3"], "course_url": "http://cv4.jikexueyuan.com/275756b59b406bfbf1b17db33af734c7/201606211645/course/3201-3300/3261/video/9369_b_h264_sd_960_540.mp4"}, 80 | {"course_id": "http://www.jikexueyuan.com/course/2821_2.html?ss=1", "course_name": " \u4f7f\u7528 yo \u642d\u5efa\u4e00\u4e2a Gulp + Sass + Bootstrap \u9879\u76ee", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "\u9879\u76ee\u5b9e\u6218", "Gulp + Sass + Bootstrap \u7f51\u9875\u5f00\u53d1\u5b9e\u6218"], "course_url": "http://cv4.jikexueyuan.com/33b92c3cd25a0059a3d1b6aaf98b8f82/201606211645/course/3201-3300/3206/video/9244_b_h264_sd_960_540.mp4"}, 81 | {"course_id": "http://www.jikexueyuan.com/course/2815_2.html?ss=1", "course_name": " \u6ce8\u89e3\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Android \u5feb\u901f\u5f00\u53d1\u6846\u67b6\u4e4b xUtils3"], "course_url": "http://cv4.jikexueyuan.com/dc2cbd00c65ae52c70e68b27c8e820f3/201606211645/course/3201-3300/3261/video/9367_b_h264_sd_960_540.mp4"}, 82 | {"course_id": 
"http://www.jikexueyuan.com/course/2815_1.html?ss=1", "course_name": " xUtils3 \u7b80\u4ecb", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Android \u5feb\u901f\u5f00\u53d1\u6846\u67b6\u4e4b xUtils3"], "course_url": "http://cv4.jikexueyuan.com/03acf25187ebb03ece4f8355c4ab984a/201606211645/course/3201-3300/3261/video/9366_b_h264_sd_960_540.mp4"}, 83 | {"course_id": "http://www.jikexueyuan.com/course/2815_3.html?ss=1", "course_name": " \u7f51\u7edc\u6a21\u5757", "course_path": ["\u9996\u9875", "\u804c\u4e1a\u8bfe\u7a0b\u5e93", "Android", "Android \u5feb\u901f\u5f00\u53d1\u6846\u67b6\u4e4b xUtils3"], "course_url": "http://cv4.jikexueyuan.com/869dda36e9a6f5bdb67511cf923087c9/201606211645/course/3201-3300/3261/video/9368_b_h264_sd_960_540.mp4"}] -------------------------------------------------------------------------------- /JiKeXueYuan/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = JiKeXueYuan.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = JiKeXueYuan 12 | -------------------------------------------------------------------------------- /OneSpider/OneSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/OneSpider/OneSpider/__init__.py -------------------------------------------------------------------------------- /OneSpider/OneSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class PhotoItem(scrapy.Item): 11 | title = scrapy.Field() 12 | image_urls = scrapy.Field() 13 | date = scrapy.Field() 14 | motto = scrapy.Field() 15 | 16 | class ArticleItem(scrapy.Item): 17 | title = scrapy.Field() 18 | content = scrapy.Field() 19 | date = scrapy.Field() 20 | 21 | class AskItem(scrapy.Item): 22 | title = scrapy.Field() 23 | date = scrapy.Field() 24 | question = scrapy.Field() 25 | answer = scrapy.Field() 26 | -------------------------------------------------------------------------------- /OneSpider/OneSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | # -*- coding: utf-8 -*- 10 | 11 | import pymongo 12 | from scrapy.exceptions import DropItem 13 | from scrapy.conf import settings 14 | from OneSpider.items import PhotoItem, ArticleItem, AskItem 15 | from scrapy.pipelines.images import ImagesPipeline 16 | 17 | class DateFormatPipeline(object): 18 | 19 | def process_item(self, item, spider): 20 | a = item['date'].replace(u'月','') 21 | tmp = a[:2] + '/' + a[2:] 22 | day = tmp.split('/')[0].strip() 23 | month = tmp.split('/')[1].strip() 24 | year = tmp.split('/')[2].strip() 25 | item['date'] = year+'/'+month+'/'+day 26 | return item 27 | 28 | 29 | class MongoDBPipeline(object): 30 | 31 | def __init__(self): 32 | self.connection = 
pymongo.MongoClient( 33 | settings['MONGODB_SERVER'], 34 | settings['MONGODB_PORT'] 35 | ) 36 | self.db = self.connection[settings['MONGODB_DB']] 37 | 38 | def process_item(self, item, spider): 39 | vaild = True 40 | 41 | for data in item: 42 | if not data: 43 | vaild = False 44 | raise DropItem("Missing {0}!".format(data)) 45 | if vaild: 46 | 47 | if isinstance(item, PhotoItem): 48 | self.collection = self.db['photo'] 49 | self.collection.insert(dict(item)) 50 | elif isinstance(item, ArticleItem): 51 | self.collection = self.db['article'] 52 | self.collection.insert(dict(item)) 53 | elif isinstance(item, AskItem): 54 | self.collection = self.db['ask'] 55 | self.collection.insert(dict(item)) 56 | else: 57 | raise DropItem("Error") 58 | return item -------------------------------------------------------------------------------- /OneSpider/OneSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for OneSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'OneSpider' 13 | 14 | SPIDER_MODULES = ['OneSpider.spiders'] 15 | NEWSPIDER_MODULE = 'OneSpider.spiders' 16 | 17 | MONGODB_SERVER = "localhost" 18 | MONGODB_PORT = 27017 19 | MONGODB_DB = "ones" 20 | 21 | 22 | IMAGES_STORE = '.' 23 | 24 | 25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 26 | #USER_AGENT = 'OneSpider (+http://www.yourdomain.com)' 27 | 28 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 29 | #CONCURRENT_REQUESTS=32 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | #DOWNLOAD_DELAY=3 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 37 | #CONCURRENT_REQUESTS_PER_IP=16 38 | 39 | # Disable cookies (enabled by default) 40 | #COOKIES_ENABLED=False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED=False 44 | 45 | # Override the default request headers: 46 | #DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | #} 50 | 51 | # Enable or disable spider middlewares 52 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 53 | #SPIDER_MIDDLEWARES = { 54 | # 'OneSpider.middlewares.MyCustomSpiderMiddleware': 543, 55 | #} 56 | 57 | # Enable or disable downloader middlewares 58 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 59 | #DOWNLOADER_MIDDLEWARES = { 60 | # 'OneSpider.middlewares.MyCustomDownloaderMiddleware': 543, 61 | #} 62 | 63 | # Enable or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | 'OneSpider.pipelines.ImagesPipeline': 100, 73 | 
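    # The 'OneSpider.pipelines.ImagesPipeline' entry above refers to the stock
    # ImagesPipeline that pipelines.py imports from scrapy.pipelines.images, so the
    # dotted path still resolves. Lower numbers run first, so images are fetched
    # before the DateFormatPipeline and MongoDBPipeline below see the item.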
'OneSpider.pipelines.DateFormatPipeline': 200, 74 | 'OneSpider.pipelines.MongoDBPipeline': 300, 75 | } 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 80 | #AUTOTHROTTLE_ENABLED=True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY=5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY=60 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG=False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED=True 91 | #HTTPCACHE_EXPIRATION_SECS=0 92 | #HTTPCACHE_DIR='httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 94 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 95 | -------------------------------------------------------------------------------- /OneSpider/OneSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /OneSpider/OneSpider/spiders/myspider.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import scrapy 4 | from scrapy.spiders import Spider 5 | from scrapy.selector import Selector 6 | from OneSpider.items import PhotoItem, ArticleItem, AskItem 7 | 8 | class MySpider(Spider): 9 | name = "ones" 10 | allowed_domains = ["caodan.org"] 11 | start_urls = [ 12 | # "http://caodan.org/page/%d" % i for i in xrange(1, 1340) 13 | "http://caodan.org/page/%d" % i for i in xrange(1, 3) 14 | 15 | ] 16 | 17 | def parse(self, response): 18 | html = Selector(response).xpath("//div[@class='content']/h1[@class='entry-title']/a/@href").extract() 19 | # html 为列表, 其中有3个元素, 分别是 photo, article, ask 页面的链接 20 | 21 | ## 请求图片详情页 22 | yield scrapy.Request( 23 | url = html[0], 24 | callback = self.parse_photo 25 | ) 26 | 27 | ## 请求文章详情页 28 | yield scrapy.Request( 29 | url = html[1], 30 | callback = self.parse_article 31 | ) 32 | 33 | ## 请求问题详情页 34 | yield scrapy.Request( 35 | url = html[2], 36 | callback = self.parse_ask 37 | ) 38 | 39 | 40 | def parse_photo(self, response): 41 | sel = Selector(response) 42 | 43 | item = PhotoItem() 44 | item['title'] = sel.xpath('//h1/text()').extract()[0] 45 | item['date'] = sel.xpath('//div[@class="date"]//p').xpath("string(.)").extract()[0] 46 | item['image_urls'] = sel.xpath('//div[@class="entry-content"]//img/@src').extract() 47 | item['motto'] = sel.xpath('//blockquote/p/text()').extract()[0] 48 | yield item 49 | 50 | def parse_article(self, response): 51 | sel = Selector(response) 52 | 53 | item = ArticleItem() 54 | item['title'] = sel.xpath('//h1/text()').extract()[0] 55 | item['date'] = sel.xpath('//div[@class="date"]//p').xpath("string(.)").extract()[0] 56 | item['content'] = sel.xpath('//div[@class="entry-content"]').xpath("string(.)").extract()[0] 57 | yield item 58 | 59 | 60 | def parse_ask(self, response): 61 | sel = Selector(response) 62 | 63 | item = AskItem() 64 | item['title'] = sel.xpath('//h1/text()').extract()[0] 65 | item['date'] 
= sel.xpath('//div[@class="date"]//p').xpath("string(.)").extract()[0] 66 | item['question'] = sel.xpath('//div[@class="cuestion-contenido"]/text()').extract()[0] 67 | item['answer'] = sel.xpath('//div[@class="cuestion-contenido"]')[1].xpath("string(.)").extract()[0] 68 | yield item -------------------------------------------------------------------------------- /OneSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = OneSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = OneSpider 12 | -------------------------------------------------------------------------------- /QiuShiBaiKe/QiuShiBaiKe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/QiuShiBaiKe/QiuShiBaiKe/__init__.py -------------------------------------------------------------------------------- /QiuShiBaiKe/QiuShiBaiKe/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QiushibaikeItem(scrapy.Item): 12 | author = scrapy.Field() 13 | content = scrapy.Field() -------------------------------------------------------------------------------- /QiuShiBaiKe/QiuShiBaiKe/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class QiushibaikePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /QiuShiBaiKe/QiuShiBaiKe/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for QiuShiBaiKe project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'QiuShiBaiKe' 13 | 14 | SPIDER_MODULES = ['QiuShiBaiKe.spiders'] 15 | NEWSPIDER_MODULE = 'QiuShiBaiKe.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'QiuShiBaiKe.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'QiuShiBaiKe.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | #ITEM_PIPELINES = { 65 | # 'QiuShiBaiKe.pipelines.SomePipeline': 300, 66 | #} 67 | 68 | # Enable and configure the AutoThrottle extension (disabled by default) 69 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 70 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 71 | #AUTOTHROTTLE_ENABLED=True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY=5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY=60 76 | # Enable showing throttling stats for every response received: 77 | #AUTOTHROTTLE_DEBUG=False 78 | 79 | # Enable and configure HTTP caching (disabled by default) 80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 81 | #HTTPCACHE_ENABLED=True 82 | #HTTPCACHE_EXPIRATION_SECS=0 83 | #HTTPCACHE_DIR='httpcache' 84 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 85 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 86 | -------------------------------------------------------------------------------- /QiuShiBaiKe/QiuShiBaiKe/spiders/MySpdier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from 
scrapy.spiders import Spider 3 | from scrapy.selector import Selector 4 | from QiuShiBaiKe.items import QiushibaikeItem 5 | 6 | class MySpider(Spider): 7 | name = "xiaohua" 8 | allowed_domains = ["qiushibaike.com"] 9 | start_urls = [ 10 | "http://www.qiushibaike.com/8hr/page/%s" % i for i in range(1, 3) 11 | ] 12 | 13 | def parse(self, response): 14 | xhs = Selector(response).xpath('//div[@class="article block untagged mb15"]') 15 | print xhs 16 | 17 | for xh in xhs: 18 | item = QiushibaikeItem() 19 | item['author'] = xh.xpath('div/a[2]/h2/text()').extract_first() 20 | item['content'] = xh.xpath('div[@class="content"]').xpath("string(.)").extract_first().strip() 21 | yield item 22 | -------------------------------------------------------------------------------- /QiuShiBaiKe/QiuShiBaiKe/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /QiuShiBaiKe/data.json: -------------------------------------------------------------------------------- 1 | [{"content": "堂弟94年的,有个女孩在倒追,每天电话不断,都说女追男隔层纱,我就奇怪弟弟怎么一点都不动心,反而对她的电话不胜其烦,于是我帮他接了一个,才刚“喂”了一声,那边一听是个女声立马开启咆哮模式:“XXX!你几个意思!”我终于知道为啥这层纱死活捅不破了,太彪悍了。", "author": "西瓜蔓。"}, 2 | {"content": "厂里来了几个暑假工妹子 都十五六岁的那种 她们每天都叫我厨师叔叔或者大叔 本人二十八 隔⋯⋯今天有两个小姑娘要去外厂返工 可能会回来的晚 让我给她们留饭菜 我说叫句好听的就留 本想着叫帅哥啊 或者帅叔叔什么 小姑娘开口就很嗲的叫了句 “老公给我留饭菜好不好”⋯现在的小姑娘也太奔放了吧 我半天都没有反应过来。", "author": "来杯提莫酱"}, 3 | {"content": "侄子前些天神秘兮兮地对我说,他发现了一个秘密 。我问是啥?他告诉我,他发现一到晚上月亮就会跟着他,他走月亮也走,他停月亮也停……估计是月亮上的嫦娥看上他了,想跟他处对象。。。", "author": "永远十八岁a"}, 4 | {"content": "昨晚雨下的真大,IZ仓库也进水了,前两天进的一批货全泡汤.,为减少损失,我和老婆从夜里两点一直干到早上八点,累的精彼力尽,满身泥水。二货老婆在那嘟囔:素贞呀收手吧,看我老公这熊样能是你家许仙?你个二逼干活去,小心法海收了你。哈哈。。。苦中取乐,别拉让我哭会。", "author": "荷尔蒙abc"}, 5 | {"content": "LZ微胖男,割割更健康!今天穿了一条较小的裤子,在公司蹬下捡文件时“滋啦“一声,你猜得没错,裤裆开了!瞬间六十多人的办公区安静了,三秒后轰然大笑,最过分的是几个平时经常开玩笑的妇女要过来找蛋蛋!我那个去!", "author": null}, 6 | {"content": "今天北京下雨,一个男同事被淋的很厉害,说:内裤都湿了。接着拿出自己在公司的上衣到厕所换下来了,结果刚一回来,一个女同事大喊:他没穿内裤,我们顿时萌币了,纷纷问她怎么看出来了,她说:他拿着内裤呢,那肯定就是没穿了。谁在公司还有那么多内裤。我们笑尿了。", "author": "就把我当个笑话"}, 7 | {"content": "女生生气的过程。", "author": "XxZ小贝壳"}, 8 | {"content": "说个我考驾照的糗事吧!科目二时有个项目是上坡定点,轮到我时坡上去了!点没定住,然后车开始往后退,我吓一跳!立马去拉车门想往下跳,在一片哄笑声中,教练踩住了刹车,问我好玩不?我只想说,你们开心就好!!!", "author": "你的青春恐高吗"}, 9 | {"content": "我家路由器在客厅,没心情割…在卧室侧躺看糗百,面对着门口wifi信号是满的!背对着门口wifi信号就没了!擦!我挡着信号了!", "author": "daqinghml"}, 10 | {"content": "在吃零食,弟弟突然对我说:姐!你这样咋嫁的出去哦!!我马上瞪着他说:你再说一遍!!他立马就说:你长得又漂亮,又那么高哪个敢喜欢你嘛!听完我放开了揪着他耳朵的手露出了大大的微笑。:)", "author": "共产主义接班人。"}, 11 | {"content": "我的妞吃炒饭一直用手抓,我火了“勺子干嘛的?”她委屈地说“烫!”呃,闺女,你确定用勺子比手抓烫吗?", "author": "缘起缘落冷冷清清"}, 12 | {"content": "一女同事,早上一直流鼻涕!中午吃饭,我跟她说,外面太阳那么大,出去晒晒一会就好!结果她吃完饭,真的偷偷出去晒了有20来分钟!就刚刚她就说头有点晕,可能中暑了!准备去请假,话还没说完就晕了!大中午35度高温正常人也禁不住20多分钟暴晒!!!……想问问这个情况我最高判几年!!!", "author": "ふ夜子"}, 13 | {"content": "你经历过绝望嘛?那就来说说我的一次经历:在工地上班,干着干着活想蹲大号,就找了一个带卷闸门的地方,痛快的解决了,突然门响了,一点一点升起来,这时看到施工员和监理惊讶的表情,其实我内心是崩溃的,,,,", "author": "睡中蛟龙"}, 14 | {"content": "下雨了,我打着伞在路边等车,身后急匆匆的跑来一个男子,跺着脚站在了我的身旁。我见他浑身雨水,就把伞让给了他一半。他问我:“大哥,哪里有公共厕所?”我指着前方五十米处说:“那边有一个”他撒丫子就跑~我以为他憋的不轻!想把兜里的手纸给他,可是,一摸兜,手纸没了!...", "author": "阿·木"}, 15 | {"content": "有糗友说你把头像换了吧!看了晚上会做恶梦。我想说:你就知足吧,这是我最漂亮的照片了!我老公现在吃饭都不看我,怕吃不下。。。", "author": "山那边的野刺莓"}, 16 | {"content": "公司每周三发水果,今天,就今天,后勤李师傅买了一箱榴莲,刚分好,快递小哥来送快递,看到他鄙视的眼神,整个办公室凌乱了,捏着鼻子跑了出去,正好楼主出去抽烟,听快递小哥在打电话,说16楼的那些人在吃屎……吃屎!", "author": "转身泪湿双眼"}, 17 | {"content": 
"宿舍一起有一哥们,早上起床特别早,五点闹钟响就起床,这几天连续工作17小时,他还是一如既往,昨晚趁他洗澡,把他闹钟改成了三点,结果还真醒了,他起床跑步去了,我们终于可以睡到自然醒了。", "author": "岚盛船舶"}, 18 | {"content": "昨晚送表妹回家,到了楼下,表妹:“车停好吧,上去坐坐!” 本是客套话,lz不自觉的来了句:“哪个做做?” 表妹愣了有一会,咆哮道:“去死吧你!” 头上还挨了两记暴粟……吗的,真要命!", "author": "布鲁斯·刘能"}, 19 | {"content": "坐公车时靠着窗户,迷迷糊糊睡着了!还做了个梦:梦见自己正在钓鱼,当看见鱼漂突然下沉时,我猛的伸手去抓鱼竿!结果,一把抓在了坐我旁边的阿姨大腿上!我现在好怕,如果要我负责咋办!~~", "author": "(糗名昭著)~老王"}, 20 | {"content": "简短就不割了,留着用--就刚刚,去一家小卖部买水,老板娘给拿的,她两岁的儿子看到我就说爸爸走了,一直说,我问他你爸爸去哪了,他说上班去了,然后我买了水就出来了,我是不是错过了什么", "author": "流长水绿改不山青"}, 21 | {"content": "在电子城买器材,售货员是一个很漂亮的妹纸。买完单,我忍不住随口问了她一句:“哎,小妹,你和你们老板是亲戚么?”妹纸一边打包,一边回头得意得跟我说,“老板是我爸爸!”我默默的点点头,转身就走~其实吧,妹纸啊,你算错帐啦,少收我150块,反正老板是你爹,想来是不会揍你的!!!", "author": "<糗犯监狱>~入库"}, 22 | {"content": "同事新买的房子买的顶层,33楼,乔迁的时候单位人去了20多人去他家吃饭,不幸的是停电了,我们男女老少浩浩荡荡的爬楼梯到了他家,他一脸歉意的看着我们。有同事就问了,怎么样?后悔买这么高楼层了吧?同事坚定的摇摇头说,不后悔,我讨厌别人在我头上拉屎的感觉。擦,我们一行人竟然无言以对!", "author": "钢的鼓"}, 23 | {"content": "前段时间我姐家再添一男丁,姐夫就愁眉苦脸的说要把弟弟去换个妹妹回来,六岁大侄子跳起来就给他老爸一拳,大骂“换,换尼玛的皮” 我的天啊 当时我抱起大侄子就跑了 头都没敢回!!", "author": "屎都是芒果味"}, 24 | {"content": "今天接了一个民间借贷电话,妹子:“先生您好!请问需要借贷吗?”本着糗百精神,我问:“能贷两个亿吗?”妹子一本正经的回到:“我们只接受各种动产、不动产抵押,不接受天安门广场等名胜古迹!”我:“我擦!自己人!有时间约个饭呗?”", "author": "名字不要太二"}, 25 | {"content": "有一次,朋友点了两个一样的青菜,我问他为什么,他竟然回答说:好吃啊!跟吃草一样。当时我就想他是不是牛妖变的", "author": "淡流年@"}, 26 | {"content": "我姐一大早就扎着个歪歪的辫子把脚翘在电脑桌上玩游戏到现在,刚才有人打她电话,她就按了免提一边玩一边跟人说,女声:~~,出来遛弯啦,南街新开了一家涮锅店,我们准备去吃,你来不?我姐:不去了,我爸妈今天不在家,还得给我弟做饭呢!女声:有个姐姐真好啊!……刚打完电话,我姐就冲我吼道:小渣,去泡两包方便面过来!要海鲜味的!", "author": "游泳池水不好喝"}, 27 | {"content": "戒烟一个星期了,媳妇儿说:“你还是抽吧!”我问:“为什么?”媳妇儿说:“你一人戒烟全家都遭殃。不但脾气坏,零食多,就连饭量都大了!”于是。我像犯了错误似的拿起了一支烟......", "author": "阿·木"}, 28 | {"content": "妹纸身高165,今天穿了双12厘米的鞋子...割...走在上班路上等红绿灯的时候,突然一辆车停在我面前,我站在司机副驾驶外面的位置,只见司机都快趴到副驾驶座位上了。就为了看我的脸一眼。我特么还没来得及得瑟,这货瞬间很失望的扭过头,猛一踩油门,走了,留下我和一股黑烟在风中凌乱......", "author": "雪妹纸Phybee"}, 29 | {"content": "今天上公交车,给个老大爷让座,老大爷竟然说你上班挺累的,不用特意给我让座。。。顿时泪奔๐·°(৹˃̵﹏˂̵৹)°·๐", "author": "鑫鑫的"}, 30 | {"content": "刚流行QQ聊天的时候我正青春年少,那时候还没有摄像头,在看不见对方长相的情况下也遇到不少漂亮女孩,当然也有例外,一个叫雨后彩虹的网友见面才知道是一剽悍生猛的离异大妈,比lz大将近20岁啊。拎着lz去开房,lz苦口婆心的跟她谈了半宿人生最后还是被她上了,最后的挣扎楼主说去买套套,当大妈从兜里拽出一联十个套套的时候,lz才彻底绝望了。", "author": null}, 31 | {"content": "出租车3公里,收费四十,打运管部门投诉,接线是运管局某领导“我出门又没打过车,我怎么知道出租车收费高不高。”管理部门说出这话,当时真无言以对。", "author": null}, 32 | {"content": "黄昏在公园散步,远处迎面走来一个络腮胡,定睛一看顿时吓尿:胡子上面没有嘴没有鼻子没有眼睛眉毛!整个脸光滑如镜!妈的难道遇上无脸人了?正惊慌失措时,“无脸人”已经走到面前……我说大叔!作为一名严重秃顶人士,锻炼身体就不要用倒走这种方式了好吗?您慢慢锻炼着,我回家换裤子去……", "author": "专治各种骚货"}, 33 | {"content": "初中时候读过一年寄宿学校。洗漱池子是一排共用的,第一天晚上刷牙洗脸,忘了把牙膏带回寝室,早上起来想起来,完蛋了,刚买的新牙膏,肯定被别人拿走了…结果去池子一看,还好,牙膏还在,没人拿,只是挤不出牙膏了…", "author": "花和尚*胡扒衣"}, 34 | {"content": "去迪士尼玩,人爆多,各个游玩的都需要排队至少三十分钟以上,老公想去漂流,需要排队一个小时,我说不去了,好多人,好难排的,他说之前两个小时你都排了,这个一个小时就嫌时间长了?我:包里没零食了,60分钟好无聊的!", "author": "二女子、"}, 35 | {"content": "唐山一女子跑到肯德基拍照,拍在肯德基里就餐的人,大声呵斥在内就餐人员:美国欺负中国,你们竟然在美国的肯德基店里就餐,你们这是在帮美国人造子弹!一名就餐的人说,你先把你拍照的苹果手机砸了,我们不吃了立刻出去。别的就餐者全符合这人说的,然后该女子望了望自己的苹果手机懵逼的不知所措了……", "author": "如我遇见你"}, 36 | {"content": "老爸是个忠实的手机斗地主迷,怎么体现呢?我刚才把无线网关了,老爸竟然因为那局牌输了拿起了十多年没碰过的鸡毛掸子…啥也不说了,跑!", "author": "大齊葩"}, 37 | {"content": "满满的套路", "author": "命中注定缺心眼"}, 38 | {"content": "A:“你碰到的最纠结的事是什么?”B:“网购的东西很便宜,但取快递的地方离家有点远的时候。”A:“?”B:“去取,车费比邮件贵;拒收,邮费比邮件贵。”", "author": "岁月荒芜"}, 39 | {"content": "发小女汉子,有天加班回家晚了,遇见一小黄毛手持水果刀打劫,发小那暴脾气:小兄弟,劫财还是劫色,劫财没有,劫色就快点,老娘还要回家伺候老公。说完就做势要脱衣解扣,小黄毛一看这架式一脸懵逼,刀一扔居然吓跑了。", "author": "望柱堂堂主"}, 40 | {"content": "刚刚在网吧准备通宵,然后一小孩走过来坐我旁边,过了一会好像是他妈打电话过来了,问他在哪,小孩:我还在培训班呢。他妈不信,小孩急了我旁边道:这道题等于9,会了不?他妈信了,挂了电话过了一会,我老婆对啊电话过来了老婆:在哪呢?还不回来?我:在网吧呢,老婆不信旁边小孩一声大叫:网管,加钱", "author": "蓝调乱想曲。"}] -------------------------------------------------------------------------------- /QiuShiBaiKe/readme.md: 
-------------------------------------------------------------------------------- 1 | ## 糗事百科热门笑话 2 | 3 | 爬取糗事百科热门笑话, 字段为`author`和`content`, 示例文件是抓取前两页 4 | 5 | 想要抓取更多的页修改 MySpider.py 文件中的 6 | 7 | ``` python 8 | start_urls = [ 9 | "http://www.qiushibaike.com/8hr/page/%s" % i for i in range(1, 3) 10 | ] 11 | ``` 12 | `range(1, 3)` 中的 3 即可。 -------------------------------------------------------------------------------- /QiuShiBaiKe/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = QiuShiBaiKe.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = QiuShiBaiKe 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy Projects 2 | 3 | ## OnesSpider 4 | 该爬虫是将[一个](http://caodan.org)网站上的图片,问题,文章全部抓取下来,预计耗时1个小时,大约1300页面。 5 | 6 | ## dbMoviesTop250 7 | 抓取豆瓣排行前250名电影信息 8 | 9 | ## JiKeXueYuan 10 | 抓取极客学院整站视频, 大约8000多个(侵立删) 11 | 12 | ## zhiHuSpider 13 | 抓取知乎用户信息,主要参考, (侵立删) 14 | 15 | ## QiuShiBaiKe 16 | 抓取糗事百科热门笑话 author, content 17 | 18 | ## ShiFuTu 19 | 抓取十幅图id, title, imgurl 并下载图片 20 | 21 | ## IpProxy 22 | 抓取免费的ip代理 23 | 24 | ## LitterLove 25 | 抓取新片场的小情书系列视频 26 | 27 | ## DouTu 28 | 抓取斗图网站上图片 29 | -------------------------------------------------------------------------------- /ShiFuTu/ShiFuTu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/ShiFuTu/ShiFuTu/__init__.py -------------------------------------------------------------------------------- /ShiFuTu/ShiFuTu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ShifutuItem(scrapy.Item): 12 | 13 | id = scrapy.Field() 14 | title = scrapy.Field() 15 | imgurl = scrapy.Field() 16 | 17 | -------------------------------------------------------------------------------- /ShiFuTu/ShiFuTu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import scrapy 8 | from scrapy.pipelines.images import ImagesPipeline 9 | 10 | class ShifutuPipeline(ImagesPipeline): 11 | def file_path(self, request, response=None, info=None): 12 | item = request.meta['item'] 13 | filename = u'{0}.jpg'.format(item['title']) 14 | return filename 15 | def get_media_requests(self, item, info): 16 | yield scrapy.Request(item['imgurl'], meta={'item': item}) 17 | def item_completed(self, results, item, info): 18 | return item -------------------------------------------------------------------------------- /ShiFuTu/ShiFuTu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for ShiFuTu project 4 | # 5 | # For simplicity, this file contains only settings 
considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ShiFuTu' 13 | 14 | SPIDER_MODULES = ['ShiFuTu.spiders'] 15 | NEWSPIDER_MODULE = 'ShiFuTu.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'ShiFuTu (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'ShiFuTu.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'ShiFuTu.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | EXTENSIONS = { 62 | 'scrapy.extensions.telnet.TelnetConsole': None, 63 | } 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'ShiFuTu.pipelines.SomePipeline': 300, 69 | #} 70 | IMAGES_STORE = 'images' 71 | ITEM_PIPELINES = { 72 | 'ShiFuTu.pipelines.ShifutuPipeline': 300, 73 | } 74 | 75 | # Enable and configure the AutoThrottle extension (disabled by default) 76 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 77 | #AUTOTHROTTLE_ENABLED = True 78 | # The initial download delay 79 | #AUTOTHROTTLE_START_DELAY = 5 80 | # The maximum download delay to be set in case of high latencies 81 | #AUTOTHROTTLE_MAX_DELAY = 60 82 | # The average number of requests Scrapy should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 95 | 
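The ShifutuPipeline defined in pipelines.py above names every downloaded image after the item's title, which can break when a title contains characters that are illegal in file names, and it keeps items even when the download failed. A minimal hardened variant is sketched below as an assumption on top of the original project (the class name SafeShifutuPipeline and the sanitising rule are hypothetical, not code that ships with the repo). To try it, point the ShiFuTu.pipelines entry in ITEM_PIPELINES at this class instead.

```python
# -*- coding: utf-8 -*-
# Hypothetical hardening of ShifutuPipeline; assumes the same ShifutuItem
# fields (id, title, imgurl) and the IMAGES_STORE setting shown above.
import re

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class SafeShifutuPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        # Strip characters that are not allowed in file names before using
        # the title as the image name.
        safe_title = re.sub(r'[\\/:*?"<>|]+', '_', item['title']).strip()
        return u'{0}.jpg'.format(safe_title)

    def get_media_requests(self, item, info):
        # Pass the item along so file_path can read its title.
        yield scrapy.Request(item['imgurl'], meta={'item': item})

    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples; drop the
        # item if none of its image downloads succeeded.
        if not any(ok for ok, _ in results):
            raise DropItem("Image download failed for %s" % item.get('imgurl'))
        return item
```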
-------------------------------------------------------------------------------- /ShiFuTu/ShiFuTu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ShiFuTu/ShiFuTu/spiders/myspider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from ShiFuTu.items import ShifutuItem 3 | from scrapy.selector import Selector 4 | 5 | class ShiFuTuSpider(scrapy.spiders.Spider): 6 | name = "ShiFuTu" 7 | allowed_domains = ["www.10futu.com"] 8 | start_urls = [ 9 | "http://www.10futu.com/", 10 | ] 11 | 12 | def parse(self, response): 13 | for sel in Selector(response).xpath("//div[@id>1]"): 14 | item = ShifutuItem() 15 | item['id'] = sel.xpath('./@id').extract()[0] 16 | item['title'] = sel.xpath('div[2]/a/img/@alt').extract()[0] 17 | item['imgurl'] = sel.xpath('div[2]/a/img/@src').extract()[0] 18 | yield item -------------------------------------------------------------------------------- /ShiFuTu/ShiFuTu/spiders/myspider2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from ShiFuTu.items import ShifutuItem 3 | from scrapy.selector import Selector 4 | 5 | class ShiFuTuSpider2(scrapy.spiders.Spider): 6 | name = "ShiFuTu2" 7 | allowed_domains = ["www.10futu.com"] 8 | start_urls = [ 9 | "http://www.10futu.com/ten_info.php?id=%d" % i for i in xrange(1, 2) 10 | ] 11 | 12 | def parse(self, response): 13 | item = ShifutuItem() 14 | sel = Selector(response) 15 | item['id'] = response.url[38:] 16 | item['title'] = sel.xpath("//h1[@class='info2_h1']/text()").extract()[0] 17 | item['imgurl'] = sel.xpath("//div[@class='news_info']/img/@src").extract()[0] 18 | return item -------------------------------------------------------------------------------- /ShiFuTu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ShiFuTu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ShiFuTu 12 | -------------------------------------------------------------------------------- /ZhiHuUser/README.md: -------------------------------------------------------------------------------- 1 | ## 使用方法 2 | 3 | 1. 在mySpider中找到`email`和`password`字段修改成自己的用户名和密码 4 | 5 | 2. 在项目目录下输入`scrapy crawl users -a url=https://www.zhihu.com/people/ -o data.json` 待程序运行完毕后即可在当前目录下的data.json中找到抓取的数据。 6 | 7 | 3. 
主要参考代码 [scrapy-zhihu-users](https://github.com/ansenhuang/scrapy-zhihu-users)。 (侵立删) 8 | 9 | ## TODO: 10 | 11 | - [ ] 处理知乎反爬 12 | 13 | ![users](http://ww4.sinaimg.cn/large/5e515a93jw1f63xqh6w4hj21kw0omaq0.jpg) 14 | -------------------------------------------------------------------------------- /ZhiHuUser/ZhiHuUser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/ZhiHuUser/ZhiHuUser/__init__.py -------------------------------------------------------------------------------- /ZhiHuUser/ZhiHuUser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class UserItem(Item): 12 | url = Field() 13 | name = Field() 14 | bio = Field() 15 | location = Field() 16 | business = Field() 17 | gender = Field() 18 | avatar = Field() 19 | education = Field() 20 | major = Field() 21 | employment = Field() 22 | position = Field() 23 | content = Field() 24 | ask = Field() 25 | answer = Field() 26 | agree = Field() 27 | thanks = Field() 28 | followee_count = Field() 29 | follower_count = Field() -------------------------------------------------------------------------------- /ZhiHuUser/ZhiHuUser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ZhihuuserPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /ZhiHuUser/ZhiHuUser/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for ZhiHuUser project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ZhiHuUser' 13 | 14 | SPIDER_MODULES = ['ZhiHuUser.spiders'] 15 | NEWSPIDER_MODULE = 'ZhiHuUser.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | DOWNLOAD_DELAY=0.2 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | COOKIES_ENABLED=True 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | DEFAULT_REQUEST_HEADERS = { 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | 'Accept-Language': 'zh-CN,zh;q=0.8,en,q=0.6', 42 | 'Cache-Control': 'max-age=0', 43 | 'Connection': 'keep-alive', 44 | 'Host': 'www.zhihu.com' 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'ZhiHuUser.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'ZhiHuUser.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'ZhiHuUser.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 74 | #AUTOTHROTTLE_ENABLED=True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY=5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY=60 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG=False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED=True 85 | #HTTPCACHE_EXPIRATION_SECS=0 86 | #HTTPCACHE_DIR='httpcache' 87 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 89 | 90 | DEPTH_LIMIT = 10 91 | -------------------------------------------------------------------------------- /ZhiHuUser/ZhiHuUser/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ZhiHuUser/ZhiHuUser/spiders/mySpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import time 5 | from logging import log 6 | import json 7 | from urllib import urlencode 8 | 9 | import scrapy 10 | from scrapy import Spider 11 | from ZhiHuUser.items import UserItem 12 | from scrapy.selector import Selector 13 | from scrapy.shell import inspect_response 14 | 15 | class UserSpider(Spider): 16 | name = 'users' 17 | domain = 'https://www.zhihu.com' 18 | login_url = 'https://www.zhihu.com/login/email' 19 | _xsrf = '' 20 | 21 | def __init__(self, url = None): 22 | self.user_url = url 23 | 24 | def start_requests(self): 25 | yield scrapy.Request( 26 | url = self.domain, 27 | callback = self.request_captcha 28 | ) 29 | 30 | def request_captcha(self, response): 31 | # 获取_xsrf值 32 | self._xsrf = response.css('input[name="_xsrf"]::attr(value)').extract()[0] 33 | # 获取验证码地址 34 | t = str(int(time.time()*1000)) 35 | captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + t + '&type=login' 36 | 37 | # 准备下载验证码 38 | yield scrapy.Request( 39 | url = captcha_url, 40 | meta = { 41 | '_xsrf': self._xsrf 42 | }, 43 | callback = self.download_captcha 44 | ) 45 | 46 | def download_captcha(self, response): 47 | # 下载验证码 48 | with open('captcha.gif', 'wb') as fp: 49 | fp.write(response.body) 50 | # 用软件打开验证码图片 51 | os.system('open captcha.gif') 52 | # 输入验证码 53 | captcha = raw_input('Please enter captcha: ') 54 | 55 | yield scrapy.FormRequest( 56 | url = self.login_url, 57 | formdata = { 58 | 'email': '***********', 59 | 'password': '***********', 60 | '_xsrf': self._xsrf, 61 | 'remember_me': 'true', 62 | 'captcha': captcha 63 | }, 64 | callback = self.after_login 65 | ) 66 | 67 | 68 | def after_login(self, response): 69 | yield scrapy.Request( 70 | url = self.user_url, 71 | callback = self.parse_people, 72 | ) 73 | 74 | 75 | def parse_people(self, response): 76 | ''' 77 | function: 78 | 1. 抓取个人资料 79 | 2. 
提取该人的 followees 和 followers 链接并发送请求 80 | ''' 81 | 82 | sel = response.xpath('//div[@class="zm-profile-header ProfileCard"]') 83 | 84 | item = UserItem() 85 | item['url'] = response.url[:-6] 86 | item['name'] = sel.xpath('//div[@class="title-section ellipsis"]/span[@class="name"]/text()').extract_first() 87 | item['bio'] = sel.xpath('//div[@class="title-section ellipsis"]/span[@class="bio"]/@title').extract_first() 88 | item['location'] = sel.xpath('//span[@class="location item"]/@title').extract_first() 89 | item['business'] = sel.xpath('//span[@class="business item"]/@title').extract_first() 90 | item['gender'] = 0 if sel.xpath('//i[contains(@class, "icon-profile-female")]') else 1 91 | item['avatar'] = sel.xpath('//img[@class="Avatar Avatar--l"]/@src').extract_first() 92 | item['education'] = sel.xpath('//span[@class="education item"]/@title').extract_first() 93 | item['major'] = sel.xpath('//span[contains(@class, "education-extra")]/@title').extract_first() 94 | item['employment'] = sel.xpath('//span[contains(@class, "employment")]/@title').extract_first() 95 | item['position'] = sel.xpath('//span[contains(@class, "position")]/@title').extract_first() 96 | item['content'] = "".join(sel.xpath('//span[@class="fold-item"]/span[@class="content"]/text()').extract()).strip() 97 | item['ask'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[2]/span[@class="num"]/text()').extract_first()) 98 | item['answer'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[3]/span[@class="num"]/text()').extract_first()) 99 | item['agree'] = int(sel.xpath('//span[@class="zm-profile-header-user-agree"]/strong/text()').extract_first()) 100 | item['thanks'] = int(sel.xpath('//span[@class="zm-profile-header-user-thanks"]/strong/text()').extract_first()) 101 | item['followee_count'] = int(sel.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[@class="item"]/strong/text()').extract()[0]) 102 | item['follower_count'] = int(sel.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[@class="item"]/strong/text()').extract()[1]) 103 | 104 | 105 | ## 抓取 followees列表 (即该用户关注了谁) 106 | if item['followee_count'] != 0: 107 | yield scrapy.Request( 108 | url = self.user_url + '/followees', 109 | meta = { 110 | 'people_count': item['followee_count'], 111 | 'type': 'followee' 112 | }, 113 | callback = self.parse_follow, 114 | ) 115 | 116 | ## 抓取 followers列表 (即谁关注了该用户) 117 | if item['follower_count'] != 0: 118 | yield scrapy.Request( 119 | url = response.url + '/followers', 120 | meta = { 121 | 'people_count': item['follower_count'], 122 | 'type': 'follower' 123 | }, 124 | callback = self.parse_follow 125 | ) 126 | 127 | yield item 128 | 129 | def parse_follow(self, response): 130 | 131 | """ 132 | 1. 处理follow数据 (followee and follower), 即获取每一个用户信息 133 | 2. 
向获取更多列表数据发送请求 134 | """ 135 | 136 | sel = Selector(response) 137 | people_links = sel.xpath('//a[@class="zg-link"]/@href').extract() 138 | people_count = response.meta['people_count'] 139 | people_param = json.loads(sel.xpath('//div[@class="zh-general-list clearfix"]/@data-init').extract_first()) 140 | 141 | 142 | # 请求所有的人 143 | zhihu_ids = [] 144 | for people_url in people_links: 145 | zhihu_ids.append(os.path.split(people_url)[-1]) 146 | yield scrapy.Request( 147 | url = people_url, 148 | callback = self.parse_people 149 | ) 150 | 151 | ## 处理动态加载的用户(发送Ajax请求) 152 | 153 | if response.meta['type'] == 'followee': 154 | url = 'https://www.zhihu.com/node/ProfileFolloweesListV2' 155 | else: 156 | url = 'https://www.zhihu.com/node/ProfileFollowersListV2' 157 | 158 | 159 | # 请求所有的用户数据(更多列表 动态加载) 160 | start = 20 161 | while start < people_count: 162 | payload = { 163 | 'method': 'next', 164 | '_xsrf': self._xsrf, 165 | 'params': people_param['params'] 166 | } 167 | payload['params']['offset'] = start 168 | payload['params'] = json.dumps(payload['params']) 169 | start += 20 170 | 171 | yield scrapy.Request( 172 | url = url, 173 | method='POST', 174 | body=urlencode(payload), 175 | callback=self.parse_post_follow 176 | ) 177 | 178 | 179 | def parse_post_follow(self, response): 180 | """ 181 | 1. 获取动态请求拿到的人员 182 | """ 183 | body = json.loads(response.body) 184 | people_divs = body.get('msg', []) 185 | 186 | # 请求所有的人 187 | zhihu_ids = [] 188 | for div in people_divs: 189 | selector = Selector(text=div) 190 | link = selector.xpath('//a[@class="zg-link"]/@href').extract_first() 191 | print link 192 | if not link: 193 | continue 194 | 195 | zhihu_ids.append(os.path.split(link)[-1]) 196 | yield scrapy.Request( 197 | url=link, 198 | callback=self.parse_people, 199 | ) 200 | 201 | def parse_err(self, response): 202 | print 'crawl {} failed'.format(response.url) 203 | -------------------------------------------------------------------------------- /ZhiHuUser/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ZhiHuUser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ZhiHuUser 12 | -------------------------------------------------------------------------------- /dbMoviesTop250/dbMoviesTop250/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/dbMoviesTop250/dbMoviesTop250/__init__.py -------------------------------------------------------------------------------- /dbMoviesTop250/dbMoviesTop250/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class Dbmoviestop250Item(scrapy.Item): 11 | name = scrapy.Field() # 电影名字 12 | year = scrapy.Field() # 上映年份 13 | score = scrapy.Field() # 豆瓣分数 14 | director = scrapy.Field() # 导演 15 | classification = scrapy.Field() # 分类 16 | actor = scrapy.Field() # 演员 17 | image_urls = scrapy.Field() # 封面图片 -------------------------------------------------------------------------------- /dbMoviesTop250/dbMoviesTop250/pipelines.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | import scrapy 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.exceptions import DropItem  # raised in item_completed below 12 | class MyImagesPipeline(ImagesPipeline): 13 | 14 | def get_media_requests(self, item, info): 15 | for image_url in item['image_urls']: 16 | yield scrapy.Request(image_url, meta={'item': item}) 17 | 18 | def item_completed(self, results, item, info): 19 | image_paths = [x['path'] for ok, x in results if ok] 20 | if not image_paths: 21 | raise DropItem("Item contains no images") 22 | return item 23 | 24 | def file_path(self, request, response=None, info=None): 25 | item = request.meta['item'] 26 | name = item['name'] 27 | filename = u'{0}.jpg'.format(name) 28 | return filename -------------------------------------------------------------------------------- /dbMoviesTop250/dbMoviesTop250/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dbMoviesTop250 project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dbMoviesTop250' 13 | 14 | SPIDER_MODULES = ['dbMoviesTop250.spiders'] 15 | NEWSPIDER_MODULE = 'dbMoviesTop250.spiders' 16 | 17 | IMAGES_STORE = 'images' # images are stored in the 'images' folder under the current directory 18 | 19 | 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS=32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY=3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 33 | #CONCURRENT_REQUESTS_PER_IP=16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED=False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED=False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'dbMoviesTop250.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'dbMoviesTop250.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'dbMoviesTop250.pipelines.MyImagesPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 74 | #AUTOTHROTTLE_ENABLED=True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY=5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY=60 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG=False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED=True 85 | #HTTPCACHE_EXPIRATION_SECS=0 86 | #HTTPCACHE_DIR='httpcache' 87 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 89 | -------------------------------------------------------------------------------- /dbMoviesTop250/dbMoviesTop250/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dbMoviesTop250/dbMoviesTop250/spiders/mySpider.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import CrawlSpider, Rule 2 | from scrapy.linkextractors import LinkExtractor 3 | from scrapy.selector import Selector 4 | 5 | from dbMoviesTop250.items import Dbmoviestop250Item 6 | 7 | class MovieSpider(CrawlSpider): 8 | name = 'movies' 9 | allowed_domains = ['movie.douban.com'] 10 | start_urls = ['https://movie.douban.com/top250'] 11 | 12 | rules = [Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*'))), 13 | Rule(LinkExtractor(allow=(r'https://movie.douban.com/subject/\d+')), 14 | callback='parse_item', follow=False) 15 | ] 16 | 17 | def parse_item(self, response): 18 | 19 | sel = Selector(response) 20 | 21 | item = Dbmoviestop250Item() 22 | item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0] 23 | item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract()[0] 24 | item['score'] = sel.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()[0] 25 | item['director'] = sel.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()[0] 26 | item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()[0] 27 | item['actor'] = sel.xpath('//*[@id="info"]/span[3]//a/text()').extract()[0] 28 | item['image_urls'] = sel.xpath('//div[@id="mainpic"]/a[@class="nbgnbg"]/img/@src').extract() 29 | 30 | return item -------------------------------------------------------------------------------- /dbMoviesTop250/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = 
dbMoviesTop250.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dbMoviesTop250 12 | -------------------------------------------------------------------------------- /litterlove/data.csv: -------------------------------------------------------------------------------- 1 | date,description,image_url,length,title,video 2 | 2017-04-07,暗恋一个人的时候,连他的每一个脚步声都记得一清二楚,谁又能想到,结局会让人这样的惊喜。 ,http://cs.xinpianchang.com/uploadfile/article/2017/03/24/58d4b1fac4f94.jpeg,03:29,30 暗恋是一个人的独角戏,也可能是两个人的浪漫剧,"[{'resolution': '3840x2160', 3 | 'video_url': 'http://video.xinpianchang.com/58c94814c2816.mp4'}, {'resolution': '2560x1440', 4 | 'video_url': 'http://video.xinpianchang.com/58c94814cfbd0.mp4'}, {'resolution': '1920x1080', 5 | 'video_url': 'http://video.xinpianchang.com/58c94814e7beb.mp4'}, {'resolution': '1280x720', 6 | 'video_url': 'http://video.xinpianchang.com/58c948150ec8a.mp4'}, {'resolution': '960x540', 7 | 'video_url': 'http://video.xinpianchang.com/58c9481525bf4.mp4'}, {'resolution': '640x360', 8 | 'video_url': 'http://video.xinpianchang.com/58c94815418d5.mp4'}]" 9 | 2016-07-25,暗恋,是自作聪明后被识破的脸红。暗恋的美好,是心里偷偷喜欢,小小的悸动就组成一个世界。暗恋是排山倒海又不动声色,但庆幸那时的我们是如此勇敢、炙热...,http://cs.xinpianchang.com/uploadfile/article/2016/06/30/5774c5f3a91e1.jpeg,02:43,03 暗恋,是自作聪明后被识破的脸红,"[{'resolution': '1920x1080', 10 | 'video_url': 'http://video.xinpianchang.com/5774c577d9a82.mp4'}, {'resolution': '1280x720', 11 | 'video_url': 'http://video.xinpianchang.com/5774c578b7428.mp4'}, {'resolution': '960x540', 12 | 'video_url': 'http://video.xinpianchang.com/5774c578bef14.mp4'}, {'resolution': '640x360', 13 | 'video_url': 'http://video.xinpianchang.com/5774c578c6737.mp4'}]" 14 | 2016-07-25,有些人,越是在乎,就越不敢触碰因为知道,友情比爱情来得更长久所以,有些喜欢,只敢藏在心底。,http://cs.xinpianchang.com/uploadfile/article/2016/06/30/5774c74c63073.jpeg,02:10,04 有一种喜欢,只能藏在心底…,"[{'resolution': '2560x1440', 15 | 'video_url': 'http://video.xinpianchang.com/5774c708c1540.mp4'}, {'resolution': '1920x1080', 16 | 'video_url': 'http://video.xinpianchang.com/5774c70987eac.mp4'}, {'resolution': '1280x720', 17 | 'video_url': 'http://video.xinpianchang.com/5774c7098ffc3.mp4'}, {'resolution': '960x540', 18 | 'video_url': 'http://video.xinpianchang.com/5774c7099e037.mp4'}, {'resolution': '640x360', 19 | 'video_url': 'http://video.xinpianchang.com/5774c709a4dfa.mp4'}]" 20 | 2016-07-25,我走过大街小巷,用相机记录沿途的风景和遇见的人。那天我走过花店,阳光透过玻璃洒在你的脸上。你拨动着琴弦,轻吟浅唱,我不自觉地按下快门键。每天与我擦...,http://cs.xinpianchang.com/uploadfile/article/2016/07/25/5795a6ad43c02.jpeg,03:00,05 你擦肩而过了多少爱情,"[{'resolution': '3840x2160', 21 | 'video_url': 'http://video.xinpianchang.com/5800a2508d04c.mp4'}, {'resolution': '2560x1440', 22 | 'video_url': 'http://video.xinpianchang.com/5800a2509d7b8.mp4'}, {'resolution': '1920x1080', 23 | 'video_url': 'http://video.xinpianchang.com/5800a250affee.mp4'}, {'resolution': '1280x720', 24 | 'video_url': 'http://video.xinpianchang.com/5800a250be7cb.mp4'}, {'resolution': '960x540', 25 | 'video_url': 'http://video.xinpianchang.com/5800a250cf195.mp4'}, {'resolution': '640x360', 26 | 'video_url': 'http://video.xinpianchang.com/5800a25107977.mp4'}]" 27 | 2017-04-07,只要能霸占你所有的怀抱,哪怕崴了脚也能加速向前。,http://cs.xinpianchang.com/uploadfile/article/2017/03/14/58c79e763e717.jpeg,04:12,29 套路是我学的,但撩你是真心的,"[{'resolution': '3840x2160', 28 | 'video_url': 'http://video.xinpianchang.com/58c65d63b45d7.mp4'}, {'resolution': '2560x1440', 29 | 'video_url': 'http://video.xinpianchang.com/58c65d63c066b.mp4'}, {'resolution': '1920x1080', 30 | 'video_url': 'http://video.xinpianchang.com/58c65d63d7e9b.mp4'}, {'resolution': 
'1280x720', 31 | 'video_url': 'http://video.xinpianchang.com/58c65d63ee33d.mp4'}, {'resolution': '960x540', 32 | 'video_url': 'http://video.xinpianchang.com/58c65d64130f8.mp4'}, {'resolution': '640x360', 33 | 'video_url': 'http://video.xinpianchang.com/58c65d642f436.mp4'}]" 34 | 2017-04-07,只道从此,渐行的时光里不再有你。,http://cs.xinpianchang.com/uploadfile/article/2017/03/13/58c6523aa0cbe.jpeg,04:48,28 感谢关照,后会有期,"[{'resolution': '3840x2160', 35 | 'video_url': 'http://video.xinpianchang.com/58c65102d1c39.mp4'}, {'resolution': '2560x1440', 36 | 'video_url': 'http://video.xinpianchang.com/58c65102db88b.mp4'}, {'resolution': '1920x1080', 37 | 'video_url': 'http://video.xinpianchang.com/58c6510302c4f.mp4'}, {'resolution': '1280x720', 38 | 'video_url': 'http://video.xinpianchang.com/58c651031c887.mp4'}, {'resolution': '960x540', 39 | 'video_url': 'http://video.xinpianchang.com/58c6510333ed3.mp4'}, {'resolution': '640x360', 40 | 'video_url': 'http://video.xinpianchang.com/58c6510348ddb.mp4'}]" 41 | 2017-04-07,暗恋这件青春里无可逃避的小事,需要小心轻放。 ,http://cs.xinpianchang.com/uploadfile/article/2017/03/23/58d36542623ae.jpeg,02:36,31 暗恋是一个需要小心轻放的青春,"[{'resolution': '3840x2160', 42 | 'video_url': 'http://video.xinpianchang.com/58cfb375810dd.mp4'}, {'resolution': '2560x1440', 43 | 'video_url': 'http://video.xinpianchang.com/58cfb3758e757.mp4'}, {'resolution': '1920x1080', 44 | 'video_url': 'http://video.xinpianchang.com/58cfb375a6833.mp4'}, {'resolution': '1280x720', 45 | 'video_url': 'http://video.xinpianchang.com/58cfb375c44ae.mp4'}, {'resolution': '960x540', 46 | 'video_url': 'http://video.xinpianchang.com/58cfb375dd46d.mp4'}, {'resolution': '640x360', 47 | 'video_url': 'http://video.xinpianchang.com/58cfb375f1e4a.mp4'}]" 48 | 2017-04-07,一段因爱情而生的宠物情缘。,http://cs.xinpianchang.com/uploadfile/article/2017/03/28/58da25a5d6ea3.jpeg,03:44,34 因爱情而生的宠物情缘,"[{'resolution': '1920x1080', 49 | 'video_url': 'http://video.xinpianchang.com/58da24a0da57f.mp4'}, {'resolution': '1280x720', 50 | 'video_url': 'http://video.xinpianchang.com/58da24a0e3c8b.mp4'}, {'resolution': '960x540', 51 | 'video_url': 'http://video.xinpianchang.com/58da24a104c9d.mp4'}, {'resolution': '640x360', 52 | 'video_url': 'http://video.xinpianchang.com/58da24a11c44d.mp4'}]" 53 | 2017-04-07,手作人玉头遇见摄影师左目,从两情相悦到岁月静好,造物集是他们永恒的小情书。,http://cs.xinpianchang.com/uploadfile/article/2017/03/27/58d8cb4c1e739.jpeg,03:15,32 当造物集开始书写小情书,"[{'resolution': '3840x2160', 54 | 'video_url': 'http://video.xinpianchang.com/58d364d58cddf.mp4'}, {'resolution': '2560x1440', 55 | 'video_url': 'http://video.xinpianchang.com/58d364d595483.mp4'}, {'resolution': '1920x1080', 56 | 'video_url': 'http://video.xinpianchang.com/58d364d5ae6ac.mp4'}, {'resolution': '1280x720', 57 | 'video_url': 'http://video.xinpianchang.com/58d364d5c57cb.mp4'}, {'resolution': '960x540', 58 | 'video_url': 'http://video.xinpianchang.com/58d364d5dc63f.mp4'}, {'resolution': '640x360', 59 | 'video_url': 'http://video.xinpianchang.com/58d364d5f3b43.mp4'}]" 60 | 2017-04-07,如果不想屈服于异地恋的魔爪,那就选个好天气往对方的城市靠近吧,拉近爱情需要战胜远距离! 
,http://cs.xinpianchang.com/uploadfile/article/2017/04/07/58e762cf61050.jpeg,03:45,35 一切都抵不过见面时你给的那个拥抱,"[{'resolution': '3840x2160', 61 | 'video_url': 'http://video.xinpianchang.com/58dbafa40c4c5.mp4'}, {'resolution': '2560x1440', 62 | 'video_url': 'http://video.xinpianchang.com/58dbafa419681.mp4'}, {'resolution': '1920x1080', 63 | 'video_url': 'http://video.xinpianchang.com/58dbafa433dbc.mp4'}, {'resolution': '1280x720', 64 | 'video_url': 'http://video.xinpianchang.com/58dbafa44e3db.mp4'}, {'resolution': '960x540', 65 | 'video_url': 'http://video.xinpianchang.com/58dbafa462410.mp4'}, {'resolution': '640x360', 66 | 'video_url': 'http://video.xinpianchang.com/58dbafa477685.mp4'}]" 67 | 2016-12-23,如果失恋就是世界末日那我要在末日前学会很多事我要学会接受自己不快乐的样子学会不再为迎合他的喜好而打扮学会善待自己的肠胃,而不只是费心琢磨他的口...,http://cs.xinpianchang.com/uploadfile/article/2016/07/29/579b458c2b0d8.jpeg,02:10,06 如果失恋就是世界末日,"[{'resolution': '3840x2160', 68 | 'video_url': 'http://video.xinpianchang.com/579b458d3a898.mp4'}, {'resolution': '2560x1440', 69 | 'video_url': 'http://video.xinpianchang.com/579b458d6f21f.mp4'}, {'resolution': '1920x1080', 70 | 'video_url': 'http://video.xinpianchang.com/579b458d897c5.mp4'}, {'resolution': '1280x720', 71 | 'video_url': 'http://video.xinpianchang.com/579b458db4972.mp4'}, {'resolution': '960x540', 72 | 'video_url': 'http://video.xinpianchang.com/579b458de1335.mp4'}, {'resolution': '640x360', 73 | 'video_url': 'http://video.xinpianchang.com/579b458dead37.mp4'}]" 74 | 2017-04-07,回忆从前在校园里走过的路,听过的歌,原来一切过去已经很多年,但有些东西是时间改变不了的。下集内容请关注微博微信@小情书LOVOTE,http://cs.xinpianchang.com/uploadfile/article/2017/04/10/58eb3815b1288.jpeg,04:53,37 时间的轨迹(上集),"[{'resolution': '1920x1080', 75 | 'video_url': 'http://video.xinpianchang.com/58e63135528d7.mp4'}, {'resolution': '1280x720', 76 | 'video_url': 'http://video.xinpianchang.com/58e631355ab75.mp4'}, {'resolution': '960x540', 77 | 'video_url': 'http://video.xinpianchang.com/58e6313576bf7.mp4'}, {'resolution': '640x360', 78 | 'video_url': 'http://video.xinpianchang.com/58e631359004f.mp4'}]" 79 | 2017-04-07,回忆从前在校园里走过的路,听过的歌,原来一切过去已经很多年,但有些东西是时间改变不了的。,http://cs.xinpianchang.com/uploadfile/article/2017/04/07/58e7621a7dc26.jpeg,04:26,38 时间的轨迹(下集),"[{'resolution': '3840x2160', 80 | 'video_url': 'http://video.xinpianchang.com/58e75e9012a86.mp4'}, {'resolution': '2560x1440', 81 | 'video_url': 'http://video.xinpianchang.com/58e75e902103f.mp4'}, {'resolution': '1920x1080', 82 | 'video_url': 'http://video.xinpianchang.com/58e75e9045c76.mp4'}, {'resolution': '1280x720', 83 | 'video_url': 'http://video.xinpianchang.com/58e75e905d247.mp4'}, {'resolution': '960x540', 84 | 'video_url': 'http://video.xinpianchang.com/58e75e907ac6b.mp4'}, {'resolution': '640x360', 85 | 'video_url': 'http://video.xinpianchang.com/58e75e909f437.mp4'}]" 86 | 2017-04-07,有人说,爱情由肾上腺素决定出不出手,多巴胺决定天长地久,羟色胺决定谁先开口,说到底,都是化学反应。 ,http://cs.xinpianchang.com/uploadfile/article/2017/03/07/58be2072d4962.jpeg,03:00,26 爱情是科学无法解释的事情,"[{'resolution': '1920x1080', 87 | 'video_url': 'http://video.xinpianchang.com/58be1e5256ab5.mp4'}, {'resolution': '1280x720', 88 | 'video_url': 'http://video.xinpianchang.com/58be1e5263a49.mp4'}, {'resolution': '960x540', 89 | 'video_url': 'http://video.xinpianchang.com/58be1e527b62c.mp4'}, {'resolution': '640x360', 90 | 'video_url': 'http://video.xinpianchang.com/58be1e52981d8.mp4'}]" 91 | 2016-12-23,上学的时候,班里总有一个学习好,笑起来又很好看的男生。那时候的我不知道该如何靠近他,每次请他讲题都是我故意靠近他的借口。现在的我早已经记不得他口中...,http://cs.xinpianchang.com/uploadfile/article/2016/08/26/57c009adb1583.jpeg,02:37,07 我永远记得你——前座男生,"[{'resolution': '3840x2160', 92 | 
'video_url': 'http://video.xinpianchang.com/57c00918b924c.mp4'}, {'resolution': '2560x1440', 93 | 'video_url': 'http://video.xinpianchang.com/57c009af9eda1.mp4'}, {'resolution': '1920x1080', 94 | 'video_url': 'http://video.xinpianchang.com/57c009afafffb.mp4'}, {'resolution': '1280x720', 95 | 'video_url': 'http://video.xinpianchang.com/57c009afbac90.mp4'}, {'resolution': '960x540', 96 | 'video_url': 'http://video.xinpianchang.com/57c009afc6ec2.mp4'}, {'resolution': '640x360', 97 | 'video_url': 'http://video.xinpianchang.com/57c009afdd159.mp4'}]" 98 | 2016-12-23,在你眼中,我对所有人都好。但在所有人眼中,我只对你好。,http://cs.xinpianchang.com/uploadfile/article/2016/10/14/5800f7c6ba939.jpeg,03:18,08 为了对你好,我假装对所有人都好,"[{'resolution': '3840x2160', 99 | 'video_url': 'http://video.xinpianchang.com/5800f7c7e1ce1.mp4'}, {'resolution': '2560x1440', 100 | 'video_url': 'http://video.xinpianchang.com/5800f7c7ec8b6.mp4'}, {'resolution': '1920x1080', 101 | 'video_url': 'http://video.xinpianchang.com/5800f7c80647b.mp4'}, {'resolution': '1280x720', 102 | 'video_url': 'http://video.xinpianchang.com/5800f7c81de13.mp4'}, {'resolution': '960x540', 103 | 'video_url': 'http://video.xinpianchang.com/5800f7c82f313.mp4'}, {'resolution': '640x360', 104 | 'video_url': 'http://video.xinpianchang.com/5800f7c83aa2a.mp4'}]" 105 | 2016-12-23,恋爱是每一步都要走得舒服。,http://cs.xinpianchang.com/uploadfile/article/2017/03/28/58da2533ecab8.jpeg,03:42,33 恋爱是每一步都要走得舒服,"[{'resolution': '1920x1080', 106 | 'video_url': 'http://video.xinpianchang.com/58da2301ae0f2.mp4'}, {'resolution': '1280x720', 107 | 'video_url': 'http://video.xinpianchang.com/58da2301be26a.mp4'}, {'resolution': '960x540', 108 | 'video_url': 'http://video.xinpianchang.com/58da2301ea9fa.mp4'}, {'resolution': '640x360', 109 | 'video_url': 'http://video.xinpianchang.com/58da230212621.mp4'}]" 110 | 2016-12-23,喜欢是冬天里藏在手心的温暖,希望所有的女生都能在冬天把手放在喜欢的人的口袋里。,http://cs.xinpianchang.com/uploadfile/article/2017/01/16/587c578308741.jpeg,02:59,10 愿冬天所有女生都能把手放在喜欢的人的口袋里,"[{'resolution': '3840x2160', 111 | 'video_url': 'http://video.xinpianchang.com/584a23840b2ba.mp4'}, {'resolution': '2560x1440', 112 | 'video_url': 'http://video.xinpianchang.com/584a2384401af.mp4'}, {'resolution': '1920x1080', 113 | 'video_url': 'http://video.xinpianchang.com/584a23848b73d.mp4'}, {'resolution': '1280x720', 114 | 'video_url': 'http://video.xinpianchang.com/584a2384c99af.mp4'}, {'resolution': '960x540', 115 | 'video_url': 'http://video.xinpianchang.com/584a238523fdb.mp4'}, {'resolution': '640x360', 116 | 'video_url': 'http://video.xinpianchang.com/584a23856087f.mp4'}]" 117 | 2016-12-23,想要借着拜年祝福把犹豫不决的心意打开,正巧的是你把我的“正在输入”都收在了眼里。,http://cs.xinpianchang.com/uploadfile/article/2017/02/01/589184ccb07db.jpeg,03:09,15 “正在输入”是了然于心却要思前想后的喜欢,"[{'resolution': '3840x2160', 118 | 'video_url': 'http://video.xinpianchang.com/589184ce3a00d.mp4'}, {'resolution': '2560x1440', 119 | 'video_url': 'http://video.xinpianchang.com/589184ce990b2.mp4'}, {'resolution': '1920x1080', 120 | 'video_url': 'http://video.xinpianchang.com/589184ced2c58.mp4'}, {'resolution': '1280x720', 121 | 'video_url': 'http://video.xinpianchang.com/589184cf39473.mp4'}, {'resolution': '960x540', 122 | 'video_url': 'http://video.xinpianchang.com/589184cf7c13f.mp4'}, {'resolution': '640x360', 123 | 'video_url': 'http://video.xinpianchang.com/589184cfe57a1.mp4'}]" 124 | 2016-12-23,晚到的人怎么会懂得等待,双方从一开始的目的地就已经相隔十万八千里。 ,http://cs.xinpianchang.com/uploadfile/article/2017/04/20/58f85b349bd7a.jpeg,03:06,41 迟到的人不必等,"[{'resolution': '1920x1080', 125 | 'video_url': 
'http://video.xinpianchang.com/58f85b363ef78.mp4'}, {'resolution': '1280x720', 126 | 'video_url': 'http://video.xinpianchang.com/58f85b3649b50.mp4'}, {'resolution': '960x540', 127 | 'video_url': 'http://video.xinpianchang.com/58f85b3670536.mp4'}, {'resolution': '640x360', 128 | 'video_url': 'http://video.xinpianchang.com/58f85b36bcb1d.mp4'}]" 129 | 2016-12-23,在错的时间遇到了对的人,等待的结果会让人如愿吗? ,http://cs.xinpianchang.com/uploadfile/article/2017/04/20/58f85eed04128.jpeg,04:08,42 错的时间遇见对的人,"[{'resolution': '3840x2160', 130 | 'video_url': 'http://video.xinpianchang.com/58f85eee2d027.mp4'}, {'resolution': '2560x1440', 131 | 'video_url': 'http://video.xinpianchang.com/58f85eee437cf.mp4'}, {'resolution': '1920x1080', 132 | 'video_url': 'http://video.xinpianchang.com/58f85eee5b876.mp4'}, {'resolution': '1280x720', 133 | 'video_url': 'http://video.xinpianchang.com/58f85eee74073.mp4'}, {'resolution': '960x540', 134 | 'video_url': 'http://video.xinpianchang.com/58f85eee8c4cd.mp4'}, {'resolution': '640x360', 135 | 'video_url': 'http://video.xinpianchang.com/58f85eeeb1672.mp4'}]" 136 | 2016-12-23,因为害怕分开的日子里无止的想念,所以提出分手,可是,你怎么知道他不会回来呢? ,http://cs.xinpianchang.com/uploadfile/article/2017/04/27/5901cf71c5005.jpeg,03:59,45 爱从不因时间而停止,"[{'resolution': '1920x1080', 137 | 'video_url': 'http://video.xinpianchang.com/5901cf72b8c05.mp4'}, {'resolution': '1280x720', 138 | 'video_url': 'http://video.xinpianchang.com/5901cf72c39bc.mp4'}, {'resolution': '960x540', 139 | 'video_url': 'http://video.xinpianchang.com/5901cf72dd9a0.mp4'}, {'resolution': '640x360', 140 | 'video_url': 'http://video.xinpianchang.com/5901cf7304407.mp4'}]" 141 | 2016-12-23,忘记太难,因为要走一遍来时的路。来时的路上处处有你的身影。我给了自己一个机会,回忆与你的全部往昔,等待可以好好和你说再见的那一天。,http://cs.xinpianchang.com/uploadfile/article/2016/11/11/58256a158d7ab.jpeg,05:34,09 忘记你,还是忘记爱情的味道?,"[{'resolution': '1920x1080', 142 | 'video_url': 'http://video.xinpianchang.com/58256a1668abf.mp4'}, {'resolution': '1280x720', 143 | 'video_url': 'http://video.xinpianchang.com/58256a16867f5.mp4'}, {'resolution': '960x540', 144 | 'video_url': 'http://video.xinpianchang.com/58256a16a1cfc.mp4'}, {'resolution': '640x360', 145 | 'video_url': 'http://video.xinpianchang.com/58256a16cd410.mp4'}]" 146 | 2016-12-23,不管是朋友还是情人,我都喜欢你。 ,http://cs.xinpianchang.com/uploadfile/article/2017/05/17/591c28154ba19.jpeg,04:23,53 最合适的伴侣一直在身边,"[{'resolution': '1920x1080', 147 | 'video_url': 'http://video.xinpianchang.com/591c281723dbe.mp4'}, {'resolution': '1280x720', 148 | 'video_url': 'http://video.xinpianchang.com/591c28172d87a.mp4'}, {'resolution': '960x540', 149 | 'video_url': 'http://video.xinpianchang.com/591c281746d9f.mp4'}, {'resolution': '640x360', 150 | 'video_url': 'http://video.xinpianchang.com/591c2817653ab.mp4'}]" 151 | 2017-03-03,等到一年过去的时候才惊觉时间过得太快,就像喜欢了很久的他,从大街小巷踌躇到四下无人,只为了发一句看似群发的新年祝福。,http://cs.xinpianchang.com/uploadfile/article/2017/02/01/589185dd53882.jpeg,02:29,16 新年快乐,我喜欢你,"[{'resolution': '1920x1080', 152 | 'video_url': 'http://video.xinpianchang.com/589185de40b35.mp4'}, {'resolution': '1280x720', 153 | 'video_url': 'http://video.xinpianchang.com/589185decf765.mp4'}, {'resolution': '960x540', 154 | 'video_url': 'http://video.xinpianchang.com/589185df3011e.mp4'}, {'resolution': '640x360', 155 | 'video_url': 'http://video.xinpianchang.com/589185df77774.mp4'}]" 156 | 2017-03-03,遇到你的时候,才知道,有一种门当户对,叫我们很配。,http://cs.xinpianchang.com/uploadfile/article/2017/01/11/5875c3e054747.jpeg,03:24,13 有一种门当户对,叫我们很配,"[{'resolution': '3840x2160', 157 | 'video_url': 'http://video.xinpianchang.com/5875c3a9e6e80.mp4'}, {'resolution': 
'2560x1440', 158 | 'video_url': 'http://video.xinpianchang.com/5875c3aa3d139.mp4'}, {'resolution': '1920x1080', 159 | 'video_url': 'http://video.xinpianchang.com/5875c3aae6848.mp4'}, {'resolution': '1280x720', 160 | 'video_url': 'http://video.xinpianchang.com/5875c3ab5f554.mp4'}, {'resolution': '960x540', 161 | 'video_url': 'http://video.xinpianchang.com/5875c3abd7e6a.mp4'}, {'resolution': '640x360', 162 | 'video_url': 'http://video.xinpianchang.com/5875c3ac5d39a.mp4'}]" 163 | 2016-12-23,把“喜欢你”印在信封上,如果你看不到,我就说出来给你听。,http://cs.xinpianchang.com/uploadfile/article/2017/01/16/587c57c8a2846.jpeg,02:23,11 喜欢你是我做过最勇敢的事,"[{'resolution': '3840x2160', 164 | 'video_url': 'http://video.xinpianchang.com/585cdc892e1e6.mp4'}, {'resolution': '2560x1440', 165 | 'video_url': 'http://video.xinpianchang.com/585cdc8951b4e.mp4'}, {'resolution': '1920x1080', 166 | 'video_url': 'http://video.xinpianchang.com/585cdc897d1cf.mp4'}, {'resolution': '1280x720', 167 | 'video_url': 'http://video.xinpianchang.com/585cdc89d8e52.mp4'}, {'resolution': '960x540', 168 | 'video_url': 'http://video.xinpianchang.com/585cdc8a3be25.mp4'}, {'resolution': '640x360', 169 | 'video_url': 'http://video.xinpianchang.com/585cdc8a85c7a.mp4'}]" 170 | 2017-03-03,放下那些礼貌的装饰,把爱装在眼里,迈进心里。,http://cs.xinpianchang.com/uploadfile/article/2017/02/04/589564eec95f2.jpeg,03:09,14 恋爱中迈出的一大步就是见到了你的素颜,"[{'resolution': '3840x2160', 171 | 'video_url': 'http://video.xinpianchang.com/58857f1f5c3ac.mp4'}, {'resolution': '2560x1440', 172 | 'video_url': 'http://video.xinpianchang.com/58857f1fe95ce.mp4'}, {'resolution': '1920x1080', 173 | 'video_url': 'http://video.xinpianchang.com/58857f20a5297.mp4'}, {'resolution': '1280x720', 174 | 'video_url': 'http://video.xinpianchang.com/58857f2156085.mp4'}, {'resolution': '960x540', 175 | 'video_url': 'http://video.xinpianchang.com/58857f21f2823.mp4'}, {'resolution': '640x360', 176 | 'video_url': 'http://video.xinpianchang.com/58857f228b8a1.mp4'}]" 177 | 2017-03-03,世界上除了恋人还有比恋人更默契的是,陪你一起哭、一起笑、一起长大的闺蜜。,http://cs.xinpianchang.com/uploadfile/article/2017/02/12/589fd0f048a2e.jpeg,02:28,17 比恋人更默契的是闺蜜,"[{'resolution': '1920x1080', 178 | 'video_url': 'http://video.xinpianchang.com/58983ecf048b0.mp4'}, {'resolution': '1280x720', 179 | 'video_url': 'http://video.xinpianchang.com/58983ecf0d8b0.mp4'}, {'resolution': '960x540', 180 | 'video_url': 'http://video.xinpianchang.com/58983ecf1dde2.mp4'}, {'resolution': '640x360', 181 | 'video_url': 'http://video.xinpianchang.com/58983ecf32b8d.mp4'}]" 182 | 2017-03-03,精心准备的情人节礼物,只为了让喜欢的人送给TA喜欢的人。,http://cs.xinpianchang.com/uploadfile/article/2017/02/13/58a15480dde1b.jpeg,03:56,18 喜欢你是真的,想让你开心也是真的,"[{'resolution': '3840x2160', 183 | 'video_url': 'http://video.xinpianchang.com/58a1512b778d0.mp4'}, {'resolution': '2560x1440', 184 | 'video_url': 'http://video.xinpianchang.com/58a1512b81e51.mp4'}, {'resolution': '1920x1080', 185 | 'video_url': 'http://video.xinpianchang.com/58a1512b9669c.mp4'}, {'resolution': '1280x720', 186 | 'video_url': 'http://video.xinpianchang.com/58a1512baa4d5.mp4'}, {'resolution': '960x540', 187 | 'video_url': 'http://video.xinpianchang.com/58a1512bbf5b4.mp4'}, {'resolution': '640x360', 188 | 'video_url': 'http://video.xinpianchang.com/58a1512bd82b1.mp4'}]" 189 | 2017-03-03,恋人之间总是不可避免地产生很多摩擦和矛盾,而化解矛盾的终极手段就是把对方的所有可爱的模样都放在眼里,放进心里,表达出来。,http://cs.xinpianchang.com/uploadfile/article/2017/02/18/58a7dd292926b.jpeg,03:57,19 恋爱的温度,就是找回你的微笑,"[{'resolution': '3840x2160', 190 | 'video_url': 'http://video.xinpianchang.com/58a27a5fed6c9.mp4'}, {'resolution': '2560x1440', 191 | 
'video_url': 'http://video.xinpianchang.com/58a27a600351e.mp4'}, {'resolution': '1920x1080', 192 | 'video_url': 'http://video.xinpianchang.com/58a27a602facb.mp4'}, {'resolution': '1280x720', 193 | 'video_url': 'http://video.xinpianchang.com/58a27a60522be.mp4'}, {'resolution': '960x540', 194 | 'video_url': 'http://video.xinpianchang.com/58a27a606885e.mp4'}, {'resolution': '640x360', 195 | 'video_url': 'http://video.xinpianchang.com/58a27a609ee9a.mp4'}]" 196 | 2017-03-03,这段感情虽然没有开花结果,但却在各自的生命里留下了不可抹灭的印记。,http://cs.xinpianchang.com/uploadfile/article/2017/04/10/58eb36e715a2c.jpeg,03:04,21 失恋没什么大不了,"[{'resolution': '1920x1080', 197 | 'video_url': 'http://video.xinpianchang.com/58aa9b15057da.mp4'}, {'resolution': '1280x720', 198 | 'video_url': 'http://video.xinpianchang.com/58aa9b150e311.mp4'}, {'resolution': '960x540', 199 | 'video_url': 'http://video.xinpianchang.com/58aa9b1526b72.mp4'}, {'resolution': '640x360', 200 | 'video_url': 'http://video.xinpianchang.com/58aa9b1547add.mp4'}]" 201 | 2017-03-03,一段感情既然开始了,就不能随随便便结束。,http://cs.xinpianchang.com/uploadfile/article/2017/02/18/58a7dd6228fbc.jpeg,02:37,20 爱情的模样,要跨过手机屏幕才看得到,"[{'resolution': '1920x1080', 202 | 'video_url': 'http://video.xinpianchang.com/58a50ec24afcc.mp4'}, {'resolution': '1280x720', 203 | 'video_url': 'http://video.xinpianchang.com/58a50ec2d13af.mp4'}, {'resolution': '960x540', 204 | 'video_url': 'http://video.xinpianchang.com/58a50ec32e37d.mp4'}, {'resolution': '640x360', 205 | 'video_url': 'http://video.xinpianchang.com/58a50ec38ddd5.mp4'}]" 206 | 2017-03-03,明明是彼此相爱的两个人,却一而再地因为没有解开的误会越走越远······ ,http://cs.xinpianchang.com/uploadfile/article/2017/03/04/58ba79a510b75.jpeg,03:45,25 有些感情,错过就不再,"[{'resolution': '1920x1080', 207 | 'video_url': 'http://video.xinpianchang.com/58b91a02b0b44.mp4'}, {'resolution': '1280x720', 208 | 'video_url': 'http://video.xinpianchang.com/58b91a02bce03.mp4'}, {'resolution': '960x540', 209 | 'video_url': 'http://video.xinpianchang.com/58b91a02cf0ec.mp4'}, {'resolution': '640x360', 210 | 'video_url': 'http://video.xinpianchang.com/58b91a02e48c1.mp4'}]" 211 | 2017-03-03,有多少藏在心底的喜欢因为缺少一份说出口的准备而成王败寇,男生也有口是心非的权利,只要结局是好的怎样都行。,http://cs.xinpianchang.com/uploadfile/article/2017/02/22/58ad525431097.jpeg,04:26,22 哪有那么多凑巧的事,有的只是我喜欢你,"[{'resolution': '3840x2160', 212 | 'video_url': 'http://video.xinpianchang.com/58ad51bd74b69.mp4'}, {'resolution': '2560x1440', 213 | 'video_url': 'http://video.xinpianchang.com/58ad51bea91c3.mp4'}, {'resolution': '1920x1080', 214 | 'video_url': 'http://video.xinpianchang.com/58ad51bf13eab.mp4'}, {'resolution': '1280x720', 215 | 'video_url': 'http://video.xinpianchang.com/58ad51bfb7117.mp4'}, {'resolution': '960x540', 216 | 'video_url': 'http://video.xinpianchang.com/58ad51c032979.mp4'}, {'resolution': '640x360', 217 | 'video_url': 'http://video.xinpianchang.com/58ad51c09dfc2.mp4'}]" 218 | 2017-03-03,即便分居两地又怎样,我爱你胜过这场雨。 ,http://cs.xinpianchang.com/uploadfile/article/2017/02/27/58b3e7a21387d.jpeg,02:38,23 所有想对你说的话,其实只有我想你,"[{'resolution': '1920x1080', 219 | 'video_url': 'http://video.xinpianchang.com/58b3ab7db4ff8.mp4'}, {'resolution': '1280x720', 220 | 'video_url': 'http://video.xinpianchang.com/58b3ab7f04d94.mp4'}, {'resolution': '960x540', 221 | 'video_url': 'http://video.xinpianchang.com/58b3ab80a7cc6.mp4'}, {'resolution': '640x360', 222 | 'video_url': 'http://video.xinpianchang.com/58b3ab814406f.mp4'}]" 223 | 2017-03-03,有一道公式叫做,不完美的你加上爱你的我等于完美的爱情。 ,http://cs.xinpianchang.com/uploadfile/article/2017/03/02/58b7d11c97f88.jpeg,04:40,24 尽管你有很多坏习惯,但这也是我爱的你,"[{'resolution': 
'3840x2160', 224 | 'video_url': 'http://video.xinpianchang.com/58b7d1228ba90.mp4'}, {'resolution': '2560x1440', 225 | 'video_url': 'http://video.xinpianchang.com/58b7d12298ff0.mp4'}, {'resolution': '1920x1080', 226 | 'video_url': 'http://video.xinpianchang.com/58b7d122b339d.mp4'}, {'resolution': '1280x720', 227 | 'video_url': 'http://video.xinpianchang.com/58b7d122c9a43.mp4'}, {'resolution': '960x540', 228 | 'video_url': 'http://video.xinpianchang.com/58b7d122dce25.mp4'}, {'resolution': '640x360', 229 | 'video_url': 'http://video.xinpianchang.com/58b7d12304000.mp4'}]" 230 | 2017-05-19,曾经一起度过的每个生日,走过的每个路牌,原来转瞬间就可以消失不见,就像曾经的很爱到最后的不爱。 ,http://cs.xinpianchang.com/uploadfile/article/2017/04/10/58eb95be7b4ce.jpeg,04:46,39 感情如人饮水,冷暖自知,"[{'resolution': '3840x2160', 231 | 'video_url': 'http://video.xinpianchang.com/58eb26f5e8b69.mp4'}, {'resolution': '2560x1440', 232 | 'video_url': 'http://video.xinpianchang.com/58eb26f633f6e.mp4'}, {'resolution': '1920x1080', 233 | 'video_url': 'http://video.xinpianchang.com/58eb26f64d369.mp4'}, {'resolution': '1280x720', 234 | 'video_url': 'http://video.xinpianchang.com/58eb26f6669f4.mp4'}, {'resolution': '960x540', 235 | 'video_url': 'http://video.xinpianchang.com/58eb26f6944b0.mp4'}, {'resolution': '640x360', 236 | 'video_url': 'http://video.xinpianchang.com/58eb26f6afd80.mp4'}]" 237 | 2017-05-19,爱情里为了对方低头认输并不是一件丢人的事情,只是为了能抓紧彼此的手走得更长久。 ,http://cs.xinpianchang.com/uploadfile/article/2017/04/20/58f873061e396.jpeg,04:06,43 深爱的人总会先低头,"[{'resolution': '3840x2160', 238 | 'video_url': 'http://video.xinpianchang.com/58f87307507ec.mp4'}, {'resolution': '2560x1440', 239 | 'video_url': 'http://video.xinpianchang.com/58f8730759c1e.mp4'}, {'resolution': '1920x1080', 240 | 'video_url': 'http://video.xinpianchang.com/58f87307718e2.mp4'}, {'resolution': '1280x720', 241 | 'video_url': 'http://video.xinpianchang.com/58f8730789f34.mp4'}, {'resolution': '960x540', 242 | 'video_url': 'http://video.xinpianchang.com/58f87307aaf09.mp4'}, {'resolution': '640x360', 243 | 'video_url': 'http://video.xinpianchang.com/58f87307c8e75.mp4'}]" 244 | 2017-05-19,不是失去了才懂得珍惜,而是以为拥有的不会失去。 ,http://cs.xinpianchang.com/uploadfile/article/2017/04/27/5901cdec9ee20.jpeg,04:55,44 患得患失的那份感情叫做过去,"[{'resolution': '3840x2160', 245 | 'video_url': 'http://video.xinpianchang.com/5901cde946b8a.mp4'}, {'resolution': '2560x1440', 246 | 'video_url': 'http://video.xinpianchang.com/5901cdedcdf0f.mp4'}, {'resolution': '1920x1080', 247 | 'video_url': 'http://video.xinpianchang.com/5901cdede4df5.mp4'}, {'resolution': '1280x720', 248 | 'video_url': 'http://video.xinpianchang.com/5901cdee3cfe1.mp4'}, {'resolution': '960x540', 249 | 'video_url': 'http://video.xinpianchang.com/5901cdee555a3.mp4'}, {'resolution': '640x360', 250 | 'video_url': 'http://video.xinpianchang.com/5901cdee6cfbb.mp4'}]" 251 | 2017-05-19,那些青春淋漓的日子里,因为喜欢一个人而成全了更好的自己。 ,http://cs.xinpianchang.com/uploadfile/article/2017/04/27/5901d5723ada3.jpeg,05:20,46 习惯的最初是因为喜欢才发生的,"[{'resolution': '3840x2160', 252 | 'video_url': 'http://video.xinpianchang.com/5901d572f3d19.mp4'}, {'resolution': '2560x1440', 253 | 'video_url': 'http://video.xinpianchang.com/5901d57309ef1.mp4'}, {'resolution': '1920x1080', 254 | 'video_url': 'http://video.xinpianchang.com/5901d57325f57.mp4'}, {'resolution': '1280x720', 255 | 'video_url': 'http://video.xinpianchang.com/5901d57343454.mp4'}, {'resolution': '960x540', 256 | 'video_url': 'http://video.xinpianchang.com/5901d5735d291.mp4'}, {'resolution': '640x360', 257 | 'video_url': 'http://video.xinpianchang.com/5901d5737fe03.mp4'}]" 258 | 
2017-05-19,懵懂的岁月里,我们会遇到很多令我们怦然心动的人,我们会为了TA学习很多新技能,只为了跟TA有共同的话题;我们也会为了TA看完一场篮球赛,...,http://cs.xinpianchang.com/uploadfile/article/2017/04/28/5902e7ac8aa81.jpeg,04:06,47 路过青春遇到你,"[{'resolution': '3840x2160', 259 | 'video_url': 'http://video.xinpianchang.com/5902e7ade0311.mp4'}, {'resolution': '2560x1440', 260 | 'video_url': 'http://video.xinpianchang.com/5902e7adeb0e4.mp4'}, {'resolution': '1920x1080', 261 | 'video_url': 'http://video.xinpianchang.com/5902e7ae12a48.mp4'}, {'resolution': '1280x720', 262 | 'video_url': 'http://video.xinpianchang.com/5902e7ae35730.mp4'}, {'resolution': '960x540', 263 | 'video_url': 'http://video.xinpianchang.com/5902e7ae52092.mp4'}, {'resolution': '640x360', 264 | 'video_url': 'http://video.xinpianchang.com/5902e7ae6ba35.mp4'}]" 265 | 2017-05-19,都说爱情是一场博弈,不是你赢就是我输,不到最后都不知道赢家归谁。 ,http://cs.xinpianchang.com/uploadfile/article/2017/05/03/59097011d6efc.jpeg,02:07,48 每个意外都是不谋而合的心动,"[{'resolution': '3840x2160', 266 | 'video_url': 'http://video.xinpianchang.com/59097012c78bc.mp4'}, {'resolution': '2560x1440', 267 | 'video_url': 'http://video.xinpianchang.com/59097012d12ec.mp4'}, {'resolution': '1920x1080', 268 | 'video_url': 'http://video.xinpianchang.com/59097012f01ab.mp4'}, {'resolution': '1280x720', 269 | 'video_url': 'http://video.xinpianchang.com/590970131ca8c.mp4'}, {'resolution': '960x540', 270 | 'video_url': 'http://video.xinpianchang.com/5909701336f5c.mp4'}, {'resolution': '640x360', 271 | 'video_url': 'http://video.xinpianchang.com/5909701362f9a.mp4'}]" 272 | 2017-05-19,一个冬日的相遇,一个夏日的相恋,那些我和你一路携手走过的,是青春,也是毕业前的允诺。 ,http://cs.xinpianchang.com/uploadfile/article/2017/05/05/590c6b181311e.jpeg,03:29,49 说毕业不分手的人还在吗,"[{'resolution': '3840x2160', 273 | 'video_url': 'http://video.xinpianchang.com/590c6aff3e2a3.mp4'}, {'resolution': '2560x1440', 274 | 'video_url': 'http://video.xinpianchang.com/590c6aff4a590.mp4'}, {'resolution': '1920x1080', 275 | 'video_url': 'http://video.xinpianchang.com/590c6aff70d6a.mp4'}, {'resolution': '1280x720', 276 | 'video_url': 'http://video.xinpianchang.com/590c6affa28c8.mp4'}, {'resolution': '960x540', 277 | 'video_url': 'http://video.xinpianchang.com/590c6affbb1ac.mp4'}, {'resolution': '640x360', 278 | 'video_url': 'http://video.xinpianchang.com/590c6affd41c1.mp4'}]" 279 | 2017-05-19,回忆里的青春大多是无奈但又难以忘却的,当一个人的暗恋出现了第三个人的参与,这场关于青春的秘密,需要有一个人去解开。 ,http://cs.xinpianchang.com/uploadfile/article/2017/05/08/59104d37a6552.jpeg,06:31,50 三个人的友谊(上集),"[{'resolution': '3840x2160', 280 | 'video_url': 'http://video.xinpianchang.com/59104d3b63699.mp4'}, {'resolution': '2560x1440', 281 | 'video_url': 'http://video.xinpianchang.com/59104d3b6de31.mp4'}, {'resolution': '1920x1080', 282 | 'video_url': 'http://video.xinpianchang.com/59104d3bb801b.mp4'}, {'resolution': '1280x720', 283 | 'video_url': 'http://video.xinpianchang.com/59104d3bd282e.mp4'}, {'resolution': '960x540', 284 | 'video_url': 'http://video.xinpianchang.com/59104d3bec1bd.mp4'}, {'resolution': '640x360', 285 | 'video_url': 'http://video.xinpianchang.com/59104d3c1fcde.mp4'}]" 286 | 2017-05-19,与其叫她喝多点热水,不如直接去到她身边,陪伴才是最暖心的爱情方式。 ,http://cs.xinpianchang.com/uploadfile/article/2017/04/12/58ee2faf72972.jpeg,03:17,40 比喝多点喝水更暖心的关心方式,"[{'resolution': '3840x2160', 287 | 'video_url': 'http://video.xinpianchang.com/58ee2fb05786d.mp4'}, {'resolution': '2560x1440', 288 | 'video_url': 'http://video.xinpianchang.com/58ee2fb07347a.mp4'}, {'resolution': '1920x1080', 289 | 'video_url': 'http://video.xinpianchang.com/58ee2fb08b09c.mp4'}, {'resolution': '1280x720', 290 | 'video_url': 
'http://video.xinpianchang.com/58ee2fb0a2467.mp4'}, {'resolution': '960x540', 291 | 'video_url': 'http://video.xinpianchang.com/58ee2fb0b7693.mp4'}, {'resolution': '640x360', 292 | 'video_url': 'http://video.xinpianchang.com/58ee2fb0ce4e1.mp4'}]" 293 | 2017-05-19,把 “我喜欢你”从口中说出来需要一种勇气,但要是把这四个字换一种语言说出来,告白的几率会不会更大些。 ,http://cs.xinpianchang.com/uploadfile/article/2017/05/11/591464ac72561.jpeg,02:30,52 换一种告白姿势,"[{'resolution': '3840x2160', 294 | 'video_url': 'http://video.xinpianchang.com/591464b8f29ea.mp4'}, {'resolution': '2560x1440', 295 | 'video_url': 'http://video.xinpianchang.com/591464b91edcf.mp4'}, {'resolution': '1920x1080', 296 | 'video_url': 'http://video.xinpianchang.com/591464b93adf7.mp4'}, {'resolution': '1280x720', 297 | 'video_url': 'http://video.xinpianchang.com/591464b9bff54.mp4'}, {'resolution': '960x540', 298 | 'video_url': 'http://video.xinpianchang.com/591464b9dad78.mp4'}, {'resolution': '640x360', 299 | 'video_url': 'http://video.xinpianchang.com/591464ba07861.mp4'}]" 300 | 2017-05-19,未完的青春,总要带一些遗憾才显得格外深刻。 ,http://cs.xinpianchang.com/uploadfile/article/2017/05/10/5912858436e7c.jpeg,07:00,51 《三个人的友谊 》下集,"[{'resolution': '3840x2160', 301 | 'video_url': 'http://video.xinpianchang.com/591285876e683.mp4'}, {'resolution': '2560x1440', 302 | 'video_url': 'http://video.xinpianchang.com/5912858778491.mp4'}, {'resolution': '1920x1080', 303 | 'video_url': 'http://video.xinpianchang.com/5912858794511.mp4'}, {'resolution': '1280x720', 304 | 'video_url': 'http://video.xinpianchang.com/59128587b6599.mp4'}, {'resolution': '960x540', 305 | 'video_url': 'http://video.xinpianchang.com/59128587dd85a.mp4'}, {'resolution': '640x360', 306 | 'video_url': 'http://video.xinpianchang.com/591285883003c.mp4'}]" 307 | 2017-05-19,当喜欢的人早就喜欢你,哪怕把这首歌所有的音调弹成G调都是好听的。,http://cs.xinpianchang.com/uploadfile/article/2017/05/17/591c28d63a23c.jpeg,05:15,54 当喜欢的人早就喜欢你,"[{'resolution': '1920x1080', 308 | 'video_url': 'http://video.xinpianchang.com/591c28d8bbbc7.mp4'}, {'resolution': '1280x720', 309 | 'video_url': 'http://video.xinpianchang.com/591c28d8dc513.mp4'}, {'resolution': '960x540', 310 | 'video_url': 'http://video.xinpianchang.com/591c28d90691d.mp4'}, {'resolution': '640x360', 311 | 'video_url': 'http://video.xinpianchang.com/591c28d928796.mp4'}]" 312 | 2017-05-19,在平凡无奇的生活里,却因为多了那一个可爱的人而变得越加逗趣。 ,http://cs.xinpianchang.com/uploadfile/article/2017/05/19/591eb7574ac00.jpeg,05:41,55 所有快乐的方式都非你不可,"[{'resolution': '3840x2160', 313 | 'video_url': 'http://video.xinpianchang.com/591eb75a69f8c.mp4'}, {'resolution': '2560x1440', 314 | 'video_url': 'http://video.xinpianchang.com/591eb75a76773.mp4'}, {'resolution': '1920x1080', 315 | 'video_url': 'http://video.xinpianchang.com/591eb75a973ac.mp4'}, {'resolution': '1280x720', 316 | 'video_url': 'http://video.xinpianchang.com/591eb75ab232d.mp4'}, {'resolution': '960x540', 317 | 'video_url': 'http://video.xinpianchang.com/591eb75af3be4.mp4'}, {'resolution': '640x360', 318 | 'video_url': 'http://video.xinpianchang.com/591eb75b262c8.mp4'}]" 319 | 2017-04-07,悄无声息的离开也意味着感情的终结。,http://cs.xinpianchang.com/uploadfile/article/2017/03/13/58c651ac9233d.jpeg,02:59,27 真正的离开没有告别,"[{'resolution': '1920x1080', 320 | 'video_url': 'http://video.xinpianchang.com/58bf6a58785df.mp4'}, {'resolution': '1280x720', 321 | 'video_url': 'http://video.xinpianchang.com/58bf6a588625f.mp4'}, {'resolution': '960x540', 322 | 'video_url': 'http://video.xinpianchang.com/58bf6a589e89f.mp4'}, {'resolution': '640x360', 323 | 'video_url': 'http://video.xinpianchang.com/58bf6a58b5b1d.mp4'}]" 324 | 
2017-04-07,那些让你不安的玩笑,其实字字都是我爱你。 ,http://cs.xinpianchang.com/uploadfile/article/2017/04/07/58e7627a09f5a.jpeg,05:00,36 不经意的玩笑其实都暗藏深情,"[{'resolution': '3840x2160', 325 | 'video_url': 'http://video.xinpianchang.com/58dfa013d900f.mp4'}, {'resolution': '2560x1440', 326 | 'video_url': 'http://video.xinpianchang.com/58dfa013e15db.mp4'}, {'resolution': '1920x1080', 327 | 'video_url': 'http://video.xinpianchang.com/58dfa014067ab.mp4'}, {'resolution': '1280x720', 328 | 'video_url': 'http://video.xinpianchang.com/58dfa0141b4bc.mp4'}, {'resolution': '960x540', 329 | 'video_url': 'http://video.xinpianchang.com/58dfa01431c0c.mp4'}, {'resolution': '640x360', 330 | 'video_url': 'http://video.xinpianchang.com/58dfa0144b29b.mp4'}]" 331 | 2016-07-25,因为你 我爱上一首永远也忘不掉的歌,http://cs.xinpianchang.com/uploadfile/article/2016/03/10/56e1156af2c7b.jpeg,03:33,02 因为你 我爱上一首永远也忘不掉的歌,"[{'resolution': '1920x1080', 332 | 'video_url': 'http://le.video.xinpianchang.com/5795eb63c658a.mp4'}, {'resolution': '1280x720', 333 | 'video_url': 'http://le.video.xinpianchang.com/5795eb63cf1ce.mp4'}, {'resolution': '960x540', 334 | 'video_url': 'http://le.video.xinpianchang.com/5795eb63e888e.mp4'}, {'resolution': '640x360', 335 | 'video_url': 'http://le.video.xinpianchang.com/5795eb6406df5.mp4'}]" 336 | 2016-07-25,换一种方式说喜欢,也未尝不是一种好的尝试;喜欢嘛,就别等下一次,http://cs.xinpianchang.com/uploadfile/article/2016/02/17/56c41d243992f.jpeg,04:03,01 换一种方式说爱你,"[{'resolution': '1920x1080', 337 | 'video_url': 'http://le.video.xinpianchang.com/5795eabb41e61.mp4'}, {'resolution': '1280x720', 338 | 'video_url': 'http://le.video.xinpianchang.com/5795eabb521da.mp4'}, {'resolution': '960x540', 339 | 'video_url': 'http://le.video.xinpianchang.com/5795eabb5ed5b.mp4'}, {'resolution': '640x360', 340 | 'video_url': 'http://le.video.xinpianchang.com/5795eabb7ce37.mp4'}]" 341 | -------------------------------------------------------------------------------- /litterlove/data.xml: -------------------------------------------------------------------------------- 1 | 2 | 04 有一种喜欢,只能藏在心底…02:102016-07-25http://cs.xinpianchang.com/uploadfile/article/2016/06/30/5774c74c63073.jpeg有些人,越是在乎,就越不敢触碰因为知道,友情比爱情来得更长久所以,有些喜欢,只敢藏在心底。02 因为你 我爱上一首永远也忘不掉的歌03:332016-07-25http://cs.xinpianchang.com/uploadfile/article/2016/03/10/56e1156af2c7b.jpeg因为你 我爱上一首永远也忘不掉的歌03 暗恋,是自作聪明后被识破的脸红02:432016-07-25http://cs.xinpianchang.com/uploadfile/article/2016/06/30/5774c5f3a91e1.jpeg暗恋,是自作聪明后被识破的脸红。暗恋的美好,是心里偷偷喜欢,小小的悸动就组成一个世界。暗恋是排山倒海又不动声色,但庆幸那时的我们是如此勇敢、炙热...05 你擦肩而过了多少爱情03:002016-07-25http://cs.xinpianchang.com/uploadfile/article/2016/07/25/5795a6ad43c02.jpeg我走过大街小巷,用相机记录沿途的风景和遇见的人。那天我走过花店,阳光透过玻璃洒在你的脸上。你拨动着琴弦,轻吟浅唱,我不自觉地按下快门键。每天与我擦...01 换一种方式说爱你04:032016-07-25http://cs.xinpianchang.com/uploadfile/article/2016/02/17/56c41d243992f.jpeg换一种方式说喜欢,也未尝不是一种好的尝试;喜欢嘛,就别等下一次48 每个意外都是不谋而合的心动02:072017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/05/03/59097011d6efc.jpeg都说爱情是一场博弈,不是你赢就是我输,不到最后都不知道赢家归谁。 49 说毕业不分手的人还在吗03:292017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/05/05/590c6b181311e.jpeg一个冬日的相遇,一个夏日的相恋,那些我和你一路携手走过的,是青春,也是毕业前的允诺。 50 三个人的友谊(上集)06:312017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/05/08/59104d37a6552.jpeg回忆里的青春大多是无奈但又难以忘却的,当一个人的暗恋出现了第三个人的参与,这场关于青春的秘密,需要有一个人去解开。 51 《三个人的友谊 》下集07:002017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/05/10/5912858436e7c.jpeg未完的青春,总要带一些遗憾才显得格外深刻。 52 换一种告白姿势02:302017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/05/11/591464ac72561.jpeg把 “我喜欢你”从口中说出来需要一种勇气,但要是把这四个字换一种语言说出来,告白的几率会不会更大些。 40 
比喝多点喝水更暖心的关心方式03:172017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/04/12/58ee2faf72972.jpeg与其叫她喝多点热水,不如直接去到她身边,陪伴才是最暖心的爱情方式。 55 所有快乐的方式都非你不可05:412017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/05/19/591eb7574ac00.jpeg在平凡无奇的生活里,却因为多了那一个可爱的人而变得越加逗趣。 54 当喜欢的人早就喜欢你05:152017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/05/17/591c28d63a23c.jpeg当喜欢的人早就喜欢你,哪怕把这首歌所有的音调弹成G调都是好听的。43 深爱的人总会先低头04:062017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/04/20/58f873061e396.jpeg爱情里为了对方低头认输并不是一件丢人的事情,只是为了能抓紧彼此的手走得更长久。 39 感情如人饮水,冷暖自知04:462017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/04/10/58eb95be7b4ce.jpeg曾经一起度过的每个生日,走过的每个路牌,原来转瞬间就可以消失不见,就像曾经的很爱到最后的不爱。 44 患得患失的那份感情叫做过去04:552017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/04/27/5901cdec9ee20.jpeg不是失去了才懂得珍惜,而是以为拥有的不会失去。 13 有一种门当户对,叫我们很配03:242017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/01/11/5875c3e054747.jpeg遇到你的时候,才知道,有一种门当户对,叫我们很配。17 比恋人更默契的是闺蜜02:282017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/12/589fd0f048a2e.jpeg世界上除了恋人还有比恋人更默契的是,陪你一起哭、一起笑、一起长大的闺蜜。16 新年快乐,我喜欢你02:292017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/01/589185dd53882.jpeg等到一年过去的时候才惊觉时间过得太快,就像喜欢了很久的他,从大街小巷踌躇到四下无人,只为了发一句看似群发的新年祝福。20 爱情的模样,要跨过手机屏幕才看得到02:372017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/18/58a7dd6228fbc.jpeg一段感情既然开始了,就不能随随便便结束。14 恋爱中迈出的一大步就是见到了你的素颜03:092017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/04/589564eec95f2.jpeg放下那些礼貌的装饰,把爱装在眼里,迈进心里。19 恋爱的温度,就是找回你的微笑03:572017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/18/58a7dd292926b.jpeg恋人之间总是不可避免地产生很多摩擦和矛盾,而化解矛盾的终极手段就是把对方的所有可爱的模样都放在眼里,放进心里,表达出来。18 喜欢你是真的,想让你开心也是真的03:562017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/13/58a15480dde1b.jpeg精心准备的情人节礼物,只为了让喜欢的人送给TA喜欢的人。21 失恋没什么大不了03:042017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/04/10/58eb36e715a2c.jpeg这段感情虽然没有开花结果,但却在各自的生命里留下了不可抹灭的印记。22 哪有那么多凑巧的事,有的只是我喜欢你04:262017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/22/58ad525431097.jpeg有多少藏在心底的喜欢因为缺少一份说出口的准备而成王败寇,男生也有口是心非的权利,只要结局是好的怎样都行。23 所有想对你说的话,其实只有我想你02:382017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/02/27/58b3e7a21387d.jpeg即便分居两地又怎样,我爱你胜过这场雨。 24 尽管你有很多坏习惯,但这也是我爱的你04:402017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/03/02/58b7d11c97f88.jpeg有一道公式叫做,不完美的你加上爱你的我等于完美的爱情。 25 有些感情,错过就不再03:452017-03-03http://cs.xinpianchang.com/uploadfile/article/2017/03/04/58ba79a510b75.jpeg明明是彼此相爱的两个人,却一而再地因为没有解开的误会越走越远······ 26 爱情是科学无法解释的事情03:002017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/07/58be2072d4962.jpeg有人说,爱情由肾上腺素决定出不出手,多巴胺决定天长地久,羟色胺决定谁先开口,说到底,都是化学反应。 27 真正的离开没有告别02:592017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/13/58c651ac9233d.jpeg悄无声息的离开也意味着感情的终结。28 感谢关照,后会有期04:482017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/13/58c6523aa0cbe.jpeg只道从此,渐行的时光里不再有你。29 套路是我学的,但撩你是真心的04:122017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/14/58c79e763e717.jpeg只要能霸占你所有的怀抱,哪怕崴了脚也能加速向前。08 为了对你好,我假装对所有人都好03:182016-12-23http://cs.xinpianchang.com/uploadfile/article/2016/10/14/5800f7c6ba939.jpeg在你眼中,我对所有人都好。但在所有人眼中,我只对你好。06 如果失恋就是世界末日02:102016-12-23http://cs.xinpianchang.com/uploadfile/article/2016/07/29/579b458c2b0d8.jpeg如果失恋就是世界末日那我要在末日前学会很多事我要学会接受自己不快乐的样子学会不再为迎合他的喜好而打扮学会善待自己的肠胃,而不只是费心琢磨他的口...31 暗恋是一个需要小心轻放的青春02:362017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/23/58d36542623ae.jpeg暗恋这件青春里无可逃避的小事,需要小心轻放。 07 
我永远记得你——前座男生02:372016-12-23http://cs.xinpianchang.com/uploadfile/article/2016/08/26/57c009adb1583.jpeg上学的时候,班里总有一个学习好,笑起来又很好看的男生。那时候的我不知道该如何靠近他,每次请他讲题都是我故意靠近他的借口。现在的我早已经记不得他口中...30 暗恋是一个人的独角戏,也可能是两个人的浪漫剧03:292017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/24/58d4b1fac4f94.jpeg暗恋一个人的时候,连他的每一个脚步声都记得一清二楚,谁又能想到,结局会让人这样的惊喜。 34 因爱情而生的宠物情缘03:442017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/28/58da25a5d6ea3.jpeg一段因爱情而生的宠物情缘。32 当造物集开始书写小情书03:152017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/03/27/58d8cb4c1e739.jpeg手作人玉头遇见摄影师左目,从两情相悦到岁月静好,造物集是他们永恒的小情书。35 一切都抵不过见面时你给的那个拥抱03:452017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/04/07/58e762cf61050.jpeg如果不想屈服于异地恋的魔爪,那就选个好天气往对方的城市靠近吧,拉近爱情需要战胜远距离! 37 时间的轨迹(上集)04:532017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/04/10/58eb3815b1288.jpeg回忆从前在校园里走过的路,听过的歌,原来一切过去已经很多年,但有些东西是时间改变不了的。下集内容请关注微博微信@小情书LOVOTE36 不经意的玩笑其实都暗藏深情05:002017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/04/07/58e7627a09f5a.jpeg那些让你不安的玩笑,其实字字都是我爱你。 09 忘记你,还是忘记爱情的味道?05:342016-12-23http://cs.xinpianchang.com/uploadfile/article/2016/11/11/58256a158d7ab.jpeg忘记太难,因为要走一遍来时的路。来时的路上处处有你的身影。我给了自己一个机会,回忆与你的全部往昔,等待可以好好和你说再见的那一天。38 时间的轨迹(下集)04:262017-04-07http://cs.xinpianchang.com/uploadfile/article/2017/04/07/58e7621a7dc26.jpeg回忆从前在校园里走过的路,听过的歌,原来一切过去已经很多年,但有些东西是时间改变不了的。10 愿冬天所有女生都能把手放在喜欢的人的口袋里02:592016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/01/16/587c578308741.jpeg喜欢是冬天里藏在手心的温暖,希望所有的女生都能在冬天把手放在喜欢的人的口袋里。41 迟到的人不必等03:062016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/04/20/58f85b349bd7a.jpeg晚到的人怎么会懂得等待,双方从一开始的目的地就已经相隔十万八千里。 33 恋爱是每一步都要走得舒服03:422016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/03/28/58da2533ecab8.jpeg恋爱是每一步都要走得舒服。15 “正在输入”是了然于心却要思前想后的喜欢03:092016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/02/01/589184ccb07db.jpeg想要借着拜年祝福把犹豫不决的心意打开,正巧的是你把我的“正在输入”都收在了眼里。42 错的时间遇见对的人04:082016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/04/20/58f85eed04128.jpeg在错的时间遇到了对的人,等待的结果会让人如愿吗? 45 爱从不因时间而停止03:592016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/04/27/5901cf71c5005.jpeg因为害怕分开的日子里无止的想念,所以提出分手,可是,你怎么知道他不会回来呢? 53 最合适的伴侣一直在身边04:232016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/05/17/591c28154ba19.jpeg不管是朋友还是情人,我都喜欢你。 46 习惯的最初是因为喜欢才发生的05:202017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/04/27/5901d5723ada3.jpeg那些青春淋漓的日子里,因为喜欢一个人而成全了更好的自己。 11 喜欢你是我做过最勇敢的事02:232016-12-23http://cs.xinpianchang.com/uploadfile/article/2017/01/16/587c57c8a2846.jpeg把“喜欢你”印在信封上,如果你看不到,我就说出来给你听。47 路过青春遇到你04:062017-05-19http://cs.xinpianchang.com/uploadfile/article/2017/04/28/5902e7ac8aa81.jpeg懵懂的岁月里,我们会遇到很多令我们怦然心动的人,我们会为了TA学习很多新技能,只为了跟TA有共同的话题;我们也会为了TA看完一场篮球赛,... 
-------------------------------------------------------------------------------- /litterlove/litterlove/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kba977/Scrapy_Projects/f06bb96d802c0722a399419d27dcae4682b65fc9/litterlove/litterlove/__init__.py -------------------------------------------------------------------------------- /litterlove/litterlove/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LitterloveItem(scrapy.Item): 12 | title = scrapy.Field() 13 | length = scrapy.Field() 14 | date = scrapy.Field() 15 | image_url = scrapy.Field() 16 | description = scrapy.Field() 17 | video = scrapy.Field() 18 | 19 | class VideoItem(scrapy.Item): 20 | resolution = scrapy.Field() 21 | video_url = scrapy.Field() -------------------------------------------------------------------------------- /litterlove/litterlove/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LitterloveSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /litterlove/litterlove/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import re 8 | import pymongo 9 | from scrapy.exceptions import DropItem 10 | from scrapy.conf import settings 11 | 12 | class DateFormatPipeline(object): 13 | def process_item(self, item, spider): 14 | item['title'] = re.split('[丨|]', item['title'])[-1].strip() 15 | item['date'] = item['date'].split(':')[-1] 16 | item['image_url'] = item['image_url'].split('@')[0] 17 | 18 | description = item['description'] 19 | if '内容简介' in description: 20 | description = re.findall(r'内容简介:(.*)', description, re.S)[0] 21 | if '更多' in description: 22 | description = re.findall(r'(.*)更多.*', description, re.S)[0] 23 | item['description'] = description.replace('\n', '') 24 | return item 25 | 26 | class MongoDBPipeline(object): 27 | def __init__(self): 28 | self.connection = pymongo.MongoClient( 29 | settings['MONGODB_SERVER'], 30 | settings['MONGODB_PORT'] 31 | ) 32 | self.db = self.connection[settings['MONGODB_DB']] 33 | 34 | def process_item(self, item, spider): 35 | valid = True 36 | for field in item: 37 | if not item.get(field): 38 | valid = False 39 | raise DropItem("Missing {0}!".format(field)) 40 | if valid: 41 | self.collection = self.db[settings['MONGODB_TABLE']] 42 | self.collection.insert_one(dict(item)) 43 | return item 44 | -------------------------------------------------------------------------------- /litterlove/litterlove/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for litterlove project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'litterlove' 13 | 14 | SPIDER_MODULES = ['litterlove.spiders'] 15 | NEWSPIDER_MODULE = 'litterlove.spiders' 16 | 17 | MONGODB_SERVER = "localhost" 18 | MONGODB_PORT = 27017 19 | MONGODB_DB = "LitterLove" 20 | MONGODB_TABLE = "videos" 21 | 22 | 23 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 24 | #USER_AGENT = 'litterlove (+http://www.yourdomain.com)' 25 | 26 | # Obey robots.txt rules 27 | ROBOTSTXT_OBEY = False 28 | 29 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 30 | #CONCURRENT_REQUESTS = 32 31 | 32 | # Configure a delay for requests for the same website (default: 0) 33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 34 | # See also autothrottle settings and docs 35 | #DOWNLOAD_DELAY = 3 36 | # The download delay setting will honor only one of: 37 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 38 | #CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | #COOKIES_ENABLED = False 42 | 43 | # Disable Telnet Console (enabled by default) 44 | #TELNETCONSOLE_ENABLED = False 45 | 46 | # Override the default request headers: 47 | #DEFAULT_REQUEST_HEADERS = { 48 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | # 'Accept-Language': 'en', 50 | #} 51 | 52 | # Enable or disable spider middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'litterlove.middlewares.LitterloveSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 60 | #DOWNLOADER_MIDDLEWARES = { 61 | # 'litterlove.middlewares.MyCustomDownloaderMiddleware': 543, 62 | #} 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | ITEM_PIPELINES = { 73 | 'litterlove.pipelines.DateFormatPipeline': 300, 74 | 'litterlove.pipelines.MongoDBPipeline': 500, 75 | } 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | #HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
97 | -------------------------------------------------------------------------------- /litterlove/litterlove/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /litterlove/litterlove/spiders/myspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from litterlove.items import LitterloveItem, VideoItem 4 | import json 5 | 6 | class MyspiderSpider(scrapy.Spider): 7 | name = "love" 8 | allowed_domains = ["xinpianchang.com"] 9 | start_urls = ['http://www.xinpianchang.com/index.php?app=user&ac=space&ajax=1&id=837979&page=%s' % i for i in range(1, 6)] 10 | 11 | def parse(self, response): 12 | # self.logger.info(response.url) 13 | lists = response.xpath('//li') 14 | 15 | for lst in lists: 16 | item = LitterloveItem() 17 | item['title'] = lst.xpath('a/img/@title').extract_first() 18 | item['length'] = lst.xpath('a/em/text()').extract_first() 19 | item['date'] = lst.xpath('.//span[@class="master-type-date"]/text()').extract_first() 20 | item['image_url'] = lst.xpath('a/img/@src').extract_first() 21 | item['description'] = lst.xpath('div[@class="master-type-intro master-type-intro-space"]/div/p/text()').extract_first() 22 | item['video'] = [] 23 | video_detail_url = lst.xpath('a/@href').extract_first() 24 | 25 | yield scrapy.Request( 26 | url = video_detail_url, 27 | meta = { 28 | 'item': item 29 | }, 30 | callback = self.video_detail_parse 31 | ) 32 | 33 | def video_detail_parse(self, response): 34 | item = response.meta.get('item') 35 | self.logger.info(response.url) 36 | videos = json.loads(response.xpath('//script').re_first(r'origins = (\[.*?\])')) 37 | for video in videos: 38 | video_item = VideoItem() 39 | video_item['video_url'] = video.get('qiniu_url', None) 40 | video_item['resolution'] = video.get('resolution', None) 41 | item['video'].append(video_item) 42 | yield item -------------------------------------------------------------------------------- /litterlove/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = litterlove.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = litterlove 12 | --------------------------------------------------------------------------------
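A minimal way to try the litterlove project end to end is sketched below. The script name run_love.py, the feed path out.json, and the decision to drop MongoDBPipeline (so no local mongod is required) are illustrative assumptions, not part of the repository; the sketch simply drives the "love" spider from spiders/myspider.py through Scrapy's CrawlerProcess, similar in spirit to running `scrapy crawl love -o data.json` from the project root.

# run_love.py -- illustrative sketch, not part of the repository.
# Assumes it is executed from the litterlove project root (next to scrapy.cfg)
# and that a JSON dry run without MongoDB is acceptable.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    settings = get_project_settings()
    # Keep the date/description clean-up pipeline, skip the MongoDB writer so
    # the crawl does not need a running mongod instance.
    settings.set('ITEM_PIPELINES', {'litterlove.pipelines.DateFormatPipeline': 300})
    # Export the scraped items to a JSON feed; 'out.json' is an arbitrary name.
    settings.set('FEED_FORMAT', 'json')
    settings.set('FEED_URI', 'out.json')

    process = CrawlerProcess(settings)
    process.crawl('love')  # spider name defined in litterlove/spiders/myspider.py
    process.start()        # blocks until the crawl finishes

With the MongoDB pipeline enabled instead, the MONGODB_* values from settings.py would be used as-is, so a mongod listening on localhost:27017 would be needed before starting the crawl.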