├── .gitignore ├── README.md ├── db.sql ├── scrapy.cfg └── spider_news_cctv ├── __init__.py ├── __init__.pyc ├── items.py ├── items.pyc ├── pipelines.py ├── pipelines.pyc ├── settings.py ├── settings.pyc └── spiders ├── __init__.py ├── __init__.pyc ├── xwlb.py ├── xwlb.pyc ├── xwlb1.py ├── xwlb1.pyc ├── xwlb2.py ├── xwlb2.pyc ├── xwlb3.py ├── xwlb3.pyc ├── xwlb4.py ├── xwlb4.pyc ├── xwlb5.py ├── xwlb5.pyc ├── xwlb6.py └── xwlb6.pyc /.gitignore: -------------------------------------------------------------------------------- 1 | news_xwlb.sql -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | ------------------------------- 4 | 5 | - The current `settings.py` has been debugged and is fairly stable; do not modify it lightly! 6 | - The **incremental** crawl switch is currently turned on for all spiders; if needed, it can be switched off manually via the `FLAG_INTERRUPT = True` constant in the `/spiders/***.py` files 7 | 8 | - 20110406 ~ 20130715 ~ now `scrapy crawl xwlb` http://cctv.cntv.cn/lm/xinwenlianbo/20110406.shtml 9 | - 20100613 ~ 20110405 `scrapy crawl xwlb1` http://news.cntv.cn/program/xwlb/20100613.shtml 10 | - 20100506 ~ 20100612 `scrapy crawl xwlb2` http://news.cntv.cn/program/xwlb/20100506.shtml 11 | - 20090626 ~ 20100505 `scrapy crawl xwlb3` http://news.cctv.com/program/xwlb/20090626.shtml 12 | - 20070831 ~ 20090625 `scrapy crawl xwlb4` http://www.cctv.com/news/xwlb/20070831/index.shtml 13 | 14 | // Some pages in the ranges below still cannot be parsed 15 | - 20061012 ~ 20070814 ~ 20070830 `scrapy crawl xwlb5` (20070814 cannot be parsed) http://www.cctv.com/news/xwlb/20061012/index.shtml 16 | - 20050609 ~ 20061011 is parsed the same way as 20061012 ~ 20070830, so it is merged into the same spider http://www.cctv.com/news/xwlb/20050609/index.shtml 17 | 18 | - 20020908 ~ 20050608 `scrapy crawl xwlb6` http://www.cctv.com/news/xwlb/20020908/index.shtml 19 | - If you run into any problems, email `hailong0707@gmail.com` 20 | -------------------------------------------------------------------------------- /db.sql: -------------------------------------------------------------------------------- 1 | -- mysql>source db.sql 2 | -- Use the DROP statement with care; uncomment it the first time you create the database 3 | -- DROP DATABASE IF EXISTS news; 4 | CREATE DATABASE news; 5 | USE news; 6 | CREATE TABLE news_xwlb( 7 | id int NOT NULL PRIMARY KEY AUTO_INCREMENT, 8 | day VARCHAR(255) NOT NULL, 9 | title VARCHAR(255) NOT NULL, 10 | keywords VARCHAR(255), 11 | url VARCHAR(255) NOT NULL, 12 | article TEXT 13 | ) CHARSET=utf8; 14 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = spider_news_cctv.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = spider_news_cctv 12 | -------------------------------------------------------------------------------- /spider_news_cctv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/__init__.py -------------------------------------------------------------------------------- /spider_news_cctv/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/__init__.pyc
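The schema created by `db.sql` above is the table the pipeline writes into (one row per news item, with `day`, `title`, `keywords`, `url`, `article`). A minimal sketch, not part of the repository, for checking how far a crawl has progressed; it reuses the same connection parameters that `pipelines.py` hard-codes, and relies on `day` being a YYYYMMDD string so that MAX() sorts chronologically:

# -*- coding: utf-8 -*-
# Hypothetical progress check, not part of this project: count the stored
# XWLB items and report the most recent broadcast day.
import MySQLdb

conn = MySQLdb.connect(user='root', passwd='123123', db='news', charset='utf8')
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*), MAX(day) FROM news_xwlb")
total, latest_day = cursor.fetchone()
print("%d items stored, latest day: %s" % (total, latest_day))
conn.close()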
-------------------------------------------------------------------------------- /spider_news_cctv/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SpiderNewsCctvItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | day = scrapy.Field() 15 | title = scrapy.Field() 16 | keywords = scrapy.Field() 17 | url = scrapy.Field() 18 | article = scrapy.Field() 19 | pass 20 | -------------------------------------------------------------------------------- /spider_news_cctv/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/items.pyc -------------------------------------------------------------------------------- /spider_news_cctv/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import threading 8 | import MySQLdb 9 | from scrapy import log 10 | 11 | class SpiderNewsCctvPipeline(object): 12 | 13 | INSERT_NEWS_XWLB = ("INSERT INTO news_xwlb (title, day, url, keywords, article) " 14 | "VALUES (%s, %s, %s, %s, %s)") 15 | 16 | lock = threading.RLock() 17 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 18 | conn.set_character_set('utf8') 19 | cursor = conn.cursor() 20 | cursor.execute('SET NAMES utf8;') 21 | cursor.execute('SET CHARACTER SET utf8;') 22 | cursor.execute('SET character_set_connection=utf8;') 23 | 24 | def insert(self, title, day, url, keywords, article): 25 | self.lock.acquire() 26 | news = (title, day, url, keywords, article) 27 | try: 28 | self.cursor.execute(self.INSERT_NEWS_XWLB, news) 29 | log.msg(title + " saved successfully", level=log.INFO) 30 | except: 31 | log.msg("MySQL exception !!!", level=log.ERROR) 32 | self.lock.release() 33 | 34 | def process_item(self, item, spider): 35 | title = item['title'] 36 | day = item['day'] 37 | url = item['url'] 38 | keywords = item['keywords'] 39 | article = item['article'] 40 | self.insert(title, day, url, keywords, article) 41 | return item 42 | -------------------------------------------------------------------------------- /spider_news_cctv/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/pipelines.pyc -------------------------------------------------------------------------------- /spider_news_cctv/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for spider_news_cctv project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. 
All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'spider_news_cctv' 12 | 13 | SPIDER_MODULES = ['spider_news_cctv.spiders'] 14 | NEWSPIDER_MODULE = 'spider_news_cctv.spiders' 15 | ITEM_PIPELINES = ['spider_news_cctv.pipelines.SpiderNewsCctvPipeline'] 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | #USER_AGENT = 'spider_news_cctv (+http://www.yourdomain.com)' 19 | LOG_LEVEL = 'INFO' 20 | # LOG_FILE = 'info.log' 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'spider_news_finance (+http://www.yourdomain.com)' 24 | USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:24.0) Gecko/20100101 Firefox/24.0" 25 | # DOWNLOAD_DELAY = 1 26 | COOKIES_ENABLED = False 27 | -------------------------------------------------------------------------------- /spider_news_cctv/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/settings.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spider_news_cctv/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/__init__.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from scrapy import log 5 | import threading 6 | import MySQLdb 7 | from datetime import date, timedelta 8 | import re 9 | from spider_news_cctv.items import SpiderNewsCctvItem 10 | 11 | class XwlbSpider(scrapy.Spider): 12 | name = "xwlb" 13 | allowed_domains = ["news.cntv.cn"] 14 | start_urls = ( 15 | 'http://cctv.cntv.cn/lm/xinwenlianbo/20150601.shtml', 16 | ) 17 | 18 | FLAG_INTERRUPT = False 19 | SELECT_NEWS_XWLB_BY_TITLE = "SELECT * FROM news_xwlb WHERE title='%s'" 20 | 21 | lock = threading.RLock() 22 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 23 | conn.set_character_set('utf8') 24 | cursor = conn.cursor() 25 | cursor.execute('SET NAMES utf8;') 26 | cursor.execute('SET CHARACTER SET utf8;') 27 | cursor.execute('SET character_set_connection=utf8;') 28 | 29 | URL_TEMPLATE = 'http://cctv.cntv.cn/lm/xinwenlianbo/%s.shtml' 30 | day = timedelta(1) 31 | now = date.today() 32 | 33 | def is_news_not_saved(self, title): 34 | if self.FLAG_INTERRUPT: 35 | self.lock.acquire() 36 | rows = self.cursor.execute(self.SELECT_NEWS_XWLB_BY_TITLE % (title)) 37 | if rows > 0: 38 | log.msg("XWLB news saved all finished.", level=log.INFO) 39 | return False 40 | else: 41 | return True 42 | self.lock.release() 43 | else: 44 | return True 45 | 46 | def parse_news(self, response): 47 | log.msg("Start to 
parse news " + response.url, level=log.INFO) 48 | item = SpiderNewsCctvItem() 49 | day = title = keywords = url = article = '' 50 | url = response.url 51 | day = response.meta['day'] 52 | title = response.meta['title'] 53 | response = response.body.decode('utf-8') 54 | soup = BeautifulSoup(response) 55 | try: 56 | items_keywords = soup.find(class_='tags dot_x_t').find_all('li') 57 | for i in range(0, len(items_keywords)): 58 | keywords += items_keywords[i].text.strip() 59 | except: 60 | log.msg("News " + title + " dont has keywords!", level=log.INFO) 61 | try: 62 | article = soup.find(class_='body').text.strip() 63 | except: 64 | log.msg("News " + title + " dont has article!", level=log.INFO) 65 | item['title'] = title 66 | item['day'] = day 67 | item['url'] = url 68 | item['keywords'] = keywords 69 | item['article'] = article 70 | return item 71 | 72 | def parse(self, response): 73 | self.now = self.now - self.day 74 | str_now = self.now.strftime('%Y%m%d') 75 | next_parse = [] 76 | if (str_now == '20130715'): 77 | pass 78 | else: 79 | if self.now != (date.today() - self.day): 80 | try: 81 | res = response.body 82 | soup = BeautifulSoup(res) 83 | items = soup.find(class_=re.compile('title_list_box')).find_all("li") 84 | for i in range(1, len(items)): 85 | item_url = items[i].a['href'] 86 | title = items[i].a.text.strip() 87 | if not self.is_news_not_saved(title): 88 | return next_parse 89 | next_parse.append(self.make_requests_from_url(item_url).replace(callback=self.parse_news, meta={'day': (self.now + self.day).strftime('%Y%m%d'), 'title': title})) 90 | except: 91 | log.msg("Page " + response.url + " parse error!", level=log.ERROR) 92 | url = self.URL_TEMPLATE % self.now.strftime('%Y%m%d') 93 | log.msg("Start to parse page " + url, level=log.INFO) 94 | next_parse.append(self.make_requests_from_url(url)) 95 | return next_parse 96 | -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/xwlb.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from scrapy import log 5 | import threading 6 | import MySQLdb 7 | from datetime import date, timedelta 8 | import re 9 | from spider_news_cctv.items import SpiderNewsCctvItem 10 | 11 | class Xwlb1Spider(scrapy.Spider): 12 | name = "xwlb1" 13 | allowed_domains = ["news.cntv.cn"] 14 | start_urls = ( 15 | 'http://news.cntv.cn/program/xwlb/20100613.shtml', 16 | ) 17 | 18 | FLAG_INTERRUPT = False 19 | SELECT_NEWS_XWLB_BY_TITLE = "SELECT * FROM news_xwlb WHERE title='%s'" 20 | 21 | lock = threading.RLock() 22 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 23 | conn.set_character_set('utf8') 24 | cursor = conn.cursor() 25 | cursor.execute('SET NAMES utf8;') 26 | cursor.execute('SET CHARACTER SET utf8;') 27 | cursor.execute('SET character_set_connection=utf8;') 28 | 29 | URL_TEMPLATE = 'http://news.cntv.cn/program/xwlb/%s.shtml' 30 | day = timedelta(1) 31 | now = date(2011, 4, 5) 32 | 33 | def is_news_not_saved(self, title): 34 | if self.FLAG_INTERRUPT: 35 | self.lock.acquire() 36 | rows = 
self.cursor.execute(self.SELECT_NEWS_XWLB_BY_TITLE % (title)) 37 | if rows > 0: 38 | log.msg("XWLB news saved all finished.", level=log.INFO) 39 | return False 40 | else: 41 | return True 42 | self.lock.release() 43 | else: 44 | return True 45 | 46 | def parse_news(self, response): 47 | log.msg("Start to parse news " + response.url, level=log.INFO) 48 | item = SpiderNewsCctvItem() 49 | day = title = keywords = url = article = '' 50 | url = response.url 51 | day = response.meta['day'] 52 | title = response.meta['title'] 53 | response = response.body 54 | soup = BeautifulSoup(response) 55 | try: 56 | items_keywords = soup.find(class_='tags dot_x_t').find_all('li') 57 | for i in range(0, len(items_keywords)): 58 | keywords += items_keywords[i].text.strip() 59 | except: 60 | log.msg("News " + title + " dont has keywords!", level=log.INFO) 61 | try: 62 | article = soup.find(class_='body').text.strip() 63 | except: 64 | log.msg("News " + title + " dont has article!", level=log.INFO) 65 | item['title'] = title 66 | item['day'] = day 67 | item['url'] = url 68 | item['keywords'] = keywords 69 | item['article'] = article 70 | return item 71 | 72 | def parse(self, response): 73 | str_now = self.now.strftime('%Y%m%d') 74 | self.now = self.now - self.day 75 | next_parse = [] 76 | if (str_now == '20100612'): 77 | pass 78 | else: 79 | if self.now != (date(2011, 4, 5) - self.day): 80 | try: 81 | response = response.body 82 | soup = BeautifulSoup(response) 83 | items = soup.find(class_=re.compile('title_list_box')).find_all("li") 84 | for i in range(1, len(items)): 85 | item_url = items[i].a['href'] 86 | title = items[i].a.text.strip() 87 | if not self.is_news_not_saved(title): 88 | return next_parse 89 | next_parse.append(self.make_requests_from_url(item_url).replace(callback=self.parse_news, meta={'day': (self.now + self.day).strftime('%Y%m%d'), 'title': title})) 90 | except: 91 | log.msg("Page " + response.url + " parse error!", level=log.ERROR) 92 | url = self.URL_TEMPLATE % self.now.strftime('%Y%m%d') 93 | log.msg("Start to parse page " + url, level=log.INFO) 94 | next_parse.append(self.make_requests_from_url(url)) 95 | return next_parse 96 | -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/xwlb1.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from scrapy import log 5 | import threading 6 | import MySQLdb 7 | from datetime import date, timedelta 8 | import re 9 | from spider_news_cctv.items import SpiderNewsCctvItem 10 | 11 | 12 | class Xwlb2Spider(scrapy.Spider): 13 | name = "xwlb2" 14 | allowed_domains = ["news.cntv.cn"] 15 | start_urls = ( 16 | 'http://news.cntv.cn/program/xwlb/20100506.shtml', 17 | ) 18 | 19 | FLAG_INTERRUPT = False 20 | SELECT_NEWS_XWLB_BY_TITLE = "SELECT * FROM news_xwlb WHERE title='%s'" 21 | 22 | lock = threading.RLock() 23 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 24 | conn.set_character_set('utf8') 25 | cursor = conn.cursor() 26 | cursor.execute('SET NAMES utf8;') 27 | cursor.execute('SET CHARACTER SET utf8;') 28 | 
cursor.execute('SET character_set_connection=utf8;') 29 | 30 | URL_TEMPLATE = 'http://news.cntv.cn/program/xwlb/%s.shtml' 31 | day = timedelta(1) 32 | now = date(2010, 6, 12) 33 | 34 | def is_news_not_saved(self, title): 35 | if self.FLAG_INTERRUPT: 36 | self.lock.acquire() 37 | rows = self.cursor.execute(self.SELECT_NEWS_XWLB_BY_TITLE % (title)) 38 | if rows > 0: 39 | log.msg("XWLB news saved all finished.", level=log.INFO) 40 | return False 41 | else: 42 | return True 43 | self.lock.release() 44 | else: 45 | return True 46 | 47 | def parse_news(self, response): 48 | log.msg("Start to parse news " + response.url, level=log.INFO) 49 | item = SpiderNewsCctvItem() 50 | day = title = keywords = url = article = '' 51 | url = response.url 52 | day = response.meta['day'] 53 | title = response.meta['title'] 54 | response = response.body 55 | soup = BeautifulSoup(response) 56 | try: 57 | items_keywords = soup.find(class_='tags dot_x_t').find_all('li') 58 | for i in range(0, len(items_keywords)): 59 | keywords += items_keywords[i].text.strip() 60 | except: 61 | log.msg("News " + title + " dont has keywords!", level=log.INFO) 62 | try: 63 | article = soup.find(class_='text_box padd').text.strip() 64 | except: 65 | log.msg("News " + title + " dont has article!", level=log.INFO) 66 | item['title'] = title 67 | item['day'] = day 68 | item['url'] = url 69 | item['keywords'] = keywords 70 | item['article'] = article 71 | return item 72 | 73 | def parse(self, response): 74 | str_now = self.now.strftime('%Y%m%d') 75 | self.now = self.now - self.day 76 | next_parse = [] 77 | if (str_now == '20100505'): 78 | pass 79 | else: 80 | if self.now != (date(2010, 6, 12) - self.day): 81 | try: 82 | response = response.body 83 | soup = BeautifulSoup(response) 84 | items = soup.find(class_=re.compile('title_list_box')).find_all("li") 85 | for i in range(1, len(items)): 86 | item_url = items[i].a['href'] 87 | title = items[i].a.text.strip() 88 | if not self.is_news_not_saved(title): 89 | return next_parse 90 | next_parse.append(self.make_requests_from_url(item_url).replace(callback=self.parse_news, meta={'day': (self.now + self.day).strftime('%Y%m%d'), 'title': title})) 91 | except: 92 | log.msg("Page " + response.url + " parse error!", level=log.ERROR) 93 | url = self.URL_TEMPLATE % self.now.strftime('%Y%m%d') 94 | log.msg("Start to parse page " + url, level=log.INFO) 95 | next_parse.append(self.make_requests_from_url(url)) 96 | return next_parse 97 | -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/xwlb2.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from scrapy import log 5 | import threading 6 | import MySQLdb 7 | from datetime import date, timedelta 8 | import re 9 | from spider_news_cctv.items import SpiderNewsCctvItem 10 | 11 | class Xwlb3Spider(scrapy.Spider): 12 | name = "xwlb3" 13 | allowed_domains = ["news.cctv.com"] 14 | start_urls = ( 15 | 'http://news.cctv.com/program/xwlb/20090626.shtml', 16 | ) 17 | 18 | FLAG_INTERRUPT = False 19 | SELECT_NEWS_XWLB_BY_TITLE = "SELECT * FROM news_xwlb WHERE 
title='%s'" 20 | 21 | lock = threading.RLock() 22 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 23 | conn.set_character_set('utf8') 24 | cursor = conn.cursor() 25 | cursor.execute('SET NAMES utf8;') 26 | cursor.execute('SET CHARACTER SET utf8;') 27 | cursor.execute('SET character_set_connection=utf8;') 28 | 29 | URL_TEMPLATE = 'http://news.cctv.com/program/xwlb/%s.shtml' 30 | day = timedelta(1) 31 | now = date(2010, 5, 5) 32 | 33 | def is_news_not_saved(self, title): 34 | if self.FLAG_INTERRUPT: 35 | self.lock.acquire() 36 | rows = self.cursor.execute(self.SELECT_NEWS_XWLB_BY_TITLE % (title)) 37 | if rows > 0: 38 | log.msg("XWLB news saved all finished.", level=log.INFO) 39 | return False 40 | else: 41 | return True 42 | self.lock.release() 43 | else: 44 | return True 45 | 46 | def parse_news(self, response): 47 | log.msg("Start to parse news " + response.url, level=log.INFO) 48 | item = SpiderNewsCctvItem() 49 | day = title = keywords = url = article = '' 50 | url = response.url 51 | day = response.meta['day'] 52 | title = response.meta['title'] 53 | response = response.body 54 | soup = BeautifulSoup(response) 55 | try: 56 | items_keywords = soup.find(class_='tags dot_x_t').find_all('li') 57 | for i in range(0, len(items_keywords)): 58 | keywords += items_keywords[i].text.strip() 59 | except: 60 | log.msg("News " + title + " dont has keywords!", level=log.INFO) 61 | try: 62 | article = soup.find(class_='style_p2').text.strip() 63 | except: 64 | log.msg("News " + title + " dont has article!", level=log.INFO) 65 | item['title'] = title 66 | item['day'] = day 67 | item['url'] = url 68 | item['keywords'] = keywords 69 | item['article'] = article 70 | return item 71 | 72 | def parse(self, response): 73 | str_now = self.now.strftime('%Y%m%d') 74 | self.now = self.now - self.day 75 | next_parse = [] 76 | if (str_now == '20090625'): 77 | pass 78 | else: 79 | if self.now != (date(2010, 5, 5) - self.day): 80 | try: 81 | response = response.body 82 | soup = BeautifulSoup(response) 83 | items = soup.find(class_=re.compile('title_list_box')).find_all("li") 84 | for i in range(1, len(items)): 85 | item_url = items[i].a['href'] 86 | title = items[i].a.text.strip() 87 | if not self.is_news_not_saved(title): 88 | return next_parse 89 | next_parse.append(self.make_requests_from_url(item_url).replace(callback=self.parse_news, meta={'day': (self.now + self.day).strftime('%Y%m%d'), 'title': title})) 90 | except: 91 | log.msg("Page " + response.url + " parse error!", level=log.ERROR) 92 | url = self.URL_TEMPLATE % self.now.strftime('%Y%m%d') 93 | log.msg("Start to parse page " + url, level=log.INFO) 94 | next_parse.append(self.make_requests_from_url(url)) 95 | return next_parse 96 | -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb3.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/xwlb3.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from scrapy import log 5 | import threading 6 | import MySQLdb 7 | from datetime import date, timedelta 8 | import re 9 | from spider_news_cctv.items import SpiderNewsCctvItem 10 | 11 | 
class Xwlb4Spider(scrapy.Spider): 12 | name = "xwlb4" 13 | allowed_domains = ["www.cctv.com"] 14 | start_urls = ( 15 | 'http://www.cctv.com/news/xwlb/20070831/index.shtml', 16 | ) 17 | 18 | FLAG_INTERRUPT = False 19 | SELECT_NEWS_XWLB_BY_TITLE = "SELECT * FROM news_xwlb WHERE title='%s'" 20 | 21 | lock = threading.RLock() 22 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 23 | conn.set_character_set('utf8') 24 | cursor = conn.cursor() 25 | cursor.execute('SET NAMES utf8;') 26 | cursor.execute('SET CHARACTER SET utf8;') 27 | cursor.execute('SET character_set_connection=utf8;') 28 | 29 | URL_TEMPLATE = 'http://www.cctv.com/news/xwlb/%s/index.shtml' 30 | day = timedelta(1) 31 | now = date(2009, 6, 25) 32 | 33 | def is_news_not_saved(self, title): 34 | if self.FLAG_INTERRUPT: 35 | self.lock.acquire() 36 | rows = self.cursor.execute(self.SELECT_NEWS_XWLB_BY_TITLE % (title)) 37 | if rows > 0: 38 | log.msg("XWLB news saved all finished.", level=log.INFO) 39 | return False 40 | else: 41 | return True 42 | self.lock.release() 43 | else: 44 | return True 45 | 46 | def parse_news(self, response): 47 | log.msg("Start to parse news " + response.url, level=log.INFO) 48 | item = SpiderNewsCctvItem() 49 | day = title = keywords = url = article = '' 50 | url = response.url 51 | day = response.meta['day'] 52 | title = response.meta['title'] 53 | response = response.body 54 | soup = BeautifulSoup(response) 55 | try: 56 | items_keywords = soup.find(class_='tags dot_x_t').find_all('li') 57 | for i in range(0, len(items_keywords)): 58 | keywords += items_keywords[i].text.strip() 59 | except: 60 | log.msg("News " + title + " dont has keywords!", level=log.INFO) 61 | try: 62 | article = soup.find(class_='style_p2').text.strip() 63 | except: 64 | log.msg("News " + title + " dont has article!", level=log.INFO) 65 | item['title'] = title 66 | item['day'] = day 67 | item['url'] = url 68 | item['keywords'] = keywords 69 | item['article'] = article 70 | return item 71 | 72 | def parse(self, response): 73 | str_now = self.now.strftime('%Y%m%d') 74 | self.now = self.now - self.day 75 | next_parse = [] 76 | if (str_now == '20070830'): 77 | pass 78 | else: 79 | if self.now != (date(2009, 6, 25) - self.day): 80 | try: 81 | items = response.selector.xpath('//a[contains(@href, "http://news.cctv.com/xwlb")]').extract() 82 | for i in range(1, len(items)): 83 | soup = BeautifulSoup(items[i]) 84 | item_url = soup.a['href'] 85 | title = soup.a.text.strip() 86 | if not self.is_news_not_saved(title): 87 | return next_parse 88 | next_parse.append(self.make_requests_from_url(item_url).replace(callback=self.parse_news, meta={'day': (self.now + self.day).strftime('%Y%m%d'), 'title': title})) 89 | except: 90 | log.msg("Page " + response.url + " parse error!", level=log.ERROR) 91 | url = self.URL_TEMPLATE % self.now.strftime('%Y%m%d') 92 | log.msg("Start to parse page " + url, level=log.INFO) 93 | next_parse.append(self.make_requests_from_url(url)) 94 | return next_parse 95 | -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb4.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/xwlb4.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb5.py: -------------------------------------------------------------------------------- 1 
| # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from scrapy import log 5 | import threading 6 | import MySQLdb 7 | from datetime import date, timedelta 8 | import re 9 | from spider_news_cctv.items import SpiderNewsCctvItem 10 | 11 | class Xwlb5Spider(scrapy.Spider): 12 | name = "xwlb5" 13 | allowed_domains = ["www.cctv.com"] 14 | start_urls = ( 15 | 'http://www.cctv.com/news/xwlb/20061012/index.shtml', 16 | ) 17 | 18 | FLAG_INTERRUPT = False 19 | SELECT_NEWS_XWLB_BY_TITLE = "SELECT * FROM news_xwlb WHERE title='%s'" 20 | 21 | lock = threading.RLock() 22 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 23 | conn.set_character_set('utf8') 24 | cursor = conn.cursor() 25 | cursor.execute('SET NAMES utf8;') 26 | cursor.execute('SET CHARACTER SET utf8;') 27 | cursor.execute('SET character_set_connection=utf8;') 28 | 29 | URL_TEMPLATE = 'http://www.cctv.com/news/xwlb/%s/index.shtml' 30 | day = timedelta(1) 31 | now = date(2007, 8, 30) 32 | 33 | def is_news_not_saved(self, title): 34 | if self.FLAG_INTERRUPT: 35 | self.lock.acquire() 36 | rows = self.cursor.execute(self.SELECT_NEWS_XWLB_BY_TITLE % (title)) 37 | if rows > 0: 38 | log.msg("XWLB news saved all finished.", level=log.INFO) 39 | return False 40 | else: 41 | return True 42 | self.lock.release() 43 | else: 44 | return True 45 | 46 | def parse_news(self, response): 47 | log.msg("Start to parse news " + response.url, level=log.INFO) 48 | item = SpiderNewsCctvItem() 49 | day = title = keywords = url = article = '' 50 | url = response.url 51 | day = response.meta['day'] 52 | title = response.meta['title'] 53 | response = response.body 54 | soup = BeautifulSoup(response) 55 | try: 56 | items_keywords = soup.find(class_='tags dot_x_t').find_all('li') 57 | for i in range(0, len(items_keywords)): 58 | keywords += items_keywords[i].text.strip() 59 | except: 60 | log.msg("News " + title + " dont has keywords!", level=log.INFO) 61 | try: 62 | article = soup.find(id='content').text.strip() 63 | except: 64 | log.msg("News " + title + " dont has article!", level=log.INFO) 65 | item['title'] = title 66 | item['day'] = day 67 | item['url'] = url 68 | item['keywords'] = keywords 69 | item['article'] = article 70 | return item 71 | 72 | def parse(self, response): 73 | str_now = self.now.strftime('%Y%m%d') 74 | self.now = self.now - self.day 75 | if self.now == date(2007, 8, 14): 76 | self.now = self.now - self.day 77 | next_parse = [] 78 | if (str_now == '20050608'): 79 | pass 80 | else: 81 | if self.now != (date(2007, 8, 30) - self.day): 82 | try: 83 | # response = response.body 84 | # soup = BeautifulSoup(response) 85 | # items = soup.find(class_=re.compile('title_list_box')).find_all("li") 86 | items = response.selector.xpath('//a[contains(@href, "http://news.cctv.com/xwlb")]').extract() 87 | for i in range(1, len(items)): 88 | soup = BeautifulSoup(items[i]) 89 | item_url = soup.a['href'] 90 | title = soup.a.text.strip() 91 | if not self.is_news_not_saved(title): 92 | return next_parse 93 | next_parse.append(self.make_requests_from_url(item_url).replace(callback=self.parse_news, meta={'day': (self.now + self.day).strftime('%Y%m%d'), 'title': title})) 94 | except: 95 | log.msg("Page " + response.url + " parse error!", level=log.ERROR) 96 | url = self.URL_TEMPLATE % self.now.strftime('%Y%m%d') 97 | log.msg("Start to parse page " + url, level=log.INFO) 98 | next_parse.append(self.make_requests_from_url(url)) 99 | return next_parse 100 | 
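Like the other spiders, xwlb5.py walks backwards one day at a time from a fixed start date, fills the date into URL_TEMPLATE, and stops at its boundary date; it additionally skips 20070814, which cannot be parsed. A standalone sketch of that date walk (illustration only, not code from the repository; the dates and URL pattern below mirror xwlb5.py):

# -*- coding: utf-8 -*-
# Illustrative sketch of the backward day-by-day walk the xwlb* spiders use.
from datetime import date, timedelta

URL_TEMPLATE = 'http://www.cctv.com/news/xwlb/%s/index.shtml'
STOP_DAY = date(2005, 6, 9)        # oldest day xwlb5 covers (README: 20050609)
SKIP_DAYS = {date(2007, 8, 14)}    # known-unparseable page, skipped by xwlb5

def daily_urls(start=date(2007, 8, 30)):
    day = start
    while day >= STOP_DAY:
        if day not in SKIP_DAYS:
            yield URL_TEMPLATE % day.strftime('%Y%m%d')
        day -= timedelta(1)

# Example: the first three URLs yielded are the pages for 20070830, 20070829, 20070828.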
-------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb5.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/xwlb5.pyc -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb6.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from scrapy import log 5 | import threading 6 | import MySQLdb 7 | from datetime import date, timedelta 8 | import re 9 | from spider_news_cctv.items import SpiderNewsCctvItem 10 | 11 | class Xwlb6Spider(scrapy.Spider): 12 | name = "xwlb6" 13 | allowed_domains = ["www.cctv.com"] 14 | start_urls = ( 15 | 'http://www.cctv.com/news/xwlb/20020908/index.shtml/', 16 | ) 17 | 18 | FLAG_INTERRUPT = False 19 | SELECT_NEWS_XWLB_BY_TITLE = "SELECT * FROM news_xwlb WHERE title='%s'" 20 | 21 | lock = threading.RLock() 22 | conn=MySQLdb.connect(user='root', passwd='123123', db='news', autocommit=True) 23 | conn.set_character_set('utf8') 24 | cursor = conn.cursor() 25 | cursor.execute('SET NAMES utf8;') 26 | cursor.execute('SET CHARACTER SET utf8;') 27 | cursor.execute('SET character_set_connection=utf8;') 28 | 29 | URL_TEMPLATE = 'http://www.cctv.com/news/xwlb/%s/index.shtml' 30 | day = timedelta(1) 31 | now = date(2005, 6, 8) 32 | 33 | def is_news_not_saved(self, title): 34 | if self.FLAG_INTERRUPT: 35 | self.lock.acquire() 36 | rows = self.cursor.execute(self.SELECT_NEWS_XWLB_BY_TITLE % (title)) 37 | if rows > 0: 38 | log.msg("XWLB news saved all finished.", level=log.INFO) 39 | return False 40 | else: 41 | return True 42 | self.lock.release() 43 | else: 44 | return True 45 | 46 | def parse_news(self, response): 47 | log.msg("Start to parse news " + response.url, level=log.INFO) 48 | item = SpiderNewsCctvItem() 49 | day = title = keywords = url = article = '' 50 | url = response.url 51 | day = response.meta['day'] 52 | title = response.meta['title'] 53 | response = response.body 54 | soup = BeautifulSoup(response) 55 | try: 56 | items_keywords = soup.find(class_='tags dot_x_t').find_all('li') 57 | for i in range(0, len(items_keywords)): 58 | keywords += items_keywords[i].text.strip() 59 | except: 60 | log.msg("News " + title + " dont has keywords!", level=log.INFO) 61 | try: 62 | article = soup.find(class_=re.compile('line3|large')).text.strip() 63 | except: 64 | log.msg("News " + title + " dont has article!", level=log.INFO) 65 | item['title'] = title 66 | item['day'] = day 67 | item['url'] = url 68 | item['keywords'] = keywords 69 | item['article'] = article 70 | return item 71 | 72 | def parse(self, response): 73 | str_now = self.now.strftime('%Y%m%d') 74 | self.now = self.now - self.day 75 | next_parse = [] 76 | if (str_now == '20020907'): 77 | pass 78 | else: 79 | if self.now != (date(2005, 6, 8) - self.day): 80 | try: 81 | # response = response.body 82 | # soup = BeautifulSoup(response) 83 | # items = soup.find(class_=re.compile('title_list_box')).find_all("li") 84 | items = response.selector.xpath('//a[contains(@href, "/news/xwlb/")]').extract() 85 | for i in range(1, len(items)): 86 | soup = BeautifulSoup(items[i]) 87 | item_url = "http://www.cctv.com" + soup.a['href'] 88 | title = soup.a.text.strip() 89 | if not self.is_news_not_saved(title): 90 | return next_parse 91 | 
next_parse.append(self.make_requests_from_url(item_url).replace(callback=self.parse_news, meta={'day': (self.now + self.day).strftime('%Y%m%d'), 'title': title})) 92 | except: 93 | log.msg("Page " + response.url + " parse error!", level=log.ERROR) 94 | url = self.URL_TEMPLATE % self.now.strftime('%Y%m%d') 95 | log.msg("Start to parse page " + url, level=log.INFO) 96 | next_parse.append(self.make_requests_from_url(url)) 97 | return next_parse 98 | -------------------------------------------------------------------------------- /spider_news_cctv/spiders/xwlb6.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailong0707-zz/spider_news_cctv/94fe6ad8d0b186afac6c913dd3f86b265d0ea836/spider_news_cctv/spiders/xwlb6.pyc --------------------------------------------------------------------------------
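One caveat that applies to every spider above: `is_news_not_saved` builds its SELECT by interpolating the title directly into the SQL string (which breaks on titles containing quotes), and its `lock.release()` sits after the `return` statements, so it is never reached. A hedged sketch of an alternative, assuming the same `cursor`, `lock` and `FLAG_INTERRUPT` attributes the xwlb* spiders define; this is not the repository's code:

def is_news_not_saved(self, title):
    # Sketch only: parameterized query instead of string interpolation,
    # and a `with` block so the RLock is released on every code path.
    if not self.FLAG_INTERRUPT:
        return True
    with self.lock:
        rows = self.cursor.execute(
            "SELECT 1 FROM news_xwlb WHERE title = %s", (title,))
        if rows > 0:
            # Title already stored: signal the caller to stop the incremental crawl.
            return False
        return True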