├── news_web ├── lib │ ├── utils.py │ ├── __init__.py │ ├── error_code.py │ ├── db_utils.py │ └── response.py ├── news_web │ ├── __init__.py │ ├── wsgi.py │ ├── urls.py │ ├── middlewares.py │ └── settings.py ├── web_server │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ ├── admin.py │ ├── tests.py │ ├── apps.py │ ├── urls.py │ ├── views.py │ └── models.py ├── run_server.sh ├── init_db.py ├── manage.py └── frontend │ ├── article.html │ └── subscription.html ├── newscrawler ├── newscrawler │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── netease.py │ │ └── qq.py │ ├── utils.py │ ├── items.py │ ├── wechat_config.py │ ├── wechat_push.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py ├── worker.py ├── scrapy.cfg └── start_crawl.py ├── README.md ├── 论文相关文件 ├── 用例图.png ├── 爬虫部分目录.png ├── 系统总体框架.png ├── MongoDB.png ├── nginx配置.png ├── scrapy架构.png ├── spider实现.png ├── 启动API服务器.png ├── 启动spider.png ├── 新闻推送活动图.png ├── 新闻订阅活动图.png ├── WechatIMG37.png ├── WechatIMG38.png ├── WechatIMG39.png ├── WechatIMG40.png ├── useragent.png ├── 订阅与展示部分目录.png ├── WechatIMG41.jpeg ├── WechatIMG42.jpeg ├── WechatIMG43.jpeg ├── WechatIMG44.jpeg └── 基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf ├── .gitmodules ├── requirements.txt ├── LICENSE └── .gitignore /news_web/lib/utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/news_web/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/web_server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/web_server/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NewsCrawler 2 | 毕业设计 基于网络爬虫的新闻采集和订阅系统的设计与实现 3 | -------------------------------------------------------------------------------- /论文相关文件/用例图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/用例图.png -------------------------------------------------------------------------------- /论文相关文件/爬虫部分目录.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/爬虫部分目录.png -------------------------------------------------------------------------------- /论文相关文件/系统总体框架.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/系统总体框架.png -------------------------------------------------------------------------------- /news_web/web_server/admin.py: 
-------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /news_web/web_server/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /论文相关文件/MongoDB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/MongoDB.png -------------------------------------------------------------------------------- /论文相关文件/nginx配置.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/nginx配置.png -------------------------------------------------------------------------------- /论文相关文件/scrapy架构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/scrapy架构.png -------------------------------------------------------------------------------- /论文相关文件/spider实现.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/spider实现.png -------------------------------------------------------------------------------- /论文相关文件/启动API服务器.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/启动API服务器.png -------------------------------------------------------------------------------- /论文相关文件/启动spider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/启动spider.png -------------------------------------------------------------------------------- /论文相关文件/新闻推送活动图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/新闻推送活动图.png -------------------------------------------------------------------------------- /论文相关文件/新闻订阅活动图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/新闻订阅活动图.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG37.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG38.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG39.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG40.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG40.png -------------------------------------------------------------------------------- /论文相关文件/useragent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/useragent.png -------------------------------------------------------------------------------- /论文相关文件/订阅与展示部分目录.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/订阅与展示部分目录.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG41.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG41.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG42.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG42.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG43.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG43.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG44.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG44.jpeg -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "IPProxyTool"] 2 | path = IPProxyTool 3 | url = https://github.com/awolfly9/IPProxyTool.git 4 | -------------------------------------------------------------------------------- /论文相关文件/基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf -------------------------------------------------------------------------------- /news_web/run_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "启动服务器" 3 | #python manage.py runserver 0.0.0.0:8000 4 | gunicorn news_web.wsgi:application -w 4 -b :8000 -k gevent --max-requests 1000 5 | -------------------------------------------------------------------------------- /news_web/init_db.py: -------------------------------------------------------------------------------- 1 | from web_server.models import NewsItem, Subscription 2 | 3 | 4 | if __name__ == '__main__': 5 | NewsItem.ensure_indexes() 6 | Subscription.ensure_indexes() 7 | -------------------------------------------------------------------------------- /news_web/web_server/apps.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from django.apps import AppConfig 4 | 5 | 6 | class WebServerConfig(AppConfig): 7 | name = 'web_server' 8 | 
-------------------------------------------------------------------------------- /newscrawler/worker.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | 4 | 5 | if __name__ == '__main__': 6 | while True: 7 | subprocess.call(['python', 'start_crawl.py']) 8 | time.sleep(30) 9 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pymongo==2.8.1 2 | scrapy==1.3.1 3 | redis 4 | fake-useragent 5 | django 6 | mongoengine 7 | jieba 8 | w3lib 9 | lxml 10 | twisted==16.4.1 11 | gevent==1.1.2 12 | greenlet==0.4.10 13 | gunicorn==19.6.0 14 | -------------------------------------------------------------------------------- /news_web/web_server/urls.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from django.conf.urls import url 3 | from views import ping, news, subscriptions 4 | 5 | urlpatterns = [ 6 | url(r'^ping$', ping), 7 | url(r'^news$', news), 8 | url(r'^subscriptions$', subscriptions), 9 | ] 10 | -------------------------------------------------------------------------------- /news_web/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "news_web.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /newscrawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = newscrawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = newscrawler 12 | -------------------------------------------------------------------------------- /news_web/lib/error_code.py: -------------------------------------------------------------------------------- 1 | 2 | err_code_message = { 3 | '10000': 'Unknown Error', 4 | '10001': 'Bad Request', 5 | '10005': 'Internal Error(Json Dumps Error)', 6 | '99999': 'Jsonschema Check Fail', 7 | } 8 | 9 | 10 | def get_message(err_code): 11 | return err_code_message.get(str(err_code), "Undefined Error Code") 12 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import redis 3 | from scrapy.conf import settings 4 | 5 | 6 | redis_conn = redis.Redis( 7 | host=settings['REDIS_CONF']['host'], 8 | port=settings['REDIS_CONF']['port'], 9 | db=settings['REDIS_CONF']['db'] 10 | ) 11 | redis_url_key = "url" 12 | redis_invalid_url_key = "invalid_url" 13 | 
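The two hash keys defined in newscrawler/utils.py drive crawl-level deduplication: RedisPipeline later records every stored article URL under "url", the QQ spider files malformed pages under "invalid_url", and RedisMiddleware drops any request whose URL already sits in either hash (all shown further down). A minimal sketch of that round trip, assuming a local Redis on the host/port/db configured in newscrawler/settings.py:

# Sketch only; mirrors how the middleware and pipelines below use the two hashes.
import redis

redis_conn = redis.Redis(host='localhost', port=6379, db=5)
redis_url_key = 'url'                  # URLs whose items reached MongoDB
redis_invalid_url_key = 'invalid_url'  # URLs rejected for missing fields


def already_crawled(url):
    # RedisMiddleware raises IgnoreRequest when this returns True.
    return bool(redis_conn.hexists(redis_url_key, url) or
                redis_conn.hexists(redis_invalid_url_key, url))


def mark_crawled(url):
    # RedisPipeline does this for every item it lets through.
    redis_conn.hset(redis_url_key, url, 0)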
-------------------------------------------------------------------------------- /news_web/lib/db_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import datetime 4 | import mongoengine 5 | 6 | 7 | def now(): 8 | return datetime.datetime.utcnow() 9 | 10 | 11 | def get_utc_seconds(dt): 12 | return int(time.mktime(dt.timetuple()) - time.timezone) 13 | 14 | 15 | def using_db(alias): 16 | mongoengine.register_connection( 17 | alias, alias, 18 | host="localhost", 19 | port=27017 20 | ) 21 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NewsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | content = scrapy.Field() 16 | source = scrapy.Field() 17 | published = scrapy.Field() 18 | url = scrapy.Field() 19 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/wechat_config.py: -------------------------------------------------------------------------------- 1 | 2 | appid = 'wx98236d864d8a122a' 3 | appsecret = '2530005d0440a7d3980ab6abca806357' 4 | token_url = 'https://api.weixin.qq.com/cgi-bin/token?' 5 | default_openid = 'oCfHOwD2_5ZpGctshTZPeZHqUIjc' 6 | maintainers = ['oCfHOwD2_5ZpGctshTZPeZHqUIjc'] 7 | expires_time = None 8 | 9 | template_id = 'B1ftH9IQr4Cuy_1M_94Je851RKGgQU-Uc0CH5ej4-oo' 10 | send_url = 'https://api.weixin.qq.com/cgi-bin/message/template/send?access_token=' 11 | template_url = 'http://182.254.225.214/article.html?newsId=' 12 | -------------------------------------------------------------------------------- /news_web/news_web/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for news_web project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | from gevent import monkey 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | 15 | monkey.patch_all() 16 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "news_web.settings") 17 | 18 | application = get_wsgi_application() 19 | -------------------------------------------------------------------------------- /newscrawler/start_crawl.py: -------------------------------------------------------------------------------- 1 | from twisted.internet import reactor, defer 2 | from scrapy.crawler import CrawlerRunner 3 | from scrapy.utils.project import get_project_settings 4 | from scrapy.utils.log import configure_logging 5 | from newscrawler.spiders.netease import NeteaseSpider 6 | from newscrawler.spiders.qq import QQSpider 7 | 8 | 9 | configure_logging() 10 | settings = get_project_settings() 11 | runner = CrawlerRunner(settings) 12 | 13 | 14 | @defer.inlineCallbacks 15 | def crawl(): 16 | yield runner.crawl(NeteaseSpider) 17 | yield runner.crawl(QQSpider) 18 | reactor.stop() 19 | 20 | 21 | if __name__ == '__main__': 22 | crawl() 23 | reactor.run() 24 | -------------------------------------------------------------------------------- /news_web/news_web/urls.py: -------------------------------------------------------------------------------- 1 | """news_web URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.9/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. 
Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url, include 17 | 18 | urlpatterns = [ 19 | url(r'api/', include('web_server.urls')), 20 | ] 21 | -------------------------------------------------------------------------------- /news_web/news_web/middlewares.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import urlparse 4 | from django.http import HttpResponse 5 | from lib import response 6 | 7 | 8 | class JsonMiddleware(object): 9 | def process_request(self, request): 10 | try: 11 | request.json = json.loads(request.body) 12 | except: 13 | request.json = {} 14 | 15 | 16 | class QuertStringMiddleware(object): 17 | def process_request(self, request): 18 | query_string = request.META.get("QUERY_STRING", "") 19 | # convert to json, flat it 20 | try: 21 | request.qs = {} 22 | for k, v in urlparse.parse_qs(query_string).items(): 23 | if len(v) == 1: 24 | request.qs[k] = v[0] 25 | else: 26 | request.qs[k] = v 27 | except: 28 | request.qs = {} 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Xiongbiao Huang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
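For reference, the QuertStringMiddleware shown above is what lets the views further down read request.qs as a flat dict; its flattening step behaves roughly like this standalone sketch (Python 2 urlparse, with a hypothetical query string):

# Standalone sketch of the query-string flattening in QuertStringMiddleware.
import urlparse

query_string = 'news_id=abc123&tags=tech&tags=ai'  # hypothetical example input
qs = {}
for k, v in urlparse.parse_qs(query_string).items():
    qs[k] = v[0] if len(v) == 1 else v
# qs -> {'news_id': 'abc123', 'tags': ['tech', 'ai']}; single values are unwrapped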
22 | -------------------------------------------------------------------------------- /news_web/lib/response.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from django.http import HttpResponse 3 | import json 4 | from error_code import get_message 5 | 6 | 7 | def JsonErrorResponse(code=10001, error_body=None): 8 | try: 9 | if error_body: 10 | data = json.dumps({ 11 | 'code': code, 12 | 'message': error_body 13 | }) 14 | else: 15 | data = '{"code": %s, "message": "%s"}' % (code, get_message(code)) 16 | except: 17 | return JsonErrorResponse(10005) 18 | response = HttpResponse( 19 | content=data, 20 | content_type='application/json; charset=utf-8', 21 | status=400 22 | ) 23 | return response 24 | 25 | 26 | def JsonResponse(json_data={}): 27 | try: 28 | if isinstance(json_data, str): 29 | data = '{"code": 0, "data": %s}' % json_data 30 | else: 31 | data = { 32 | "code": 0, 33 | "data": json_data 34 | } 35 | data = json.dumps(data) 36 | except: 37 | return JsonErrorResponse(10005) 38 | response = HttpResponse( 39 | content=data, 40 | content_type='application/json; charset=utf-8', 41 | status=200 42 | ) 43 | return response 44 | 45 | 46 | def not_match_func(request): 47 | return JsonErrorResponse(10002) 48 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/netease.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import time 5 | from datetime import datetime 6 | from scrapy.spiders import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.shell import inspect_response 9 | from newscrawler.items import NewsItem 10 | 11 | 12 | class NeteaseSpider(CrawlSpider): 13 | name = "netease" 14 | allowed_domains = ["163.com"] 15 | start_urls = ['http://tech.163.com/'] 16 | 17 | rules = ( 18 | Rule(LinkExtractor(allow=('/\d+/\d+/\d+/*', )), callback='parse_item'), 19 | ) 20 | 21 | def parse_item(self, response): 22 | # inspect_response(response, self) 23 | r = response 24 | title = r.xpath('/html/head/title/text()').extract()[0].strip() 25 | source = r.xpath("//a[@id='ne_article_source']/text()").extract()[0].strip() 26 | content = "".join(r.xpath("//div[@id='endText']/p/text()").extract()).strip() 27 | raw_time = r.xpath("//div[@class='post_time_source']/text()").extract()[0] 28 | re_result = re.findall("\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", raw_time) 29 | if re_result: 30 | ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M:%S')) 31 | else: 32 | ts = 0 33 | url = r.url 34 | new_news = NewsItem( 35 | title=title, 36 | content=content, 37 | source=source, 38 | published=ts, 39 | url=url 40 | ) 41 | return new_news 42 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/wechat_push.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import datetime 4 | import urllib 5 | import requests 6 | import wechat_config as config 7 | 8 | 9 | def update_token(func): 10 | def wrapper(*args, **kwargs): 11 | now = datetime.datetime.now() 12 | # 刷新token 13 | if not config.expires_time or now >= config.expires_time: 14 | data = { 15 | 'grant_type': 'client_credential', 16 | 'appid': config.appid, 17 | 'secret': config.appsecret 18 | } 19 | ret = requests.get(config.token_url + urllib.urlencode(data)) 20 | result = 
json.loads(ret.text) 21 | config.access_token = result['access_token'] 22 | config.expires_time = now + datetime.timedelta(hours=2) 23 | return func(*args, **kwargs) 24 | return wrapper 25 | 26 | 27 | @update_token 28 | def send_msg(title, data, object_id, openid=config.maintainers[0]): 29 | data = json.dumps({ 30 | 'touser': openid, 31 | 'template_id': config.template_id, 32 | 'url': config.template_url + object_id, # 点击打开的链接 33 | 'data': { 34 | 'title': { 35 | 'value': title, 36 | 'color': '#173177' 37 | }, 38 | 'data': { 39 | 'value': data, 40 | 'color': '#173177' 41 | }, 42 | } 43 | }).encode() 44 | url = config.send_url + config.access_token 45 | ret = requests.post(url, data) 46 | print 'send result: {}'.format(ret.text) 47 | 48 | 49 | if __name__ == '__main__': 50 | from IPython import embed 51 | embed() 52 | for maintainer in config.maintainers: 53 | send_msg('hello', 'start running~', maintainer) 54 | -------------------------------------------------------------------------------- /news_web/web_server/views.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from lib.response import JsonResponse, not_match_func 3 | from models import NewsItem, Subscription 4 | from news_web.settings import DEFAULT_WECHAT_OPENID 5 | 6 | 7 | def ping(request): 8 | handler_map = { 9 | "GET": ping_get 10 | } 11 | return handler_map.get(request.method, not_match_func)(request) 12 | 13 | 14 | def news(request): 15 | handler_map = { 16 | "GET": get_news 17 | } 18 | return handler_map.get(request.method, not_match_func)(request) 19 | 20 | 21 | def subscriptions(request): 22 | handler_map = { 23 | "GET": get_subscriptions, 24 | "POST": add_subscriptions, 25 | "DELETE": remove_subscriptions 26 | } 27 | return handler_map.get(request.method, not_match_func)(request) 28 | 29 | 30 | def ping_get(request): 31 | return JsonResponse( 32 | { 33 | 'msg': 'pong' 34 | } 35 | ) 36 | 37 | 38 | def get_news(request): 39 | news_id = request.qs['news_id'] 40 | return JsonResponse( 41 | { 42 | 'news': NewsItem.objects.get(id=news_id).to_json() 43 | } 44 | ) 45 | 46 | 47 | def get_subscriptions(request): 48 | subscription = Subscription.ensure_subscription(DEFAULT_WECHAT_OPENID) 49 | return JsonResponse( 50 | { 51 | 'subscription': subscription.to_json() 52 | } 53 | ) 54 | 55 | 56 | def add_subscriptions(request): 57 | keyword = request.json.get('keyword') 58 | if keyword: 59 | Subscription.add_keyword(DEFAULT_WECHAT_OPENID, keyword) 60 | return JsonResponse({}) 61 | 62 | 63 | def remove_subscriptions(request): 64 | keyword = request.json.get('keyword') 65 | if keyword: 66 | Subscription.remove_keyword(DEFAULT_WECHAT_OPENID, keyword) 67 | return JsonResponse({}) 68 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/qq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import time 5 | from datetime import datetime 6 | from scrapy.spiders import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.shell import inspect_response 9 | from newscrawler.utils import redis_conn, redis_invalid_url_key 10 | from newscrawler.items import NewsItem 11 | 12 | 13 | class QQSpider(CrawlSpider): 14 | name = "qq" 15 | allowed_domains = ["tech.qq.com"] 16 | start_urls = ['http://tech.qq.com/'] 17 | 18 | rules = ( 19 | Rule(LinkExtractor(allow=('http://tech.qq.com/a/\d+/*', )), 
callback='parse_item'), 20 | ) 21 | 22 | def parse_item(self, response): 23 | r = response 24 | # inspect_response(response, self) 25 | 26 | title = r.xpath("//div[@class='qq_article']//h1/text()").extract() 27 | source = r.xpath("//div[@class='qq_article']//span[@class='a_source']/text()").extract() 28 | if title: 29 | title = title[0] 30 | if source: 31 | source = source[0] 32 | # 要求格式正确 33 | if not title or not source: 34 | redis_conn.hset(redis_invalid_url_key, response.url, 0) 35 | return 36 | content = ''.join(r.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract()) 37 | raw_time = r.xpath("//div[@class='qq_article']//span[@class='a_time']/text()").extract()[0] 38 | re_result = re.findall("\d{4}-\d{2}-\d{2} \d{2}:\d{2}", raw_time) 39 | if re_result: 40 | ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M')) 41 | else: 42 | ts = 0 43 | url = r.url 44 | new_news = NewsItem( 45 | title=title, 46 | content=content, 47 | source=source, 48 | published=ts, 49 | url=url 50 | ) 51 | return new_news 52 | -------------------------------------------------------------------------------- /news_web/web_server/models.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | from lib import db_utils 4 | import mongoengine as m 5 | 6 | db_utils.using_db("news") 7 | 8 | 9 | class NewsItem(m.Document): 10 | title = m.StringField() 11 | content = m.StringField() 12 | source = m.StringField() 13 | published = m.DateTimeField() 14 | url = m.StringField() 15 | 16 | meta = { 17 | 'db_alias': 'news', 18 | 'collection': 'news', 19 | } 20 | 21 | def to_json(self): 22 | return { 23 | 'title': self.title, 24 | 'content': self.content, 25 | 'source': self.source, 26 | 'published': self.published, 27 | 'url': self.url, 28 | } 29 | 30 | 31 | class Subscription(m.Document): 32 | open_id = m.StringField() 33 | keywords = m.ListField(m.StringField()) # 关键词 34 | tags = m.ListField(m.StringField()) # 标签 35 | 36 | meta = { 37 | 'db_alias': 'news', 38 | 'collection': 'subscription', 39 | } 40 | 41 | @staticmethod 42 | def ensure_subscription(open_id): 43 | subscription = Subscription.objects(open_id=open_id).first() 44 | if not subscription: 45 | subscription = Subscription(open_id=open_id) 46 | subscription.save() 47 | return subscription 48 | 49 | @staticmethod 50 | def add_keyword(open_id, keyword): 51 | subscription = Subscription.ensure_subscription(open_id) 52 | subscription.update(add_to_set__keywords=keyword) 53 | 54 | @staticmethod 55 | def remove_keyword(open_id, keyword): 56 | subscription = Subscription.ensure_subscription(open_id) 57 | subscription.update(pull__keywords=keyword) 58 | 59 | def to_json(self): 60 | return { 61 | 'open_id': self.open_id, 62 | 'keywords': self.keywords, 63 | 'tags': self.tags 64 | } 65 | -------------------------------------------------------------------------------- /news_web/frontend/article.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 文章内容 7 | 8 | 9 | 10 | 11 |
12-21 | [markup not preserved; visible placeholders: 标题 (title), 时间来源 (time / source), 正文 (article body)]
22 | 查看原文 23 | 订阅更多 24 | 25 | 26 | 27 | 46 | 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # DB 27 | *.sqlite3 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | 94 | ## Build generated 95 | build/ 96 | DerivedData/ 97 | 98 | ## Various settings 99 | *.pbxuser 100 | !default.pbxuser 101 | *.mode1v3 102 | !default.mode1v3 103 | *.mode2v3 104 | !default.mode2v3 105 | *.perspectivev3 106 | !default.perspectivev3 107 | xcuserdata/ 108 | 109 | ## Other 110 | *.moved-aside 111 | *.xcuserstate 112 | #*.xcworkspacedata 113 | 114 | ## Obj-C/Swift specific 115 | *.hmap 116 | *.ipa 117 | 118 | # CocoaPods 119 | # 120 | # We recommend against adding the Pods directory to your .gitignore. However 121 | # you should judge for yourself, the pros and cons are mentioned at: 122 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control 123 | # 124 | #Pods/ 125 | 126 | # Carthage 127 | # 128 | # Add this line if you want to avoid checking in source code from Carthage dependencies. 
129 | # Carthage/Checkouts 130 | 131 | Carthage/Build 132 | Vary-iOS.zip 133 | Web/list-urls.html 134 | 135 | *.DS_Store 136 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | import logging 9 | from utils import redis_conn, redis_url_key 10 | from scrapy.conf import settings 11 | from scrapy.exceptions import DropItem 12 | from wechat_push import send_msg 13 | from wechat_config import default_openid 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class MongoDBPipeline(object): 18 | 19 | def __init__(self): 20 | conn = pymongo.Connection( 21 | settings['MONGO_CONF']['host'], 22 | settings['MONGO_CONF']['port'] 23 | ) 24 | db = conn[settings['MONGO_CONF']['db']] 25 | self.news_collection = db[settings['MONGO_CONF']['collection']] 26 | 27 | def process_item(self, item, spider): 28 | valid = True 29 | for data in item: 30 | if not data: 31 | valid = False 32 | raise DropItem("Missing {0}!".format(data)) 33 | if valid: 34 | object_id = self.news_collection.insert(dict(item)) 35 | spider.object_id = str(object_id) 36 | logger.info("Question added to MongoDB database!") 37 | return item 38 | 39 | 40 | class RedisPipeline(object): 41 | 42 | def process_item(self, item, spider): 43 | redis_conn.hset(redis_url_key, item['url'], 0) 44 | return item 45 | 46 | 47 | class PushPipeline(object): 48 | 49 | def __init__(self): 50 | conn = pymongo.Connection( 51 | settings['MONGO_CONF']['host'], 52 | settings['MONGO_CONF']['port'] 53 | ) 54 | db = conn[settings['MONGO_CONF']['db']] 55 | self.subscription_collection = db[settings['MONGO_CONF']['subscription_collection']] 56 | 57 | def process_item(self, item, spider): 58 | subscription = self.subscription_collection.find_one( 59 | { 60 | 'open_id': default_openid 61 | } 62 | ) 63 | keywords = subscription.get('keywords', []) 64 | # 判断关键词 65 | keyword_in_title = any([keyword in item['title'] for keyword in keywords]) 66 | keyword_in_content = any([keyword in item['content'] for keyword in keywords]) 67 | if keyword_in_title or keyword_in_content: 68 | send_msg( 69 | title=item['title'], 70 | data=item['content'], 71 | object_id=spider.object_id, 72 | openid=default_openid 73 | ) 74 | return item 75 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import logging 8 | import random 9 | from utils import redis_conn, redis_url_key, redis_invalid_url_key 10 | from scrapy import signals 11 | from scrapy.conf import settings 12 | from scrapy.exceptions import IgnoreRequest 13 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 14 | from fake_useragent import UserAgent 15 | from settings import USER_ANGENT_LIST 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class RedisMiddleware(object): 20 | """ 21 | 用于去重 22 | """ 23 | 24 | def process_request(self, request, spider): 25 | if request.url not in 
spider.start_urls and (redis_conn.hexists(redis_url_key, request.url) or redis_conn.hexists(redis_invalid_url_key, request.url)): 26 | logger.info("Skip URL: %s, has been crawled" % request.url) 27 | raise IgnoreRequest("URL %s has been crawled" % request.url) 28 | 29 | 30 | class RotateUserAgentMiddleware(UserAgentMiddleware): 31 | 32 | def __init__(self, user_agent=''): 33 | self.user_agent = user_agent 34 | self.USER_ANGENT_LIST = USER_ANGENT_LIST 35 | 36 | def process_request(self, request, spider): 37 | random_ua = random.choice(self.USER_ANGENT_LIST) 38 | self.user_agent = random_ua 39 | request.headers.setdefault('User-Agent', random_ua) 40 | 41 | 42 | class NewscrawlerSpiderMiddleware(object): 43 | # Not all methods need to be defined. If a method is not defined, 44 | # scrapy acts as if the spider middleware does not modify the 45 | # passed objects. 46 | 47 | @classmethod 48 | def from_crawler(cls, crawler): 49 | # This method is used by Scrapy to create your spiders. 50 | s = cls() 51 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 52 | return s 53 | 54 | def process_spider_input(response, spider): 55 | # Called for each response that goes through the spider 56 | # middleware and into the spider. 57 | 58 | # Should return None or raise an exception. 59 | return None 60 | 61 | def process_spider_output(response, result, spider): 62 | # Called with the results returned from the Spider, after 63 | # it has processed the response. 64 | 65 | # Must return an iterable of Request, dict or Item objects. 66 | for i in result: 67 | yield i 68 | 69 | def process_spider_exception(response, exception, spider): 70 | # Called when a spider or process_spider_input() method 71 | # (from other spider middleware) raises an exception. 72 | 73 | # Should return either None or an iterable of Response, dict 74 | # or Item objects. 75 | pass 76 | 77 | def process_start_requests(start_requests, spider): 78 | # Called with the start requests of the spider, and works 79 | # similarly to the process_spider_output() method, except 80 | # that it doesn’t have a response associated. 81 | 82 | # Must return only requests (not items). 83 | for r in start_requests: 84 | yield r 85 | 86 | def spider_opened(self, spider): 87 | spider.logger.info('Spider opened: %s' % spider.name) 88 | -------------------------------------------------------------------------------- /news_web/news_web/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for news_web project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.9.7. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.9/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '&q2kufvz51ooh&ou_41tqv=w^3k9dx!o@(^+s$zp4=2&lm2@w1' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.contenttypes', 35 | 'django.contrib.staticfiles', 36 | 'lib', 37 | 'web_server', 38 | ] 39 | 40 | MIDDLEWARE_CLASSES = [ 41 | 'news_web.middlewares.JsonMiddleware', 42 | 'news_web.middlewares.QuertStringMiddleware', 43 | ] 44 | 45 | ROOT_URLCONF = 'news_web.urls' 46 | 47 | TEMPLATES = [ 48 | { 49 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 50 | 'DIRS': [], 51 | 'APP_DIRS': True, 52 | 'OPTIONS': { 53 | 'context_processors': [ 54 | 'django.template.context_processors.debug', 55 | 'django.template.context_processors.request', 56 | 'django.contrib.auth.context_processors.auth', 57 | 'django.contrib.messages.context_processors.messages', 58 | ], 59 | }, 60 | }, 61 | ] 62 | 63 | WSGI_APPLICATION = 'news_web.wsgi.application' 64 | 65 | 66 | # Database 67 | # https://docs.djangoproject.com/en/1.9/ref/settings/#databases 68 | 69 | DATABASES = { 70 | 'default': { 71 | }, 72 | 'redis': { 73 | 'HOST': 'localhost', 74 | 'PORT': 6379, 75 | }, 76 | 'mongodb': { 77 | 'HOST': 'localhost', 78 | 'PORT': 27017, 79 | } 80 | } 81 | 82 | 83 | # Password validation 84 | # https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators 85 | 86 | AUTH_PASSWORD_VALIDATORS = [ 87 | { 88 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 89 | }, 90 | { 91 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 92 | }, 93 | { 94 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 95 | }, 96 | { 97 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 98 | }, 99 | ] 100 | 101 | 102 | # Internationalization 103 | # https://docs.djangoproject.com/en/1.9/topics/i18n/ 104 | 105 | LANGUAGE_CODE = 'en-us' 106 | 107 | TIME_ZONE = 'UTC' 108 | 109 | USE_I18N = True 110 | 111 | USE_L10N = True 112 | 113 | USE_TZ = True 114 | 115 | 116 | # Static files (CSS, JavaScript, Images) 117 | # https://docs.djangoproject.com/en/1.9/howto/static-files/ 118 | 119 | STATIC_URL = '/static/' 120 | 121 | 122 | DEFAULT_WECHAT_OPENID = 'oCfHOwD2_5ZpGctshTZPeZHqUIjc' 123 | -------------------------------------------------------------------------------- /news_web/frontend/subscription.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 订阅 7 | 8 | 9 | 10 | 11 | 12 |

13-31 | [markup not preserved; visible text: 订阅 (subscription header), 已订阅关键词 (subscribed keywords section), 订阅关键词 (keyword-to-subscribe input section)]
32 | 添加订阅关键词 33 | 返回 34 | 35 | 36 | 37 | 38 | 116 | 117 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for newscrawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'newscrawler' 13 | 14 | SPIDER_MODULES = ['newscrawler.spiders'] 15 | NEWSPIDER_MODULE = 'newscrawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'newscrawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'newscrawler.middlewares.NewscrawlerSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 57 | 'newscrawler.middlewares.RedisMiddleware': 300, 58 | 'newscrawler.middlewares.RotateUserAgentMiddleware': 400, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # 来源 fake-useragent 68 | USER_ANGENT_LIST = [ 69 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 70 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36', 71 | u'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36', 72 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36', 73 | u'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36', 74 | u'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36', 75 | u'Mozilla/5.0 (Windows NT 6.3; WOW64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36', 76 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36', 77 | u'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36', 78 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36', 79 | u'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36', 80 | u'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36', 81 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36', 82 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36', 83 | u'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36', 84 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36', 85 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36', 86 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36', 87 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36', 88 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36', 89 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36', 90 | u'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F', 91 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10', 92 | u'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36', 93 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36', 94 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36', 95 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36', 96 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36', 97 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36', 98 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36', 99 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36', 100 | u'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36', 101 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36', 102 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36', 103 | u'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36', 104 | u'Mozilla/5.0 (Windows NT 6.2) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36', 105 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36', 106 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 107 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 108 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 109 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 110 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 111 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 112 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36', 113 | u'Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36', 114 | u'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36', 115 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17', 116 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17', 117 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15', 118 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14', 119 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1', 120 | u'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0', 121 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0', 122 | u'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0', 123 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0', 124 | u'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0', 125 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0', 126 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0', 127 | u'Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0', 128 | u'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0', 129 | u'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3', 130 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0', 131 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0', 132 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0', 133 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0', 134 | u'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0', 135 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0', 136 | u'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0', 137 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0', 138 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0', 139 | u'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0', 140 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) 
Gecko/20130328 Firefox/22.0', 141 | u'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0', 142 | u'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0', 143 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1', 144 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1', 145 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0', 146 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0', 147 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0', 148 | u'Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0', 149 | u'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0', 150 | u'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0', 151 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0', 152 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0', 153 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0', 154 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0', 155 | u'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0', 156 | u'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0', 157 | u'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0', 158 | u'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0', 159 | u'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0', 160 | u'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0', 161 | u'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0', 162 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0', 163 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0', 164 | u'Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0', 165 | u'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0', 166 | u'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1', 167 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0', 168 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6' 169 | ] 170 | 171 | # Configure item pipelines 172 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 173 | ITEM_PIPELINES = { 174 | 'newscrawler.pipelines.MongoDBPipeline': 300, 175 | 'newscrawler.pipelines.RedisPipeline': 400, 176 | 'newscrawler.pipelines.PushPipeline': 500, 177 | } 178 | 179 | # Enable and configure the AutoThrottle extension (disabled by default) 180 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 181 | #AUTOTHROTTLE_ENABLED = True 182 | # The initial download delay 183 | #AUTOTHROTTLE_START_DELAY = 5 184 | # The maximum download delay to be set in case of high latencies 185 | #AUTOTHROTTLE_MAX_DELAY = 60 186 | # The average number of requests Scrapy should be sending in parallel to 187 | # each remote server 188 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 189 | # Enable showing throttling stats for every response received: 190 | #AUTOTHROTTLE_DEBUG = False 191 | 192 | # Enable and configure HTTP caching (disabled by default) 193 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 194 | #HTTPCACHE_ENABLED = True 195 | #HTTPCACHE_EXPIRATION_SECS = 0 196 | #HTTPCACHE_DIR = 'httpcache' 197 | 
#HTTPCACHE_IGNORE_HTTP_CODES = [] 198 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 199 | 200 | # MONGODB 配置 201 | MONGO_CONF = { 202 | 'host': 'localhost', 203 | 'port': 27017, 204 | 'db': 'news', 205 | 'collection': 'news', 206 | 'subscription_collection': 'subscription' 207 | } 208 | 209 | 210 | # REDIS 配置 211 | REDIS_CONF = { 212 | 'host': 'localhost', 213 | 'port': 6379, 214 | 'db': 5 215 | } 216 | --------------------------------------------------------------------------------
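A closing note on MONGO_CONF: MongoDBPipeline and PushPipeline open it with pymongo.Connection, a class that only exists in the pymongo 2.x line pinned in requirements.txt (pymongo==2.8.1). If the dependency were ever bumped to pymongo 3.x or later, the equivalent hookup would look roughly like this sketch, which is not part of the repository:

# Hypothetical pymongo 3.x+ variant of the pipelines' connection setup;
# the project itself pins pymongo==2.8.1 and uses pymongo.Connection.
import pymongo

MONGO_CONF = {'host': 'localhost', 'port': 27017, 'db': 'news', 'collection': 'news'}

client = pymongo.MongoClient(MONGO_CONF['host'], MONGO_CONF['port'])
news_collection = client[MONGO_CONF['db']][MONGO_CONF['collection']]
result = news_collection.insert_one({'title': u'demo', 'content': u'...'})
object_id = result.inserted_id  # ObjectId of the stored document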