├── news_web ├── lib │ ├── utils.py │ ├── __init__.py │ ├── error_code.py │ ├── db_utils.py │ └── response.py ├── news_web │ ├── __init__.py │ ├── wsgi.py │ ├── urls.py │ ├── middlewares.py │ └── settings.py ├── web_server │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ ├── admin.py │ ├── tests.py │ ├── apps.py │ ├── urls.py │ ├── views.py │ └── models.py ├── run_server.sh ├── init_db.py ├── manage.py └── frontend │ ├── article.html │ └── subscription.html ├── newscrawler ├── newscrawler │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── netease.py │ │ └── qq.py │ ├── utils.py │ ├── items.py │ ├── wechat_config.py │ ├── wechat_push.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py ├── worker.py ├── scrapy.cfg └── start_crawl.py ├── README.md ├── 论文相关文件 ├── 用例图.png ├── 爬虫部分目录.png ├── 系统总体框架.png ├── MongoDB.png ├── nginx配置.png ├── scrapy架构.png ├── spider实现.png ├── 启动API服务器.png ├── 启动spider.png ├── 新闻推送活动图.png ├── 新闻订阅活动图.png ├── WechatIMG37.png ├── WechatIMG38.png ├── WechatIMG39.png ├── WechatIMG40.png ├── useragent.png ├── 订阅与展示部分目录.png ├── WechatIMG41.jpeg ├── WechatIMG42.jpeg ├── WechatIMG43.jpeg ├── WechatIMG44.jpeg └── 基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf ├── .gitmodules ├── requirements.txt ├── LICENSE └── .gitignore /news_web/lib/utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/news_web/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/web_server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/web_server/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NewsCrawler 2 | 毕业设计 基于网络爬虫的新闻采集和订阅系统的设计与实现 3 | -------------------------------------------------------------------------------- /论文相关文件/用例图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/用例图.png -------------------------------------------------------------------------------- /论文相关文件/爬虫部分目录.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/爬虫部分目录.png -------------------------------------------------------------------------------- /论文相关文件/系统总体框架.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/系统总体框架.png -------------------------------------------------------------------------------- /news_web/web_server/admin.py: 
-------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /news_web/web_server/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /论文相关文件/MongoDB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/MongoDB.png -------------------------------------------------------------------------------- /论文相关文件/nginx配置.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/nginx配置.png -------------------------------------------------------------------------------- /论文相关文件/scrapy架构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/scrapy架构.png -------------------------------------------------------------------------------- /论文相关文件/spider实现.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/spider实现.png -------------------------------------------------------------------------------- /论文相关文件/启动API服务器.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/启动API服务器.png -------------------------------------------------------------------------------- /论文相关文件/启动spider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/启动spider.png -------------------------------------------------------------------------------- /论文相关文件/新闻推送活动图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/新闻推送活动图.png -------------------------------------------------------------------------------- /论文相关文件/新闻订阅活动图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/新闻订阅活动图.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG37.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG38.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG39.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG40.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG40.png -------------------------------------------------------------------------------- /论文相关文件/useragent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/useragent.png -------------------------------------------------------------------------------- /论文相关文件/订阅与展示部分目录.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/订阅与展示部分目录.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG41.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG41.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG42.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG42.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG43.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG43.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG44.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG44.jpeg -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "IPProxyTool"] 2 | path = IPProxyTool 3 | url = https://github.com/awolfly9/IPProxyTool.git 4 | -------------------------------------------------------------------------------- /论文相关文件/基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf -------------------------------------------------------------------------------- /news_web/run_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "启动服务器" 3 | #python manage.py runserver 0.0.0.0:8000 4 | gunicorn news_web.wsgi:application -w 4 -b :8000 -k gevent --max-requests 1000 5 | -------------------------------------------------------------------------------- /news_web/init_db.py: -------------------------------------------------------------------------------- 1 | from web_server.models import NewsItem, Subscription 2 | 3 | 4 | if __name__ == '__main__': 5 | NewsItem.ensure_indexes() 6 | Subscription.ensure_indexes() 7 | -------------------------------------------------------------------------------- /news_web/web_server/apps.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from django.apps import AppConfig 4 | 5 | 6 | class WebServerConfig(AppConfig): 7 | name = 'web_server' 8 | 
-------------------------------------------------------------------------------- /newscrawler/worker.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | 4 | 5 | if __name__ == '__main__': 6 | while True: 7 | subprocess.call(['python', 'start_crawl.py']) 8 | time.sleep(30) 9 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pymongo==2.8.1 2 | scrapy==1.3.1 3 | redis 4 | fake-useragent 5 | django 6 | mongoengine 7 | jieba 8 | w3lib 9 | lxml 10 | twisted==16.4.1 11 | gevent==1.1.2 12 | greenlet==0.4.10 13 | gunicorn==19.6.0 14 | -------------------------------------------------------------------------------- /news_web/web_server/urls.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from django.conf.urls import url 3 | from views import ping, news, subscriptions 4 | 5 | urlpatterns = [ 6 | url(r'^ping$', ping), 7 | url(r'^news$', news), 8 | url(r'^subscriptions$', subscriptions), 9 | ] 10 | -------------------------------------------------------------------------------- /news_web/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "news_web.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /newscrawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = newscrawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = newscrawler 12 | -------------------------------------------------------------------------------- /news_web/lib/error_code.py: -------------------------------------------------------------------------------- 1 | 2 | err_code_message = { 3 | '10000': 'Unknown Error', 4 | '10001': 'Bad Request', 5 | '10005': 'Internal Error(Json Dumps Error)', 6 | '99999': 'Jsonschema Check Fail', 7 | } 8 | 9 | 10 | def get_message(err_code): 11 | return err_code_message.get(str(err_code), "Undefined Error Code") 12 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import redis 3 | from scrapy.conf import settings 4 | 5 | 6 | redis_conn = redis.Redis( 7 | host=settings['REDIS_CONF']['host'], 8 | port=settings['REDIS_CONF']['port'], 9 | db=settings['REDIS_CONF']['db'] 10 | ) 11 | redis_url_key = "url" 12 | redis_invalid_url_key = "invalid_url" 13 | 
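The two hash keys defined in newscrawler/utils.py drive crawl-level deduplication: RedisPipeline later records every stored article URL under "url", the QQ spider files malformed pages under "invalid_url", and RedisMiddleware drops any request whose URL already sits in either hash (all shown further down). A minimal sketch of that round trip, assuming a local Redis on the host/port/db configured in newscrawler/settings.py:

# Sketch only; mirrors how the middleware and pipelines below use the two hashes.
import redis

redis_conn = redis.Redis(host='localhost', port=6379, db=5)
redis_url_key = 'url'                  # URLs whose items reached MongoDB
redis_invalid_url_key = 'invalid_url'  # URLs rejected for missing fields


def already_crawled(url):
    # RedisMiddleware raises IgnoreRequest when this returns True.
    return bool(redis_conn.hexists(redis_url_key, url) or
                redis_conn.hexists(redis_invalid_url_key, url))


def mark_crawled(url):
    # RedisPipeline does this for every item it lets through.
    redis_conn.hset(redis_url_key, url, 0)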
-------------------------------------------------------------------------------- /news_web/lib/db_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import datetime 4 | import mongoengine 5 | 6 | 7 | def now(): 8 | return datetime.datetime.utcnow() 9 | 10 | 11 | def get_utc_seconds(dt): 12 | return int(time.mktime(dt.timetuple()) - time.timezone) 13 | 14 | 15 | def using_db(alias): 16 | mongoengine.register_connection( 17 | alias, alias, 18 | host="localhost", 19 | port=27017 20 | ) 21 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NewsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | content = scrapy.Field() 16 | source = scrapy.Field() 17 | published = scrapy.Field() 18 | url = scrapy.Field() 19 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/wechat_config.py: -------------------------------------------------------------------------------- 1 | 2 | appid = 'wx98236d864d8a122a' 3 | appsecret = '2530005d0440a7d3980ab6abca806357' 4 | token_url = 'https://api.weixin.qq.com/cgi-bin/token?' 5 | default_openid = 'oCfHOwD2_5ZpGctshTZPeZHqUIjc' 6 | maintainers = ['oCfHOwD2_5ZpGctshTZPeZHqUIjc'] 7 | expires_time = None 8 | 9 | template_id = 'B1ftH9IQr4Cuy_1M_94Je851RKGgQU-Uc0CH5ej4-oo' 10 | send_url = 'https://api.weixin.qq.com/cgi-bin/message/template/send?access_token=' 11 | template_url = 'http://182.254.225.214/article.html?newsId=' 12 | -------------------------------------------------------------------------------- /news_web/news_web/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for news_web project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | from gevent import monkey 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | 15 | monkey.patch_all() 16 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "news_web.settings") 17 | 18 | application = get_wsgi_application() 19 | -------------------------------------------------------------------------------- /newscrawler/start_crawl.py: -------------------------------------------------------------------------------- 1 | from twisted.internet import reactor, defer 2 | from scrapy.crawler import CrawlerRunner 3 | from scrapy.utils.project import get_project_settings 4 | from scrapy.utils.log import configure_logging 5 | from newscrawler.spiders.netease import NeteaseSpider 6 | from newscrawler.spiders.qq import QQSpider 7 | 8 | 9 | configure_logging() 10 | settings = get_project_settings() 11 | runner = CrawlerRunner(settings) 12 | 13 | 14 | @defer.inlineCallbacks 15 | def crawl(): 16 | yield runner.crawl(NeteaseSpider) 17 | yield runner.crawl(QQSpider) 18 | reactor.stop() 19 | 20 | 21 | if __name__ == '__main__': 22 | crawl() 23 | reactor.run() 24 | -------------------------------------------------------------------------------- /news_web/news_web/urls.py: -------------------------------------------------------------------------------- 1 | """news_web URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.9/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. 
Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url, include 17 | 18 | urlpatterns = [ 19 | url(r'api/', include('web_server.urls')), 20 | ] 21 | -------------------------------------------------------------------------------- /news_web/news_web/middlewares.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import urlparse 4 | from django.http import HttpResponse 5 | from lib import response 6 | 7 | 8 | class JsonMiddleware(object): 9 | def process_request(self, request): 10 | try: 11 | request.json = json.loads(request.body) 12 | except: 13 | request.json = {} 14 | 15 | 16 | class QuertStringMiddleware(object): 17 | def process_request(self, request): 18 | query_string = request.META.get("QUERY_STRING", "") 19 | # convert to json, flat it 20 | try: 21 | request.qs = {} 22 | for k, v in urlparse.parse_qs(query_string).items(): 23 | if len(v) == 1: 24 | request.qs[k] = v[0] 25 | else: 26 | request.qs[k] = v 27 | except: 28 | request.qs = {} 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Xiongbiao Huang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
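For reference, the QuertStringMiddleware shown above is what lets the views further down read request.qs as a flat dict; its flattening step behaves roughly like this standalone sketch (Python 2 urlparse, with a hypothetical query string):

# Standalone sketch of the query-string flattening in QuertStringMiddleware.
import urlparse

query_string = 'news_id=abc123&tags=tech&tags=ai'  # hypothetical example input
qs = {}
for k, v in urlparse.parse_qs(query_string).items():
    qs[k] = v[0] if len(v) == 1 else v
# qs -> {'news_id': 'abc123', 'tags': ['tech', 'ai']}; single values are unwrapped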
22 | -------------------------------------------------------------------------------- /news_web/lib/response.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from django.http import HttpResponse 3 | import json 4 | from error_code import get_message 5 | 6 | 7 | def JsonErrorResponse(code=10001, error_body=None): 8 | try: 9 | if error_body: 10 | data = json.dumps({ 11 | 'code': code, 12 | 'message': error_body 13 | }) 14 | else: 15 | data = '{"code": %s, "message": "%s"}' % (code, get_message(code)) 16 | except: 17 | return JsonErrorResponse(10005) 18 | response = HttpResponse( 19 | content=data, 20 | content_type='application/json; charset=utf-8', 21 | status=400 22 | ) 23 | return response 24 | 25 | 26 | def JsonResponse(json_data={}): 27 | try: 28 | if isinstance(json_data, str): 29 | data = '{"code": 0, "data": %s}' % json_data 30 | else: 31 | data = { 32 | "code": 0, 33 | "data": json_data 34 | } 35 | data = json.dumps(data) 36 | except: 37 | return JsonErrorResponse(10005) 38 | response = HttpResponse( 39 | content=data, 40 | content_type='application/json; charset=utf-8', 41 | status=200 42 | ) 43 | return response 44 | 45 | 46 | def not_match_func(request): 47 | return JsonErrorResponse(10002) 48 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/netease.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import time 5 | from datetime import datetime 6 | from scrapy.spiders import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.shell import inspect_response 9 | from newscrawler.items import NewsItem 10 | 11 | 12 | class NeteaseSpider(CrawlSpider): 13 | name = "netease" 14 | allowed_domains = ["163.com"] 15 | start_urls = ['http://tech.163.com/'] 16 | 17 | rules = ( 18 | Rule(LinkExtractor(allow=('/\d+/\d+/\d+/*', )), callback='parse_item'), 19 | ) 20 | 21 | def parse_item(self, response): 22 | # inspect_response(response, self) 23 | r = response 24 | title = r.xpath('/html/head/title/text()').extract()[0].strip() 25 | source = r.xpath("//a[@id='ne_article_source']/text()").extract()[0].strip() 26 | content = "".join(r.xpath("//div[@id='endText']/p/text()").extract()).strip() 27 | raw_time = r.xpath("//div[@class='post_time_source']/text()").extract()[0] 28 | re_result = re.findall("\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", raw_time) 29 | if re_result: 30 | ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M:%S')) 31 | else: 32 | ts = 0 33 | url = r.url 34 | new_news = NewsItem( 35 | title=title, 36 | content=content, 37 | source=source, 38 | published=ts, 39 | url=url 40 | ) 41 | return new_news 42 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/wechat_push.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import datetime 4 | import urllib 5 | import requests 6 | import wechat_config as config 7 | 8 | 9 | def update_token(func): 10 | def wrapper(*args, **kwargs): 11 | now = datetime.datetime.now() 12 | # 刷新token 13 | if not config.expires_time or now >= config.expires_time: 14 | data = { 15 | 'grant_type': 'client_credential', 16 | 'appid': config.appid, 17 | 'secret': config.appsecret 18 | } 19 | ret = requests.get(config.token_url + urllib.urlencode(data)) 20 | result = 
json.loads(ret.text) 21 | config.access_token = result['access_token'] 22 | config.expires_time = now + datetime.timedelta(hours=2) 23 | return func(*args, **kwargs) 24 | return wrapper 25 | 26 | 27 | @update_token 28 | def send_msg(title, data, object_id, openid=config.maintainers[0]): 29 | data = json.dumps({ 30 | 'touser': openid, 31 | 'template_id': config.template_id, 32 | 'url': config.template_url + object_id, # 点击打开的链接 33 | 'data': { 34 | 'title': { 35 | 'value': title, 36 | 'color': '#173177' 37 | }, 38 | 'data': { 39 | 'value': data, 40 | 'color': '#173177' 41 | }, 42 | } 43 | }).encode() 44 | url = config.send_url + config.access_token 45 | ret = requests.post(url, data) 46 | print 'send result: {}'.format(ret.text) 47 | 48 | 49 | if __name__ == '__main__': 50 | from IPython import embed 51 | embed() 52 | for maintainer in config.maintainers: 53 | send_msg('hello', 'start running~', maintainer) 54 | -------------------------------------------------------------------------------- /news_web/web_server/views.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from lib.response import JsonResponse, not_match_func 3 | from models import NewsItem, Subscription 4 | from news_web.settings import DEFAULT_WECHAT_OPENID 5 | 6 | 7 | def ping(request): 8 | handler_map = { 9 | "GET": ping_get 10 | } 11 | return handler_map.get(request.method, not_match_func)(request) 12 | 13 | 14 | def news(request): 15 | handler_map = { 16 | "GET": get_news 17 | } 18 | return handler_map.get(request.method, not_match_func)(request) 19 | 20 | 21 | def subscriptions(request): 22 | handler_map = { 23 | "GET": get_subscriptions, 24 | "POST": add_subscriptions, 25 | "DELETE": remove_subscriptions 26 | } 27 | return handler_map.get(request.method, not_match_func)(request) 28 | 29 | 30 | def ping_get(request): 31 | return JsonResponse( 32 | { 33 | 'msg': 'pong' 34 | } 35 | ) 36 | 37 | 38 | def get_news(request): 39 | news_id = request.qs['news_id'] 40 | return JsonResponse( 41 | { 42 | 'news': NewsItem.objects.get(id=news_id).to_json() 43 | } 44 | ) 45 | 46 | 47 | def get_subscriptions(request): 48 | subscription = Subscription.ensure_subscription(DEFAULT_WECHAT_OPENID) 49 | return JsonResponse( 50 | { 51 | 'subscription': subscription.to_json() 52 | } 53 | ) 54 | 55 | 56 | def add_subscriptions(request): 57 | keyword = request.json.get('keyword') 58 | if keyword: 59 | Subscription.add_keyword(DEFAULT_WECHAT_OPENID, keyword) 60 | return JsonResponse({}) 61 | 62 | 63 | def remove_subscriptions(request): 64 | keyword = request.json.get('keyword') 65 | if keyword: 66 | Subscription.remove_keyword(DEFAULT_WECHAT_OPENID, keyword) 67 | return JsonResponse({}) 68 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/qq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import time 5 | from datetime import datetime 6 | from scrapy.spiders import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.shell import inspect_response 9 | from newscrawler.utils import redis_conn, redis_invalid_url_key 10 | from newscrawler.items import NewsItem 11 | 12 | 13 | class QQSpider(CrawlSpider): 14 | name = "qq" 15 | allowed_domains = ["tech.qq.com"] 16 | start_urls = ['http://tech.qq.com/'] 17 | 18 | rules = ( 19 | Rule(LinkExtractor(allow=('http://tech.qq.com/a/\d+/*', )), 
callback='parse_item'), 20 | ) 21 | 22 | def parse_item(self, response): 23 | r = response 24 | # inspect_response(response, self) 25 | 26 | title = r.xpath("//div[@class='qq_article']//h1/text()").extract() 27 | source = r.xpath("//div[@class='qq_article']//span[@class='a_source']/text()").extract() 28 | if title: 29 | title = title[0] 30 | if source: 31 | source = source[0] 32 | # 要求格式正确 33 | if not title or not source: 34 | redis_conn.hset(redis_invalid_url_key, response.url, 0) 35 | return 36 | content = ''.join(r.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract()) 37 | raw_time = r.xpath("//div[@class='qq_article']//span[@class='a_time']/text()").extract()[0] 38 | re_result = re.findall("\d{4}-\d{2}-\d{2} \d{2}:\d{2}", raw_time) 39 | if re_result: 40 | ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M')) 41 | else: 42 | ts = 0 43 | url = r.url 44 | new_news = NewsItem( 45 | title=title, 46 | content=content, 47 | source=source, 48 | published=ts, 49 | url=url 50 | ) 51 | return new_news 52 | -------------------------------------------------------------------------------- /news_web/web_server/models.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | from lib import db_utils 4 | import mongoengine as m 5 | 6 | db_utils.using_db("news") 7 | 8 | 9 | class NewsItem(m.Document): 10 | title = m.StringField() 11 | content = m.StringField() 12 | source = m.StringField() 13 | published = m.DateTimeField() 14 | url = m.StringField() 15 | 16 | meta = { 17 | 'db_alias': 'news', 18 | 'collection': 'news', 19 | } 20 | 21 | def to_json(self): 22 | return { 23 | 'title': self.title, 24 | 'content': self.content, 25 | 'source': self.source, 26 | 'published': self.published, 27 | 'url': self.url, 28 | } 29 | 30 | 31 | class Subscription(m.Document): 32 | open_id = m.StringField() 33 | keywords = m.ListField(m.StringField()) # 关键词 34 | tags = m.ListField(m.StringField()) # 标签 35 | 36 | meta = { 37 | 'db_alias': 'news', 38 | 'collection': 'subscription', 39 | } 40 | 41 | @staticmethod 42 | def ensure_subscription(open_id): 43 | subscription = Subscription.objects(open_id=open_id).first() 44 | if not subscription: 45 | subscription = Subscription(open_id=open_id) 46 | subscription.save() 47 | return subscription 48 | 49 | @staticmethod 50 | def add_keyword(open_id, keyword): 51 | subscription = Subscription.ensure_subscription(open_id) 52 | subscription.update(add_to_set__keywords=keyword) 53 | 54 | @staticmethod 55 | def remove_keyword(open_id, keyword): 56 | subscription = Subscription.ensure_subscription(open_id) 57 | subscription.update(pull__keywords=keyword) 58 | 59 | def to_json(self): 60 | return { 61 | 'open_id': self.open_id, 62 | 'keywords': self.keywords, 63 | 'tags': self.tags 64 | } 65 | -------------------------------------------------------------------------------- /news_web/frontend/article.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 文章内容 7 | 8 | 9 | 10 | 11 |
12-21 | [markup not preserved; visible placeholders: 标题 (title), 时间来源 (time / source), 正文 (article body)]
22 | 查看原文 23 | 订阅更多 24 | 25 | 26 | 27 | 46 | 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # DB 27 | *.sqlite3 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | 94 | ## Build generated 95 | build/ 96 | DerivedData/ 97 | 98 | ## Various settings 99 | *.pbxuser 100 | !default.pbxuser 101 | *.mode1v3 102 | !default.mode1v3 103 | *.mode2v3 104 | !default.mode2v3 105 | *.perspectivev3 106 | !default.perspectivev3 107 | xcuserdata/ 108 | 109 | ## Other 110 | *.moved-aside 111 | *.xcuserstate 112 | #*.xcworkspacedata 113 | 114 | ## Obj-C/Swift specific 115 | *.hmap 116 | *.ipa 117 | 118 | # CocoaPods 119 | # 120 | # We recommend against adding the Pods directory to your .gitignore. However 121 | # you should judge for yourself, the pros and cons are mentioned at: 122 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control 123 | # 124 | #Pods/ 125 | 126 | # Carthage 127 | # 128 | # Add this line if you want to avoid checking in source code from Carthage dependencies. 
129 | # Carthage/Checkouts 130 | 131 | Carthage/Build 132 | Vary-iOS.zip 133 | Web/list-urls.html 134 | 135 | *.DS_Store 136 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | import logging 9 | from utils import redis_conn, redis_url_key 10 | from scrapy.conf import settings 11 | from scrapy.exceptions import DropItem 12 | from wechat_push import send_msg 13 | from wechat_config import default_openid 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class MongoDBPipeline(object): 18 | 19 | def __init__(self): 20 | conn = pymongo.Connection( 21 | settings['MONGO_CONF']['host'], 22 | settings['MONGO_CONF']['port'] 23 | ) 24 | db = conn[settings['MONGO_CONF']['db']] 25 | self.news_collection = db[settings['MONGO_CONF']['collection']] 26 | 27 | def process_item(self, item, spider): 28 | valid = True 29 | for data in item: 30 | if not data: 31 | valid = False 32 | raise DropItem("Missing {0}!".format(data)) 33 | if valid: 34 | object_id = self.news_collection.insert(dict(item)) 35 | spider.object_id = str(object_id) 36 | logger.info("Question added to MongoDB database!") 37 | return item 38 | 39 | 40 | class RedisPipeline(object): 41 | 42 | def process_item(self, item, spider): 43 | redis_conn.hset(redis_url_key, item['url'], 0) 44 | return item 45 | 46 | 47 | class PushPipeline(object): 48 | 49 | def __init__(self): 50 | conn = pymongo.Connection( 51 | settings['MONGO_CONF']['host'], 52 | settings['MONGO_CONF']['port'] 53 | ) 54 | db = conn[settings['MONGO_CONF']['db']] 55 | self.subscription_collection = db[settings['MONGO_CONF']['subscription_collection']] 56 | 57 | def process_item(self, item, spider): 58 | subscription = self.subscription_collection.find_one( 59 | { 60 | 'open_id': default_openid 61 | } 62 | ) 63 | keywords = subscription.get('keywords', []) 64 | # 判断关键词 65 | keyword_in_title = any([keyword in item['title'] for keyword in keywords]) 66 | keyword_in_content = any([keyword in item['content'] for keyword in keywords]) 67 | if keyword_in_title or keyword_in_content: 68 | send_msg( 69 | title=item['title'], 70 | data=item['content'], 71 | object_id=spider.object_id, 72 | openid=default_openid 73 | ) 74 | return item 75 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import logging 8 | import random 9 | from utils import redis_conn, redis_url_key, redis_invalid_url_key 10 | from scrapy import signals 11 | from scrapy.conf import settings 12 | from scrapy.exceptions import IgnoreRequest 13 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 14 | from fake_useragent import UserAgent 15 | from settings import USER_ANGENT_LIST 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class RedisMiddleware(object): 20 | """ 21 | 用于去重 22 | """ 23 | 24 | def process_request(self, request, spider): 25 | if request.url not in 
spider.start_urls and (redis_conn.hexists(redis_url_key, request.url) or redis_conn.hexists(redis_invalid_url_key, request.url)): 26 | logger.info("Skip URL: %s, has been crawled" % request.url) 27 | raise IgnoreRequest("URL %s has been crawled" % request.url) 28 | 29 | 30 | class RotateUserAgentMiddleware(UserAgentMiddleware): 31 | 32 | def __init__(self, user_agent=''): 33 | self.user_agent = user_agent 34 | self.USER_ANGENT_LIST = USER_ANGENT_LIST 35 | 36 | def process_request(self, request, spider): 37 | random_ua = random.choice(self.USER_ANGENT_LIST) 38 | self.user_agent = random_ua 39 | request.headers.setdefault('User-Agent', random_ua) 40 | 41 | 42 | class NewscrawlerSpiderMiddleware(object): 43 | # Not all methods need to be defined. If a method is not defined, 44 | # scrapy acts as if the spider middleware does not modify the 45 | # passed objects. 46 | 47 | @classmethod 48 | def from_crawler(cls, crawler): 49 | # This method is used by Scrapy to create your spiders. 50 | s = cls() 51 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 52 | return s 53 | 54 | def process_spider_input(response, spider): 55 | # Called for each response that goes through the spider 56 | # middleware and into the spider. 57 | 58 | # Should return None or raise an exception. 59 | return None 60 | 61 | def process_spider_output(response, result, spider): 62 | # Called with the results returned from the Spider, after 63 | # it has processed the response. 64 | 65 | # Must return an iterable of Request, dict or Item objects. 66 | for i in result: 67 | yield i 68 | 69 | def process_spider_exception(response, exception, spider): 70 | # Called when a spider or process_spider_input() method 71 | # (from other spider middleware) raises an exception. 72 | 73 | # Should return either None or an iterable of Response, dict 74 | # or Item objects. 75 | pass 76 | 77 | def process_start_requests(start_requests, spider): 78 | # Called with the start requests of the spider, and works 79 | # similarly to the process_spider_output() method, except 80 | # that it doesn’t have a response associated. 81 | 82 | # Must return only requests (not items). 83 | for r in start_requests: 84 | yield r 85 | 86 | def spider_opened(self, spider): 87 | spider.logger.info('Spider opened: %s' % spider.name) 88 | -------------------------------------------------------------------------------- /news_web/news_web/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for news_web project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.9.7. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.9/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '&q2kufvz51ooh&ou_41tqv=w^3k9dx!o@(^+s$zp4=2&lm2@w1' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.contenttypes', 35 | 'django.contrib.staticfiles', 36 | 'lib', 37 | 'web_server', 38 | ] 39 | 40 | MIDDLEWARE_CLASSES = [ 41 | 'news_web.middlewares.JsonMiddleware', 42 | 'news_web.middlewares.QuertStringMiddleware', 43 | ] 44 | 45 | ROOT_URLCONF = 'news_web.urls' 46 | 47 | TEMPLATES = [ 48 | { 49 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 50 | 'DIRS': [], 51 | 'APP_DIRS': True, 52 | 'OPTIONS': { 53 | 'context_processors': [ 54 | 'django.template.context_processors.debug', 55 | 'django.template.context_processors.request', 56 | 'django.contrib.auth.context_processors.auth', 57 | 'django.contrib.messages.context_processors.messages', 58 | ], 59 | }, 60 | }, 61 | ] 62 | 63 | WSGI_APPLICATION = 'news_web.wsgi.application' 64 | 65 | 66 | # Database 67 | # https://docs.djangoproject.com/en/1.9/ref/settings/#databases 68 | 69 | DATABASES = { 70 | 'default': { 71 | }, 72 | 'redis': { 73 | 'HOST': 'localhost', 74 | 'PORT': 6379, 75 | }, 76 | 'mongodb': { 77 | 'HOST': 'localhost', 78 | 'PORT': 27017, 79 | } 80 | } 81 | 82 | 83 | # Password validation 84 | # https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators 85 | 86 | AUTH_PASSWORD_VALIDATORS = [ 87 | { 88 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 89 | }, 90 | { 91 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 92 | }, 93 | { 94 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 95 | }, 96 | { 97 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 98 | }, 99 | ] 100 | 101 | 102 | # Internationalization 103 | # https://docs.djangoproject.com/en/1.9/topics/i18n/ 104 | 105 | LANGUAGE_CODE = 'en-us' 106 | 107 | TIME_ZONE = 'UTC' 108 | 109 | USE_I18N = True 110 | 111 | USE_L10N = True 112 | 113 | USE_TZ = True 114 | 115 | 116 | # Static files (CSS, JavaScript, Images) 117 | # https://docs.djangoproject.com/en/1.9/howto/static-files/ 118 | 119 | STATIC_URL = '/static/' 120 | 121 | 122 | DEFAULT_WECHAT_OPENID = 'oCfHOwD2_5ZpGctshTZPeZHqUIjc' 123 | -------------------------------------------------------------------------------- /news_web/frontend/subscription.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 订阅 7 | 8 | 9 | 10 | 11 | 12 |

13-31 | [markup not preserved; visible text: 订阅 (subscription header), 已订阅关键词 (subscribed keywords section), 订阅关键词 (keyword-to-subscribe input section)]
32 | 添加订阅关键词 33 | 返回 34 | 35 | 36 | 37 | 38 | 116 | 117 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for newscrawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'newscrawler' 13 | 14 | SPIDER_MODULES = ['newscrawler.spiders'] 15 | NEWSPIDER_MODULE = 'newscrawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'newscrawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'newscrawler.middlewares.NewscrawlerSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 57 | 'newscrawler.middlewares.RedisMiddleware': 300, 58 | 'newscrawler.middlewares.RotateUserAgentMiddleware': 400, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # 来源 fake-useragent 68 | USER_ANGENT_LIST = [ 69 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 70 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36', 71 | u'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36', 72 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36', 73 | u'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36', 74 | u'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36', 75 | u'Mozilla/5.0 (Windows NT 6.3; WOW64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36', 76 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36', 77 | u'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36', 78 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36', 79 | u'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36', 80 | u'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36', 81 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36', 82 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36', 83 | u'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36', 84 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36', 85 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36', 86 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36', 87 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36', 88 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36', 89 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36', 90 | u'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F', 91 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10', 92 | u'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36', 93 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36', 94 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36', 95 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36', 96 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36', 97 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36', 98 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36', 99 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36', 100 | u'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36', 101 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36', 102 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36', 103 | u'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36', 104 | u'Mozilla/5.0 (Windows NT 6.2) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36', 105 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36', 106 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 107 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 108 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 109 | u'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 110 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 111 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 112 | u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36', 113 | u'Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36', 114 | u'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36', 115 | u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17', 116 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17', 117 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15', 118 | u'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14', 119 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1', 120 | u'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0', 121 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0', 122 | u'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0', 123 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0', 124 | u'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0', 125 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0', 126 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0', 127 | u'Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0', 128 | u'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0', 129 | u'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3', 130 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0', 131 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0', 132 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0', 133 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0', 134 | u'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0', 135 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0', 136 | u'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0', 137 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0', 138 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0', 139 | u'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0', 140 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) 
Gecko/20130328 Firefox/22.0', 141 | u'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0', 142 | u'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0', 143 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1', 144 | u'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1', 145 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0', 146 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0', 147 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0', 148 | u'Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0', 149 | u'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0', 150 | u'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0', 151 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0', 152 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0', 153 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0', 154 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0', 155 | u'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0', 156 | u'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0', 157 | u'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0', 158 | u'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0', 159 | u'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0', 160 | u'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0', 161 | u'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0', 162 | u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0', 163 | u'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0', 164 | u'Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0', 165 | u'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0', 166 | u'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1', 167 | u'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0', 168 | u'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6' 169 | ] 170 | 171 | # Configure item pipelines 172 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 173 | ITEM_PIPELINES = { 174 | 'newscrawler.pipelines.MongoDBPipeline': 300, 175 | 'newscrawler.pipelines.RedisPipeline': 400, 176 | 'newscrawler.pipelines.PushPipeline': 500, 177 | } 178 | 179 | # Enable and configure the AutoThrottle extension (disabled by default) 180 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 181 | #AUTOTHROTTLE_ENABLED = True 182 | # The initial download delay 183 | #AUTOTHROTTLE_START_DELAY = 5 184 | # The maximum download delay to be set in case of high latencies 185 | #AUTOTHROTTLE_MAX_DELAY = 60 186 | # The average number of requests Scrapy should be sending in parallel to 187 | # each remote server 188 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 189 | # Enable showing throttling stats for every response received: 190 | #AUTOTHROTTLE_DEBUG = False 191 | 192 | # Enable and configure HTTP caching (disabled by default) 193 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 194 | #HTTPCACHE_ENABLED = True 195 | #HTTPCACHE_EXPIRATION_SECS = 0 196 | #HTTPCACHE_DIR = 'httpcache' 197 | 
#HTTPCACHE_IGNORE_HTTP_CODES = [] 198 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 199 | 200 | # MONGODB 配置 201 | MONGO_CONF = { 202 | 'host': 'localhost', 203 | 'port': 27017, 204 | 'db': 'news', 205 | 'collection': 'news', 206 | 'subscription_collection': 'subscription' 207 | } 208 | 209 | 210 | # REDIS 配置 211 | REDIS_CONF = { 212 | 'host': 'localhost', 213 | 'port': 6379, 214 | 'db': 5 215 | } 216 | --------------------------------------------------------------------------------
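A closing note on MONGO_CONF: MongoDBPipeline and PushPipeline open it with pymongo.Connection, a class that only exists in the pymongo 2.x line pinned in requirements.txt (pymongo==2.8.1). If the dependency were ever bumped to pymongo 3.x or later, the equivalent hookup would look roughly like this sketch, which is not part of the repository:

# Hypothetical pymongo 3.x+ variant of the pipelines' connection setup;
# the project itself pins pymongo==2.8.1 and uses pymongo.Connection.
import pymongo

MONGO_CONF = {'host': 'localhost', 'port': 27017, 'db': 'news', 'collection': 'news'}

client = pymongo.MongoClient(MONGO_CONF['host'], MONGO_CONF['port'])
news_collection = client[MONGO_CONF['db']][MONGO_CONF['collection']]
result = news_collection.insert_one({'title': u'demo', 'content': u'...'})
object_id = result.inserted_id  # ObjectId of the stored document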