[Stripped HTML template fragment — only its text labels survive: 标题 (title), 时间来源 (time & source), 正文 (body); the markup itself is not preserved.]
19 |├── news_web ├── lib │ ├── utils.py │ ├── __init__.py │ ├── error_code.py │ ├── db_utils.py │ └── response.py ├── news_web │ ├── __init__.py │ ├── wsgi.py │ ├── urls.py │ ├── middlewares.py │ └── settings.py ├── web_server │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ ├── admin.py │ ├── tests.py │ ├── apps.py │ ├── urls.py │ ├── views.py │ └── models.py ├── run_server.sh ├── init_db.py ├── manage.py └── frontend │ ├── article.html │ └── subscription.html ├── newscrawler ├── newscrawler │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── netease.py │ │ └── qq.py │ ├── utils.py │ ├── items.py │ ├── wechat_config.py │ ├── wechat_push.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py ├── worker.py ├── scrapy.cfg └── start_crawl.py ├── README.md ├── 论文相关文件 ├── 用例图.png ├── 爬虫部分目录.png ├── 系统总体框架.png ├── MongoDB.png ├── nginx配置.png ├── scrapy架构.png ├── spider实现.png ├── 启动API服务器.png ├── 启动spider.png ├── 新闻推送活动图.png ├── 新闻订阅活动图.png ├── WechatIMG37.png ├── WechatIMG38.png ├── WechatIMG39.png ├── WechatIMG40.png ├── useragent.png ├── 订阅与展示部分目录.png ├── WechatIMG41.jpeg ├── WechatIMG42.jpeg ├── WechatIMG43.jpeg ├── WechatIMG44.jpeg └── 基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf ├── .gitmodules ├── requirements.txt ├── LICENSE └── .gitignore /news_web/lib/utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/news_web/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/web_server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news_web/web_server/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NewsCrawler 2 | 毕业设计 基于网络爬虫的新闻采集和订阅系统的设计与实现 3 | -------------------------------------------------------------------------------- /论文相关文件/用例图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/用例图.png -------------------------------------------------------------------------------- /论文相关文件/爬虫部分目录.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/爬虫部分目录.png -------------------------------------------------------------------------------- /论文相关文件/系统总体框架.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/系统总体框架.png -------------------------------------------------------------------------------- /news_web/web_server/admin.py: 
-------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /news_web/web_server/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /论文相关文件/MongoDB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/MongoDB.png -------------------------------------------------------------------------------- /论文相关文件/nginx配置.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/nginx配置.png -------------------------------------------------------------------------------- /论文相关文件/scrapy架构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/scrapy架构.png -------------------------------------------------------------------------------- /论文相关文件/spider实现.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/spider实现.png -------------------------------------------------------------------------------- /论文相关文件/启动API服务器.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/启动API服务器.png -------------------------------------------------------------------------------- /论文相关文件/启动spider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/启动spider.png -------------------------------------------------------------------------------- /论文相关文件/新闻推送活动图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/新闻推送活动图.png -------------------------------------------------------------------------------- /论文相关文件/新闻订阅活动图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/新闻订阅活动图.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG37.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG38.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG39.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG40.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG40.png -------------------------------------------------------------------------------- /论文相关文件/useragent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/useragent.png -------------------------------------------------------------------------------- /论文相关文件/订阅与展示部分目录.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/订阅与展示部分目录.png -------------------------------------------------------------------------------- /论文相关文件/WechatIMG41.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG41.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG42.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG42.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG43.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG43.jpeg -------------------------------------------------------------------------------- /论文相关文件/WechatIMG44.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/WechatIMG44.jpeg -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "IPProxyTool"] 2 | path = IPProxyTool 3 | url = https://github.com/awolfly9/IPProxyTool.git 4 | -------------------------------------------------------------------------------- /论文相关文件/基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BillBillBillBill/NewsCrawler/HEAD/论文相关文件/基于网络爬虫的新闻采集和订阅系统的设计与实现_黄雄镖_终稿.pdf -------------------------------------------------------------------------------- /news_web/run_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "启动服务器" 3 | #python manage.py runserver 0.0.0.0:8000 4 | gunicorn news_web.wsgi:application -w 4 -b :8000 -k gevent --max-requests 1000 5 | -------------------------------------------------------------------------------- /news_web/init_db.py: -------------------------------------------------------------------------------- 1 | from web_server.models import NewsItem, Subscription 2 | 3 | 4 | if __name__ == '__main__': 5 | NewsItem.ensure_indexes() 6 | Subscription.ensure_indexes() 7 | -------------------------------------------------------------------------------- /news_web/web_server/apps.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from django.apps import AppConfig 4 | 5 | 6 | class WebServerConfig(AppConfig): 7 | name = 'web_server' 8 | 
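A note on startup order — an assumed sequence, not documented in the repository: lib/db_utils.py registers its mongoengine connection against localhost:27017, so a local mongod should be up before init_db.py's ensure_indexes() calls reach the database. With that in place, a plausible bring-up from the news_web directory is:

    python init_db.py      # calls ensure_indexes() on NewsItem and Subscription
    sh run_server.sh       # gunicorn news_web.wsgi:application -w 4 -b :8000 -k gevent

The manage.py runserver line in run_server.sh is left commented out as a development-only fallback.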
-------------------------------------------------------------------------------- /newscrawler/worker.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | 4 | 5 | if __name__ == '__main__': 6 | while True: 7 | subprocess.call(['python', 'start_crawl.py']) 8 | time.sleep(30) 9 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pymongo==2.8.1 2 | scrapy==1.3.1 3 | redis 4 | fake-useragent 5 | django 6 | mongoengine 7 | jieba 8 | w3lib 9 | lxml 10 | twisted==16.4.1 11 | gevent==1.1.2 12 | greenlet==0.4.10 13 | gunicorn==19.6.0 14 | -------------------------------------------------------------------------------- /news_web/web_server/urls.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from django.conf.urls import url 3 | from views import ping, news, subscriptions 4 | 5 | urlpatterns = [ 6 | url(r'^ping$', ping), 7 | url(r'^news$', news), 8 | url(r'^subscriptions$', subscriptions), 9 | ] 10 | -------------------------------------------------------------------------------- /news_web/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "news_web.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /newscrawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = newscrawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = newscrawler 12 | -------------------------------------------------------------------------------- /news_web/lib/error_code.py: -------------------------------------------------------------------------------- 1 | 2 | err_code_message = { 3 | '10000': 'Unknown Error', 4 | '10001': 'Bad Request', 5 | '10005': 'Internal Error(Json Dumps Error)', 6 | '99999': 'Jsonschema Check Fail', 7 | } 8 | 9 | 10 | def get_message(err_code): 11 | return err_code_message.get(str(err_code), "Undefined Error Code") 12 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import redis 3 | from scrapy.conf import settings 4 | 5 | 6 | redis_conn = redis.Redis( 7 | host=settings['REDIS_CONF']['host'], 8 | port=settings['REDIS_CONF']['port'], 9 | db=settings['REDIS_CONF']['db'] 10 | ) 11 | redis_url_key = "url" 12 | redis_invalid_url_key = "invalid_url" 13 | 
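newscrawler/pipelines.py is listed in the directory tree but its contents are not included above, so only the invalid-URL bookkeeping in qq.py (further down) shows these Redis keys in use. A minimal, hypothetical sketch of how redis_url_key could back URL de-duplication in an item pipeline — an assumption for illustration, not the project's actual pipeline code:

    # hypothetical pipeline sketch; the real newscrawler/pipelines.py is not shown here
    from scrapy.exceptions import DropItem
    from newscrawler.utils import redis_conn, redis_url_key

    class UrlDedupPipeline(object):
        def process_item(self, item, spider):
            # hsetnx stores the URL only on first sight and returns 0 for repeats
            if not redis_conn.hsetnx(redis_url_key, item['url'], 0):
                raise DropItem('duplicate url: %s' % item['url'])
            return item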
-------------------------------------------------------------------------------- /news_web/lib/db_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import datetime 4 | import mongoengine 5 | 6 | 7 | def now(): 8 | return datetime.datetime.utcnow() 9 | 10 | 11 | def get_utc_seconds(dt): 12 | return int(time.mktime(dt.timetuple()) - time.timezone) 13 | 14 | 15 | def using_db(alias): 16 | mongoengine.register_connection( 17 | alias, alias, 18 | host="localhost", 19 | port=27017 20 | ) 21 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NewsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | content = scrapy.Field() 16 | source = scrapy.Field() 17 | published = scrapy.Field() 18 | url = scrapy.Field() 19 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/wechat_config.py: -------------------------------------------------------------------------------- 1 | 2 | appid = 'wx98236d864d8a122a' 3 | appsecret = '2530005d0440a7d3980ab6abca806357' 4 | token_url = 'https://api.weixin.qq.com/cgi-bin/token?' 5 | default_openid = 'oCfHOwD2_5ZpGctshTZPeZHqUIjc' 6 | maintainers = ['oCfHOwD2_5ZpGctshTZPeZHqUIjc'] 7 | expires_time = None 8 | 9 | template_id = 'B1ftH9IQr4Cuy_1M_94Je851RKGgQU-Uc0CH5ej4-oo' 10 | send_url = 'https://api.weixin.qq.com/cgi-bin/message/template/send?access_token=' 11 | template_url = 'http://182.254.225.214/article.html?newsId=' 12 | -------------------------------------------------------------------------------- /news_web/news_web/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for news_web project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | from gevent import monkey 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | 15 | monkey.patch_all() 16 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "news_web.settings") 17 | 18 | application = get_wsgi_application() 19 | -------------------------------------------------------------------------------- /newscrawler/start_crawl.py: -------------------------------------------------------------------------------- 1 | from twisted.internet import reactor, defer 2 | from scrapy.crawler import CrawlerRunner 3 | from scrapy.utils.project import get_project_settings 4 | from scrapy.utils.log import configure_logging 5 | from newscrawler.spiders.netease import NeteaseSpider 6 | from newscrawler.spiders.qq import QQSpider 7 | 8 | 9 | configure_logging() 10 | settings = get_project_settings() 11 | runner = CrawlerRunner(settings) 12 | 13 | 14 | @defer.inlineCallbacks 15 | def crawl(): 16 | yield runner.crawl(NeteaseSpider) 17 | yield runner.crawl(QQSpider) 18 | reactor.stop() 19 | 20 | 21 | if __name__ == '__main__': 22 | crawl() 23 | reactor.run() 24 | -------------------------------------------------------------------------------- /news_web/news_web/urls.py: -------------------------------------------------------------------------------- 1 | """news_web URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.9/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. 
Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url, include 17 | 18 | urlpatterns = [ 19 | url(r'api/', include('web_server.urls')), 20 | ] 21 | -------------------------------------------------------------------------------- /news_web/news_web/middlewares.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import urlparse 4 | from django.http import HttpResponse 5 | from lib import response 6 | 7 | 8 | class JsonMiddleware(object): 9 | def process_request(self, request): 10 | try: 11 | request.json = json.loads(request.body) 12 | except: 13 | request.json = {} 14 | 15 | 16 | class QuertStringMiddleware(object): 17 | def process_request(self, request): 18 | query_string = request.META.get("QUERY_STRING", "") 19 | # convert to json, flat it 20 | try: 21 | request.qs = {} 22 | for k, v in urlparse.parse_qs(query_string).items(): 23 | if len(v) == 1: 24 | request.qs[k] = v[0] 25 | else: 26 | request.qs[k] = v 27 | except: 28 | request.qs = {} 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Xiongbiao Huang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /news_web/lib/response.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from django.http import HttpResponse 3 | import json 4 | from error_code import get_message 5 | 6 | 7 | def JsonErrorResponse(code=10001, error_body=None): 8 | try: 9 | if error_body: 10 | data = json.dumps({ 11 | 'code': code, 12 | 'message': error_body 13 | }) 14 | else: 15 | data = '{"code": %s, "message": "%s"}' % (code, get_message(code)) 16 | except: 17 | return JsonErrorResponse(10005) 18 | response = HttpResponse( 19 | content=data, 20 | content_type='application/json; charset=utf-8', 21 | status=400 22 | ) 23 | return response 24 | 25 | 26 | def JsonResponse(json_data={}): 27 | try: 28 | if isinstance(json_data, str): 29 | data = '{"code": 0, "data": %s}' % json_data 30 | else: 31 | data = { 32 | "code": 0, 33 | "data": json_data 34 | } 35 | data = json.dumps(data) 36 | except: 37 | return JsonErrorResponse(10005) 38 | response = HttpResponse( 39 | content=data, 40 | content_type='application/json; charset=utf-8', 41 | status=200 42 | ) 43 | return response 44 | 45 | 46 | def not_match_func(request): 47 | return JsonErrorResponse(10002) 48 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/netease.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import time 5 | from datetime import datetime 6 | from scrapy.spiders import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.shell import inspect_response 9 | from newscrawler.items import NewsItem 10 | 11 | 12 | class NeteaseSpider(CrawlSpider): 13 | name = "netease" 14 | allowed_domains = ["163.com"] 15 | start_urls = ['http://tech.163.com/'] 16 | 17 | rules = ( 18 | Rule(LinkExtractor(allow=('/\d+/\d+/\d+/*', )), callback='parse_item'), 19 | ) 20 | 21 | def parse_item(self, response): 22 | # inspect_response(response, self) 23 | r = response 24 | title = r.xpath('/html/head/title/text()').extract()[0].strip() 25 | source = r.xpath("//a[@id='ne_article_source']/text()").extract()[0].strip() 26 | content = "".join(r.xpath("//div[@id='endText']/p/text()").extract()).strip() 27 | raw_time = r.xpath("//div[@class='post_time_source']/text()").extract()[0] 28 | re_result = re.findall("\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", raw_time) 29 | if re_result: 30 | ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M:%S')) 31 | else: 32 | ts = 0 33 | url = r.url 34 | new_news = NewsItem( 35 | title=title, 36 | content=content, 37 | source=source, 38 | published=ts, 39 | url=url 40 | ) 41 | return new_news 42 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/wechat_push.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import datetime 4 | import urllib 5 | import requests 6 | import wechat_config as config 7 | 8 | 9 | def update_token(func): 10 | def wrapper(*args, **kwargs): 11 | now = datetime.datetime.now() 12 | # 刷新token 13 | if not config.expires_time or now >= config.expires_time: 14 | data = { 15 | 'grant_type': 'client_credential', 16 | 'appid': config.appid, 17 | 'secret': config.appsecret 18 | } 19 | ret = requests.get(config.token_url + urllib.urlencode(data)) 20 | result = 
json.loads(ret.text) 21 | config.access_token = result['access_token'] 22 | config.expires_time = now + datetime.timedelta(hours=2) 23 | return func(*args, **kwargs) 24 | return wrapper 25 | 26 | 27 | @update_token 28 | def send_msg(title, data, object_id, openid=config.maintainers[0]): 29 | data = json.dumps({ 30 | 'touser': openid, 31 | 'template_id': config.template_id, 32 | 'url': config.template_url + object_id, # 点击打开的链接 33 | 'data': { 34 | 'title': { 35 | 'value': title, 36 | 'color': '#173177' 37 | }, 38 | 'data': { 39 | 'value': data, 40 | 'color': '#173177' 41 | }, 42 | } 43 | }).encode() 44 | url = config.send_url + config.access_token 45 | ret = requests.post(url, data) 46 | print 'send result: {}'.format(ret.text) 47 | 48 | 49 | if __name__ == '__main__': 50 | from IPython import embed 51 | embed() 52 | for maintainer in config.maintainers: 53 | send_msg('hello', 'start running~', maintainer) 54 | -------------------------------------------------------------------------------- /news_web/web_server/views.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from lib.response import JsonResponse, not_match_func 3 | from models import NewsItem, Subscription 4 | from news_web.settings import DEFAULT_WECHAT_OPENID 5 | 6 | 7 | def ping(request): 8 | handler_map = { 9 | "GET": ping_get 10 | } 11 | return handler_map.get(request.method, not_match_func)(request) 12 | 13 | 14 | def news(request): 15 | handler_map = { 16 | "GET": get_news 17 | } 18 | return handler_map.get(request.method, not_match_func)(request) 19 | 20 | 21 | def subscriptions(request): 22 | handler_map = { 23 | "GET": get_subscriptions, 24 | "POST": add_subscriptions, 25 | "DELETE": remove_subscriptions 26 | } 27 | return handler_map.get(request.method, not_match_func)(request) 28 | 29 | 30 | def ping_get(request): 31 | return JsonResponse( 32 | { 33 | 'msg': 'pong' 34 | } 35 | ) 36 | 37 | 38 | def get_news(request): 39 | news_id = request.qs['news_id'] 40 | return JsonResponse( 41 | { 42 | 'news': NewsItem.objects.get(id=news_id).to_json() 43 | } 44 | ) 45 | 46 | 47 | def get_subscriptions(request): 48 | subscription = Subscription.ensure_subscription(DEFAULT_WECHAT_OPENID) 49 | return JsonResponse( 50 | { 51 | 'subscription': subscription.to_json() 52 | } 53 | ) 54 | 55 | 56 | def add_subscriptions(request): 57 | keyword = request.json.get('keyword') 58 | if keyword: 59 | Subscription.add_keyword(DEFAULT_WECHAT_OPENID, keyword) 60 | return JsonResponse({}) 61 | 62 | 63 | def remove_subscriptions(request): 64 | keyword = request.json.get('keyword') 65 | if keyword: 66 | Subscription.remove_keyword(DEFAULT_WECHAT_OPENID, keyword) 67 | return JsonResponse({}) 68 | -------------------------------------------------------------------------------- /newscrawler/newscrawler/spiders/qq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import time 5 | from datetime import datetime 6 | from scrapy.spiders import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.shell import inspect_response 9 | from newscrawler.utils import redis_conn, redis_invalid_url_key 10 | from newscrawler.items import NewsItem 11 | 12 | 13 | class QQSpider(CrawlSpider): 14 | name = "qq" 15 | allowed_domains = ["tech.qq.com"] 16 | start_urls = ['http://tech.qq.com/'] 17 | 18 | rules = ( 19 | Rule(LinkExtractor(allow=('http://tech.qq.com/a/\d+/*', )), 
callback='parse_item'), 20 | ) 21 | 22 | def parse_item(self, response): 23 | r = response 24 | # inspect_response(response, self) 25 | 26 | title = r.xpath("//div[@class='qq_article']//h1/text()").extract() 27 | source = r.xpath("//div[@class='qq_article']//span[@class='a_source']/text()").extract() 28 | if title: 29 | title = title[0] 30 | if source: 31 | source = source[0] 32 | # 要求格式正确 33 | if not title or not source: 34 | redis_conn.hset(redis_invalid_url_key, response.url, 0) 35 | return 36 | content = ''.join(r.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract()) 37 | raw_time = r.xpath("//div[@class='qq_article']//span[@class='a_time']/text()").extract()[0] 38 | re_result = re.findall("\d{4}-\d{2}-\d{2} \d{2}:\d{2}", raw_time) 39 | if re_result: 40 | ts = time.mktime(time.strptime(re_result[0], '%Y-%m-%d %H:%M')) 41 | else: 42 | ts = 0 43 | url = r.url 44 | new_news = NewsItem( 45 | title=title, 46 | content=content, 47 | source=source, 48 | published=ts, 49 | url=url 50 | ) 51 | return new_news 52 | -------------------------------------------------------------------------------- /news_web/web_server/models.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | from lib import db_utils 4 | import mongoengine as m 5 | 6 | db_utils.using_db("news") 7 | 8 | 9 | class NewsItem(m.Document): 10 | title = m.StringField() 11 | content = m.StringField() 12 | source = m.StringField() 13 | published = m.DateTimeField() 14 | url = m.StringField() 15 | 16 | meta = { 17 | 'db_alias': 'news', 18 | 'collection': 'news', 19 | } 20 | 21 | def to_json(self): 22 | return { 23 | 'title': self.title, 24 | 'content': self.content, 25 | 'source': self.source, 26 | 'published': self.published, 27 | 'url': self.url, 28 | } 29 | 30 | 31 | class Subscription(m.Document): 32 | open_id = m.StringField() 33 | keywords = m.ListField(m.StringField()) # 关键词 34 | tags = m.ListField(m.StringField()) # 标签 35 | 36 | meta = { 37 | 'db_alias': 'news', 38 | 'collection': 'subscription', 39 | } 40 | 41 | @staticmethod 42 | def ensure_subscription(open_id): 43 | subscription = Subscription.objects(open_id=open_id).first() 44 | if not subscription: 45 | subscription = Subscription(open_id=open_id) 46 | subscription.save() 47 | return subscription 48 | 49 | @staticmethod 50 | def add_keyword(open_id, keyword): 51 | subscription = Subscription.ensure_subscription(open_id) 52 | subscription.update(add_to_set__keywords=keyword) 53 | 54 | @staticmethod 55 | def remove_keyword(open_id, keyword): 56 | subscription = Subscription.ensure_subscription(open_id) 57 | subscription.update(pull__keywords=keyword) 58 | 59 | def to_json(self): 60 | return { 61 | 'open_id': self.open_id, 62 | 'keywords': self.keywords, 63 | 'tags': self.tags 64 | } 65 | -------------------------------------------------------------------------------- /news_web/frontend/article.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
[article.html markup not preserved in this export; only the text label 正文 (body) remains.]
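Taken together, the api/ prefix in news_web/news_web/urls.py, the routes in web_server/urls.py and the handlers in views.py describe a small JSON API. A usage sketch against the gunicorn binding from run_server.sh (port 8000), assuming the two middlewares in news_web/middlewares.py are enabled in settings.py (which is not included in this dump); <news_id> is a placeholder for a real NewsItem ObjectId and the keyword value is arbitrary:

    curl http://localhost:8000/api/ping
    # -> {"code": 0, "data": {"msg": "pong"}}
    curl "http://localhost:8000/api/news?news_id=<news_id>"
    curl -X POST   -d '{"keyword": "AI"}' http://localhost:8000/api/subscriptions
    curl -X DELETE -d '{"keyword": "AI"}' http://localhost:8000/api/subscriptions

Query strings are flattened into request.qs and JSON bodies into request.json by the middlewares, and every handler replies with the {"code": ..., "data"/"message": ...} envelope defined in lib/response.py.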