├── .gitignore ├── README.md ├── __init__.py ├── application.py ├── config ├── .DS_Store ├── __init__.py ├── myNewsApi.conf └── n_conf.py ├── controller ├── __init__.py ├── dataController.py └── newsController.py ├── cookie_secret.py ├── doc ├── source │ ├── ERDDiagram.jpg │ ├── news.png │ ├── 推荐新闻.png │ ├── 新闻.png │ ├── 新闻分数.png │ ├── 新闻基本信息.png │ ├── 新闻标签因子.png │ ├── 标签喜欢程度.png │ ├── 用户.png │ ├── 用户基本信息.png │ ├── 用户操作.png │ └── 用户行为信息.png ├── 互联网内容推荐系统需求分析.md ├── 互联网推荐系统API分析.md ├── 新闻推荐系统后台管理系统.md └── 新闻推荐系统数据库设计.md ├── handlers ├── UmFeedBack.py ├── UmMyNote.py ├── __init__.py ├── admin.py ├── api │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-34.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── newsApi.cpython-34.pyc │ │ └── newsApi.cpython-35.pyc │ └── newsApi.py ├── base.py ├── changePass.py ├── dataAna.py ├── dataOperator.py ├── errorHandler.py ├── index.py ├── newsManage.py ├── spider.py ├── system.py └── userManage.py ├── log.txt ├── methods ├── .DS_Store ├── __init__.py └── pDb.py ├── myNews.py ├── myNewsApi.log ├── server.py ├── spider ├── .DS_Store ├── __init__.py ├── allSource │ ├── .DS_Store │ ├── README.md │ ├── __all__ │ │ └── README.md │ ├── news_baby │ │ └── README.md │ ├── news_car │ │ └── README.md │ ├── news_discovery │ │ └── README.md │ ├── news_entertainment │ │ └── README.md │ ├── news_essay │ │ └── README.md │ ├── news_fashion │ │ └── README.md │ ├── news_finance │ │ └── README.md │ ├── news_food │ │ └── README.md │ ├── news_game │ │ └── README.md │ ├── news_history │ │ └── README.md │ ├── news_hot │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_regimen │ │ └── README.md │ ├── news_society │ │ └── README.md │ ├── news_sports │ │ └── README.md │ ├── news_story │ │ └── README.md │ ├── news_tech │ │ └── README.md │ ├── news_travel │ │ └── README.md │ └── news_world │ │ └── README.md ├── allSpider.py ├── mergeExcel.py ├── newsDb │ ├── __init__.py │ └── insertNews.py ├── pyspider │ ├── __init__.py │ ├── data │ │ ├── project.db │ │ ├── result.db │ │ ├── scheduler.1d │ │ ├── scheduler.1h │ │ ├── scheduler.all │ │ └── task.db │ ├── database │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── basedb.py │ │ ├── elasticsearch │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── local │ │ │ ├── __init__.py │ │ │ └── projectdb.py │ │ ├── mongodb │ │ │ ├── __init__.py │ │ │ ├── mongodbbase.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── mysql │ │ │ ├── __init__.py │ │ │ ├── mysqlbase.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── redis │ │ │ ├── __init__.py │ │ │ └── taskdb.py │ │ ├── sqlalchemy │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ ├── sqlalchemybase.py │ │ │ └── taskdb.py │ │ └── sqlite │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ ├── sqlitebase.py │ │ │ └── taskdb.py │ ├── fetcher │ │ ├── __init__.py │ │ ├── cookie_utils.py │ │ ├── phantomjs_fetcher.js │ │ └── tornado_fetcher.py │ ├── libs │ │ ├── ListIO.py │ │ ├── __init__.py │ │ ├── base_handler.py │ │ ├── bench.py │ │ ├── counter.py │ │ ├── dataurl.py │ │ ├── log.py │ │ ├── multiprocessing_queue.py │ │ ├── pprint.py │ │ ├── response.py │ │ ├── result_dump.py │ │ ├── sample_handler.py │ │ ├── url.py │ │ ├── utils.py │ │ └── wsgi_xmlrpc.py │ ├── logging.conf │ ├── message_queue │ │ ├── __init__.py │ │ ├── beanstalk.py │ │ ├── kombu_queue.py │ │ ├── rabbitmq.py │ │ 
└── redis_queue.py │ ├── processor │ │ ├── __init__.py │ │ ├── processor.py │ │ └── project_module.py │ ├── result │ │ ├── __init__.py │ │ └── result_worker.py │ ├── run.py │ ├── scheduler │ │ ├── __init__.py │ │ ├── scheduler.py │ │ ├── task_queue.py │ │ └── token_bucket.py │ └── webui │ │ ├── __init__.py │ │ ├── app.py │ │ ├── bench_test.py │ │ ├── debug.py │ │ ├── index.py │ │ ├── login.py │ │ ├── result.py │ │ ├── static │ │ ├── css_selector_helper.js │ │ ├── debug.css │ │ ├── debug.js │ │ ├── debug.less │ │ ├── index.css │ │ ├── index.js │ │ ├── index.less │ │ ├── result.css │ │ ├── result.less │ │ ├── splitter.js │ │ ├── task.css │ │ ├── task.less │ │ ├── tasks.css │ │ ├── tasks.less │ │ └── variable.less │ │ ├── task.py │ │ ├── templates │ │ ├── debug.html │ │ ├── helper.html │ │ ├── helper.js │ │ ├── index.html │ │ ├── result.html │ │ ├── task.html │ │ └── tasks.html │ │ └── webdav.py ├── pyspiderSource │ ├── .DS_Store │ ├── README.md │ ├── __all__ │ │ └── README.md │ ├── news_baby │ │ └── README.md │ ├── news_car │ │ └── README.md │ ├── news_discovery │ │ └── README.md │ ├── news_entertainment │ │ └── README.md │ ├── news_essay │ │ └── README.md │ ├── news_fashion │ │ └── README.md │ ├── news_finance │ │ └── README.md │ ├── news_food │ │ └── README.md │ ├── news_game │ │ └── README.md │ ├── news_history │ │ └── README.md │ ├── news_hot │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_regimen │ │ └── README.md │ ├── news_society │ │ └── README.md │ ├── news_sports │ │ └── README.md │ ├── news_story │ │ └── README.md │ ├── news_tech │ │ └── README.md │ ├── news_travel │ │ └── README.md │ └── news_world │ │ └── README.md ├── sina │ ├── README.md │ ├── __init__.py │ ├── sina.py │ └── sinaSpider.py ├── sinaSource │ ├── README.md │ ├── news_entertainment │ │ └── README.md │ ├── news_finance │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_society │ │ └── README.md │ ├── news_sports │ │ └── README.md │ ├── news_tech │ │ └── README.md │ └── news_world │ │ └── README.md ├── touTiaoSource │ ├── README.md │ ├── __all__ │ │ └── README.md │ ├── gallery_detail │ │ └── README.md │ ├── news_baby │ │ └── README.md │ ├── news_car │ │ ├── .DS_Store │ │ └── README.md │ ├── news_discovery │ │ └── README.md │ ├── news_entertainment │ │ ├── .DS_Store │ │ └── README.md │ ├── news_essay │ │ └── README.md │ ├── news_fashion │ │ └── README.md │ ├── news_finance │ │ ├── .DS_Store │ │ └── README.md │ ├── news_food │ │ └── README.md │ ├── news_game │ │ └── README.md │ ├── news_history │ │ └── README.md │ ├── news_hot │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_regimen │ │ └── README.md │ ├── news_society │ │ ├── .DS_Store │ │ └── README.md │ ├── news_sports │ │ ├── .DS_Store │ │ └── README.md │ ├── news_story │ │ └── README.md │ ├── news_tech │ │ ├── .DS_Store │ │ └── README.md │ ├── news_travel │ │ └── README.md │ ├── news_world │ │ └── README.md │ └── video │ │ └── README.md ├── toutiao │ ├── __init__.py │ ├── touTiao.py │ └── touTiaoSpider.py └── wordAna │ ├── .DS_Store │ ├── __init__.py │ ├── allNews │ ├── .DS_Store │ └── README.md │ ├── contentSpider.py │ ├── contentTool.py │ ├── excelTool.py │ └── wordAnaNews │ ├── .DS_Store │ └── README.md ├── static ├── css │ ├── admin.css │ ├── bootstrap.css │ ├── bootstrap.min.css │ ├── login.css │ ├── newsManage.css │ └── userManage.css ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.svg │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── 
glyphicons-halflings-regular.woff2 ├── images │ ├── 1.svg │ ├── admin.jpg │ ├── back.jpg │ ├── bg.jpg │ ├── bg.png │ ├── bgb.jpg │ ├── hand.png │ ├── left-handing.png │ ├── left_hand.png │ ├── news.png │ ├── password.png │ ├── right_hand.png │ ├── right_handing.png │ ├── save.svg │ ├── save0.svg │ ├── top_1.png │ ├── tou.png │ └── username.png └── js │ ├── admin.js │ ├── bootstrap.min.js │ ├── d3.js │ ├── d3.min.js │ ├── feedback.js │ ├── howie.js │ ├── jquery.min.js │ ├── newsManage.js │ └── userManage.js ├── system ├── README.md ├── classPredict │ ├── NavieBayesInfo │ │ ├── last_model.txt │ │ ├── model.txt │ │ ├── predict_new_word.txt │ │ ├── predict_result.txt │ │ ├── train_news_Info.txt │ │ └── word_id_dict.txt │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-34.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── main.cpython-34.pyc │ │ ├── main.cpython-35.pyc │ │ ├── newsPredict.cpython-34.pyc │ │ ├── newsPredict.cpython-35.pyc │ │ ├── predictTool.cpython-34.pyc │ │ └── predictTool.cpython-35.pyc │ ├── bayesTool.py │ ├── dataPrepare.py │ ├── main.py │ ├── newsPredict.py │ ├── predictTool.py │ ├── test.py │ └── trainData │ │ └── 2016-06-06-13-09-44&news_fashion.xlsx ├── latentFactor │ ├── README.md │ ├── __pycache__ │ │ ├── geneCalcul.cpython-34.pyc │ │ ├── geneCalcul.cpython-35.pyc │ │ ├── geneNewsType.cpython-34.pyc │ │ ├── geneNewsType.cpython-35.pyc │ │ ├── geneUserType.cpython-34.pyc │ │ └── geneUserType.cpython-35.pyc │ ├── dbTool.py │ ├── geneCalcul.py │ ├── geneNewsType.py │ ├── geneUserNews.py │ └── geneUserType.py └── pointsAlo │ ├── __pycache__ │ ├── scoreSetting.cpython-34.pyc │ └── scoreSetting.cpython-35.pyc │ └── scoreSetting.py ├── templates ├── admin.html ├── dataAna.html ├── index.html ├── main.html ├── newsManage.html ├── spider.html ├── system.html ├── umFeedBack.html ├── umMyNote.html └── userManage.html └── tools └── howie.sql /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignored files 2 | *.pyc 3 | 4 | # Ignored directories 5 | __pycache__/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Tornado News Data Management Platform 2 | 3 | ### Overview: 4 | 5 | **The code style and project structure have serious problems; the project is no longer maintained and is kept as a memento of my university years.** 6 | 7 | Simply `git clone https://github.com/howie6879/getNews` to your machine 8 | 9 | ### Description: 10 | 11 | Analyzes the collected news data, offers a graphical admin backend, and generates an API for the Android client 12 | 13 | 14 | ``` 15 | myNews 16 | Usage: myNews [-p <port>] 17 | 18 | Options: 19 | -h,--help show this help 20 | -p port number 21 | 22 | Example: 23 | myNews -p 8888 set the port to 8888 24 | ``` 25 | 26 |
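Besides the CLI above, the server can also be launched programmatically — a minimal sketch based on server.py further below (the port is an assumption taken from the README example, and the project's dependencies such as tornado, pymysql and docopt are assumed to be installed):

```python
# start the Tornado app without the docopt CLI wrapper
from server import main

main(8888)  # serves on http://127.0.0.1:8888, as server.py prints on startup
```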
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /application.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | import tornado.web 4 | import os 5 | from handlers.errorHandler import ErrorHandler 6 | from handlers.index import IndexHandler 7 | from handlers.admin import AdminHandler 8 | from handlers.dataAna import DataAna 9 | from handlers.spider import Spider 10 | from handlers.system import System 11 | from handlers.newsManage import NewsManage 12 | from handlers.UmFeedBack import UmFeedBack 13 | from handlers.UmMyNote import UmMyNote 14 | from handlers.userManage import UserManage 15 | from handlers.changePass import ChangePass 16 | from handlers.dataOperator import DataOperator 17 | import handlers.api.newsApi as api 18 | 19 | url = [ 20 | (r'/', IndexHandler), 21 | (r'/admin',AdminHandler), 22 | (r'/dataAna',DataAna), 23 | (r'/spider', Spider), 24 | (r'/system',System), 25 | (r'/newsManage',NewsManage), 26 | (r'/userManage',UserManage), 27 | (r'/umMyNote', UmMyNote), 28 | (r'/umFeedBack', UmFeedBack), 29 | (r'/changePass',ChangePass), 30 | (r'/dataOperator',DataOperator), 31 | (r'/api/register',api.Register), 32 | (r'/api/login', api.Login), 33 | (r'/api/newstags', api.NewsTags), 34 | (r'/api/newscontent', api.NewsContent), 35 | (r'/api/userinfo', api.UserInfo), 36 | (r'/api/userinfochange', api.UserInfoChange), 37 | (r'/api/lovenews', api.LoveNews), 38 | (r'/api/lovelist', api.LoveList), 39 | (r'/api/hotlist', api.HotList), 40 | (r'/api/feedback', api.FeedBack), 41 | (r'/api/keyword', api.KeyWord), 42 | (r'/api/comment', api.Comment), 43 | (r'/api/lovecomment', api.LoveComment), 44 | (r'/api/exitread', api.ExitRead), 45 | (r'/api/adminuser', api.AdminUser), 46 | (r'/api/adminuserinfo', api.AdminUserInfo), 47 | (r'/api/adminfeedback', api.AdminFeedback), 48 | (r'/api/returntags', api.ReturnTags), 49 | # this catch-all handler must stay last 50 | (r".*", ErrorHandler) 51 | ] 52 | 53 | setting = dict( 54 | template_path = os.path.join(os.path.dirname(__file__), "templates"), 55 | static_path = os.path.join(os.path.dirname(__file__), "static"), 56 | cookie_secret = "XQ5rhITaQ1m7HoN40CcggWPCvR2jqUn1tY9E3kWU+yc=", 57 | #xsrf_cookies = True, 58 | debug = True, 59 | login_url = '/', 60 | ) 61 | 62 | application = tornado.web.Application( 63 | handlers = url, 64 | **setting 65 | ) -------------------------------------------------------------------------------- /config/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/config/.DS_Store -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /config/myNewsApi.conf: -------------------------------------------------------------------------------- 1 | [program:myNewsApi] 2 | command =/root/anaconda3/bin/python /root/programmming/git/getNews/myNews.py -p 8884 3 | directory =/root/programmming/git/getNews 4 | user =root 5 | startsecs =3 6 | 7 | autostart=true 8 | autorestart=true 9 | 10 | redirect_stderr = true 11 | stdout_logfile_maxbytes = 50MB 12 | stdout_logfile_backups = 10 13 | stdout_logfile = /root/programmming/git/getNews/myNewsApi.log 14 | -------------------------------------------------------------------------------- /config/n_conf.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | 4 | 5 | admin = dict( 6 | WEBSITE=True, 7 | TOKEN="news&&admin" 8 | ) 9 | 10 | # local database configuration 11 | localDatabase = dict( 12 | host="127.0.0.1", 13 | user="root", 14 | password="", 15 | db="howie", 16 | charset="utf8", 17 | port=3306 18 | ) 19 | 20 | # path configuration 21 | #dirPath = "/home/howie/programming/python/getNews" 22 | dirPath = "/root/programming/git/getNews" 23 |
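For reference, the /api/* routes in application.py's routing table above can be exercised with any HTTP client; a hedged sketch follows (host, port and the lack of query parameters are assumptions — the expected parameters live in handlers/api/newsApi.py, whose bodies are not shown in this dump):

```python
import requests

BASE = "http://127.0.0.1:8888"  # assumed host/port, per the README example

# endpoints taken verbatim from the routing table above
hot = requests.get(BASE + "/api/hotlist")
tags = requests.get(BASE + "/api/returntags")
print(hot.status_code, tags.status_code)
```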
-------------------------------------------------------------------------------- /controller/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /controller/dataController.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import pandas as pd 4 | import os 5 | from config.n_conf import dirPath 6 | import controller.newsController as newsController 7 | 8 | 9 | 10 | class DataController(newsController.NewsController): 11 | def repeatedData(self, *dirs): 12 | """ 13 | func: deduplicate the crawled data 14 | :param *dirs: a list of directory names; dirs[0] holds the folder names, 2 by default 15 | :return: True on success, otherwise "No Data" 16 | """ 17 | self.initData = self.newsFiles("get", "allSource") 18 | if self.initData: 19 | for eachFile in self.initData: 20 | newsData = pd.read_excel(eachFile, sheetname="allNews") 21 | newsData = newsData.drop_duplicates() # drop duplicate rows 22 | # get the base path 23 | path = os.path.join(dirPath, 'spider') 24 | # build the path of the processed file 25 | for dir in dirs[0]: 26 | path = os.path.join(path, dir) 27 | filePath = os.path.join(path, os.path.split(eachFile)[1]) 28 | log = filePath + "文件去重成功" 29 | print(log) 30 | with open(dirPath+"/log.txt", 'a') as fp: 31 | fp.write(log + "\n") 32 | newsData.to_excel(excel_writer=filePath, sheet_name="allNews") 33 | return True 34 | else: 35 | return "No Data!" 36 | 37 | def rmAllNews(self, newsSource): 38 | for i in newsSource: 39 | self.newsFiles("rm", i) 40 | return self.rmRepeate(['wordAna', 'allNews']) 41 | 42 | 43 | newsSource = ["touTiaoSource", "sinaSource", "allSource","pyspiderSource"] 44 | DataController = DataController() 45 | # print(DataController.rmAllNews(newsSource)) # delete all raw data 46 | # print(DataController.initData) 47 | # print(DataController.initData) 48 | # DataController.rmRepeate(['wordAna','allNews']) # delete the spreadsheets in the dedup folder 49 | # DataController.rmRepeate(['wordAna','wordAnaNews']) # delete the spreadsheets in the word-analysis folder 50 | # print(DataController.repeatedData(['wordAna','allNews'])) # run deduplication
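repeatedData() above leans entirely on pandas' drop_duplicates; a self-contained sketch of that dedup step for illustration (the sample rows are made up — the real files come from spider/allSource and use an "allNews" sheet):

```python
import pandas as pd

# two rows mimic the same story crawled twice
news = pd.DataFrame({
    "title": ["A", "A", "B"],
    "news_link": ["http://x/1", "http://x/1", "http://x/2"],
})
deduped = news.drop_duplicates()       # same call repeatedData() makes
print(len(news), "->", len(deduped))   # 3 -> 2
```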
-------------------------------------------------------------------------------- /controller/newsController.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import os 4 | from config.n_conf import dirPath 5 | 6 | class NewsController(): 7 | """ 8 | System controller class 9 | """ 10 | 11 | def newsFiles(self, operator, sourceName): 12 | """ 13 | :func collect the crawled news Excel files under the spider/sourceName/ directory 14 | :param operator: "get" to collect files, "rm" to delete them 15 | sourceName: the news-site folder 16 | :return: "get" returns a list of file paths; after deletion, allFiles=False means the directory holds no files 17 | """ 18 | # locate the news directory 19 | path = os.path.join(os.path.join(dirPath, 'spider'), sourceName) 20 | allFiles = [] 21 | for dir in os.listdir(path): 22 | tarPath = os.path.join(path, dir) 23 | if os.path.isdir(tarPath): 24 | files = [file for file in os.listdir(tarPath) if 25 | os.path.isfile(os.path.join(tarPath, file)) and os.path.splitext(file)[1] == ".xlsx"] 26 | if files and operator == "get": 27 | for file in files: 28 | allFiles.append(os.path.join(tarPath, file)) 29 | # delete the raw data 30 | elif files and operator == "rm": 31 | for file in files: 32 | os.remove(os.path.join(tarPath, file)) 33 | log = os.path.join(tarPath, file) + "文件删除成功" 34 | print(log) 35 | with open(dirPath+"/log.txt", 'a') as fp: 36 | fp.write(log + "\n") 37 | if not allFiles: 38 | return False 39 | else: 40 | return allFiles 41 | 42 | def rmRepeate(self,*dirs): 43 | """ 44 | func: delete files that have already been deduplicated 45 | :param *dirs: a list of directory names; dirs[0] holds the folder names, 2 by default 46 | :return: True once deletion succeeds 47 | """ 48 | path = os.path.join(dirPath,'spider') 49 | # build the dedup data directory path 50 | for dir in dirs[0]: 51 | path = os.path.join(path,str(dir)) 52 | 53 | files = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path,file)) and os.path.splitext(file)[1] == ".xlsx"] 54 | for file in files: 55 | os.remove(os.path.join(path, file)) 56 | log = os.path.join(path, file) + "文件删除成功" 57 | print(log) 58 | with open(dirPath+"/log.txt", 'a') as fp: 59 | fp.write(log + "\n") 60 | return True 61 | -------------------------------------------------------------------------------- /cookie_secret.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import base64 4 | import uuid 5 | 6 | cookie_secret = base64.b64encode(uuid.uuid4().bytes + uuid.uuid4().bytes) 7 | #print(cookie_secret) -------------------------------------------------------------------------------- /doc/source/ERDDiagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/ERDDiagram.jpg -------------------------------------------------------------------------------- /doc/source/news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/news.png -------------------------------------------------------------------------------- /doc/source/推荐新闻.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/推荐新闻.png -------------------------------------------------------------------------------- /doc/source/新闻.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻.png -------------------------------------------------------------------------------- /doc/source/新闻分数.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻分数.png -------------------------------------------------------------------------------- /doc/source/新闻基本信息.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻基本信息.png -------------------------------------------------------------------------------- /doc/source/新闻标签因子.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻标签因子.png -------------------------------------------------------------------------------- /doc/source/标签喜欢程度.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/标签喜欢程度.png -------------------------------------------------------------------------------- /doc/source/用户.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户.png -------------------------------------------------------------------------------- /doc/source/用户基本信息.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户基本信息.png -------------------------------------------------------------------------------- /doc/source/用户操作.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户操作.png -------------------------------------------------------------------------------- /doc/source/用户行为信息.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户行为信息.png -------------------------------------------------------------------------------- /doc/新闻推荐系统后台管理系统.md: -------------------------------------------------------------------------------- 1 | ## News Recommendation System Admin Backend Documentation 2 | 3 | ### 1. Requirements Analysis 4 | 5 | ##### Goal 6 | 7 | Manage the data through a graphical interface 8 | 9 | ### 2. Database Design 10 | 11 | #### 2.1. User table (admin.user) 12 | 13 | | Column | Type | Nullable | Notes | 14 | | :--: | :---------: | :---------------------: | :--: | 15 | | id | int | not null auto_increment | PK | 16 | | name | varchar(10) | not null | admin name | 17 | | pass | varchar(40) | not null | password | 18 | 19 | ```mysql 20 | -- Table: user 21 | CREATE TABLE `n_admin` ( 22 | 23 | `id` int(11) NOT NULL AUTO_INCREMENT, 24 | 25 | `name` varchar(10) NOT NULL, 26 | 27 | `pass` varchar(40) NOT NULL, 28 | 29 | PRIMARY KEY (`id`) 30 | 31 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 32 | ``` 33 | ### 3. System Setup 34 | 35 | 36 | 37 | 38 | 39 |
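The pass column stores an MD5 of TOKEN + password, as handlers/index.py and handlers/changePass.py below show; a hedged sketch for seeding the admin row (the initial password is hypothetical, and the username 'admin' is inferred from changePass.py's UPDATE statement):

```python
import hashlib

TOKEN = "news&&admin"   # from config/n_conf.py
password = "secret"     # hypothetical initial password
digest = hashlib.md5((TOKEN + password).encode("utf-8")).hexdigest()
# emit the seed statement for tools/howie.sql-style setup
print("insert into n_admin (name, pass) values ('admin', '{}');".format(digest))
```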
-------------------------------------------------------------------------------- /handlers/UmFeedBack.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class UmFeedBack(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | 12 | 13 | self.render("umFeedBack.html") -------------------------------------------------------------------------------- /handlers/UmMyNote.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class UmMyNote(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | 12 | 13 | self.render("umMyNote.html") -------------------------------------------------------------------------------- /handlers/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /handlers/admin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.web 5 | import tornado.escape 6 | from methods.pDb import newsDb 7 | from handlers.base import BaseHandler 8 | 9 | 10 | class AdminHandler(BaseHandler): 11 | @tornado.web.authenticated 12 | def get(self, *args, **kwargs): 13 | user = self.get_argument("user") 14 | if user == "logout": 15 | self.clear_cookie("user") 16 | self.render("index.html") 17 | else: 18 | header = "新闻推荐系统后台" 19 | cateType = {"news_society":"社会", "news_entertainment":"娱乐","news_tech":"科技", "news_car":"汽车", "news_sports":"体育", "news_finance":"财经", 20 | "news_military":"军事", "news_world":"国际","news_fashion":"时尚", "news_travel":"旅游", "news_discovery":"探索", "news_baby":"育儿", 21 | "news_regimen":"养生", "news_story":"故事","news_essay":"美文", "news_game":"游戏", "news_history":"历史", "news_food":"美食"} 22 | numTag = {} 23 | for i in cateType.keys(): 24 | mSql = newsDb() 25 | result = mSql.select_table(table="get_news",column="count(*)",condition="tag",value=i) 26 | numTag[cateType[i]]=result[0][0] 27 | # sort tags by article count, descending 28 | sortTag = list(sorted(numTag.items(), key=lambda d:d[1], reverse = True)) 29 | self.render("admin.html", header=header, numTag=numTag,sortTag=sortTag[0:7]) 30 | -------------------------------------------------------------------------------- /handlers/api/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /handlers/api/__pycache__/__init__.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /handlers/api/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /handlers/api/__pycache__/newsApi.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/newsApi.cpython-34.pyc -------------------------------------------------------------------------------- /handlers/api/__pycache__/newsApi.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/newsApi.cpython-35.pyc -------------------------------------------------------------------------------- /handlers/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.web 5 | 6 | 7 | class BaseHandler(tornado.web.RequestHandler): 8 | def get_current_user(self): 9 | return self.get_secure_cookie("user") 10 | 11 | def write_error(self, status_code, **kwargs): 12 | self.write("错误页面,状态码{0}.\n".format( 13 | status_code)) 14 | 15 | def output(self): 16 | self.write("hi") 17 | print("hi") 18 |
-------------------------------------------------------------------------------- /handlers/changePass.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | import hashlib 6 | from methods.pDb import newsDb 7 | from config.n_conf import admin 8 | from handlers.base import BaseHandler 9 | 10 | 11 | class ChangePass(BaseHandler): 12 | @tornado.web.authenticated 13 | def get(self, *args, **kwargs): 14 | password = self.get_argument("pass") 15 | password = str(hashlib.md5((admin["TOKEN"] + password).encode("utf-8")).hexdigest()) 16 | sql = "update n_admin set pass='" + password + "' where name = 'admin'" # build the SQL statement to execute 17 | mSql = newsDb() 18 | if mSql.exeSql(sql): 19 | self.write("密码修改成功") 20 | else: 21 | self.write("密码修改失败") 22 | -------------------------------------------------------------------------------- /handlers/dataAna.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class DataAna(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "数据分析" 12 | self.render("dataAna.html",header=header) -------------------------------------------------------------------------------- /handlers/dataOperator.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | from spider import allSpider 7 | from controller.dataController import DataController, newsSource 8 | from spider.newsDb.insertNews import newsInsert 9 | from system.classPredict.main import startPredict 10 | from system.latentFactor.geneCalcul import GeneCulcal 11 | from methods.pDb import newsDb 12 | 13 | 14 | class DataOperator(BaseHandler): 15 | @tornado.web.authenticated 16 | def get(self, *args, **kwargs): 17 | # news categories 18 | action = self.get_argument('action') 19 | if action == "getNews": 20 | page = int(self.get_argument('page')) 21 | num = int(self.get_argument('num')) 22 | cate = ["__all__","news_hot","news_society", "news_entertainment", 23 | "news_tech", "news_car", "news_sports", "news_finance", "news_military", "news_world", 24 | "news_fashion", "news_travel", "news_discovery", "news_baby", "news_regimen", "news_story", 25 | "news_essay", "news_game", "news_history", "news_food"] 26 | allSpider.touTiao(category=cate, page=page, num=num) 27 | allSpider.sina(num=1000, page=1) 28 | print("success") 29 | elif action == "repeatedData": 30 | # merge first 31 | allSpider.merge() 32 | # then deduplicate 33 | print(DataController.repeatedData(['wordAna', 'allNews'])) 34 | print("success") 35 | elif action == "anaData": 36 | # run the word analysis 37 | allSpider.wordAna() 38 | 39 | elif action == "rmAllNews": 40 | DataController.rmAllNews(newsSource) 41 | print("success") 42 | elif action == "insertDB": 43 | # purge stale data 44 | db = newsDb() 45 | db.exeSql("delete from news_tag_deep") 46 | db.exeSql("delete from news_nums") 47 | #db.exeSql("delete from get_news where is_old=0") 48 | db.exeSql("insert into news_nums select * from news_nums_view") 49 | # insert the news into the database 50 | newsInsert.insertSql("wordAnaNews") 51 | # remove the spreadsheets in the word-analysis folder 52 | DataController.rmRepeate(['wordAna', 'wordAnaNews']) 53 | startPredict() 54 | gc = GeneCulcal() 55 | gc.getMatData() 56 | print("success") 57 |
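DataOperator above chains the crawl → dedup → analyze → insert pipeline behind GET parameters; a hedged sketch of driving it over HTTP (the port and the login credentials are assumptions — the handler is @authenticated, so a login cookie from IndexHandler.post is required first):

```python
import requests

BASE = "http://127.0.0.1:8888"  # assumed host/port
session = requests.Session()
# hypothetical credentials; a successful POST to "/" sets the secure cookie
session.post(BASE + "/", data={"username": "admin", "password": "secret"})
for action in ("getNews", "repeatedData", "anaData", "insertDB"):
    params = {"action": action}
    if action == "getNews":
        params.update(page=2, num=20)  # the same arguments the handler reads
    session.get(BASE + "/dataOperator", params=params)
```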
9 | self.write("错误状态码{0}.\n".format( 10 | status_code)) -------------------------------------------------------------------------------- /handlers/index.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.escape 5 | import hashlib 6 | from methods.pDb import newsDb 7 | from config.n_conf import admin 8 | from handlers.base import BaseHandler 9 | 10 | class IndexHandler(BaseHandler): 11 | def get(self): 12 | self.clear_cookie("user") 13 | self.render("index.html") if admin["WEBSITE"] else self.write("

网站正在维护...

") 14 | 15 | def post(self): 16 | username = self.get_argument("username") 17 | password = self.get_argument("password") 18 | mSql = newsDb() 19 | result = mSql.select_table("n_admin", "*", "name", username) 20 | if result: 21 | db_pwd = result[0][2] 22 | password = hashlib.md5((admin["TOKEN"]+password).encode("utf-8")).hexdigest() 23 | if db_pwd == password: 24 | self.set_current_user(username) #将当前用户名写入cookie 25 | self.write(username) 26 | else: 27 | self.clear_cookie("user") 28 | self.write("-1") 29 | else: 30 | self.clear_cookie("user") 31 | self.write("-1") 32 | 33 | def set_current_user(self,user): 34 | if user: 35 | self.set_secure_cookie('user',tornado.escape.json_encode(user)) 36 | else: 37 | self.clear_cookie("user") 38 | -------------------------------------------------------------------------------- /handlers/newsManage.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class NewsManage(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "新闻管理" 12 | self.render("newsManage.html",header=header) 13 | 14 | -------------------------------------------------------------------------------- /handlers/spider.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class Spider(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "爬虫管理" 12 | self.render("spider.html",header=header) -------------------------------------------------------------------------------- /handlers/system.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class System(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "系统信息" 12 | self.render("system.html",header=header) -------------------------------------------------------------------------------- /handlers/userManage.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class UserManage(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "用户管理" 12 | 13 | self.render("userManage.html",header=header) -------------------------------------------------------------------------------- /methods/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/methods/.DS_Store -------------------------------------------------------------------------------- /methods/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /methods/pDb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie,jeezy' 3 | 4 | import pymysql 
-------------------------------------------------------------------------------- /methods/pDb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie,jeezy' 3 | 4 | import pymysql 5 | from config.n_conf import localDatabase 6 | 7 | 8 | class newsDb(object): 9 | """ 10 | connect mysql 11 | """ 12 | 13 | def __init__(self): 14 | self.conn = pymysql.connect(**localDatabase) 15 | self.cur = self.conn.cursor() 16 | 17 | def select_table(self, table, column, condition, value): 18 | sql = "select " + column + " from " + table + " where " + condition + "= '" + value + "'" 19 | print(sql) 20 | self.cur.execute(sql) 21 | lines = self.cur.fetchall() 22 | return lines 23 | 24 | def select_table_two(self, table, column): 25 | sql = "select " + column + " from " + table 26 | print(sql) 27 | self.cur.execute(sql) 28 | lines = self.cur.fetchall() 29 | return lines 30 | 31 | def select_table_three(self,sql): 32 | print(sql) 33 | self.cur.execute(sql) 34 | lines = self.cur.fetchall() 35 | return lines 36 | 37 | def insert_table(self, table, field, values): 38 | sql = "insert into " + table + field + " values" + values 39 | print(sql) 40 | try: 41 | self.cur.execute(sql) 42 | # commit to the database 43 | self.conn.commit() 44 | return True 45 | except: 46 | # roll back on error 47 | self.conn.rollback() 48 | return False 49 | 50 | def update_column(self, table, column, value_set, condition, value_find): 51 | sql = "update " + table + " set " + column + "= '" + value_set + "' where " + condition + "='" + value_find + "'" 52 | print(sql) 53 | try: 54 | self.cur.execute(sql) 55 | self.conn.commit() 56 | return True 57 | except: 58 | self.conn.rollback() 59 | return False 60 | 61 | 62 | def exeSql(self,sql): 63 | print(sql) 64 | try: 65 | self.cur.execute(sql) 66 | self.conn.commit() 67 | return True 68 | except: 69 | self.conn.rollback() 70 | return False 71 | 72 | def __del__(self): 73 | self.cur.close() 74 | self.conn.close() 75 | -------------------------------------------------------------------------------- /myNews.py: -------------------------------------------------------------------------------- 1 | """myNews 2 | 3 | Usage: myNews [-p <port>] 4 | 5 | Options: 6 | -h,--help show this help 7 | -p port number 8 | 9 | Example: 10 | myNews -p 8888 set the port to 8888 11 | """ 12 | 13 | from docopt import docopt 14 | from server import main 15 | 16 | 17 | def cli(): 18 | kwargs = docopt(__doc__) 19 | port = kwargs['<port>'] 20 | main(port) 21 | 22 | 23 | if __name__ == "__main__": 24 | cli() 25 | -------------------------------------------------------------------------------- /myNewsApi.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/myNewsApi.log -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.options 5 | import tornado.ioloop 6 | from application import application 7 | 8 | 9 | def main(port): 10 | #tornado.options.parse_command_line() 11 | application.listen(port) 12 | print("Development server is running at http://127.0.0.1:%s" % port) 13 | print("Quit the server with Control-C") 14 | tornado.ioloop.IOLoop.instance().start() 15 | 16 | #main(8888) -------------------------------------------------------------------------------- /spider/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/.DS_Store
-------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/allSource/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/.DS_Store -------------------------------------------------------------------------------- /spider/allSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/README.md -------------------------------------------------------------------------------- /spider/allSource/__all__/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/__all__/README.md -------------------------------------------------------------------------------- /spider/allSource/news_baby/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_baby/README.md -------------------------------------------------------------------------------- /spider/allSource/news_car/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_car/README.md -------------------------------------------------------------------------------- /spider/allSource/news_discovery/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_discovery/README.md -------------------------------------------------------------------------------- /spider/allSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/allSource/news_essay/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_essay/README.md -------------------------------------------------------------------------------- /spider/allSource/news_fashion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_fashion/README.md -------------------------------------------------------------------------------- /spider/allSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_finance/README.md 
-------------------------------------------------------------------------------- /spider/allSource/news_food/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_food/README.md -------------------------------------------------------------------------------- /spider/allSource/news_game/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_game/README.md -------------------------------------------------------------------------------- /spider/allSource/news_history/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_history/README.md -------------------------------------------------------------------------------- /spider/allSource/news_hot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_hot/README.md -------------------------------------------------------------------------------- /spider/allSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_military/README.md -------------------------------------------------------------------------------- /spider/allSource/news_regimen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_regimen/README.md -------------------------------------------------------------------------------- /spider/allSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_society/README.md -------------------------------------------------------------------------------- /spider/allSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/allSource/news_story/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_story/README.md -------------------------------------------------------------------------------- /spider/allSource/news_tech/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/allSource/news_travel/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_travel/README.md -------------------------------------------------------------------------------- /spider/allSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_world/README.md -------------------------------------------------------------------------------- /spider/allSpider.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import time 4 | import os 5 | import spider.toutiao.touTiaoSpider as ts 6 | import spider.sina.sinaSpider as ss 7 | import spider.mergeExcel as me 8 | import spider.wordAna.contentSpider as cs 9 | from config.n_conf import dirPath 10 | 11 | 12 | ss.cate = ["news_world", "news_sports", "news_finance", "news_society", "news_entertainment", "news_military", 13 | "news_tech"] 14 | 15 | 16 | def touTiao(category, page, num): 17 | # crawl Toutiao news 18 | for cate in category: 19 | ts.getToutiaoNews(cate, page, num) 20 | 21 | 22 | def sina(num=1000, page=1, type=ss.cate): 23 | # crawl Sina news 24 | ss.getSinaNews(num, page, type) 25 | 26 | def merge(): 27 | # merge the crawled news files 28 | mainPath = os.path.join(dirPath,'spider') 29 | secondPath = os.path.join(mainPath,'allSource') 30 | mergeExel = me.mergeExcel() 31 | mergeExel.merge(mainPath,secondPath) 32 | 33 | def wordAna(): 34 | cs.getNewsContent() 35 | 36 | def insertNews(): 37 | pass 38 | #touTiao(category=ts.category, page=2, num=20, time=time.time()) 39 | #sina() 40 | #merge() 41 | #wordAna() -------------------------------------------------------------------------------- /spider/newsDb/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/newsDb/insertNews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Jeezy' 3 | 4 | import os 5 | import time 6 | import hashlib 7 | from methods.pDb import newsDb 8 | import random 9 | import pandas as pd 10 | from config.n_conf import dirPath 11 | 12 | 13 | class newsInsert: 14 | def __init__(self): 15 | pass 16 | 17 | def insertSql(self, mainPath): 18 | path = dirPath + "/spider/wordAna/" + mainPath 19 | for dir in os.listdir(path): 20 | if os.path.splitext(dir)[1] == ".xlsx": 21 | file = os.path.join(path, dir) 22 | self.insert(file) 23 | 24 | def insert(self, file): 25 | try: 26 | data = pd.read_excel(file, sheetname="allNews") 27 | data = data.drop_duplicates(subset='title', keep='last') 28 | db = newsDb() 29 | cateType = {"news_society": "社会", "news_entertainment": "娱乐", "news_tech": "科技", "news_car": "汽车", 30 | "news_sports": "体育", "news_finance": "财经", 31 | "news_military": "军事", "news_world": "国际", "news_fashion": "时尚", "news_travel": "旅游", 32 | "news_discovery": "探索", "news_baby": "育儿", 33 | "news_regimen": "养生", "news_story": "故事", "news_essay": "美文", "news_game": "游戏", 34 | "news_history": "历史", "news_food": "美食"} 35 | tag = file.split('&')[1] 36 | for i in range(0, len(data)): 37 | value = data.values[i] 38 | if value[8] in cateType.keys(): tag = value[8] 39 | if value[11]: 40 | times = time.time() 41 | md5newid = hashlib.md5(str(times).encode("utf-8")).hexdigest() 42 | startNum = 
random.randint(0, (len(md5newid) - 20)) 43 | newsId = str(md5newid)[startNum:(startNum + 20)] 44 | try: 45 | mysqlSuccess = db.insert_table(table="get_news", 46 | field="(news_id,news_link,source,title,abstract,tag," 47 | "text_content,html_content,image,keyword)", 48 | values="('" + newsId + "','" + value[2] + "','" + value[ 49 | 4] + "','" + 50 | value[1] 51 | + "','" + value[6] + "','" + tag + "','" + value[ 52 | 10] + "','" + value[11] + "','" + value[12] + "','" + 53 | value[ 54 | 9] + "')") 55 | 56 | if mysqlSuccess: 57 | print("新闻保存sql完成!") 58 | except: 59 | print("failed") 60 | except: 61 | print("import failed") 62 | 63 | 64 | newsInsert = newsInsert() 65 | -------------------------------------------------------------------------------- /spider/pyspider/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-17 19:17:12 7 | 8 | __version__ = '0.3.8' 9 | -------------------------------------------------------------------------------- /spider/pyspider/data/project.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/project.db -------------------------------------------------------------------------------- /spider/pyspider/data/result.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/result.db -------------------------------------------------------------------------------- /spider/pyspider/data/scheduler.1d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/scheduler.1d -------------------------------------------------------------------------------- /spider/pyspider/data/scheduler.1h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/scheduler.1h -------------------------------------------------------------------------------- /spider/pyspider/data/scheduler.all: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/scheduler.all -------------------------------------------------------------------------------- /spider/pyspider/data/task.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/task.db -------------------------------------------------------------------------------- /spider/pyspider/database/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/database/base/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/database/base/projectdb.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 11:28:52 7 | 8 | import re 9 | 10 | # NOTE: When get/get_all/check_update from database with default fields, 11 | # all following fields should be included in output dict. 12 | { 13 | 'project': { 14 | 'name': str, 15 | 'group': str, 16 | 'status': str, 17 | 'script': str, 18 | # 'config': str, 19 | 'comments': str, 20 | # 'priority': int, 21 | 'rate': int, 22 | 'burst': int, 23 | 'updatetime': int, 24 | } 25 | } 26 | 27 | 28 | class ProjectDB(object): 29 | status_str = [ 30 | 'TODO', 31 | 'STOP', 32 | 'CHECKING', 33 | 'DEBUG', 34 | 'RUNNING', 35 | ] 36 | 37 | def insert(self, name, obj={}): 38 | raise NotImplementedError 39 | 40 | def update(self, name, obj={}, **kwargs): 41 | raise NotImplementedError 42 | 43 | def get_all(self, fields=None): 44 | raise NotImplementedError 45 | 46 | def get(self, name, fields): 47 | raise NotImplementedError 48 | 49 | def drop(self, name): 50 | raise NotImplementedError 51 | 52 | def check_update(self, timestamp, fields=None): 53 | raise NotImplementedError 54 | 55 | def split_group(self, group, lower=True): 56 | return re.split("\W+", (group or '').lower()) 57 | 58 | def verify_project_name(self, name): 59 | if len(name) > 64: 60 | return False 61 | if re.search(r"[^\w]", name): 62 | return False 63 | return True 64 | 65 | def copy(self): 66 | ''' 67 | database should be able to copy itself to create new connection 68 | 69 | it's implemented automatically by pyspider.database.connect_database 70 | if you are not create database connection via connect_database method, 71 | you should implement this 72 | ''' 73 | raise NotImplementedError 74 | -------------------------------------------------------------------------------- /spider/pyspider/database/base/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-11 18:40:03 7 | 8 | # result schema 9 | { 10 | 'result': { 11 | 'taskid': str, # new, not changeable 12 | 'project': str, # new, not changeable 13 | 'url': str, # new, not changeable 14 | 'result': str, # json string 15 | 'updatetime': int, 16 | } 17 | } 18 | 19 | 20 | class ResultDB(object): 21 | """ 22 | database for result 23 | """ 24 | projects = set() # projects in resultdb 25 | 26 | def save(self, project, taskid, url, result): 27 | raise NotImplementedError 28 | 29 | def select(self, project, fields=None, offset=0, limit=None): 30 | raise NotImplementedError 31 | 32 | def count(self, project): 33 | raise NotImplementedError 34 | 35 | def get(self, project, taskid, fields=None): 36 | raise NotImplementedError 37 | 38 | def drop(self, project): 39 | raise NotImplementedError 40 | 41 | def copy(self): 42 | ''' 43 | database should be able to copy itself to create new connection 44 | 45 | it's implemented automatically by pyspider.database.connect_database 46 | if you are not create database connection via connect_database method, 47 | you should implement this 48 | ''' 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /spider/pyspider/database/base/taskdb.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-08 10:28:48 7 | 8 | # task schema 9 | { 10 | 'task': { 11 | 'taskid': str, # new, not change 12 | 'project': str, # new, not change 13 | 'url': str, # new, not change 14 | 'status': int, # change 15 | 'schedule': { 16 | 'priority': int, 17 | 'retries': int, 18 | 'retried': int, 19 | 'exetime': int, 20 | 'age': int, 21 | 'itag': str, 22 | # 'recrawl': int 23 | }, # new and restart 24 | 'fetch': { 25 | 'method': str, 26 | 'headers': dict, 27 | 'data': str, 28 | 'timeout': int, 29 | 'save': dict, 30 | }, # new and restart 31 | 'process': { 32 | 'callback': str, 33 | }, # new and restart 34 | 'track': { 35 | 'fetch': { 36 | 'ok': bool, 37 | 'time': int, 38 | 'status_code': int, 39 | 'headers': dict, 40 | 'encoding': str, 41 | 'content': str, 42 | }, 43 | 'process': { 44 | 'ok': bool, 45 | 'time': int, 46 | 'follows': int, 47 | 'outputs': int, 48 | 'logs': str, 49 | 'exception': str, 50 | }, 51 | 'save': object, # jsonable object saved by processor 52 | }, # finish 53 | 'lastcrawltime': int, # keep between request 54 | 'updatetime': int, # keep between request 55 | } 56 | } 57 | 58 | 59 | class TaskDB(object): 60 | ACTIVE = 1 61 | SUCCESS = 2 62 | FAILED = 3 63 | BAD = 4 64 | 65 | projects = set() # projects in taskdb 66 | 67 | def load_tasks(self, status, project=None, fields=None): 68 | raise NotImplementedError 69 | 70 | def get_task(self, project, taskid, fields=None): 71 | raise NotImplementedError 72 | 73 | def status_count(self, project): 74 | ''' 75 | return a dict 76 | ''' 77 | raise NotImplementedError 78 | 79 | def insert(self, project, taskid, obj={}): 80 | raise NotImplementedError 81 | 82 | def update(self, project, taskid, obj={}, **kwargs): 83 | raise NotImplementedError 84 | 85 | def drop(self, project): 86 | raise NotImplementedError 87 | 88 | @staticmethod 89 | def status_to_string(status): 90 | return { 91 | 1: 'ACTIVE', 92 | 2: 'SUCCESS', 93 | 3: 'FAILED', 94 | 4: 'BAD', 95 | }.get(status, 'UNKNOWN') 96 | 97 | @staticmethod 98 | def status_to_int(status): 99 | return { 100 | 'ACTIVE': 1, 101 | 'SUCCESS': 2, 102 | 'FAILED': 3, 103 | 'BAD': 4, 104 | }.get(status, 4) 105 | 106 | def copy(self): 107 | ''' 108 | database should be able to copy itself to create new connection 109 | 110 | it's implemented automatically by pyspider.database.connect_database 111 | if you are not create database connection via connect_database method, 112 | you should implement this 113 | ''' 114 | raise NotImplementedError 115 | -------------------------------------------------------------------------------- /spider/pyspider/database/elasticsearch/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:31:58 7 | -------------------------------------------------------------------------------- /spider/pyspider/database/elasticsearch/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:32:33 7 | 8 | import time 9 | 
10 | import elasticsearch.helpers 11 | from elasticsearch import Elasticsearch 12 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 13 | 14 | 15 | class ProjectDB(BaseProjectDB): 16 | __type__ = 'project' 17 | 18 | def __init__(self, hosts, index='pyspider'): 19 | self.index = index 20 | self.es = Elasticsearch(hosts=hosts) 21 | 22 | self.es.indices.create(index=self.index, ignore=400) 23 | if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): 24 | self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ 25 | "_all": {"enabled": False}, 26 | "properties": { 27 | "updatetime": {"type": "double"} 28 | } 29 | }) 30 | 31 | def insert(self, name, obj={}): 32 | obj = dict(obj) 33 | obj['name'] = name 34 | obj['updatetime'] = time.time() 35 | 36 | obj.setdefault('group', '') 37 | obj.setdefault('status', 'TODO') 38 | obj.setdefault('script', '') 39 | obj.setdefault('comments', '') 40 | obj.setdefault('rate', 0) 41 | obj.setdefault('burst', 0) 42 | 43 | return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name, 44 | refresh=True) 45 | 46 | def update(self, name, obj={}, **kwargs): 47 | obj = dict(obj) 48 | obj.update(kwargs) 49 | obj['updatetime'] = time.time() 50 | return self.es.update(index=self.index, doc_type=self.__type__, 51 | body={'doc': obj}, id=name, refresh=True, ignore=404) 52 | 53 | def get_all(self, fields=None): 54 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 55 | query={'query': {"match_all": {}}}, 56 | _source_include=fields or []): 57 | yield record['_source'] 58 | 59 | def get(self, name, fields=None): 60 | ret = self.es.get(index=self.index, doc_type=self.__type__, id=name, 61 | _source_include=fields or [], ignore=404) 62 | return ret.get('_source', None) 63 | 64 | def check_update(self, timestamp, fields=None): 65 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 66 | query={'query': {"range": { 67 | "updatetime": {"gte": timestamp} 68 | }}}, _source_include=fields or []): 69 | yield record['_source'] 70 | 71 | def drop(self, name): 72 | return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True) 73 | -------------------------------------------------------------------------------- /spider/pyspider/database/local/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 20:56:50 7 | -------------------------------------------------------------------------------- /spider/pyspider/database/local/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 12:32:17 7 | 8 | import os 9 | import re 10 | import six 11 | import glob 12 | import logging 13 | 14 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 15 | 16 | 17 | class ProjectDB(BaseProjectDB): 18 | """ProjectDB loading scripts from local file.""" 19 | 20 | def __init__(self, files): 21 | self.files = files 22 | self.projects = {} 23 | self.load_scripts() 24 | 25 | def load_scripts(self): 26 | project_names = set(self.projects.keys()) 27 | for path in self.files: 28 | for 
filename in glob.glob(path): 29 | name = os.path.splitext(os.path.basename(filename))[0] 30 | if name in project_names: 31 | project_names.remove(name) 32 | updatetime = os.path.getmtime(filename) 33 | if name not in self.projects or updatetime > self.projects[name]['updatetime']: 34 | project = self._build_project(filename) 35 | if not project: 36 | continue 37 | self.projects[project['name']] = project 38 | 39 | for name in project_names: 40 | del self.projects[name] 41 | 42 | rate_re = re.compile(r'^\s*#\s*rate.*?(\d+(\.\d+)?)', re.I | re.M) 43 | burst_re = re.compile(r'^\s*#\s*burst.*?(\d+(\.\d+)?)', re.I | re.M) 44 | 45 | def _build_project(self, filename): 46 | try: 47 | with open(filename) as fp: 48 | script = fp.read() 49 | m = self.rate_re.search(script) 50 | if m: 51 | rate = float(m.group(1)) 52 | else: 53 | rate = 1 54 | 55 | m = self.burst_re.search(script) 56 | if m: 57 | burst = float(m.group(1)) 58 | else: 59 | burst = 3 60 | 61 | return { 62 | 'name': os.path.splitext(os.path.basename(filename))[0], 63 | 'group': None, 64 | 'status': 'RUNNING', 65 | 'script': script, 66 | 'comments': None, 67 | 'rate': rate, 68 | 'burst': burst, 69 | 'updatetime': os.path.getmtime(filename), 70 | } 71 | except OSError as e: 72 | logging.error('loading project script error: %s', e) 73 | return None 74 | 75 | def get_all(self, fields=None): 76 | for projectname in self.projects: 77 | yield self.get(projectname, fields) 78 | 79 | def get(self, name, fields=None): 80 | if name not in self.projects: 81 | return None 82 | project = self.projects[name] 83 | result = {} 84 | for f in fields or project: 85 | if f in project: 86 | result[f] = project[f] 87 | else: 88 | result[f] = None 89 | return result 90 | 91 | def check_update(self, timestamp, fields=None): 92 | self.load_scripts() 93 | for projectname, project in six.iteritems(self.projects): 94 | if project['updatetime'] > timestamp: 95 | yield self.get(projectname, fields) 96 | -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/database/mongodb/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/mongodbbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:42:01 7 | 8 | import time 9 | 10 | 11 | class SplitTableMixin(object): 12 | UPDATE_PROJECTS_TIME = 10 * 60 13 | 14 | def _collection_name(self, project): 15 | if self.collection_prefix: 16 | return "%s.%s" % (self.collection_prefix, project) 17 | else: 18 | return project 19 | 20 | @property 21 | def projects(self): 22 | if time.time() - getattr(self, '_last_update_projects', 0) \ 23 | > self.UPDATE_PROJECTS_TIME: 24 | self._list_project() 25 | return self._projects 26 | 27 | @projects.setter 28 | def projects(self, value): 29 | self._projects = value 30 | 31 | def _list_project(self): 32 | self._last_update_projects = time.time() 33 | self.projects = set() 34 | if self.collection_prefix: 35 | prefix = "%s." 
% self.collection_prefix 36 | else: 37 | prefix = '' 38 | for each in self.database.collection_names(): 39 | if each.startswith('system.'): 40 | continue 41 | if each.startswith(prefix): 42 | self.projects.add(each[len(prefix):]) 43 | 44 | def drop(self, project): 45 | if project not in self.projects: 46 | self._list_project() 47 | if project not in self.projects: 48 | return 49 | collection_name = self._collection_name(project) 50 | self.database[collection_name].drop() 51 | self._list_project() 52 | -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-12 12:22:42 7 | 8 | import time 9 | from pymongo import MongoClient 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | 13 | 14 | class ProjectDB(BaseProjectDB): 15 | __collection_name__ = 'projectdb' 16 | 17 | def __init__(self, url, database='projectdb'): 18 | self.conn = MongoClient(url) 19 | self.conn.admin.command("ismaster") 20 | self.database = self.conn[database] 21 | self.collection = self.database[self.__collection_name__] 22 | 23 | self.collection.ensure_index('name', unique=True) 24 | 25 | def _default_fields(self, each): 26 | if each is None: 27 | return each 28 | each.setdefault('group', None) 29 | each.setdefault('status', 'TODO') 30 | each.setdefault('script', '') 31 | each.setdefault('comments', None) 32 | each.setdefault('rate', 0) 33 | each.setdefault('burst', 0) 34 | each.setdefault('updatetime', 0) 35 | return each 36 | 37 | def insert(self, name, obj={}): 38 | obj = dict(obj) 39 | obj['name'] = name 40 | obj['updatetime'] = time.time() 41 | return self.collection.update({'name': name}, {'$set': obj}, upsert=True) 42 | 43 | def update(self, name, obj={}, **kwargs): 44 | obj = dict(obj) 45 | obj.update(kwargs) 46 | obj['updatetime'] = time.time() 47 | return self.collection.update({'name': name}, {'$set': obj}) 48 | 49 | def get_all(self, fields=None): 50 | for each in self.collection.find({}, fields): 51 | if each and '_id' in each: 52 | del each['_id'] 53 | yield self._default_fields(each) 54 | 55 | def get(self, name, fields=None): 56 | each = self.collection.find_one({'name': name}, fields) 57 | if each and '_id' in each: 58 | del each['_id'] 59 | return self._default_fields(each) 60 | 61 | def check_update(self, timestamp, fields=None): 62 | for project in self.get_all(fields=('updatetime', 'name')): 63 | if project['updatetime'] > timestamp: 64 | project = self.get(project['name'], fields) 65 | yield self._default_fields(project) 66 | 67 | def drop(self, name): 68 | return self.collection.remove({'name': name}) 69 | -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 22:18:36 7 | 8 | import json 9 | import time 10 | from pymongo import MongoClient 11 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 12 | from .mongodbbase import SplitTableMixin 13 | 14 | 15 | class ResultDB(SplitTableMixin, 
BaseResultDB): 16 | collection_prefix = '' 17 | 18 | def __init__(self, url, database='resultdb'): 19 | self.conn = MongoClient(url) 20 | self.conn.admin.command("ismaster") 21 | self.database = self.conn[database] 22 | self.projects = set() 23 | 24 | self._list_project() 25 | for project in self.projects: 26 | collection_name = self._collection_name(project) 27 | self.database[collection_name].ensure_index('taskid') 28 | 29 | def _create_project(self, project): 30 | collection_name = self._collection_name(project) 31 | self.database[collection_name].ensure_index('taskid') 32 | self._list_project() 33 | 34 | def _parse(self, data): 35 | data['_id'] = str(data['_id']) 36 | if 'result' in data: 37 | data['result'] = json.loads(data['result']) 38 | return data 39 | 40 | def _stringify(self, data): 41 | if 'result' in data: 42 | data['result'] = json.dumps(data['result']) 43 | return data 44 | 45 | def save(self, project, taskid, url, result): 46 | if project not in self.projects: 47 | self._create_project(project) 48 | collection_name = self._collection_name(project) 49 | obj = { 50 | 'taskid': taskid, 51 | 'url': url, 52 | 'result': result, 53 | 'updatetime': time.time(), 54 | } 55 | return self.database[collection_name].update( 56 | {'taskid': taskid}, {"$set": self._stringify(obj)}, upsert=True 57 | ) 58 | 59 | def select(self, project, fields=None, offset=0, limit=0): 60 | if project not in self.projects: 61 | self._list_project() 62 | if project not in self.projects: 63 | return 64 | collection_name = self._collection_name(project) 65 | for result in self.database[collection_name].find({}, fields, skip=offset, limit=limit): 66 | yield self._parse(result) 67 | 68 | def count(self, project): 69 | if project not in self.projects: 70 | self._list_project() 71 | if project not in self.projects: 72 | return 73 | collection_name = self._collection_name(project) 74 | return self.database[collection_name].count() 75 | 76 | def get(self, project, taskid, fields=None): 77 | if project not in self.projects: 78 | self._list_project() 79 | if project not in self.projects: 80 | return 81 | collection_name = self._collection_name(project) 82 | ret = self.database[collection_name].find_one({'taskid': taskid}, fields) 83 | if not ret: 84 | return ret 85 | return self._parse(ret) 86 | -------------------------------------------------------------------------------- /spider/pyspider/database/mysql/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 20:12:54 7 | -------------------------------------------------------------------------------- /spider/pyspider/database/mysql/mysqlbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-05 10:42:24 7 | 8 | import time 9 | import mysql.connector 10 | 11 | 12 | class MySQLMixin(object): 13 | 14 | @property 15 | def dbcur(self): 16 | try: 17 | if self.conn.unread_result: 18 | self.conn.get_rows() 19 | return self.conn.cursor() 20 | except (mysql.connector.OperationalError, mysql.connector.InterfaceError): 21 | self.conn.ping(reconnect=True) 22 | self.conn.database = self.database_name 23 | return self.conn.cursor() 24 | 25 | 26 | class 
SplitTableMixin(object): 27 | UPDATE_PROJECTS_TIME = 10 * 60 28 | 29 | def _tablename(self, project): 30 | if self.__tablename__: 31 | return '%s_%s' % (self.__tablename__, project) 32 | else: 33 | return project 34 | 35 | @property 36 | def projects(self): 37 | if time.time() - getattr(self, '_last_update_projects', 0) \ 38 | > self.UPDATE_PROJECTS_TIME: 39 | self._list_project() 40 | return self._projects 41 | 42 | @projects.setter 43 | def projects(self, value): 44 | self._projects = value 45 | 46 | def _list_project(self): 47 | self._last_update_projects = time.time() 48 | self.projects = set() 49 | if self.__tablename__: 50 | prefix = '%s_' % self.__tablename__ 51 | else: 52 | prefix = '' 53 | for project, in self._execute('show tables;'): 54 | if project.startswith(prefix): 55 | project = project[len(prefix):] 56 | self.projects.add(project) 57 | 58 | def drop(self, project): 59 | if project not in self.projects: 60 | self._list_project() 61 | if project not in self.projects: 62 | return 63 | tablename = self._tablename(project) 64 | self._execute("DROP TABLE %s" % self.escape(tablename)) 65 | self._list_project() 66 | -------------------------------------------------------------------------------- /spider/pyspider/database/mysql/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 21:06:43 7 | 8 | import time 9 | import mysql.connector 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | from .mysqlbase import MySQLMixin 14 | 15 | 16 | class ProjectDB(MySQLMixin, BaseProjectDB, BaseDB): 17 | __tablename__ = 'projectdb' 18 | 19 | def __init__(self, host='localhost', port=3306, database='projectdb', 20 | user='root', passwd=None): 21 | self.database_name = database 22 | self.conn = mysql.connector.connect(user=user, password=passwd, 23 | host=host, port=port, autocommit=True) 24 | if database not in [x[0] for x in self._execute('show databases')]: 25 | self._execute('CREATE DATABASE %s' % self.escape(database)) 26 | self.conn.database = database 27 | 28 | self._execute('''CREATE TABLE IF NOT EXISTS %s ( 29 | `name` varchar(64) PRIMARY KEY, 30 | `group` varchar(64), 31 | `status` varchar(16), 32 | `script` TEXT, 33 | `comments` varchar(1024), 34 | `rate` float(11, 4), 35 | `burst` float(11, 4), 36 | `updatetime` double(16, 4) 37 | ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(self.__tablename__)) 38 | 39 | def insert(self, name, obj={}): 40 | obj = dict(obj) 41 | obj['name'] = name 42 | obj['updatetime'] = time.time() 43 | return self._insert(**obj) 44 | 45 | def update(self, name, obj={}, **kwargs): 46 | obj = dict(obj) 47 | obj.update(kwargs) 48 | obj['updatetime'] = time.time() 49 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 50 | return ret.rowcount 51 | 52 | def get_all(self, fields=None): 53 | return self._select2dic(what=fields) 54 | 55 | def get(self, name, fields=None): 56 | where = "`name` = %s" % self.placeholder 57 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 58 | return each 59 | return None 60 | 61 | def drop(self, name): 62 | where = "`name` = %s" % self.placeholder 63 | return self._delete(where=where, where_values=(name, )) 64 | 65 | def check_update(self, timestamp, fields=None): 66 | where 
= "`updatetime` >= %f" % timestamp 67 | return self._select2dic(what=fields, where=where) 68 | -------------------------------------------------------------------------------- /spider/pyspider/database/redis/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-05-17 01:34:21 7 | 8 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlalchemy/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 20:11:04 7 | 8 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlalchemy/sqlalchemybase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 18:48:47 7 | 8 | import time 9 | 10 | 11 | def result2dict(columns, task): 12 | r = {} 13 | for key in task.keys(): 14 | r[key] = task[key] 15 | return r 16 | 17 | 18 | class SplitTableMixin(object): 19 | UPDATE_PROJECTS_TIME = 10 * 60 20 | 21 | def _tablename(self, project): 22 | if self.__tablename__: 23 | return '%s_%s' % (self.__tablename__, project) 24 | else: 25 | return project 26 | 27 | @property 28 | def projects(self): 29 | if time.time() - getattr(self, '_last_update_projects', 0) \ 30 | > self.UPDATE_PROJECTS_TIME: 31 | self._list_project() 32 | return self._projects 33 | 34 | @projects.setter 35 | def projects(self, value): 36 | self._projects = value 37 | 38 | def _list_project(self): 39 | self._last_update_projects = time.time() 40 | self.projects = set() 41 | if self.__tablename__: 42 | prefix = '%s_' % self.__tablename__ 43 | else: 44 | prefix = '' 45 | 46 | for project in self.engine.table_names(): 47 | if project.startswith(prefix): 48 | project = project[len(prefix):] 49 | self.projects.add(project) 50 | 51 | def drop(self, project): 52 | if project not in self.projects: 53 | self._list_project() 54 | if project not in self.projects: 55 | return 56 | self.table.name = self._tablename(project) 57 | self.table.drop(self.engine) 58 | self._list_project() 59 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/database/sqlite/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 12:05:52 7 | 8 | import time 9 | 10 | from .sqlitebase import SQLiteMixin 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | 14 | 15 | class 
ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB): 16 | __tablename__ = 'projectdb' 17 | placeholder = '?' 18 | 19 | def __init__(self, path): 20 | self.path = path 21 | self.last_pid = 0 22 | self.conn = None 23 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 24 | name PRIMARY KEY, 25 | `group`, 26 | status, script, comments, 27 | rate, burst, updatetime 28 | )''' % self.__tablename__) 29 | 30 | def insert(self, name, obj={}): 31 | obj = dict(obj) 32 | obj['name'] = name 33 | obj['updatetime'] = time.time() 34 | return self._insert(**obj) 35 | 36 | def update(self, name, obj={}, **kwargs): 37 | obj = dict(obj) 38 | obj.update(kwargs) 39 | obj['updatetime'] = time.time() 40 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 41 | return ret.rowcount 42 | 43 | def get_all(self, fields=None): 44 | return self._select2dic(what=fields) 45 | 46 | def get(self, name, fields=None): 47 | where = "`name` = %s" % self.placeholder 48 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 49 | return each 50 | return None 51 | 52 | def check_update(self, timestamp, fields=None): 53 | where = "`updatetime` >= %f" % timestamp 54 | return self._select2dic(what=fields, where=where) 55 | 56 | def drop(self, name): 57 | where = "`name` = %s" % self.placeholder 58 | return self._delete(where=where, where_values=(name, )) 59 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 17:08:43 7 | 8 | import re 9 | import time 10 | import json 11 | 12 | from .sqlitebase import SQLiteMixin, SplitTableMixin 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | from pyspider.database.basedb import BaseDB 15 | 16 | 17 | class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): 18 | __tablename__ = 'resultdb' 19 | placeholder = '?' 
20 | 21 | def __init__(self, path): 22 | self.path = path 23 | self.last_pid = 0 24 | self.conn = None 25 | self._list_project() 26 | 27 | def _create_project(self, project): 28 | assert re.match(r'^\w+$', project) is not None 29 | tablename = self._tablename(project) 30 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 31 | taskid PRIMARY KEY, 32 | url, 33 | result, 34 | updatetime 35 | )''' % tablename) 36 | 37 | def _parse(self, data): 38 | if 'result' in data: 39 | data['result'] = json.loads(data['result']) 40 | return data 41 | 42 | def _stringify(self, data): 43 | if 'result' in data: 44 | data['result'] = json.dumps(data['result']) 45 | return data 46 | 47 | def save(self, project, taskid, url, result): 48 | tablename = self._tablename(project) 49 | if project not in self.projects: 50 | self._create_project(project) 51 | self._list_project() 52 | obj = { 53 | 'taskid': taskid, 54 | 'url': url, 55 | 'result': result, 56 | 'updatetime': time.time(), 57 | } 58 | return self._replace(tablename, **self._stringify(obj)) 59 | 60 | def select(self, project, fields=None, offset=0, limit=None): 61 | if project not in self.projects: 62 | self._list_project() 63 | if project not in self.projects: 64 | return 65 | tablename = self._tablename(project) 66 | 67 | for task in self._select2dic(tablename, what=fields, order='updatetime DESC', 68 | offset=offset, limit=limit): 69 | yield self._parse(task) 70 | 71 | def count(self, project): 72 | if project not in self.projects: 73 | self._list_project() 74 | if project not in self.projects: 75 | return 0 76 | tablename = self._tablename(project) 77 | for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): 78 | return count 79 | 80 | def get(self, project, taskid, fields=None): 81 | if project not in self.projects: 82 | self._list_project() 83 | if project not in self.projects: 84 | return 85 | tablename = self._tablename(project) 86 | where = "`taskid` = %s" % self.placeholder 87 | for task in self._select2dic(tablename, what=fields, 88 | where=where, where_values=(taskid, )): 89 | return self._parse(task) 90 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/sqlitebase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:30:44 7 | 8 | import os 9 | import time 10 | import sqlite3 11 | import threading 12 | 13 | 14 | class SQLiteMixin(object): 15 | 16 | @property 17 | def dbcur(self): 18 | pid = (os.getpid(), threading.current_thread().ident) 19 | if not (self.conn and pid == self.last_pid): 20 | self.last_pid = pid 21 | self.conn = sqlite3.connect(self.path, isolation_level=None) 22 | return self.conn.cursor() 23 | 24 | 25 | class SplitTableMixin(object): 26 | UPDATE_PROJECTS_TIME = 10 * 60 27 | 28 | def _tablename(self, project): 29 | if self.__tablename__: 30 | return '%s_%s' % (self.__tablename__, project) 31 | else: 32 | return project 33 | 34 | @property 35 | def projects(self): 36 | if time.time() - getattr(self, '_last_update_projects', 0) \ 37 | > self.UPDATE_PROJECTS_TIME: 38 | self._list_project() 39 | return self._projects 40 | 41 | @projects.setter 42 | def projects(self, value): 43 | self._projects = value 44 | 45 | def _list_project(self): 46 | self._last_update_projects = time.time() 47 | self.projects = set() 48 | if 
self.__tablename__: 49 | prefix = '%s_' % self.__tablename__ 50 | else: 51 | prefix = '' 52 | for project, in self._select('sqlite_master', what='name', 53 | where='type = "table"'): 54 | if project.startswith(prefix): 55 | project = project[len(prefix):] 56 | self.projects.add(project) 57 | 58 | def drop(self, project): 59 | if project not in self.projects: 60 | self._list_project() 61 | if project not in self.projects: 62 | return 63 | tablename = self._tablename(project) 64 | self._execute("DROP TABLE %s" % self.escape(tablename)) 65 | self._list_project() 66 | -------------------------------------------------------------------------------- /spider/pyspider/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .tornado_fetcher import Fetcher 2 | -------------------------------------------------------------------------------- /spider/pyspider/fetcher/cookie_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-14 09:07:11 7 | 8 | from requests.cookies import MockRequest 9 | 10 | 11 | class MockResponse(object): 12 | 13 | def __init__(self, headers): 14 | self._headers = headers 15 | 16 | def info(self): 17 | return self 18 | 19 | def getheaders(self, name): 20 | """make cookie python 2 version use this method to get cookie list""" 21 | return self._headers.get_list(name) 22 | 23 | def get_all(self, name, default=[]): 24 | """make cookie python 3 version use this instead of getheaders""" 25 | return self._headers.get_list(name) or default 26 | 27 | 28 | def extract_cookies_to_jar(jar, request, response): 29 | req = MockRequest(request) 30 | res = MockResponse(response) 31 | jar.extract_cookies(res, req) 32 | -------------------------------------------------------------------------------- /spider/pyspider/libs/ListIO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-26 23:41:51 7 | 8 | 9 | class ListO(object): 10 | 11 | """A StringO write to list.""" 12 | 13 | def __init__(self, buffer=None): 14 | self._buffer = buffer 15 | if self._buffer is None: 16 | self._buffer = [] 17 | 18 | def isatty(self): 19 | return False 20 | 21 | def close(self): 22 | pass 23 | 24 | def flush(self): 25 | pass 26 | 27 | def seek(self, n, mode=0): 28 | pass 29 | 30 | def readline(self): 31 | pass 32 | 33 | def reset(self): 34 | pass 35 | 36 | def write(self, x): 37 | self._buffer.append(x) 38 | 39 | def writelines(self, x): 40 | self._buffer.extend(x) 41 | -------------------------------------------------------------------------------- /spider/pyspider/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/libs/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/libs/dataurl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-11-16 
10:33:20 7 | 8 | import six 9 | from base64 import b64encode, b64decode 10 | from . import utils 11 | from six.moves.urllib.parse import quote, unquote 12 | 13 | 14 | def encode(data, mime_type='', charset='utf-8', base64=True): 15 | """ 16 | Encode data to DataURL 17 | """ 18 | if isinstance(data, six.text_type): 19 | data = data.encode(charset) 20 | else: 21 | charset = None 22 | if base64: 23 | data = utils.text(b64encode(data)) 24 | else: 25 | data = utils.text(quote(data)) 26 | 27 | result = ['data:', ] 28 | if mime_type: 29 | result.append(mime_type) 30 | if charset: 31 | result.append(';charset=') 32 | result.append(charset) 33 | if base64: 34 | result.append(';base64') 35 | result.append(',') 36 | result.append(data) 37 | 38 | return ''.join(result) 39 | 40 | 41 | def decode(data_url): 42 | """ 43 | Decode DataURL data 44 | """ 45 | metadata, data = data_url.rsplit(',', 1) 46 | _, metadata = metadata.split('data:', 1) 47 | parts = metadata.split(';') 48 | if parts[-1] == 'base64': 49 | data = b64decode(data) 50 | else: 51 | data = unquote(data) 52 | 53 | for part in parts: 54 | if part.startswith("charset="): 55 | data = data.decode(part[8:]) 56 | return data 57 | -------------------------------------------------------------------------------- /spider/pyspider/libs/log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-10-24 16:08:17 7 | 8 | import logging 9 | 10 | try: 11 | import curses 12 | except ImportError: 13 | curses = None 14 | 15 | from tornado.log import LogFormatter as _LogFormatter 16 | 17 | 18 | class LogFormatter(_LogFormatter, object): 19 | """Init tornado.log.LogFormatter from logging.config.fileConfig""" 20 | def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs): 21 | if fmt is None: 22 | fmt = _LogFormatter.DEFAULT_FORMAT 23 | super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs) 24 | 25 | 26 | class SaveLogHandler(logging.Handler): 27 | """LogHandler that save records to a list""" 28 | 29 | def __init__(self, saveto=None, *args, **kwargs): 30 | self.saveto = saveto 31 | logging.Handler.__init__(self, *args, **kwargs) 32 | 33 | def emit(self, record): 34 | if self.saveto is not None: 35 | self.saveto.append(record) 36 | 37 | handle = emit 38 | 39 | 40 | def enable_pretty_logging(logger=logging.getLogger()): 41 | channel = logging.StreamHandler() 42 | channel.setFormatter(LogFormatter()) 43 | logger.addHandler(channel) 44 | -------------------------------------------------------------------------------- /spider/pyspider/libs/multiprocessing_queue.py: -------------------------------------------------------------------------------- 1 | import six 2 | import platform 3 | import multiprocessing 4 | from multiprocessing.queues import Queue as BaseQueue 5 | 6 | 7 | # The SharedCounter and Queue classes come from: 8 | # https://github.com/vterron/lemon/commit/9ca6b4b 9 | 10 | class SharedCounter(object): 11 | """ A synchronized shared counter. 12 | The locking done by multiprocessing.Value ensures that only a single 13 | process or thread may read or write the in-memory ctypes object. However, 14 | in order to do n += 1, Python performs a read followed by a write, so a 15 | second process may read the old value before the new one is written by the 16 | first process. 
The solution is to use a multiprocessing.Lock to guarantee 17 | the atomicity of the modifications to Value. 18 | This class comes almost entirely from Eli Bendersky's blog: 19 | http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/ 20 | """ 21 | 22 | def __init__(self, n=0): 23 | self.count = multiprocessing.Value('i', n) 24 | 25 | def increment(self, n=1): 26 | """ Increment the counter by n (default = 1) """ 27 | with self.count.get_lock(): 28 | self.count.value += n 29 | 30 | @property 31 | def value(self): 32 | """ Return the value of the counter """ 33 | return self.count.value 34 | 35 | 36 | class MultiProcessingQueue(BaseQueue): 37 | """ A portable implementation of multiprocessing.Queue. 38 | Because of multithreading / multiprocessing semantics, Queue.qsize() may 39 | raise the NotImplementedError exception on Unix platforms like Mac OS X 40 | where sem_getvalue() is not implemented. This subclass addresses this 41 | problem by using a synchronized shared counter (initialized to zero) and 42 | increasing / decreasing its value every time the put() and get() methods 43 | are called, respectively. This not only prevents NotImplementedError from 44 | being raised, but also allows us to implement a reliable version of both 45 | qsize() and empty(). 46 | """ 47 | def __init__(self, *args, **kwargs): 48 | super(MultiProcessingQueue, self).__init__(*args, **kwargs) 49 | self.size = SharedCounter(0) 50 | 51 | def put(self, *args, **kwargs): 52 | self.size.increment(1) 53 | super(MultiProcessingQueue, self).put(*args, **kwargs) 54 | 55 | def get(self, *args, **kwargs): 56 | v = super(MultiProcessingQueue, self).get(*args, **kwargs) 57 | self.size.increment(-1) 58 | return v 59 | 60 | def qsize(self): 61 | """ Reliable implementation of multiprocessing.Queue.qsize() """ 62 | return self.size.value 63 | 64 | 65 | if platform.system() == 'Darwin': 66 | if hasattr(multiprocessing, 'get_context'): # for py34 67 | def Queue(maxsize=0): 68 | return MultiProcessingQueue(maxsize, ctx=multiprocessing.get_context()) 69 | else: 70 | def Queue(maxsize=0): 71 | return MultiProcessingQueue(maxsize) 72 | else: 73 | from multiprocessing import Queue # flake8: noqa 74 | -------------------------------------------------------------------------------- /spider/pyspider/libs/sample_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on __DATE__ 4 | # Project: __PROJECT_NAME__ 5 | 6 | from pyspider.libs.base_handler import * 7 | 8 | 9 | class Handler(BaseHandler): 10 | crawl_config = { 11 | } 12 | 13 | @every(minutes=24 * 60) 14 | def on_start(self): 15 | self.crawl('__START_URL__', callback=self.index_page) 16 | 17 | @config(age=10 * 24 * 60 * 60) 18 | def index_page(self, response): 19 | for each in response.doc('a[href^="http"]').items(): 20 | self.crawl(each.attr.href, callback=self.detail_page) 21 | 22 | @config(priority=2) 23 | def detail_page(self, response): 24 | return { 25 | "url": response.url, 26 | "title": response.doc('title').text(), 27 | } 28 | -------------------------------------------------------------------------------- /spider/pyspider/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,scheduler,fetcher,processor,webui,bench,werkzeug 3 | 4 | [logger_root] 5 | level=INFO 6 | handlers=screen 7 | 8 | [logger_scheduler] 9 | level=INFO 10 | handlers=screen 11 | qualname=scheduler 12 | 
propagate=0 13 | 14 | [logger_fetcher] 15 | level=DEBUG 16 | handlers=screen 17 | qualname=fetcher 18 | propagate=0 19 | 20 | [logger_processor] 21 | level=DEBUG 22 | handlers=screen 23 | qualname=processor 24 | propagate=0 25 | 26 | [logger_webui] 27 | level=DEBUG 28 | handlers=screen 29 | qualname=webui 30 | propagate=0 31 | 32 | [logger_bench] 33 | level=DEBUG 34 | handlers=screen 35 | qualname=bench 36 | propagate=0 37 | 38 | [logger_werkzeug] 39 | level=INFO 40 | handlers=screen 41 | qualname=werkzeug 42 | propagate=0 43 | 44 | [handlers] 45 | keys=screen 46 | 47 | [handler_screen] 48 | class=logging.StreamHandler 49 | formatter=pretty 50 | level=DEBUG 51 | args=(sys.stderr, ) 52 | 53 | [formatters] 54 | keys=pretty 55 | 56 | [formatter_pretty] 57 | class=pyspider.libs.log.LogFormatter 58 |
-------------------------------------------------------------------------------- /spider/pyspider/message_queue/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-30 21:47:08 7 | 8 | try: 9 | from urllib import parse as urlparse 10 | except ImportError: 11 | import urlparse 12 | 13 | 14 | def connect_message_queue(name, url=None, maxsize=0): 15 | """ 16 | create connection to message queue 17 | 18 | name: 19 | name of message queue 20 | 21 | rabbitmq: 22 | amqp://username:password@host:5672/%2F 23 | see https://www.rabbitmq.com/uri-spec.html 24 | beanstalk: 25 | beanstalk://host:11300/ 26 | redis: 27 | redis://host:6379/db 28 | kombu: 29 | kombu+transport://userid:password@hostname:port/virtual_host 30 | see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls 31 | builtin: 32 | None 33 | """ 34 | 35 | if not url: 36 | from pyspider.libs.multiprocessing_queue import Queue 37 | return Queue(maxsize=maxsize) 38 | 39 | parsed = urlparse.urlparse(url) 40 | if parsed.scheme == 'amqp': 41 | from .rabbitmq import Queue 42 | return Queue(name, url, maxsize=maxsize) 43 | elif parsed.scheme == 'beanstalk': 44 | from .beanstalk import Queue 45 | return Queue(name, host=parsed.netloc, maxsize=maxsize) 46 | elif parsed.scheme == 'redis': 47 | from .redis_queue import Queue 48 | db = parsed.path.lstrip('/').split('/') 49 | try: 50 | db = int(db[0]) 51 | except (IndexError, ValueError): 52 | db = 0 53 | 54 | password = parsed.password or None 55 | 56 | return Queue(name, parsed.hostname, parsed.port, db=db, maxsize=maxsize, password=password) 57 | else: 58 | if url.startswith('kombu+'): 59 | url = url[len('kombu+'):] 60 | from .kombu_queue import Queue 61 | return Queue(name, url, maxsize=maxsize) 62 | 63 | raise Exception('unknown connection url: %s' % url) 64 |
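connect_message_queue above dispatches on the URL scheme documented in its docstring. A minimal usage sketch, assuming a local Redis server on the default port (the queue name, URL, and payload here are illustrative, not taken from this repo):

from pyspider.message_queue import connect_message_queue

# No URL: falls back to the builtin multiprocessing queue.
local_q = connect_message_queue('scheduler2fetcher')

# redis:// URL: routed to message_queue.redis_queue.Queue below.
redis_q = connect_message_queue('scheduler2fetcher',
                                'redis://localhost:6379/0', maxsize=100)
redis_q.put({'taskid': 'abc', 'url': 'http://example.com/'})
print(redis_q.get())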
-------------------------------------------------------------------------------- /spider/pyspider/message_queue/redis_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-27 22:48:04 7 | 8 | import time 9 | import redis 10 | import umsgpack 11 | from six.moves import queue as BaseQueue 12 | 13 | 14 | class RedisQueue(object): 15 | """ 16 | A Queue like message built over redis 17 | """ 18 | 19 | Empty = BaseQueue.Empty 20 | Full = BaseQueue.Full 21 | max_timeout = 0.3 22 | 23 | def __init__(self, name, host='localhost', port=6379, db=0, 24 | maxsize=0, lazy_limit=True, password=None): 25 | """ 26 | Constructor for RedisQueue 27 | 28 | maxsize: an integer that sets the upperbound limit on the number of 29 | items that can be placed in the queue. 30 | lazy_limit: redis queue is shared via instance, a lazy size limit is used 31 | for better performance. 32 | """ 33 | self.name = name 34 | self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password) 35 | self.maxsize = maxsize 36 | self.lazy_limit = lazy_limit 37 | self.last_qsize = 0 38 | 39 | def qsize(self): 40 | self.last_qsize = self.redis.llen(self.name) 41 | return self.last_qsize 42 | 43 | def empty(self): 44 | if self.qsize() == 0: 45 | return True 46 | else: 47 | return False 48 | 49 | def full(self): 50 | if self.maxsize and self.qsize() >= self.maxsize: 51 | return True 52 | else: 53 | return False 54 | 55 | def put_nowait(self, obj): 56 | if self.lazy_limit and self.last_qsize < self.maxsize: 57 | pass 58 | elif self.full(): 59 | raise self.Full 60 | self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj)) 61 | return True 62 | 63 | def put(self, obj, block=True, timeout=None): 64 | if not block: 65 | return self.put_nowait(obj) 66 | 67 | start_time = time.time() 68 | while True: 69 | try: 70 | return self.put_nowait(obj) 71 | except self.Full: 72 | if timeout: 73 | lasted = time.time() - start_time 74 | if timeout > lasted: 75 | time.sleep(min(self.max_timeout, timeout - lasted)) 76 | else: 77 | raise 78 | else: 79 | time.sleep(self.max_timeout) 80 | 81 | def get_nowait(self): 82 | ret = self.redis.lpop(self.name) 83 | if ret is None: 84 | raise self.Empty 85 | return umsgpack.unpackb(ret) 86 | 87 | def get(self, block=True, timeout=None): 88 | if not block: 89 | return self.get_nowait() 90 | 91 | start_time = time.time() 92 | while True: 93 | try: 94 | return self.get_nowait() 95 | except self.Empty: 96 | if timeout: 97 | lasted = time.time() - start_time 98 | if timeout > lasted: 99 | time.sleep(min(self.max_timeout, timeout - lasted)) 100 | else: 101 | raise 102 | else: 103 | time.sleep(self.max_timeout) 104 | 105 | Queue = RedisQueue 106 |
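A usage sketch for RedisQueue (the host/port and payload below are assumptions, and a reachable redis-server is required). Note that with lazy_limit=True the maxsize bound is approximate: put_nowait trusts the qsize cached from the last round-trip, trading strictness for fewer LLEN calls.

from pyspider.message_queue.redis_queue import RedisQueue

q = RedisQueue('demo', host='localhost', port=6379, db=0, maxsize=2)
q.put({'n': 1})
q.put({'n': 2})
print(q.qsize())  # 2 -- refreshes the cached size via LLEN
print(q.get())    # {'n': 1} -- FIFO: RPUSH on put, LPOP on get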
20 | """ 21 | 22 | def __init__(self, resultdb, inqueue): 23 | self.resultdb = resultdb 24 | self.inqueue = inqueue 25 | self._quit = False 26 | 27 | def on_result(self, task, result): 28 | '''Called every result''' 29 | if not result: 30 | return 31 | if 'taskid' in task and 'project' in task and 'url' in task: 32 | logger.info('result %s:%s %s -> %.30r' % ( 33 | task['project'], task['taskid'], task['url'], result)) 34 | return self.resultdb.save( 35 | project=task['project'], 36 | taskid=task['taskid'], 37 | url=task['url'], 38 | result=result 39 | ) 40 | else: 41 | logger.warning('result UNKNOW -> %.30r' % result) 42 | return 43 | 44 | def quit(self): 45 | self._quit = True 46 | 47 | def run(self): 48 | '''Run loop''' 49 | logger.info("result_worker starting...") 50 | 51 | while not self._quit: 52 | try: 53 | task, result = self.inqueue.get(timeout=1) 54 | self.on_result(task, result) 55 | except Queue.Empty as e: 56 | continue 57 | except KeyboardInterrupt: 58 | break 59 | except AssertionError as e: 60 | logger.error(e) 61 | continue 62 | except Exception as e: 63 | logger.exception(e) 64 | continue 65 | 66 | logger.info("result_worker exiting...") 67 | 68 | 69 | class OneResultWorker(ResultWorker): 70 | '''Result Worker for one mode, write results to stdout''' 71 | def on_result(self, task, result): 72 | '''Called every result''' 73 | if not result: 74 | return 75 | if 'taskid' in task and 'project' in task and 'url' in task: 76 | logger.info('result %s:%s %s -> %.30r' % ( 77 | task['project'], task['taskid'], task['url'], result)) 78 | print(json.dumps({ 79 | 'taskid': task['taskid'], 80 | 'project': task['project'], 81 | 'url': task['url'], 82 | 'result': result, 83 | 'updatetime': time.time() 84 | })) 85 | else: 86 | logger.warning('result UNKNOW -> %.30r' % result) 87 | return 88 | -------------------------------------------------------------------------------- /spider/pyspider/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler # NOQA 2 | -------------------------------------------------------------------------------- /spider/pyspider/scheduler/token_bucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-07 16:53:08 7 | 8 | import time 9 | try: 10 | import threading as _threading 11 | except ImportError: 12 | import dummy_threading as _threading 13 | 14 | 15 | class Bucket(object): 16 | 17 | ''' 18 | traffic flow control with token bucket 19 | ''' 20 | 21 | update_interval = 30 22 | 23 | def __init__(self, rate=1, burst=None): 24 | self.rate = float(rate) 25 | if burst is None: 26 | self.burst = float(rate) * 10 27 | else: 28 | self.burst = float(burst) 29 | self.mutex = _threading.Lock() 30 | self.bucket = self.burst 31 | self.last_update = time.time() 32 | 33 | def get(self): 34 | '''Get the number of tokens in bucket''' 35 | now = time.time() 36 | if self.bucket >= self.burst: 37 | self.last_update = now 38 | return self.bucket 39 | bucket = self.rate * (now - self.last_update) 40 | self.mutex.acquire() 41 | if bucket > 1: 42 | self.bucket += bucket 43 | if self.bucket > self.burst: 44 | self.bucket = self.burst 45 | self.last_update = now 46 | self.mutex.release() 47 | return self.bucket 48 | 49 | def set(self, value): 50 | '''Set number of 
-------------------------------------------------------------------------------- /spider/pyspider/webui/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-22 23:20:40 7 | 8 | from . import app, index, debug, task, result, login 9 |
-------------------------------------------------------------------------------- /spider/pyspider/webui/bench_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-08 22:31:17 7 | 8 | import random 9 | try: 10 | from urllib import urlencode 11 | except ImportError: 12 | from urllib.parse import urlencode 13 | 14 | from flask import request 15 | from .app import app 16 | 17 | 18 | @app.route('/bench') 19 | def bench_test(): 20 | total = int(request.args.get('total', 10000)) 21 | show = int(request.args.get('show', 20)) 22 | nlist = [random.randint(1, total) for _ in range(show)] 23 | result = [] 24 | result.append("<html><body>") 25 | args = dict(request.args) 26 | for nl in nlist: 27 | args['n'] = nl 28 | argstr = urlencode(sorted(args.items()), doseq=True) 29 | result.append("<a href='/bench?{0}'>follow {1}</a><br>".format(argstr, nl)) 30 | result.append("</body></html>") 31 | return "".join(result) 32 |
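login.py below authenticates requests from the HTTP Basic Authorization header (base64 of "user:password" after the "Basic " prefix). A hedged client-side sketch; the URL, port, and credentials are placeholders, and the requests library is assumed:

import base64
import requests

token = base64.b64encode(b'admin:secret').decode('utf8')
resp = requests.get('http://localhost:5000/results',
                    headers={'Authorization': 'Basic ' + token})
print(resp.status_code)  # 401 via app.login_response unless credentials match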
".format(argstr, nl)) 30 | result.append("") 31 | return "".join(result) 32 | -------------------------------------------------------------------------------- /spider/pyspider/webui/login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-10 20:36:27 7 | 8 | import base64 9 | from flask import Response 10 | from flask.ext import login 11 | from .app import app 12 | 13 | login_manager = login.LoginManager() 14 | login_manager.init_app(app) 15 | 16 | 17 | class AnonymousUser(login.AnonymousUserMixin): 18 | 19 | def is_anonymous(self): 20 | return True 21 | 22 | def is_active(self): 23 | return False 24 | 25 | def is_authenticated(self): 26 | return False 27 | 28 | def get_id(self): 29 | return 30 | 31 | 32 | class User(login.UserMixin): 33 | 34 | def __init__(self, id, password): 35 | self.id = id 36 | self.password = password 37 | 38 | def is_authenticated(self): 39 | if not app.config.get('webui_username'): 40 | return True 41 | if self.id == app.config.get('webui_username') \ 42 | and self.password == app.config.get('webui_password'): 43 | return True 44 | return False 45 | 46 | def is_active(self): 47 | return self.is_authenticated() 48 | 49 | 50 | login_manager.anonymous_user = AnonymousUser 51 | 52 | 53 | @login_manager.request_loader 54 | def load_user_from_request(request): 55 | api_key = request.headers.get('Authorization') 56 | if api_key: 57 | api_key = api_key[len("Basic "):] 58 | try: 59 | api_key = base64.b64decode(api_key).decode('utf8') 60 | return User(*api_key.split(":", 1)) 61 | except Exception as e: 62 | app.logger.error('wrong api key: %r, %r', api_key, e) 63 | return None 64 | return None 65 | app.login_response = Response( 66 | "need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'} 67 | ) 68 | 69 | 70 | @app.before_request 71 | def before_request(): 72 | if app.config.get('need_auth', False): 73 | if not login.current_user.is_active(): 74 | return app.login_response 75 | -------------------------------------------------------------------------------- /spider/pyspider/webui/result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 16:23:55 7 | 8 | from __future__ import unicode_literals 9 | 10 | from flask import render_template, request, json 11 | from flask import Response 12 | from .app import app 13 | from pyspider.libs import result_dump 14 | 15 | 16 | @app.route('/results') 17 | def result(): 18 | resultdb = app.config['resultdb'] 19 | project = request.args.get('project') 20 | offset = int(request.args.get('offset', 0)) 21 | limit = int(request.args.get('limit', 20)) 22 | 23 | count = resultdb.count(project) 24 | results = list(resultdb.select(project, offset=offset, limit=limit)) 25 | 26 | return render_template( 27 | "result.html", count=count, results=results, 28 | result_formater=result_dump.result_formater, 29 | project=project, offset=offset, limit=limit, json=json 30 | ) 31 | 32 | 33 | @app.route('/results/dump/.<_format>') 34 | def dump_result(project, _format): 35 | resultdb = app.config['resultdb'] 36 | # force update project list 37 | resultdb.get(project, 'any') 38 | if project not in resultdb.projects: 39 | return "no 
such project.", 404 40 | 41 | offset = int(request.args.get('offset', 0)) or None 42 | limit = int(request.args.get('limit', 0)) or None 43 | results = resultdb.select(project, offset=offset, limit=limit) 44 | 45 | if _format == 'json': 46 | valid = request.args.get('style', 'rows') == 'full' 47 | return Response(result_dump.dump_as_json(results, valid), 48 | mimetype='application/json') 49 | elif _format == 'txt': 50 | return Response(result_dump.dump_as_txt(results), 51 | mimetype='text/plain') 52 | elif _format == 'csv': 53 | return Response(result_dump.dump_as_csv(results), 54 | mimetype='text/csv') 55 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/index.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-02-23 00:28:30 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | h1 { 10 | margin-top: 5px; 11 | } 12 | header .alert { 13 | position: absolute; 14 | width: 50rem; 15 | left: 50%; 16 | margin-left: -25rem; 17 | } 18 | .queue-info th, 19 | .queue-info td { 20 | text-align: center; 21 | border: 1px solid #ddd; 22 | } 23 | .projects { 24 | min-width: 850px; 25 | border-top: 1px solid #ddd; 26 | border-bottom: 1px solid #ddd; 27 | } 28 | .projects .project-group { 29 | width: 80px; 30 | } 31 | .projects .project-name { 32 | font-weight: bold; 33 | } 34 | .projects .project-status { 35 | width: 100px; 36 | } 37 | .projects .project-status > span { 38 | border: solid 1px #666666; 39 | padding: 1px 5px 0 5px; 40 | background: #808080; 41 | color: white; 42 | } 43 | .projects span.status-TODO { 44 | border: solid 1px #ec971f; 45 | padding: 1px 5px 0 5px; 46 | background: #f0ad4e; 47 | color: white; 48 | } 49 | .projects span.status-STOP { 50 | border: solid 1px #c9302c; 51 | padding: 1px 5px 0 5px; 52 | background: #d9534f; 53 | color: white; 54 | } 55 | .projects span.status-CHECKING { 56 | border: solid 1px #dcbe00; 57 | padding: 1px 5px 0 5px; 58 | background: #ffde10; 59 | color: white; 60 | } 61 | .projects span.status-DEBUG { 62 | border: solid 1px #3071a9; 63 | padding: 1px 5px 0 5px; 64 | background: #428bca; 65 | color: white; 66 | } 67 | .projects span.status-RUNNING { 68 | border: solid 1px #449d44; 69 | padding: 1px 5px 0 5px; 70 | background: #5cb85c; 71 | color: white; 72 | } 73 | .projects .project-rate { 74 | width: 110px; 75 | } 76 | .projects .project-time { 77 | width: 110px; 78 | } 79 | .projects th.project-progress { 80 | position: relative; 81 | } 82 | .projects th.project-progress span { 83 | position: absolute; 84 | } 85 | .projects td.project-progress { 86 | position: relative; 87 | min-width: 5%; 88 | } 89 | .projects td.project-progress.progress-all { 90 | min-width: 10%; 91 | } 92 | .projects td.project-progress .progress { 93 | position: relative; 94 | margin: 0; 95 | background-color: #aaa; 96 | } 97 | .projects td.project-progress .progress .progress-text { 98 | width: 100%; 99 | text-align: center; 100 | position: absolute; 101 | font-weight: bold; 102 | color: #fff; 103 | pointer-events: none; 104 | } 105 | .projects td.project-progress .progress .progress-bar { 106 | -webkit-transition: none; 107 | transition: none; 108 | } 109 | .projects .project-actions { 110 | width: 200px; 111 | } 112 | .global-btn { 113 | margin-top: -5px; 
114 | padding: 10px 10px 10px 10px; 115 | } 116 | .global-btn .create-btn-div { 117 | float: right; 118 | } 119 | .global-btn .active-btn-div { 120 | float: left; 121 | } 122 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/index.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-02-23 00:28:30 */ 5 | 6 | @import "variable"; 7 | 8 | h1 { 9 | margin-top: 5px; 10 | } 11 | 12 | header .alert { 13 | position: absolute; 14 | width: 50rem; 15 | left: 50%; 16 | margin-left: -25rem; 17 | } 18 | 19 | .queue-info { 20 | th, td { 21 | text-align: center; 22 | border: 1px solid #ddd; 23 | } 24 | } 25 | 26 | .projects { 27 | min-width: 850px; 28 | border-top: 1px solid #ddd; 29 | border-bottom: 1px solid #ddd; 30 | 31 | .project-group { 32 | width: 80px; 33 | } 34 | 35 | .project-name { 36 | font-weight: bold; 37 | } 38 | 39 | .project-status { 40 | width: 100px; 41 | } 42 | .project-status-span(@color) { 43 | border: solid 1px darken(@color, 10%); 44 | padding: 1px 5px 0 5px; 45 | background: @color; 46 | color: white; 47 | } 48 | .project-status>span { 49 | .project-status-span(lighten(black, 50%)); 50 | } 51 | span.status-TODO { 52 | .project-status-span(@orange); 53 | } 54 | span.status-STOP { 55 | .project-status-span(@red); 56 | } 57 | span.status-CHECKING { 58 | .project-status-span(darken(@yellow, 10%)); 59 | } 60 | span.status-DEBUG { 61 | .project-status-span(@blue); 62 | } 63 | span.status-RUNNING { 64 | .project-status-span(@green); 65 | } 66 | 67 | .project-rate { 68 | width: 110px; 69 | } 70 | 71 | .project-time { 72 | width: 110px; 73 | } 74 | 75 | th.project-progress { 76 | position: relative; 77 | span { 78 | position: absolute; 79 | } 80 | } 81 | 82 | td.project-progress { 83 | position: relative; 84 | min-width: 5%; 85 | &.progress-all { 86 | min-width: 10%; 87 | } 88 | 89 | .progress { 90 | position: relative; 91 | margin: 0; 92 | background-color: #aaa; 93 | .progress-text { 94 | width: 100%; 95 | text-align: center; 96 | position: absolute; 97 | font-weight: bold; 98 | color: #fff; 99 | pointer-events: none; 100 | } 101 | .progress-bar { 102 | -webkit-transition: none; 103 | transition: none; 104 | } 105 | } 106 | } 107 | 108 | .project-actions { 109 | width: 200px; 110 | } 111 | } 112 | 113 | .global-btn { 114 | margin-top: -5px; 115 | padding: 10px 10px 10px 10px; 116 | 117 | .create-btn-div { 118 | float: right; 119 | } 120 | 121 | .active-btn-div { 122 | float: left; 123 | } 124 | } 125 | 126 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/result.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-10-22 22:38:45 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | .top-bar { 10 | padding: 10px 15px 2px 15px; 11 | height: 46px; 12 | background-color: #f5f5f5; 13 | border-bottom: 1px solid #ddd; 14 | position: relative; 15 | } 16 | .top-bar h1 { 17 | margin: 0 0 10px 0; 18 | font-size: 18px; 19 | } 20 | .top-bar .btn-group { 21 | margin: 8px 10px 0 0; 22 | position: absolute; 23 | right: 0; 24 | top: 0; 25 | } 26 | 
.pagination-wrap { 27 | text-align: right; 28 | padding-right: 15px; 29 | } 30 | table { 31 | border-bottom: 1px solid #ddd; 32 | } 33 | table td { 34 | word-break: break-all; 35 | } 36 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/result.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-10-22 22:38:45 */ 5 | 6 | @import "variable"; 7 | 8 | .top-bar { 9 | padding: 10px 15px 2px 15px; 10 | height: 46px; 11 | background-color: #f5f5f5; 12 | border-bottom: 1px solid #ddd; 13 | position: relative; 14 | 15 | h1 { 16 | margin: 0 0 10px 0; 17 | font-size: 18px; 18 | } 19 | 20 | .btn-group { 21 | margin: 8px 10px 0 0; 22 | position: absolute; 23 | right: 0; 24 | top: 0; 25 | 26 | a.btn { 27 | } 28 | } 29 | } 30 | 31 | .pagination-wrap { 32 | text-align: right; 33 | padding-right: 15px; 34 | } 35 | 36 | table { 37 | border-bottom: 1px solid #ddd; 38 | 39 | td { 40 | word-break: break-all; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/task.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:20:30 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | .base-info { 10 | padding: 10px 15px 2px 15px; 11 | background-color: #f5f5f5; 12 | } 13 | .more-info { 14 | padding: 10px 15px; 15 | border-top: 1px solid #ddd; 16 | } 17 | .more-info dd { 18 | display: block; 19 | font-family: monospace; 20 | white-space: pre; 21 | word-break: break-all; 22 | word-wrap: break-word; 23 | margin: 1em 0px; 24 | } 25 | .status-1 { 26 | border: solid 1px #3071a9; 27 | padding: 1px 5px 0 5px; 28 | background: #428bca; 29 | color: white; 30 | } 31 | .status-2 { 32 | border: solid 1px #449d44; 33 | padding: 1px 5px 0 5px; 34 | background: #5cb85c; 35 | color: white; 36 | } 37 | .status-3 { 38 | border: solid 1px #c9302c; 39 | padding: 1px 5px 0 5px; 40 | background: #d9534f; 41 | color: white; 42 | } 43 | .status-4 { 44 | border: solid 1px #666666; 45 | padding: 1px 5px 0 5px; 46 | background: #808080; 47 | color: white; 48 | } 49 | .url { 50 | font-size: 120%; 51 | text-decoration: underline; 52 | } 53 | .callback { 54 | color: #f0ad4e; 55 | font-weight: bold; 56 | } 57 | .callback:hover, 58 | .callback:focus { 59 | color: #ec971f; 60 | } 61 | dt .glyphicon-ok { 62 | color: #5cb85c; 63 | } 64 | dt .glyphicon-remove { 65 | color: #d9534f; 66 | } 67 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/task.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:20:30 */ 5 | 6 | @import "variable"; 7 | 8 | .base-info { 9 | padding: 10px 15px 2px 15px; 10 | background-color: #f5f5f5; 11 | border-bottom: 1px solid #ddd; 12 | } 13 | 14 | .more-info { 15 | padding: 10px 15px; 16 | } 17 | 18 | .more-info dd { 19 | display: block; 20 | font-family: monospace; 21 | white-space: pre; 22 | word-break: break-all; 23 | word-wrap: 
break-word; 24 | margin: 1em 0px; 25 | } 26 | 27 | .status_mix(@color: lighten(black, 50%)) { 28 | border: solid 1px darken(@color, 10%); 29 | padding: 1px 5px 0 5px; 30 | background: @color; 31 | color: white; 32 | } 33 | .status { 34 | &-1 { 35 | .status_mix(@blue); 36 | } 37 | &-2 { 38 | .status_mix(@green); 39 | } 40 | &-3 { 41 | .status_mix(@red); 42 | } 43 | &-4 { 44 | .status_mix; 45 | } 46 | } 47 | 48 | .url { 49 | font-size: 120%; 50 | text-decoration: underline; 51 | } 52 | 53 | .callback { 54 | color: @orange; 55 | font-weight: bold; 56 | 57 | &:hover, &:focus { 58 | color: darken(@orange, 10%); 59 | } 60 | } 61 | 62 | dt .glyphicon-ok { 63 | color: @green; 64 | } 65 | dt .glyphicon-remove { 66 | color: @red; 67 | } 68 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/tasks.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-18 23:20:46 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 10 | /* Author: Binux */ 11 | /* http://binux.me */ 12 | /* Created on 2014-07-16 19:20:30 */ 13 | .base-info { 14 | padding: 10px 15px 2px 15px; 15 | background-color: #f5f5f5; 16 | border-bottom: 1px solid #ddd; 17 | } 18 | .more-info { 19 | padding: 10px 15px; 20 | } 21 | .more-info dd { 22 | display: block; 23 | font-family: monospace; 24 | white-space: pre; 25 | word-break: break-all; 26 | word-wrap: break-word; 27 | margin: 1em 0px; 28 | } 29 | .status-1 { 30 | border: solid 1px #3071a9; 31 | padding: 1px 5px 0 5px; 32 | background: #428bca; 33 | color: white; 34 | } 35 | .status-2 { 36 | border: solid 1px #449d44; 37 | padding: 1px 5px 0 5px; 38 | background: #5cb85c; 39 | color: white; 40 | } 41 | .status-3 { 42 | border: solid 1px #c9302c; 43 | padding: 1px 5px 0 5px; 44 | background: #d9534f; 45 | color: white; 46 | } 47 | .status-4 { 48 | border: solid 1px #666666; 49 | padding: 1px 5px 0 5px; 50 | background: #808080; 51 | color: white; 52 | } 53 | .url { 54 | font-size: 120%; 55 | text-decoration: underline; 56 | } 57 | .callback { 58 | color: #f0ad4e; 59 | font-weight: bold; 60 | } 61 | .callback:hover, 62 | .callback:focus { 63 | color: #ec971f; 64 | } 65 | dt .glyphicon-ok { 66 | color: #5cb85c; 67 | } 68 | dt .glyphicon-remove { 69 | color: #d9534f; 70 | } 71 | .tasks { 72 | margin: 0; 73 | padding: 0; 74 | list-style-type: none; 75 | } 76 | .tasks li { 77 | padding: 10px 15px 2px 15px; 78 | background-color: #f5f5f5; 79 | border-bottom: 1px solid #ddd; 80 | } 81 | .tasks li:nth-child(even) { 82 | background-color: white; 83 | } 84 | .tasks .url { 85 | display: inline-block; 86 | vertical-align: bottom; 87 | max-width: 40em; 88 | overflow: hidden; 89 | white-space: nowrap; 90 | text-overflow: ellipsis; 91 | } 92 | .tasks .update-time { 93 | font-weight: bold; 94 | } 95 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/tasks.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-18 23:20:46 */ 5 | 6 | @import "variable"; 7 | @import "task"; 8 | 9 | .tasks { 10 | margin: 0; 11 | 
padding: 0; 12 | list-style-type: none; 13 | 14 | li { 15 | .base-info; 16 | 17 | &:nth-child(even) { 18 | background-color: white; 19 | } 20 | } 21 | 22 | .url { 23 | display: inline-block; 24 | vertical-align: bottom; 25 | max-width: 40em; 26 | overflow: hidden; 27 | white-space: nowrap; 28 | text-overflow: ellipsis; 29 | } 30 | 31 | .update-time { 32 | font-weight: bold; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/variable.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:18:30 */ 5 | 6 | // colors 7 | @gray-darker: lighten(#000, 13.5%); // #222 8 | @gray-dark: lighten(#000, 20%); // #333 9 | @gray: lighten(#000, 33.5%); // #555 10 | @gray-light: lighten(#000, 60%); // #999 11 | @gray-lighter: lighten(#000, 93.5%); // #eee 12 | 13 | @blue: #428bca; 14 | @green: #5cb85c; 15 | @blue-light: #5bc0de; 16 | @orange: #f0ad4e; 17 | @yellow: #ffe543; 18 | @red: #d9534f; 19 | -------------------------------------------------------------------------------- /spider/pyspider/webui/task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-16 15:30:57 7 | 8 | import socket 9 | from flask import abort, render_template, request, json 10 | 11 | from pyspider.libs import utils 12 | from .app import app 13 | 14 | 15 | @app.route('/task/<taskid>') 16 | def task(taskid): 17 | if ':' not in taskid: 18 | abort(400) 19 | project, taskid = taskid.split(':', 1) 20 | 21 | taskdb = app.config['taskdb'] 22 | task = taskdb.get_task(project, taskid) 23 | if not task: 24 | abort(404) 25 | resultdb = app.config['resultdb'] 26 | result = resultdb.get(project, taskid) if resultdb else None 27 | 28 | 29 | return render_template("task.html", task=task, json=json, result=result, 30 | status_to_string=app.config['taskdb'].status_to_string) 31 | 32 | 33 | @app.route('/tasks') 34 | def tasks(): 35 | rpc = app.config['scheduler_rpc'] 36 | taskdb = app.config['taskdb'] 37 | project = request.args.get('project', "") 38 | limit = int(request.args.get('limit', 100)) 39 | 40 | try: 41 | updatetime_tasks = rpc.get_active_tasks(project, limit) 42 | except socket.error as e: 43 | app.logger.warning('connect to scheduler rpc error: %r', e) 44 | return 'connect to scheduler error', 502 45 | 46 | tasks = {} 47 | result = [] 48 | for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]): 49 | key = '%(project)s:%(taskid)s' % task 50 | task['updatetime'] = updatetime 51 | if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE: 52 | result.append(tasks[key]) 53 | tasks[key] = task 54 | result.extend(tasks.values()) 55 | 56 | return render_template( 57 | "tasks.html", 58 | tasks=result, 59 | status_to_string=taskdb.status_to_string 60 | ) 61 | 62 | 63 | @app.route('/active_tasks') 64 | def active_tasks(): 65 | rpc = app.config['scheduler_rpc'] 66 | taskdb = app.config['taskdb'] 67 | project = request.args.get('project', "") 68 | limit = int(request.args.get('limit', 100)) 69 | 70 | try: 71 | tasks = rpc.get_active_tasks(project, limit) 72 | except socket.error as e: 73 | app.logger.warning('connect to scheduler rpc error: %r', e) 74 | return '{}', 502, {'Content-Type': 
'application/json'} 75 | 76 | result = [] 77 | for updatetime, task in tasks: 78 | task['updatetime'] = updatetime 79 | task['updatetime_text'] = utils.format_date(updatetime) 80 | if 'status' in task: 81 | task['status_text'] = taskdb.status_to_string(task['status']) 82 | result.append(task) 83 | 84 | return json.dumps(result), 200, {'Content-Type': 'application/json'} 85 | 86 | app.template_filter('format_date')(utils.format_date) 87 | -------------------------------------------------------------------------------- /spider/pyspider/webui/templates/helper.html: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /spider/pyspider/webui/templates/helper.js: -------------------------------------------------------------------------------- 1 | // vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: 2 | // Author: Binux 3 | // http://binux.me 4 | // Created on 2014-03-16 11:05:05 5 | 6 | (function() { 7 | var loaded = false; 8 | var start_time = (new Date()).getTime(); 9 | function resize() { 10 | if (!loaded) 11 | parent.postMessage({type: 'resize', height: document.body.scrollHeight}, '*'); 12 | } 13 | 14 | window.addEventListener('load', function() { 15 | resize(); 16 | loaded = true; 17 | }); 18 | setTimeout(resize, 5000); 19 | setTimeout(resize, 10000); 20 | setTimeout(resize, 20000); 21 | setTimeout(resize, 30000); 22 | 23 | var css_helper_enabled = false; 24 | window.addEventListener("message", function(ev) { 25 | if (!css_helper_enabled && ev.data.type == "enable_css_selector_helper") { 26 | var script = document.createElement("script"); 27 | script.src = "//{{ host }}/static/css_selector_helper.js"; 28 | document.body.appendChild(script); 29 | css_helper_enabled = true; 30 | } 31 | }, false); 32 | 33 | document.addEventListener('click', function(ev) { 34 | ev.preventDefault(); 35 | }); 36 | })(); 37 | -------------------------------------------------------------------------------- /spider/pyspider/webui/templates/tasks.html: -------------------------------------------------------------------------------- 5 | Tasks - pyspider 
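{# Each iteration below renders one task entry: the status badge uses task.status when present, otherwise falls back to FETCH_ERROR / PROCESS_ERROR derived from the track flags, and shows ERROR when neither is available. #}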
    21 | {% for task in tasks | sort(reverse=True, attribute='updatetime') %} 22 |
23 | {% if task.status %} 24 | {{ status_to_string(task.status) }} 25 | {% elif task.track %} 26 | 27 | {% set fetchok = task.track.fetch and task.track.fetch.ok %} 28 | {% set processok = task.track.process and task.track.process.ok %} 29 | {%- if not fetchok -%} 30 | FETCH_ERROR 31 | {%- elif not processok -%} 32 | PROCESS_ERROR 33 | {%- endif -%} 34 | 35 | {% else %} 36 | ERROR 37 | {% endif %} 38 | 39 | {{ task.project }} 40 | 41 | {{ task.url }} 42 | 43 | {{ task.updatetime | format_date }} 44 | 45 | {% if task.track and task.track.fetch %} 46 | 47 | {{- '%.1f' | format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms 48 | 49 | {% endif %} 50 | 51 | 52 | {% if task.track and task.track.process %} 53 | +{{ task.track.process.follows | int }} 54 | {% endif %} 55 | 56 | 
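{# The timing cell above converts the fetch and process durations recorded in task.track from seconds to milliseconds for display. #}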
57 | {% endfor %} 58 | -------------------------------------------------------------------------------- /spider/pyspiderSource/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/.DS_Store -------------------------------------------------------------------------------- /spider/pyspiderSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/__all__/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/__all__/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_baby/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_baby/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_car/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_car/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_discovery/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_discovery/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_essay/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_essay/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_fashion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_fashion/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_finance/README.md -------------------------------------------------------------------------------- 
/spider/pyspiderSource/news_food/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_food/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_game/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_game/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_history/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_history/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_hot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_hot/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_military/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_regimen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_regimen/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_society/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_story/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_story/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_tech/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_travel/README.md: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_travel/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_world/README.md -------------------------------------------------------------------------------- /spider/sina/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sina/README.md -------------------------------------------------------------------------------- /spider/sina/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/sina/sina.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'Jeezy' 3 | 4 | import requests 5 | import re 6 | 7 | class GetSina(): 8 | ''' 9 | Fetch news info via the Sina rolling-news API and return it for saving to Excel. 10 | ''' 11 | def __init__(self,num,page): 12 | self.num = str(num) 13 | self.page = str(page) 14 | self.url = "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&k=&num="+self.num+"&asc=&page="+self.page+"&r=0.41627189057293945" 15 | def getNews(self): 16 | # crawl the Sina news text content via the API 17 | gettext = requests.get(self.url) 18 | gettext.encoding='gbk' 19 | gettext = gettext.text 20 | allNewsData = [] 21 | pattern = re.compile('channel : {title : "(.*?)",id.*?title : "(.*?)",url : "(.*?)",type.*?time : (.*?)}',re.S) 22 | items = re.findall(pattern,gettext) 23 | for eachData in items: 24 | newsData = {} 25 | newsData["tag"] = eachData[0] 26 | newsData["title"] = eachData[1] 27 | newsData["display_url"] = eachData[2] 28 | newsData["display_time"] = eachData[3] 29 | newsData["source"] = "新浪新闻" 30 | allNewsData.append(newsData) 31 | return allNewsData 32 | #if allNewsData[0]['tag']== "体育": 33 | #print (allNewsData) 34 | 35 | #sina =GetSina(5,1) 36 | #sina.getNews() -------------------------------------------------------------------------------- /spider/sinaSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_finance/README.md -------------------------------------------------------------------------------- 
/spider/sinaSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_military/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_society/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_tech/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_world/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/__all__/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/__all__/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/gallery_detail/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/gallery_detail/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_baby/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_baby/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_car/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_car/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_car/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_car/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_discovery/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_discovery/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_entertainment/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_entertainment/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_essay/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_essay/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_fashion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_fashion/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_finance/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_finance/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_finance/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_food/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_food/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_game/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_game/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_history/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_history/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_hot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_hot/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_military/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_regimen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_regimen/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_society/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_society/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_society/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_sports/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_sports/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_story/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_story/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_tech/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_tech/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_tech/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_travel/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_travel/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_world/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/video/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/video/README.md -------------------------------------------------------------------------------- /spider/toutiao/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/toutiao/touTiao.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | 4 | import sys, time, json, requests 5 | 6 | 7 | class GetToutiao(): 8 | """ 9 | Fetch news info via the Jinri Toutiao API and save it to a local Excel file. 10 | """ 11 | 12 | def __init__(self, count, category, time): 13 | self.count = count 14 | self.category = category 15 | self.time = time 16 | self.url = "http://toutiao.com/api/article/recent/?count=" + count + "&category=" + category + "&as=A1A5177BB0F7063&cp=57B0776066D39E1&max_create_time=1471155832&_=" + str( 17 | time) 18 | 19 | def getNews(self): 20 | print(self.url) 21 | try: 22 | header = { 23 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'} 24 | root = requests.get("http://toutiao.com/", headers=header) 25 | news = requests.get(self.url, headers=header, cookies=root.cookies) 26 | allNewsData = [] 27 | try: 28 | news = str(news.text).strip("'<>() ").replace('\'', '\"') 29 | newsJson = json.loads(news) 30 | if newsJson["data"]: 31 | for eachData in newsJson["data"]: 32 | newsData = {} 33 | newsData["title"] = eachData["title"] 34 | newsData["display_url"] = eachData["display_url"] 35 | newsData["display_time"] = eachData["display_time"] 36 | newsData["source"] = eachData["source"] 37 | newsData["keywords"] = eachData["keywords"] 38 | newsData["abstract"] = eachData["abstract"] 39 | if "middle_image" in eachData.keys(): 40 | newsData["images"] = eachData["middle_image"] 41 | else: 42 | newsData["images"] = "null" 43 | newsData["tag"] = eachData["tag"] 44 | allNewsData.append(newsData) 45 | else: 46 | exit("no data!") 47 | except: 48 | print(repr(news)) 49 | print(sys.exc_info()) 50 | return allNewsData 51 | except ConnectionError: 52 | exit("ConnectionError") 53 | 54 | # for i in range(1,20): 55 | # get = GetToutiao("30", "news_society", time.time()) 56 | # allNewsData = get.getNews() 57 | # for i in allNewsData: 58 | # print(i) 59 | 
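For quick reference, a minimal driver for GetToutiao. This is a sketch only: it mirrors the commented-out usage at the bottom of touTiao.py, assumes it is run from the project root so that the spider.toutiao package is importable, and assumes the Toutiao endpoint still accepts the hard-coded as/cp tokens in GetToutiao.__init__ (if they have expired, getNews() exits with "no data!").

# runToutiaoSample.py -- hypothetical driver, not part of this repository
import time

from spider.toutiao.touTiao import GetToutiao

if __name__ == "__main__":
    # 30 items from the society channel, anchored at the current timestamp
    toutiao = GetToutiao("30", "news_society", time.time())
    for news in toutiao.getNews():
        # each record is a dict assembled in GetToutiao.getNews()
        print(news["display_time"], news["title"], news["display_url"])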
-------------------------------------------------------------------------------- /spider/toutiao/touTiaoSpider.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | from xlsxwriter import Workbook 4 | import time 5 | import datetime 6 | import os 7 | from random import choice 8 | from spider.toutiao.touTiao import GetToutiao 9 | from config.n_conf import dirPath 10 | 11 | 12 | def getToutiaoNews(category, page, num): 13 | """ 14 | Des: fetch Jinri Toutiao news 15 | param: 16 | category: news category, defaults to __all__ 17 | page: number of pages to crawl, 20 by default 18 | num: items per page, varies with what Toutiao returns per page, defaults to 20 19 | ctime: news timestamp, obtained via the standard library time.time() 20 | return: Excel files in the per-category folders under /source/ 21 | """ 22 | newsData = [] 23 | for page in range(0, page): 24 | # ltime = [time.time(),"1464710423","1464796865","1464753667","1464840044","1464883266"] 25 | # ctime = choice(ltime) 26 | # print(ctime) 27 | # get the time one day ago 28 | twoDayAgo = (datetime.datetime.now() - datetime.timedelta(days=1)) 29 | # convert it to a timestamp: 30 | timeStamp = int(time.mktime(twoDayAgo.timetuple())) 31 | ctime = choice(range(timeStamp, int(time.time()))) 32 | toutiao = GetToutiao(str(num), category, ctime) 33 | allNewsData = toutiao.getNews() 34 | for news in allNewsData: 35 | newsData.append(news) 36 | mkExcel(category, newsData) 37 | 38 | 39 | def getTimestamp(startTime): 40 | """ 41 | Des: convert a time string into a timestamp 42 | param: startTime="2016-05-17 12:00:00" (format) 43 | return: timeStamp 44 | """ 45 | timeArray = time.strptime(startTime, "%Y-%m-%d %H:%M:%S") 46 | timeStamp = int(time.mktime(timeArray)) 47 | return timeStamp 48 | 49 | 50 | def mkExcel(cate, data): 51 | """ 52 | Write the news data into an Excel sheet. 53 | :param cate: news category 54 | :param data: crawled news data 55 | :return: the generated Excel file 56 | """ 57 | # build the Excel file name 58 | excelName = dirPath + "/spider/touTiaoSource/" + cate + "/" + str( 59 | time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())) + "&" + cate + "&" + str(len(data)) + ".xlsx" 60 | # create the workbook and worksheet 61 | jr_work = Workbook(excelName) 62 | jr_sheet = jr_work.add_worksheet("toutiao") 63 | bold = jr_work.add_format({'bold': True}) # a bold cell format 64 | jr_sheet.set_column('A:H', 40) 65 | jr_sheet.set_column('C:D', 15) 66 | jr_sheet.write(0, 0, '标题', bold) 67 | jr_sheet.write(0, 1, '发表地址', bold) 68 | jr_sheet.write(0, 2, '发表时间', bold) 69 | jr_sheet.write(0, 3, '来源', bold) 70 | jr_sheet.write(0, 4, '关键词', bold) 71 | jr_sheet.write(0, 5, '摘要', bold) 72 | jr_sheet.write(0, 6, '图片地址', bold) 73 | jr_sheet.write(0, 7, '标签', bold) 74 | line = 0 75 | for eachData in data: 76 | line += 1 77 | jr_sheet.write(line, 0, eachData["title"]) 78 | jr_sheet.write(line, 1, eachData["display_url"]) 79 | jr_sheet.write(line, 2, eachData["display_time"]) 80 | jr_sheet.write(line, 3, eachData["source"]) 81 | jr_sheet.write(line, 4, eachData["keywords"]) 82 | jr_sheet.write(line, 5, eachData["abstract"]) 83 | jr_sheet.write(line, 6, str(eachData["images"])) 84 | jr_sheet.write(line, 7, eachData["tag"]) 85 | jr_work.close() 86 | log = "%s新闻表抓取完成,抓取数据%d条" % (excelName, line) 87 | 88 | with open(dirPath+"/log.txt", 'a') as fp: 89 | fp.write(log + "\n") 90 | print(log) 91 | 92 | 93 | # news categories 94 | category = ["news_society", "news_entertainment", 95 | "news_tech", "news_car", "news_sports", "news_finance", "news_military", "news_world", 96 | "news_fashion", "news_travel", "news_discovery", "news_baby", "news_regimen", "news_story", 97 | "news_essay", "news_game", "news_history", "news_food"] 98 | -------------------------------------------------------------------------------- /spider/wordAna/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/.DS_Store -------------------------------------------------------------------------------- /spider/wordAna/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/__init__.py -------------------------------------------------------------------------------- /spider/wordAna/allNews/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/allNews/.DS_Store -------------------------------------------------------------------------------- /spider/wordAna/allNews/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/allNews/README.md -------------------------------------------------------------------------------- /spider/wordAna/contentSpider.py: -------------------------------------------------------------------------------- 1 | import jieba 2 | import jieba.analyse 3 | import os 4 | import re 5 | from spider.wordAna.contentTool import ContentOperator 6 | from spider.wordAna.excelTool import ExcelOperator 7 | from config.n_conf import dirPath 8 | 9 | 10 | def getNewsContent(): 11 | """ 12 | Fetch the full content of every crawled news item, extract keywords, and save the enriched rows to new Excel files. 13 | """ 14 | # absolute path of the source directory 15 | orgDir = dirPath + "/spider/wordAna/allNews" 16 | # absolute path of the final output directory 17 | finalDir = dirPath + "/spider/wordAna/wordAnaNews/" 18 | print(orgDir) 19 | print(finalDir) 20 | 21 | # Excel helper from excelTool.py 22 | et = ExcelOperator() 23 | # content helper from contentTool.py 24 | ct = ContentOperator() 25 | files = [x for x in os.listdir(orgDir) if os.path.splitext(x)[-1] == '.xlsx'] 26 | 27 | # iterate over the per-category news Excel files 28 | for file in files: 29 | # print(file) 30 | # all rows of this Excel file: a list whose elements are dicts mapping column name -> value 31 | infoList = et.getExcelInfo(os.path.join(orgDir, file)) 32 | # collects the fully populated news records 33 | last_list = [] 34 | for new_info in infoList: 35 | urlstr = new_info["display_url"] 36 | # tell the link's origin apart, Toutiao vs. Sina, by the keywords toutiao.com and sina.com 37 | htmlContent, textContent, title, abstract, keywords, source, tag = '', '', '', '', '', '', '' 38 | img_url_list = [] 39 | try: 40 | # skip Sina slideshow and video/multimedia items here 41 | if urlstr.find("sina.com") != -1 and urlstr.find("slide") == -1 and urlstr.find("video") == -1: 42 | print(urlstr) 43 | textContent, htmlContent, img_url_list, keyword_list, abstract = ct.getSinaContent(urlstr) 44 | new_info["keywords"] = ' '.join(keyword_list) 45 | new_info["abstract"] = ' '.join(abstract) 46 | 47 | elif urlstr.find("toutiao.com") != -1: 48 | print(urlstr) 49 | textContent, htmlContent, img_url_list, title, abstract, keywords, source, tag = ct.getToutiaoContent( 50 | urlstr) 51 | if title: 52 | new_info["title"] = title 53 | else: 54 | new_info["title"] = '' 55 | if abstract: 56 | new_info["abstract"] = abstract 57 | else: 58 | new_info["abstract"] = '' 59 | if keywords: 60 | new_info["keywords"] = keywords 61 | else: 62 | new_info["keywords"] = '' 63 | if source: 64 | new_info["source"] = source 65 | else: 66 | new_info["source"] = '' 67 | if tag: 68 | new_info["tag"] = tag 69 | else: 70 | new_info["tag"] = '' 71 | 72 | try: 73 | feature = 
jieba.analyse.extract_tags(textContent, 15) 74 | except: 75 | feature = new_info["keywords"] 76 | new_info["textContent"] = textContent 77 | new_info["htmlContent"] = htmlContent 78 | new_info["feature"] = feature 79 | new_info["img"] = img_url_list 80 | last_list.append(new_info) 81 | except: 82 | pass 83 | # use jieba Chinese word segmentation to extract the 15 most important feature terms from the body text 84 | # underlying algorithm --- TF-IDF 85 | # print(textContent) 86 | 87 | # filtering, crawling and keyword extraction are done; now save everything to an Excel file 88 | excelName = os.path.join(finalDir, file) 89 | print("excelName:" + excelName) 90 | # the second argument here is the worksheet name; adjust as needed 91 | et.saveToExcel(excelName, "allNews", last_list) 92 | -------------------------------------------------------------------------------- /spider/wordAna/wordAnaNews/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/wordAnaNews/.DS_Store -------------------------------------------------------------------------------- /spider/wordAna/wordAnaNews/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/wordAnaNews/README.md -------------------------------------------------------------------------------- /static/css/login.css: -------------------------------------------------------------------------------- 1 | body { 2 | background: url(../images/back.jpg) !important; 3 | font-family: "Helvetica Neue", "Hiragino Sans GB", "Microsoft YaHei", "\9ED1\4F53", Arial, sans-serif; 4 | color: #222; 5 | font-size: 12px; 6 | } 7 | * { 8 | padding: 0px; 9 | margin: 0px; 10 | } 11 | .top_div { 12 | background:rgba(15, 10, 10, 0.19); 13 | width: 100%; 14 | height: 400px; 15 | } 16 | .i_top{ 17 | width: 100%; 18 | height: 50px; 19 | background-color: rgba(59, 167, 173, 0.52); 20 | } 21 | .i_top img{ 22 | width: 45px; 23 | height: 45px; 24 | margin-left: 15px; 25 | } 26 | .ipt { 27 | border: 1px solid #d3d3d3; 28 | padding: 10px 10px; 29 | width: 290px; 30 | border-radius: 4px; 31 | padding-left: 35px; 32 | -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075); 33 | box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075); 34 | -webkit-transition: border-color ease-in-out .15s, -webkit-box-shadow ease-in-out .15s; 35 | -o-transition: border-color ease-in-out .15s, box-shadow ease-in-out .15s; 36 | transition: border-color ease-in-out .15s, box-shadow ease-in-out .15s 37 | } 38 | .ipt:focus { 39 | border-color: #66afe9; 40 | outline: 0; 41 | -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075), 0 0 8px rgba(102, 175, 233, .6); 42 | box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075), 0 0 8px rgba(102, 175, 233, .6) 43 | } 44 | .u_logo { 45 | background: url("../images/username.png") no-repeat; 46 | padding: 10px 10px; 47 | position: absolute; 48 | top: 43px; 49 | left: 40px; 50 | } 51 | .p_logo { 52 | background: url("../images/password.png") no-repeat; 53 | padding: 10px 10px; 54 | position: absolute; 55 | top: 12px; 56 | left: 40px; 57 | } 58 | a { 59 | text-decoration: none; 60 | } 61 | .tou { 62 | background: url("../images/top_1.png") no-repeat; 63 | width: 115px; 64 | height: 92px; 65 | position: absolute; 66 | top: -93px; 67 | left: 132.5px; 68 | } 69 | .left_hand { 70 | background: url("../images/left_hand.png") no-repeat; 71 | width: 32px; 72 | height: 37px; 73 | position: absolute; 74 | top: -38px; 75 | left: 150px; 76 | } 77 | .right_hand { 78 | background: url("../images/right_hand.png") 
no-repeat; 79 | width: 32px; 80 | height: 37px; 81 | position: absolute; 82 | top: -38px; 83 | right: -64px; 84 | } 85 | .initial_left_hand { 86 | background: url("../images/hand.png") no-repeat; 87 | width: 30px; 88 | height: 20px; 89 | position: absolute; 90 | top: -12px; 91 | left: 100px; 92 | } 93 | .initial_right_hand { 94 | background: url("../images/hand.png") no-repeat; 95 | width: 30px; 96 | height: 20px; 97 | position: absolute; 98 | top: -12px; 99 | right: -112px; 100 | } 101 | .left_handing { 102 | background: url("../images/left-handing.png") no-repeat; 103 | width: 30px; 104 | height: 20px; 105 | position: absolute; 106 | top: -24px; 107 | left: 139px; 108 | } 109 | .right_handinging { 110 | background: url("../images/right_handing.png") no-repeat; 111 | width: 30px; 112 | height: 20px; 113 | position: absolute; 114 | top: -21px; 115 | left: 210px; 116 | } 117 | #login{ 118 | cursor: pointer; 119 | } -------------------------------------------------------------------------------- /static/css/newsManage.css: -------------------------------------------------------------------------------- 1 | .read_list{ 2 | height:150px; 3 | width: 150px; 4 | border: 1px; 5 | border-radius: 50%; 6 | margin:0 auto; 7 | font-size: 25px; 8 | padding-left: 27px; 9 | padding-top: 59px; 10 | } 11 | 12 | #read_list{ 13 | background-color: #FFAEB9; 14 | /*#FFBBFF*/ 15 | } 16 | 17 | #love_list{ 18 | background-color: #f9c693; 19 | } 20 | 21 | #comment_list{ 22 | background-color: #f7dd90; 23 | } 24 | 25 | #delete_but{ 26 | background-color: #b6d7a8; 27 | } 28 | 29 | .news_list{ 30 | 31 | width: 80%; 32 | margin:0 auto; 33 | } 34 | 35 | .each_news{ 36 | background-color: white; 37 | width:100%; 38 | margin:0 auto; 39 | height:100%; 40 | margin-top:0.5%; 41 | text-align: center; 42 | } 43 | 44 | .read_list a{ 45 | color:#333; 46 | text-decoration: none; 47 | } 48 | 49 | .read_list a:visited { 50 | color: #333; 51 | text-decoration: none; 52 | } 53 | .read_list a:hover { 54 | color: #333; 55 | text-decoration: none; 56 | } 57 | .read_list a:active{ 58 | color: #333; 59 | text-decoration: none; 60 | } 61 | 62 | -------------------------------------------------------------------------------- /static/css/userManage.css: -------------------------------------------------------------------------------- 1 | .second-menu{ 2 | width: 20%; 3 | 4 | } 5 | 6 | .second-menu a:focus, .nav-list li.active > a{ 7 | text-decoration: none; 8 | background-color: #1A2022; 9 | border-left: 3px solid #23BAB5; 10 | font-size: 14px; 11 | color: #FFF; 12 | transition: all 0.1s ease 0s; 13 | } 14 | 15 | 16 | .second-menu>li{ 17 | list-style: none; 18 | width: 100%; 19 | height: 40px; 20 | font-size: 15px; 21 | color: #80969C; 22 | } 23 | 24 | .second-menu> li > a > span { 25 | font-size: 16px; 26 | padding-top: 5px; 27 | } 28 | 29 | 30 | .second-menu > li > a { 31 | color: #80969c; 32 | position: relative; 33 | display: block; 34 | padding: 10px 15px; 35 | text-decoration: none; 36 | } 37 | 38 | .second-menu > li > a:hover { 39 | background-color: #1b6d85; 40 | color: #ffffff; 41 | } 42 | 43 | .user_third_head{ 44 | padding-top: 3%; 45 | 46 | } 47 | .super_name{ 48 | padding-left: 30%; 49 | 50 | } 51 | 52 | .table-bordered > thead > a{ 53 | text-decoration: none; 54 | } 55 | 56 | .super_head{ 57 | margin-left: 15%; 58 | } 59 | 60 | .user_third_menu{ 61 | margin-top:100px; 62 | } 63 | 64 | .user_info{ 65 | margin-top: 70px; 66 | background-color: antiquewhite; 67 | width: 300px; 68 | height:260px; 69 | font-size: 18px; 70 | border: 
1px solid #E5E6E7; 71 | border-radius: 7%; 72 | display: none; 73 | } 74 | 75 | .each_info{ 76 | padding-top: 20px; 77 | padding-left: 30px; 78 | } 79 | 80 | .page1{ 81 | font-size: 20px; 82 | } 83 | 84 | .user{ 85 | background-color: palevioletred; 86 | width: 400px; 87 | height: 600px; 88 | margin:0 auto; 89 | } 90 | 91 | 92 | /* feedback */ 93 | 94 | .feedback{ 95 | width: 60%; 96 | height: 500px; 97 | margin-left: 10%; 98 | margin-top: 10px; 99 | float: left; 100 | } 101 | 102 | .each_feedback{ 103 | background-color: #DDDDDD; 104 | border: 1px solid #DDDDDD; 105 | padding-bottom: 10px; 106 | margin-top: 10px; 107 | } 108 | 109 | .feedback_name{ 110 | padding-top: 10px; 111 | padding-left: 10px; 112 | font-size: 20px; 113 | } 114 | 115 | .feedback_content{ 116 | height: 80px; 117 | width: 100%; 118 | font-size: 17px; 119 | padding-left: 20px; 120 | } 121 | .feedback_isreply{ 122 | font-size: 17px; 123 | float: right; 124 | padding-right: 20px; 125 | } 126 | 127 | .feedback_gettime{ 128 | font-size: 17px; 129 | float: right; 130 | padding-right: 20px; 131 | } 132 | 133 | .rep{ 134 | color: firebrick; 135 | } 136 | 137 | .rep_button{ 138 | margin-left: 10px; 139 | padding-left: 10px; 140 | } 141 | -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /static/images/1.svg: -------------------------------------------------------------------------------- 5 | Created by FontForge 20120731 at Sat May 14 15:29:57 2016 6 | By admin -------------------------------------------------------------------------------- /static/images/admin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/admin.jpg -------------------------------------------------------------------------------- /static/images/back.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/back.jpg
--------------------------------------------------------------------------------
/static/images/bg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/bg.jpg
--------------------------------------------------------------------------------
/static/images/bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/bg.png
--------------------------------------------------------------------------------
/static/images/bgb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/bgb.jpg
--------------------------------------------------------------------------------
/static/images/hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/hand.png
--------------------------------------------------------------------------------
/static/images/left-handing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/left-handing.png
--------------------------------------------------------------------------------
/static/images/left_hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/left_hand.png
--------------------------------------------------------------------------------
/static/images/news.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/news.png
--------------------------------------------------------------------------------
/static/images/password.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/password.png
--------------------------------------------------------------------------------
/static/images/right_hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/right_hand.png
--------------------------------------------------------------------------------
/static/images/right_handing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/right_handing.png
--------------------------------------------------------------------------------
/static/images/save.svg:
--------------------------------------------------------------------------------
[SVG markup lost in this dump]
--------------------------------------------------------------------------------
/static/images/save0.svg:
--------------------------------------------------------------------------------
[SVG markup lost in this dump]
--------------------------------------------------------------------------------
/static/images/top_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/top_1.png
--------------------------------------------------------------------------------
/static/images/tou.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/tou.png
--------------------------------------------------------------------------------
/static/images/username.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/username.png
--------------------------------------------------------------------------------
/static/js/feedback.js:
--------------------------------------------------------------------------------
/**
 * Created by jeezy-lyoung on 16-8-2.
 */
$(document).ready(function () {

    var net = "http://127.0.0.1:8888";
    var count = 3;          // feedback items per page
    var page_value = 1;     // current page (1-based)
    var alrequest = 0;      // offset of items already fetched
    var page = "xia";       // paging direction: "xia" = next, "shang" = previous
    var time = '1';
    var tooken = '764bfd755bc07f6871eee104219b2b2c';

    function get_user() {
        var qx = {"alrequest": alrequest, "page": page, "time": time, "tooken": tooken};
        $.ajax({
            type: "get",
            url: net + "/api/adminfeedback?count=" + count,
            data: qx,
            cache: false,
            success: function (data) {
                // the endpoint returns a JSON string; JSON.parse replaces the
                // original eval('(' + data + ')'), which executed arbitrary input
                var result = JSON.parse(data);
                var all_data = result.data;
                var is_success = result.message;

                if (is_success == "failed") {
                    alert("已是最后一页");
                    page_value = page_value - 1;
                } else {
                    // NOTE: the row markup and JSON field names in this loop are
                    // reconstructions; the original cell strings were stripped
                    // from this dump, and only the variable names survive.
                    var table = "";
                    for (var i = 0; i < all_data.length; i++) {
                        var name = "<td>" + all_data[i].u_name + "</td>";
                        var feed_content = "<td>" + all_data[i].feed_content + "</td>";
                        var get_time = "<td>" + all_data[i].get_time + "</td>";
                        var is_rep = "<td>" + all_data[i].is_reply + "</td>";
                        var rep_button = "<td><button class='rep_button'>回复</button></td>";
                        table = table + "<tr>" + name + feed_content + get_time + is_rep + rep_button + "</tr>";
                    }
                    document.getElementById("not_feedback").innerHTML = table;
                }
            }
        });
    }

    get_user();

    $("#xia").click(function () {
        page = "xia";
        alrequest = page_value * 3;
        get_user();
        page_value = page_value + 1;
    });

    $("#shang").click(function () {
        page = "shang";
        if (page_value == 1) {
            alert("已是第一页");
        } else {
            page_value = page_value - 1;
            alrequest = page_value * 3;
            get_user();
        }
    });
})
--------------------------------------------------------------------------------
/static/js/howie.js:
--------------------------------------------------------------------------------
window.onload = function () {
    var name = document.getElementById('username');
    var pass = document.getElementById('password');
    var login = document.getElementById('login');
    login.onclick = function () {
        if (name.value == "" || pass.value == "") {
            alert('不能有内容为空');
        }
    }
}
$(document).ready(function () {
    $("#login").click(function () {
        var user = $("#username").val();
        var pwd = $("#password").val();
        var pd = {"username": user, "password": pwd};
        $.ajax({
            type: "post",
            url: "/",
            data: pd,
            cache: false,
            success: function (data) {
                window.location.href = "/admin?user=" + data;
            },
            error: function () {
                alert("error!");
            }
        });
    });
    // focus: slide the mascot's hands up to cover its eyes
    $("#password").focus(function () {
        $("#left_hand").animate({
            left: "150",
            top: "-38"
        }, {
            step: function () {
                if (parseInt($("#left_hand").css("left")) > 140) {
                    $("#left_hand").attr("class", "left_hand");
                }
            }
        }, 2000);
        $("#right_hand").animate({
            right: "-64",
            top: "-38px"
        }, {
            step: function () {
                if (parseInt($("#right_hand").css("right")) > -70) {
                    $("#right_hand").attr("class", "right_hand");
                }
            }
        }, 2000);
    });
    // blur: restore the hands to their initial position
    $("#password").blur(function () {
        $("#left_hand").attr("class", "initial_left_hand");
        $("#left_hand").attr("style", "left:100px;top:-12px;");
        $("#right_hand").attr("class", "initial_right_hand");
        $("#right_hand").attr("style", "right:-112px;top:-12px");
    });
});

// function keyLogin() {
//     if (event.keyCode == 13)  // Enter key
//         document.getElementById('login').onclick()
// }
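[Editor's sketch, not a file in this repo: howie.js above POSTs {username, password}
to "/" and redirects to /admin?user=<response body>. A Tornado handler consistent
with that contract might look as follows; the handler name and the credential
check are assumptions, not the project's actual code in handlers/index.py.]

import tornado.web

def check_admin(username, password):
    # placeholder credential check; the real app would consult its admin table
    return bool(username) and bool(password)

class LoginHandler(tornado.web.RequestHandler):
    def post(self):
        username = self.get_argument("username")
        password = self.get_argument("password")
        if check_admin(username, password):
            self.write(username)  # howie.js redirects to /admin?user=<this body>
        else:
            self.set_status(403)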
--------------------------------------------------------------------------------
/static/js/newsManage.js:
--------------------------------------------------------------------------------
/**
 * Created by jeezy-lyoung on 16-8-4.
 */
$(document).ready(function () {

    $("#read_list").mouseover(function () {
        document.getElementById('read_list').style.backgroundColor = "#FFBBFF";
    });
    $("#read_list").mouseout(function () {
        document.getElementById('read_list').style.backgroundColor = "#FFAEB9";
    });

    $("#love_list").mouseover(function () {
        document.getElementById('love_list').style.backgroundColor = "#f9d5b0";
    });
    $("#love_list").mouseout(function () {
        document.getElementById('love_list').style.backgroundColor = "#f9c693";
    });

    $("#comment_list").mouseover(function () {
        document.getElementById('comment_list').style.backgroundColor = "#fbe8af";
    });
    $("#comment_list").mouseout(function () {
        document.getElementById('comment_list').style.backgroundColor = "#f7dd90";
    });

    $("#delete_but").mouseover(function () {
        document.getElementById('delete_but').style.backgroundColor = "#c4ddb9";
    });
    $("#delete_but").mouseout(function () {
        document.getElementById('delete_but').style.backgroundColor = "#b6d7a8";
    });

    /* adaptive height for the news list */
    var news_list_height = document.getElementById('news_list');
    // The original sniffed the UA for Firefox/Chrome/Safari/MSIE, but its final
    // else-branch ran for every non-IE browser and computed the same expression
    // as the IE branch, so this single line is what actually took effect:
    var show_height = (document.documentElement.scrollHeight > document.documentElement.clientHeight) ? document.documentElement.scrollHeight : document.documentElement.clientHeight;

    news_list_height.style.height = (show_height - 300) * 0.9 + "px";
    news_list_height.style.marginTop = (show_height - 300) * 0.1 + "px";
    news_list_height.style.marginBottom = (show_height - 300) * 0.1 + "px";
    //news_list_height.style.fontSize = ($(".each_news").height())*0.5 + "px";
    //alert($(".each_news").height());
});
--------------------------------------------------------------------------------
/system/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/README.md
--------------------------------------------------------------------------------
/system/classPredict/NavieBayesInfo/predict_result.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/NavieBayesInfo/predict_result.txt
--------------------------------------------------------------------------------
/system/classPredict/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__init__.py
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/__init__.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/__init__.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/main.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/main.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/main.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/main.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/newsPredict.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/newsPredict.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/newsPredict.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/newsPredict.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/predictTool.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/predictTool.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/predictTool.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/predictTool.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/main.py:
--------------------------------------------------------------------------------
from system.classPredict.newsPredict import NewPredict
from system.classPredict.predictTool import NavieBayesPredict
from methods.pDb import newsDb
from config.n_conf import dirPath

# News items awaiting classification: each entry holds the id and the body text
data_list = []

test_data_file = dirPath + "/system/classPredict/NavieBayesInfo/predict_new_word.txt"
model_file = dirPath + "/system/classPredict/NavieBayesInfo/model.txt"
result_file = dirPath + "/system/classPredict/NavieBayesInfo/predict_result.txt"


def startPredict():
    db = newsDb()
    try:
        datasql = "select news_id,text_content from get_news where is_old = 0"
        data = db.select_table_three(datasql)
        for d in data:
            data_list.append({"id": d[0], "textContent": d[1]})
    except Exception as e:
        print(e)

    np = NewPredict(data_list)
    np.getNewInfo()
    nb = NavieBayesPredict(test_data_file, model_file, result_file)
    nb.predict()

# startPredict()
--------------------------------------------------------------------------------
/system/classPredict/newsPredict.py:
--------------------------------------------------------------------------------
import jieba
import jieba.analyse
from config.n_conf import dirPath


class NewPredict(object):
    def __init__(self, data_list):
        self.data_list = data_list
        self.ft = open(dirPath + "/system/classPredict/NavieBayesInfo/predict_new_word.txt", 'w')
        # word -> unique-id dictionary
        self.word_id_dict = {}
        # populate word_id_dict from word_id_dict.txt
        self.loadWord_id_dict()

    # Write one line per article to the predict file,
    # format: <news_id> <word1_id> <word2_id> <word3_id> ...
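    # e.g. a line might read (word ids hypothetical):
    #     e16931cd6f3fe68662a7 102 57 -1 913
    # with -1 marking a feature word absent from word_id_dict (see below)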
    def getNewInfo(self):
        for new in self.data_list:
            new_id = new["id"]
            textContent = new["textContent"]
            if textContent is None:
                continue

            feature = jieba.analyse.extract_tags(textContent, 15)
            # feature-word ids of the current article
            word_id_list = []
            for word in feature:
                tmp = self.word_id_dict.get(word)
                if tmp is None:
                    word_id_list.append("-1")
                else:
                    word_id_list.append(str(tmp))

            # append this article to the predict file
            self.writeFeature(new_id, word_id_list)
        # release the file handle
        self.ft.close()

    def writeFeature(self, new_id, word_id_list):
        self.ft.write(new_id + ' ')
        for word_id in word_id_list:
            self.ft.write(word_id + ' ')
        self.ft.write('\n')

    def loadWord_id_dict(self):
        # word_id_dict.txt is whitespace-separated "word id" pairs
        with open(dirPath + "/system/classPredict/NavieBayesInfo/word_id_dict.txt", 'r') as fd:
            arr = fd.read().strip().split()
        for i in range(0, len(arr), 2):
            self.word_id_dict[arr[i]] = arr[i + 1]


# np = NewPredict([])
# np.loadWord_id_dict()
--------------------------------------------------------------------------------
/system/classPredict/test.py:
--------------------------------------------------------------------------------
import os

from spider.wordAna.contentTool import *
from newsPredict import *
from predictTool import *

ct = ContentOperator()
# sample Toutiao article urls, one per category:
# urlstr = "http://toutiao.com/a6292379445978808577/"
# urlstr = "http://toutiao.com/a6292665412145365250/"
# urlstr = "http://toutiao.com/group/6292605706580803842/"  # cars
# urlstr = "http://toutiao.com/a6291989690913620225/"  # essays
# urlstr = "http://toutiao.com/a6292346062074691841/"  # games
# urlstr = "http://toutiao.com/a6280854429832233218/"  # tech
# urlstr = "http://toutiao.com/group/6291592935427604737/"  # stories
# urlstr = "http://toutiao.com/a6292516516223680770/"  # wellness
# urlstr = "http://toutiao.com/a6292528444404973826/"  # history
# urlstr = "http://toutiao.com/a6292557068092080386/"  # food
# urlstr = "http://toutiao.com/a6292374615201153537/"  # discovery
# urlstr = "http://toutiao.com/a6292035179544412417/"  # fashion
# urlstr = "http://toutiao.com/a6292511961298059521/"  # travel
urlstr = "http://toutiao.com/a6292830759922729218/"  # parenting

textContent, htmlContent, img_url_list = ct.getToutiaoContent(urlstr)

data_list = [{"id": "1", "textContent": textContent}]
np = NewPredict(data_list)
np.getNewInfo()

test_data_file = os.path.abspath('.') + "/NavieBayesInfo/predict_new_word.txt"
model_file = os.path.abspath('.') + "/NavieBayesInfo/model.txt"
result_file = os.path.abspath('.') + "/NavieBayesInfo/predict_result.txt"
print(test_data_file)
nb = NavieBayesPredict(test_data_file, model_file, result_file)
nb.predict()
--------------------------------------------------------------------------------
/system/classPredict/trainData/2016-06-06-13-09-44&news_fashion.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/trainData/2016-06-06-13-09-44&news_fashion.xlsx
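[Editor's sketch, not a file in this repo: the word_id_dict.txt consumed by
newsPredict.loadWord_id_dict above is whitespace-separated "word id" pairs.
Given a vocabulary list, it could be (re)generated like this; the function
name is an assumption.]

def dump_word_id_dict(words, path):
    # one "word id" pair per line; ids are simply the enumeration order
    with open(path, 'w', encoding='utf-8') as f:
        for idx, word in enumerate(words):
            f.write("%s %d\n" % (word, idx))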
--------------------------------------------------------------------------------
/system/latentFactor/README.md:
--------------------------------------------------------------------------------
Draft of the UPDATE that writes a user's per-tag scores back to user_tag_score,
with one assignment per column:

"update user_tag_score set " \
"news_baby = '" + tag_list_score['news_baby'] + "'," \
"news_entertainment = '" + tag_list_score['news_entertainment'] + "'," \
"news_discovery = '" + tag_list_score['news_discovery'] + "'," \
"news_history = '" + tag_list_score['news_history'] + "'," \
"news_society = '" + tag_list_score['news_society'] + "'," \
"news_game = '" + tag_list_score['news_game'] + "'," \
"news_sports = '" + tag_list_score['news_sports'] + "'," \
"news_car = '" + tag_list_score['news_car'] + "'," \
"news_essay = '" + tag_list_score['news_essay'] + "'," \
"news_tech = '" + tag_list_score['news_tech'] + "'," \
"news_military = '" + tag_list_score['news_military'] + "'," \
"news_travel = '" + tag_list_score['news_travel'] + "'," \
"news_fashion = '" + tag_list_score['news_fashion'] + "'," \
"news_regimen = '" + tag_list_score['news_regimen'] + "'," \
"news_story = '" + tag_list_score['news_story'] + "'," \
"news_finance = '" + tag_list_score['news_finance'] + "'," \
"news_food = '" + tag_list_score['news_food'] + "'," \
"news_world = '" + tag_list_score['news_world'] + "'" \
" where user_id = '" + user_id + "'"
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneCalcul.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneCalcul.cpython-34.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneCalcul.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneCalcul.cpython-35.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneNewsType.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneNewsType.cpython-34.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneNewsType.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneNewsType.cpython-35.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneUserType.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneUserType.cpython-34.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneUserType.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneUserType.cpython-35.pyc
--------------------------------------------------------------------------------
/system/latentFactor/dbTool.py:
--------------------------------------------------------------------------------
# -*-coding:utf-8-*-

import pymysql
--------------------------------------------------------------------------------
/system/latentFactor/geneNewsType.py:
--------------------------------------------------------------------------------
# -*-coding:utf-8-*-

from methods.pDb import newsDb


class NewsTagDataTool(object):
    def __init__(self):
        # news ids, in the row order of the matrix
        self.new_id_list = []
        self.newsTagMat = []
        # news id -> category
        self.news_type_dict = {}

    def getData(self):
        try:
            db = newsDb()
            data = db.select_table_two(table="news_tag_deep", column="*")
            for item in data:
                # first column is the news id
                self.new_id_list.append(item[0])
                # per-tag weight factors of this article; column order
                # follows the table schema
                tagsWeight = list(item[1:])
                self.newsTagMat.append(tagsWeight)

            datasql = "select news_id,tag from get_news where is_old = 0"
            data = db.select_table_three(datasql)
            for item in data:
                # record each news id and its category
                self.news_type_dict[item[0]] = item[1]

            return self.news_type_dict, self.new_id_list, self.newsTagMat
        except Exception as e:
            print(e)

# ntTool = NewsTagDataTool()
# x, y, z = ntTool.getData()
--------------------------------------------------------------------------------
/system/latentFactor/geneUserType.py:
--------------------------------------------------------------------------------
# -*-coding:utf-8-*-

from methods.pDb import newsDb

'''Builds the user-tag latent-factor matrix; multiplied with the news-tag
latent-factor matrix it yields the recommended news.'''
'''Data source: the user-behaviour table. Each user's score per tag is
computed from their behaviour and stored in the tag-preference table.'''


class UserTagDataTool(object):
    def __init__(self):
        # user ids, in the row order of the matrix
        self.user_id_list = []
        # per-user preference weights for each category
        self.userTagMat = []

    def getData(self):
        try:
            db = newsDb()
            data = db.select_table_two(table="user_tag_score", column="*")
            for item in data:
                # first column is the user id
                self.user_id_list.append(item[0])
                # the user's score per category; column order follows the table schema
                tagsScore = []
                curSum = 0.0
                for score in item[1:]:
                    tmp = 1.0 if score is None else float(score)
                    curSum = curSum + tmp
                    tagsScore.append(tmp)

                # normalise the scores into weights that sum to 1
                tagsWeight = [s / curSum for s in tagsScore]
                self.userTagMat.append(tagsWeight)

            return self.user_id_list, self.userTagMat
        except Exception as e:
            print(e)

# gut = UserTagDataTool()
# x, y = gut.getData()
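[Editor's sketch, not a file in this repo: how the two matrices above combine.
user_mat rows are the per-user tag weights from UserTagDataTool, news_mat rows
the per-news tag factors from NewsTagDataTool; numpy and an identical tag-column
order in both matrices are assumptions.]

import numpy as np

def recommend(user_ids, user_mat, news_ids, news_mat, top_n=10):
    # score[u, n] = sum over tags of user_weight * news_weight
    scores = np.asarray(user_mat, dtype=float) @ np.asarray(news_mat, dtype=float).T
    result = {}
    for u, row in zip(user_ids, scores):
        best = np.argsort(row)[::-1][:top_n]  # highest-scoring articles first
        result[u] = [news_ids[i] for i in best]
    return result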
--------------------------------------------------------------------------------
/system/pointsAlo/__pycache__/scoreSetting.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/pointsAlo/__pycache__/scoreSetting.cpython-34.pyc
--------------------------------------------------------------------------------
/system/pointsAlo/__pycache__/scoreSetting.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/pointsAlo/__pycache__/scoreSetting.cpython-35.pyc
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
[The admin login page. Its HTML markup was lost when this dump was rendered;
only the visible text survives: the page title "新闻推荐系统后台" (file line 5)
and the "登录" login button label (file line 32).]
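[Editor's sketch, not a file in this repo: every template below extends
main.html, which does not appear in this section. A minimal skeleton consistent
with the blocks they fill (header, left-nav, right) might look like this; the
element classes are assumptions.]

<!DOCTYPE html>
<html>
<head>
    {% block header %}{% end %}
</head>
<body>
    <nav class="left-nav">{% block left-nav %}{% end %}</nav>
    <section class="right">{% block right %}{% end %}</section>
</body>
</html>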
--------------------------------------------------------------------------------
/templates/newsManage.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}
{{ header }}
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[table markup lost in this dump (file lines 49-114); surviving content of the "用户最多评论" panel:]
新闻编号 | 新闻标题 | 新闻来源 | 发表时间 | 阅读人数 | 喜欢人数 | 评论人数
e16931cd6f3fe68662a7 | 司马懿怒挖诸葛亮坟,终于跪下了 | 新浪新闻 | 2016-08-07 15:12:09 | 25 | 10 | 5
e16931cd6f3fe68662a7 | 司马懿怒挖诸葛亮坟,终于跪下了 | 新浪新闻 | 2016-08-07 15:12:09 | 25 | 10 | 5
[action buttons: 最多阅读 / 最多喜欢 / 最多评论 / 新闻去旧]
{% end %}
--------------------------------------------------------------------------------
/templates/spider.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}
{% block header %}
<title>{{ header }}</title>
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 50-55)]
{% end %}
--------------------------------------------------------------------------------
/templates/system.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}
{% block header %}
{{ header }}
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 46-51)]
{% end %}
--------------------------------------------------------------------------------
/templates/umFeedBack.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}

{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 48-110)]
{% end %}
--------------------------------------------------------------------------------
/templates/umMyNote.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}

{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 48-71)]
{% end %}
--------------------------------------------------------------------------------
/templates/userManage.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}
{{ header }}
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 48-99)]
{% end %}
--------------------------------------------------------------------------------