├── .gitignore ├── README.md ├── __init__.py ├── application.py ├── config ├── .DS_Store ├── __init__.py ├── myNewsApi.conf └── n_conf.py ├── controller ├── __init__.py ├── dataController.py └── newsController.py ├── cookie_secret.py ├── doc ├── source │ ├── ERDDiagram.jpg │ ├── news.png │ ├── 推荐新闻.png │ ├── 新闻.png │ ├── 新闻分数.png │ ├── 新闻基本信息.png │ ├── 新闻标签因子.png │ ├── 标签喜欢程度.png │ ├── 用户.png │ ├── 用户基本信息.png │ ├── 用户操作.png │ └── 用户行为信息.png ├── 互联网内容推荐系统需求分析.md ├── 互联网推荐系统API分析.md ├── 新闻推荐系统后台管理系统.md └── 新闻推荐系统数据库设计.md ├── handlers ├── UmFeedBack.py ├── UmMyNote.py ├── __init__.py ├── admin.py ├── api │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-34.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── newsApi.cpython-34.pyc │ │ └── newsApi.cpython-35.pyc │ └── newsApi.py ├── base.py ├── changePass.py ├── dataAna.py ├── dataOperator.py ├── errorHandler.py ├── index.py ├── newsManage.py ├── spider.py ├── system.py └── userManage.py ├── log.txt ├── methods ├── .DS_Store ├── __init__.py └── pDb.py ├── myNews.py ├── myNewsApi.log ├── server.py ├── spider ├── .DS_Store ├── __init__.py ├── allSource │ ├── .DS_Store │ ├── README.md │ ├── __all__ │ │ └── README.md │ ├── news_baby │ │ └── README.md │ ├── news_car │ │ └── README.md │ ├── news_discovery │ │ └── README.md │ ├── news_entertainment │ │ └── README.md │ ├── news_essay │ │ └── README.md │ ├── news_fashion │ │ └── README.md │ ├── news_finance │ │ └── README.md │ ├── news_food │ │ └── README.md │ ├── news_game │ │ └── README.md │ ├── news_history │ │ └── README.md │ ├── news_hot │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_regimen │ │ └── README.md │ ├── news_society │ │ └── README.md │ ├── news_sports │ │ └── README.md │ ├── news_story │ │ └── README.md │ ├── news_tech │ │ └── README.md │ ├── news_travel │ │ └── README.md │ └── news_world │ │ └── README.md ├── allSpider.py ├── mergeExcel.py ├── newsDb │ ├── __init__.py │ └── insertNews.py ├── pyspider │ ├── __init__.py │ ├── data │ │ ├── project.db │ │ ├── result.db │ │ ├── scheduler.1d │ │ ├── scheduler.1h │ │ ├── scheduler.all │ │ └── task.db │ ├── database │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── basedb.py │ │ ├── elasticsearch │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── local │ │ │ ├── __init__.py │ │ │ └── projectdb.py │ │ ├── mongodb │ │ │ ├── __init__.py │ │ │ ├── mongodbbase.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── mysql │ │ │ ├── __init__.py │ │ │ ├── mysqlbase.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── redis │ │ │ ├── __init__.py │ │ │ └── taskdb.py │ │ ├── sqlalchemy │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ ├── sqlalchemybase.py │ │ │ └── taskdb.py │ │ └── sqlite │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ ├── sqlitebase.py │ │ │ └── taskdb.py │ ├── fetcher │ │ ├── __init__.py │ │ ├── cookie_utils.py │ │ ├── phantomjs_fetcher.js │ │ └── tornado_fetcher.py │ ├── libs │ │ ├── ListIO.py │ │ ├── __init__.py │ │ ├── base_handler.py │ │ ├── bench.py │ │ ├── counter.py │ │ ├── dataurl.py │ │ ├── log.py │ │ ├── multiprocessing_queue.py │ │ ├── pprint.py │ │ ├── response.py │ │ ├── result_dump.py │ │ ├── sample_handler.py │ │ ├── url.py │ │ ├── utils.py │ │ └── wsgi_xmlrpc.py │ ├── logging.conf │ ├── message_queue │ │ ├── __init__.py │ │ ├── beanstalk.py │ │ ├── kombu_queue.py │ │ ├── rabbitmq.py │ │ 
└── redis_queue.py │ ├── processor │ │ ├── __init__.py │ │ ├── processor.py │ │ └── project_module.py │ ├── result │ │ ├── __init__.py │ │ └── result_worker.py │ ├── run.py │ ├── scheduler │ │ ├── __init__.py │ │ ├── scheduler.py │ │ ├── task_queue.py │ │ └── token_bucket.py │ └── webui │ │ ├── __init__.py │ │ ├── app.py │ │ ├── bench_test.py │ │ ├── debug.py │ │ ├── index.py │ │ ├── login.py │ │ ├── result.py │ │ ├── static │ │ ├── css_selector_helper.js │ │ ├── debug.css │ │ ├── debug.js │ │ ├── debug.less │ │ ├── index.css │ │ ├── index.js │ │ ├── index.less │ │ ├── result.css │ │ ├── result.less │ │ ├── splitter.js │ │ ├── task.css │ │ ├── task.less │ │ ├── tasks.css │ │ ├── tasks.less │ │ └── variable.less │ │ ├── task.py │ │ ├── templates │ │ ├── debug.html │ │ ├── helper.html │ │ ├── helper.js │ │ ├── index.html │ │ ├── result.html │ │ ├── task.html │ │ └── tasks.html │ │ └── webdav.py ├── pyspiderSource │ ├── .DS_Store │ ├── README.md │ ├── __all__ │ │ └── README.md │ ├── news_baby │ │ └── README.md │ ├── news_car │ │ └── README.md │ ├── news_discovery │ │ └── README.md │ ├── news_entertainment │ │ └── README.md │ ├── news_essay │ │ └── README.md │ ├── news_fashion │ │ └── README.md │ ├── news_finance │ │ └── README.md │ ├── news_food │ │ └── README.md │ ├── news_game │ │ └── README.md │ ├── news_history │ │ └── README.md │ ├── news_hot │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_regimen │ │ └── README.md │ ├── news_society │ │ └── README.md │ ├── news_sports │ │ └── README.md │ ├── news_story │ │ └── README.md │ ├── news_tech │ │ └── README.md │ ├── news_travel │ │ └── README.md │ └── news_world │ │ └── README.md ├── sina │ ├── README.md │ ├── __init__.py │ ├── sina.py │ └── sinaSpider.py ├── sinaSource │ ├── README.md │ ├── news_entertainment │ │ └── README.md │ ├── news_finance │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_society │ │ └── README.md │ ├── news_sports │ │ └── README.md │ ├── news_tech │ │ └── README.md │ └── news_world │ │ └── README.md ├── touTiaoSource │ ├── README.md │ ├── __all__ │ │ └── README.md │ ├── gallery_detail │ │ └── README.md │ ├── news_baby │ │ └── README.md │ ├── news_car │ │ ├── .DS_Store │ │ └── README.md │ ├── news_discovery │ │ └── README.md │ ├── news_entertainment │ │ ├── .DS_Store │ │ └── README.md │ ├── news_essay │ │ └── README.md │ ├── news_fashion │ │ └── README.md │ ├── news_finance │ │ ├── .DS_Store │ │ └── README.md │ ├── news_food │ │ └── README.md │ ├── news_game │ │ └── README.md │ ├── news_history │ │ └── README.md │ ├── news_hot │ │ └── README.md │ ├── news_military │ │ └── README.md │ ├── news_regimen │ │ └── README.md │ ├── news_society │ │ ├── .DS_Store │ │ └── README.md │ ├── news_sports │ │ ├── .DS_Store │ │ └── README.md │ ├── news_story │ │ └── README.md │ ├── news_tech │ │ ├── .DS_Store │ │ └── README.md │ ├── news_travel │ │ └── README.md │ ├── news_world │ │ └── README.md │ └── video │ │ └── README.md ├── toutiao │ ├── __init__.py │ ├── touTiao.py │ └── touTiaoSpider.py └── wordAna │ ├── .DS_Store │ ├── __init__.py │ ├── allNews │ ├── .DS_Store │ └── README.md │ ├── contentSpider.py │ ├── contentTool.py │ ├── excelTool.py │ └── wordAnaNews │ ├── .DS_Store │ └── README.md ├── static ├── css │ ├── admin.css │ ├── bootstrap.css │ ├── bootstrap.min.css │ ├── login.css │ ├── newsManage.css │ └── userManage.css ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.svg │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── 
glyphicons-halflings-regular.woff2 ├── images │ ├── 1.svg │ ├── admin.jpg │ ├── back.jpg │ ├── bg.jpg │ ├── bg.png │ ├── bgb.jpg │ ├── hand.png │ ├── left-handing.png │ ├── left_hand.png │ ├── news.png │ ├── password.png │ ├── right_hand.png │ ├── right_handing.png │ ├── save.svg │ ├── save0.svg │ ├── top_1.png │ ├── tou.png │ └── username.png └── js │ ├── admin.js │ ├── bootstrap.min.js │ ├── d3.js │ ├── d3.min.js │ ├── feedback.js │ ├── howie.js │ ├── jquery.min.js │ ├── newsManage.js │ └── userManage.js ├── system ├── README.md ├── classPredict │ ├── NavieBayesInfo │ │ ├── last_model.txt │ │ ├── model.txt │ │ ├── predict_new_word.txt │ │ ├── predict_result.txt │ │ ├── train_news_Info.txt │ │ └── word_id_dict.txt │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-34.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── main.cpython-34.pyc │ │ ├── main.cpython-35.pyc │ │ ├── newsPredict.cpython-34.pyc │ │ ├── newsPredict.cpython-35.pyc │ │ ├── predictTool.cpython-34.pyc │ │ └── predictTool.cpython-35.pyc │ ├── bayesTool.py │ ├── dataPrepare.py │ ├── main.py │ ├── newsPredict.py │ ├── predictTool.py │ ├── test.py │ └── trainData │ │ └── 2016-06-06-13-09-44&news_fashion.xlsx ├── latentFactor │ ├── README.md │ ├── __pycache__ │ │ ├── geneCalcul.cpython-34.pyc │ │ ├── geneCalcul.cpython-35.pyc │ │ ├── geneNewsType.cpython-34.pyc │ │ ├── geneNewsType.cpython-35.pyc │ │ ├── geneUserType.cpython-34.pyc │ │ └── geneUserType.cpython-35.pyc │ ├── dbTool.py │ ├── geneCalcul.py │ ├── geneNewsType.py │ ├── geneUserNews.py │ └── geneUserType.py └── pointsAlo │ ├── __pycache__ │ ├── scoreSetting.cpython-34.pyc │ └── scoreSetting.cpython-35.pyc │ └── scoreSetting.py ├── templates ├── admin.html ├── dataAna.html ├── index.html ├── main.html ├── newsManage.html ├── spider.html ├── system.html ├── umFeedBack.html ├── umMyNote.html └── userManage.html └── tools └── howie.sql /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignored files 2 | *.pyc 3 | 4 | # Ignored directories 5 | __pycache__/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Tornado News Data Management Platform 2 | 3 | ### Overview: 4 | 5 | **The code style and project structure have serious problems; the project is no longer maintained and is kept as a memento of my university years.** 6 | 7 | Simply `git clone https://github.com/howie6879/getNews` to your machine 8 | 9 | ### Description: 10 | 11 | Analyzes the collected news data, offers a graphical admin backend, and generates an API for the Android client 12 | 13 | 14 | ``` 15 | myNews 16 | Usage: myNews [-p <port>] 17 | 18 | Options: 19 | -h,--help show this help 20 | -p port number 21 | 22 | Example: 23 | myNews -p 8888 set the port to 8888 24 | ``` 25 | 26 |
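Besides the CLI above, the server can also be launched programmatically — a minimal sketch based on server.py further below (the port is an assumption taken from the README example, and the project's dependencies such as tornado, pymysql and docopt are assumed to be installed):

```python
# start the Tornado app without the docopt CLI wrapper
from server import main

main(8888)  # serves on http://127.0.0.1:8888, as server.py prints on startup
```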
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /application.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | import tornado.web 4 | import os 5 | from handlers.errorHandler import ErrorHandler 6 | from handlers.index import IndexHandler 7 | from handlers.admin import AdminHandler 8 | from handlers.dataAna import DataAna 9 | from handlers.spider import Spider 10 | from handlers.system import System 11 | from handlers.newsManage import NewsManage 12 | from handlers.UmFeedBack import UmFeedBack 13 | from handlers.UmMyNote import UmMyNote 14 | from handlers.userManage import UserManage 15 | from handlers.changePass import ChangePass 16 | from handlers.dataOperator import DataOperator 17 | import handlers.api.newsApi as api 18 | 19 | url = [ 20 | (r'/', IndexHandler), 21 | (r'/admin',AdminHandler), 22 | (r'/dataAna',DataAna), 23 | (r'/spider', Spider), 24 | (r'/system',System), 25 | (r'/newsManage',NewsManage), 26 | (r'/userManage',UserManage), 27 | (r'/umMyNote', UmMyNote), 28 | (r'/umFeedBack', UmFeedBack), 29 | (r'/changePass',ChangePass), 30 | (r'/dataOperator',DataOperator), 31 | (r'/api/register',api.Register), 32 | (r'/api/login', api.Login), 33 | (r'/api/newstags', api.NewsTags), 34 | (r'/api/newscontent', api.NewsContent), 35 | (r'/api/userinfo', api.UserInfo), 36 | (r'/api/userinfochange', api.UserInfoChange), 37 | (r'/api/lovenews', api.LoveNews), 38 | (r'/api/lovelist', api.LoveList), 39 | (r'/api/hotlist', api.HotList), 40 | (r'/api/feedback', api.FeedBack), 41 | (r'/api/keyword', api.KeyWord), 42 | (r'/api/comment', api.Comment), 43 | (r'/api/lovecomment', api.LoveComment), 44 | (r'/api/exitread', api.ExitRead), 45 | (r'/api/adminuser', api.AdminUser), 46 | (r'/api/adminuserinfo', api.AdminUserInfo), 47 | (r'/api/adminfeedback', api.AdminFeedback), 48 | (r'/api/returntags', api.ReturnTags), 49 | # this catch-all handler must stay last 50 | (r".*", ErrorHandler) 51 | ] 52 | 53 | setting = dict( 54 | template_path = os.path.join(os.path.dirname(__file__), "templates"), 55 | static_path = os.path.join(os.path.dirname(__file__), "static"), 56 | cookie_secret = "XQ5rhITaQ1m7HoN40CcggWPCvR2jqUn1tY9E3kWU+yc=", 57 | #xsrf_cookies = True, 58 | debug = True, 59 | login_url = '/', 60 | ) 61 | 62 | application = tornado.web.Application( 63 | handlers = url, 64 | **setting 65 | ) -------------------------------------------------------------------------------- /config/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/config/.DS_Store -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /config/myNewsApi.conf: -------------------------------------------------------------------------------- 1 | [program:myNewsApi] 2 | command =/root/anaconda3/bin/python /root/programmming/git/getNews/myNews.py -p 8884 3 | directory =/root/programmming/git/getNews 4 | user =root 5 | startsecs =3 6 | 7 | autostart=true 8 | autorestart=true 9 | 10 | redirect_stderr = true 11 | stdout_logfile_maxbytes = 50MB 12 | stdout_logfile_backups = 10 13 | stdout_logfile = /root/programmming/git/getNews/myNewsApi.log 14 | -------------------------------------------------------------------------------- /config/n_conf.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | 4 | 5 | admin = dict( 6 | WEBSITE=True, 7 | TOKEN="news&&admin" 8 | ) 9 | 10 | # local database configuration 11 | localDatabase = dict( 12 | host="127.0.0.1", 13 | user="root", 14 | password="", 15 | db="howie", 16 | charset="utf8", 17 | port=3306 18 | ) 19 | 20 | # path configuration 21 | #dirPath = "/home/howie/programming/python/getNews" 22 | dirPath = "/root/programming/git/getNews" 23 |
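For reference, the /api/* routes in application.py's routing table above can be exercised with any HTTP client; a hedged sketch follows (host, port and the lack of query parameters are assumptions — the expected parameters live in handlers/api/newsApi.py, whose bodies are not shown in this dump):

```python
import requests

BASE = "http://127.0.0.1:8888"  # assumed host/port, per the README example

# endpoints taken verbatim from the routing table above
hot = requests.get(BASE + "/api/hotlist")
tags = requests.get(BASE + "/api/returntags")
print(hot.status_code, tags.status_code)
```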
-------------------------------------------------------------------------------- /controller/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /controller/dataController.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import pandas as pd 4 | import os 5 | from config.n_conf import dirPath 6 | import controller.newsController as newsController 7 | 8 | 9 | 10 | class DataController(newsController.NewsController): 11 | def repeatedData(self, *dirs): 12 | """ 13 | func: deduplicate the crawled data 14 | :param *dirs: a list of directory names; dirs[0] holds the folder names, 2 by default 15 | :return: True on success, otherwise "No Data" 16 | """ 17 | self.initData = self.newsFiles("get", "allSource") 18 | if self.initData: 19 | for eachFile in self.initData: 20 | newsData = pd.read_excel(eachFile, sheetname="allNews") 21 | newsData = newsData.drop_duplicates() # drop duplicate rows 22 | # get the base path 23 | path = os.path.join(dirPath, 'spider') 24 | # build the path of the processed file 25 | for dir in dirs[0]: 26 | path = os.path.join(path, dir) 27 | filePath = os.path.join(path, os.path.split(eachFile)[1]) 28 | log = filePath + "文件去重成功" 29 | print(log) 30 | with open(dirPath+"/log.txt", 'a') as fp: 31 | fp.write(log + "\n") 32 | newsData.to_excel(excel_writer=filePath, sheet_name="allNews") 33 | return True 34 | else: 35 | return "No Data!" 36 | 37 | def rmAllNews(self, newsSource): 38 | for i in newsSource: 39 | self.newsFiles("rm", i) 40 | return self.rmRepeate(['wordAna', 'allNews']) 41 | 42 | 43 | newsSource = ["touTiaoSource", "sinaSource", "allSource","pyspiderSource"] 44 | DataController = DataController() 45 | # print(DataController.rmAllNews(newsSource)) # delete all raw data 46 | # print(DataController.initData) 47 | # print(DataController.initData) 48 | # DataController.rmRepeate(['wordAna','allNews']) # delete the spreadsheets in the dedup folder 49 | # DataController.rmRepeate(['wordAna','wordAnaNews']) # delete the spreadsheets in the word-analysis folder 50 | # print(DataController.repeatedData(['wordAna','allNews'])) # run deduplication
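repeatedData() above leans entirely on pandas' drop_duplicates; a self-contained sketch of that dedup step for illustration (the sample rows are made up — the real files come from spider/allSource and use an "allNews" sheet):

```python
import pandas as pd

# two rows mimic the same story crawled twice
news = pd.DataFrame({
    "title": ["A", "A", "B"],
    "news_link": ["http://x/1", "http://x/1", "http://x/2"],
})
deduped = news.drop_duplicates()       # same call repeatedData() makes
print(len(news), "->", len(deduped))   # 3 -> 2
```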
-------------------------------------------------------------------------------- /controller/newsController.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import os 4 | from config.n_conf import dirPath 5 | 6 | class NewsController(): 7 | """ 8 | System controller class 9 | """ 10 | 11 | def newsFiles(self, operator, sourceName): 12 | """ 13 | :func collect the crawled news Excel files under the spider/sourceName/ directory 14 | :param operator: "get" to collect files, "rm" to delete them 15 | sourceName: the news-site folder 16 | :return: "get" returns a list of file paths; after deletion, allFiles=False means the directory holds no files 17 | """ 18 | # locate the news directory 19 | path = os.path.join(os.path.join(dirPath, 'spider'), sourceName) 20 | allFiles = [] 21 | for dir in os.listdir(path): 22 | tarPath = os.path.join(path, dir) 23 | if os.path.isdir(tarPath): 24 | files = [file for file in os.listdir(tarPath) if 25 | os.path.isfile(os.path.join(tarPath, file)) and os.path.splitext(file)[1] == ".xlsx"] 26 | if files and operator == "get": 27 | for file in files: 28 | allFiles.append(os.path.join(tarPath, file)) 29 | # delete the raw data 30 | elif files and operator == "rm": 31 | for file in files: 32 | os.remove(os.path.join(tarPath, file)) 33 | log = os.path.join(tarPath, file) + "文件删除成功" 34 | print(log) 35 | with open(dirPath+"/log.txt", 'a') as fp: 36 | fp.write(log + "\n") 37 | if not allFiles: 38 | return False 39 | else: 40 | return allFiles 41 | 42 | def rmRepeate(self,*dirs): 43 | """ 44 | func: delete files that have already been deduplicated 45 | :param *dirs: a list of directory names; dirs[0] holds the folder names, 2 by default 46 | :return: True once deletion succeeds 47 | """ 48 | path = os.path.join(dirPath,'spider') 49 | # build the dedup data directory path 50 | for dir in dirs[0]: 51 | path = os.path.join(path,str(dir)) 52 | 53 | files = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path,file)) and os.path.splitext(file)[1] == ".xlsx"] 54 | for file in files: 55 | os.remove(os.path.join(path, file)) 56 | log = os.path.join(path, file) + "文件删除成功" 57 | print(log) 58 | with open(dirPath+"/log.txt", 'a') as fp: 59 | fp.write(log + "\n") 60 | return True 61 | -------------------------------------------------------------------------------- /cookie_secret.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import base64 4 | import uuid 5 | 6 | cookie_secret = base64.b64encode(uuid.uuid4().bytes + uuid.uuid4().bytes) 7 | #print(cookie_secret) -------------------------------------------------------------------------------- /doc/source/ERDDiagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/ERDDiagram.jpg -------------------------------------------------------------------------------- /doc/source/news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/news.png -------------------------------------------------------------------------------- /doc/source/推荐新闻.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/推荐新闻.png -------------------------------------------------------------------------------- /doc/source/新闻.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻.png -------------------------------------------------------------------------------- /doc/source/新闻分数.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻分数.png -------------------------------------------------------------------------------- /doc/source/新闻基本信息.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻基本信息.png -------------------------------------------------------------------------------- /doc/source/新闻标签因子.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/新闻标签因子.png -------------------------------------------------------------------------------- /doc/source/标签喜欢程度.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/标签喜欢程度.png -------------------------------------------------------------------------------- /doc/source/用户.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户.png -------------------------------------------------------------------------------- /doc/source/用户基本信息.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户基本信息.png -------------------------------------------------------------------------------- /doc/source/用户操作.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户操作.png -------------------------------------------------------------------------------- /doc/source/用户行为信息.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/doc/source/用户行为信息.png -------------------------------------------------------------------------------- /doc/新闻推荐系统后台管理系统.md: -------------------------------------------------------------------------------- 1 | ## News Recommendation System Admin Backend Documentation 2 | 3 | ### 1. Requirements Analysis 4 | 5 | ##### Goal 6 | 7 | Manage the data through a graphical interface 8 | 9 | ### 2. Database Design 10 | 11 | #### 2.1. User table (admin.user) 12 | 13 | | Column | Type | Nullable | Notes | 14 | | :--: | :---------: | :---------------------: | :--: | 15 | | id | int | not null auto_increment | PK | 16 | | name | varchar(10) | not null | admin name | 17 | | pass | varchar(40) | not null | password | 18 | 19 | ```mysql 20 | -- Table: user 21 | CREATE TABLE `n_admin` ( 22 | 23 | `id` int(11) NOT NULL AUTO_INCREMENT, 24 | 25 | `name` varchar(10) NOT NULL, 26 | 27 | `pass` varchar(40) NOT NULL, 28 | 29 | PRIMARY KEY (`id`) 30 | 31 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 32 | ``` 33 | ### 3. System Setup 34 | 35 | 36 | 37 | 38 | 39 |
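The pass column stores an MD5 of TOKEN + password, as handlers/index.py and handlers/changePass.py below show; a hedged sketch for seeding the admin row (the initial password is hypothetical, and the username 'admin' is inferred from changePass.py's UPDATE statement):

```python
import hashlib

TOKEN = "news&&admin"   # from config/n_conf.py
password = "secret"     # hypothetical initial password
digest = hashlib.md5((TOKEN + password).encode("utf-8")).hexdigest()
# emit the seed statement for tools/howie.sql-style setup
print("insert into n_admin (name, pass) values ('admin', '{}');".format(digest))
```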
-------------------------------------------------------------------------------- /handlers/UmFeedBack.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class UmFeedBack(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | 12 | 13 | self.render("umFeedBack.html") -------------------------------------------------------------------------------- /handlers/UmMyNote.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class UmMyNote(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | 12 | 13 | self.render("umMyNote.html") -------------------------------------------------------------------------------- /handlers/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /handlers/admin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.web 5 | import tornado.escape 6 | from methods.pDb import newsDb 7 | from handlers.base import BaseHandler 8 | 9 | 10 | class AdminHandler(BaseHandler): 11 | @tornado.web.authenticated 12 | def get(self, *args, **kwargs): 13 | user = self.get_argument("user") 14 | if user == "logout": 15 | self.clear_cookie("user") 16 | self.render("index.html") 17 | else: 18 | header = "新闻推荐系统后台" 19 | cateType = {"news_society":"社会", "news_entertainment":"娱乐","news_tech":"科技", "news_car":"汽车", "news_sports":"体育", "news_finance":"财经", 20 | "news_military":"军事", "news_world":"国际","news_fashion":"时尚", "news_travel":"旅游", "news_discovery":"探索", "news_baby":"育儿", 21 | "news_regimen":"养生", "news_story":"故事","news_essay":"美文", "news_game":"游戏", "news_history":"历史", "news_food":"美食"} 22 | numTag = {} 23 | for i in cateType.keys(): 24 | mSql = newsDb() 25 | result = mSql.select_table(table="get_news",column="count(*)",condition="tag",value=i) 26 | numTag[cateType[i]]=result[0][0] 27 | # sort tags by article count, descending 28 | sortTag = list(sorted(numTag.items(), key=lambda d:d[1], reverse = True)) 29 | self.render("admin.html", header=header, numTag=numTag,sortTag=sortTag[0:7]) 30 | -------------------------------------------------------------------------------- /handlers/api/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /handlers/api/__pycache__/__init__.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /handlers/api/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /handlers/api/__pycache__/newsApi.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/newsApi.cpython-34.pyc -------------------------------------------------------------------------------- /handlers/api/__pycache__/newsApi.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/handlers/api/__pycache__/newsApi.cpython-35.pyc -------------------------------------------------------------------------------- /handlers/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.web 5 | 6 | 7 | class BaseHandler(tornado.web.RequestHandler): 8 | def get_current_user(self): 9 | return self.get_secure_cookie("user") 10 | 11 | def write_error(self, status_code, **kwargs): 12 | self.write("错误页面,状态码{0}.\n".format( 13 | status_code)) 14 | 15 | def output(self): 16 | self.write("hi") 17 | print("hi") 18 |
-------------------------------------------------------------------------------- /handlers/changePass.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | import hashlib 6 | from methods.pDb import newsDb 7 | from config.n_conf import admin 8 | from handlers.base import BaseHandler 9 | 10 | 11 | class ChangePass(BaseHandler): 12 | @tornado.web.authenticated 13 | def get(self, *args, **kwargs): 14 | password = self.get_argument("pass") 15 | password = str(hashlib.md5((admin["TOKEN"] + password).encode("utf-8")).hexdigest()) 16 | sql = "update n_admin set pass='" + password + "' where name = 'admin'" # build the SQL statement to execute 17 | mSql = newsDb() 18 | if mSql.exeSql(sql): 19 | self.write("密码修改成功") 20 | else: 21 | self.write("密码修改失败") 22 | -------------------------------------------------------------------------------- /handlers/dataAna.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class DataAna(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "数据分析" 12 | self.render("dataAna.html",header=header) -------------------------------------------------------------------------------- /handlers/dataOperator.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | from spider import allSpider 7 | from controller.dataController import DataController, newsSource 8 | from spider.newsDb.insertNews import newsInsert 9 | from system.classPredict.main import startPredict 10 | from system.latentFactor.geneCalcul import GeneCulcal 11 | from methods.pDb import newsDb 12 | 13 | 14 | class DataOperator(BaseHandler): 15 | @tornado.web.authenticated 16 | def get(self, *args, **kwargs): 17 | # news categories 18 | action = self.get_argument('action') 19 | if action == "getNews": 20 | page = int(self.get_argument('page')) 21 | num = int(self.get_argument('num')) 22 | cate = ["__all__","news_hot","news_society", "news_entertainment", 23 | "news_tech", "news_car", "news_sports", "news_finance", "news_military", "news_world", 24 | "news_fashion", "news_travel", "news_discovery", "news_baby", "news_regimen", "news_story", 25 | "news_essay", "news_game", "news_history", "news_food"] 26 | allSpider.touTiao(category=cate, page=page, num=num) 27 | allSpider.sina(num=1000, page=1) 28 | print("success") 29 | elif action == "repeatedData": 30 | # merge first 31 | allSpider.merge() 32 | # then deduplicate 33 | print(DataController.repeatedData(['wordAna', 'allNews'])) 34 | print("success") 35 | elif action == "anaData": 36 | # run the word analysis 37 | allSpider.wordAna() 38 | 39 | elif action == "rmAllNews": 40 | DataController.rmAllNews(newsSource) 41 | print("success") 42 | elif action == "insertDB": 43 | # purge stale data 44 | db = newsDb() 45 | db.exeSql("delete from news_tag_deep") 46 | db.exeSql("delete from news_nums") 47 | #db.exeSql("delete from get_news where is_old=0") 48 | db.exeSql("insert into news_nums select * from news_nums_view") 49 | # insert the news into the database 50 | newsInsert.insertSql("wordAnaNews") 51 | # remove the spreadsheets in the word-analysis folder 52 | DataController.rmRepeate(['wordAna', 'wordAnaNews']) 53 | startPredict() 54 | gc = GeneCulcal() 55 | gc.getMatData() 56 | print("success") 57 |
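DataOperator above chains the crawl → dedup → analyze → insert pipeline behind GET parameters; a hedged sketch of driving it over HTTP (the port and the login credentials are assumptions — the handler is @authenticated, so a login cookie from IndexHandler.post is required first):

```python
import requests

BASE = "http://127.0.0.1:8888"  # assumed host/port
session = requests.Session()
# hypothetical credentials; a successful POST to "/" sets the secure cookie
session.post(BASE + "/", data={"username": "admin", "password": "secret"})
for action in ("getNews", "repeatedData", "anaData", "insertDB"):
    params = {"action": action}
    if action == "getNews":
        params.update(page=2, num=20)  # the same arguments the handler reads
    session.get(BASE + "/dataOperator", params=params)
```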
9 | self.write("错误状态码{0}.\n".format( 10 | status_code)) -------------------------------------------------------------------------------- /handlers/index.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.escape 5 | import hashlib 6 | from methods.pDb import newsDb 7 | from config.n_conf import admin 8 | from handlers.base import BaseHandler 9 | 10 | class IndexHandler(BaseHandler): 11 | def get(self): 12 | self.clear_cookie("user") 13 | self.render("index.html") if admin["WEBSITE"] else self.write("

网站正在维护...

") 14 | 15 | def post(self): 16 | username = self.get_argument("username") 17 | password = self.get_argument("password") 18 | mSql = newsDb() 19 | result = mSql.select_table("n_admin", "*", "name", username) 20 | if result: 21 | db_pwd = result[0][2] 22 | password = hashlib.md5((admin["TOKEN"]+password).encode("utf-8")).hexdigest() 23 | if db_pwd == password: 24 | self.set_current_user(username) #将当前用户名写入cookie 25 | self.write(username) 26 | else: 27 | self.clear_cookie("user") 28 | self.write("-1") 29 | else: 30 | self.clear_cookie("user") 31 | self.write("-1") 32 | 33 | def set_current_user(self,user): 34 | if user: 35 | self.set_secure_cookie('user',tornado.escape.json_encode(user)) 36 | else: 37 | self.clear_cookie("user") 38 | -------------------------------------------------------------------------------- /handlers/newsManage.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class NewsManage(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "新闻管理" 12 | self.render("newsManage.html",header=header) 13 | 14 | -------------------------------------------------------------------------------- /handlers/spider.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class Spider(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "爬虫管理" 12 | self.render("spider.html",header=header) -------------------------------------------------------------------------------- /handlers/system.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class System(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "系统信息" 12 | self.render("system.html",header=header) -------------------------------------------------------------------------------- /handlers/userManage.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import tornado.web 4 | import tornado.escape 5 | from handlers.base import BaseHandler 6 | 7 | class UserManage(BaseHandler): 8 | 9 | @tornado.web.authenticated 10 | def get(self, *args, **kwargs): 11 | header = "用户管理" 12 | 13 | self.render("userManage.html",header=header) -------------------------------------------------------------------------------- /methods/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/methods/.DS_Store -------------------------------------------------------------------------------- /methods/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /methods/pDb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie,jeezy' 3 | 4 | import pymysql 
-------------------------------------------------------------------------------- /methods/pDb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie,jeezy' 3 | 4 | import pymysql 5 | from config.n_conf import localDatabase 6 | 7 | 8 | class newsDb(object): 9 | """ 10 | connect mysql 11 | """ 12 | 13 | def __init__(self): 14 | self.conn = pymysql.connect(**localDatabase) 15 | self.cur = self.conn.cursor() 16 | 17 | def select_table(self, table, column, condition, value): 18 | sql = "select " + column + " from " + table + " where " + condition + "= '" + value + "'" 19 | print(sql) 20 | self.cur.execute(sql) 21 | lines = self.cur.fetchall() 22 | return lines 23 | 24 | def select_table_two(self, table, column): 25 | sql = "select " + column + " from " + table 26 | print(sql) 27 | self.cur.execute(sql) 28 | lines = self.cur.fetchall() 29 | return lines 30 | 31 | def select_table_three(self,sql): 32 | print(sql) 33 | self.cur.execute(sql) 34 | lines = self.cur.fetchall() 35 | return lines 36 | 37 | def insert_table(self, table, field, values): 38 | sql = "insert into " + table + field + " values" + values 39 | print(sql) 40 | try: 41 | self.cur.execute(sql) 42 | # commit to the database 43 | self.conn.commit() 44 | return True 45 | except: 46 | # roll back on error 47 | self.conn.rollback() 48 | return False 49 | 50 | def update_column(self, table, column, value_set, condition, value_find): 51 | sql = "update " + table + " set " + column + "= '" + value_set + "' where " + condition + "='" + value_find + "'" 52 | print(sql) 53 | try: 54 | self.cur.execute(sql) 55 | self.conn.commit() 56 | return True 57 | except: 58 | self.conn.rollback() 59 | return False 60 | 61 | 62 | def exeSql(self,sql): 63 | print(sql) 64 | try: 65 | self.cur.execute(sql) 66 | self.conn.commit() 67 | return True 68 | except: 69 | self.conn.rollback() 70 | return False 71 | 72 | def __del__(self): 73 | self.cur.close() 74 | self.conn.close() 75 | -------------------------------------------------------------------------------- /myNews.py: -------------------------------------------------------------------------------- 1 | """myNews 2 | 3 | Usage: myNews [-p <port>] 4 | 5 | Options: 6 | -h,--help show this help 7 | -p port number 8 | 9 | Example: 10 | myNews -p 8888 set the port to 8888 11 | """ 12 | 13 | from docopt import docopt 14 | from server import main 15 | 16 | 17 | def cli(): 18 | kwargs = docopt(__doc__) 19 | port = kwargs['<port>'] 20 | main(port) 21 | 22 | 23 | if __name__ == "__main__": 24 | cli() 25 | -------------------------------------------------------------------------------- /myNewsApi.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/myNewsApi.log -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Howie' 3 | 4 | import tornado.options 5 | import tornado.ioloop 6 | from application import application 7 | 8 | 9 | def main(port): 10 | #tornado.options.parse_command_line() 11 | application.listen(port) 12 | print("Development server is running at http://127.0.0.1:%s" % port) 13 | print("Quit the server with Control-C") 14 | tornado.ioloop.IOLoop.instance().start() 15 | 16 | #main(8888) -------------------------------------------------------------------------------- /spider/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/.DS_Store
-------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/allSource/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/.DS_Store -------------------------------------------------------------------------------- /spider/allSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/README.md -------------------------------------------------------------------------------- /spider/allSource/__all__/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/__all__/README.md -------------------------------------------------------------------------------- /spider/allSource/news_baby/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_baby/README.md -------------------------------------------------------------------------------- /spider/allSource/news_car/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_car/README.md -------------------------------------------------------------------------------- /spider/allSource/news_discovery/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_discovery/README.md -------------------------------------------------------------------------------- /spider/allSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/allSource/news_essay/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_essay/README.md -------------------------------------------------------------------------------- /spider/allSource/news_fashion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_fashion/README.md -------------------------------------------------------------------------------- /spider/allSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_finance/README.md 
-------------------------------------------------------------------------------- /spider/allSource/news_food/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_food/README.md -------------------------------------------------------------------------------- /spider/allSource/news_game/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_game/README.md -------------------------------------------------------------------------------- /spider/allSource/news_history/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_history/README.md -------------------------------------------------------------------------------- /spider/allSource/news_hot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_hot/README.md -------------------------------------------------------------------------------- /spider/allSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_military/README.md -------------------------------------------------------------------------------- /spider/allSource/news_regimen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_regimen/README.md -------------------------------------------------------------------------------- /spider/allSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_society/README.md -------------------------------------------------------------------------------- /spider/allSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/allSource/news_story/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_story/README.md -------------------------------------------------------------------------------- /spider/allSource/news_tech/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/allSource/news_travel/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_travel/README.md -------------------------------------------------------------------------------- /spider/allSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/allSource/news_world/README.md -------------------------------------------------------------------------------- /spider/allSpider.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | import time 4 | import os 5 | import spider.toutiao.touTiaoSpider as ts 6 | import spider.sina.sinaSpider as ss 7 | import spider.mergeExcel as me 8 | import spider.wordAna.contentSpider as cs 9 | from config.n_conf import dirPath 10 | 11 | 12 | ss.cate = ["news_world", "news_sports", "news_finance", "news_society", "news_entertainment", "news_military", 13 | "news_tech"] 14 | 15 | 16 | def touTiao(category, page, num): 17 | # crawl Toutiao news 18 | for cate in category: 19 | ts.getToutiaoNews(cate, page, num) 20 | 21 | 22 | def sina(num=1000, page=1, type=ss.cate): 23 | # crawl Sina news 24 | ss.getSinaNews(num, page, type) 25 | 26 | def merge(): 27 | # merge the crawled news files 28 | mainPath = os.path.join(dirPath,'spider') 29 | secondPath = os.path.join(mainPath,'allSource') 30 | mergeExel = me.mergeExcel() 31 | mergeExel.merge(mainPath,secondPath) 32 | 33 | def wordAna(): 34 | cs.getNewsContent() 35 | 36 | def insertNews(): 37 | pass 38 | #touTiao(category=ts.category, page=2, num=20, time=time.time()) 39 | #sina() 40 | #merge() 41 | #wordAna() -------------------------------------------------------------------------------- /spider/newsDb/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/newsDb/insertNews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | __author__ = 'Jeezy' 3 | 4 | import os 5 | import time 6 | import hashlib 7 | from methods.pDb import newsDb 8 | import random 9 | import pandas as pd 10 | from config.n_conf import dirPath 11 | 12 | 13 | class newsInsert: 14 | def __init__(self): 15 | pass 16 | 17 | def insertSql(self, mainPath): 18 | path = dirPath + "/spider/wordAna/" + mainPath 19 | for dir in os.listdir(path): 20 | if os.path.splitext(dir)[1] == ".xlsx": 21 | file = os.path.join(path, dir) 22 | self.insert(file) 23 | 24 | def insert(self, file): 25 | try: 26 | data = pd.read_excel(file, sheetname="allNews") 27 | data = data.drop_duplicates(subset='title', keep='last') 28 | db = newsDb() 29 | cateType = {"news_society": "社会", "news_entertainment": "娱乐", "news_tech": "科技", "news_car": "汽车", 30 | "news_sports": "体育", "news_finance": "财经", 31 | "news_military": "军事", "news_world": "国际", "news_fashion": "时尚", "news_travel": "旅游", 32 | "news_discovery": "探索", "news_baby": "育儿", 33 | "news_regimen": "养生", "news_story": "故事", "news_essay": "美文", "news_game": "游戏", 34 | "news_history": "历史", "news_food": "美食"} 35 | tag = file.split('&')[1] 36 | for i in range(0, len(data)): 37 | value = data.values[i] 38 | if value[8] in cateType.keys(): tag = value[8] 39 | if value[11]: 40 | times = time.time() 41 | md5newid = hashlib.md5(str(times).encode("utf-8")).hexdigest() 42 | startNum = 
random.randint(0, (len(md5newid) - 20)) 43 | newsId = str(md5newid)[startNum:(startNum + 20)] 44 | try: 45 | mysqlSuccess = db.insert_table(table="get_news", 46 | field="(news_id,news_link,source,title,abstract,tag," 47 | "text_content,html_content,image,keyword)", 48 | values="('" + newsId + "','" + value[2] + "','" + value[ 49 | 4] + "','" + 50 | value[1] 51 | + "','" + value[6] + "','" + tag + "','" + value[ 52 | 10] + "','" + value[11] + "','" + value[12] + "','" + 53 | value[ 54 | 9] + "')") 55 | 56 | if mysqlSuccess: 57 | print("新闻保存sql完成!") 58 | except: 59 | print("failed") 60 | except: 61 | print("import failed") 62 | 63 | 64 | newsInsert = newsInsert() 65 | -------------------------------------------------------------------------------- /spider/pyspider/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-17 19:17:12 7 | 8 | __version__ = '0.3.8' 9 | -------------------------------------------------------------------------------- /spider/pyspider/data/project.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/project.db -------------------------------------------------------------------------------- /spider/pyspider/data/result.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/result.db -------------------------------------------------------------------------------- /spider/pyspider/data/scheduler.1d: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/scheduler.1d -------------------------------------------------------------------------------- /spider/pyspider/data/scheduler.1h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/scheduler.1h -------------------------------------------------------------------------------- /spider/pyspider/data/scheduler.all: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/scheduler.all -------------------------------------------------------------------------------- /spider/pyspider/data/task.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/data/task.db -------------------------------------------------------------------------------- /spider/pyspider/database/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/database/base/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/database/base/projectdb.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 11:28:52 7 | 8 | import re 9 | 10 | # NOTE: When get/get_all/check_update from database with default fields, 11 | # all following fields should be included in output dict. 12 | { 13 | 'project': { 14 | 'name': str, 15 | 'group': str, 16 | 'status': str, 17 | 'script': str, 18 | # 'config': str, 19 | 'comments': str, 20 | # 'priority': int, 21 | 'rate': int, 22 | 'burst': int, 23 | 'updatetime': int, 24 | } 25 | } 26 | 27 | 28 | class ProjectDB(object): 29 | status_str = [ 30 | 'TODO', 31 | 'STOP', 32 | 'CHECKING', 33 | 'DEBUG', 34 | 'RUNNING', 35 | ] 36 | 37 | def insert(self, name, obj={}): 38 | raise NotImplementedError 39 | 40 | def update(self, name, obj={}, **kwargs): 41 | raise NotImplementedError 42 | 43 | def get_all(self, fields=None): 44 | raise NotImplementedError 45 | 46 | def get(self, name, fields): 47 | raise NotImplementedError 48 | 49 | def drop(self, name): 50 | raise NotImplementedError 51 | 52 | def check_update(self, timestamp, fields=None): 53 | raise NotImplementedError 54 | 55 | def split_group(self, group, lower=True): 56 | return re.split("\W+", (group or '').lower()) 57 | 58 | def verify_project_name(self, name): 59 | if len(name) > 64: 60 | return False 61 | if re.search(r"[^\w]", name): 62 | return False 63 | return True 64 | 65 | def copy(self): 66 | ''' 67 | database should be able to copy itself to create new connection 68 | 69 | it's implemented automatically by pyspider.database.connect_database 70 | if you are not create database connection via connect_database method, 71 | you should implement this 72 | ''' 73 | raise NotImplementedError 74 | -------------------------------------------------------------------------------- /spider/pyspider/database/base/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-11 18:40:03 7 | 8 | # result schema 9 | { 10 | 'result': { 11 | 'taskid': str, # new, not changeable 12 | 'project': str, # new, not changeable 13 | 'url': str, # new, not changeable 14 | 'result': str, # json string 15 | 'updatetime': int, 16 | } 17 | } 18 | 19 | 20 | class ResultDB(object): 21 | """ 22 | database for result 23 | """ 24 | projects = set() # projects in resultdb 25 | 26 | def save(self, project, taskid, url, result): 27 | raise NotImplementedError 28 | 29 | def select(self, project, fields=None, offset=0, limit=None): 30 | raise NotImplementedError 31 | 32 | def count(self, project): 33 | raise NotImplementedError 34 | 35 | def get(self, project, taskid, fields=None): 36 | raise NotImplementedError 37 | 38 | def drop(self, project): 39 | raise NotImplementedError 40 | 41 | def copy(self): 42 | ''' 43 | database should be able to copy itself to create new connection 44 | 45 | it's implemented automatically by pyspider.database.connect_database 46 | if you are not create database connection via connect_database method, 47 | you should implement this 48 | ''' 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /spider/pyspider/database/base/taskdb.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-08 10:28:48 7 | 8 | # task schema 9 | { 10 | 'task': { 11 | 'taskid': str, # new, not change 12 | 'project': str, # new, not change 13 | 'url': str, # new, not change 14 | 'status': int, # change 15 | 'schedule': { 16 | 'priority': int, 17 | 'retries': int, 18 | 'retried': int, 19 | 'exetime': int, 20 | 'age': int, 21 | 'itag': str, 22 | # 'recrawl': int 23 | }, # new and restart 24 | 'fetch': { 25 | 'method': str, 26 | 'headers': dict, 27 | 'data': str, 28 | 'timeout': int, 29 | 'save': dict, 30 | }, # new and restart 31 | 'process': { 32 | 'callback': str, 33 | }, # new and restart 34 | 'track': { 35 | 'fetch': { 36 | 'ok': bool, 37 | 'time': int, 38 | 'status_code': int, 39 | 'headers': dict, 40 | 'encoding': str, 41 | 'content': str, 42 | }, 43 | 'process': { 44 | 'ok': bool, 45 | 'time': int, 46 | 'follows': int, 47 | 'outputs': int, 48 | 'logs': str, 49 | 'exception': str, 50 | }, 51 | 'save': object, # jsonable object saved by processor 52 | }, # finish 53 | 'lastcrawltime': int, # keep between request 54 | 'updatetime': int, # keep between request 55 | } 56 | } 57 | 58 | 59 | class TaskDB(object): 60 | ACTIVE = 1 61 | SUCCESS = 2 62 | FAILED = 3 63 | BAD = 4 64 | 65 | projects = set() # projects in taskdb 66 | 67 | def load_tasks(self, status, project=None, fields=None): 68 | raise NotImplementedError 69 | 70 | def get_task(self, project, taskid, fields=None): 71 | raise NotImplementedError 72 | 73 | def status_count(self, project): 74 | ''' 75 | return a dict 76 | ''' 77 | raise NotImplementedError 78 | 79 | def insert(self, project, taskid, obj={}): 80 | raise NotImplementedError 81 | 82 | def update(self, project, taskid, obj={}, **kwargs): 83 | raise NotImplementedError 84 | 85 | def drop(self, project): 86 | raise NotImplementedError 87 | 88 | @staticmethod 89 | def status_to_string(status): 90 | return { 91 | 1: 'ACTIVE', 92 | 2: 'SUCCESS', 93 | 3: 'FAILED', 94 | 4: 'BAD', 95 | }.get(status, 'UNKNOWN') 96 | 97 | @staticmethod 98 | def status_to_int(status): 99 | return { 100 | 'ACTIVE': 1, 101 | 'SUCCESS': 2, 102 | 'FAILED': 3, 103 | 'BAD': 4, 104 | }.get(status, 4) 105 | 106 | def copy(self): 107 | ''' 108 | database should be able to copy itself to create new connection 109 | 110 | it's implemented automatically by pyspider.database.connect_database 111 | if you are not create database connection via connect_database method, 112 | you should implement this 113 | ''' 114 | raise NotImplementedError 115 | -------------------------------------------------------------------------------- /spider/pyspider/database/elasticsearch/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:31:58 7 | -------------------------------------------------------------------------------- /spider/pyspider/database/elasticsearch/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:32:33 7 | 8 | import time 9 | 
10 | import elasticsearch.helpers 11 | from elasticsearch import Elasticsearch 12 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 13 | 14 | 15 | class ProjectDB(BaseProjectDB): 16 | __type__ = 'project' 17 | 18 | def __init__(self, hosts, index='pyspider'): 19 | self.index = index 20 | self.es = Elasticsearch(hosts=hosts) 21 | 22 | self.es.indices.create(index=self.index, ignore=400) 23 | if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): 24 | self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ 25 | "_all": {"enabled": False}, 26 | "properties": { 27 | "updatetime": {"type": "double"} 28 | } 29 | }) 30 | 31 | def insert(self, name, obj={}): 32 | obj = dict(obj) 33 | obj['name'] = name 34 | obj['updatetime'] = time.time() 35 | 36 | obj.setdefault('group', '') 37 | obj.setdefault('status', 'TODO') 38 | obj.setdefault('script', '') 39 | obj.setdefault('comments', '') 40 | obj.setdefault('rate', 0) 41 | obj.setdefault('burst', 0) 42 | 43 | return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name, 44 | refresh=True) 45 | 46 | def update(self, name, obj={}, **kwargs): 47 | obj = dict(obj) 48 | obj.update(kwargs) 49 | obj['updatetime'] = time.time() 50 | return self.es.update(index=self.index, doc_type=self.__type__, 51 | body={'doc': obj}, id=name, refresh=True, ignore=404) 52 | 53 | def get_all(self, fields=None): 54 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 55 | query={'query': {"match_all": {}}}, 56 | _source_include=fields or []): 57 | yield record['_source'] 58 | 59 | def get(self, name, fields=None): 60 | ret = self.es.get(index=self.index, doc_type=self.__type__, id=name, 61 | _source_include=fields or [], ignore=404) 62 | return ret.get('_source', None) 63 | 64 | def check_update(self, timestamp, fields=None): 65 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 66 | query={'query': {"range": { 67 | "updatetime": {"gte": timestamp} 68 | }}}, _source_include=fields or []): 69 | yield record['_source'] 70 | 71 | def drop(self, name): 72 | return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True) 73 | -------------------------------------------------------------------------------- /spider/pyspider/database/local/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 20:56:50 7 | -------------------------------------------------------------------------------- /spider/pyspider/database/local/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 12:32:17 7 | 8 | import os 9 | import re 10 | import six 11 | import glob 12 | import logging 13 | 14 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 15 | 16 | 17 | class ProjectDB(BaseProjectDB): 18 | """ProjectDB loading scripts from local file.""" 19 | 20 | def __init__(self, files): 21 | self.files = files 22 | self.projects = {} 23 | self.load_scripts() 24 | 25 | def load_scripts(self): 26 | project_names = set(self.projects.keys()) 27 | for path in self.files: 28 | for 
filename in glob.glob(path): 29 | name = os.path.splitext(os.path.basename(filename))[0] 30 | if name in project_names: 31 | project_names.remove(name) 32 | updatetime = os.path.getmtime(filename) 33 | if name not in self.projects or updatetime > self.projects[name]['updatetime']: 34 | project = self._build_project(filename) 35 | if not project: 36 | continue 37 | self.projects[project['name']] = project 38 | 39 | for name in project_names: 40 | del self.projects[name] 41 | 42 | rate_re = re.compile(r'^\s*#\s*rate.*?(\d+(\.\d+)?)', re.I | re.M) 43 | burst_re = re.compile(r'^\s*#\s*burst.*?(\d+(\.\d+)?)', re.I | re.M) 44 | 45 | def _build_project(self, filename): 46 | try: 47 | with open(filename) as fp: 48 | script = fp.read() 49 | m = self.rate_re.search(script) 50 | if m: 51 | rate = float(m.group(1)) 52 | else: 53 | rate = 1 54 | 55 | m = self.burst_re.search(script) 56 | if m: 57 | burst = float(m.group(1)) 58 | else: 59 | burst = 3 60 | 61 | return { 62 | 'name': os.path.splitext(os.path.basename(filename))[0], 63 | 'group': None, 64 | 'status': 'RUNNING', 65 | 'script': script, 66 | 'comments': None, 67 | 'rate': rate, 68 | 'burst': burst, 69 | 'updatetime': os.path.getmtime(filename), 70 | } 71 | except OSError as e: 72 | logging.error('loading project script error: %s', e) 73 | return None 74 | 75 | def get_all(self, fields=None): 76 | for projectname in self.projects: 77 | yield self.get(projectname, fields) 78 | 79 | def get(self, name, fields=None): 80 | if name not in self.projects: 81 | return None 82 | project = self.projects[name] 83 | result = {} 84 | for f in fields or project: 85 | if f in project: 86 | result[f] = project[f] 87 | else: 88 | result[f] = None 89 | return result 90 | 91 | def check_update(self, timestamp, fields=None): 92 | self.load_scripts() 93 | for projectname, project in six.iteritems(self.projects): 94 | if project['updatetime'] > timestamp: 95 | yield self.get(projectname, fields) 96 | -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/database/mongodb/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/mongodbbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:42:01 7 | 8 | import time 9 | 10 | 11 | class SplitTableMixin(object): 12 | UPDATE_PROJECTS_TIME = 10 * 60 13 | 14 | def _collection_name(self, project): 15 | if self.collection_prefix: 16 | return "%s.%s" % (self.collection_prefix, project) 17 | else: 18 | return project 19 | 20 | @property 21 | def projects(self): 22 | if time.time() - getattr(self, '_last_update_projects', 0) \ 23 | > self.UPDATE_PROJECTS_TIME: 24 | self._list_project() 25 | return self._projects 26 | 27 | @projects.setter 28 | def projects(self, value): 29 | self._projects = value 30 | 31 | def _list_project(self): 32 | self._last_update_projects = time.time() 33 | self.projects = set() 34 | if self.collection_prefix: 35 | prefix = "%s." 
% self.collection_prefix 36 | else: 37 | prefix = '' 38 | for each in self.database.collection_names(): 39 | if each.startswith('system.'): 40 | continue 41 | if each.startswith(prefix): 42 | self.projects.add(each[len(prefix):]) 43 | 44 | def drop(self, project): 45 | if project not in self.projects: 46 | self._list_project() 47 | if project not in self.projects: 48 | return 49 | collection_name = self._collection_name(project) 50 | self.database[collection_name].drop() 51 | self._list_project() 52 | -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-12 12:22:42 7 | 8 | import time 9 | from pymongo import MongoClient 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | 13 | 14 | class ProjectDB(BaseProjectDB): 15 | __collection_name__ = 'projectdb' 16 | 17 | def __init__(self, url, database='projectdb'): 18 | self.conn = MongoClient(url) 19 | self.conn.admin.command("ismaster") 20 | self.database = self.conn[database] 21 | self.collection = self.database[self.__collection_name__] 22 | 23 | self.collection.ensure_index('name', unique=True) 24 | 25 | def _default_fields(self, each): 26 | if each is None: 27 | return each 28 | each.setdefault('group', None) 29 | each.setdefault('status', 'TODO') 30 | each.setdefault('script', '') 31 | each.setdefault('comments', None) 32 | each.setdefault('rate', 0) 33 | each.setdefault('burst', 0) 34 | each.setdefault('updatetime', 0) 35 | return each 36 | 37 | def insert(self, name, obj={}): 38 | obj = dict(obj) 39 | obj['name'] = name 40 | obj['updatetime'] = time.time() 41 | return self.collection.update({'name': name}, {'$set': obj}, upsert=True) 42 | 43 | def update(self, name, obj={}, **kwargs): 44 | obj = dict(obj) 45 | obj.update(kwargs) 46 | obj['updatetime'] = time.time() 47 | return self.collection.update({'name': name}, {'$set': obj}) 48 | 49 | def get_all(self, fields=None): 50 | for each in self.collection.find({}, fields): 51 | if each and '_id' in each: 52 | del each['_id'] 53 | yield self._default_fields(each) 54 | 55 | def get(self, name, fields=None): 56 | each = self.collection.find_one({'name': name}, fields) 57 | if each and '_id' in each: 58 | del each['_id'] 59 | return self._default_fields(each) 60 | 61 | def check_update(self, timestamp, fields=None): 62 | for project in self.get_all(fields=('updatetime', 'name')): 63 | if project['updatetime'] > timestamp: 64 | project = self.get(project['name'], fields) 65 | yield self._default_fields(project) 66 | 67 | def drop(self, name): 68 | return self.collection.remove({'name': name}) 69 | -------------------------------------------------------------------------------- /spider/pyspider/database/mongodb/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 22:18:36 7 | 8 | import json 9 | import time 10 | from pymongo import MongoClient 11 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 12 | from .mongodbbase import SplitTableMixin 13 | 14 | 15 | class ResultDB(SplitTableMixin, 
BaseResultDB): 16 | collection_prefix = '' 17 | 18 | def __init__(self, url, database='resultdb'): 19 | self.conn = MongoClient(url) 20 | self.conn.admin.command("ismaster") 21 | self.database = self.conn[database] 22 | self.projects = set() 23 | 24 | self._list_project() 25 | for project in self.projects: 26 | collection_name = self._collection_name(project) 27 | self.database[collection_name].ensure_index('taskid') 28 | 29 | def _create_project(self, project): 30 | collection_name = self._collection_name(project) 31 | self.database[collection_name].ensure_index('taskid') 32 | self._list_project() 33 | 34 | def _parse(self, data): 35 | data['_id'] = str(data['_id']) 36 | if 'result' in data: 37 | data['result'] = json.loads(data['result']) 38 | return data 39 | 40 | def _stringify(self, data): 41 | if 'result' in data: 42 | data['result'] = json.dumps(data['result']) 43 | return data 44 | 45 | def save(self, project, taskid, url, result): 46 | if project not in self.projects: 47 | self._create_project(project) 48 | collection_name = self._collection_name(project) 49 | obj = { 50 | 'taskid': taskid, 51 | 'url': url, 52 | 'result': result, 53 | 'updatetime': time.time(), 54 | } 55 | return self.database[collection_name].update( 56 | {'taskid': taskid}, {"$set": self._stringify(obj)}, upsert=True 57 | ) 58 | 59 | def select(self, project, fields=None, offset=0, limit=0): 60 | if project not in self.projects: 61 | self._list_project() 62 | if project not in self.projects: 63 | return 64 | collection_name = self._collection_name(project) 65 | for result in self.database[collection_name].find({}, fields, skip=offset, limit=limit): 66 | yield self._parse(result) 67 | 68 | def count(self, project): 69 | if project not in self.projects: 70 | self._list_project() 71 | if project not in self.projects: 72 | return 73 | collection_name = self._collection_name(project) 74 | return self.database[collection_name].count() 75 | 76 | def get(self, project, taskid, fields=None): 77 | if project not in self.projects: 78 | self._list_project() 79 | if project not in self.projects: 80 | return 81 | collection_name = self._collection_name(project) 82 | ret = self.database[collection_name].find_one({'taskid': taskid}, fields) 83 | if not ret: 84 | return ret 85 | return self._parse(ret) 86 | -------------------------------------------------------------------------------- /spider/pyspider/database/mysql/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 20:12:54 7 | -------------------------------------------------------------------------------- /spider/pyspider/database/mysql/mysqlbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-05 10:42:24 7 | 8 | import time 9 | import mysql.connector 10 | 11 | 12 | class MySQLMixin(object): 13 | 14 | @property 15 | def dbcur(self): 16 | try: 17 | if self.conn.unread_result: 18 | self.conn.get_rows() 19 | return self.conn.cursor() 20 | except (mysql.connector.OperationalError, mysql.connector.InterfaceError): 21 | self.conn.ping(reconnect=True) 22 | self.conn.database = self.database_name 23 | return self.conn.cursor() 24 | 25 | 26 | class 
SplitTableMixin(object): 27 | UPDATE_PROJECTS_TIME = 10 * 60 28 | 29 | def _tablename(self, project): 30 | if self.__tablename__: 31 | return '%s_%s' % (self.__tablename__, project) 32 | else: 33 | return project 34 | 35 | @property 36 | def projects(self): 37 | if time.time() - getattr(self, '_last_update_projects', 0) \ 38 | > self.UPDATE_PROJECTS_TIME: 39 | self._list_project() 40 | return self._projects 41 | 42 | @projects.setter 43 | def projects(self, value): 44 | self._projects = value 45 | 46 | def _list_project(self): 47 | self._last_update_projects = time.time() 48 | self.projects = set() 49 | if self.__tablename__: 50 | prefix = '%s_' % self.__tablename__ 51 | else: 52 | prefix = '' 53 | for project, in self._execute('show tables;'): 54 | if project.startswith(prefix): 55 | project = project[len(prefix):] 56 | self.projects.add(project) 57 | 58 | def drop(self, project): 59 | if project not in self.projects: 60 | self._list_project() 61 | if project not in self.projects: 62 | return 63 | tablename = self._tablename(project) 64 | self._execute("DROP TABLE %s" % self.escape(tablename)) 65 | self._list_project() 66 | -------------------------------------------------------------------------------- /spider/pyspider/database/mysql/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 21:06:43 7 | 8 | import time 9 | import mysql.connector 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | from .mysqlbase import MySQLMixin 14 | 15 | 16 | class ProjectDB(MySQLMixin, BaseProjectDB, BaseDB): 17 | __tablename__ = 'projectdb' 18 | 19 | def __init__(self, host='localhost', port=3306, database='projectdb', 20 | user='root', passwd=None): 21 | self.database_name = database 22 | self.conn = mysql.connector.connect(user=user, password=passwd, 23 | host=host, port=port, autocommit=True) 24 | if database not in [x[0] for x in self._execute('show databases')]: 25 | self._execute('CREATE DATABASE %s' % self.escape(database)) 26 | self.conn.database = database 27 | 28 | self._execute('''CREATE TABLE IF NOT EXISTS %s ( 29 | `name` varchar(64) PRIMARY KEY, 30 | `group` varchar(64), 31 | `status` varchar(16), 32 | `script` TEXT, 33 | `comments` varchar(1024), 34 | `rate` float(11, 4), 35 | `burst` float(11, 4), 36 | `updatetime` double(16, 4) 37 | ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(self.__tablename__)) 38 | 39 | def insert(self, name, obj={}): 40 | obj = dict(obj) 41 | obj['name'] = name 42 | obj['updatetime'] = time.time() 43 | return self._insert(**obj) 44 | 45 | def update(self, name, obj={}, **kwargs): 46 | obj = dict(obj) 47 | obj.update(kwargs) 48 | obj['updatetime'] = time.time() 49 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 50 | return ret.rowcount 51 | 52 | def get_all(self, fields=None): 53 | return self._select2dic(what=fields) 54 | 55 | def get(self, name, fields=None): 56 | where = "`name` = %s" % self.placeholder 57 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 58 | return each 59 | return None 60 | 61 | def drop(self, name): 62 | where = "`name` = %s" % self.placeholder 63 | return self._delete(where=where, where_values=(name, )) 64 | 65 | def check_update(self, timestamp, fields=None): 66 | where 
= "`updatetime` >= %f" % timestamp 67 | return self._select2dic(what=fields, where=where) 68 | -------------------------------------------------------------------------------- /spider/pyspider/database/redis/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-05-17 01:34:21 7 | 8 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlalchemy/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 20:11:04 7 | 8 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlalchemy/sqlalchemybase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 18:48:47 7 | 8 | import time 9 | 10 | 11 | def result2dict(columns, task): 12 | r = {} 13 | for key in task.keys(): 14 | r[key] = task[key] 15 | return r 16 | 17 | 18 | class SplitTableMixin(object): 19 | UPDATE_PROJECTS_TIME = 10 * 60 20 | 21 | def _tablename(self, project): 22 | if self.__tablename__: 23 | return '%s_%s' % (self.__tablename__, project) 24 | else: 25 | return project 26 | 27 | @property 28 | def projects(self): 29 | if time.time() - getattr(self, '_last_update_projects', 0) \ 30 | > self.UPDATE_PROJECTS_TIME: 31 | self._list_project() 32 | return self._projects 33 | 34 | @projects.setter 35 | def projects(self, value): 36 | self._projects = value 37 | 38 | def _list_project(self): 39 | self._last_update_projects = time.time() 40 | self.projects = set() 41 | if self.__tablename__: 42 | prefix = '%s_' % self.__tablename__ 43 | else: 44 | prefix = '' 45 | 46 | for project in self.engine.table_names(): 47 | if project.startswith(prefix): 48 | project = project[len(prefix):] 49 | self.projects.add(project) 50 | 51 | def drop(self, project): 52 | if project not in self.projects: 53 | self._list_project() 54 | if project not in self.projects: 55 | return 56 | self.table.name = self._tablename(project) 57 | self.table.drop(self.engine) 58 | self._list_project() 59 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/database/sqlite/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 12:05:52 7 | 8 | import time 9 | 10 | from .sqlitebase import SQLiteMixin 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | 14 | 15 | class 
ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB): 16 | __tablename__ = 'projectdb' 17 | placeholder = '?' 18 | 19 | def __init__(self, path): 20 | self.path = path 21 | self.last_pid = 0 22 | self.conn = None 23 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 24 | name PRIMARY KEY, 25 | `group`, 26 | status, script, comments, 27 | rate, burst, updatetime 28 | )''' % self.__tablename__) 29 | 30 | def insert(self, name, obj={}): 31 | obj = dict(obj) 32 | obj['name'] = name 33 | obj['updatetime'] = time.time() 34 | return self._insert(**obj) 35 | 36 | def update(self, name, obj={}, **kwargs): 37 | obj = dict(obj) 38 | obj.update(kwargs) 39 | obj['updatetime'] = time.time() 40 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 41 | return ret.rowcount 42 | 43 | def get_all(self, fields=None): 44 | return self._select2dic(what=fields) 45 | 46 | def get(self, name, fields=None): 47 | where = "`name` = %s" % self.placeholder 48 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 49 | return each 50 | return None 51 | 52 | def check_update(self, timestamp, fields=None): 53 | where = "`updatetime` >= %f" % timestamp 54 | return self._select2dic(what=fields, where=where) 55 | 56 | def drop(self, name): 57 | where = "`name` = %s" % self.placeholder 58 | return self._delete(where=where, where_values=(name, )) 59 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 17:08:43 7 | 8 | import re 9 | import time 10 | import json 11 | 12 | from .sqlitebase import SQLiteMixin, SplitTableMixin 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | from pyspider.database.basedb import BaseDB 15 | 16 | 17 | class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): 18 | __tablename__ = 'resultdb' 19 | placeholder = '?' 
20 | 21 | def __init__(self, path): 22 | self.path = path 23 | self.last_pid = 0 24 | self.conn = None 25 | self._list_project() 26 | 27 | def _create_project(self, project): 28 | assert re.match(r'^\w+$', project) is not None 29 | tablename = self._tablename(project) 30 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 31 | taskid PRIMARY KEY, 32 | url, 33 | result, 34 | updatetime 35 | )''' % tablename) 36 | 37 | def _parse(self, data): 38 | if 'result' in data: 39 | data['result'] = json.loads(data['result']) 40 | return data 41 | 42 | def _stringify(self, data): 43 | if 'result' in data: 44 | data['result'] = json.dumps(data['result']) 45 | return data 46 | 47 | def save(self, project, taskid, url, result): 48 | tablename = self._tablename(project) 49 | if project not in self.projects: 50 | self._create_project(project) 51 | self._list_project() 52 | obj = { 53 | 'taskid': taskid, 54 | 'url': url, 55 | 'result': result, 56 | 'updatetime': time.time(), 57 | } 58 | return self._replace(tablename, **self._stringify(obj)) 59 | 60 | def select(self, project, fields=None, offset=0, limit=None): 61 | if project not in self.projects: 62 | self._list_project() 63 | if project not in self.projects: 64 | return 65 | tablename = self._tablename(project) 66 | 67 | for task in self._select2dic(tablename, what=fields, order='updatetime DESC', 68 | offset=offset, limit=limit): 69 | yield self._parse(task) 70 | 71 | def count(self, project): 72 | if project not in self.projects: 73 | self._list_project() 74 | if project not in self.projects: 75 | return 0 76 | tablename = self._tablename(project) 77 | for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): 78 | return count 79 | 80 | def get(self, project, taskid, fields=None): 81 | if project not in self.projects: 82 | self._list_project() 83 | if project not in self.projects: 84 | return 85 | tablename = self._tablename(project) 86 | where = "`taskid` = %s" % self.placeholder 87 | for task in self._select2dic(tablename, what=fields, 88 | where=where, where_values=(taskid, )): 89 | return self._parse(task) 90 | -------------------------------------------------------------------------------- /spider/pyspider/database/sqlite/sqlitebase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:30:44 7 | 8 | import os 9 | import time 10 | import sqlite3 11 | import threading 12 | 13 | 14 | class SQLiteMixin(object): 15 | 16 | @property 17 | def dbcur(self): 18 | pid = (os.getpid(), threading.current_thread().ident) 19 | if not (self.conn and pid == self.last_pid): 20 | self.last_pid = pid 21 | self.conn = sqlite3.connect(self.path, isolation_level=None) 22 | return self.conn.cursor() 23 | 24 | 25 | class SplitTableMixin(object): 26 | UPDATE_PROJECTS_TIME = 10 * 60 27 | 28 | def _tablename(self, project): 29 | if self.__tablename__: 30 | return '%s_%s' % (self.__tablename__, project) 31 | else: 32 | return project 33 | 34 | @property 35 | def projects(self): 36 | if time.time() - getattr(self, '_last_update_projects', 0) \ 37 | > self.UPDATE_PROJECTS_TIME: 38 | self._list_project() 39 | return self._projects 40 | 41 | @projects.setter 42 | def projects(self, value): 43 | self._projects = value 44 | 45 | def _list_project(self): 46 | self._last_update_projects = time.time() 47 | self.projects = set() 48 | if 
self.__tablename__: 49 | prefix = '%s_' % self.__tablename__ 50 | else: 51 | prefix = '' 52 | for project, in self._select('sqlite_master', what='name', 53 | where='type = "table"'): 54 | if project.startswith(prefix): 55 | project = project[len(prefix):] 56 | self.projects.add(project) 57 | 58 | def drop(self, project): 59 | if project not in self.projects: 60 | self._list_project() 61 | if project not in self.projects: 62 | return 63 | tablename = self._tablename(project) 64 | self._execute("DROP TABLE %s" % self.escape(tablename)) 65 | self._list_project() 66 | -------------------------------------------------------------------------------- /spider/pyspider/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .tornado_fetcher import Fetcher 2 | -------------------------------------------------------------------------------- /spider/pyspider/fetcher/cookie_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-14 09:07:11 7 | 8 | from requests.cookies import MockRequest 9 | 10 | 11 | class MockResponse(object): 12 | 13 | def __init__(self, headers): 14 | self._headers = headers 15 | 16 | def info(self): 17 | return self 18 | 19 | def getheaders(self, name): 20 | """make cookie python 2 version use this method to get cookie list""" 21 | return self._headers.get_list(name) 22 | 23 | def get_all(self, name, default=[]): 24 | """make cookie python 3 version use this instead of getheaders""" 25 | return self._headers.get_list(name) or default 26 | 27 | 28 | def extract_cookies_to_jar(jar, request, response): 29 | req = MockRequest(request) 30 | res = MockResponse(response) 31 | jar.extract_cookies(res, req) 32 | -------------------------------------------------------------------------------- /spider/pyspider/libs/ListIO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-26 23:41:51 7 | 8 | 9 | class ListO(object): 10 | 11 | """A StringO write to list.""" 12 | 13 | def __init__(self, buffer=None): 14 | self._buffer = buffer 15 | if self._buffer is None: 16 | self._buffer = [] 17 | 18 | def isatty(self): 19 | return False 20 | 21 | def close(self): 22 | pass 23 | 24 | def flush(self): 25 | pass 26 | 27 | def seek(self, n, mode=0): 28 | pass 29 | 30 | def readline(self): 31 | pass 32 | 33 | def reset(self): 34 | pass 35 | 36 | def write(self, x): 37 | self._buffer.append(x) 38 | 39 | def writelines(self, x): 40 | self._buffer.extend(x) 41 | -------------------------------------------------------------------------------- /spider/pyspider/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspider/libs/__init__.py -------------------------------------------------------------------------------- /spider/pyspider/libs/dataurl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-11-16 
10:33:20 7 | 8 | import six 9 | from base64 import b64encode, b64decode 10 | from . import utils 11 | from six.moves.urllib.parse import quote, unquote 12 | 13 | 14 | def encode(data, mime_type='', charset='utf-8', base64=True): 15 | """ 16 | Encode data to DataURL 17 | """ 18 | if isinstance(data, six.text_type): 19 | data = data.encode(charset) 20 | else: 21 | charset = None 22 | if base64: 23 | data = utils.text(b64encode(data)) 24 | else: 25 | data = utils.text(quote(data)) 26 | 27 | result = ['data:', ] 28 | if mime_type: 29 | result.append(mime_type) 30 | if charset: 31 | result.append(';charset=') 32 | result.append(charset) 33 | if base64: 34 | result.append(';base64') 35 | result.append(',') 36 | result.append(data) 37 | 38 | return ''.join(result) 39 | 40 | 41 | def decode(data_url): 42 | """ 43 | Decode DataURL data 44 | """ 45 | metadata, data = data_url.rsplit(',', 1) 46 | _, metadata = metadata.split('data:', 1) 47 | parts = metadata.split(';') 48 | if parts[-1] == 'base64': 49 | data = b64decode(data) 50 | else: 51 | data = unquote(data) 52 | 53 | for part in parts: 54 | if part.startswith("charset="): 55 | data = data.decode(part[8:]) 56 | return data 57 | -------------------------------------------------------------------------------- /spider/pyspider/libs/log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-10-24 16:08:17 7 | 8 | import logging 9 | 10 | try: 11 | import curses 12 | except ImportError: 13 | curses = None 14 | 15 | from tornado.log import LogFormatter as _LogFormatter 16 | 17 | 18 | class LogFormatter(_LogFormatter, object): 19 | """Init tornado.log.LogFormatter from logging.config.fileConfig""" 20 | def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs): 21 | if fmt is None: 22 | fmt = _LogFormatter.DEFAULT_FORMAT 23 | super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs) 24 | 25 | 26 | class SaveLogHandler(logging.Handler): 27 | """LogHandler that save records to a list""" 28 | 29 | def __init__(self, saveto=None, *args, **kwargs): 30 | self.saveto = saveto 31 | logging.Handler.__init__(self, *args, **kwargs) 32 | 33 | def emit(self, record): 34 | if self.saveto is not None: 35 | self.saveto.append(record) 36 | 37 | handle = emit 38 | 39 | 40 | def enable_pretty_logging(logger=logging.getLogger()): 41 | channel = logging.StreamHandler() 42 | channel.setFormatter(LogFormatter()) 43 | logger.addHandler(channel) 44 | -------------------------------------------------------------------------------- /spider/pyspider/libs/multiprocessing_queue.py: -------------------------------------------------------------------------------- 1 | import six 2 | import platform 3 | import multiprocessing 4 | from multiprocessing.queues import Queue as BaseQueue 5 | 6 | 7 | # The SharedCounter and Queue classes come from: 8 | # https://github.com/vterron/lemon/commit/9ca6b4b 9 | 10 | class SharedCounter(object): 11 | """ A synchronized shared counter. 12 | The locking done by multiprocessing.Value ensures that only a single 13 | process or thread may read or write the in-memory ctypes object. However, 14 | in order to do n += 1, Python performs a read followed by a write, so a 15 | second process may read the old value before the new one is written by the 16 | first process. 
The solution is to use a multiprocessing.Lock to guarantee 17 | the atomicity of the modifications to Value. 18 | This class comes almost entirely from Eli Bendersky's blog: 19 | http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/ 20 | """ 21 | 22 | def __init__(self, n=0): 23 | self.count = multiprocessing.Value('i', n) 24 | 25 | def increment(self, n=1): 26 | """ Increment the counter by n (default = 1) """ 27 | with self.count.get_lock(): 28 | self.count.value += n 29 | 30 | @property 31 | def value(self): 32 | """ Return the value of the counter """ 33 | return self.count.value 34 | 35 | 36 | class MultiProcessingQueue(BaseQueue): 37 | """ A portable implementation of multiprocessing.Queue. 38 | Because of multithreading / multiprocessing semantics, Queue.qsize() may 39 | raise the NotImplementedError exception on Unix platforms like Mac OS X 40 | where sem_getvalue() is not implemented. This subclass addresses this 41 | problem by using a synchronized shared counter (initialized to zero) and 42 | increasing / decreasing its value every time the put() and get() methods 43 | are called, respectively. This not only prevents NotImplementedError from 44 | being raised, but also allows us to implement a reliable version of both 45 | qsize() and empty(). 46 | """ 47 | def __init__(self, *args, **kwargs): 48 | super(MultiProcessingQueue, self).__init__(*args, **kwargs) 49 | self.size = SharedCounter(0) 50 | 51 | def put(self, *args, **kwargs): 52 | self.size.increment(1) 53 | super(MultiProcessingQueue, self).put(*args, **kwargs) 54 | 55 | def get(self, *args, **kwargs): 56 | v = super(MultiProcessingQueue, self).get(*args, **kwargs) 57 | self.size.increment(-1) 58 | return v 59 | 60 | def qsize(self): 61 | """ Reliable implementation of multiprocessing.Queue.qsize() """ 62 | return self.size.value 63 | 64 | 65 | if platform.system() == 'Darwin': 66 | if hasattr(multiprocessing, 'get_context'): # for py34 67 | def Queue(maxsize=0): 68 | return MultiProcessingQueue(maxsize, ctx=multiprocessing.get_context()) 69 | else: 70 | def Queue(maxsize=0): 71 | return MultiProcessingQueue(maxsize) 72 | else: 73 | from multiprocessing import Queue # flake8: noqa 74 | -------------------------------------------------------------------------------- /spider/pyspider/libs/sample_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on __DATE__ 4 | # Project: __PROJECT_NAME__ 5 | 6 | from pyspider.libs.base_handler import * 7 | 8 | 9 | class Handler(BaseHandler): 10 | crawl_config = { 11 | } 12 | 13 | @every(minutes=24 * 60) 14 | def on_start(self): 15 | self.crawl('__START_URL__', callback=self.index_page) 16 | 17 | @config(age=10 * 24 * 60 * 60) 18 | def index_page(self, response): 19 | for each in response.doc('a[href^="http"]').items(): 20 | self.crawl(each.attr.href, callback=self.detail_page) 21 | 22 | @config(priority=2) 23 | def detail_page(self, response): 24 | return { 25 | "url": response.url, 26 | "title": response.doc('title').text(), 27 | } 28 | -------------------------------------------------------------------------------- /spider/pyspider/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,scheduler,fetcher,processor,webui,bench,werkzeug 3 | 4 | [logger_root] 5 | level=INFO 6 | handlers=screen 7 | 8 | [logger_scheduler] 9 | level=INFO 10 | handlers=screen 11 | qualname=scheduler 12 | 
propagate=0 13 | 14 | [logger_fetcher] 15 | level=DEBUG 16 | handlers=screen 17 | qualname=fetcher 18 | propagate=0 19 | 20 | [logger_processor] 21 | level=DEBUG 22 | handlers=screen 23 | qualname=processor 24 | propagate=0 25 | 26 | [logger_webui] 27 | level=DEBUG 28 | handlers=screen 29 | qualname=webui 30 | propagate=0 31 | 32 | [logger_bench] 33 | level=DEBUG 34 | handlers=screen 35 | qualname=bench 36 | propagate=0 37 | 38 | [logger_werkzeug] 39 | level=INFO 40 | handlers=screen 41 | qualname=werkzeug 42 | propagate=0 43 | 44 | [handlers] 45 | keys=screen 46 | 47 | [handler_screen] 48 | class=logging.StreamHandler 49 | formatter=pretty 50 | level=DEBUG 51 | args=(sys.stderr, ) 52 | 53 | [formatters] 54 | keys=pretty 55 | 56 | [formatter_pretty] 57 | class=pyspider.libs.log.LogFormatter 58 |
-------------------------------------------------------------------------------- /spider/pyspider/message_queue/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-30 21:47:08 7 | 8 | try: 9 | from urllib import parse as urlparse 10 | except ImportError: 11 | import urlparse 12 | 13 | 14 | def connect_message_queue(name, url=None, maxsize=0): 15 | """ 16 | create connection to message queue 17 | 18 | name: 19 | name of message queue 20 | 21 | rabbitmq: 22 | amqp://username:password@host:5672/%2F 23 | see https://www.rabbitmq.com/uri-spec.html 24 | beanstalk: 25 | beanstalk://host:11300/ 26 | redis: 27 | redis://host:6379/db 28 | kombu: 29 | kombu+transport://userid:password@hostname:port/virtual_host 30 | see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls 31 | builtin: 32 | None 33 | """ 34 | 35 | if not url: 36 | from pyspider.libs.multiprocessing_queue import Queue 37 | return Queue(maxsize=maxsize) 38 | 39 | parsed = urlparse.urlparse(url) 40 | if parsed.scheme == 'amqp': 41 | from .rabbitmq import Queue 42 | return Queue(name, url, maxsize=maxsize) 43 | elif parsed.scheme == 'beanstalk': 44 | from .beanstalk import Queue 45 | return Queue(name, host=parsed.netloc, maxsize=maxsize) 46 | elif parsed.scheme == 'redis': 47 | from .redis_queue import Queue 48 | db = parsed.path.lstrip('/').split('/') 49 | try: 50 | db = int(db[0]) 51 | except (IndexError, ValueError): 52 | db = 0 53 | 54 | password = parsed.password or None 55 | 56 | return Queue(name, parsed.hostname, parsed.port, db=db, maxsize=maxsize, password=password) 57 | else: 58 | if url.startswith('kombu+'): 59 | url = url[len('kombu+'):] 60 | from .kombu_queue import Queue 61 | return Queue(name, url, maxsize=maxsize) 62 | 63 | raise Exception('unknown connection url: %s' % url) 64 |
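connect_message_queue above dispatches on the URL scheme documented in its docstring. A minimal usage sketch, assuming a local Redis server on the default port (the queue name, URL, and payload here are illustrative, not taken from this repo):

from pyspider.message_queue import connect_message_queue

# No URL: falls back to the builtin multiprocessing queue.
local_q = connect_message_queue('scheduler2fetcher')

# redis:// URL: routed to message_queue.redis_queue.Queue below.
redis_q = connect_message_queue('scheduler2fetcher',
                                'redis://localhost:6379/0', maxsize=100)
redis_q.put({'taskid': 'abc', 'url': 'http://example.com/'})
print(redis_q.get())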
-------------------------------------------------------------------------------- /spider/pyspider/message_queue/redis_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-27 22:48:04 7 | 8 | import time 9 | import redis 10 | import umsgpack 11 | from six.moves import queue as BaseQueue 12 | 13 | 14 | class RedisQueue(object): 15 | """ 16 | A Queue like message built over redis 17 | """ 18 | 19 | Empty = BaseQueue.Empty 20 | Full = BaseQueue.Full 21 | max_timeout = 0.3 22 | 23 | def __init__(self, name, host='localhost', port=6379, db=0, 24 | maxsize=0, lazy_limit=True, password=None): 25 | """ 26 | Constructor for RedisQueue 27 | 28 | maxsize: an integer that sets the upperbound limit on the number of 29 | items that can be placed in the queue. 30 | lazy_limit: redis queue is shared via instance, a lazy size limit is used 31 | for better performance. 32 | """ 33 | self.name = name 34 | self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password) 35 | self.maxsize = maxsize 36 | self.lazy_limit = lazy_limit 37 | self.last_qsize = 0 38 | 39 | def qsize(self): 40 | self.last_qsize = self.redis.llen(self.name) 41 | return self.last_qsize 42 | 43 | def empty(self): 44 | if self.qsize() == 0: 45 | return True 46 | else: 47 | return False 48 | 49 | def full(self): 50 | if self.maxsize and self.qsize() >= self.maxsize: 51 | return True 52 | else: 53 | return False 54 | 55 | def put_nowait(self, obj): 56 | if self.lazy_limit and self.last_qsize < self.maxsize: 57 | pass 58 | elif self.full(): 59 | raise self.Full 60 | self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj)) 61 | return True 62 | 63 | def put(self, obj, block=True, timeout=None): 64 | if not block: 65 | return self.put_nowait(obj) 66 | 67 | start_time = time.time() 68 | while True: 69 | try: 70 | return self.put_nowait(obj) 71 | except self.Full: 72 | if timeout: 73 | lasted = time.time() - start_time 74 | if timeout > lasted: 75 | time.sleep(min(self.max_timeout, timeout - lasted)) 76 | else: 77 | raise 78 | else: 79 | time.sleep(self.max_timeout) 80 | 81 | def get_nowait(self): 82 | ret = self.redis.lpop(self.name) 83 | if ret is None: 84 | raise self.Empty 85 | return umsgpack.unpackb(ret) 86 | 87 | def get(self, block=True, timeout=None): 88 | if not block: 89 | return self.get_nowait() 90 | 91 | start_time = time.time() 92 | while True: 93 | try: 94 | return self.get_nowait() 95 | except self.Empty: 96 | if timeout: 97 | lasted = time.time() - start_time 98 | if timeout > lasted: 99 | time.sleep(min(self.max_timeout, timeout - lasted)) 100 | else: 101 | raise 102 | else: 103 | time.sleep(self.max_timeout) 104 | 105 | Queue = RedisQueue 106 |
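A usage sketch for RedisQueue (the host/port and payload below are assumptions, and a reachable redis-server is required). Note that with lazy_limit=True the maxsize bound is approximate: put_nowait trusts the qsize cached from the last round-trip, trading strictness for fewer LLEN calls.

from pyspider.message_queue.redis_queue import RedisQueue

q = RedisQueue('demo', host='localhost', port=6379, db=0, maxsize=2)
q.put({'n': 1})
q.put({'n': 2})
print(q.qsize())  # 2 -- refreshes the cached size via LLEN
print(q.get())    # {'n': 1} -- FIFO: RPUSH on put, LPOP on get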
20 | """ 21 | 22 | def __init__(self, resultdb, inqueue): 23 | self.resultdb = resultdb 24 | self.inqueue = inqueue 25 | self._quit = False 26 | 27 | def on_result(self, task, result): 28 | '''Called every result''' 29 | if not result: 30 | return 31 | if 'taskid' in task and 'project' in task and 'url' in task: 32 | logger.info('result %s:%s %s -> %.30r' % ( 33 | task['project'], task['taskid'], task['url'], result)) 34 | return self.resultdb.save( 35 | project=task['project'], 36 | taskid=task['taskid'], 37 | url=task['url'], 38 | result=result 39 | ) 40 | else: 41 | logger.warning('result UNKNOW -> %.30r' % result) 42 | return 43 | 44 | def quit(self): 45 | self._quit = True 46 | 47 | def run(self): 48 | '''Run loop''' 49 | logger.info("result_worker starting...") 50 | 51 | while not self._quit: 52 | try: 53 | task, result = self.inqueue.get(timeout=1) 54 | self.on_result(task, result) 55 | except Queue.Empty as e: 56 | continue 57 | except KeyboardInterrupt: 58 | break 59 | except AssertionError as e: 60 | logger.error(e) 61 | continue 62 | except Exception as e: 63 | logger.exception(e) 64 | continue 65 | 66 | logger.info("result_worker exiting...") 67 | 68 | 69 | class OneResultWorker(ResultWorker): 70 | '''Result Worker for one mode, write results to stdout''' 71 | def on_result(self, task, result): 72 | '''Called every result''' 73 | if not result: 74 | return 75 | if 'taskid' in task and 'project' in task and 'url' in task: 76 | logger.info('result %s:%s %s -> %.30r' % ( 77 | task['project'], task['taskid'], task['url'], result)) 78 | print(json.dumps({ 79 | 'taskid': task['taskid'], 80 | 'project': task['project'], 81 | 'url': task['url'], 82 | 'result': result, 83 | 'updatetime': time.time() 84 | })) 85 | else: 86 | logger.warning('result UNKNOW -> %.30r' % result) 87 | return 88 | -------------------------------------------------------------------------------- /spider/pyspider/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler # NOQA 2 | -------------------------------------------------------------------------------- /spider/pyspider/scheduler/token_bucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-07 16:53:08 7 | 8 | import time 9 | try: 10 | import threading as _threading 11 | except ImportError: 12 | import dummy_threading as _threading 13 | 14 | 15 | class Bucket(object): 16 | 17 | ''' 18 | traffic flow control with token bucket 19 | ''' 20 | 21 | update_interval = 30 22 | 23 | def __init__(self, rate=1, burst=None): 24 | self.rate = float(rate) 25 | if burst is None: 26 | self.burst = float(rate) * 10 27 | else: 28 | self.burst = float(burst) 29 | self.mutex = _threading.Lock() 30 | self.bucket = self.burst 31 | self.last_update = time.time() 32 | 33 | def get(self): 34 | '''Get the number of tokens in bucket''' 35 | now = time.time() 36 | if self.bucket >= self.burst: 37 | self.last_update = now 38 | return self.bucket 39 | bucket = self.rate * (now - self.last_update) 40 | self.mutex.acquire() 41 | if bucket > 1: 42 | self.bucket += bucket 43 | if self.bucket > self.burst: 44 | self.bucket = self.burst 45 | self.last_update = now 46 | self.mutex.release() 47 | return self.bucket 48 | 49 | def set(self, value): 50 | '''Set number of 
-------------------------------------------------------------------------------- /spider/pyspider/webui/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-22 23:20:40 7 | 8 | from . import app, index, debug, task, result, login 9 |
-------------------------------------------------------------------------------- /spider/pyspider/webui/bench_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-08 22:31:17 7 | 8 | import random 9 | try: 10 | from urllib import urlencode 11 | except ImportError: 12 | from urllib.parse import urlencode 13 | 14 | from flask import request 15 | from .app import app 16 | 17 | 18 | @app.route('/bench') 19 | def bench_test(): 20 | total = int(request.args.get('total', 10000)) 21 | show = int(request.args.get('show', 20)) 22 | nlist = [random.randint(1, total) for _ in range(show)] 23 | result = [] 24 | result.append("<html><body>") 25 | args = dict(request.args) 26 | for nl in nlist: 27 | args['n'] = nl 28 | argstr = urlencode(sorted(args.items()), doseq=True) 29 | result.append("<a href='/bench?{0}'>follow {1}</a><br>".format(argstr, nl)) 30 | result.append("</body></html>") 31 | return "".join(result) 32 |
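login.py below authenticates requests from the HTTP Basic Authorization header (base64 of "user:password" after the "Basic " prefix). A hedged client-side sketch; the URL, port, and credentials are placeholders, and the requests library is assumed:

import base64
import requests

token = base64.b64encode(b'admin:secret').decode('utf8')
resp = requests.get('http://localhost:5000/results',
                    headers={'Authorization': 'Basic ' + token})
print(resp.status_code)  # 401 via app.login_response unless credentials match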
".format(argstr, nl)) 30 | result.append("") 31 | return "".join(result) 32 | -------------------------------------------------------------------------------- /spider/pyspider/webui/login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-10 20:36:27 7 | 8 | import base64 9 | from flask import Response 10 | from flask.ext import login 11 | from .app import app 12 | 13 | login_manager = login.LoginManager() 14 | login_manager.init_app(app) 15 | 16 | 17 | class AnonymousUser(login.AnonymousUserMixin): 18 | 19 | def is_anonymous(self): 20 | return True 21 | 22 | def is_active(self): 23 | return False 24 | 25 | def is_authenticated(self): 26 | return False 27 | 28 | def get_id(self): 29 | return 30 | 31 | 32 | class User(login.UserMixin): 33 | 34 | def __init__(self, id, password): 35 | self.id = id 36 | self.password = password 37 | 38 | def is_authenticated(self): 39 | if not app.config.get('webui_username'): 40 | return True 41 | if self.id == app.config.get('webui_username') \ 42 | and self.password == app.config.get('webui_password'): 43 | return True 44 | return False 45 | 46 | def is_active(self): 47 | return self.is_authenticated() 48 | 49 | 50 | login_manager.anonymous_user = AnonymousUser 51 | 52 | 53 | @login_manager.request_loader 54 | def load_user_from_request(request): 55 | api_key = request.headers.get('Authorization') 56 | if api_key: 57 | api_key = api_key[len("Basic "):] 58 | try: 59 | api_key = base64.b64decode(api_key).decode('utf8') 60 | return User(*api_key.split(":", 1)) 61 | except Exception as e: 62 | app.logger.error('wrong api key: %r, %r', api_key, e) 63 | return None 64 | return None 65 | app.login_response = Response( 66 | "need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'} 67 | ) 68 | 69 | 70 | @app.before_request 71 | def before_request(): 72 | if app.config.get('need_auth', False): 73 | if not login.current_user.is_active(): 74 | return app.login_response 75 | -------------------------------------------------------------------------------- /spider/pyspider/webui/result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 16:23:55 7 | 8 | from __future__ import unicode_literals 9 | 10 | from flask import render_template, request, json 11 | from flask import Response 12 | from .app import app 13 | from pyspider.libs import result_dump 14 | 15 | 16 | @app.route('/results') 17 | def result(): 18 | resultdb = app.config['resultdb'] 19 | project = request.args.get('project') 20 | offset = int(request.args.get('offset', 0)) 21 | limit = int(request.args.get('limit', 20)) 22 | 23 | count = resultdb.count(project) 24 | results = list(resultdb.select(project, offset=offset, limit=limit)) 25 | 26 | return render_template( 27 | "result.html", count=count, results=results, 28 | result_formater=result_dump.result_formater, 29 | project=project, offset=offset, limit=limit, json=json 30 | ) 31 | 32 | 33 | @app.route('/results/dump/.<_format>') 34 | def dump_result(project, _format): 35 | resultdb = app.config['resultdb'] 36 | # force update project list 37 | resultdb.get(project, 'any') 38 | if project not in resultdb.projects: 39 | return "no 
such project.", 404 40 | 41 | offset = int(request.args.get('offset', 0)) or None 42 | limit = int(request.args.get('limit', 0)) or None 43 | results = resultdb.select(project, offset=offset, limit=limit) 44 | 45 | if _format == 'json': 46 | valid = request.args.get('style', 'rows') == 'full' 47 | return Response(result_dump.dump_as_json(results, valid), 48 | mimetype='application/json') 49 | elif _format == 'txt': 50 | return Response(result_dump.dump_as_txt(results), 51 | mimetype='text/plain') 52 | elif _format == 'csv': 53 | return Response(result_dump.dump_as_csv(results), 54 | mimetype='text/csv') 55 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/index.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-02-23 00:28:30 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | h1 { 10 | margin-top: 5px; 11 | } 12 | header .alert { 13 | position: absolute; 14 | width: 50rem; 15 | left: 50%; 16 | margin-left: -25rem; 17 | } 18 | .queue-info th, 19 | .queue-info td { 20 | text-align: center; 21 | border: 1px solid #ddd; 22 | } 23 | .projects { 24 | min-width: 850px; 25 | border-top: 1px solid #ddd; 26 | border-bottom: 1px solid #ddd; 27 | } 28 | .projects .project-group { 29 | width: 80px; 30 | } 31 | .projects .project-name { 32 | font-weight: bold; 33 | } 34 | .projects .project-status { 35 | width: 100px; 36 | } 37 | .projects .project-status > span { 38 | border: solid 1px #666666; 39 | padding: 1px 5px 0 5px; 40 | background: #808080; 41 | color: white; 42 | } 43 | .projects span.status-TODO { 44 | border: solid 1px #ec971f; 45 | padding: 1px 5px 0 5px; 46 | background: #f0ad4e; 47 | color: white; 48 | } 49 | .projects span.status-STOP { 50 | border: solid 1px #c9302c; 51 | padding: 1px 5px 0 5px; 52 | background: #d9534f; 53 | color: white; 54 | } 55 | .projects span.status-CHECKING { 56 | border: solid 1px #dcbe00; 57 | padding: 1px 5px 0 5px; 58 | background: #ffde10; 59 | color: white; 60 | } 61 | .projects span.status-DEBUG { 62 | border: solid 1px #3071a9; 63 | padding: 1px 5px 0 5px; 64 | background: #428bca; 65 | color: white; 66 | } 67 | .projects span.status-RUNNING { 68 | border: solid 1px #449d44; 69 | padding: 1px 5px 0 5px; 70 | background: #5cb85c; 71 | color: white; 72 | } 73 | .projects .project-rate { 74 | width: 110px; 75 | } 76 | .projects .project-time { 77 | width: 110px; 78 | } 79 | .projects th.project-progress { 80 | position: relative; 81 | } 82 | .projects th.project-progress span { 83 | position: absolute; 84 | } 85 | .projects td.project-progress { 86 | position: relative; 87 | min-width: 5%; 88 | } 89 | .projects td.project-progress.progress-all { 90 | min-width: 10%; 91 | } 92 | .projects td.project-progress .progress { 93 | position: relative; 94 | margin: 0; 95 | background-color: #aaa; 96 | } 97 | .projects td.project-progress .progress .progress-text { 98 | width: 100%; 99 | text-align: center; 100 | position: absolute; 101 | font-weight: bold; 102 | color: #fff; 103 | pointer-events: none; 104 | } 105 | .projects td.project-progress .progress .progress-bar { 106 | -webkit-transition: none; 107 | transition: none; 108 | } 109 | .projects .project-actions { 110 | width: 200px; 111 | } 112 | .global-btn { 113 | margin-top: -5px; 
114 | padding: 10px 10px 10px 10px; 115 | } 116 | .global-btn .create-btn-div { 117 | float: right; 118 | } 119 | .global-btn .active-btn-div { 120 | float: left; 121 | } 122 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/index.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-02-23 00:28:30 */ 5 | 6 | @import "variable"; 7 | 8 | h1 { 9 | margin-top: 5px; 10 | } 11 | 12 | header .alert { 13 | position: absolute; 14 | width: 50rem; 15 | left: 50%; 16 | margin-left: -25rem; 17 | } 18 | 19 | .queue-info { 20 | th, td { 21 | text-align: center; 22 | border: 1px solid #ddd; 23 | } 24 | } 25 | 26 | .projects { 27 | min-width: 850px; 28 | border-top: 1px solid #ddd; 29 | border-bottom: 1px solid #ddd; 30 | 31 | .project-group { 32 | width: 80px; 33 | } 34 | 35 | .project-name { 36 | font-weight: bold; 37 | } 38 | 39 | .project-status { 40 | width: 100px; 41 | } 42 | .project-status-span(@color) { 43 | border: solid 1px darken(@color, 10%); 44 | padding: 1px 5px 0 5px; 45 | background: @color; 46 | color: white; 47 | } 48 | .project-status>span { 49 | .project-status-span(lighten(black, 50%)); 50 | } 51 | span.status-TODO { 52 | .project-status-span(@orange); 53 | } 54 | span.status-STOP { 55 | .project-status-span(@red); 56 | } 57 | span.status-CHECKING { 58 | .project-status-span(darken(@yellow, 10%)); 59 | } 60 | span.status-DEBUG { 61 | .project-status-span(@blue); 62 | } 63 | span.status-RUNNING { 64 | .project-status-span(@green); 65 | } 66 | 67 | .project-rate { 68 | width: 110px; 69 | } 70 | 71 | .project-time { 72 | width: 110px; 73 | } 74 | 75 | th.project-progress { 76 | position: relative; 77 | span { 78 | position: absolute; 79 | } 80 | } 81 | 82 | td.project-progress { 83 | position: relative; 84 | min-width: 5%; 85 | &.progress-all { 86 | min-width: 10%; 87 | } 88 | 89 | .progress { 90 | position: relative; 91 | margin: 0; 92 | background-color: #aaa; 93 | .progress-text { 94 | width: 100%; 95 | text-align: center; 96 | position: absolute; 97 | font-weight: bold; 98 | color: #fff; 99 | pointer-events: none; 100 | } 101 | .progress-bar { 102 | -webkit-transition: none; 103 | transition: none; 104 | } 105 | } 106 | } 107 | 108 | .project-actions { 109 | width: 200px; 110 | } 111 | } 112 | 113 | .global-btn { 114 | margin-top: -5px; 115 | padding: 10px 10px 10px 10px; 116 | 117 | .create-btn-div { 118 | float: right; 119 | } 120 | 121 | .active-btn-div { 122 | float: left; 123 | } 124 | } 125 | 126 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/result.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-10-22 22:38:45 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | .top-bar { 10 | padding: 10px 15px 2px 15px; 11 | height: 46px; 12 | background-color: #f5f5f5; 13 | border-bottom: 1px solid #ddd; 14 | position: relative; 15 | } 16 | .top-bar h1 { 17 | margin: 0 0 10px 0; 18 | font-size: 18px; 19 | } 20 | .top-bar .btn-group { 21 | margin: 8px 10px 0 0; 22 | position: absolute; 23 | right: 0; 24 | top: 0; 25 | } 26 | 
.pagination-wrap { 27 | text-align: right; 28 | padding-right: 15px; 29 | } 30 | table { 31 | border-bottom: 1px solid #ddd; 32 | } 33 | table td { 34 | word-break: break-all; 35 | } 36 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/result.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-10-22 22:38:45 */ 5 | 6 | @import "variable"; 7 | 8 | .top-bar { 9 | padding: 10px 15px 2px 15px; 10 | height: 46px; 11 | background-color: #f5f5f5; 12 | border-bottom: 1px solid #ddd; 13 | position: relative; 14 | 15 | h1 { 16 | margin: 0 0 10px 0; 17 | font-size: 18px; 18 | } 19 | 20 | .btn-group { 21 | margin: 8px 10px 0 0; 22 | position: absolute; 23 | right: 0; 24 | top: 0; 25 | 26 | a.btn { 27 | } 28 | } 29 | } 30 | 31 | .pagination-wrap { 32 | text-align: right; 33 | padding-right: 15px; 34 | } 35 | 36 | table { 37 | border-bottom: 1px solid #ddd; 38 | 39 | td { 40 | word-break: break-all; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/task.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:20:30 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | .base-info { 10 | padding: 10px 15px 2px 15px; 11 | background-color: #f5f5f5; 12 | } 13 | .more-info { 14 | padding: 10px 15px; 15 | border-top: 1px solid #ddd; 16 | } 17 | .more-info dd { 18 | display: block; 19 | font-family: monospace; 20 | white-space: pre; 21 | word-break: break-all; 22 | word-wrap: break-word; 23 | margin: 1em 0px; 24 | } 25 | .status-1 { 26 | border: solid 1px #3071a9; 27 | padding: 1px 5px 0 5px; 28 | background: #428bca; 29 | color: white; 30 | } 31 | .status-2 { 32 | border: solid 1px #449d44; 33 | padding: 1px 5px 0 5px; 34 | background: #5cb85c; 35 | color: white; 36 | } 37 | .status-3 { 38 | border: solid 1px #c9302c; 39 | padding: 1px 5px 0 5px; 40 | background: #d9534f; 41 | color: white; 42 | } 43 | .status-4 { 44 | border: solid 1px #666666; 45 | padding: 1px 5px 0 5px; 46 | background: #808080; 47 | color: white; 48 | } 49 | .url { 50 | font-size: 120%; 51 | text-decoration: underline; 52 | } 53 | .callback { 54 | color: #f0ad4e; 55 | font-weight: bold; 56 | } 57 | .callback:hover, 58 | .callback:focus { 59 | color: #ec971f; 60 | } 61 | dt .glyphicon-ok { 62 | color: #5cb85c; 63 | } 64 | dt .glyphicon-remove { 65 | color: #d9534f; 66 | } 67 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/task.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:20:30 */ 5 | 6 | @import "variable"; 7 | 8 | .base-info { 9 | padding: 10px 15px 2px 15px; 10 | background-color: #f5f5f5; 11 | border-bottom: 1px solid #ddd; 12 | } 13 | 14 | .more-info { 15 | padding: 10px 15px; 16 | } 17 | 18 | .more-info dd { 19 | display: block; 20 | font-family: monospace; 21 | white-space: pre; 22 | word-break: break-all; 23 | word-wrap: 
break-word; 24 | margin: 1em 0px; 25 | } 26 | 27 | .status_mix(@color: lighten(black, 50%)) { 28 | border: solid 1px darken(@color, 10%); 29 | padding: 1px 5px 0 5px; 30 | background: @color; 31 | color: white; 32 | } 33 | .status { 34 | &-1 { 35 | .status_mix(@blue); 36 | } 37 | &-2 { 38 | .status_mix(@green); 39 | } 40 | &-3 { 41 | .status_mix(@red); 42 | } 43 | &-4 { 44 | .status_mix; 45 | } 46 | } 47 | 48 | .url { 49 | font-size: 120%; 50 | text-decoration: underline; 51 | } 52 | 53 | .callback { 54 | color: @orange; 55 | font-weight: bold; 56 | 57 | &:hover, &:focus { 58 | color: darken(@orange, 10%); 59 | } 60 | } 61 | 62 | dt .glyphicon-ok { 63 | color: @green; 64 | } 65 | dt .glyphicon-remove { 66 | color: @red; 67 | } 68 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/tasks.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-18 23:20:46 */ 5 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 6 | /* Author: Binux */ 7 | /* http://binux.me */ 8 | /* Created on 2014-07-16 19:18:30 */ 9 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 10 | /* Author: Binux */ 11 | /* http://binux.me */ 12 | /* Created on 2014-07-16 19:20:30 */ 13 | .base-info { 14 | padding: 10px 15px 2px 15px; 15 | background-color: #f5f5f5; 16 | border-bottom: 1px solid #ddd; 17 | } 18 | .more-info { 19 | padding: 10px 15px; 20 | } 21 | .more-info dd { 22 | display: block; 23 | font-family: monospace; 24 | white-space: pre; 25 | word-break: break-all; 26 | word-wrap: break-word; 27 | margin: 1em 0px; 28 | } 29 | .status-1 { 30 | border: solid 1px #3071a9; 31 | padding: 1px 5px 0 5px; 32 | background: #428bca; 33 | color: white; 34 | } 35 | .status-2 { 36 | border: solid 1px #449d44; 37 | padding: 1px 5px 0 5px; 38 | background: #5cb85c; 39 | color: white; 40 | } 41 | .status-3 { 42 | border: solid 1px #c9302c; 43 | padding: 1px 5px 0 5px; 44 | background: #d9534f; 45 | color: white; 46 | } 47 | .status-4 { 48 | border: solid 1px #666666; 49 | padding: 1px 5px 0 5px; 50 | background: #808080; 51 | color: white; 52 | } 53 | .url { 54 | font-size: 120%; 55 | text-decoration: underline; 56 | } 57 | .callback { 58 | color: #f0ad4e; 59 | font-weight: bold; 60 | } 61 | .callback:hover, 62 | .callback:focus { 63 | color: #ec971f; 64 | } 65 | dt .glyphicon-ok { 66 | color: #5cb85c; 67 | } 68 | dt .glyphicon-remove { 69 | color: #d9534f; 70 | } 71 | .tasks { 72 | margin: 0; 73 | padding: 0; 74 | list-style-type: none; 75 | } 76 | .tasks li { 77 | padding: 10px 15px 2px 15px; 78 | background-color: #f5f5f5; 79 | border-bottom: 1px solid #ddd; 80 | } 81 | .tasks li:nth-child(even) { 82 | background-color: white; 83 | } 84 | .tasks .url { 85 | display: inline-block; 86 | vertical-align: bottom; 87 | max-width: 40em; 88 | overflow: hidden; 89 | white-space: nowrap; 90 | text-overflow: ellipsis; 91 | } 92 | .tasks .update-time { 93 | font-weight: bold; 94 | } 95 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/tasks.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-18 23:20:46 */ 5 | 6 | @import "variable"; 7 | @import "task"; 8 | 9 | .tasks { 10 | margin: 0; 11 | 
padding: 0; 12 | list-style-type: none; 13 | 14 | li { 15 | .base-info; 16 | 17 | &:nth-child(even) { 18 | background-color: white; 19 | } 20 | } 21 | 22 | .url { 23 | display: inline-block; 24 | vertical-align: bottom; 25 | max-width: 40em; 26 | overflow: hidden; 27 | white-space: nowrap; 28 | text-overflow: ellipsis; 29 | } 30 | 31 | .update-time { 32 | font-weight: bold; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spider/pyspider/webui/static/variable.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:18:30 */ 5 | 6 | // colors 7 | @gray-darker: lighten(#000, 13.5%); // #222 8 | @gray-dark: lighten(#000, 20%); // #333 9 | @gray: lighten(#000, 33.5%); // #555 10 | @gray-light: lighten(#000, 60%); // #999 11 | @gray-lighter: lighten(#000, 93.5%); // #eee 12 | 13 | @blue: #428bca; 14 | @green: #5cb85c; 15 | @blue-light: #5bc0de; 16 | @orange: #f0ad4e; 17 | @yellow: #ffe543; 18 | @red: #d9534f; 19 | -------------------------------------------------------------------------------- /spider/pyspider/webui/task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-16 15:30:57 7 | 8 | import socket 9 | from flask import abort, render_template, request, json 10 | 11 | from pyspider.libs import utils 12 | from .app import app 13 | 14 | 15 | @app.route('/task/<taskid>') 16 | def task(taskid): 17 | if ':' not in taskid: 18 | abort(400) 19 | project, taskid = taskid.split(':', 1) 20 | 21 | taskdb = app.config['taskdb'] 22 | task = taskdb.get_task(project, taskid) 23 | if not task: 24 | abort(404) 25 | resultdb = app.config['resultdb'] 26 | result = resultdb.get(project, taskid) if resultdb else None 27 | 28 | 29 | return render_template("task.html", task=task, json=json, result=result, 30 | status_to_string=app.config['taskdb'].status_to_string) 31 | 32 | 33 | @app.route('/tasks') 34 | def tasks(): 35 | rpc = app.config['scheduler_rpc'] 36 | taskdb = app.config['taskdb'] 37 | project = request.args.get('project', "") 38 | limit = int(request.args.get('limit', 100)) 39 | 40 | try: 41 | updatetime_tasks = rpc.get_active_tasks(project, limit) 42 | except socket.error as e: 43 | app.logger.warning('connect to scheduler rpc error: %r', e) 44 | return 'connect to scheduler error', 502 45 | 46 | tasks = {} 47 | result = [] 48 | for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]): 49 | key = '%(project)s:%(taskid)s' % task 50 | task['updatetime'] = updatetime 51 | if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE: 52 | result.append(tasks[key]) 53 | tasks[key] = task 54 | result.extend(tasks.values()) 55 | 56 | return render_template( 57 | "tasks.html", 58 | tasks=result, 59 | status_to_string=taskdb.status_to_string 60 | ) 61 | 62 | 63 | @app.route('/active_tasks') 64 | def active_tasks(): 65 | rpc = app.config['scheduler_rpc'] 66 | taskdb = app.config['taskdb'] 67 | project = request.args.get('project', "") 68 | limit = int(request.args.get('limit', 100)) 69 | 70 | try: 71 | tasks = rpc.get_active_tasks(project, limit) 72 | except socket.error as e: 73 | app.logger.warning('connect to scheduler rpc error: %r', e) 74 | return '{}', 502, {'Content-Type': 
'application/json'} 75 | 76 | result = [] 77 | for updatetime, task in tasks: 78 | task['updatetime'] = updatetime 79 | task['updatetime_text'] = utils.format_date(updatetime) 80 | if 'status' in task: 81 | task['status_text'] = taskdb.status_to_string(task['status']) 82 | result.append(task) 83 | 84 | return json.dumps(result), 200, {'Content-Type': 'application/json'} 85 | 86 | app.template_filter('format_date')(utils.format_date) 87 | -------------------------------------------------------------------------------- /spider/pyspider/webui/templates/helper.html: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /spider/pyspider/webui/templates/helper.js: -------------------------------------------------------------------------------- 1 | // vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: 2 | // Author: Binux 3 | // http://binux.me 4 | // Created on 2014-03-16 11:05:05 5 | 6 | (function() { 7 | var loaded = false; 8 | var start_time = (new Date()).getTime(); 9 | function resize() { 10 | if (!loaded) 11 | parent.postMessage({type: 'resize', height: document.body.scrollHeight}, '*'); 12 | } 13 | 14 | window.addEventListener('load', function() { 15 | resize(); 16 | loaded = true; 17 | }); 18 | setTimeout(resize, 5000); 19 | setTimeout(resize, 10000); 20 | setTimeout(resize, 20000); 21 | setTimeout(resize, 30000); 22 | 23 | var css_helper_enabled = false; 24 | window.addEventListener("message", function(ev) { 25 | if (!css_helper_enabled && ev.data.type == "enable_css_selector_helper") { 26 | var script = document.createElement("script"); 27 | script.src = "//{{ host }}/static/css_selector_helper.js"; 28 | document.body.appendChild(script); 29 | css_helper_enabled = true; 30 | } 31 | }, false); 32 | 33 | document.addEventListener('click', function(ev) { 34 | ev.preventDefault(); 35 | }); 36 | })(); 37 | -------------------------------------------------------------------------------- /spider/pyspider/webui/templates/tasks.html: -------------------------------------------------------------------------------- 5 | Tasks - pyspider 
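{# Each iteration below renders one task entry: the status badge uses task.status when present, otherwise falls back to FETCH_ERROR / PROCESS_ERROR derived from the track flags, and shows ERROR when neither is available. #}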
    21 | {% for task in tasks | sort(reverse=True, attribute='updatetime') %} 22 |
23 | {% if task.status %} 24 | {{ status_to_string(task.status) }} 25 | {% elif task.track %} 26 | 27 | {% set fetchok = task.track.fetch and task.track.fetch.ok %} 28 | {% set processok = task.track.process and task.track.process.ok %} 29 | {%- if not fetchok -%} 30 | FETCH_ERROR 31 | {%- elif not processok -%} 32 | PROCESS_ERROR 33 | {%- endif -%} 34 | 35 | {% else %} 36 | ERROR 37 | {% endif %} 38 | 39 | {{ task.project }} 40 | 41 | {{ task.url }} 42 | 43 | {{ task.updatetime | format_date }} 44 | 45 | {% if task.track and task.track.fetch %} 46 | 47 | {{- '%.1f' | format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms 48 | 49 | {% endif %} 50 | 51 | 52 | {% if task.track and task.track.process %} 53 | +{{ task.track.process.follows | int }} 54 | {% endif %} 55 | 56 | 
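{# The timing cell above converts the fetch and process durations recorded in task.track from seconds to milliseconds for display. #}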
57 | {% endfor %} 58 | -------------------------------------------------------------------------------- /spider/pyspiderSource/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/.DS_Store -------------------------------------------------------------------------------- /spider/pyspiderSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/__all__/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/__all__/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_baby/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_baby/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_car/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_car/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_discovery/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_discovery/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_essay/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_essay/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_fashion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_fashion/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_finance/README.md -------------------------------------------------------------------------------- 
/spider/pyspiderSource/news_food/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_food/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_game/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_game/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_history/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_history/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_hot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_hot/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_military/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_regimen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_regimen/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_society/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_story/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_story/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_tech/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_travel/README.md: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_travel/README.md -------------------------------------------------------------------------------- /spider/pyspiderSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/pyspiderSource/news_world/README.md -------------------------------------------------------------------------------- /spider/sina/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sina/README.md -------------------------------------------------------------------------------- /spider/sina/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/sina/sina.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'Jeezy' 3 | 4 | import requests 5 | import re 6 | 7 | class GetSina(): 8 | ''' 9 | Fetch news info via the Sina rolling-news API and return it for saving to Excel. 10 | ''' 11 | def __init__(self,num,page): 12 | self.num = str(num) 13 | self.page = str(page) 14 | self.url = "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&k=&num="+self.num+"&asc=&page="+self.page+"&r=0.41627189057293945" 15 | def getNews(self): 16 | # crawl the Sina news text content via the API 17 | gettext = requests.get(self.url) 18 | gettext.encoding='gbk' 19 | gettext = gettext.text 20 | allNewsData = [] 21 | pattern = re.compile('channel : {title : "(.*?)",id.*?title : "(.*?)",url : "(.*?)",type.*?time : (.*?)}',re.S) 22 | items = re.findall(pattern,gettext) 23 | for eachData in items: 24 | newsData = {} 25 | newsData["tag"] = eachData[0] 26 | newsData["title"] = eachData[1] 27 | newsData["display_url"] = eachData[2] 28 | newsData["display_time"] = eachData[3] 29 | newsData["source"] = "新浪新闻" 30 | allNewsData.append(newsData) 31 | return allNewsData 32 | #if allNewsData[0]['tag']== "体育": 33 | #print (allNewsData) 34 | 35 | #sina =GetSina(5,1) 36 | #sina.getNews() -------------------------------------------------------------------------------- /spider/sinaSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_finance/README.md -------------------------------------------------------------------------------- 
/spider/sinaSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_military/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_society/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_tech/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/sinaSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/sinaSource/news_world/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/__all__/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/__all__/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/gallery_detail/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/gallery_detail/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_baby/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_baby/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_car/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_car/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_car/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_car/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_discovery/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_discovery/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_entertainment/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_entertainment/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_entertainment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_entertainment/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_essay/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_essay/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_fashion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_fashion/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_finance/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_finance/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_finance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_finance/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_food/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_food/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_game/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_game/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_history/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_history/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_hot/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_hot/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_military/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_military/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_regimen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_regimen/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_society/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_society/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_society/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_society/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_sports/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_sports/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_sports/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_sports/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_story/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_story/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_tech/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_tech/.DS_Store -------------------------------------------------------------------------------- /spider/touTiaoSource/news_tech/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_tech/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_travel/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_travel/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/news_world/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/news_world/README.md -------------------------------------------------------------------------------- /spider/touTiaoSource/video/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/touTiaoSource/video/README.md -------------------------------------------------------------------------------- /spider/toutiao/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | __author__ = 'howie' -------------------------------------------------------------------------------- /spider/toutiao/touTiao.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | 4 | import sys, time, json, requests 5 | 6 | 7 | class GetToutiao(): 8 | """ 9 | Fetch news info via the Jinri Toutiao API and save it to a local Excel file. 10 | """ 11 | 12 | def __init__(self, count, category, time): 13 | self.count = count 14 | self.category = category 15 | self.time = time 16 | self.url = "http://toutiao.com/api/article/recent/?count=" + count + "&category=" + category + "&as=A1A5177BB0F7063&cp=57B0776066D39E1&max_create_time=1471155832&_=" + str( 17 | time) 18 | 19 | def getNews(self): 20 | print(self.url) 21 | try: 22 | header = { 23 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'} 24 | root = requests.get("http://toutiao.com/", headers=header) 25 | news = requests.get(self.url, headers=header, cookies=root.cookies) 26 | allNewsData = [] 27 | try: 28 | news = str(news.text).strip("'<>() ").replace('\'', '\"') 29 | newsJson = json.loads(news) 30 | if newsJson["data"]: 31 | for eachData in newsJson["data"]: 32 | newsData = {} 33 | newsData["title"] = eachData["title"] 34 | newsData["display_url"] = eachData["display_url"] 35 | newsData["display_time"] = eachData["display_time"] 36 | newsData["source"] = eachData["source"] 37 | newsData["keywords"] = eachData["keywords"] 38 | newsData["abstract"] = eachData["abstract"] 39 | if "middle_image" in eachData.keys(): 40 | newsData["images"] = eachData["middle_image"] 41 | else: 42 | newsData["images"] = "null" 43 | newsData["tag"] = eachData["tag"] 44 | allNewsData.append(newsData) 45 | else: 46 | exit("no data!") 47 | except: 48 | print(repr(news)) 49 | print(sys.exc_info()) 50 | return allNewsData 51 | except ConnectionError: 52 | exit("ConnectionError") 53 | 54 | # for i in range(1,20): 55 | # get = GetToutiao("30", "news_society", time.time()) 56 | # allNewsData = get.getNews() 57 | # for i in allNewsData: 58 | # print(i) 59 | 
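For quick reference, a minimal driver for GetToutiao. This is a sketch only: it mirrors the commented-out usage at the bottom of touTiao.py, assumes it is run from the project root so that the spider.toutiao package is importable, and assumes the Toutiao endpoint still accepts the hard-coded as/cp tokens in GetToutiao.__init__ (if they have expired, getNews() exits with "no data!").

# runToutiaoSample.py -- hypothetical driver, not part of this repository
import time

from spider.toutiao.touTiao import GetToutiao

if __name__ == "__main__":
    # 30 items from the society channel, anchored at the current timestamp
    toutiao = GetToutiao("30", "news_society", time.time())
    for news in toutiao.getNews():
        # each record is a dict assembled in GetToutiao.getNews()
        print(news["display_time"], news["title"], news["display_url"])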
-------------------------------------------------------------------------------- /spider/toutiao/touTiaoSpider.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | __author__ = 'howie' 3 | from xlsxwriter import Workbook 4 | import time 5 | import datetime 6 | import os 7 | from random import choice 8 | from spider.toutiao.touTiao import GetToutiao 9 | from config.n_conf import dirPath 10 | 11 | 12 | def getToutiaoNews(category, page, num): 13 | """ 14 | Des: fetch Jinri Toutiao news 15 | param: 16 | category: news category, defaults to __all__ 17 | page: number of pages to crawl, 20 by default 18 | num: items per page, varies with what Toutiao returns per page, defaults to 20 19 | ctime: news timestamp, obtained via the standard library time.time() 20 | return: Excel files in the per-category folders under /source/ 21 | """ 22 | newsData = [] 23 | for page in range(0, page): 24 | # ltime = [time.time(),"1464710423","1464796865","1464753667","1464840044","1464883266"] 25 | # ctime = choice(ltime) 26 | # print(ctime) 27 | # get the time one day ago 28 | twoDayAgo = (datetime.datetime.now() - datetime.timedelta(days=1)) 29 | # convert it to a timestamp: 30 | timeStamp = int(time.mktime(twoDayAgo.timetuple())) 31 | ctime = choice(range(timeStamp, int(time.time()))) 32 | toutiao = GetToutiao(str(num), category, ctime) 33 | allNewsData = toutiao.getNews() 34 | for news in allNewsData: 35 | newsData.append(news) 36 | mkExcel(category, newsData) 37 | 38 | 39 | def getTimestamp(startTime): 40 | """ 41 | Des: convert a time string into a timestamp 42 | param: startTime="2016-05-17 12:00:00" (format) 43 | return: timeStamp 44 | """ 45 | timeArray = time.strptime(startTime, "%Y-%m-%d %H:%M:%S") 46 | timeStamp = int(time.mktime(timeArray)) 47 | return timeStamp 48 | 49 | 50 | def mkExcel(cate, data): 51 | """ 52 | Write the news data into an Excel sheet. 53 | :param cate: news category 54 | :param data: crawled news data 55 | :return: the generated Excel file 56 | """ 57 | # build the Excel file name 58 | excelName = dirPath + "/spider/touTiaoSource/" + cate + "/" + str( 59 | time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())) + "&" + cate + "&" + str(len(data)) + ".xlsx" 60 | # create the workbook and worksheet 61 | jr_work = Workbook(excelName) 62 | jr_sheet = jr_work.add_worksheet("toutiao") 63 | bold = jr_work.add_format({'bold': True}) # a bold cell format 64 | jr_sheet.set_column('A:H', 40) 65 | jr_sheet.set_column('C:D', 15) 66 | jr_sheet.write(0, 0, '标题', bold) 67 | jr_sheet.write(0, 1, '发表地址', bold) 68 | jr_sheet.write(0, 2, '发表时间', bold) 69 | jr_sheet.write(0, 3, '来源', bold) 70 | jr_sheet.write(0, 4, '关键词', bold) 71 | jr_sheet.write(0, 5, '摘要', bold) 72 | jr_sheet.write(0, 6, '图片地址', bold) 73 | jr_sheet.write(0, 7, '标签', bold) 74 | line = 0 75 | for eachData in data: 76 | line += 1 77 | jr_sheet.write(line, 0, eachData["title"]) 78 | jr_sheet.write(line, 1, eachData["display_url"]) 79 | jr_sheet.write(line, 2, eachData["display_time"]) 80 | jr_sheet.write(line, 3, eachData["source"]) 81 | jr_sheet.write(line, 4, eachData["keywords"]) 82 | jr_sheet.write(line, 5, eachData["abstract"]) 83 | jr_sheet.write(line, 6, str(eachData["images"])) 84 | jr_sheet.write(line, 7, eachData["tag"]) 85 | jr_work.close() 86 | log = "%s新闻表抓取完成,抓取数据%d条" % (excelName, line) 87 | 88 | with open(dirPath+"/log.txt", 'a') as fp: 89 | fp.write(log + "\n") 90 | print(log) 91 | 92 | 93 | # news categories 94 | category = ["news_society", "news_entertainment", 95 | "news_tech", "news_car", "news_sports", "news_finance", "news_military", "news_world", 96 | "news_fashion", "news_travel", "news_discovery", "news_baby", "news_regimen", "news_story", 97 | "news_essay", "news_game", "news_history", "news_food"] 98 | -------------------------------------------------------------------------------- /spider/wordAna/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/.DS_Store -------------------------------------------------------------------------------- /spider/wordAna/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/__init__.py -------------------------------------------------------------------------------- /spider/wordAna/allNews/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/allNews/.DS_Store -------------------------------------------------------------------------------- /spider/wordAna/allNews/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/allNews/README.md -------------------------------------------------------------------------------- /spider/wordAna/contentSpider.py: -------------------------------------------------------------------------------- 1 | import jieba 2 | import jieba.analyse 3 | import os 4 | import re 5 | from spider.wordAna.contentTool import ContentOperator 6 | from spider.wordAna.excelTool import ExcelOperator 7 | from config.n_conf import dirPath 8 | 9 | 10 | def getNewsContent(): 11 | """ 12 | Fetch the full content of every crawled news item, extract keywords, and save the enriched rows to new Excel files. 13 | """ 14 | # absolute path of the source directory 15 | orgDir = dirPath + "/spider/wordAna/allNews" 16 | # absolute path of the final output directory 17 | finalDir = dirPath + "/spider/wordAna/wordAnaNews/" 18 | print(orgDir) 19 | print(finalDir) 20 | 21 | # Excel helper from excelTool.py 22 | et = ExcelOperator() 23 | # content helper from contentTool.py 24 | ct = ContentOperator() 25 | files = [x for x in os.listdir(orgDir) if os.path.splitext(x)[-1] == '.xlsx'] 26 | 27 | # iterate over the per-category news Excel files 28 | for file in files: 29 | # print(file) 30 | # all rows of this Excel file: a list whose elements are dicts mapping column name -> value 31 | infoList = et.getExcelInfo(os.path.join(orgDir, file)) 32 | # collects the fully populated news records 33 | last_list = [] 34 | for new_info in infoList: 35 | urlstr = new_info["display_url"] 36 | # tell the link's origin apart, Toutiao vs. Sina, by the keywords toutiao.com and sina.com 37 | htmlContent, textContent, title, abstract, keywords, source, tag = '', '', '', '', '', '', '' 38 | img_url_list = [] 39 | try: 40 | # skip Sina slideshow and video/multimedia items here 41 | if urlstr.find("sina.com") != -1 and urlstr.find("slide") == -1 and urlstr.find("video") == -1: 42 | print(urlstr) 43 | textContent, htmlContent, img_url_list, keyword_list, abstract = ct.getSinaContent(urlstr) 44 | new_info["keywords"] = ' '.join(keyword_list) 45 | new_info["abstract"] = ' '.join(abstract) 46 | 47 | elif urlstr.find("toutiao.com") != -1: 48 | print(urlstr) 49 | textContent, htmlContent, img_url_list, title, abstract, keywords, source, tag = ct.getToutiaoContent( 50 | urlstr) 51 | if title: 52 | new_info["title"] = title 53 | else: 54 | new_info["title"] = '' 55 | if abstract: 56 | new_info["abstract"] = abstract 57 | else: 58 | new_info["abstract"] = '' 59 | if keywords: 60 | new_info["keywords"] = keywords 61 | else: 62 | new_info["keywords"] = '' 63 | if source: 64 | new_info["source"] = source 65 | else: 66 | new_info["source"] = '' 67 | if tag: 68 | new_info["tag"] = tag 69 | else: 70 | new_info["tag"] = '' 71 | 72 | try: 73 | feature = 
jieba.analyse.extract_tags(textContent, 15) 74 | except: 75 | feature = new_info["keywords"] 76 | new_info["textContent"] = textContent 77 | new_info["htmlContent"] = htmlContent 78 | new_info["feature"] = feature 79 | new_info["img"] = img_url_list 80 | last_list.append(new_info) 81 | except: 82 | pass 83 | # use jieba Chinese word segmentation to extract the 15 most important feature terms from the body text 84 | # underlying algorithm --- TF-IDF 85 | # print(textContent) 86 | 87 | # filtering, crawling and keyword extraction are done; now save everything to an Excel file 88 | excelName = os.path.join(finalDir, file) 89 | print("excelName:" + excelName) 90 | # the second argument here is the worksheet name; adjust as needed 91 | et.saveToExcel(excelName, "allNews", last_list) 92 | -------------------------------------------------------------------------------- /spider/wordAna/wordAnaNews/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/wordAnaNews/.DS_Store -------------------------------------------------------------------------------- /spider/wordAna/wordAnaNews/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/spider/wordAna/wordAnaNews/README.md -------------------------------------------------------------------------------- /static/css/login.css: -------------------------------------------------------------------------------- 1 | body { 2 | background: url(../images/back.jpg) !important; 3 | font-family: "Helvetica Neue", "Hiragino Sans GB", "Microsoft YaHei", "\9ED1\4F53", Arial, sans-serif; 4 | color: #222; 5 | font-size: 12px; 6 | } 7 | * { 8 | padding: 0px; 9 | margin: 0px; 10 | } 11 | .top_div { 12 | background:rgba(15, 10, 10, 0.19); 13 | width: 100%; 14 | height: 400px; 15 | } 16 | .i_top{ 17 | width: 100%; 18 | height: 50px; 19 | background-color: rgba(59, 167, 173, 0.52); 20 | } 21 | .i_top img{ 22 | width: 45px; 23 | height: 45px; 24 | margin-left: 15px; 25 | } 26 | .ipt { 27 | border: 1px solid #d3d3d3; 28 | padding: 10px 10px; 29 | width: 290px; 30 | border-radius: 4px; 31 | padding-left: 35px; 32 | -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075); 33 | box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075); 34 | -webkit-transition: border-color ease-in-out .15s, -webkit-box-shadow ease-in-out .15s; 35 | -o-transition: border-color ease-in-out .15s, box-shadow ease-in-out .15s; 36 | transition: border-color ease-in-out .15s, box-shadow ease-in-out .15s 37 | } 38 | .ipt:focus { 39 | border-color: #66afe9; 40 | outline: 0; 41 | -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075), 0 0 8px rgba(102, 175, 233, .6); 42 | box-shadow: inset 0 1px 1px rgba(0, 0, 0, .075), 0 0 8px rgba(102, 175, 233, .6) 43 | } 44 | .u_logo { 45 | background: url("../images/username.png") no-repeat; 46 | padding: 10px 10px; 47 | position: absolute; 48 | top: 43px; 49 | left: 40px; 50 | } 51 | .p_logo { 52 | background: url("../images/password.png") no-repeat; 53 | padding: 10px 10px; 54 | position: absolute; 55 | top: 12px; 56 | left: 40px; 57 | } 58 | a { 59 | text-decoration: none; 60 | } 61 | .tou { 62 | background: url("../images/top_1.png") no-repeat; 63 | width: 115px; 64 | height: 92px; 65 | position: absolute; 66 | top: -93px; 67 | left: 132.5px; 68 | } 69 | .left_hand { 70 | background: url("../images/left_hand.png") no-repeat; 71 | width: 32px; 72 | height: 37px; 73 | position: absolute; 74 | top: -38px; 75 | left: 150px; 76 | } 77 | .right_hand { 78 | background: url("../images/right_hand.png") 
no-repeat; 79 | width: 32px; 80 | height: 37px; 81 | position: absolute; 82 | top: -38px; 83 | right: -64px; 84 | } 85 | .initial_left_hand { 86 | background: url("../images/hand.png") no-repeat; 87 | width: 30px; 88 | height: 20px; 89 | position: absolute; 90 | top: -12px; 91 | left: 100px; 92 | } 93 | .initial_right_hand { 94 | background: url("../images/hand.png") no-repeat; 95 | width: 30px; 96 | height: 20px; 97 | position: absolute; 98 | top: -12px; 99 | right: -112px; 100 | } 101 | .left_handing { 102 | background: url("../images/left-handing.png") no-repeat; 103 | width: 30px; 104 | height: 20px; 105 | position: absolute; 106 | top: -24px; 107 | left: 139px; 108 | } 109 | .right_handinging { 110 | background: url("../images/right_handing.png") no-repeat; 111 | width: 30px; 112 | height: 20px; 113 | position: absolute; 114 | top: -21px; 115 | left: 210px; 116 | } 117 | #login{ 118 | cursor: pointer; 119 | } -------------------------------------------------------------------------------- /static/css/newsManage.css: -------------------------------------------------------------------------------- 1 | .read_list{ 2 | height:150px; 3 | width: 150px; 4 | border: 1px; 5 | border-radius: 50%; 6 | margin:0 auto; 7 | font-size: 25px; 8 | padding-left: 27px; 9 | padding-top: 59px; 10 | } 11 | 12 | #read_list{ 13 | background-color: #FFAEB9; 14 | /*#FFBBFF*/ 15 | } 16 | 17 | #love_list{ 18 | background-color: #f9c693; 19 | } 20 | 21 | #comment_list{ 22 | background-color: #f7dd90; 23 | } 24 | 25 | #delete_but{ 26 | background-color: #b6d7a8; 27 | } 28 | 29 | .news_list{ 30 | 31 | width: 80%; 32 | margin:0 auto; 33 | } 34 | 35 | .each_news{ 36 | background-color: white; 37 | width:100%; 38 | margin:0 auto; 39 | height:100%; 40 | margin-top:0.5%; 41 | text-align: center; 42 | } 43 | 44 | .read_list a{ 45 | color:#333; 46 | text-decoration: none; 47 | } 48 | 49 | .read_list a:visited { 50 | color: #333; 51 | text-decoration: none; 52 | } 53 | .read_list a:hover { 54 | color: #333; 55 | text-decoration: none; 56 | } 57 | .read_list a:active{ 58 | color: #333; 59 | text-decoration: none; 60 | } 61 | 62 | -------------------------------------------------------------------------------- /static/css/userManage.css: -------------------------------------------------------------------------------- 1 | .second-menu{ 2 | width: 20%; 3 | 4 | } 5 | 6 | .second-menu a:focus, .nav-list li.active > a{ 7 | text-decoration: none; 8 | background-color: #1A2022; 9 | border-left: 3px solid #23BAB5; 10 | font-size: 14px; 11 | color: #FFF; 12 | transition: all 0.1s ease 0s; 13 | } 14 | 15 | 16 | .second-menu>li{ 17 | list-style: none; 18 | width: 100%; 19 | height: 40px; 20 | font-size: 15px; 21 | color: #80969C; 22 | } 23 | 24 | .second-menu> li > a > span { 25 | font-size: 16px; 26 | padding-top: 5px; 27 | } 28 | 29 | 30 | .second-menu > li > a { 31 | color: #80969c; 32 | position: relative; 33 | display: block; 34 | padding: 10px 15px; 35 | text-decoration: none; 36 | } 37 | 38 | .second-menu > li > a:hover { 39 | background-color: #1b6d85; 40 | color: #ffffff; 41 | } 42 | 43 | .user_third_head{ 44 | padding-top: 3%; 45 | 46 | } 47 | .super_name{ 48 | padding-left: 30%; 49 | 50 | } 51 | 52 | .table-bordered > thead > a{ 53 | text-decoration: none; 54 | } 55 | 56 | .super_head{ 57 | margin-left: 15%; 58 | } 59 | 60 | .user_third_menu{ 61 | margin-top:100px; 62 | } 63 | 64 | .user_info{ 65 | margin-top: 70px; 66 | background-color: antiquewhite; 67 | width: 300px; 68 | height:260px; 69 | font-size: 18px; 70 | border: 
1px solid #E5E6E7; 71 | border-radius: 7%; 72 | display: none; 73 | } 74 | 75 | .each_info{ 76 | padding-top: 20px; 77 | padding-left: 30px; 78 | } 79 | 80 | .page1{ 81 | font-size: 20px; 82 | } 83 | 84 | .user{ 85 | background-color: palevioletred; 86 | width: 400px; 87 | height: 600px; 88 | margin:0 auto; 89 | } 90 | 91 | 92 | /* feedback */ 93 | 94 | .feedback{ 95 | width: 60%; 96 | height: 500px; 97 | margin-left: 10%; 98 | margin-top: 10px; 99 | float: left; 100 | } 101 | 102 | .each_feedback{ 103 | background-color: #DDDDDD; 104 | border: 1px solid #DDDDDD; 105 | padding-bottom: 10px; 106 | margin-top: 10px; 107 | } 108 | 109 | .feedback_name{ 110 | padding-top: 10px; 111 | padding-left: 10px; 112 | font-size: 20px; 113 | } 114 | 115 | .feedback_content{ 116 | height: 80px; 117 | width: 100%; 118 | font-size: 17px; 119 | padding-left: 20px; 120 | } 121 | .feedback_isreply{ 122 | font-size: 17px; 123 | float: right; 124 | padding-right: 20px; 125 | } 126 | 127 | .feedback_gettime{ 128 | font-size: 17px; 129 | float: right; 130 | padding-right: 20px; 131 | } 132 | 133 | .rep{ 134 | color: firebrick; 135 | } 136 | 137 | .rep_button{ 138 | margin-left: 10px; 139 | padding-left: 10px; 140 | } 141 | -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /static/images/1.svg: -------------------------------------------------------------------------------- 5 | Created by FontForge 20120731 at Sat May 14 15:29:57 2016 6 | By admin -------------------------------------------------------------------------------- /static/images/admin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/admin.jpg -------------------------------------------------------------------------------- /static/images/back.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/back.jpg
--------------------------------------------------------------------------------
/static/images/bg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/bg.jpg
--------------------------------------------------------------------------------
/static/images/bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/bg.png
--------------------------------------------------------------------------------
/static/images/bgb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/bgb.jpg
--------------------------------------------------------------------------------
/static/images/hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/hand.png
--------------------------------------------------------------------------------
/static/images/left-handing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/left-handing.png
--------------------------------------------------------------------------------
/static/images/left_hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/left_hand.png
--------------------------------------------------------------------------------
/static/images/news.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/news.png
--------------------------------------------------------------------------------
/static/images/password.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/password.png
--------------------------------------------------------------------------------
/static/images/right_hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/right_hand.png
--------------------------------------------------------------------------------
/static/images/right_handing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/right_handing.png
--------------------------------------------------------------------------------
/static/images/save.svg:
--------------------------------------------------------------------------------
[SVG markup lost in this dump]
--------------------------------------------------------------------------------
/static/images/save0.svg:
--------------------------------------------------------------------------------
[SVG markup lost in this dump]
--------------------------------------------------------------------------------
/static/images/top_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/top_1.png
--------------------------------------------------------------------------------
/static/images/tou.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/tou.png
--------------------------------------------------------------------------------
/static/images/username.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/static/images/username.png
--------------------------------------------------------------------------------
/static/js/feedback.js:
--------------------------------------------------------------------------------
/**
 * Created by jeezy-lyoung on 16-8-2.
 */
$(document).ready(function () {

    var net = "http://127.0.0.1:8888";
    var count = 3;          // feedback items per page
    var page_value = 1;     // current page (1-based)
    var alrequest = 0;      // offset of items already fetched
    var page = "xia";       // paging direction: "xia" = next, "shang" = previous
    var time = '1';
    var tooken = '764bfd755bc07f6871eee104219b2b2c';

    function get_user() {
        var qx = {"alrequest": alrequest, "page": page, "time": time, "tooken": tooken};
        $.ajax({
            type: "get",
            url: net + "/api/adminfeedback?count=" + count,
            data: qx,
            cache: false,
            success: function (data) {
                // the endpoint returns a JSON string; JSON.parse replaces the
                // original eval('(' + data + ')'), which executed arbitrary input
                var result = JSON.parse(data);
                var all_data = result.data;
                var is_success = result.message;

                if (is_success == "failed") {
                    alert("已是最后一页");
                    page_value = page_value - 1;
                } else {
                    // NOTE: the row markup and JSON field names in this loop are
                    // reconstructions; the original cell strings were stripped
                    // from this dump, and only the variable names survive.
                    var table = "";
                    for (var i = 0; i < all_data.length; i++) {
                        var name = "<td>" + all_data[i].u_name + "</td>";
                        var feed_content = "<td>" + all_data[i].feed_content + "</td>";
                        var get_time = "<td>" + all_data[i].get_time + "</td>";
                        var is_rep = "<td>" + all_data[i].is_reply + "</td>";
                        var rep_button = "<td><button class='rep_button'>回复</button></td>";
                        table = table + "<tr>" + name + feed_content + get_time + is_rep + rep_button + "</tr>";
                    }
                    document.getElementById("not_feedback").innerHTML = table;
                }
            }
        });
    }

    get_user();

    $("#xia").click(function () {
        page = "xia";
        alrequest = page_value * 3;
        get_user();
        page_value = page_value + 1;
    });

    $("#shang").click(function () {
        page = "shang";
        if (page_value == 1) {
            alert("已是第一页");
        } else {
            page_value = page_value - 1;
            alrequest = page_value * 3;
            get_user();
        }
    });
})
--------------------------------------------------------------------------------
/static/js/howie.js:
--------------------------------------------------------------------------------
window.onload = function () {
    var name = document.getElementById('username');
    var pass = document.getElementById('password');
    var login = document.getElementById('login');
    login.onclick = function () {
        if (name.value == "" || pass.value == "") {
            alert('不能有内容为空');
        }
    }
}
$(document).ready(function () {
    $("#login").click(function () {
        var user = $("#username").val();
        var pwd = $("#password").val();
        var pd = {"username": user, "password": pwd};
        $.ajax({
            type: "post",
            url: "/",
            data: pd,
            cache: false,
            success: function (data) {
                window.location.href = "/admin?user=" + data;
            },
            error: function () {
                alert("error!");
            }
        });
    });
    // focus: slide the mascot's hands up to cover its eyes
    $("#password").focus(function () {
        $("#left_hand").animate({
            left: "150",
            top: "-38"
        }, {
            step: function () {
                if (parseInt($("#left_hand").css("left")) > 140) {
                    $("#left_hand").attr("class", "left_hand");
                }
            }
        }, 2000);
        $("#right_hand").animate({
            right: "-64",
            top: "-38px"
        }, {
            step: function () {
                if (parseInt($("#right_hand").css("right")) > -70) {
                    $("#right_hand").attr("class", "right_hand");
                }
            }
        }, 2000);
    });
    // blur: restore the hands to their initial position
    $("#password").blur(function () {
        $("#left_hand").attr("class", "initial_left_hand");
        $("#left_hand").attr("style", "left:100px;top:-12px;");
        $("#right_hand").attr("class", "initial_right_hand");
        $("#right_hand").attr("style", "right:-112px;top:-12px");
    });
});

// function keyLogin() {
//     if (event.keyCode == 13)  // Enter key
//         document.getElementById('login').onclick()
// }
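[Editor's sketch, not a file in this repo: howie.js above POSTs {username, password}
to "/" and redirects to /admin?user=<response body>. A Tornado handler consistent
with that contract might look as follows; the handler name and the credential
check are assumptions, not the project's actual code in handlers/index.py.]

import tornado.web

def check_admin(username, password):
    # placeholder credential check; the real app would consult its admin table
    return bool(username) and bool(password)

class LoginHandler(tornado.web.RequestHandler):
    def post(self):
        username = self.get_argument("username")
        password = self.get_argument("password")
        if check_admin(username, password):
            self.write(username)  # howie.js redirects to /admin?user=<this body>
        else:
            self.set_status(403)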
--------------------------------------------------------------------------------
/static/js/newsManage.js:
--------------------------------------------------------------------------------
/**
 * Created by jeezy-lyoung on 16-8-4.
 */
$(document).ready(function () {

    $("#read_list").mouseover(function () {
        document.getElementById('read_list').style.backgroundColor = "#FFBBFF";
    });
    $("#read_list").mouseout(function () {
        document.getElementById('read_list').style.backgroundColor = "#FFAEB9";
    });

    $("#love_list").mouseover(function () {
        document.getElementById('love_list').style.backgroundColor = "#f9d5b0";
    });
    $("#love_list").mouseout(function () {
        document.getElementById('love_list').style.backgroundColor = "#f9c693";
    });

    $("#comment_list").mouseover(function () {
        document.getElementById('comment_list').style.backgroundColor = "#fbe8af";
    });
    $("#comment_list").mouseout(function () {
        document.getElementById('comment_list').style.backgroundColor = "#f7dd90";
    });

    $("#delete_but").mouseover(function () {
        document.getElementById('delete_but').style.backgroundColor = "#c4ddb9";
    });
    $("#delete_but").mouseout(function () {
        document.getElementById('delete_but').style.backgroundColor = "#b6d7a8";
    });

    /* adaptive height for the news list */
    var news_list_height = document.getElementById('news_list');
    // The original sniffed the UA for Firefox/Chrome/Safari/MSIE, but its final
    // else-branch ran for every non-IE browser and computed the same expression
    // as the IE branch, so this single line is what actually took effect:
    var show_height = (document.documentElement.scrollHeight > document.documentElement.clientHeight) ? document.documentElement.scrollHeight : document.documentElement.clientHeight;

    news_list_height.style.height = (show_height - 300) * 0.9 + "px";
    news_list_height.style.marginTop = (show_height - 300) * 0.1 + "px";
    news_list_height.style.marginBottom = (show_height - 300) * 0.1 + "px";
    //news_list_height.style.fontSize = ($(".each_news").height())*0.5 + "px";
    //alert($(".each_news").height());
});
--------------------------------------------------------------------------------
/system/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/README.md
--------------------------------------------------------------------------------
/system/classPredict/NavieBayesInfo/predict_result.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/NavieBayesInfo/predict_result.txt
--------------------------------------------------------------------------------
/system/classPredict/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__init__.py
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/__init__.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/__init__.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/main.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/main.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/main.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/main.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/newsPredict.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/newsPredict.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/newsPredict.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/newsPredict.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/predictTool.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/predictTool.cpython-34.pyc
--------------------------------------------------------------------------------
/system/classPredict/__pycache__/predictTool.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/__pycache__/predictTool.cpython-35.pyc
--------------------------------------------------------------------------------
/system/classPredict/main.py:
--------------------------------------------------------------------------------
from system.classPredict.newsPredict import NewPredict
from system.classPredict.predictTool import NavieBayesPredict
from methods.pDb import newsDb
from config.n_conf import dirPath

# News items awaiting classification: each entry holds the id and the body text
data_list = []

test_data_file = dirPath + "/system/classPredict/NavieBayesInfo/predict_new_word.txt"
model_file = dirPath + "/system/classPredict/NavieBayesInfo/model.txt"
result_file = dirPath + "/system/classPredict/NavieBayesInfo/predict_result.txt"


def startPredict():
    db = newsDb()
    try:
        datasql = "select news_id,text_content from get_news where is_old = 0"
        data = db.select_table_three(datasql)
        for d in data:
            data_list.append({"id": d[0], "textContent": d[1]})
    except Exception as e:
        print(e)

    np = NewPredict(data_list)
    np.getNewInfo()
    nb = NavieBayesPredict(test_data_file, model_file, result_file)
    nb.predict()

# startPredict()
--------------------------------------------------------------------------------
/system/classPredict/newsPredict.py:
--------------------------------------------------------------------------------
import jieba
import jieba.analyse
from config.n_conf import dirPath


class NewPredict(object):
    def __init__(self, data_list):
        self.data_list = data_list
        self.ft = open(dirPath + "/system/classPredict/NavieBayesInfo/predict_new_word.txt", 'w')
        # word -> unique-id dictionary
        self.word_id_dict = {}
        # populate word_id_dict from word_id_dict.txt
        self.loadWord_id_dict()

    # Write one line per article to the predict file,
    # format: <news_id> <word1_id> <word2_id> <word3_id> ...
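    # e.g. a line might read (word ids hypothetical):
    #     e16931cd6f3fe68662a7 102 57 -1 913
    # with -1 marking a feature word absent from word_id_dict (see below)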
    def getNewInfo(self):
        for new in self.data_list:
            new_id = new["id"]
            textContent = new["textContent"]
            if textContent is None:
                continue

            feature = jieba.analyse.extract_tags(textContent, 15)
            # feature-word ids of the current article
            word_id_list = []
            for word in feature:
                tmp = self.word_id_dict.get(word)
                if tmp is None:
                    word_id_list.append("-1")
                else:
                    word_id_list.append(str(tmp))

            # append this article to the predict file
            self.writeFeature(new_id, word_id_list)
        # release the file handle
        self.ft.close()

    def writeFeature(self, new_id, word_id_list):
        self.ft.write(new_id + ' ')
        for word_id in word_id_list:
            self.ft.write(word_id + ' ')
        self.ft.write('\n')

    def loadWord_id_dict(self):
        # word_id_dict.txt is whitespace-separated "word id" pairs
        with open(dirPath + "/system/classPredict/NavieBayesInfo/word_id_dict.txt", 'r') as fd:
            arr = fd.read().strip().split()
        for i in range(0, len(arr), 2):
            self.word_id_dict[arr[i]] = arr[i + 1]


# np = NewPredict([])
# np.loadWord_id_dict()
--------------------------------------------------------------------------------
/system/classPredict/test.py:
--------------------------------------------------------------------------------
import os

from spider.wordAna.contentTool import *
from newsPredict import *
from predictTool import *

ct = ContentOperator()
# sample Toutiao article urls, one per category:
# urlstr = "http://toutiao.com/a6292379445978808577/"
# urlstr = "http://toutiao.com/a6292665412145365250/"
# urlstr = "http://toutiao.com/group/6292605706580803842/"  # cars
# urlstr = "http://toutiao.com/a6291989690913620225/"  # essays
# urlstr = "http://toutiao.com/a6292346062074691841/"  # games
# urlstr = "http://toutiao.com/a6280854429832233218/"  # tech
# urlstr = "http://toutiao.com/group/6291592935427604737/"  # stories
# urlstr = "http://toutiao.com/a6292516516223680770/"  # wellness
# urlstr = "http://toutiao.com/a6292528444404973826/"  # history
# urlstr = "http://toutiao.com/a6292557068092080386/"  # food
# urlstr = "http://toutiao.com/a6292374615201153537/"  # discovery
# urlstr = "http://toutiao.com/a6292035179544412417/"  # fashion
# urlstr = "http://toutiao.com/a6292511961298059521/"  # travel
urlstr = "http://toutiao.com/a6292830759922729218/"  # parenting

textContent, htmlContent, img_url_list = ct.getToutiaoContent(urlstr)

data_list = [{"id": "1", "textContent": textContent}]
np = NewPredict(data_list)
np.getNewInfo()

test_data_file = os.path.abspath('.') + "/NavieBayesInfo/predict_new_word.txt"
model_file = os.path.abspath('.') + "/NavieBayesInfo/model.txt"
result_file = os.path.abspath('.') + "/NavieBayesInfo/predict_result.txt"
print(test_data_file)
nb = NavieBayesPredict(test_data_file, model_file, result_file)
nb.predict()
--------------------------------------------------------------------------------
/system/classPredict/trainData/2016-06-06-13-09-44&news_fashion.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/classPredict/trainData/2016-06-06-13-09-44&news_fashion.xlsx
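[Editor's sketch, not a file in this repo: the word_id_dict.txt consumed by
newsPredict.loadWord_id_dict above is whitespace-separated "word id" pairs.
Given a vocabulary list, it could be (re)generated like this; the function
name is an assumption.]

def dump_word_id_dict(words, path):
    # one "word id" pair per line; ids are simply the enumeration order
    with open(path, 'w', encoding='utf-8') as f:
        for idx, word in enumerate(words):
            f.write("%s %d\n" % (word, idx))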
--------------------------------------------------------------------------------
/system/latentFactor/README.md:
--------------------------------------------------------------------------------
Draft of the UPDATE that writes a user's per-tag scores back to user_tag_score,
with one assignment per column:

"update user_tag_score set " \
"news_baby = '" + tag_list_score['news_baby'] + "'," \
"news_entertainment = '" + tag_list_score['news_entertainment'] + "'," \
"news_discovery = '" + tag_list_score['news_discovery'] + "'," \
"news_history = '" + tag_list_score['news_history'] + "'," \
"news_society = '" + tag_list_score['news_society'] + "'," \
"news_game = '" + tag_list_score['news_game'] + "'," \
"news_sports = '" + tag_list_score['news_sports'] + "'," \
"news_car = '" + tag_list_score['news_car'] + "'," \
"news_essay = '" + tag_list_score['news_essay'] + "'," \
"news_tech = '" + tag_list_score['news_tech'] + "'," \
"news_military = '" + tag_list_score['news_military'] + "'," \
"news_travel = '" + tag_list_score['news_travel'] + "'," \
"news_fashion = '" + tag_list_score['news_fashion'] + "'," \
"news_regimen = '" + tag_list_score['news_regimen'] + "'," \
"news_story = '" + tag_list_score['news_story'] + "'," \
"news_finance = '" + tag_list_score['news_finance'] + "'," \
"news_food = '" + tag_list_score['news_food'] + "'," \
"news_world = '" + tag_list_score['news_world'] + "'" \
" where user_id = '" + user_id + "'"
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneCalcul.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneCalcul.cpython-34.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneCalcul.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneCalcul.cpython-35.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneNewsType.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneNewsType.cpython-34.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneNewsType.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneNewsType.cpython-35.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneUserType.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneUserType.cpython-34.pyc
--------------------------------------------------------------------------------
/system/latentFactor/__pycache__/geneUserType.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/latentFactor/__pycache__/geneUserType.cpython-35.pyc
--------------------------------------------------------------------------------
/system/latentFactor/dbTool.py:
--------------------------------------------------------------------------------
# -*-coding:utf-8-*-

import pymysql
--------------------------------------------------------------------------------
/system/latentFactor/geneNewsType.py:
--------------------------------------------------------------------------------
# -*-coding:utf-8-*-

from methods.pDb import newsDb


class NewsTagDataTool(object):
    def __init__(self):
        # news ids, in the row order of the matrix
        self.new_id_list = []
        self.newsTagMat = []
        # news id -> category
        self.news_type_dict = {}

    def getData(self):
        try:
            db = newsDb()
            data = db.select_table_two(table="news_tag_deep", column="*")
            for item in data:
                # first column is the news id
                self.new_id_list.append(item[0])
                # per-tag weight factors of this article; column order
                # follows the table schema
                tagsWeight = list(item[1:])
                self.newsTagMat.append(tagsWeight)

            datasql = "select news_id,tag from get_news where is_old = 0"
            data = db.select_table_three(datasql)
            for item in data:
                # record each news id and its category
                self.news_type_dict[item[0]] = item[1]

            return self.news_type_dict, self.new_id_list, self.newsTagMat
        except Exception as e:
            print(e)

# ntTool = NewsTagDataTool()
# x, y, z = ntTool.getData()
--------------------------------------------------------------------------------
/system/latentFactor/geneUserType.py:
--------------------------------------------------------------------------------
# -*-coding:utf-8-*-

from methods.pDb import newsDb

'''Builds the user-tag latent-factor matrix; multiplied with the news-tag
latent-factor matrix it yields the recommended news.'''
'''Data source: the user-behaviour table. Each user's score per tag is
computed from their behaviour and stored in the tag-preference table.'''


class UserTagDataTool(object):
    def __init__(self):
        # user ids, in the row order of the matrix
        self.user_id_list = []
        # per-user preference weights for each category
        self.userTagMat = []

    def getData(self):
        try:
            db = newsDb()
            data = db.select_table_two(table="user_tag_score", column="*")
            for item in data:
                # first column is the user id
                self.user_id_list.append(item[0])
                # the user's score per category; column order follows the table schema
                tagsScore = []
                curSum = 0.0
                for score in item[1:]:
                    tmp = 1.0 if score is None else float(score)
                    curSum = curSum + tmp
                    tagsScore.append(tmp)

                # normalise the scores into weights that sum to 1
                tagsWeight = [s / curSum for s in tagsScore]
                self.userTagMat.append(tagsWeight)

            return self.user_id_list, self.userTagMat
        except Exception as e:
            print(e)

# gut = UserTagDataTool()
# x, y = gut.getData()
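[Editor's sketch, not a file in this repo: how the two matrices above combine.
user_mat rows are the per-user tag weights from UserTagDataTool, news_mat rows
the per-news tag factors from NewsTagDataTool; numpy and an identical tag-column
order in both matrices are assumptions.]

import numpy as np

def recommend(user_ids, user_mat, news_ids, news_mat, top_n=10):
    # score[u, n] = sum over tags of user_weight * news_weight
    scores = np.asarray(user_mat, dtype=float) @ np.asarray(news_mat, dtype=float).T
    result = {}
    for u, row in zip(user_ids, scores):
        best = np.argsort(row)[::-1][:top_n]  # highest-scoring articles first
        result[u] = [news_ids[i] for i in best]
    return result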
--------------------------------------------------------------------------------
/system/pointsAlo/__pycache__/scoreSetting.cpython-34.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/pointsAlo/__pycache__/scoreSetting.cpython-34.pyc
--------------------------------------------------------------------------------
/system/pointsAlo/__pycache__/scoreSetting.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howie6879/getNews/ab5ad56c8520e60d5f568deed0081dfc127b7cd9/system/pointsAlo/__pycache__/scoreSetting.cpython-35.pyc
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
[The admin login page. Its HTML markup was lost when this dump was rendered;
only the visible text survives: the page title "新闻推荐系统后台" (file line 5)
and the "登录" login button label (file line 32).]
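[Editor's sketch, not a file in this repo: every template below extends
main.html, which does not appear in this section. A minimal skeleton consistent
with the blocks they fill (header, left-nav, right) might look like this; the
element classes are assumptions.]

<!DOCTYPE html>
<html>
<head>
    {% block header %}{% end %}
</head>
<body>
    <nav class="left-nav">{% block left-nav %}{% end %}</nav>
    <section class="right">{% block right %}{% end %}</section>
</body>
</html>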
--------------------------------------------------------------------------------
/templates/newsManage.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}
{{ header }}
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[table markup lost in this dump (file lines 49-114); surviving content of the "用户最多评论" panel:]
新闻编号 | 新闻标题 | 新闻来源 | 发表时间 | 阅读人数 | 喜欢人数 | 评论人数
e16931cd6f3fe68662a7 | 司马懿怒挖诸葛亮坟,终于跪下了 | 新浪新闻 | 2016-08-07 15:12:09 | 25 | 10 | 5
e16931cd6f3fe68662a7 | 司马懿怒挖诸葛亮坟,终于跪下了 | 新浪新闻 | 2016-08-07 15:12:09 | 25 | 10 | 5
[action buttons: 最多阅读 / 最多喜欢 / 最多评论 / 新闻去旧]
{% end %}
--------------------------------------------------------------------------------
/templates/spider.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}
{% block header %}
<title>{{ header }}</title>
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 50-55)]
{% end %}
--------------------------------------------------------------------------------
/templates/system.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}
{% block header %}
{{ header }}
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 46-51)]
{% end %}
--------------------------------------------------------------------------------
/templates/umFeedBack.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}

{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 48-110)]
{% end %}
--------------------------------------------------------------------------------
/templates/umMyNote.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}

{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 48-71)]
{% end %}
--------------------------------------------------------------------------------
/templates/userManage.html:
--------------------------------------------------------------------------------
{% extends "main.html" %}

{% block header %}
{{ header }}
{% end %}

{% block left-nav %}
[nav-list markup lost in this dump; menu items: Home / 数据分析 / 爬虫管理 / 新闻管理 / 用户管理 / 系统信息]
{% end %}

{% block right %}
[right-pane markup lost in this dump (file lines 48-99)]
{% end %}
--------------------------------------------------------------------------------