├── .gitignore
├── README.md
├── Run.sh
├── Start_rqworker.sh
├── license.txt
├── requirements.txt
├── run.py
├── settings.py.example
├── sql
│   └── create_table.sql
├── topic_id_reenqueue.py
├── v2ex_base
│   ├── __init__.py
│   ├── log_in.py
│   └── v2_sql.py
├── v2ex_spider
│   ├── __init__.py
│   ├── base_spider.py
│   ├── node_spider.py
│   ├── rss_spider.py
│   └── topic_spider.py
└── v2ex_tester
    ├── __init__.py
    └── topic_tester.py
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.gitignore.io/api/python
3 |
4 | ### Python ###
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *,cover
51 | .hypothesis/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # dotenv
87 | .env
88 |
89 | # virtualenv
90 | .venv
91 | venv/
92 | ENV/
93 |
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 |
98 | # Rope project settings
99 | .ropeproject
100 |
101 | # mkdocs documentation
102 | /site
103 |
104 | # End of https://www.gitignore.io/api/python
105 | #Custom
106 | .project
107 | .pydevproject
108 | .cookies.json
109 | database.db
110 | .time_log.json
111 | *~
112 | dump.rdb
113 | .topics_all.json
114 | settings.py
115 | .node_number.json
116 | .topics_tester.json
117 | rsync_vps.sh
118 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # v2ex Topic-Deletion Monitor
2 |
3 | ## Motivation
4 | [v2ex](https://www.v2ex.com/) has long promoted "no deleting topics, no deleting replies" as a defining feature of the community.
5 |
6 | At first I too believed that v2ex never deleted topics, and at most moved them to special nodes.
7 | When I later discovered that v2ex does delete topics, I was a little disappointed, but I could also understand it: the site holds an ICP filing in China, so a degree of compromise is understandable.
8 |
9 | Then one day I noticed that [/t/349115](https://www.v2ex.com/t/349115) had been deleted shortly after it was posted, and that felt like deception.
10 | That is when the idea of building a topic-deletion monitor for v2ex was born.
11 |
12 | ## About deletion on v2ex
13 |
14 | The first round of deletion statistics I published sparked [some discussion](https://www.v2ex.com/t/368217).
15 |
16 | Some users suggested adding a few clarifications to prevent misunderstandings.
17 | So here are some notes on how v2ex actually handles deletion.
18 |
19 | - V2EX has never claimed that topics are "absolutely never deleted". More precisely, the rule is that users cannot delete or edit their own topics or replies after posting. *[Note 1](#note1)*
20 | - Some situations in which topics are deleted: *[Note 2](#note2)*
21 |   1. Spam is removed completely
22 |   1. Some topics with zero replies are deleted at the author's request
23 |   1. Topics in [/go/sandbox](https://www.v2ex.com/go/sandbox) are cleaned up from time to time
24 |   1. Topics involving personal attacks or extreme political speech are deleted
25 |   1. Topics are also deleted for certain special reasons (for example, a lawyer's letter)
26 |
27 | **Note 1:**
28 | https://www.v2ex.com/t/368217#r_4423842
29 |
30 | **Note 2:**
31 | https://www.v2ex.com/t/192635#r_2079456
32 | https://www.v2ex.com/t/231563#r_2568942
33 |
34 | ## Usage
35 |
36 | ### Install Redis
37 | Ubuntu:
38 | ```
39 | $ sudo apt-get update
40 | $ sudo apt-get upgrade
41 | $ sudo apt-get -y install redis-server
42 | ```
43 | For other systems, see: https://redis.io/topics/quickstart
44 |
45 | ### Install dependencies
46 | ```
47 | $ pip3 install -r requirements.txt
48 | $ pip3 install requests[socks]
49 | ```
50 |
51 | ### Create the database
52 | Create the sqlite3 database from `sql/create_table.sql`.
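For example, assuming the database file sits in the project directory (the path must match `database_path` in `settings.py`):
```
$ sqlite3 database.db < sql/create_table.sql
```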
53 |
54 | ### Edit the configuration file
55 | ```
56 | $ cp settings.py.example settings.py
57 | ```
58 | Edit `settings.py`: fill in a v2ex account (a throwaway account is recommended), add User-Agent strings, adjust the proxy settings to your situation, and set the path to the database.
59 |
60 | Edit `Run.sh`, replacing `/home/arch/python/v2ex_delete` with the path to this project.
61 |
62 | ### Add a crontab entry
63 | Add `Run.sh` to your crontab,
64 | e.g.: `* * * * * bash /home/arch/python/v2ex_delete/Run.sh`
65 |
66 | Make sure `redis-server` is already running before this step.
67 |
68 | ### Start the rq workers
69 | From the project directory, run
70 | ```
71 | ./Start_rqworker.sh
72 | ```
73 |
74 | ## Tips
75 | 1. To reduce the chance of being banned, configure proxies and User-Agent strings.
76 | 1. Avoid proxies for important requests where possible; if you must use one, make sure it is stable.
77 | 1. Do use proxies for ordinary requests so that your main IP does not get banned. In my experience, ten **stable** proxies are enough.
78 | 1. Match the number of workers on the failed queue to the stability of your proxies: the higher the request failure rate, the more workers you need.
79 | 1. Monitor the project regularly with `rqinfo`; if the failed queue grows too long, check your proxies promptly.
80 | 1. There are two crawl modes, `Mode1` and `Mode2`. `Mode2` issues fewer requests at slightly worse coverage; use it when you have no or few proxies.
81 | 1. By default, deletion testing starts two weeks after a topic is created. To change this, edit the SQL statement in the `tester_tasker` function of `run.py`, as shown below.
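For example, to start testing one week after creation instead of two, change `1209600` (14 days in seconds) to `604800` in that statement. A sketch of the modified query:
```
SELECT ID FROM TOPIC WHERE (time - created) < 345600
  AND ID NOT IN (SELECT T_ID FROM STATUS)
  AND (STRFTIME('%s','now') - created) > 604800;
```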
82 |
--------------------------------------------------------------------------------
/Run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cd /home/arch/python/v2ex_delete
4 | python3 run.py
5 |
--------------------------------------------------------------------------------
/Start_rqworker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p log  # create the log directory; the redirects below fail without it
3 | rqworker node1 &> log/node1_rqworker.log &
4 | rqworker node2 &> log/node2_rqworker.log &
5 | rqworker node3 &> log/node3_rqworker.log &
6 | rqworker node4 &> log/node4_rqworker.log &
7 | rqworker node5 &> log/node5_rqworker.log &
8 | rqworker topic &> log/topic_rqworker.log &
9 | rqworker tester &> log/tester_rqworker.log &
10 | rqworker failed &> log/failed1_rqworker.log &
11 | rqworker failed &> log/failed2_rqworker.log &
12 | rqworker failed &> log/failed3_rqworker.log &
13 | rqworker failed &> log/failed4_rqworker.log &
14 | rqworker failed &> log/failed5_rqworker.log &
15 | rqworker failed &> log/failed6_rqworker.log &
16 | echo "All rqworkers started."
17 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | Copyright 2017 yingziwu
180 |
181 | Licensed under the Apache License, Version 2.0 (the "License");
182 | you may not use this file except in compliance with the License.
183 | You may obtain a copy of the License at
184 |
185 | http://www.apache.org/licenses/LICENSE-2.0
186 |
187 | Unless required by applicable law or agreed to in writing, software
188 | distributed under the License is distributed on an "AS IS" BASIS,
189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 | See the License for the specific language governing permissions and
191 | limitations under the License.
192 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | lxml
3 | redis
4 | rq
5 | feedparser
6 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 9, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | import json
7 | import time
8 | import requests
9 | import os
10 | from redis import Redis
11 | from rq import Queue
12 | import logging
13 |
14 | from v2ex_spider import node_spider
15 | from v2ex_spider import rss_spider
16 | from v2ex_base.v2_sql import SQL
17 | from v2ex_tester import topic_tester
18 | from v2ex_base import log_in
19 | import topic_id_reenqueue
20 | import settings
21 |
22 | class Start(object):
23 | '''
24 | Start the project.
25 | '''
26 |
27 | def __init__(self):
28 | '''
29 | $ python run.py
30 | or
31 | $ ./Run.sh
32 | '''
33 | logging.info('start')
34 | logging.debug('open sql database.')
35 | self.SQ=SQL()
36 | self.SQ.open_datebase()
37 | self.redis_conn=Redis()
38 | self.load_config()
39 | #base
40 | self.load_json()
41 | # self.update_cookies()
42 | try:
43 | self.update_nodes()
44 | except APIError as e:
45 | pass
46 |
47 | def Mode1(self):
48 | logging.info('start mode1')
49 | #start
50 | self.get_rss()
51 | self.tasker()
52 | self.topic_ids_enqueue()
53 | self.tester_tasker()
54 | #end
55 | self.end()
56 |
57 | def Mode2(self):
58 | logging.info('start mode2')
59 | #start
60 |
61 | self.get_rss()
62 | self.topic_ids_enqueue()
63 | self.tester_tasker()
64 | #end
65 | self.end()
66 |
67 | def end(self):
68 | self.SQ.close_datebase()
69 | self.dump_json()
70 | logging.info('end')
71 |
72 | def load_json(self):
73 | logging.debug('load json')
74 | #load .time_log.json
75 | if os.path.exists('.time_log.json'):
76 | with open('.time_log.json','r') as f:
77 | self.time_log=json.load(f)
78 | else:
79 | self.time_log={'cookies_time':'0','nodes_time':'0','8000_node':'0','4000_node':'0','1000_node':'0','500_node':'0','0_node':'0','rss_time':'0','tester':'0',
80 | 'topic_id_reenqueue':'0'}
81 | #load .node_number.json
82 | if os.path.exists('.node_number.json'):
83 | with open('.node_number.json','r') as f:
84 | self.node_number=json.load(f)
85 | else:
86 | self.node_number=list()
87 | return
88 |
89 | def dump_json(self):
90 | #dump .time_log.json
91 | with open('.time_log.json','w') as f1:
92 | json.dump(self.time_log, f1)
93 | #dump .node_number.json
94 | with open('.node_number.json','w') as f2:
95 | self.node_number=list(set(self.node_number))
96 | json.dump(self.node_number,f2)
97 | return
98 |
99 | def topic_ids_enqueue(self):
100 | if int(time.time())-int(self.time_log['topic_id_reenqueue']) >= 1800:
101 | logging.info('start topic id reenqueue')
102 | max_id=topic_id_reenqueue.max_id
103 | topic_id_reenqueue.reenqueue_m(max_id-2000, max_id-29)
104 | self.time_log['topic_id_reenqueue']=str(int(time.time()))
105 | return
106 |
107 | def update_cookies(self):
108 | if int(time.time())-int(self.time_log["cookies_time"]) >= 86400 * 4:
109 | cookies_time_status = False
110 | else:
111 | cookies_time_status = True
112 |         if not os.path.exists('.cookies.json') or cookies_time_status is False:  # log_in.save_cookies() writes .cookies.json
113 | logging.debug('update cookies')
114 | try:
115 | log_s=log_in.v2ex_log_in()
116 | log_s.log_in(3)
117 | log_s.save_cookies()
118 | except log_in.LogError as e:
119 | return
120 | self.time_log["cookies_time"]=str(int(time.time()))
121 | return
122 |
123 | def update_nodes(self):
124 | if int(time.time())-int(self.time_log["nodes_time"]) >= 10800:
125 | nodes_time_status=False
126 | else:
127 | nodes_time_status=True
128 | if not nodes_time_status:
129 | logging.info('update nodes')
130 | try:
131 | resp=self.s.get('https://www.v2ex.com/api/nodes/all.json', timeout=10)
132 | except requests.exceptions.RequestException as e:
133 | logging.error('update_node failed.')
134 | logging.error('proxy_status: %s' % settings.i_proxy_enable)
135 | if settings.i_proxy_enable is True:
136 | logging.error('proxy: %s' % self.s.proxies)
137 | logging.error(e)
138 | self.node_number=list(set(self.node_number))
139 | return
140 | if resp.status_code != 200:
141 | logging.error('update_node failed.')
142 | logging.error('proxy_status: %s' % settings.i_proxy_enable)
143 | if settings.i_proxy_enable is True:
144 | logging.error('proxy: %s' % self.s.proxies)
145 | logging.error(APIError('update_node'))
146 | self.node_number=list(set(self.node_number))
147 | raise APIError('update_node')
148 | nodes=resp.json()
149 | for node in nodes:
150 | n_id=node["id"]
151 | name=node["name"]
152 | url=node["url"]
153 | title=node["title"]
154 | title_alternative=node["title_alternative"]
155 | topics=node["topics"]
156 | header=node["header"]
157 | footer=node["footer"]
158 | created=node["created"]
159 | n_time=int(time.time())
160 | if self.SQ.node_test(n_id, topics) is True:
161 | self.node_number.append(int(n_id))
162 | self.SQ.write_to_db_node(n_id, name, url, title, title_alternative, topics, header, footer, created, n_time)
163 | self.time_log["nodes_time"]=str(int(time.time()))
164 | self.node_number=list(set(self.node_number))
165 | return
166 |
167 | def tasker(self):
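        # Nodes are bucketed by topic count into five queues (node1..node5), each with its own
        # per-request sleep time and re-crawl interval. node_configs_1 is the busy-hours schedule
        # and node_configs_2 the quieter off-hours one; for the low-traffic queues node4/node5,
        # only nodes whose topic count changed since the last crawl (self.node_number) are enqueued.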
168 | node_configs_1=[{'sql':'SELECT ID FROM NODES WHERE topics >= 8000;','sleep_time':5,'between_time':900,'time_log':'8000_node','queue_name':'node1'},
169 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;','sleep_time':10,'between_time':1800,'time_log':'4000_node','queue_name':'node2'},
170 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;','sleep_time':20,'between_time':7200,'time_log':'1000_node','queue_name':'node3'},
171 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;','sleep_time':90,'between_time':86400,'time_log':'500_node','queue_name':'node4'},
172 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;','sleep_time':90,'between_time':86400,'time_log':'0_node','queue_name':'node5'}]
173 | node_configs_2=[{'sql':'SELECT ID FROM NODES WHERE topics >= 8000;','sleep_time':5,'between_time':1800,'time_log':'8000_node','queue_name':'node1'},
174 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;','sleep_time':10,'between_time':3600,'time_log':'4000_node','queue_name':'node2'},
175 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;','sleep_time':20,'between_time':14400,'time_log':'1000_node','queue_name':'node3'},
176 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;','sleep_time':90,'between_time':86400,'time_log':'500_node','queue_name':'node4'},
177 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;','sleep_time':90,'between_time':86400,'time_log':'0_node','queue_name':'node5'}]
178 |         os.environ['TZ']='Asia/Shanghai'; time.tzset()  # assigning time.tzname has no effect; pin to CST for the hour check below
179 | if int(time.strftime('%H')) >= 8 or int(time.strftime('%H')) < 2:
180 | node_configs=node_configs_1
181 | else:
182 | node_configs=node_configs_2
183 | for node_config in node_configs:
184 | sql=node_config['sql']
185 | sleep_time=node_config['sleep_time']
186 | between_time=node_config['between_time']
187 | time_log_name=node_config['time_log']
188 | queue_name=node_config['queue_name']
189 | q_node=Queue(queue_name,connection=self.redis_conn)
190 | if int(time.time()) - int(self.time_log[time_log_name]) >= between_time:
191 | logging.info('start enqueue, queue name: %s' % queue_name)
192 | self.SQ.cursor.execute(sql)
193 | node_ids=self.SQ.cursor.fetchall()
194 | for node_id in node_ids:
195 | node_id=node_id[0]
196 | if queue_name not in ['node4','node5'] or (queue_name in ['node4','node5'] and node_id in self.node_number):
197 | if queue_name in ['node4','node5']:
198 | self.node_number.remove(int(node_id))
199 | q_node.enqueue(node_spider.start,node_id,sleep_time)
200 | self.time_log[time_log_name]=str(int(time.time()))
201 | return
202 |
203 | def get_rss(self):
204 | if int(time.time())-int(self.time_log["rss_time"]) >= 600:
205 | logging.debug('start get_rss')
206 | try:
207 | rss_spider.Rss_spider()
208 | except requests.exceptions.RequestException as e:
209 | self.time_log["rss_time"]=str(int(time.time()))
210 | return
211 | self.time_log["rss_time"]=str(int(time.time()))
212 | return
213 |
214 | def load_config(self):
215 | logging.debug('load config')
216 | self.proxy_enable=settings.i_proxy_enable
217 | self.s=requests.session()
218 | self.s.headers=settings.API_headers
219 | if self.proxy_enable:
220 | self.s.proxies=settings.i_proxies()
221 | return
222 |
223 | def tester_tasker(self):
224 | if int(time.time())-int(self.time_log["tester"]) >= 1800:
225 | logging.info('start enqueue tester')
226 |             #load json
227 | if os.path.exists('.topics_tester.json'):
228 | with open('.topics_tester.json','r') as f:
229 | tmp_topics=json.load(f)
230 | else:
231 | tmp_topics=list()
232 | #main
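            # Topics first crawled within 4 days of creation (345600 s), never tested before,
            # and created more than two weeks ago (1209600 s).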
233 | sql="SELECT ID FROM TOPIC WHERE (time - created) < 345600 AND ID NOT IN (SELECT T_ID FROM STATUS) AND (STRFTIME('%s','now') - created) > 1209600;"
234 | sleep_time=20
235 | self.SQ.cursor.execute(sql)
236 | topic_ids=[x[0] for x in self.SQ.cursor.fetchall()]
237 | q=Queue('tester',connection=self.redis_conn)
238 | for topic_id in topic_ids:
239 | if topic_id not in tmp_topics:
240 | q.enqueue(topic_tester.start,topic_id, sleep_time)
241 | tmp_topics.append(topic_id)
242 | #end
243 | tmp_topics=list(set(tmp_topics))
244 | with open('.topics_tester.json','w') as f:
245 | json.dump(tmp_topics,f)
246 | self.time_log["tester"]=str(int(time.time()))
247 | return
248 |
249 | class APIError(ValueError):
250 | pass
251 |
252 | if __name__ == '__main__':
253 | S=Start()
254 | if settings.mode == 'Mode1':
255 | S.Mode1()
256 | elif settings.mode == 'Mode2':
257 | S.Mode2()
258 |
--------------------------------------------------------------------------------
/settings.py.example:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 12, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | # Settings
7 | ## v2ex account  # a throwaway account is recommended
8 | account='v2ex_account' #v2ex username
9 | password='v2ex_password' #v2ex password
10 |
11 | ## User-Agent setting
12 | ### WEB_User_Agents: pool of User-Agent strings used for requests to the v2ex web pages. Add your own.
13 | WEB_User_Agents=['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2869.0 Safari/537.36']
14 | ### API_User_Agents: pool of User-Agent strings used for calls to the v2ex API. Add your own.
15 | API_User_Agents=['Mozilla/5.0 (Linux; Android 6.0; Nexus 5X Build/MDB08L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.124 Mobile Safari/537.36']
16 |
17 |
18 | ## proxy setting
19 | proxy_enable=False #whether to use proxies for ordinary requests; off by default.
20 | ### proxies_list: the proxy pool; see http://docs.python-requests.org/en/master/user/advanced/#proxies for details
21 | proxies_list=[{"http":"socks5://localhost:1080","https":"socks5://localhost:1080"},
22 | {"http":"http://localhost:8090","https":"http://localhost:8090"},]
23 |
24 | i_proxy_enable=False #whether to use proxies for important requests; off by default. (run.py, log_in.py and rss_spider.py make the important requests)
25 | i_proxies_list=[{"http":"socks5://127.0.0.1:1080","https":"socks5://127.0.0.1:1080"}]
26 |
27 | ## database setting
28 | database_path="/home/arch/python/v2ex_delete/database.db" #path to the sqlite database
29 |
30 | ## log
31 | import logging
32 | import os
33 | log_level=logging.INFO #log level
34 | log_dir='log' #log directory
35 | log_path=os.path.join(log_dir,'all.log')
36 | logging.basicConfig(level=log_level,
37 | filename=log_path,filemode='a',
38 | format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
39 |
40 | ## mode (Mode1 or Mode2)
41 | mode='Mode1' #crawl mode
42 |
43 | # Program
44 | import random
45 | WEB_headers={'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
46 | 'Accept-Encoding':"gzip, deflate",
47 | 'Accept-Language':"zh-CN,zh;q=0.8",
48 | 'User-Agent':random.choice(WEB_User_Agents)}
49 | API_headers={'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
50 | 'Accept-Encoding':"gzip, deflate",
51 | 'Accept-Language':"zh-CN,zh;q=0.8",
52 | 'User-Agent':random.choice(API_User_Agents)}
53 |
54 | def proxies():
55 | return random.choice(proxies_list)
56 |
57 | def i_proxies():
58 | return random.choice(i_proxies_list)
59 |
--------------------------------------------------------------------------------
/sql/create_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE TOPIC (
2 | ID INTEGER PRIMARY KEY,
3 | title TEXT,
4 | author TEXT,
5 | author_id INTEGER,
6 | content TEXT,
7 | content_rendered TEXT,
8 | replies INTEGER,
9 | node INTEGER,
10 | created INTEGER,
11 | time INTEGER
12 | );
13 |
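-- STATUS values written by v2ex_tester/topic_tester.py:
--   0 = topic publicly visible, 1 = sign-in required, 2 = redirected to the home page,
--   3 = 404 topic not found (deleted)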
14 | CREATE TABLE STATUS (
15 | ID INTEGER PRIMARY KEY AUTOINCREMENT,
16 | T_ID INTEGER,
17 | NODE INTEGER,
18 | STATUS INTEGER,
19 | TIME INTEGER
20 | );
21 |
22 | CREATE TABLE NODES (
23 | ID INTEGER PRIMARY KEY,
24 | name TEXT,
25 | url TEXT,
26 | title TEXT,
27 | title_alternative TEXT,
28 | topics INTEGER,
29 | header TEXT,
30 | footer TEXT,
31 | created INTEGER,
32 | time INTEGER
33 | );
34 |
35 | CREATE VIEW HUMAN_READER AS
36 | SELECT TOPIC.ID,TOPIC.title,TOPIC.author,TOPIC.author_id,TOPIC.content,TOPIC.content_rendered,TOPIC.replies,
37 | NODES_1.name AS node_name,TOPIC.node AS node_id,
38 | TOPIC.created AS create_time,DATETIME(TOPIC.created,'unixepoch') AS create_time_h,TOPIC.time AS grab_time,DATETIME(TOPIC.time,'unixepoch') AS grab_time_h,
39 | STATUS.TIME AS test_time,DATETIME(STATUS.TIME,'unixepoch') AS test_time_h,NODES_2.name AS node_name_on_test,STATUS.NODE AS node_id_on_test,STATUS.STATUS
40 | FROM TOPIC
41 | INNER JOIN NODES AS NODES_1
42 | ON NODES_1.ID = TOPIC.node
43 | LEFT OUTER JOIN STATUS
44 | ON STATUS.T_ID = TOPIC.ID
45 | LEFT OUTER JOIN NODES AS NODES_2
46 | ON NODES_2.ID = STATUS.NODE
47 | ORDER BY TOPIC.ID ASC;
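
-- Example: list topics detected as deleted (STATUS = 3)
-- SELECT ID, title, author FROM HUMAN_READER WHERE STATUS = 3;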
48 |
--------------------------------------------------------------------------------
/topic_id_reenqueue.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jun 10, 2017
3 |
4 | @author: arch
5 | '''
6 | from redis import Redis
7 | from rq import Queue
8 | import sqlite3
9 | import os
10 | import json
11 | import logging
12 |
13 | from v2ex_spider import topic_spider
14 | import settings
15 |
16 | sql="select ID from TOPIC;"
17 | conn=sqlite3.connect(settings.database_path)
18 | cursor=conn.cursor()
19 | cursor.execute(sql)
20 | r=cursor.fetchall()
21 | ids_db=[int(x[0]) for x in r]
22 | max_id=max(ids_db)
23 | cursor.close()
24 | conn.commit()
25 | conn.close()
26 |
27 | redis_conn=Redis()
28 | q=Queue('topic',connection=redis_conn)
29 |
30 | def reenqueue_m(start_id,end_id):
31 | #load topic id
32 | if os.path.exists('.topics_all.json'):
33 | with open('.topics_all.json','r') as f:
34 | tmp_topics=json.load(f)
35 | else:
36 | tmp_topics=list()
37 | #work
38 | for x in range(int(start_id),int(end_id)+1):
39 | if x not in ids_db and x not in tmp_topics:
40 | logging.info('enqueue the topic %d' % int(x))
41 | q.enqueue(topic_spider.start,x, 10)
42 | tmp_topics.append(int(x))
43 | #save topic id
44 | topics_all=list()
45 | topics_all.extend(ids_db)
46 | topics_all.extend(tmp_topics)
47 | topics_all=list(set(topics_all))
48 | with open('.topics_all.json','w') as f:
49 | json.dump(topics_all, f)
50 | return
51 |
52 | def reenqueue_a():
53 | start_id=max_id-2000
54 | end_id=max_id-150
55 | return reenqueue_m(start_id, end_id)
56 |
57 | if __name__ == '__main__':
58 | import sys
59 | args=sys.argv
60 |     if len(args) == 2 and args[1] == 'auto':
61 |         reenqueue_a()
62 |         print('auto')
63 |     elif len(args) == 3:
64 |         start_id=args[1]
65 |         end_id=args[2]
66 |         print('manual')
67 |         reenqueue_m(start_id, end_id)
68 |     else:
69 |         print('Please input the topic id of start and end.')
70 |         print('Or use the auto mode.')
71 |         exit(2)
72 |     print('Finish!')
--------------------------------------------------------------------------------
/v2ex_base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yingziwu/v2ex_delete/8f1befcd43bd83c9531bf7180e23bbaf8b12b4ed/v2ex_base/__init__.py
--------------------------------------------------------------------------------
/v2ex_base/log_in.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 9, 2017
3 |
4 | @author: yingziwu
5 | '''
6 |
7 | import requests
8 | from lxml import etree
9 | import json
10 | import logging
11 |
12 | import settings
13 |
14 | class v2ex_log_in(object):
15 | '''
16 | log in v2ex account,return the cookies.
17 | '''
18 |
19 |
20 | def __init__(self):
21 | '''
22 | >>>import log_in
23 | >>>log_s=log_in.v2ex_log_in()
24 |         >>>log_s.log_in(1)
25 | >>>log_s.save_cookies()
26 | '''
27 | logging.info('start log in')
28 | self.load_config()
29 | return
30 |
31 | def load_config(self):
32 | logging.debug('start load_config')
33 | self.account=settings.account
34 | self.passwd=settings.password
35 | self.proxy_enable=settings.i_proxy_enable
36 | self.base_headers=settings.WEB_headers
37 | self.s=requests.session()
38 | self.s.headers=self.base_headers
39 | if self.proxy_enable:
40 | self.s.proxies=settings.i_proxies()
41 | return
42 |
43 | def log_in(self,try_time):
44 | logging.debug('start log_in')
45 | if try_time >= 4:
46 | logging.error(LogError('try time too much.'))
47 | raise LogError('try time too much.')
48 | #1
49 | try:
50 | r1=self.s.get('https://www.v2ex.com/signin', timeout=10)
51 | except requests.exceptions.RequestException as e:
52 | logging.error('log in error')
53 | logging.error(try_time)
54 | logging.error('proxy status: %s' % self.proxy_enable)
55 | if self.proxy_enable is True:
56 | logging.error('proxy: %s' % self.s.proxies)
57 | logging.error(e)
58 | try_time=try_time+1
59 | return self.log_in(try_time)
60 | if r1.status_code != 200:
61 | error_info='proxy status: %s, proxy: %s' % (str(settings.i_proxy_enable),str(self.s.proxies))
62 | raise LogError(error_info)
63 |         self.s.headers.update({'Referer': 'https://v2ex.com/signin'})  # update, not replace, so the User-Agent is kept
64 | t1=etree.HTML(r1.text)
65 | text_name=t1.xpath('//input[@type="text"]/@name')[-1]
66 | password_name=t1.xpath('//input[@type="password"]/@name')[0]
67 | once1=t1.xpath('//input[@type="hidden"]/@value')[0]
68 | post_data={text_name:self.account, password_name:self.passwd, 'once':str(once1), 'next':'/'}
69 | #r2
70 | try:
71 | r2=self.s.post('https://www.v2ex.com/signin', data=post_data)
72 | except requests.exceptions.RequestException as e:
73 | logging.error('log in error')
74 | logging.error(try_time)
75 | logging.error('proxy status: %s' % self.proxy_enable)
76 | if self.proxy_enable is True:
77 | logging.error('proxy: %s' % self.s.proxies)
78 | logging.error(e)
79 | try_time=try_time+1
80 | return self.log_in(try_time)
81 | return
82 |
83 | def save_cookies(self):
84 | logging.debug('start save_cookies')
85 | resp=self.s.get('https://www.v2ex.com/go/flamewar', timeout=10)
86 |         if '登录' in resp.text:  # the sign-in ("登录") link is still present, so we are not logged in
87 |             raise LogError('log in failed.')
88 | with open('.cookies.json','w') as f:
89 | json.dump(requests.utils.dict_from_cookiejar(self.s.cookies),f)
90 | return
91 |
92 | class LogError(ValueError):
93 | pass
94 |
95 | if __name__ == '__main__':
96 | tmp=v2ex_log_in()
97 | tmp.log_in(1)
98 | tmp.save_cookies()
99 | print('finish!')
100 |
--------------------------------------------------------------------------------
/v2ex_base/v2_sql.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 12, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | import settings
7 | import sqlite3
8 |
9 | class SQL(object):
10 | '''
11 | The sqlite class.
12 | '''
13 |
14 |
15 | def __init__(self):
16 | '''
17 | >>>from v2ex_base.v2_sql import SQL
18 | >>>SQ=SQL()
19 | >>>SQ.open_datebase()
20 | write_to_db_base
21 | >>>SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time)
22 | write_to_db_node
23 | >>>SQ.write_to_db_node(n_id,name,url,title,title_alternative,topics,header,footer,created,n_time)
24 | node_test
25 | >>>SQ.node_test(node_id,number_now)
26 | '''
27 | self.database_path=settings.database_path
28 |
29 | def open_datebase(self):
30 | self.conn=sqlite3.connect(self.database_path)
31 | self.cursor=self.conn.cursor()
32 |
33 | def close_datebase(self):
34 | self.cursor.close()
35 | self.conn.commit()
36 | self.conn.close()
37 |
38 | def write_to_db_base(self,t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time):
39 | sql="INSERT INTO TOPIC (ID,title,author,author_id,content,content_rendered,replies,node,created,time) VALUES ( %s );" % ', '.join(['?'] * 10)
40 | try:
41 | self.cursor.execute(sql,(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time))
42 | except sqlite3.IntegrityError as e:
43 | pass
44 | self.conn.commit()
45 | return
46 |
47 | def write_to_db_node(self,n_id,name,url,title,title_alternative,topics,header,footer,created,n_time):
48 | sql="REPLACE INTO NODES (ID,name,url,title,title_alternative,topics,header,footer,created,time) VALUES ( %s );" % ', '.join(['?'] * 10)
49 | try:
50 | self.cursor.execute(sql, (n_id,name,url,title,title_alternative,topics,header,footer,created,n_time))
51 | except sqlite3.IntegrityError as e:
52 | pass
53 | self.conn.commit()
54 | return
55 |
56 | def write_to_db_status(self,T_ID,NODE,STATUS,TIME):
57 | sql="INSERT INTO STATUS (T_ID,NODE,STATUS,TIME) VALUES ( %s );" % ', '.join(['?'] * 4)
58 | try:
59 | self.cursor.execute(sql,(T_ID,NODE,STATUS,TIME))
60 | except sqlite3.IntegrityError as e:
61 | pass
62 | self.conn.commit()
63 | return
64 |
65 | def node_test(self,node_id,number_now):
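        # True when the node is new to the database or its topic count changed since the last crawl.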
66 | sql="SELECT topics FROM NODES WHERE ID = %d;" % int(node_id)
67 | self.cursor.execute(sql)
68 | number_old_r=self.cursor.fetchone()
69 | if number_old_r is None:
70 | return True
71 | else:
72 | number_old=number_old_r[0]
73 | if int(number_old) != int(number_now):
74 | return True
75 | else:
76 | return False
77 |
--------------------------------------------------------------------------------
/v2ex_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yingziwu/v2ex_delete/8f1befcd43bd83c9531bf7180e23bbaf8b12b4ed/v2ex_spider/__init__.py
--------------------------------------------------------------------------------
/v2ex_spider/base_spider.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 12, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | import requests
7 | import time
8 | import logging
9 |
10 | from v2ex_base.v2_sql import SQL
11 | import settings
12 |
13 | class spider(object):
14 | '''
15 | A base Spider for v2ex.
16 | '''
17 |
18 |
19 | def __init__(self,url,sleep_time):
20 | '''
21 | >>>from v2ex_spider import base_spider
22 |         >>>base_spider.spider(url,sleep_time)
23 | '''
24 | logging.info('Start base spider. Url is %s' % url)
25 | self.url=url
26 | self.sleep_time=sleep_time
27 | time.sleep(int(self.sleep_time))
28 | self.SQ=SQL()
29 | self.SQ.open_datebase()
30 | #run
31 | self.load_config()
32 | self.spider()
33 | #end
34 | self.SQ.close_datebase()
35 | logging.info('Spider Finished.')
36 |
37 | def spider(self):
38 | logging.debug('start spider.')
39 | try:
40 | resp=self.s.get(self.url, timeout=10)
41 | except requests.exceptions.RequestException as e:
42 | logging.error('spider failed.')
43 | logging.error('proxy_status: %s' % settings.proxy_enable)
44 | if settings.proxy_enable is True:
45 | logging.error('proxy: %s' % self.s.proxies)
46 |             logging.error(e)
47 |             self.SQ.close_datebase()  # close the database before propagating, matching the non-200 path below
48 |             raise e
48 | if resp.status_code != 200:
49 | self.SQ.close_datebase()
50 | error_info='proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies))
51 | logging.error('API Error: proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies)))
52 | raise APIError(error_info)
53 | topics=resp.json()
54 | for topic in topics:
55 | t_id=topic["id"]
56 | title=topic["title"]
57 | author=topic["member"]["username"]
58 | author_id=topic["member"]["id"]
59 | content=topic["content"]
60 | content_rendered=topic["content_rendered"]
61 | replies=topic["replies"]
62 | node=topic["node"]["id"]
63 | created=topic["created"]
64 | n_time=int(time.time())
65 | self.SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time)
66 | self.SQ.conn.commit()
67 | return
68 |
69 | def load_config(self):
70 | logging.debug('start load_config')
71 | self.proxy_enable=settings.proxy_enable
72 | self.s=requests.session()
73 | self.s.headers=settings.API_headers
74 | if self.proxy_enable:
75 | self.s.proxies=settings.proxies()
76 | return
77 |
78 | class APIError(ValueError):
79 | pass
80 |
--------------------------------------------------------------------------------
/v2ex_spider/node_spider.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 9, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | import logging
7 |
8 | from v2ex_spider import base_spider
9 | import settings
10 |
11 | def start(node_id,sleep_time):
12 | logging.info('Start node spider. Node id is %d.' % int(node_id))
13 | url='https://www.v2ex.com/api/topics/show.json?node_id=%s' % str(node_id)
14 | base_spider.spider(url,sleep_time)
15 | return
16 |
17 | if __name__ == '__main__':
18 | start(12,5)
19 | print('Finish!')
--------------------------------------------------------------------------------
/v2ex_spider/rss_spider.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 9, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | import feedparser
7 | import time
8 | import re
9 | import requests
10 | from redis import Redis
11 | from rq import Queue
12 | import json
13 | import os
14 | import logging
15 |
16 | from v2ex_spider import topic_spider
17 | from v2ex_base.v2_sql import SQL
18 | import settings
19 |
20 |
21 | class Rss_spider(object):
22 | '''
23 | A Spider for v2ex's Rss.
24 | Get the latest and hot topic on the index.
25 | Using the rss generate the topic list that need to spider.
26 | '''
27 |
28 |
29 | def __init__(self):
30 | '''
31 | >>>from v2ex_spider import rss_spider
32 | >>>rss_spider.Rss_spider()
33 | '''
34 | logging.info('start Rss spider')
35 | self.v2ex_rss_url_list=['https://www.v2ex.com/index.xml',
36 | 'https://www.v2ex.com/feed/tab/qna.xml',
37 | 'https://www.v2ex.com/feed/tab/jobs.xml',
38 | 'https://www.v2ex.com/feed/tab/deals.xml',
39 | 'https://www.v2ex.com/feed/tab/city.xml',
40 | 'https://www.v2ex.com/feed/tab/play.xml',
41 | 'https://www.v2ex.com/feed/tab/apple.xml',
42 | 'https://www.v2ex.com/feed/tab/creative.xml',
43 | 'https://www.v2ex.com/feed/tab/tech.xml']
44 | self.latest_hot_api=['https://www.v2ex.com/api/topics/latest.json','https://www.v2ex.com/api/topics/hot.json']
45 | self.topic_sleep_time=10
46 | logging.debug('open sql database')
47 | self.SQ=SQL()
48 | self.SQ.open_datebase()
49 | self.redis_conn=Redis()
50 | self.load_config()
51 | #run
52 | try:
53 | self.latest_and_hot()
54 | except APIError as e:
55 | pass
56 | self.gen_topic_queue()
57 | #end
58 | self.SQ.close_datebase()
59 | logging.info('end the Rss spider')
60 |
61 | def topics_id_rss(self):
62 | logging.debug('fetch rss feeds')
63 | topic_ids=list()
64 | for v2ex_rss_url in self.v2ex_rss_url_list:
65 | feed=feedparser.parse(v2ex_rss_url)
66 | logging.debug('fetch rss feed: %s' % v2ex_rss_url)
67 | items=feed["items"]
68 | for item in items:
69 | author=item["author"]
70 | title=item["title"]
71 | link=item["link"]
72 |             published=item["date"]
73 | summary=item["summary"]
74 | topic_id=int(re.findall(r't\/(\d+)#?', link)[0])
75 | topic_ids.append(topic_id)
76 | topic_ids=set(topic_ids)
77 | return topic_ids
78 |
79 | def topics_id_sqlite(self):
80 | logging.debug('SELECT ID FROM TOPIC')
81 | sql='SELECT ID FROM TOPIC;'
82 | self.SQ.cursor.execute(sql)
83 | topics_ids=[x[0] for x in self.SQ.cursor.fetchall()]
84 | return topics_ids
85 |
86 | def latest_and_hot(self):
87 | logging.debug('start latest_and_hot')
88 | for url in self.latest_hot_api:
89 | try:
90 | resp=self.s.get(url, timeout=10)
91 | except requests.exceptions.RequestException as e:
92 | logging.error('latest_and_hot error')
93 | logging.error('proxy_status: %s' % self.proxy_enable)
94 | if self.proxy_enable is True:
95 | logging.error('proxy: %s' % self.s.proxies)
96 | logging.error(e)
97 | raise e
98 | if resp.status_code != 200:
99 | logging.error('latest_and_hot error')
100 | logging.error('proxy_status: %s' % self.proxy_enable)
101 | if self.proxy_enable is True:
102 | logging.error('proxy: %s' % self.s.proxies)
103 | logging.error(APIError('latest_and_hot'))
104 | raise APIError('latest_and_hot')
105 | topics=resp.json()
106 | for topic in topics:
107 | t_id=topic["id"]
108 | title=topic["title"]
109 | author=topic["member"]["username"]
110 | author_id=topic["member"]["id"]
111 | content=topic["content"]
112 | content_rendered=topic["content_rendered"]
113 | replies=topic["replies"]
114 | node=topic["node"]["id"]
115 | created=topic["created"]
116 | n_time=int(time.time())
117 | self.SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time)
118 | self.SQ.conn.commit()
119 | return
120 |
121 | def gen_topic_queue(self):
122 | logging.debug('start topic enqueue')
123 | topics_sql=self.topics_id_sqlite()
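        # Skip enqueueing while the database holds fewer than ~2000 topics (presumably so a
        # fresh install does not flood the topic queue before a baseline crawl exists).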
124 | if len(topics_sql) <= 2000:
125 | return
126 | topics_rss=self.topics_id_rss()
127 | # load topics
128 | if os.path.exists('.topics_all.json'):
129 | with open('.topics_all.json','r') as f:
130 | tmp_topics=json.load(f)
131 | else:
132 | tmp_topics=list()
133 | t_queue=Queue('topic',connection=self.redis_conn)
134 | # gen queue
135 | for topic in topics_rss:
136 | if topic not in topics_sql and topic not in tmp_topics:
137 | topic_id=int(topic)
138 | t_queue.enqueue(topic_spider.start,topic_id, self.topic_sleep_time)
139 | #save topics
140 | topics_all=list()
141 | topics_all.extend(tmp_topics)
142 | topics_all.extend(topics_rss)
143 | topics_all.extend(topics_sql)
144 | topics_all=list(set(topics_all))
145 | with open('.topics_all.json','w') as f:
146 | json.dump(topics_all, f)
147 | return
148 |
149 | def load_config(self):
150 | logging.debug('load config')
151 | self.proxy_enable=settings.i_proxy_enable
152 | self.s=requests.session()
153 | self.s.headers=settings.API_headers
154 | if self.proxy_enable:
155 | self.s.proxies=settings.i_proxies()
156 |
157 | class APIError(ValueError):
158 | pass
159 |
160 | if __name__ == '__main__':
161 | Rss_spider()
162 | print('Finish!')
--------------------------------------------------------------------------------
/v2ex_spider/topic_spider.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 10, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | import logging
7 |
8 | from v2ex_spider import base_spider
9 | import settings
10 |
11 | def start(topic_id, sleep_time):
12 | logging.info('Start topic spider. Topic id is %d.' % int(topic_id))
13 | url='https://www.v2ex.com/api/topics/show.json?id=%s' % str(topic_id)
14 | base_spider.spider(url,sleep_time)
15 | return
16 |
17 | if __name__ == '__main__':
18 | start(1,5)
19 | print('Finish!')
--------------------------------------------------------------------------------
/v2ex_tester/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yingziwu/v2ex_delete/8f1befcd43bd83c9531bf7180e23bbaf8b12b4ed/v2ex_tester/__init__.py
--------------------------------------------------------------------------------
/v2ex_tester/topic_tester.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 9, 2017
3 |
4 | @author: yingziwu
5 | '''
6 | import requests
7 | import json
8 | from lxml import etree
9 | import time
10 | import re
11 | import logging
12 |
13 | from v2ex_base.v2_sql import SQL
14 | import settings
15 |
16 | class tester(object):
17 | '''
18 | The tester for v2ex topics.
19 | '''
20 |
21 |
22 | def __init__(self):
23 | '''
24 | >>>from v2ex_tester import topic_tester
25 |         >>>topic_tester.start(topic_id,sleep_time)
26 | '''
27 | logging.debug('init class tester')
28 | self.s=requests.session()
29 | if settings.proxy_enable is True:
30 | self.s.proxies=settings.proxies()
31 | self.s.headers=settings.WEB_headers
32 | self.log_status=False
33 |
34 | def init_database(self):
35 | logging.debug('init database')
36 | self.SQ=SQL()
37 | self.SQ.open_datebase()
38 |
39 | def log_in(self):
40 | logging.debug('log in account')
41 | with open('.cookies.json','r') as f:
42 | cookies=requests.utils.cookiejar_from_dict(json.load(f))
43 | self.s.cookies=cookies
44 | self.s.headers=settings.WEB_headers
45 | self.log_status=True
46 | return
47 |
48 | def web_test(self,t_id,status):
49 | logging.debug('Start web_test')
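        # STATUS codes used below: 0 = publicly visible, 1 = sign-in required,
        # 2 = redirected to the home page, 3 = 404 topic not found (deleted).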
50 | url='https://www.v2ex.com/t/%s' % str(t_id)
51 | n_time=int(time.time())
52 | try:
53 | resp=self.s.get(url, timeout=10)
54 | except requests.exceptions.RequestException as e:
55 | logging.error('web_test failed.')
56 | logging.error('proxy_status: %s' % settings.proxy_enable)
57 | if settings.proxy_enable is True:
58 | logging.error('proxy: %s' % self.s.proxies)
59 | logging.error(e)
60 | raise e
61 | if resp.status_code == 403:
62 | error_info='proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies))
63 | logging.error('API Error: proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies)))
64 | raise APIError(error_info)
65 |         if resp.status_code == 404 and '404 Topic Not Found' in resp.text:
66 | return {'T_ID':int(t_id),'NODE':None,'STATUS':3,'TIME':n_time}
67 | if resp.url == 'https://www.v2ex.com/':
68 | return self.api_test(t_id, status=2)
69 | if 'signin' in resp.url and self.log_status is False:
70 | # self.log_in()
71 | # return self.web_test(t_id, status=1)
72 | return self.api_test(t_id, status=1)
73 | tree=etree.HTML(resp.text)
74 | node_name=re.findall(r'\/go\/(\w+)', tree.xpath('//div[@class="header"]/a[2]/@href')[0])[0]
75 | self.SQ.cursor.execute("SELECT ID FROM NODES WHERE name == '%s';" % node_name)
76 | node_id=self.SQ.cursor.fetchone()[0]
77 | return {'T_ID':int(t_id),'NODE':node_id,'STATUS':status,'TIME':n_time}
78 |
79 | def api_test(self,t_id,status):
80 | logging.debug('Start api_test')
81 | self.s_a=requests.session()
82 | if settings.proxy_enable is True:
83 | self.s_a.proxies=settings.proxies()
84 | self.s_a.headers=settings.API_headers
85 | url='https://www.v2ex.com/api/topics/show.json?id=%s' % str(t_id)
86 | n_time=int(time.time())
87 | try:
88 | resp=self.s_a.get(url, timeout=10)
89 | except requests.exceptions.RequestException as e:
90 | logging.error('api_test failed.')
91 | logging.error('proxy_status: %s' % settings.proxy_enable)
92 | if settings.proxy_enable is True:
93 |                 logging.error('proxy: %s' % self.s_a.proxies)
94 | logging.error(e)
95 | raise e
96 | if resp.status_code != 200:
97 |             error_info='proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s_a.proxies))
98 |             logging.error('API Error: proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s_a.proxies)))
99 | raise APIError(error_info)
100 | if len(resp.json()) == 0:
101 | return {'T_ID':int(t_id),'NODE':None,'STATUS':3,'TIME':n_time}
102 | topic=resp.json()[0]
103 | node_id=topic["node"]["id"]
104 | return {'T_ID':int(t_id),'NODE':node_id,'STATUS':status,'TIME':n_time}
105 |
106 | def write_to_sql(self,T_ID, NODE, STATUS, TIME):
107 | self.SQ.write_to_db_status(T_ID, NODE, STATUS, TIME)
108 | return
109 |
110 | class APIError(ValueError):
111 | pass
112 |
113 | def start(t_id,sleep_time):
114 | logging.info('Start topic test. Topic id is %d.' % int(t_id))
115 | time.sleep(sleep_time)
116 | t=tester()
117 | t.init_database()
118 | result=t.web_test(t_id, 0)
119 | t.write_to_sql(result['T_ID'],result['NODE'],result['STATUS'],result['TIME'])
120 | t.SQ.close_datebase()
121 | if result['NODE'] is not None:
122 | logging.info('Topic test finish. Topic id is %d, results is : node id %d, status %d' % (int(t_id),result['NODE'],result['STATUS']))
123 | else:
124 | logging.info('Topic test finish. Topic id is %d, results is : node id is None, status %d' % (int(t_id),result['STATUS']))
125 | return
126 |
127 | if __name__ == '__main__':
128 | # start(1,5)
129 | start(375807,5)
130 | print('finish!')
--------------------------------------------------------------------------------