├── .gitignore
├── README.md
├── Run.sh
├── Start_rqworker.sh
├── license.txt
├── requirements.txt
├── run.py
├── settings.py.example
├── sql
│   └── create_table.sql
├── topic_id_reenqueue.py
├── v2ex_base
│   ├── __init__.py
│   ├── log_in.py
│   └── v2_sql.py
├── v2ex_spider
│   ├── __init__.py
│   ├── base_spider.py
│   ├── node_spider.py
│   ├── rss_spider.py
│   └── topic_spider.py
└── v2ex_tester
    ├── __init__.py
    └── topic_tester.py

/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # Created by https://www.gitignore.io/api/python
3 | 
4 | ### Python ###
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *,cover
51 | .hypothesis/
52 | 
53 | # Translations
54 | *.mo
55 | *.pot
56 | 
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | 
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 | 
65 | # Scrapy stuff:
66 | .scrapy
67 | 
68 | # Sphinx documentation
69 | docs/_build/
70 | 
71 | # PyBuilder
72 | target/
73 | 
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 | 
77 | # pyenv
78 | .python-version
79 | 
80 | # celery beat schedule file
81 | celerybeat-schedule
82 | 
83 | # SageMath parsed files
84 | *.sage.py
85 | 
86 | # dotenv
87 | .env
88 | 
89 | # virtualenv
90 | .venv
91 | venv/
92 | ENV/
93 | 
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 | 
98 | # Rope project settings
99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # End of https://www.gitignore.io/api/python
105 | #Custom
106 | .project
107 | .pydevproject
108 | .cookies.json
109 | database.db
110 | .time_log.json
111 | *~
112 | dump.rdb
113 | .topics_all.json
114 | settings.py
115 | .node_number.json
116 | .topics_tester.json
117 | rsync_vps.sh
118 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # v2ex Deleted-Post Monitor
2 | 
3 | ## Motivation
4 | [v2ex](https://www.v2ex.com/) has long promoted the inability to delete posts and replies as a defining feature of the community.
5 | 
6 | At first I also believed that v2ex never deleted posts — at most it moved them to special nodes.
7 | Later I discovered that v2ex does delete posts. That was a little disappointing, but understandable: the site is ICP-registered in mainland China, so some compromise is to be expected.
8 | 
9 | Then one day I noticed that [/t/349115](https://www.v2ex.com/t/349115) was deleted shortly after it was published, which made me feel deceived.
10 | So the idea of writing a deletion monitor for v2ex was born.
11 | 
12 | ## About post deletion on v2ex
13 | 
14 | Publishing the first round of deletion statistics triggered [some discussion](https://www.v2ex.com/t/368217).
15 | 
16 | Some users suggested adding clarifications to prevent misunderstandings.
17 | So here are my notes on how deletion works on v2ex.
18 | 
19 | - V2EX has never claimed that posts are "absolutely never deleted". The more precise rule about deletion is that users cannot delete or edit their own topics and replies once posted. *[Note 1](#note1)*
20 | - Situations in which posts are deleted: *[Note 2](#note2)*
21 |   1. Spam is removed completely.
22 |   1. Some topics with zero replies are deleted at the author's request.
23 |   1. Topics in [/go/sandbox](https://www.v2ex.com/go/sandbox) are cleaned up from time to time.
24 |   1. Topics containing personal attacks or extreme political speech are also deleted.
25 |   1. Topics are also deleted for certain special reasons (e.g. lawyer's letters).
26 | 
27 | **Note 1:**
28 | https://www.v2ex.com/t/368217#r_4423842
29 | 
30 | **Note 2:**
31 | https://www.v2ex.com/t/192635#r_2079456
32 | https://www.v2ex.com/t/231563#r_2568942
33 | 
34 | ## Usage
35 | 
36 | ### Install Redis
37 | Ubuntu:
38 | ```
39 | $ sudo apt-get update
40 | $ sudo apt-get upgrade
41 | $ sudo apt-get -y install redis-server
42 | ```
43 | For other systems, see: https://redis.io/topics/quickstart
44 | 
45 | ### Install dependencies
46 | ```
47 | $ pip3 install -r requirements.txt
48 | $ pip3 install requests[socks]
49 | ```
50 | 
51 | ### Create the database
52 | Use `sql/create_table.sql` to create the SQLite database with sqlite3.
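53 | For example (a minimal sketch — `database.db` simply matches the default `database_path` in `settings.py.example`; adjust it if you configure a different path):
54 | ```
55 | $ sqlite3 database.db < sql/create_table.sql
56 | ```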
57 | 
58 | ### Edit the configuration
59 | ```
60 | $ cp settings.py.example settings.py
61 | ```
62 | Edit `settings.py`: fill in a v2ex account (a throwaway account is recommended), add User-Agent strings, adjust the proxy settings as needed, and point `database_path` at your database.
63 | 
64 | Edit `Run.sh`, replacing `/home/arch/python/v2ex_delete` with the path of this project.
65 | 
66 | ### Add a crontab entry
67 | Add `Run.sh` to crontab, for example:
68 | `* * * * * bash /home/arch/python/v2ex_delete/Run.sh`
69 | 
70 | Before this step, make sure `redis-server` is already running.
71 | 
72 | ### Start the rq workers
73 | In the project directory, run
74 | ```
75 | ./Start_rqworker.sh
76 | ```
77 | 
78 | ## Recommendations
79 | 1. To lower the risk of being banned, configure proxies and User-Agents.
80 | 1. Avoid using proxies for important requests where possible; if you must, make sure the proxies are stable.
81 | 1. Use proxies for ordinary requests so that your main IP does not get banned. In practice, ten **stable** proxies are enough.
82 | 1. Adjust the number of workers on the failed queue to match proxy stability: the higher the request failure rate, the more workers you need.
83 | 1. Monitor the project regularly with `rqinfo`; if the failed queue grows long, check your proxies promptly.
84 | 1. The project has two crawl modes, `Mode1` and `Mode2`. `Mode2` issues fewer requests at slightly lower coverage than `Mode1`; use `Mode2` when you have no proxies or only a few.
85 | 1. By default, deletion checks start two weeks after a topic is posted. To change this, edit the SQL statement in the `tester_tasker` function in `run.py`.
--------------------------------------------------------------------------------
/Run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cd /home/arch/python/v2ex_delete
4 | python3 run.py
5 | 
--------------------------------------------------------------------------------
/Start_rqworker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | rqworker node1 &> log/node1_rqworker.log &
4 | rqworker node2 &> log/node2_rqworker.log &
5 | rqworker node3 &> log/node3_rqworker.log &
6 | rqworker node4 &> log/node4_rqworker.log &
7 | rqworker node5 &> log/node5_rqworker.log &
8 | rqworker topic &> log/topic_rqworker.log &
9 | rqworker tester &> log/tester_rqworker.log &
10 | rqworker failed &> log/failed1_rqworker.log &
11 | rqworker failed &> log/failed2_rqworker.log &
12 | rqworker failed &> log/failed3_rqworker.log &
13 | rqworker failed &> log/failed4_rqworker.log &
14 | rqworker failed &> log/failed5_rqworker.log &
15 | rqworker failed &> log/failed6_rqworker.log &
16 | echo "All rqworkers started."
17 | 
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | 
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 | 
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 | 
8 | 1. Definitions.
9 | 
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 | 
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity.
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | Copyright 2017 yingziwu 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 
192 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | lxml 3 | redis 4 | rq 5 | feedparser 6 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 9, 2017 3 | 4 | @author: yingziwu 5 | ''' 6 | import json 7 | import time 8 | import requests 9 | import os 10 | from redis import Redis 11 | from rq import Queue 12 | import logging 13 | 14 | from v2ex_spider import node_spider 15 | from v2ex_spider import rss_spider 16 | from v2ex_base.v2_sql import SQL 17 | from v2ex_tester import topic_tester 18 | from v2ex_base import log_in 19 | import topic_id_reenqueue 20 | import settings 21 | 22 | class Start(object): 23 | ''' 24 | Start the project. 25 | ''' 26 | 27 | def __init__(self): 28 | ''' 29 | $ python run.py 30 | or 31 | $ ./Run.sh 32 | ''' 33 | logging.info('start') 34 | logging.debug('open sql database.') 35 | self.SQ=SQL() 36 | self.SQ.open_datebase() 37 | self.redis_conn=Redis() 38 | self.load_config() 39 | #base 40 | self.load_json() 41 | # self.update_cookies() 42 | try: 43 | self.update_nodes() 44 | except APIError as e: 45 | pass 46 | 47 | def Mode1(self): 48 | logging.info('start mode1') 49 | #start 50 | self.get_rss() 51 | self.tasker() 52 | self.topic_ids_enqueue() 53 | self.tester_tasker() 54 | #end 55 | self.end() 56 | 57 | def Mode2(self): 58 | logging.info('start mode2') 59 | #start 60 | 61 | self.get_rss() 62 | self.topic_ids_enqueue() 63 | self.tester_tasker() 64 | #end 65 | self.end() 66 | 67 | def end(self): 68 | self.SQ.close_datebase() 69 | self.dump_json() 70 | logging.info('end') 71 | 72 | def load_json(self): 73 | logging.debug('load json') 74 | #load .time_log.json 75 | if os.path.exists('.time_log.json'): 76 | with open('.time_log.json','r') as f: 77 | self.time_log=json.load(f) 78 | else: 79 | self.time_log={'cookies_time':'0','nodes_time':'0','8000_node':'0','4000_node':'0','1000_node':'0','500_node':'0','0_node':'0','rss_time':'0','tester':'0', 80 | 'topic_id_reenqueue':'0'} 81 | #load .node_number.json 82 | if os.path.exists('.node_number.json'): 83 | with open('.node_number.json','r') as f: 84 | self.node_number=json.load(f) 85 | else: 86 | self.node_number=list() 87 | return 88 | 89 | def dump_json(self): 90 | #dump .time_log.json 91 | with open('.time_log.json','w') as f1: 92 | json.dump(self.time_log, f1) 93 | #dump .node_number.json 94 | with open('.node_number.json','w') as f2: 95 | self.node_number=list(set(self.node_number)) 96 | json.dump(self.node_number,f2) 97 | return 98 | 99 | def topic_ids_enqueue(self): 100 | if int(time.time())-int(self.time_log['topic_id_reenqueue']) >= 1800: 101 | logging.info('start topic id reenqueue') 102 | max_id=topic_id_reenqueue.max_id 103 | topic_id_reenqueue.reenqueue_m(max_id-2000, max_id-29) 104 | self.time_log['topic_id_reenqueue']=str(int(time.time())) 105 | return 106 | 107 | def update_cookies(self): 108 | if int(time.time())-int(self.time_log["cookies_time"]) >= 86400 * 4: 109 | cookies_time_status = False 110 | else: 111 | cookies_time_status = True 112 | if not os.path.exists('cookies.txt') or cookies_time_status is False: 113 | logging.debug('update cookies') 114 | try: 115 | log_s=log_in.v2ex_log_in() 116 | log_s.log_in(3) 117 | log_s.save_cookies() 118 | except log_in.LogError as e: 119 | return 120 | 
self.time_log["cookies_time"]=str(int(time.time())) 121 | return 122 | 123 | def update_nodes(self): 124 | if int(time.time())-int(self.time_log["nodes_time"]) >= 10800: 125 | nodes_time_status=False 126 | else: 127 | nodes_time_status=True 128 | if not nodes_time_status: 129 | logging.info('update nodes') 130 | try: 131 | resp=self.s.get('https://www.v2ex.com/api/nodes/all.json', timeout=10) 132 | except requests.exceptions.RequestException as e: 133 | logging.error('update_node failed.') 134 | logging.error('proxy_status: %s' % settings.i_proxy_enable) 135 | if settings.i_proxy_enable is True: 136 | logging.error('proxy: %s' % self.s.proxies) 137 | logging.error(e) 138 | self.node_number=list(set(self.node_number)) 139 | return 140 | if resp.status_code != 200: 141 | logging.error('update_node failed.') 142 | logging.error('proxy_status: %s' % settings.i_proxy_enable) 143 | if settings.i_proxy_enable is True: 144 | logging.error('proxy: %s' % self.s.proxies) 145 | logging.error(APIError('update_node')) 146 | self.node_number=list(set(self.node_number)) 147 | raise APIError('update_node') 148 | nodes=resp.json() 149 | for node in nodes: 150 | n_id=node["id"] 151 | name=node["name"] 152 | url=node["url"] 153 | title=node["title"] 154 | title_alternative=node["title_alternative"] 155 | topics=node["topics"] 156 | header=node["header"] 157 | footer=node["footer"] 158 | created=node["created"] 159 | n_time=int(time.time()) 160 | if self.SQ.node_test(n_id, topics) is True: 161 | self.node_number.append(int(n_id)) 162 | self.SQ.write_to_db_node(n_id, name, url, title, title_alternative, topics, header, footer, created, n_time) 163 | self.time_log["nodes_time"]=str(int(time.time())) 164 | self.node_number=list(set(self.node_number)) 165 | return 166 | 167 | def tasker(self): 168 | node_configs_1=[{'sql':'SELECT ID FROM NODES WHERE topics >= 8000;','sleep_time':5,'between_time':900,'time_log':'8000_node','queue_name':'node1'}, 169 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;','sleep_time':10,'between_time':1800,'time_log':'4000_node','queue_name':'node2'}, 170 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;','sleep_time':20,'between_time':7200,'time_log':'1000_node','queue_name':'node3'}, 171 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;','sleep_time':90,'between_time':86400,'time_log':'500_node','queue_name':'node4'}, 172 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;','sleep_time':90,'between_time':86400,'time_log':'0_node','queue_name':'node5'}] 173 | node_configs_2=[{'sql':'SELECT ID FROM NODES WHERE topics >= 8000;','sleep_time':5,'between_time':1800,'time_log':'8000_node','queue_name':'node1'}, 174 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;','sleep_time':10,'between_time':3600,'time_log':'4000_node','queue_name':'node2'}, 175 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;','sleep_time':20,'between_time':14400,'time_log':'1000_node','queue_name':'node3'}, 176 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;','sleep_time':90,'between_time':86400,'time_log':'500_node','queue_name':'node4'}, 177 | {'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;','sleep_time':90,'between_time':86400,'time_log':'0_node','queue_name':'node5'}] 178 | time.tzname=('CST', 'CST') 179 | if int(time.strftime('%H')) >= 8 or int(time.strftime('%H')) < 2: 180 | node_configs=node_configs_1 181 | else: 182 | node_configs=node_configs_2 183 | for node_config in node_configs: 
184 |             sql=node_config['sql']
185 |             sleep_time=node_config['sleep_time']
186 |             between_time=node_config['between_time']
187 |             time_log_name=node_config['time_log']
188 |             queue_name=node_config['queue_name']
189 |             q_node=Queue(queue_name,connection=self.redis_conn)
190 |             if int(time.time()) - int(self.time_log[time_log_name]) >= between_time:
191 |                 logging.info('start enqueue, queue name: %s' % queue_name)
192 |                 self.SQ.cursor.execute(sql)
193 |                 node_ids=self.SQ.cursor.fetchall()
194 |                 for node_id in node_ids:
195 |                     node_id=node_id[0]
196 |                     if queue_name not in ['node4','node5'] or (queue_name in ['node4','node5'] and node_id in self.node_number):
197 |                         if queue_name in ['node4','node5']:
198 |                             self.node_number.remove(int(node_id))
199 |                         q_node.enqueue(node_spider.start,node_id,sleep_time)
200 |                 self.time_log[time_log_name]=str(int(time.time()))
201 |         return
202 | 
203 |     def get_rss(self):
204 |         if int(time.time())-int(self.time_log["rss_time"]) >= 600:
205 |             logging.debug('start get_rss')
206 |             try:
207 |                 rss_spider.Rss_spider()
208 |             except requests.exceptions.RequestException as e:
209 |                 self.time_log["rss_time"]=str(int(time.time()))
210 |                 return
211 |             self.time_log["rss_time"]=str(int(time.time()))
212 |         return
213 | 
214 |     def load_config(self):
215 |         logging.debug('load config')
216 |         self.proxy_enable=settings.i_proxy_enable
217 |         self.s=requests.session()
218 |         self.s.headers=settings.API_headers
219 |         if self.proxy_enable:
220 |             self.s.proxies=settings.i_proxies()
221 |         return
222 | 
223 |     def tester_tasker(self):
224 |         if int(time.time())-int(self.time_log["tester"]) >= 1800:
225 |             logging.info('start enqueue tester')
226 |             #load json
227 |             if os.path.exists('.topics_tester.json'):
228 |                 with open('.topics_tester.json','r') as f:
229 |                     tmp_topics=json.load(f)
230 |             else:
231 |                 tmp_topics=list()
232 |             #main
233 |             sql="SELECT ID FROM TOPIC WHERE (time - created) < 345600 AND ID NOT IN (SELECT T_ID FROM STATUS) AND (STRFTIME('%s','now') - created) > 1209600;"
234 |             sleep_time=20
235 |             self.SQ.cursor.execute(sql)
236 |             topic_ids=[x[0] for x in self.SQ.cursor.fetchall()]
237 |             q=Queue('tester',connection=self.redis_conn)
238 |             for topic_id in topic_ids:
239 |                 if topic_id not in tmp_topics:
240 |                     q.enqueue(topic_tester.start,topic_id, sleep_time)
241 |                     tmp_topics.append(topic_id)
242 |             #end
243 |             tmp_topics=list(set(tmp_topics))
244 |             with open('.topics_tester.json','w') as f:
245 |                 json.dump(tmp_topics,f)
246 |             self.time_log["tester"]=str(int(time.time()))
247 |         return
248 | 
249 | class APIError(ValueError):
250 |     pass
251 | 
252 | if __name__ == '__main__':
253 |     S=Start()
254 |     if settings.mode == 'Mode1':
255 |         S.Mode1()
256 |     elif settings.mode == 'Mode2':
257 |         S.Mode2()
--------------------------------------------------------------------------------
/settings.py.example:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 12, 2017
3 | 
4 | @author: yingziwu
5 | '''
6 | # Settings
7 | ## v2ex account  # a throwaway account is recommended
8 | account='v2ex_account'  # v2ex username
9 | password='v2ex_password'  # v2ex password
10 | 
11 | ## User-Agent setting
12 | ### WEB_User_Agents: pool of User-Agents used when visiting the v2ex web pages. Add your own.
13 | WEB_User_Agents=['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2869.0 Safari/537.36']
14 | ### API_User_Agents: pool of User-Agents used when calling the v2ex API. Add your own.
15 | API_User_Agents=['Mozilla/5.0 (Linux; Android 6.0; Nexus 5X Build/MDB08L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.124 Mobile Safari/537.36']
16 | 
17 | 
18 | ## proxy setting
19 | proxy_enable=False  # whether to use proxies for ordinary requests; disabled by default.
20 | ### proxies_list: proxy pool; see http://docs.python-requests.org/en/master/user/advanced/#proxies for the format.
21 | proxies_list=[{"http":"socks5://localhost:1080","https":"socks5://localhost:1080"},
22 |               {"http":"http://localhost:8090","https":"http://localhost:8090"},]
23 | 
24 | i_proxy_enable=False  # whether to use proxies for important requests; disabled by default. (run.py, log_in.py and rss_spider.py make the important requests)
25 | i_proxies_list=[{"http":"socks5://127.0.0.1:1080","https":"socks5://127.0.0.1:1080"}]
26 | 
27 | ## database setting
28 | database_path="/home/arch/python/v2ex_delete/database.db"  # path to the sqlite database
29 | 
30 | ## log
31 | import logging
32 | import os
33 | log_level=logging.INFO  # log level
34 | log_dir='log'  # log directory
35 | log_path=os.path.join(log_dir,'all.log')
36 | logging.basicConfig(level=log_level,
37 |                     filename=log_path,filemode='a',
38 |                     format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
39 | 
40 | ## mode (Mode1 or Mode2)
41 | mode='Mode1'  # crawl mode
42 | 
43 | # Program
44 | import random
45 | WEB_headers={'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
46 |              'Accept-Encoding':"gzip, deflate",
47 |              'Accept-Language':"zh-CN,zh;q=0.8",
48 |              'User-Agent':random.choice(WEB_User_Agents)}
49 | API_headers={'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
50 |              'Accept-Encoding':"gzip, deflate",
51 |              'Accept-Language':"zh-CN,zh;q=0.8",
52 |              'User-Agent':random.choice(API_User_Agents)}
53 | 
54 | def proxies():
55 |     return random.choice(proxies_list)
56 | 
57 | def i_proxies():
58 |     return random.choice(i_proxies_list)
--------------------------------------------------------------------------------
/sql/create_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE TOPIC (
2 |   ID INTEGER PRIMARY KEY,
3 |   title TEXT,
4 |   author TEXT,
5 |   author_id INTEGER,
6 |   content TEXT,
7 |   content_rendered TEXT,
8 |   replies INTEGER,
9 |   node INTEGER,
10 |   created INTEGER,
11 |   time INTEGER
12 | );
13 | 
14 | CREATE TABLE STATUS (
15 |   ID INTEGER PRIMARY KEY AUTOINCREMENT,
16 |   T_ID INTEGER,
17 |   NODE INTEGER,
18 |   STATUS INTEGER,
19 |   TIME INTEGER
20 | );
21 | 
22 | CREATE TABLE NODES (
23 |   ID INTEGER PRIMARY KEY,
24 |   name TEXT,
25 |   url TEXT,
26 |   title TEXT,
27 |   title_alternative TEXT,
28 |   topics INTEGER,
29 |   header TEXT,
30 |   footer TEXT,
31 |   created INTEGER,
32 |   time INTEGER
33 | );
34 | 
35 | CREATE VIEW HUMAN_READER AS
36 | SELECT TOPIC.ID,TOPIC.title,TOPIC.author,TOPIC.author_id,TOPIC.content,TOPIC.content_rendered,TOPIC.replies,
37 | NODES_1.name AS node_name,TOPIC.node AS node_id,
38 | TOPIC.created AS create_time,DATETIME(TOPIC.created,'unixepoch') AS create_time_h,TOPIC.time AS grab_time,DATETIME(TOPIC.time,'unixepoch') AS grab_time_h,
39 | STATUS.TIME AS test_time,DATETIME(STATUS.TIME,'unixepoch') AS test_time_h,NODES_2.name AS node_name_on_test,STATUS.NODE AS node_id_on_test,STATUS.STATUS
40 | FROM TOPIC
41 | INNER JOIN NODES AS NODES_1
42 | ON NODES_1.ID = TOPIC.node
43 | LEFT OUTER JOIN STATUS
44 | ON STATUS.T_ID = TOPIC.ID
45 | LEFT OUTER JOIN NODES AS NODES_2
46 | ON NODES_2.ID = STATUS.NODE
47 | ORDER BY TOPIC.ID ASC;
48 | 
--------------------------------------------------------------------------------
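The `HUMAN_READER` view defined above joins every topic with its test results, which makes inspecting detected deletions from the command line straightforward. For example (a sketch — it assumes the default `database.db` path from `settings.py.example`, and relies on `STATUS` value 3, which `v2ex_tester/topic_tester.py` records when a topic page returns "404 Topic Not Found"):

```
$ sqlite3 database.db "SELECT ID, title, node_name, test_time_h FROM HUMAN_READER WHERE STATUS = 3;"
```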
/topic_id_reenqueue.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jun 10, 2017
3 | 
4 | @author: arch
5 | '''
6 | from redis import Redis
7 | from rq import Queue
8 | import sqlite3
9 | import os
10 | import json
11 | import logging
12 | 
13 | from v2ex_spider import topic_spider
14 | import settings
15 | 
16 | sql="select ID from TOPIC;"
17 | conn=sqlite3.connect(settings.database_path)
18 | cursor=conn.cursor()
19 | cursor.execute(sql)
20 | r=cursor.fetchall()
21 | ids_db=[int(x[0]) for x in r]
22 | max_id=max(ids_db)
23 | cursor.close()
24 | conn.commit()
25 | conn.close()
26 | 
27 | redis_conn=Redis()
28 | q=Queue('topic',connection=redis_conn)
29 | 
30 | def reenqueue_m(start_id,end_id):
31 |     #load the ids of topics that have already been enqueued
32 |     if os.path.exists('.topics_all.json'):
33 |         with open('.topics_all.json','r') as f:
34 |             tmp_topics=json.load(f)
35 |     else:
36 |         tmp_topics=list()
37 |     #enqueue every id in the range that is neither in the database nor already queued
38 |     for x in range(int(start_id),int(end_id)+1):
39 |         if x not in ids_db and x not in tmp_topics:
40 |             logging.info('enqueue the topic %d' % int(x))
41 |             q.enqueue(topic_spider.start,x, 10)
42 |             tmp_topics.append(int(x))
43 |     #save the updated id list
44 |     topics_all=list()
45 |     topics_all.extend(ids_db)
46 |     topics_all.extend(tmp_topics)
47 |     topics_all=list(set(topics_all))
48 |     with open('.topics_all.json','w') as f:
49 |         json.dump(topics_all, f)
50 |     return
51 | 
52 | def reenqueue_a():
53 |     start_id=max_id-2000
54 |     end_id=max_id-150
55 |     return reenqueue_m(start_id, end_id)
56 | 
57 | if __name__ == '__main__':
58 |     import sys
59 |     args=sys.argv
60 |     if len(args) == 2 and args[1] == 'auto':
61 |         print('auto')
62 |         reenqueue_a()
63 |     elif len(args) == 3:
64 |         start_id=int(args[1])
65 |         end_id=int(args[2])
66 |         print('manual')
67 |         reenqueue_m(start_id, end_id)
68 |     else:
69 |         print('Please input the topic id of start and end.')
70 |         print('Or use the auto mode.')
71 |         exit(2)
72 |     print('Finish!')
--------------------------------------------------------------------------------
/v2ex_base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yingziwu/v2ex_delete/8f1befcd43bd83c9531bf7180e23bbaf8b12b4ed/v2ex_base/__init__.py
--------------------------------------------------------------------------------
/v2ex_base/log_in.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 9, 2017
3 | 
4 | @author: yingziwu
5 | '''
6 | 
7 | import requests
8 | from lxml import etree
9 | import json
10 | import logging
11 | 
12 | import settings
13 | 
14 | class v2ex_log_in(object):
15 |     '''
16 |     log in to a v2ex account and return the cookies.
17 | ''' 18 | 19 | 20 | def __init__(self): 21 | ''' 22 | >>>import log_in 23 | >>>log_s=log_in.v2ex_log_in() 24 | >>>log_s.log_in() 25 | >>>log_s.save_cookies() 26 | ''' 27 | logging.info('start log in') 28 | self.load_config() 29 | return 30 | 31 | def load_config(self): 32 | logging.debug('start load_config') 33 | self.account=settings.account 34 | self.passwd=settings.password 35 | self.proxy_enable=settings.i_proxy_enable 36 | self.base_headers=settings.WEB_headers 37 | self.s=requests.session() 38 | self.s.headers=self.base_headers 39 | if self.proxy_enable: 40 | self.s.proxies=settings.i_proxies() 41 | return 42 | 43 | def log_in(self,try_time): 44 | logging.debug('start log_in') 45 | if try_time >= 4: 46 | logging.error(LogError('try time too much.')) 47 | raise LogError('try time too much.') 48 | #1 49 | try: 50 | r1=self.s.get('https://www.v2ex.com/signin', timeout=10) 51 | except requests.exceptions.RequestException as e: 52 | logging.error('log in error') 53 | logging.error(try_time) 54 | logging.error('proxy status: %s' % self.proxy_enable) 55 | if self.proxy_enable is True: 56 | logging.error('proxy: %s' % self.s.proxies) 57 | logging.error(e) 58 | try_time=try_time+1 59 | return self.log_in(try_time) 60 | if r1.status_code != 200: 61 | error_info='proxy status: %s, proxy: %s' % (str(settings.i_proxy_enable),str(self.s.proxies)) 62 | raise LogError(error_info) 63 | self.s.headers={'Referer': 'https://v2ex.com/signin'} 64 | t1=etree.HTML(r1.text) 65 | text_name=t1.xpath('//input[@type="text"]/@name')[-1] 66 | password_name=t1.xpath('//input[@type="password"]/@name')[0] 67 | once1=t1.xpath('//input[@type="hidden"]/@value')[0] 68 | post_data={text_name:self.account, password_name:self.passwd, 'once':str(once1), 'next':'/'} 69 | #r2 70 | try: 71 | r2=self.s.post('https://www.v2ex.com/signin', data=post_data) 72 | except requests.exceptions.RequestException as e: 73 | logging.error('log in error') 74 | logging.error(try_time) 75 | logging.error('proxy status: %s' % self.proxy_enable) 76 | if self.proxy_enable is True: 77 | logging.error('proxy: %s' % self.s.proxies) 78 | logging.error(e) 79 | try_time=try_time+1 80 | return self.log_in(try_time) 81 | return 82 | 83 | def save_cookies(self): 84 | logging.debug('start save_cookies') 85 | resp=self.s.get('https://www.v2ex.com/go/flamewar', timeout=10) 86 | if '登录' in resp.text: 87 | raise LogError('log failed.') 88 | with open('.cookies.json','w') as f: 89 | json.dump(requests.utils.dict_from_cookiejar(self.s.cookies),f) 90 | return 91 | 92 | class LogError(ValueError): 93 | pass 94 | 95 | if __name__ == '__main__': 96 | tmp=v2ex_log_in() 97 | tmp.log_in(1) 98 | tmp.save_cookies() 99 | print('finish!') 100 | -------------------------------------------------------------------------------- /v2ex_base/v2_sql.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 12, 2017 3 | 4 | @author: yingziwu 5 | ''' 6 | import settings 7 | import sqlite3 8 | 9 | class SQL(object): 10 | ''' 11 | The sqlite class. 
12 | ''' 13 | 14 | 15 | def __init__(self): 16 | ''' 17 | >>>from v2ex_base.v2_sql import SQL 18 | >>>SQ=SQL() 19 | >>>SQ.open_datebase() 20 | write_to_db_base 21 | >>>SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time) 22 | write_to_db_node 23 | >>>SQ.write_to_db_node(n_id,name,url,title,title_alternative,topics,header,footer,created,n_time) 24 | node_test 25 | >>>SQ.node_test(node_id,number_now) 26 | ''' 27 | self.database_path=settings.database_path 28 | 29 | def open_datebase(self): 30 | self.conn=sqlite3.connect(self.database_path) 31 | self.cursor=self.conn.cursor() 32 | 33 | def close_datebase(self): 34 | self.cursor.close() 35 | self.conn.commit() 36 | self.conn.close() 37 | 38 | def write_to_db_base(self,t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time): 39 | sql="INSERT INTO TOPIC (ID,title,author,author_id,content,content_rendered,replies,node,created,time) VALUES ( %s );" % ', '.join(['?'] * 10) 40 | try: 41 | self.cursor.execute(sql,(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time)) 42 | except sqlite3.IntegrityError as e: 43 | pass 44 | self.conn.commit() 45 | return 46 | 47 | def write_to_db_node(self,n_id,name,url,title,title_alternative,topics,header,footer,created,n_time): 48 | sql="REPLACE INTO NODES (ID,name,url,title,title_alternative,topics,header,footer,created,time) VALUES ( %s );" % ', '.join(['?'] * 10) 49 | try: 50 | self.cursor.execute(sql, (n_id,name,url,title,title_alternative,topics,header,footer,created,n_time)) 51 | except sqlite3.IntegrityError as e: 52 | pass 53 | self.conn.commit() 54 | return 55 | 56 | def write_to_db_status(self,T_ID,NODE,STATUS,TIME): 57 | sql="INSERT INTO STATUS (T_ID,NODE,STATUS,TIME) VALUES ( %s );" % ', '.join(['?'] * 4) 58 | try: 59 | self.cursor.execute(sql,(T_ID,NODE,STATUS,TIME)) 60 | except sqlite3.IntegrityError as e: 61 | pass 62 | self.conn.commit() 63 | return 64 | 65 | def node_test(self,node_id,number_now): 66 | sql="SELECT topics FROM NODES WHERE ID = %d;" % int(node_id) 67 | self.cursor.execute(sql) 68 | number_old_r=self.cursor.fetchone() 69 | if number_old_r is None: 70 | return True 71 | else: 72 | number_old=number_old_r[0] 73 | if int(number_old) != int(number_now): 74 | return True 75 | else: 76 | return False 77 | -------------------------------------------------------------------------------- /v2ex_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yingziwu/v2ex_delete/8f1befcd43bd83c9531bf7180e23bbaf8b12b4ed/v2ex_spider/__init__.py -------------------------------------------------------------------------------- /v2ex_spider/base_spider.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 12, 2017 3 | 4 | @author: yingziwu 5 | ''' 6 | import requests 7 | import time 8 | import logging 9 | 10 | from v2ex_base.v2_sql import SQL 11 | import settings 12 | 13 | class spider(object): 14 | ''' 15 | A base Spider for v2ex. 16 | ''' 17 | 18 | 19 | def __init__(self,url,sleep_time): 20 | ''' 21 | >>>from v2ex_spider import base_spider 22 | >>>base_spider.start(url,sleep_time) 23 | ''' 24 | logging.info('Start base spider. 
Url is %s' % url) 25 | self.url=url 26 | self.sleep_time=sleep_time 27 | time.sleep(int(self.sleep_time)) 28 | self.SQ=SQL() 29 | self.SQ.open_datebase() 30 | #run 31 | self.load_config() 32 | self.spider() 33 | #end 34 | self.SQ.close_datebase() 35 | logging.info('Spider Finished.') 36 | 37 | def spider(self): 38 | logging.debug('start spider.') 39 | try: 40 | resp=self.s.get(self.url, timeout=10) 41 | except requests.exceptions.RequestException as e: 42 | logging.error('spider failed.') 43 | logging.error('proxy_status: %s' % settings.proxy_enable) 44 | if settings.proxy_enable is True: 45 | logging.error('proxy: %s' % self.s.proxies) 46 | logging.error(e) 47 | raise e 48 | if resp.status_code != 200: 49 | self.SQ.close_datebase() 50 | error_info='proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies)) 51 | logging.error('API Error: proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies))) 52 | raise APIError(error_info) 53 | topics=resp.json() 54 | for topic in topics: 55 | t_id=topic["id"] 56 | title=topic["title"] 57 | author=topic["member"]["username"] 58 | author_id=topic["member"]["id"] 59 | content=topic["content"] 60 | content_rendered=topic["content_rendered"] 61 | replies=topic["replies"] 62 | node=topic["node"]["id"] 63 | created=topic["created"] 64 | n_time=int(time.time()) 65 | self.SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time) 66 | self.SQ.conn.commit() 67 | return 68 | 69 | def load_config(self): 70 | logging.debug('start load_config') 71 | self.proxy_enable=settings.proxy_enable 72 | self.s=requests.session() 73 | self.s.headers=settings.API_headers 74 | if self.proxy_enable: 75 | self.s.proxies=settings.proxies() 76 | return 77 | 78 | class APIError(ValueError): 79 | pass 80 | -------------------------------------------------------------------------------- /v2ex_spider/node_spider.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 9, 2017 3 | 4 | @author: yingziwu 5 | ''' 6 | import logging 7 | 8 | from v2ex_spider import base_spider 9 | import settings 10 | 11 | def start(node_id,sleep_time): 12 | logging.info('Start node spider. Node id is %d.' % int(node_id)) 13 | url='https://www.v2ex.com/api/topics/show.json?node_id=%s' % str(node_id) 14 | base_spider.spider(url,sleep_time) 15 | return 16 | 17 | if __name__ == '__main__': 18 | start(12,5) 19 | print('Finish!') -------------------------------------------------------------------------------- /v2ex_spider/rss_spider.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 9, 2017 3 | 4 | @author: yingziwu 5 | ''' 6 | import feedparser 7 | import time 8 | import re 9 | import requests 10 | from redis import Redis 11 | from rq import Queue 12 | import json 13 | import os 14 | import logging 15 | 16 | from v2ex_spider import topic_spider 17 | from v2ex_base.v2_sql import SQL 18 | import settings 19 | 20 | 21 | class Rss_spider(object): 22 | ''' 23 | A Spider for v2ex's Rss. 24 | Get the latest and hot topic on the index. 25 | Using the rss generate the topic list that need to spider. 
26 | ''' 27 | 28 | 29 | def __init__(self): 30 | ''' 31 | >>>from v2ex_spider import rss_spider 32 | >>>rss_spider.Rss_spider() 33 | ''' 34 | logging.info('start Rss spider') 35 | self.v2ex_rss_url_list=['https://www.v2ex.com/index.xml', 36 | 'https://www.v2ex.com/feed/tab/qna.xml', 37 | 'https://www.v2ex.com/feed/tab/jobs.xml', 38 | 'https://www.v2ex.com/feed/tab/deals.xml', 39 | 'https://www.v2ex.com/feed/tab/city.xml', 40 | 'https://www.v2ex.com/feed/tab/play.xml', 41 | 'https://www.v2ex.com/feed/tab/apple.xml', 42 | 'https://www.v2ex.com/feed/tab/creative.xml', 43 | 'https://www.v2ex.com/feed/tab/tech.xml'] 44 | self.latest_hot_api=['https://www.v2ex.com/api/topics/latest.json','https://www.v2ex.com/api/topics/hot.json'] 45 | self.topic_sleep_time=10 46 | logging.debug('open sql database') 47 | self.SQ=SQL() 48 | self.SQ.open_datebase() 49 | self.redis_conn=Redis() 50 | self.load_config() 51 | #run 52 | try: 53 | self.latest_and_hot() 54 | except APIError as e: 55 | pass 56 | self.gen_topic_queue() 57 | #end 58 | self.SQ.close_datebase() 59 | logging.info('end the Rss spider') 60 | 61 | def topics_id_rss(self): 62 | logging.debug('fetch rss feeds') 63 | topic_ids=list() 64 | for v2ex_rss_url in self.v2ex_rss_url_list: 65 | feed=feedparser.parse(v2ex_rss_url) 66 | logging.debug('fetch rss feed: %s' % v2ex_rss_url) 67 | items=feed["items"] 68 | for item in items: 69 | author=item["author"] 70 | title=item["title"] 71 | link=item["link"] 72 | published=item[ "date" ] 73 | summary=item["summary"] 74 | topic_id=int(re.findall(r't\/(\d+)#?', link)[0]) 75 | topic_ids.append(topic_id) 76 | topic_ids=set(topic_ids) 77 | return topic_ids 78 | 79 | def topics_id_sqlite(self): 80 | logging.debug('SELECT ID FROM TOPIC') 81 | sql='SELECT ID FROM TOPIC;' 82 | self.SQ.cursor.execute(sql) 83 | topics_ids=[x[0] for x in self.SQ.cursor.fetchall()] 84 | return topics_ids 85 | 86 | def latest_and_hot(self): 87 | logging.debug('start latest_and_hot') 88 | for url in self.latest_hot_api: 89 | try: 90 | resp=self.s.get(url, timeout=10) 91 | except requests.exceptions.RequestException as e: 92 | logging.error('latest_and_hot error') 93 | logging.error('proxy_status: %s' % self.proxy_enable) 94 | if self.proxy_enable is True: 95 | logging.error('proxy: %s' % self.s.proxies) 96 | logging.error(e) 97 | raise e 98 | if resp.status_code != 200: 99 | logging.error('latest_and_hot error') 100 | logging.error('proxy_status: %s' % self.proxy_enable) 101 | if self.proxy_enable is True: 102 | logging.error('proxy: %s' % self.s.proxies) 103 | logging.error(APIError('latest_and_hot')) 104 | raise APIError('latest_and_hot') 105 | topics=resp.json() 106 | for topic in topics: 107 | t_id=topic["id"] 108 | title=topic["title"] 109 | author=topic["member"]["username"] 110 | author_id=topic["member"]["id"] 111 | content=topic["content"] 112 | content_rendered=topic["content_rendered"] 113 | replies=topic["replies"] 114 | node=topic["node"]["id"] 115 | created=topic["created"] 116 | n_time=int(time.time()) 117 | self.SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time) 118 | self.SQ.conn.commit() 119 | return 120 | 121 | def gen_topic_queue(self): 122 | logging.debug('start topic enqueue') 123 | topics_sql=self.topics_id_sqlite() 124 | if len(topics_sql) <= 2000: 125 | return 126 | topics_rss=self.topics_id_rss() 127 | # load topics 128 | if os.path.exists('.topics_all.json'): 129 | with open('.topics_all.json','r') as f: 130 | tmp_topics=json.load(f) 131 | else: 132 | 
tmp_topics=list() 133 | t_queue=Queue('topic',connection=self.redis_conn) 134 | # gen queue 135 | for topic in topics_rss: 136 | if topic not in topics_sql and topic not in tmp_topics: 137 | topic_id=int(topic) 138 | t_queue.enqueue(topic_spider.start,topic_id, self.topic_sleep_time) 139 | #save topics 140 | topics_all=list() 141 | topics_all.extend(tmp_topics) 142 | topics_all.extend(topics_rss) 143 | topics_all.extend(topics_sql) 144 | topics_all=list(set(topics_all)) 145 | with open('.topics_all.json','w') as f: 146 | json.dump(topics_all, f) 147 | return 148 | 149 | def load_config(self): 150 | logging.debug('load config') 151 | self.proxy_enable=settings.i_proxy_enable 152 | self.s=requests.session() 153 | self.s.headers=settings.API_headers 154 | if self.proxy_enable: 155 | self.s.proxies=settings.i_proxies() 156 | 157 | class APIError(ValueError): 158 | pass 159 | 160 | if __name__ == '__main__': 161 | Rss_spider() 162 | print('Finish!') -------------------------------------------------------------------------------- /v2ex_spider/topic_spider.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 10, 2017 3 | 4 | @author: yingziwu 5 | ''' 6 | import logging 7 | 8 | from v2ex_spider import base_spider 9 | import settings 10 | 11 | def start(topic_id, sleep_time): 12 | logging.info('Start topic spider. Topic id is %d.' % int(topic_id)) 13 | url='https://www.v2ex.com/api/topics/show.json?id=%s' % str(topic_id) 14 | base_spider.spider(url,sleep_time) 15 | return 16 | 17 | if __name__ == '__main__': 18 | start(1,5) 19 | print('Finish!') -------------------------------------------------------------------------------- /v2ex_tester/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yingziwu/v2ex_delete/8f1befcd43bd83c9531bf7180e23bbaf8b12b4ed/v2ex_tester/__init__.py -------------------------------------------------------------------------------- /v2ex_tester/topic_tester.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 9, 2017 3 | 4 | @author: yingziwu 5 | ''' 6 | import requests 7 | import json 8 | from lxml import etree 9 | import time 10 | import re 11 | import logging 12 | 13 | from v2ex_base.v2_sql import SQL 14 | import settings 15 | 16 | class tester(object): 17 | ''' 18 | The tester for v2ex topics. 
19 | ''' 20 | 21 | 22 | def __init__(self): 23 | ''' 24 | >>>from v2ex_tester import topic_tester 25 | >>>topic_tester(topic_id,sleep_time) 26 | ''' 27 | logging.debug('init class tester') 28 | self.s=requests.session() 29 | if settings.proxy_enable is True: 30 | self.s.proxies=settings.proxies() 31 | self.s.headers=settings.WEB_headers 32 | self.log_status=False 33 | 34 | def init_database(self): 35 | logging.debug('init database') 36 | self.SQ=SQL() 37 | self.SQ.open_datebase() 38 | 39 | def log_in(self): 40 | logging.debug('log in account') 41 | with open('.cookies.json','r') as f: 42 | cookies=requests.utils.cookiejar_from_dict(json.load(f)) 43 | self.s.cookies=cookies 44 | self.s.headers=settings.WEB_headers 45 | self.log_status=True 46 | return 47 | 48 | def web_test(self,t_id,status): 49 | logging.debug('Start web_test') 50 | url='https://www.v2ex.com/t/%s' % str(t_id) 51 | n_time=int(time.time()) 52 | try: 53 | resp=self.s.get(url, timeout=10) 54 | except requests.exceptions.RequestException as e: 55 | logging.error('web_test failed.') 56 | logging.error('proxy_status: %s' % settings.proxy_enable) 57 | if settings.proxy_enable is True: 58 | logging.error('proxy: %s' % self.s.proxies) 59 | logging.error(e) 60 | raise e 61 | if resp.status_code == 403: 62 | error_info='proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies)) 63 | logging.error('API Error: proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies))) 64 | raise APIError(error_info) 65 | if resp.status_code == 404 and '404 Topic Not Found' in resp.text : 66 | return {'T_ID':int(t_id),'NODE':None,'STATUS':3,'TIME':n_time} 67 | if resp.url == 'https://www.v2ex.com/': 68 | return self.api_test(t_id, status=2) 69 | if 'signin' in resp.url and self.log_status is False: 70 | # self.log_in() 71 | # return self.web_test(t_id, status=1) 72 | return self.api_test(t_id, status=1) 73 | tree=etree.HTML(resp.text) 74 | node_name=re.findall(r'\/go\/(\w+)', tree.xpath('//div[@class="header"]/a[2]/@href')[0])[0] 75 | self.SQ.cursor.execute("SELECT ID FROM NODES WHERE name == '%s';" % node_name) 76 | node_id=self.SQ.cursor.fetchone()[0] 77 | return {'T_ID':int(t_id),'NODE':node_id,'STATUS':status,'TIME':n_time} 78 | 79 | def api_test(self,t_id,status): 80 | logging.debug('Start api_test') 81 | self.s_a=requests.session() 82 | if settings.proxy_enable is True: 83 | self.s_a.proxies=settings.proxies() 84 | self.s_a.headers=settings.API_headers 85 | url='https://www.v2ex.com/api/topics/show.json?id=%s' % str(t_id) 86 | n_time=int(time.time()) 87 | try: 88 | resp=self.s_a.get(url, timeout=10) 89 | except requests.exceptions.RequestException as e: 90 | logging.error('api_test failed.') 91 | logging.error('proxy_status: %s' % settings.proxy_enable) 92 | if settings.proxy_enable is True: 93 | logging.error('proxy: %s' % self.s.proxies) 94 | logging.error(e) 95 | raise e 96 | if resp.status_code != 200: 97 | error_info='proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies)) 98 | logging.error('API Error: proxy status: %s, proxy: %s' % (str(settings.proxy_enable),str(self.s.proxies))) 99 | raise APIError(error_info) 100 | if len(resp.json()) == 0: 101 | return {'T_ID':int(t_id),'NODE':None,'STATUS':3,'TIME':n_time} 102 | topic=resp.json()[0] 103 | node_id=topic["node"]["id"] 104 | return {'T_ID':int(t_id),'NODE':node_id,'STATUS':status,'TIME':n_time} 105 | 106 | def write_to_sql(self,T_ID, NODE, STATUS, TIME): 107 | self.SQ.write_to_db_status(T_ID, NODE, STATUS, TIME) 108 | 
return 109 | 110 | class APIError(ValueError): 111 | pass 112 | 113 | def start(t_id,sleep_time): 114 | logging.info('Start topic test. Topic id is %d.' % int(t_id)) 115 | time.sleep(sleep_time) 116 | t=tester() 117 | t.init_database() 118 | result=t.web_test(t_id, 0) 119 | t.write_to_sql(result['T_ID'],result['NODE'],result['STATUS'],result['TIME']) 120 | t.SQ.close_datebase() 121 | if result['NODE'] is not None: 122 | logging.info('Topic test finish. Topic id is %d, results is : node id %d, status %d' % (int(t_id),result['NODE'],result['STATUS'])) 123 | else: 124 | logging.info('Topic test finish. Topic id is %d, results is : node id is None, status %d' % (int(t_id),result['STATUS'])) 125 | return 126 | 127 | if __name__ == '__main__': 128 | # start(1,5) 129 | start(375807,5) 130 | print('finish!') --------------------------------------------------------------------------------
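A minimal end-to-end smoke test of the whole pipeline, assuming Redis is installed and `settings.py` has been created as described in the README:

```
$ redis-server --daemonize yes    # make sure Redis is running
$ ./Start_rqworker.sh             # start the rq workers
$ python3 run.py                  # one scheduling pass (what Run.sh runs from cron)
$ rqinfo                          # inspect queue and worker status
```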