├── .gitignore
├── config.ini
├── readme.md
├── dao_sql.py
└── v2ex.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /__pycache__
2 | /.idea
3 | myconfig.ini


--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
1 | [mysql]
2 | host=
3 | port=
4 | user=
5 | password=
6 | db_name=


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # V2er 
 2 | 
 3 | 留住真实，记录语言。
 4 | 
 5 | ---
 6 | 
 7 | 
 8 | 
 9 | * 对指定人回复进行收集
10 | 
11 | * 帖子进入水深火热对水深火热进行遍历
12 | 
13 | * SQL表
14 | 
15 | 
16 | | ID   | 作者    | 发表时间   | 言论 |    言论MD5     | url      |
17 | | ---- | ------- | ---------- | ---- | :------------: | -------- |
18 | | 1    | Lunatic | 2020/12/12 | test | asd31dfsf43524 | v2ex.com |
19 | 
20 | ​	
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/dao_sql.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from configparser import ConfigParser
  3 | import pymysql
  4 | 
  5 | config = ConfigParser()
  6 | config.read("config.ini")
  7 | 
  8 | 
  9 | host = config.get("mysql","host")
 10 | port = int(config.get("mysql","port"))
 11 | user = config.get("mysql","user")
 12 | passwd = config.get("mysql","password")
 13 | db = config.get("mysql","db_name")
 14 | class sqlDeviceMpa:
 15 |     # 获取数据库命令对象
 16 |     connect = None
 17 |     cursor = None
 18 | 
 19 |     def __init__(self):
 20 |         self.openDatabase()
 21 | 
 22 |     def test_conn(self):
 23 |         try:
 24 |             self.connect.ping()
 25 |         except:
 26 |             global connect, cursor
 27 |             # 打开数据库连接
 28 |             self.connect = pymysql.Connect(
 29 |                 host=host,
 30 |                 port=port,
 31 |                 user=user,
 32 |                 passwd=passwd,
 33 |                 db=db,
 34 |                 charset='utf8'
 35 |             )
 36 |             # 获取游标
 37 |             self.cursor = self.connect.cursor()
 38 | 
 39 |     def openDatabase(self):
 40 |         global connect, cursor
 41 |         # 打开数据库连接
 42 |         self.connect = pymysql.Connect(
 43 |             host=host,
 44 |             port=port,
 45 |             user=user,
 46 |             passwd=passwd,
 47 |             db=db,
 48 |             charset='utf8'
 49 |         )
 50 |         # 获取游标
 51 |         self.cursor = self.connect.cursor()
 52 | 
 53 |     # 检查设备ID
 54 |     def checkData(self, code, deviceid):
 55 |         self.test_conn()
 56 |         sql = "SELECT * FROM  device_map WHERE code='{}'".format(code)
 57 |         self.cursor.execute(sql)
 58 |         sqlRes = self.cursor.fetchone()
 59 |         if sqlRes == None:
 60 |             return False
 61 |         sqlCode = sqlRes[0]
 62 |         sqlDeviceId = sqlRes[1]
 63 |         if deviceid == "" or deviceid == None:
 64 |             return False
 65 |         if sqlDeviceId == None or sqlDeviceId == "":
 66 |             self.updateData(code,deviceid)
 67 |             return True
 68 |         return check_device.check_id_device(sqlDeviceId,deviceid)
 69 | 
 70 |     # 如果查询出来的设备ID为空则更新设备ID
 71 |     def updateData(self,code,deviceid):
 72 |         self.test_conn()
 73 |         sql = " UPDATE device_map SET deviceid = '{}' WHERE code = '{}' ".format(deviceid,code)
 74 |         self.cursor.execute(sql)
 75 |         self.connect.commit()
 76 | 
 77 |     def insertCode(self,listCode):
 78 |         self.test_conn()
 79 |         if listCode == None:
 80 |             return False
 81 |         sql = "INSERT INTO device_map (code) VALUES ('{}')"
 82 |         for i in listCode:
 83 |             self.cursor.execute(sql.format(i))
 84 |             self.connect.commit()
 85 |         return True
 86 | 
 87 |     def getNotUsedCode(self):
 88 |         self.test_conn()
 89 |         sql = "SELECT * FROM  device_map WHERE deviceid = '' or deviceid is null "
 90 |         self.cursor.execute(sql)
 91 |         sqlRes = self.cursor.fetchall()
 92 |         listCode = []
 93 |         for i in sqlRes:
 94 |             listCode.append(i[0])
 95 |         return listCode
 96 | 
 97 |     def getUsedCode(self):
 98 |         self.test_conn()
 99 |         sql = "SELECT * FROM  device_map WHERE deviceid !='' or deviceid is not null and LENGTH(trim(deviceid))>0"
100 |         self.cursor.execute(sql)
101 |         sqlRes = self.cursor.fetchall()
102 |         return sqlRes


--------------------------------------------------------------------------------
/v2ex.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import datetime
  3 | import hashlib
  4 | import requests
  5 | from bs4 import BeautifulSoup
  6 | import re
  7 | 
  8 | 
  9 | heard = {
 10 |     "authority": "www.v2ex.com",
 11 |     "method": "GET",
 12 |     "path": "/go/flamewar?p=1",
 13 |     "scheme": "https",
 14 |     "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
 15 |     "accept-encoding": "gzip, deflate, br",
 16 |     "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
 17 |     "cache-control": "max-age=0",
 18 |     "cookie": '_ga=GA1.2.39129319.1594622347; __gads=ID=043d6a87b305c2fa:T=1594622347:S=ALNI_MYxvZs8jISFiRaeb5xvTo3mUiGRZA; PB3_SESSION="2|1:0|10:1597026505|11:PB3_SESSION|36:djJleDoxMTkuMjguMTAuMTY3OjI0NzMxMjE1|a5b5feed68c8856fe23a22a10eff515d675860025125f4f901aa50dd5c8afd85"; _gid=GA1.2.1161767240.1597026508; __cfduid=d179e866f124e7b7d354b82808a770c311597214357; A2="2|1:0|10:1597227977|2:A2|48:ZDVjOGM5YTItOGQ1MC00YzFhLWE3NzAtYTgyNmNiNzM1YzQ2|f576fd971ed7f25522b6d1e13111422dd7ffcb3b362d9350bc812a727d70a766"; V2EX_LANG=zhcn; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; V2EX_REFERRER="2|1:0|10:1597374396|13:V2EX_REFERRER|12:Y29vbGt1MTIz|d1960c82f06faba562be315853f950c762a9cdaf5a7b00a823d64c24ea07102a"; V2EX_TAB="2|1:0|10:1597396058|8:V2EX_TAB|12:Y3JlYXRpdmU=|e18c67474904672366de8a4537a5faf6544d1b2a4c1a68cb8ba4c983697ec1d4"',
 19 |     "sec-fetch-dest": "document",
 20 |     "sec-fetch-mode": "navigate",
 21 |     "sec-fetch-site": "none",
 22 |     "sec-fetch-user": "?1",
 23 |     "upgrade-insecure-requests": "1",
 24 |     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
 25 | }
 26 | 
 27 | 
 28 | urllist  = ['https://www.v2ex.com/go/flamewar','https://www.v2ex.com/go/chamber','https://www.v2ex.com/?tab=hot']
 29 | listSubjecId = []
 30 | # 获取每个节点的主题url以及页数
 31 | def getSubjectUrl(url):
 32 |     req = requests.get(url=url + '?p=1',headers = heard)
 33 |     # print(req.text)
 34 |     soup = BeautifulSoup(req.text,"lxml")
 35 |     a = soup.select("#TopicsNode > div")
 36 |     #print(len(a))
 37 | 
 38 |     for i in a :
 39 |         pattern = re.compile('topic-link" href="(.*?)">', re.S)
 40 |         items = re.findall(pattern, str(i))
 41 |         timeK = True
 42 |         pattern = re.compile('</a></strong>  •  (.*?)前', re.S)
 43 |         items1 = re.findall(pattern, str(i))
 44 | 
 45 |         pattern = re.compile('.*<a class=".*?" href=.*">(.*?)</a>', re.S)
 46 |         items2 = re.findall(pattern, str(i))
 47 |         # 记录页数
 48 |         page = 0
 49 |         try:
 50 |             page = int(items2[0])/100
 51 |         except Exception:
 52 |             continue
 53 |         a = str(page).split('.')[1]
 54 |         page = int(page)
 55 |         if int(a) > 0:
 56 |             page = int(page) + 1
 57 |         else:
 58 |             page = int(page) + 0
 59 |         # 这里对回复时间进行判断如果超过循环的时间，不对该贴进行爬虫
 60 |         link = (str(items[0]).split("#")[0], page)
 61 |         if (timeK):
 62 |             listSubjecId.append(link)
 63 |         else:
 64 |             items1Str = str(items1[0])
 65 |             if (not("天" in items1Str or "小时" in items1Str)):
 66 |                 listSubjecId.append(link)
 67 | 
 68 | 
 69 | 
 70 | # 获取单个帖子的回复
 71 | def getSubject(tiezi):
 72 |     for i in range(1,int(tiezi[1])+1):
 73 |         url = 'https://www.v2ex.com' + tiezi[0]+"?p="+str(i)
 74 |         req = requests.get(url=url,headers = heard)
 75 |         soup = BeautifulSoup(req.text,"lxml")
 76 |         a = soup.select("#Main > div:nth-child(4) > div.cell")
 77 |         # print(a)
 78 |         for ai in a:
 79 | 
 80 |             soup1 = BeautifulSoup(str(ai), "lxml")
 81 |             a1 = soup1.select("div.reply_content")
 82 |             try:
 83 |                 # 获取回复
 84 |                 # 进行MD5
 85 |                 print(a1[0].get_text())
 86 |             except Exception as e:
 87 |                 continue
 88 |             pattern = re.compile('<a class="dark".*">(.*?)</a>.*ago">(.*?)前', re.S)
 89 |             items = re.findall(pattern, str(ai))
 90 |             # 获取回复的时间与作者
 91 |             #print(items)
 92 | 
 93 |             #print(items[0][0] + "   "+items[0][1])
 94 |             day = 0
 95 |             if "天" in items[0][1]:
 96 |                 number = re.sub("\D", "", items[0][1])
 97 |                 day = replyTimeCompute(int(number))
 98 |             else:
 99 |                 day = replyTimeCompute(0)
100 |             print(items[0][0] + "    "+ str(day) + "  " + tiezi[0])
101 |             print("===========================")
102 | 
103 | 
104 | 
105 | 
106 | 
def getReplyList(user):
    """Fetch the first replies page of ``user`` and return the parsed replies.

    Args:
        user: Member name used to build ``/member/<user>/replies``.

    Returns:
        A ``(user, ReplyList)`` tuple. Each ``ReplyList`` entry is
        ``(date, reply text, md5 of the reply text, reply link)`` where
        ``date`` comes from ``replyTimeCompute`` and the link is the href
        found in the reply's info area, e.g. ``/t/697703#reply47``.
        Replies whose info area does not match the expected markup are
        skipped.
    """
    url = "https://www.v2ex.com/member/" + user + "/replies"
    req = requests.get(url=url)
    soup = BeautifulSoup(req.text, "lxml")

    # Reply bodies and their info areas are separate elements inside the
    # boxes under #Main; they are paired up by document order.
    replies = soup.select("#Main > div.box div.reply_content")
    replyInfos = soup.select("#Main > div.box div.dock_area")

    # Captures (age text, reply link), e.g. ('1 天前', '/t/697703#reply47').
    infoRe = re.compile('"fade">(.*?)</span></div>.*a href="(.*?)">', re.S)

    ReplyList = []
    for content, info in zip(replies, replyInfos):
        found = infoRe.findall(str(info))
        if not found:
            continue
        age, link = found[0]

        text = content.get_text()
        # Hash the whole reply text (previously only its first character
        # was hashed, which also failed on empty replies).
        md5 = replyContentMD5(text)

        # Only ages expressed in days ("N 天前") shift the date; anything
        # else is treated as today.
        daysAgo = 0
        if "天" in age:
            digits = re.sub(r"\D", "", age)
            if digits:
                daysAgo = int(digits)
        day = replyTimeCompute(daysAgo)

        ReplyList.append((day, text, md5, link))
    return user, ReplyList
133 | 
134 | 
def replyTimeCompute(day):
    """Return the date ``day`` days before today as a ``YYYY-MM-DD`` string."""
    today = datetime.date.today()
    # Work from midnight of today as a datetime so the subtraction behaves
    # the same as parsing today's date string back into a datetime.
    midnight = datetime.datetime(today.year, today.month, today.day)
    target = midnight - datetime.timedelta(days=day)
    return target.strftime("%Y-%m-%d")
140 | 
def replyContentMD5(reply):
    """Return the hexadecimal MD5 digest of ``reply`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(reply.encode("utf-8"))
    return digest.hexdigest()
143 | 
144 | 
145 | # if __name__ == '__main__':
146 | #     for i in urllist:
147 | #         getSubjectUrl(i)
148 | #     for i in listSubjecId:
149 | #         getSubject(i)
150 | 
151 | 
152 | '''
153 | 用户名单 以及 节点名单均在数据库中存放方便及时添加用户或节点
154 | '''


--------------------------------------------------------------------------------