├── .gitignore
├── config.ini
├── readme.md
├── dao_sql.py
└── v2ex.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /__pycache__
2 | /.idea
3 | myconfig.ini
--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
1 | [mysql]
2 | host=
3 | port=
4 | user=
5 | password=
6 | db_name=
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # V2er
2 |
3 | 留住真实,记录语言。
4 |
5 | ---
6 |
7 |
8 |
9 | * 对指定人回复进行收集
10 |
11 | * 对进入「水深火热」节点的帖子进行遍历
12 |
13 | * SQL表
14 |
15 |
16 | | ID | 作者 | 发表时间 | 言论 | 言论MD5 | url |
17 | | ---- | ------- | ---------- | ---- | :------------: | -------- |
18 | | 1 | Lunatic | 2020/12/12 | test | asd31dfsf43524 | v2ex.com |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/dao_sql.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from configparser import ConfigParser
3 | import pymysql
4 |
# Read MySQL settings from config.ini in the working directory
# (per .gitignore, a private myconfig.ini is kept out of version control).
config = ConfigParser()
config.read("config.ini")


# Connection parameters, loaded once at import time.
# NOTE(review): if config.ini is missing or empty, config.get() raises
# NoSectionError and int() fails on an empty port string -- confirm
# deployments always ship a populated config.ini.
host = config.get("mysql","host")
port = int(config.get("mysql","port"))
user = config.get("mysql","user")
passwd = config.get("mysql","password")
db = config.get("mysql","db_name")
class sqlDeviceMpa:
    """DAO for the ``device_map`` table: maps activation codes to device ids.

    All queries now use pymysql parameter binding (``%s`` placeholders)
    instead of ``str.format``, closing the SQL-injection holes.
    """

    # Shared connection/cursor handles; created in __init__ via openDatabase().
    connect = None
    cursor = None

    def __init__(self):
        # Open the connection up front so every method can assume it exists.
        self.openDatabase()

    def test_conn(self):
        """Ping the server and transparently reconnect if the link dropped."""
        try:
            self.connect.ping()
        except Exception:
            # Connection lost (e.g. MySQL wait_timeout) -- rebuild it.
            # (Was an inline duplicate of openDatabase(); now reuses it.
            # The old bare ``except:`` is narrowed to ``Exception``.)
            self.openDatabase()

    def openDatabase(self):
        """Open the database connection and cursor from config.ini settings."""
        self.connect = pymysql.Connect(
            host=host,
            port=port,
            user=user,
            passwd=passwd,
            db=db,
            charset='utf8'
        )
        self.cursor = self.connect.cursor()

    # Check a device id against the one stored for this code.
    def checkData(self, code, deviceid):
        """Return True when ``deviceid`` is acceptable for ``code``.

        False when the code is unknown or ``deviceid`` is empty; a code
        with no stored device id gets bound to this one on first use.
        """
        self.test_conn()
        # Parameterized query: pymysql escapes the value (was str.format).
        sql = "SELECT * FROM device_map WHERE code=%s"
        self.cursor.execute(sql, (code,))
        sqlRes = self.cursor.fetchone()
        if sqlRes is None:
            return False
        sqlDeviceId = sqlRes[1]
        if deviceid is None or deviceid == "":
            return False
        if sqlDeviceId is None or sqlDeviceId == "":
            # First use of this code: bind the device id to it.
            self.updateData(code, deviceid)
            return True
        # NOTE(review): ``check_device`` is never imported in this file, so
        # this call raises NameError as written -- confirm the missing import.
        return check_device.check_id_device(sqlDeviceId, deviceid)

    # If the stored device id is empty, record the caller's device id.
    def updateData(self, code, deviceid):
        """Persist ``deviceid`` for ``code``."""
        self.test_conn()
        sql = "UPDATE device_map SET deviceid = %s WHERE code = %s"
        self.cursor.execute(sql, (deviceid, code))
        self.connect.commit()

    def insertCode(self, listCode):
        """Insert every code in ``listCode``; return False for None input."""
        self.test_conn()
        if listCode is None:
            return False
        sql = "INSERT INTO device_map (code) VALUES (%s)"
        # executemany batches the round trips; single commit as before.
        self.cursor.executemany(sql, [(code,) for code in listCode])
        self.connect.commit()
        return True

    def getNotUsedCode(self):
        """Return the codes whose device id is still empty or NULL."""
        self.test_conn()
        sql = "SELECT * FROM device_map WHERE deviceid = '' or deviceid is null "
        self.cursor.execute(sql)
        return [row[0] for row in self.cursor.fetchall()]

    def getUsedCode(self):
        """Return every row whose device id has been assigned.

        The original WHERE clause mixed OR/AND without parentheses; by SQL
        precedence it reduced to ``deviceid != ''`` (which already excludes
        NULL), so this explicit form selects the same rows.
        """
        self.test_conn()
        sql = "SELECT * FROM device_map WHERE deviceid IS NOT NULL AND deviceid != ''"
        self.cursor.execute(sql)
        return self.cursor.fetchall()
--------------------------------------------------------------------------------
/v2ex.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import hashlib
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import re
7 |
8 |
# Hard-coded browser headers sent with every v2ex.com request.
# NOTE(review): the cookie below is a captured personal session -- it will
# expire, and committing it to source control leaks the account session;
# consider moving it into config.ini like the MySQL credentials.
heard = {
    "authority": "www.v2ex.com",
    "method": "GET",
    "path": "/go/flamewar?p=1",
    "scheme": "https",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    "cache-control": "max-age=0",
    "cookie": '_ga=GA1.2.39129319.1594622347; __gads=ID=043d6a87b305c2fa:T=1594622347:S=ALNI_MYxvZs8jISFiRaeb5xvTo3mUiGRZA; PB3_SESSION="2|1:0|10:1597026505|11:PB3_SESSION|36:djJleDoxMTkuMjguMTAuMTY3OjI0NzMxMjE1|a5b5feed68c8856fe23a22a10eff515d675860025125f4f901aa50dd5c8afd85"; _gid=GA1.2.1161767240.1597026508; __cfduid=d179e866f124e7b7d354b82808a770c311597214357; A2="2|1:0|10:1597227977|2:A2|48:ZDVjOGM5YTItOGQ1MC00YzFhLWE3NzAtYTgyNmNiNzM1YzQ2|f576fd971ed7f25522b6d1e13111422dd7ffcb3b362d9350bc812a727d70a766"; V2EX_LANG=zhcn; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; V2EX_REFERRER="2|1:0|10:1597374396|13:V2EX_REFERRER|12:Y29vbGt1MTIz|d1960c82f06faba562be315853f950c762a9cdaf5a7b00a823d64c24ea07102a"; V2EX_TAB="2|1:0|10:1597396058|8:V2EX_TAB|12:Y3JlYXRpdmU=|e18c67474904672366de8a4537a5faf6544d1b2a4c1a68cb8ba4c983697ec1d4"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
}


# Nodes to crawl: the "flamewar" and "chamber" nodes plus the hot tab.
urllist = ['https://www.v2ex.com/go/flamewar','https://www.v2ex.com/go/chamber','https://www.v2ex.com/?tab=hot']
# Accumulator filled by getSubjectUrl(): (topic_path, page_count) tuples.
listSubjecId = []
30 | # 获取每个节点的主题url以及页数
# Collect each node's topic urls plus their reply page counts.
def getSubjectUrl(url):
    """Scrape page 1 of a node and append (topic_path, page_count) tuples
    to the module-level ``listSubjecId``.

    ``page_count`` is the number of 100-reply pages the topic spans.
    Returns nothing; results accumulate in ``listSubjecId``.
    """
    req = requests.get(url=url + '?p=1', headers=heard)
    soup = BeautifulSoup(req.text, "lxml")
    topics = soup.select("#TopicsNode > div")

    # Compile once instead of per topic (patterns are loop-invariant).
    linkPattern = re.compile('topic-link" href="(.*?)">', re.S)
    timePattern = re.compile(' • (.*?)前', re.S)
    # NOTE(review): '.*(.*?)' always captures an empty string, so int()
    # below raises and every topic is skipped -- the original HTML tags
    # appear to have been stripped from this pattern; confirm and restore.
    countPattern = re.compile('.*(.*?)', re.S)

    # True => collect every topic; the False branch (filter out topics whose
    # last reply was days/hours ago) is currently dead code.
    timeK = True

    for topic in topics:
        topicHtml = str(topic)
        links = linkPattern.findall(topicHtml)
        lastReplies = timePattern.findall(topicHtml)
        replyCounts = countPattern.findall(topicHtml)
        try:
            replyCount = int(replyCounts[0])
        except Exception:
            # No parsable reply count: skip this topic.
            continue
        # Ceiling division: 100 replies per page (replaces the old trick of
        # splitting the float's decimal digits out of str(page)).
        page = (replyCount + 99) // 100
        # Drop the "#replyN" anchor so the url is the bare topic path.
        link = (str(links[0]).split("#")[0], page)
        if (timeK):
            listSubjecId.append(link)
        else:
            lastReply = str(lastReplies[0])
            if (not("天" in lastReply or "小时" in lastReply)):
                listSubjecId.append(link)
67 |
68 |
69 |
70 | # 获取单个帖子的回复
# Fetch every reply of a single topic.
def getSubject(tiezi):
    """Print the content, author and (computed) date of each reply in a topic.

    ``tiezi`` is a (topic_path, page_count) tuple as produced by
    getSubjectUrl(); every page from 1 to page_count is fetched.
    """
    # Compiled once instead of inside the inner loop (loop-invariant).
    infoPattern = re.compile('(.*?).*ago">(.*?)前', re.S)
    for pageNo in range(1, int(tiezi[1]) + 1):
        url = 'https://www.v2ex.com' + tiezi[0] + "?p=" + str(pageNo)
        req = requests.get(url=url, headers=heard)
        soup = BeautifulSoup(req.text, "lxml")
        cells = soup.select("#Main > div:nth-child(4) > div.cell")
        for cell in cells:
            cellSoup = BeautifulSoup(str(cell), "lxml")
            contents = cellSoup.select("div.reply_content")
            try:
                # Reply body; cells without one (pager rows etc.) are skipped.
                # TODO: MD5 the content here (see replyContentMD5).
                print(contents[0].get_text())
            except Exception:
                continue
            # Author + relative time ("N 天前" etc.) of the reply.
            items = infoPattern.findall(str(cell))
            if "天" in items[0][1]:
                # Raw string fixes the invalid "\D" escape warning.
                number = re.sub(r"\D", "", items[0][1])
                day = replyTimeCompute(int(number))
            else:
                # Hours/minutes ago all count as today.
                day = replyTimeCompute(0)
            print(items[0][0] + " "+ str(day) + " " + tiezi[0])
            print("===========================")
102 |
103 |
104 |
105 |
106 |
107 | # 获取单个用户的回复
# Collect a single user's recent replies.
def getReplyList(user):
    """Return ``(user, replies)`` where each reply is a
    (date, content, md5, topic_url) tuple scraped from the user's
    /member/<user>/replies page.

    Bug fixed: the MD5 was computed over ``items1[0]`` -- only the FIRST
    CHARACTER of the reply -- instead of the whole text; it now hashes the
    full content, matching the 言论MD5 column described in the readme.
    """
    url = "https://www.v2ex.com/member/" + user + "/replies"
    # Consistency: send the same browser headers as the other requests.
    req = requests.get(url=url, headers=heard)
    soup = BeautifulSoup(req.text, "lxml")
    boxes = soup.select("#Main > div.box")
    boxSoup = BeautifulSoup(str(boxes), "lxml")
    reply = boxSoup.select("div.reply_content")   # reply bodies
    replyInfo = boxSoup.select("div.dock_area")   # relative time + topic link
    # Compiled once instead of per reply (loop-invariant).
    infoPattern = re.compile('"fade">(.*?).*a href="(.*?)">', re.S)
    ReplyList = []
    for i in range(len(reply)):
        # items[0] looks like ('1 天前', '/t/697703#reply47').
        items = infoPattern.findall(str(replyInfo[i]))
        content = reply[i].get_text()
        # Hash the WHOLE reply text (was content[0]: first character only).
        md5 = replyContentMD5(content)
        if "天" in items[0][0]:
            # Raw string fixes the invalid "\D" escape warning.
            number = re.sub(r"\D", "", items[0][0])
            day = replyTimeCompute(int(number))
        else:
            # Hours/minutes ago all count as today.
            day = replyTimeCompute(0)
        ReplyList.append((day, content, md5, items[0][1]))
    return user, ReplyList
133 |
134 |
def replyTimeCompute(day):
    """Return the date ``day`` days before today, formatted "YYYY-MM-DD".

    The original round-tripped today's date through str()/strptime before
    subtracting; plain date arithmetic yields the identical result.
    """
    replyDate = datetime.date.today() - datetime.timedelta(days=day)
    return replyDate.strftime("%Y-%m-%d")
140 |
def replyContentMD5(reply):
    """Return the hex MD5 digest of ``reply``, encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(reply.encode("utf-8"))
    return digest.hexdigest()
143 |
144 |
145 | # if __name__ == '__main__':
146 | # for i in urllist:
147 | # getSubjectUrl(i)
148 | # for i in listSubjecId:
149 | # getSubject(i)
150 |
151 |
152 | '''
153 | 用户名单 以及 节点名单均在数据库中存放方便及时添加用户或节点
154 | '''
--------------------------------------------------------------------------------