# -*- coding: UTF-8 -*-
# ---------------------------------------------------------------------------
# The original span interleaved the non-Python files of a repository dump with
# dao_sql.py. Their content is preserved here as comments.
#
# Repository tree:
#   ├── .gitignore
#   ├── config.ini
#   ├── readme.md
#   ├── dao_sql.py
#   └── v2ex.py
#
# /.gitignore:
#   /__pycache__
#   /.idea
#   myconfig.ini
#
# /config.ini:
#   [mysql]
#   host=
#   port=
#   user=
#   password=
#   db_name=
#
# /readme.md:
#   # V2er
#   留住真实,记录语言。            (EN: Keep what is real; record what was said.)
#   ---
#   * 对指定人回复进行收集          (EN: Collect the replies of specified users.)
#   * 帖子进入水深火热对水深火热进行遍历
#                                   (EN: Once a topic lands in the flamewar node,
#                                    traverse that node.)
#   * SQL表                         (EN: SQL table)
#   | ID | 作者 | 发表时间 | 言论 | 言论MD5 | url |
#     (EN: ID | author | posted at | statement | statement MD5 | url)
#   | ---- | ------- | ---------- | ---- | :------------: | -------- |
#   | 1 | Lunatic | 2020/12/12 | test | asd31dfsf43524 | v2ex.com |
#
# /dao_sql.py:
# ---------------------------------------------------------------------------
from configparser import ConfigParser
import pymysql

# Connection settings are read once, at import time, from "config.ini"
# (a relative path, so it is resolved against the current working directory).
config = ConfigParser()
config.read("config.ini")


host = config.get("mysql", "host")
# int() raises ValueError while the port entry is blank, as it is in the
# config.ini template shown above.
port = int(config.get("mysql", "port"))
user = config.get("mysql", "user")
passwd = config.get("mysql", "password")
db = config.get("mysql", "db_name")


class sqlDeviceMpa:
    """Data-access helper for the ``device_map`` MySQL table.

    The table is read positionally: column 0 is treated as the code and
    column 1 as the device id bound to it (see ``checkData`` and
    ``getNotUsedCode``). A connection is opened on construction and every
    public method re-validates it before querying.

    All values are passed to the driver as query parameters rather than
    being formatted into the SQL text, so caller-supplied strings cannot
    alter the statement.
    """

    # Live pymysql connection and its cursor; both are set by openDatabase().
    connect = None
    cursor = None

    def __init__(self):
        """Open the database connection immediately."""
        self.openDatabase()

    def test_conn(self):
        """Make sure the connection is usable, reopening it if the ping fails."""
        try:
            self.connect.ping()
        except (AttributeError, OSError, pymysql.Error):
            # AttributeError covers the "never connected" case
            # (self.connect is still None); the others cover a dead link.
            self.openDatabase()

    def openDatabase(self):
        """Open a new connection from the module-level settings and create a cursor."""
        self.connect = pymysql.Connect(
            host=host,
            port=port,
            user=user,
            passwd=passwd,
            db=db,
            charset='utf8'
        )
        self.cursor = self.connect.cursor()

    def checkData(self, code, deviceid):
        """Check ``deviceid`` against the device bound to ``code``.

        Returns False when ``code`` has no row or ``deviceid`` is empty/None.
        When the row has no device bound yet, binds ``deviceid`` to it and
        returns True. Otherwise returns the result of comparing the stored
        device id with ``deviceid``.
        """
        self.test_conn()
        self.cursor.execute("SELECT * FROM device_map WHERE code=%s", (code,))
        row = self.cursor.fetchone()
        if row is None:
            return False
        stored_device_id = row[1]
        if deviceid is None or deviceid == "":
            return False
        if stored_device_id is None or stored_device_id == "":
            self.updateData(code, deviceid)
            return True
        # NOTE(review): `check_device` is neither imported nor defined in this
        # module, so reaching this line raises NameError. The intended
        # comparison helper must be imported before this path can work.
        return check_device.check_id_device(stored_device_id, deviceid)

    def updateData(self, code, deviceid):
        """Bind ``deviceid`` to the row identified by ``code`` and commit."""
        self.test_conn()
        self.cursor.execute(
            "UPDATE device_map SET deviceid = %s WHERE code = %s",
            (deviceid, code),
        )
        self.connect.commit()

    def insertCode(self, listCode):
        """Insert one row per code in ``listCode`` and commit once at the end.

        Returns False when ``listCode`` is None, True otherwise.
        """
        self.test_conn()
        if listCode is None:
            return False
        sql = "INSERT INTO device_map (code) VALUES (%s)"
        for code in listCode:
            self.cursor.execute(sql, (code,))
        self.connect.commit()
        return True

    def getNotUsedCode(self):
        """Return the codes (column 0) of rows whose device id is empty or NULL."""
        self.test_conn()
        sql = "SELECT * FROM device_map WHERE deviceid = '' or deviceid is null "
        self.cursor.execute(sql)
        return [row[0] for row in self.cursor.fetchall()]

    def getUsedCode(self):
        """Return the full rows that the query below selects as having a device id."""
        self.test_conn()
        sql = "SELECT * FROM device_map WHERE deviceid !='' or deviceid is not null and LENGTH(trim(deviceid))>0"
        self.cursor.execute(sql)
        return self.cursor.fetchall()

# (dump marker preserved from the original span) /v2ex.py:
# -*- coding: utf-8 -*-
import datetime
import hashlib
import requests
from bs4 import BeautifulSoup
import re


# Request headers sent with every page fetch in this module.
# NOTE(review): the "cookie" value embeds what look like live session tokens
# (PB3_SESSION, A2) committed to source -- consider rotating them and loading
# the cookie from untracked configuration instead.
heard = {
    "authority": "www.v2ex.com",
    "method": "GET",
    "path": "/go/flamewar?p=1",
    "scheme": "https",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    "cache-control": "max-age=0",
    "cookie": '_ga=GA1.2.39129319.1594622347; __gads=ID=043d6a87b305c2fa:T=1594622347:S=ALNI_MYxvZs8jISFiRaeb5xvTo3mUiGRZA; PB3_SESSION="2|1:0|10:1597026505|11:PB3_SESSION|36:djJleDoxMTkuMjguMTAuMTY3OjI0NzMxMjE1|a5b5feed68c8856fe23a22a10eff515d675860025125f4f901aa50dd5c8afd85"; _gid=GA1.2.1161767240.1597026508; __cfduid=d179e866f124e7b7d354b82808a770c311597214357; A2="2|1:0|10:1597227977|2:A2|48:ZDVjOGM5YTItOGQ1MC00YzFhLWE3NzAtYTgyNmNiNzM1YzQ2|f576fd971ed7f25522b6d1e13111422dd7ffcb3b362d9350bc812a727d70a766"; V2EX_LANG=zhcn; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; V2EX_REFERRER="2|1:0|10:1597374396|13:V2EX_REFERRER|12:Y29vbGt1MTIz|d1960c82f06faba562be315853f950c762a9cdaf5a7b00a823d64c24ea07102a"; V2EX_TAB="2|1:0|10:1597396058|8:V2EX_TAB|12:Y3JlYXRpdmU=|e18c67474904672366de8a4537a5faf6544d1b2a4c1a68cb8ba4c983697ec1d4"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
}


# Listing pages to crawl.
urllist = ['https://www.v2ex.com/go/flamewar','https://www.v2ex.com/go/chamber','https://www.v2ex.com/?tab=hot']
# Filled by getSubjectUrl() with (topic_path, page_count) tuples.
listSubjecId = []

# Patterns applied to the serialized HTML of one row/cell; compiled once.
_TOPIC_LINK_RE = re.compile('topic-link" href="(.*?)">', re.S)
# NOTE(review): the whitespace around the bullet may have been non-breaking
# spaces or entities in the upstream file -- confirm before relying on it.
_LAST_REPLY_AGE_RE = re.compile('  •  (.*?)前', re.S)
# NOTE(review): as written, the greedy ".*" consumes the whole input and the
# lazy group is always empty, so int() fails and every row is skipped.
# The surrounding markup was presumably lost -- restore it from upstream.
_REPLY_COUNT_RE = re.compile('.*(.*?)', re.S)
# NOTE(review): the first group is lazy and directly followed by ".*", so it
# always captures an empty string (the author comes out blank). A delimiter
# was presumably lost here as well.
_REPLY_META_RE = re.compile('(.*?).*ago">(.*?)前', re.S)

# Divisor used to turn a topic's reply count into a number of pages.
_REPLIES_PER_PAGE = 100


def getSubjectUrl(url):
    """Collect the topics on the first page of a listing URL.

    Fetches ``url`` with ``p=1`` and, for every row under ``#TopicsNode``,
    appends ``(topic_path, page_count)`` to the module-level
    ``listSubjecId``. ``page_count`` is the reply count divided by 100,
    rounded up. Rows without a parsable reply count or topic link are
    skipped. Returns None.
    """
    # `params` merges p=1 into any query string already on the URL; plain
    # concatenation yielded ".../?tab=hot?p=1" for the third urllist entry.
    req = requests.get(url=url, params={"p": 1}, headers=heard)
    soup = BeautifulSoup(req.text, "lxml")

    # While True, every topic is queued regardless of how long ago it was last
    # replied to; the age filter below only applies when this is False.
    timeK = True

    for row in soup.select("#TopicsNode > div"):
        html = str(row)

        counts = _REPLY_COUNT_RE.findall(html)
        try:
            reply_count = int(counts[0])
        except (IndexError, ValueError):
            # No reply counter in this row (or not a number): not a topic row.
            continue

        links = _TOPIC_LINK_RE.findall(html)
        if not links:
            continue

        # Ceiling division: 0 -> 0 pages, 1..100 -> 1, 101..200 -> 2, ...
        page = -(-reply_count // _REPLIES_PER_PAGE)
        link = (str(links[0]).split("#")[0], page)

        if timeK:
            listSubjecId.append(link)
            continue

        # Skip topics whose last-reply label mentions days ("天") or hours
        # ("小时").
        ages = _LAST_REPLY_AGE_RE.findall(html)
        if not ages:
            continue
        age = str(ages[0])
        if not ("天" in age or "小时" in age):
            listSubjecId.append(link)


def getSubject(tiezi):
    """Print every reply of one topic, page by page.

    ``tiezi`` is a ``(topic_path, page_count)`` tuple as stored in
    ``listSubjecId``. For each reply cell the reply text is printed, followed
    by a line with the author, the reply date (``YYYY-MM-DD``) and the topic
    path, then a separator line. Cells without reply content are skipped.
    Returns None.
    """
    for page_no in range(1, int(tiezi[1]) + 1):
        url = 'https://www.v2ex.com' + tiezi[0] + "?p=" + str(page_no)
        req = requests.get(url=url, headers=heard)
        soup = BeautifulSoup(req.text, "lxml")

        for cell in soup.select("#Main > div:nth-child(4) > div.cell"):
            contents = cell.select("div.reply_content")
            if not contents:
                continue
            print(contents[0].get_text())

            meta = _REPLY_META_RE.findall(str(cell))
            if not meta:
                # Author/age could not be located; nothing more to report.
                continue
            author, age = meta[0]

            # Only a day count is honoured; any other age label maps to today.
            digits = re.sub(r"\D", "", age) if "天" in age else ""
            day = replyTimeCompute(int(digits) if digits else 0)

            print(author + " " + str(day) + " " + tiezi[0])
            print("===========================")
# Pattern applied to the serialized HTML of one "dock_area" block.
# NOTE(review): the first group is lazy and directly followed by ".*", so it
# always captures an empty string and the day-count branch never triggers.
# An older inline example showed ('1 天前', '/t/697703#reply47'), so a
# delimiter after the first group was presumably lost -- confirm upstream.
_REPLY_INFO_RE = re.compile('"fade">(.*?).*a href="(.*?)">', re.S)


def _reply_date_from_age(age_text):
    """Turn a relative-age label into a ``YYYY-MM-DD`` date string.

    Only a day count is honoured: if the label contains "天" (days), its
    digits give the number of days back; any other label maps to today.
    """
    if "天" not in age_text:
        return replyTimeCompute(0)
    digits = re.sub(r"\D", "", age_text)
    # A day label without digits cannot be dated; fall back to today.
    return replyTimeCompute(int(digits) if digits else 0)


def getReplyList(user):
    """Fetch the replies listed on a user's ``/replies`` page.

    Returns ``(user, replies)`` where ``replies`` is a list of
    ``(date, text, md5, href)`` tuples: the reply date as ``YYYY-MM-DD``,
    the reply text, the MD5 hex digest of that text, and the link captured
    from the reply's info block. Replies whose info block cannot be parsed
    are skipped.
    """
    url = "https://www.v2ex.com/member/" + user + "/replies"
    req = requests.get(url=url)
    soup = BeautifulSoup(req.text, "lxml")

    # Reply bodies and their info blocks are paired by position.
    bodies = soup.select("#Main > div.box div.reply_content")
    infos = soup.select("#Main > div.box div.dock_area")

    ReplyList = []
    for body, info in zip(bodies, infos):
        parsed = _REPLY_INFO_RE.findall(str(info))
        if not parsed:
            continue
        age_text, href = parsed[0]
        text = body.get_text()
        # Hash the whole reply; hashing text[0] covered only its first
        # character and raised IndexError on an empty reply.
        md5 = replyContentMD5(text)
        ReplyList.append((_reply_date_from_age(age_text), text, md5, href))
    return user, ReplyList


def replyTimeCompute(day):
    """Return the local date ``day`` days before today as ``YYYY-MM-DD``."""
    reply_date = datetime.date.today() - datetime.timedelta(days=day)
    return reply_date.strftime("%Y-%m-%d")


def replyContentMD5(reply):
    """Return the MD5 hex digest of ``reply`` encoded as UTF-8."""
    return hashlib.md5(reply.encode(encoding='utf-8')).hexdigest()


# if __name__ == '__main__':
#     for i in urllist:
#         getSubjectUrl(i)
#     for i in listSubjecId:
#         getSubject(i)


# The user list and the node list are both kept in the database so that
# users or nodes can be added promptly.