├── TiebaStrat.bat
├── TiebaSetting.conf
├── Sentiment
├── TiebaSetting.conf
├── README.md
└── tieba.py
├── README.MD
├── ApiRedme
├── README.md
└── TiebaApiUtil.py
├── myitchat.py
├── myitchat
└── myitchat.py
├── TiebaApiUtil.py
└── tieba.py
/TiebaStrat.bat:
--------------------------------------------------------------------------------
1 | python .\myitchat.py
--------------------------------------------------------------------------------
/TiebaSetting.conf:
--------------------------------------------------------------------------------
1 | [Setting]
2 | sleep = 1
3 | tb = 林俊杰,周杰伦
4 | start = 1
5 | end = 2
6 | Time = 2018-10-14-1
7 | [Customize]
8 | essential =演唱会
9 |
10 |
--------------------------------------------------------------------------------
/Sentiment/TiebaSetting.conf:
--------------------------------------------------------------------------------
1 | # 配置文件注释
2 | [Setting]
3 |
4 | Sleep = 18000
5 | # 贴吧名 贴吧用英文逗号(,)分隔符分割
6 | tb = 武汉东湖学院,汉口学院,武汉学院
7 |
8 | # 贴吧页数指定爬取 默认起始页Start=1 End = 3
9 |
10 | Start = 1
11 |
12 | End = 10
13 |
14 | [Customize]
15 |
16 | # 关键词
17 |
18 | Essential = 留学,雅思,托福,新东方,新航道,出国
19 |
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | # Tieba API Sentiment
2 |
3 | 贴吧API以及关键词提醒,舆情监控,微信陪聊机器人。
4 |
5 | :smiley::smiley::smiley::smiley::smiley:
6 |
7 |
8 |
9 | ## Time update
10 |
11 | - 2018/7/3 立项
12 | - 2018/7/12 完成设计
13 | - 2018/7/14 完成API部分
14 | - 2018/7/19 修复API部分,完成关键词监控部分
15 | - 2018/7/20 关键词监控部分上线,微信监控部分测试...
16 | - 2018/7/20 修复初始化问题
17 | - 2018/7/23 更新Sentiment人性化设置,加入全新微信机器人监控(myitchat.py)
18 | - 2018/7/25 更新myitchat以及Sentiment API,添加配置修改内容
19 | - 2018/7/25 Beta V0.1 上线
20 | - 2018/7/27 修复若干问题,正式版V1.0上线
21 | - 2018/7/28 修复因访问速度过快引起的连接重置的bug
22 | - 2018/10/15 增加时间过滤,修复若干BUG。
23 | - 2019/6/28 web版失效
24 | - 2020/4/29 web版生效
25 |
26 |
27 | ## 相关依赖
28 | python 3+
29 | pip install requests
30 | pip install beautifulsoup4 lxml
31 | pip install itchat
32 |
33 |
34 | ## 使用方法
35 |
36 | ### Windows:
37 |
38 | 点击目录下的:
39 | ***TiebaStrat.bat***
40 |
41 | 
42 |
43 |
44 | ### Windows/Linux:
45 |
46 | CMD/powershell:***python myitchat.py***
47 |
48 | 
49 |
50 |
51 | 扫码完成后即可开启贴吧微信助手。
52 |
53 |
54 |
55 | ### 微信机器人
56 |
57 | 
58 |
59 |
60 | ### 关键词舆情监控
61 |
62 | 编辑TiebaSetting.conf:
63 |
64 | [Setting]
65 | # 休眠周期/秒
66 | sleep = 300
67 | # 监控贴吧
68 | tb = 林俊杰,周杰伦,王力宏
69 | # 开始监控页数
70 | start = 1
71 | # 结束监控页数
72 | end = 3
73 | # 时间控制
74 | Time = 2018-10-14-0
75 | # 0/1 0:过滤Time之前 1:过滤Time之后
76 |
77 | [Customize]
78 | # 关键词
79 | essential = 演唱会,华语乐坛
80 |
81 |
82 |
83 |
84 |
85 | 1. 启动微信扫码登陆
86 | 2. 输入“开启监控”,即可记录该用户ID并且在之后的监控数据以及配置修改都只有该用户可执行。
87 | 
88 | 3. 输入“修改配置”,复制模板修改并且发送即可修改。
89 | 
90 | 4. 享受结果。
91 |
92 | 
93 |
94 | ----------
95 | ## 详细思路
96 |
97 | 访问我的博客:
98 |
99 | [贴吧监控助手](http://lunatic.wang/posts/d1eb00bb/)
100 |
101 | ----------
102 | ## 自定义第三方拓展
103 |
104 | - 第三方贴吧请以 *TieBa API Util* 拓展
105 | - 舆情功能拓展以及其他功能拓展 *Sentiment API*
106 | - 微信机器人拓展 *myitchat*
107 |
108 | 以上为建议拓展文件衍生。
109 |
110 | ----------
111 | ## TieBa API Util
112 |
113 | - 获取官方贴吧信息
114 | - 高度自由化的信息收集
115 | - 可制作第三方客户端
116 | - 回复以及楼中楼形式
117 |
118 | [Tieba API 相关说明](/ApiRedme)
119 |
120 | ## Sentiment API
121 |
122 | - 自定义贴吧监控
123 | - 自定义关键词监控
124 | - 自定义更新周期
125 | - 可定制化的信息收集
126 | - 可作为其他项目工具
127 |
128 |
129 | [Sentiment API 相关说明](/Sentiment)
130 |
131 | ## myitchat
132 |
133 | - 图灵机器人
134 | - 智能对话
135 | - 微信监控舆情
136 | - 智能化控制
137 |
138 | [myitchat 相关说明](/myitchat)
139 |
140 |
141 |
142 | ## 其他说明
143 |
144 | 请勿用于商业用途
145 |
--------------------------------------------------------------------------------
/ApiRedme/README.md:
--------------------------------------------------------------------------------
1 | # Tieba API Util
2 |
3 | :smiley: :smiley: :smiley: :smiley:
4 |
5 |
6 | ## TieBa API
7 |
8 |
9 | ### 相关依赖
10 | pip install requests
11 |
12 | pip install beautifulsoup4 lxml
13 |
14 | ### 使用方法
15 |
16 | import TiebaApiUtil
17 |
18 | 内置两个方法,分别是GetPage/GetTiebaOne。
19 |
20 |
21 | ### GetPage
22 |
23 | 调用方法
24 |
25 | TiebaApiUtil.GetPage(Key,Start,End)
26 |
27 | 参数说明:
28 |
29 |
30 | * key:贴吧关键词
31 |
32 | 需要访问的贴吧关键词,譬如:'国际米兰'。
33 |
34 | * Start:开始页数
35 |
36 | 默认为1,不能为负。
37 |
38 | * End:结束页数
39 |
40 | 默认为3,不能为负。
41 |
42 | 该函数返回Json,返回形式为:
43 |
44 | * key:贴吧名称
45 | * Page:
46 | * X:贴吧当前页数
47 | * Id:帖子ID
48 | * Title:帖子标题
49 | * Reply:帖子回复数
50 | * Author:帖子作者
51 | * Time:最后回复时间
52 |
53 |
54 |
55 |
56 | TiebaApiUtil.GetPage(key='国际米兰',Start=1,End=3)
57 | 如下:
58 |
59 |
60 |
61 |
62 | {
63 |
64 | "key":"国际米兰",
65 | "Page":[
66 | {
67 | "1":[
68 | {
69 | "Id":"5793707221",
70 | "Title":"官方:埃德尔转会江苏苏宁",
71 | "Reply":"102",
72 | "Author":"树欲动而风又止",
73 | "Time":"23:34"
74 | },
75 | {
76 | "Id":"5793703055",
77 | "Title":"国际米兰新闻晚报,7月13日",
78 | "Reply":"254",
79 | "Author":"wyp861025",
80 | "Time":"23:44"
81 | }
82 | ]
83 | },
84 | {
85 | "2":[
86 | {
87 | "Id":"5793707221",
88 | "Title":"官方:埃德尔转会江苏苏宁",
89 | "Reply":"102",
90 | "Author":"树欲动而风又止",
91 | "Time":"23:34"
92 | },
93 | {
94 | "Id":"5793703055",
95 | "Title":"国际米兰新闻晚报,7月13日",
96 | "Reply":"254",
97 | "Author":"wyp861025",
98 | "Time":"23:44"
99 | }
100 | ]
101 | }
102 | ]
103 |
104 | }
105 |
106 |
107 | ### GetTiebaOne
108 |
109 | 使用方法:
110 |
111 | TiebaApiUtil.GetTiebaOne(ID)
112 | 参数说明:
113 |
114 | * Id:帖子唯一ID
115 |
116 | 该函数返回Json,返回形式为:
117 |
118 | * Text:内容
119 | * Author:用户
120 | * Time:时间
121 | * FloorInFloor:楼中楼
122 | * Text:内容
123 | * Author:用户
124 | * Time:时间
125 |
126 |
127 |
128 |
129 | TiebaApiUtil.GetTiebaOne(5789990094)
130 |
131 |
132 |
133 |
134 | {
135 | "Text":"
这么容易就爆照的,不是抠脚就是快餐
",
136 | "Author":"一涵呦",
137 | "Time":"7-10 20:56",
138 | "FloorInFloor":[
139 | {
140 | "Text":"快餐是啥",
141 | "Author":"可爱的Hjkjbb",
142 | "Time":"14:25"
143 | },
144 | {
145 | "Text":"回复 言清欢🍒🔯🔯 :快餐就是我们平常点的外卖。",
146 | "Author":"李坤铭12",
147 | "Time":"15:16"
148 | },
149 | {
150 | "Text":"回复 言清欢🍒🔯🔯 :打一次就走的,不过夜的
",
151 | "Author":"啦啦队长15",
152 | "Time":"15:24"
153 | },
154 | {
155 | "Text":"回复 米破是张小恒 :你小弟弟就这么粗。",
156 | "Author":"让我鸡儿放会假",
157 | "Time":"16:47"
158 | },
159 | {
160 | "Text":"卖茶叶的",
161 | "Author":"天生的she手",
162 | "Time":"20:12"
163 | }
164 | ]
165 | }
166 |
167 |
168 |
169 |
170 |
--------------------------------------------------------------------------------
/myitchat.py:
--------------------------------------------------------------------------------
1 | import itchat
2 | import requests
3 | import tieba
4 | import threading
5 | import json
6 | import platform
7 |
# Placeholder API key; it is not referenced by any code visible in this file.
KEY = 'xxxxxxx'

# Flag shared by the message handler and the monitoring loop: the handler sets
# it to 1 when it receives a settings update, and Main() resets it to 0.
T = 0
11 |
def get_response(msg, UserId):
    """Ask the Tuling chatbot web API for a reply to a text message.

    Args:
        msg: Text forwarded to the bot as ``perception.inputText.text``.
        UserId: Identifier of the sender; characters 1..32 of its string
            form are sent as ``userInfo.userId``.

    Returns:
        The ``results[0]['values']['text']`` field of the JSON response, or
        ``None`` when the request fails, the body is not JSON, or the
        response does not have that shape.
    """
    apiUrl = 'http://openapi.tuling123.com/openapi/api/v2'

    data = {
        "reqType": 0,
        "perception": {
            "inputText": {
                "text": msg
            },
            "inputImage": {
                "url": ""
            },
            "selfInfo": {
                "location": {
                    "city": "",
                    "province": "",
                    "street": ""
                }
            }
        },
        "userInfo": {
            # NOTE(review): credential is hard-coded here while the module-level
            # KEY constant goes unused; consider loading it from configuration.
            "apiKey": "42afd1a6112f4a93bbaa83022d980132",
            "userId": str(UserId)[1:33]
        }
    }

    print(data)
    try:
        # A timeout keeps the message handler from blocking forever when the
        # service does not answer.
        reply = requests.post(apiUrl, data=json.dumps(data), timeout=10).json()
        return reply['results'][0]['values']['text']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network failure, non-JSON body, or an unexpected response shape:
        # report "no reply" instead of letting the handler crash.
        return None
55 |
56 |
# WeChat user name of the account that enabled monitoring ('' until then).
id = ''


@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Handle one incoming WeChat text message.

    Commands, checked in order:
      * '开启监控' while no user is registered: remember the sender as the
        monitoring user and send them the current settings.
      * '修改配置' from the registered user: send an editable settings
        template.
      * a message containing '监控贴吧列表' from the registered user: parse
        it as a dict literal and pass it to ``tieba.SetSetting``.
    Anything else is forwarded to ``get_response``.

    Args:
        msg: Message mapping; ``msg['Text']`` and ``msg['FromUserName']``
            are read.

    Returns:
        ``None`` for the first two commands, the result of
        ``tieba.SetSetting`` for a settings update, otherwise the result of
        ``get_response``.

    Raises:
        ValueError, SyntaxError: if a settings-update message is not a valid
            Python literal.
    """
    global id, T
    # Local import: ast is not among this file's top-level imports.
    import ast

    text = msg['Text']
    sender = msg['FromUserName']

    print('id = |' + id)
    print(text == '修改配置')
    print(id == sender)

    if text == '开启监控' and not id:
        id = sender
        itchat.send_msg('已经开启监控~', toUserName=id)
        itchat.send_msg(tieba.setting(), toUserName=id)
        return

    if text == '修改配置' and id == sender:
        Setting = tieba.GetSetting()
        template = {
            '监控贴吧列表': Setting[0],
            '监控关键词': Setting[1],
            '监控周期(S)': Setting[2],
            '开始页数': Setting[3],
            '结束页数': Setting[4]
        }
        itchat.send_msg('修改以下列信息,并且将修改后的信息复制发送', toUserName=id)
        itchat.send_msg(str(template), toUserName=id)
        return

    # Only the registered user may change the settings, matching the check on
    # the '修改配置' command above.
    if '监控贴吧列表' in text and id == sender:
        # SECURITY: this used to be eval(msg['Text']), which executed arbitrary
        # code sent over chat. literal_eval accepts the same dict literal that
        # the template above produces but evaluates literals only.
        new_settings = ast.literal_eval(text)
        T = 1
        return tieba.SetSetting(new_settings)

    print(sender)
    return get_response(text, sender)
109 |
def Main():
    """Run the monitoring loop forever.

    Each round calls ``tieba.Main()`` and, unless the result is empty or its
    length exceeds 3452, sends it to the registered WeChat user ``id``.
    The shared flag ``T`` is reset to 0 whenever it is found set to 1.
    """
    global T
    rounds = 1
    while True:
        update = tieba.Main()
        print(f'This is myitchat: {update}')
        print(T)

        # Nothing worth sending, or too long to send: poll again.
        if update is None or update == [] or update == ' ' or len(update) > 3452:
            continue

        if T == 1:
            T = 0
            print('改变了T: ' + str(T))

        if T == 0 and rounds != 0:
            itchat.send_msg('监控到更新的数据 \n \n' + str(update), toUserName=id)
        rounds += 1
128 |
129 |
130 |
131 |
132 |
# Detect the host OS to pick the login mode.
sysstr = platform.system()

if sysstr == "Linux":
    # NOTE(review): enableCmdQR=2 presumably renders the login QR code in the
    # terminal -- confirm against the itchat documentation.
    itchat.auto_login(enableCmdQR=2)
else:
    # Windows, plus any other platform (previously those never logged in).
    itchat.auto_login(hotReload=True)

# blockThread=False: itchat.run returns instead of blocking this thread.
itchat.run(blockThread=False)
tieba.ini()
# Pass the function itself. The previous target=Main() called Main() right
# here on the main thread, so the Thread object was never created or started.
tie = threading.Thread(target=Main)
tie.start()
147 |
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/myitchat/myitchat.py:
--------------------------------------------------------------------------------
1 | import itchat
2 | import requests
3 | import tieba
4 | import threading
5 | import json
6 | import platform
7 |
# Placeholder API key; it is not referenced by any code visible in this file.
KEY = 'xxxxxxx'

# Flag shared by the message handler and the monitoring loop: the handler sets
# it to 1 when it receives a settings update, and Main() resets it to 0.
T = 0
11 |
def get_response(msg, UserId):
    """Ask the Tuling chatbot web API for a reply to a text message.

    Args:
        msg: Text forwarded to the bot as ``perception.inputText.text``.
        UserId: Identifier of the sender; characters 1..32 of its string
            form are sent as ``userInfo.userId``.

    Returns:
        The ``results[0]['values']['text']`` field of the JSON response, or
        ``None`` when the request fails, the body is not JSON, or the
        response does not have that shape.
    """
    apiUrl = 'http://openapi.tuling123.com/openapi/api/v2'

    data = {
        "reqType": 0,
        "perception": {
            "inputText": {
                "text": msg
            },
            "inputImage": {
                "url": ""
            },
            "selfInfo": {
                "location": {
                    "city": "",
                    "province": "",
                    "street": ""
                }
            }
        },
        "userInfo": {
            # NOTE(review): credential is hard-coded here while the module-level
            # KEY constant goes unused; consider loading it from configuration.
            "apiKey": "42afd1a6112f4a93bbaa83022d980132",
            "userId": str(UserId)[1:33]
        }
    }

    print(data)
    try:
        # A timeout keeps the message handler from blocking forever when the
        # service does not answer.
        reply = requests.post(apiUrl, data=json.dumps(data), timeout=10).json()
        return reply['results'][0]['values']['text']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network failure, non-JSON body, or an unexpected response shape:
        # report "no reply" instead of letting the handler crash.
        return None
55 |
56 |
# WeChat user name of the account that enabled monitoring ('' until then).
id = ''


@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Handle one incoming WeChat text message.

    Commands, checked in order:
      * '开启监控' while no user is registered: remember the sender as the
        monitoring user and send them the current settings.
      * '修改配置' from the registered user: send an editable settings
        template.
      * a message containing '监控贴吧列表' from the registered user: parse
        it as a dict literal and pass it to ``tieba.SetSetting``.
    Anything else is forwarded to ``get_response``.

    Args:
        msg: Message mapping; ``msg['Text']`` and ``msg['FromUserName']``
            are read.

    Returns:
        ``None`` for the first two commands, the result of
        ``tieba.SetSetting`` for a settings update, otherwise the result of
        ``get_response``.

    Raises:
        ValueError, SyntaxError: if a settings-update message is not a valid
            Python literal.
    """
    global id, T
    # Local import: ast is not among this file's top-level imports.
    import ast

    text = msg['Text']
    sender = msg['FromUserName']

    print('id = |' + id)
    print(text == '修改配置')
    print(id == sender)

    if text == '开启监控' and not id:
        id = sender
        itchat.send_msg('已经开启监控~', toUserName=id)
        itchat.send_msg(tieba.setting(), toUserName=id)
        return

    if text == '修改配置' and id == sender:
        Setting = tieba.GetSetting()
        template = {
            '监控贴吧列表': Setting[0],
            '监控关键词': Setting[1],
            '监控周期(S)': Setting[2],
            '开始页数': Setting[3],
            '结束页数': Setting[4]
        }
        itchat.send_msg('修改以下列信息,并且将修改后的信息复制发送', toUserName=id)
        itchat.send_msg(str(template), toUserName=id)
        return

    # Only the registered user may change the settings, matching the check on
    # the '修改配置' command above.
    if '监控贴吧列表' in text and id == sender:
        # SECURITY: this used to be eval(msg['Text']), which executed arbitrary
        # code sent over chat. literal_eval accepts the same dict literal that
        # the template above produces but evaluates literals only.
        new_settings = ast.literal_eval(text)
        T = 1
        return tieba.SetSetting(new_settings)

    print(sender)
    return get_response(text, sender)
109 |
def Main():
    """Run the monitoring loop forever.

    Each round calls ``tieba.Main()`` and, unless the result is empty or its
    length exceeds 3452, sends it to the registered WeChat user ``id``.
    The shared flag ``T`` is reset to 0 whenever it is found set to 1.
    """
    global T
    rounds = 1
    while True:
        update = tieba.Main()
        print(f'This is myitchat: {update}')
        print(T)

        # Nothing worth sending, or too long to send: poll again.
        if update is None or update == [] or update == ' ' or len(update) > 3452:
            continue

        if T == 1:
            T = 0
            print('改变了T: ' + str(T))

        if T == 0 and rounds != 0:
            itchat.send_msg('监控到更新的数据 \n \n' + str(update), toUserName=id)
        rounds += 1
128 |
129 |
130 |
131 |
132 |
# Detect the host OS to pick the login mode.
sysstr = platform.system()

if sysstr == "Linux":
    # NOTE(review): enableCmdQR=2 presumably renders the login QR code in the
    # terminal -- confirm against the itchat documentation.
    itchat.auto_login(enableCmdQR=2)
else:
    # Windows, plus any other platform (previously those never logged in).
    itchat.auto_login(hotReload=True)

# blockThread=False: itchat.run returns instead of blocking this thread.
itchat.run(blockThread=False)
tieba.ini()
# Pass the function itself. The previous target=Main() called Main() right
# here on the main thread, so the Thread object was never created or started.
tie = threading.Thread(target=Main)
tie.start()
147 |
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/Sentiment/README.md:
--------------------------------------------------------------------------------
1 | # Sentiment API
2 |
3 | ---
4 |
5 | :tokyo_tower: :tokyo_tower: :tokyo_tower:
6 |
7 |
8 | ## Sentiment
9 |
10 |
11 | ### 相关依赖
12 |
13 | import TiebaApiUtil
14 |
15 | ### 使用方法
16 |
17 | import tieba
18 |
19 | 内置方法:
20 |
21 | - GetId
22 | - GetText
23 | - OneToOne
24 | - ComparisonDict()
25 | - ini()
26 | - Main()
27 | - setting()
28 | - GetSetting()
29 | - SetSetting()
30 | - dict_string()
31 |
32 | 内置文件:
33 |
34 | - TiebaSetting.conf
35 |
36 | 配置文件相关参数:
37 |
38 | [Setting]
39 |
40 | Sleep = 18000
41 | # 贴吧名 贴吧用英文逗号(,)分隔符分割
42 |
43 | tb = 国际米兰,AC米兰,皇家马德里
44 |
45 | # 贴吧页数指定爬取 默认起始页Start=1 End = 3
46 |
47 | Start = 1
48 |
49 | End = 10
50 |
51 | [Customize]
52 |
53 | # 关键词
54 |
55 | Essential = C罗,尤文,梅西,卡卡,透
56 |
57 |
58 |
59 | ---
60 | ### GetId
61 |
62 |
63 | #### 调用方法
64 |
65 |
66 | tieba.GetId()
67 |
68 | #### 方法说明
69 |
70 | 该方法获取配置文件信息,获取贴吧列表调用TiebaApiUtil.GetPage()方法。
71 |
72 |
73 | #### 返回形式:
74 |
75 | 
76 |
77 |
78 |
79 |
80 |
81 | ---
82 |
83 | ### GetText
84 |
85 | #### 使用方法:
86 |
87 | tieba.GetText(list)
88 |
89 | #### 参数说明:
90 |
91 | list:获取贴吧页数标题列表,以及比较相关关键词信息
92 |
93 | #### 方法说明
94 |
95 | 通过调用OneToOne()比较关键词,将相关数据存入Save中。
96 |
97 | #### 返回形式
98 |
99 |
100 | 该方法没有返回值。
101 |
102 |
103 | ---
104 | ### OneToOne
105 |
106 | #### 使用方法:
107 |
108 | tieba.OneToOne(Text)
109 |
110 | #### 参数说明:
111 |
112 | Text:用户回复内容
113 |
114 | #### 方法说明:
115 |
116 | 将内容与关键词进行比较
117 |
118 | #### 返回形式
119 |
120 | 该方法返回布尔值。
121 |
122 | - True
123 | - 比对成功
124 | - False
125 | - 比对失败
126 |
127 |
128 | ---
129 |
130 | ### ComparisonDict
131 |
132 | #### 使用方法:
133 | 不推荐单独使用
134 |
135 | #### 参数说明:
136 | 无
137 |
138 | #### 方法说明:
139 | 该方法用以Save与OldSave进行对比从而达到监控目的
140 |
141 | #### 返回形式
142 |
143 | 重新生成新list —> NewList
144 |
145 | #### NewList
146 |
147 | 对比差异及结果
148 |
149 | ---
150 | ### ini
151 |
152 | #### 使用方法:
153 | tieba.ini()
154 |
155 | #### 参数说明:
156 | 无
157 |
158 | #### 方法说明:
159 | 该方法用以程序第一次启动用以初始化,在程序未启动或者未生成数据文件时先行调用该方法。
160 |
161 | #### 返回形式
162 | 无
163 |
164 | ---
165 | ### Main
166 |
167 | #### 使用方法:
168 |
169 | tieba.Main()
170 |
171 | #### 参数说明
172 | 无
173 |
174 | #### 方法说明:
175 |
176 | 该方法为监控主程序,当程序完成初始化后进行监控
177 |
178 | #### 返回形式:
179 |
180 | NewList ——> 该list存放监控数据变化
181 |
182 |
183 |
184 |
185 | ---
186 |
187 | ### setting()
188 |
189 | #### 使用方法:
190 |
191 | tieba.setting()
192 |
193 | #### 参数说明
194 | 无
195 |
196 | #### 方法说明:
197 |
198 | 该方法查看TiebaSetting.conf配置内容
199 |
200 | #### 返回形式:
201 |
202 | String 字符串形式:
203 |
204 | 
205 |
206 | ---
207 |
208 | ### GetSetting()
209 |
210 | #### 使用方法:
211 |
212 | tieba.GetSetting()
213 |
214 | #### 参数说明
215 | 无
216 |
217 | #### 方法说明:
218 |
219 | 该方法查看TiebaSetting.conf配置内容,以list形式返回
220 |
221 | #### 返回形式:
222 |
223 | 
224 |
225 | ---
226 | ### SetSetting(dict)
227 |
228 | #### 使用方法:
229 |
230 | tieba.SetSetting(dict)
231 |
232 | #### 参数说明
233 | dict : 传入修改的配置文件信息
234 |
235 | #### 方法说明:
236 |
237 | 该方法动态修改配置文件信息,用户可在程序运行中改变关键词等配置信息。
238 |
239 | #### 返回形式:
240 |
241 |
242 | 
243 |
244 | ---
245 | ### dict_string(dict)
246 |
247 | #### 使用方法:
248 |
249 | tieba.dict_string(dict)
250 |
251 | #### 参数说明
252 | dict :适配关键词的Text等相关dict信息
253 |
254 | #### 方法说明:
255 |
256 | 该方法用于去除适配的Text内容中去除HTML信息的图片标签以及其他影响阅读的标签
257 |
258 | #### 返回形式:
259 |
260 | > 监控到更新的数据
261 | >
262 | > 帖子地址: https://tieba.baidu.com/p/5660167836
263 |
264 | >
265 | >
266 | > 帖子地址: https://tieba.baidu.com/p/5806639338
267 | > 二硕影迷(7-23 19:24):回复 啦啦啦哈哈撒
269 | > 卿卿且苧(11:23):回复 二硕影迷
271 | >
272 | > 帖子地址: https://tieba.baidu.com/p/5805131589
273 | > 萝卜森兔耳德(7-22 07:42):现在马上要录取了,突然想到一个很害怕的情况!我是第二批次,十个学校知道必须专业服从调剂于是我先把那十个勾给打上了然后再写的学校和代码,问一下提交的时候我服从的调剂勾会不会没有啊感觉好吓人啊!
274 | >
275 | >
276 | > 帖子地址: https://tieba.baidu.com/p/5803120565
277 | > GGXHTML(7-20 16:37):山东影制的录取 出来啦吗
279 | > a37203050(7-22 05:45):回复 GGXHTML
281 | > 神奇的兔酱(7-22 08:33):回复 a37203050 :没有出,10个计划山东,最低录取到411分。 应该会追加计划
282 | >
283 | > GGXHTML(7-22 09:43):这个学习的录取线大概什么时候出啊?
284 | >
285 | >
286 | > 帖子地址: https://tieba.baidu.com/p/5808492570
287 | > 熏熏暖风(22:40):本来江夏区算半个郊区,现在还没有地铁只有3种公交可以出去,不过地铁通了就方便了,地铁口离东湖还是有一些距离的,宿舍环境装修了,被称为酒店级宿舍,图书馆还是不错的,市区的话,江夏区的市区大概半个多小时,不过没什么玩的买买一些东西是可以的,洪山区那边公交就50分钟到街道口那边,之后去哪里有地铁就各种方便了,就是放假的时候公交人很多
288 | >
289 | >
290 | > 帖子地址: https://tieba.baidu.com/p/5807674831
291 | > zzhxiannv(23:04):不知道什么预录变成录取
292 | >
293 | >
294 | > 帖子地址: https://tieba.baidu.com/p/5806260556
295 | > 哈欠女神(7-23 10:49):最低528!!(2017年这所学校在海南录取最低分)
296 | >
297 | > shine包仔(7-23 14:07):回复 泌夫人(.*?).*?回([0-9]\d*)\s(.*?)\s(.*?)
',re.S)
90 | items = re.findall(pattern, str(x))
91 | # print(items)
92 | # 这里对标题进行一次修正,防止出现 '1.\xa0' 情况
93 | if c < 10:
94 | Title = items[0][1][3:]
95 | else:
96 | Title = items[0][1][4:]
97 |
98 | c += 1
99 | # 创建一个Dict
100 |
101 | Son = {
102 | 'Id':items[0][0],
103 | 'Title':Title,
104 | 'Reply':items[0][2],
105 | 'Author':items[0][3],
106 | 'Time':items[0][4]
107 | }
108 | SouList.append(Son)
109 |
110 | Page = {
111 | str(int((i + 20) / 20)): SouList
112 | }
113 |
114 |
115 | # 这里需要清空list
116 | """
117 | 引入copy
118 | 在python 赋值是引入A=B,当清空或者销毁前者B被赋值的变量时,出现复制后的变量A为空,所以这里我们采用copy包中的deepcopy方法而不是copy方法。
119 |
120 | 参考:https://www.cnblogs.com/koliverpool/p/6791579.html
121 | """
122 |
123 | SuperList.append(copy.deepcopy(Page))
124 | SouList.clear()
125 | if not '下一页' in GetPageID.text:
126 | break
127 | ReturnJson['Page'] = SuperList
128 |
129 |
130 |
131 | # 添加 ensure_ascii=False 防止中文乱码
132 | Result = json.dumps(ReturnJson,ensure_ascii=False)
133 |
134 | return Result
135 |
136 | """
137 | 获取单个页数,该函数只于贴子ID有关
138 |
139 | http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=4552337163&new_word=&pinf=1_2_0&pn=60&lp=6021
140 | 关键词是
141 | KZ:帖子ID
142 | pn:(0,30,60,90)
143 | (1,2,3,4)
144 |
145 | """
146 | def GetTiebaOne(ID):
147 | """
148 | json 格式
149 | {
150 | 'Text':'balabalabala'
151 | 'Author':'123'
152 | 'FloorInFloor':{
153 | {
154 | "Text": "balabalabala ",
155 | "Author": "123",
156 | "Time": "uacpayhs 09:34"
157 | },{
158 | "Text": "回复 ",
159 | "Author": "极限rabbit",
160 | "Time": "uacpayhs 09:34"
161 | },
162 | }
163 |
164 | }
165 | """
166 | # 先获取帖子第一页以及帖子回复数
167 |
168 | page = 0
169 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz='
170 | url2 = '&new_word=&pinf=1_2_0&pn='+ str(page)
171 | url3 = '&lp=6021'
172 | url = url1+str(ID)+url2+url3
173 | # print(url)
174 | # 于前处理
175 | time.sleep(0.01)
176 | GetContent = requests.get(url=url,headers=headers)
177 | # Soup = BeautifulSoup(GetContent.text,'lxml')
178 |
179 | # 获取页数
180 | # SumPage = Soup.select_one('div.h > input[type="text"]').attrs['value']
181 |
182 | # --------------------------------------------
183 | # print(str(page))
184 | Soup = BeautifulSoup(GetContent.text, 'lxml')
185 | findall = Soup.select('div.i')
186 | FatherList = []
187 | SonDict = {}
188 |
189 | # 异常以及不存在的帖子说明:
190 |
191 | if '您要浏览的贴子不存在' in GetContent.text:
192 | ReturnJ = {
193 | 'Error' : 'Error'
194 | }
195 | return json.dumps(ReturnJ)
196 | # 页数增加
197 | for page in range(0,16122330,30):
198 | # 这里要对页数循环
199 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz='
200 | url2 = '&new_word=&pinf=1_2_0&pn=' + str(page)
201 | url3 = '&lp=6021'
202 | url = url1 + str(ID) + url2 + url3
203 | # print(url)
204 | time.sleep(0.01)
205 | GetContent = requests.get(url=url,headers=headers)
206 |
207 | Soup = BeautifulSoup(GetContent.text, 'lxml')
208 | findall = Soup.select('div.i')
209 |
210 | for OneContent,count in zip(findall,range(1,999)):
211 |
212 |
213 | if count == 1 and page == 0:
214 |
215 | pattern = re.compile('class="i">1楼.\s(.*?).*?(.*?).*?class="b">(.*?)\d*楼.\s(.*?).*?(.*?).*?class="b">(.*?).*?href="(.*?)">回复(.*?)', re.S)
228 | items = re.findall(pattern, str(OneContent))
229 |
230 | if items == [] or items == None or items == '':
231 | continue
232 | Text = items[0][0]
233 | Author = items[0][1]
234 | Time = items[0][2]
235 |
236 |
237 | Floor = items[0][4][1:-1]
238 | # print(items)
239 | FloorInFloor = []
240 | if not (Floor == '' or Floor == None):
241 | # print(items[0][2])
242 | FloorInFloor = GetFloorInFloor(url=items[0][3])
243 | SonDict['Text'] = Text
244 | SonDict['Author'] = items[0][1]
245 | SonDict['Time'] = Time
246 | SonDict['FloorInFloor'] = FloorInFloor
247 | FatherList.append(copy.deepcopy(SonDict))
248 | FloorInFloor.clear()
249 | SonDict.clear()
250 |
251 | # 这里不能直接判断下一页
252 |
253 | if not '下一页' in GetContent.text:
254 | break
255 |
256 | FatherListJson = json.dumps(FatherList,ensure_ascii=False)
257 | return FatherListJson
258 | #获取楼中楼
259 |
260 | def GetFloorInFloor(url):
261 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/'
262 |
263 | # 这里要做一次替换,因为html的&是&,做一次替换
264 |
265 | # 参数内置默认99
266 | url2 = url1 + url.replace('&','&') +'&fpn='
267 |
268 |
269 |
270 | # 读取楼中楼信息
271 |
272 | ReturnList = []
273 |
274 | for pn in range(1,100):
275 | time.sleep(0.01)
276 | web = requests.get(url=url2+str(pn),headers=headers)
277 | Soup = BeautifulSoup(web.text,'lxml')
278 | findall = Soup.select('div.i')
279 | for i in findall:
280 |
281 | pattern = re.compile(
282 | '(.*?)
(.*?).*?
(.*?)',
283 | re.S)
284 | items = re.findall(pattern, str(i))
285 | Son = {
286 | 'Text':items[0][0],
287 | 'Author':items[0][1],
288 | 'Time':items[0][2]
289 | }
290 | ReturnList.append(Son)
291 |
292 | # 获取楼中楼页数
293 | if not '下一页' in web.text:
294 | break
295 | return ReturnList
296 |
297 |
298 |
299 |
300 |
301 |
302 |
--------------------------------------------------------------------------------
/ApiRedme/TiebaApiUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import re
6 | import copy
7 | import time
8 |
9 | # 自制贴吧api
10 | """
11 | 该api依赖百度贴吧web版
12 |
13 | 贴吧api作为工具类,可制作第三方客户端,极速/个性/无广告
14 | """
15 |
16 |
17 | """
18 | 模拟三星手机访问
19 | 虽然不加headers也可正常访问,但是我们还是要严谨
20 | """
21 |
# HTTP headers passed to every requests.get call in this module; the
# User-Agent string identifies the client as Chrome on an Android SM-G900P.
headers = {
    'Host': 'tieba.baidu.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
32 |
33 | # 获取某个贴吧页面帖子
34 |
35 | """
36 | url = http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw=%E6%9E%97%E4%BF%8A%E6%9D%B0&lp=5011&lm=&pn=0
37 | kw:贴吧关键词
38 | pn=页数(0,20,40,60...)页数
39 | (1,2,3,4)
40 | """
41 |
42 | """
43 | pnf=1 起始页 默认为第一页 Start
44 | pne=3 结束页 默认为第三页 End
45 | """
def GetPage(key, Start=1, End=3):
    """Fetch the thread listing of one tieba and return it as a JSON string.

    Args:
        key: Tieba name; appended to the listing URL as the ``kw`` value.
        Start: First page to fetch, 1-based. Values below 1 are treated as 1.
        End: Last page to fetch (inclusive).

    Returns:
        A JSON string (non-ASCII characters are not escaped). Normally
        ``{"key": key, "Page": [{"<page number>": [thread, ...]}, ...]}``
        where every thread has ``Id``, ``Title``, ``Reply``, ``Author`` and
        ``Time``. When the page carries the "create this bar" notice the
        result is ``{"Error": "改吧尚未建立"}`` instead.
    """
    url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw='
    url2 = '&lp=5011&lm=&pn='

    # The ``pn`` query value advances by 20 per page; clamp so Start <= 0
    # begins at the first page instead of producing a negative offset.
    first_offset = max(Start - 1, 0) * 20
    end_offset = End * 20

    # Compiled once, outside the page loop.
    # NOTE(review): pattern text is carried over unchanged -- verify it
    # against the live page markup.
    pattern = re.compile(r'kz=(.*?)&.*?">(.*?).*?回([0-9]\d*)\s(.*?)\s(.*?)', re.S)
    # Leading ordinal such as '1.\xa0' or '12.\xa0' in front of a title.
    ordinal_prefix = re.compile(r'^\d+\.\s?')

    ReturnJson = {
        'key': key
    }
    SuperList = []
    for offset in range(first_offset, end_offset, 20):
        url = url1 + key + url2 + str(offset)
        time.sleep(0.01)
        GetPageID = requests.get(url=url, headers=headers)
        if '欢迎创建本吧,与今后来到这里的朋友交流讨论' in GetPageID.text:
            Error = {
                'Error': '改吧尚未建立'
            }
            return json.dumps(Error, ensure_ascii=False)

        # One fresh list per page, so no copy/clear is needed afterwards.
        threads = []
        for node in BeautifulSoup(GetPageID.text, 'lxml').select('div.i'):
            items = pattern.findall(str(node))
            if not items:
                # Node does not look like a thread entry; skip it rather
                # than fail on items[0].
                continue
            thread_id, raw_title, reply, author, last_time = items[0]
            threads.append({
                'Id': thread_id,
                'Title': ordinal_prefix.sub('', raw_title, count=1),
                'Reply': reply,
                'Author': author,
                'Time': last_time
            })

        SuperList.append({str(offset // 20 + 1): threads})

        # No "next page" link on this page: it was the last one.
        if '下一页' not in GetPageID.text:
            break

    ReturnJson['Page'] = SuperList
    # ensure_ascii=False keeps Chinese text readable in the output.
    return json.dumps(ReturnJson, ensure_ascii=False)
135 |
136 | """
137 | 获取单个页数,该函数只于贴子ID有关
138 |
139 | http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=4552337163&new_word=&pinf=1_2_0&pn=60&lp=6021
140 | 关键词是
141 | KZ:帖子ID
142 | pn:(0,30,60,90)
143 | (1,2,3,4)
144 |
145 | """
146 | def GetTiebaOne(ID):
147 | """
148 | json 格式
149 | {
150 | 'Text':'balabalabala'
151 | 'Author':'123'
152 | 'FloorInFloor':{
153 | {
154 | "Text": "balabalabala ",
155 | "Author": "123",
156 | "Time": "
uacpayhs 09:34"
157 | },{
158 | "Text": "回复 ",
159 | "Author": "极限rabbit",
160 | "Time": "uacpayhs 09:34"
161 | },
162 | }
163 |
164 | }
165 | """
166 | # 先获取帖子第一页以及帖子回复数
167 |
168 | page = 0
169 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz='
170 | url2 = '&new_word=&pinf=1_2_0&pn='+ str(page)
171 | url3 = '&lp=6021'
172 | url = url1+str(ID)+url2+url3
173 | # print(url)
174 | # 于前处理
175 | time.sleep(0.01)
176 | GetContent = requests.get(url=url,headers=headers)
177 | # Soup = BeautifulSoup(GetContent.text,'lxml')
178 |
179 | # 获取页数
180 | # SumPage = Soup.select_one('div.h > input[type="text"]').attrs['value']
181 |
182 | # --------------------------------------------
183 | # print(str(page))
184 | Soup = BeautifulSoup(GetContent.text, 'lxml')
185 | findall = Soup.select('div.i')
186 | FatherList = []
187 | SonDict = {}
188 |
189 | # 异常以及不存在的帖子说明:
190 |
191 | if '您要浏览的贴子不存在' in GetContent.text:
192 | ReturnJ = {
193 | 'Error' : 'Error'
194 | }
195 | return json.dumps(ReturnJ)
196 | # 页数增加
197 | for page in range(0,16122330,30):
198 | # 这里要对页数循环
199 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz='
200 | url2 = '&new_word=&pinf=1_2_0&pn=' + str(page)
201 | url3 = '&lp=6021'
202 | url = url1 + str(ID) + url2 + url3
203 | # print(url)
204 | time.sleep(0.01)
205 | GetContent = requests.get(url=url,headers=headers)
206 |
207 | Soup = BeautifulSoup(GetContent.text, 'lxml')
208 | findall = Soup.select('div.i')
209 |
210 | for OneContent,count in zip(findall,range(1,999)):
211 |
212 |
213 | if count == 1 and page == 0:
214 |
215 | pattern = re.compile('class="i">1楼.\s(.*?).*?(.*?).*?class="b">(.*?)\d*楼.\s(.*?).*?(.*?).*?class="b">(.*?).*?href="(.*?)">回复(.*?)', re.S)
228 | items = re.findall(pattern, str(OneContent))
229 |
230 | if items == [] or items == None or items == '':
231 | continue
232 | Text = items[0][0]
233 | Author = items[0][1]
234 | Time = items[0][2]
235 |
236 |
237 | Floor = items[0][4][1:-1]
238 | # print(items)
239 | FloorInFloor = []
240 | if not (Floor == '' or Floor == None):
241 | # print(items[0][2])
242 | FloorInFloor = GetFloorInFloor(url=items[0][3])
243 | SonDict['Text'] = Text
244 | SonDict['Author'] = items[0][1]
245 | SonDict['Time'] = Time
246 | SonDict['FloorInFloor'] = FloorInFloor
247 | FatherList.append(copy.deepcopy(SonDict))
248 | FloorInFloor.clear()
249 | SonDict.clear()
250 |
251 | # 这里不能直接判断下一页
252 |
253 | if not '下一页' in GetContent.text:
254 | break
255 |
256 | FatherListJson = json.dumps(FatherList,ensure_ascii=False)
257 | return FatherListJson
258 | #获取楼中楼
259 |
260 | def GetFloorInFloor(url):
261 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/'
262 |
263 | # 这里要做一次替换,因为html的&是&,做一次替换
264 |
265 | # 参数内置默认99
266 | url2 = url1 + url.replace('&','&') +'&fpn='
267 |
268 |
269 |
270 | # 读取楼中楼信息
271 |
272 | ReturnList = []
273 |
274 | for pn in range(1,100):
275 | time.sleep(0.01)
276 | web = requests.get(url=url2+str(pn),headers=headers)
277 | Soup = BeautifulSoup(web.text,'lxml')
278 | findall = Soup.select('div.i')
279 | for i in findall:
280 |
281 | pattern = re.compile(
282 | '(.*?)
(.*?).*?
(.*?)',
283 | re.S)
284 | items = re.findall(pattern, str(i))
285 | Son = {
286 | 'Text':items[0][0],
287 | 'Author':items[0][1],
288 | 'Time':items[0][2]
289 | }
290 | ReturnList.append(Son)
291 |
292 | # 获取楼中楼页数
293 | if not '下一页' in web.text:
294 | break
295 | return ReturnList
296 |
297 |
298 |
299 |
300 |
301 |
302 |
--------------------------------------------------------------------------------
/Sentiment/tieba.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import time
4 | import configparser
5 | import TiebaApiUtil
6 | import copy
7 | import re
8 |
9 | """
10 | 此处用".conf"进行配置
11 | Python 读取写入配置文件 —— ConfigParser
12 | https://www.cnblogs.com/feeland/p/4514771.html
13 |
14 | config = configparser.ConfigParser()
15 | config.read('test.conf')
16 | db_host = config.get("db", "db_port")
17 |
18 | """
19 |
20 |
21 | # 参数配置说明
22 | """
23 | [Setting]
24 | 爬取贴吧
25 | tb:{'国际米兰','linjj','dota','dota2'}
26 |
27 | 爬取默认页数
28 | Start:1
29 | End:4
30 |
31 | [Customize]
32 | 关键词
33 | Essential:{'昨天','今天','明天','后天'}
34 |
35 | """
36 |
37 |
config = configparser.ConfigParser()
# Read as utf-8-sig rather than utf-8 so a leading byte-order mark is dropped.
config.read('TiebaSetting.conf', encoding='utf-8-sig')

# Comma-separated tieba names. Surrounding whitespace is stripped and empty
# entries are dropped so "a, b," yields ['a', 'b'].
key = config.get("Setting", "tb")
keyList = [name.strip() for name in key.split(',') if name.strip()]

# Comma-separated keywords. An empty keyword must never reach OneToOne,
# because '' is contained in every string and would match every post.
Essential = config.get("Customize", "Essential")
EssentialList = [word.strip() for word in Essential.split(',') if word.strip()]

# Sleep period between monitoring rounds.
X = config.getint("Setting", "Sleep")

# First page to crawl.
Start = config.getint("Setting", "Start")
# Last page to crawl.
End = config.getint("Setting", "End")

# Keyword hit count per thread ID for the current crawl.
Save = {}

# Hit counts loaded from file or from the previous crawl.
OldSave = {}

# Thread IDs whose hit count changed.
NewList = []
64 |
65 |
66 | # 获取ID
def GetId():
    """Collect thread IDs from the configured pages of every monitored tieba.

    Calls ``TiebaApiUtil.GetPage`` once per name in ``keyList`` with the
    module-level ``Start`` and ``End`` page numbers.

    Returns:
        A list with the ``Id`` of every listed thread, in the order
        encountered. A tieba whose result has no ``Page`` entry (for example
        an error payload) contributes nothing.
    """
    # Local import: json is not among this module's top-level imports.
    import json

    IdList = []
    for name in keyList:
        raw = TiebaApiUtil.GetPage(key=name, Start=Start, End=End)
        # GetPage returns a JSON string; parse it as data instead of running
        # it through eval().
        listing = json.loads(raw)
        for page in listing.get('Page', []):
            for threads in page.values():
                IdList.extend(thread['Id'] for thread in threads)
    return IdList
79 |
80 |
81 | # 存取相关关键词详细信息
82 |
83 | OldKeyText = {}
84 | KeyText = {}
85 |
86 | '''
87 | {
88 | 123123123 : [
89 | {
90 | }
91 | ]
92 |
93 |
94 | }
95 |
96 | '''
97 |
98 |
99 |
100 | # 获取TEXT 以及对比
def GetText(list):
    """Scan every thread in ``list`` for keyword hits and record them.

    For each thread id the full thread is fetched with
    ``TiebaApiUtil.GetTiebaOne`` and evaluated into a sequence of floor
    records.  A floor, and each reply in its ``'FloorInFloor'`` list, counts
    as a hit when ``OneToOne`` finds a keyword in its ``'Text'``.

    Side effects:
        * ``Save[str(id)]`` is incremented once per hit.
        * ``KeyText[str(id)]`` is set to the list of hit records
          (``Author`` / ``Text`` / ``Time``) when the thread has any hit.

    Args:
        list: iterable of thread ids, as produced by ``GetId``.
    """
    for thread_id in list:
        save_key = str(thread_id)
        hits = []

        raw_thread = TiebaApiUtil.GetTiebaOne(thread_id)
        # NOTE(review): eval() on text derived from scraped pages; consider
        # ast.literal_eval if the payload is always a plain literal.
        floors = eval(raw_thread)

        for floor in floors:
            try:
                # The floor itself.
                if OneToOne(floor['Text']):
                    hits.append({
                        'Author': floor['Author'],
                        'Text': floor['Text'],
                        'Time': floor['Time'],
                    })
                    Save[save_key] = Save.get(save_key, 0) + 1

                # Replies nested under the floor, when there are any.
                nested = floor['FloorInFloor']
                if nested not in ('', None, []):
                    for reply in nested:
                        if OneToOne(reply['Text']):
                            hits.append({
                                'Author': reply['Author'],
                                'Text': reply['Text'],
                                'Time': reply['Time'],
                            })
                            Save[save_key] = Save.get(save_key, 0) + 1
            except TypeError:
                # A record that cannot be searched or iterated is skipped;
                # the rest of that floor is abandoned.
                pass

        if hits:
            KeyText[save_key] = copy.deepcopy(hits)
169 |
170 |
171 |
172 |
def OneToOne(Text):
    """Return True when ``Text`` contains any keyword from ``EssentialList``.

    When no keyword occurs the function falls off the end, so the result is
    ``None`` (falsy) rather than ``False``.
    """
    if any(keyword in Text for keyword in EssentialList):
        return True
177 |
178 |
179 | '''
180 |
181 | # 信息存取列表
182 | Save = {}
183 |
184 | # 读取文件或前一个爬取列表
185 |
186 | NewSave = {}
187 |
188 | '''
def ComparisonDict():
    """Work out which threads changed since the previous crawl.

    A thread id from ``Save`` is appended to ``NewList`` when it is missing
    from ``OldSave`` or when its hit count differs from the stored one.
    Afterwards ``OldSave`` is replaced by a copy of ``Save`` and ``Save`` is
    emptied, ready for the next cycle.

    Unlike a positional scan over ``OldSave``, the membership test also
    works when ``OldSave`` is empty, so the very first hits are reported
    instead of being dropped.
    """
    for thread_id, hit_count in Save.items():
        if thread_id not in OldSave or OldSave[thread_id] != hit_count:
            NewList.append(str(thread_id))

    # Roll the current snapshot over into the "previous" slot.
    OldSave.clear()
    OldSave.update(copy.deepcopy(Save))
    Save.clear()
214 | # 5800836228 3244759899
215 |
216 | # 程序每次完成循环都要存取一次
217 |
218 | # 判断NewList 是否为空
219 |
220 | NewKey = {}
221 | dict = {}
222 |
def ComparisonDictKey():
    """Collect the keyword hits that were not present in the previous crawl.

    For every thread in ``KeyText``:

    * if the thread is absent from ``OldKeyText`` all of its hits are new;
    * otherwise a hit is new when no stored hit has the same ``Text``,
      ``Author`` and ``Time``.

    New hits are stored in the module-level ``dict`` under the thread id
    (threads without new hits get no entry).  Finally ``OldKeyText`` is
    replaced by a copy of ``KeyText`` and ``KeyText`` is emptied.

    A thread whose previous hit list is empty is handled like any other:
    every current hit is reported.
    """
    for thread_id, current_hits in KeyText.items():
        if thread_id not in OldKeyText:
            dict[thread_id] = copy.deepcopy(current_hits)
            continue

        known = [(old['Text'], old['Author'], old['Time'])
                 for old in OldKeyText[thread_id]]
        fresh = []
        for hit in current_hits:
            if (hit['Text'], hit['Author'], hit['Time']) not in known:
                print(hit)
                fresh.append(hit)
        if fresh:
            dict[thread_id] = copy.deepcopy(fresh)

    print('This is Update ' + str(dict))

    # Roll the current snapshot over into the "previous" slot.
    OldKeyText.clear()
    OldKeyText.update(copy.deepcopy(KeyText))
    KeyText.clear()
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
def FileOpen():
    """Load the previous crawl state from disk.

    ``list.tieba`` is merged into ``OldSave`` and ``key.tieba`` into
    ``OldKeyText``.  Both files hold the ``str()`` form written by
    ``FileSave``.
    """
    # NOTE(review): the file contents are parsed with eval(); anything that
    # can write these files can run code here -- consider ast.literal_eval.
    for path, target in (('list.tieba', OldSave), ('key.tieba', OldKeyText)):
        with open(path, 'r', encoding='utf-8') as handle:
            content = handle.read()
        target.update(eval(content))
280 |
def FileSave():
    """Persist the previous-crawl state to disk.

    ``OldSave`` is written to ``list.tieba`` and ``OldKeyText`` to
    ``key.tieba``, each as its ``str()`` representation in UTF-8.
    """
    for path, state in (('list.tieba', OldSave), ('key.tieba', OldKeyText)):
        with open(path, 'w', encoding='utf-8') as handle:
            handle.write(str(state))
287 |
288 |
289 |
290 |
def ini():
    """Create the initial state files when ``list.tieba`` cannot be opened.

    If opening ``list.tieba`` for reading raises ``OSError``, one full crawl
    is performed, its hit counts become ``OldSave`` and the state is written
    with ``FileSave``.  When the file opens fine nothing is done.
    """
    try:
        # Opening is only a probe; the contents are not read here.
        with open('list.tieba', 'r', encoding='utf-8'):
            pass
    except OSError:
        print('初始化')
        GetText(GetId())
        OldSave.update(copy.deepcopy(Save))
        Save.clear()
        FileSave()
304 |
305 |
306 |
def Main():
    """Run one monitoring cycle and return the notification text.

    Sleeps for ``X``, crawls the configured forums, diffs the result against
    the previous cycle, saves the new state and formats the new hits.

    Returns:
        str: the text built by ``dict_string`` from the new hits.
    """
    time.sleep(X)
    print('开始运行')

    # Forget the results of the previous cycle.
    NewList.clear()
    dict.clear()

    # Crawl, then diff against the stored snapshot.
    GetText(GetId())
    ComparisonDict()
    ComparisonDictKey()

    # Save right away so a later failure does not lose the snapshot.
    FileSave()

    print('This is NewList : {}'.format(NewList))
    print(dict)
    return dict_string(dict)
329 |
330 |
331 |
332 |
def setting():
    """Return a four-line, human-readable summary of the current settings.

    The lines show the monitored forums, the keywords, the sleep period and
    the start-end page range, in that order.
    """
    summary = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
    ]
    return '\n'.join(summary)
339 |
def GetSetting():
    """Return the live settings as ``(keyList, EssentialList, X, Start, End)``."""
    current = (keyList, EssentialList, X, Start, End)
    return current
342 | '''
343 | 帖子地址:https://tieba.baidu.com/p/5806299422
344 | 遥远彼方(12:24):
345 |
346 | '''
347 |
# NOTE(review): the tag patterns in the reviewed copy of this function were
# unreadable (their string literals were broken across lines), so the two
# patterns below are a reconstruction: drop <img ...> tags and turn every
# <br> variant into a space.  Confirm against real post HTML.
_IMG_TAG_RE = re.compile(r'<\s*img\b[^>]*>', re.I)
_BR_TAG_RE = re.compile(r'<\s*br\s*/?\s*>', re.I)


def dict_string(dict):
    """Format new keyword hits as a plain-text notification.

    Args:
        dict: mapping of thread id (str) to a list of hit records, each with
            ``'Author'``, ``'Time'`` and ``'Text'`` entries.

    Returns:
        str: for every thread a ``帖子地址: <url>`` line followed by one
        ``Author(Time):text`` entry per hit, each entry ending in a blank
        line.  Image tags are removed from the text and line-break tags
        become spaces.  An empty mapping yields ``''``.

    The result is also printed before it is returned.
    """
    base_url = 'https://tieba.baidu.com/p/'
    sections = []
    for thread_id, posts in dict.items():
        entries = []
        for post in posts:
            text = _IMG_TAG_RE.sub('', str(post['Text']))
            text = _BR_TAG_RE.sub(' ', text)
            entries.append(
                post['Author'] + '(' + post['Time'] + ')' + ':' + text + '\n\n')
        sections.append(
            '帖子地址: ' + base_url + thread_id + '\n' + ''.join(entries) + '\n')

    res = ''.join(sections)
    print(res)
    return res
371 |
372 |
373 |
374 |
375 | # 更改配置
376 |
def SetSetting(dict):
    """Apply new monitoring settings and write them to TiebaSetting.conf.

    Args:
        dict: mapping with the keys '监控贴吧列表' (forum names),
            '监控关键词' (keywords), '监控周期(S)' (sleep period),
            '开始页数' (first page) and '结束页数' (last page).  The two
            list values are joined with ',' for the file, so they must be
            iterables of strings.

    Returns:
        str: a four-line summary of the settings now in effect.
    """
    global keyList, EssentialList, X, Start, End

    keyList = dict['监控贴吧列表']
    EssentialList = dict['监控关键词']
    X = dict['监控周期(S)']
    Start = dict['开始页数']
    End = dict['结束页数']

    # Re-read the file so options not managed here are kept, then overwrite
    # the managed ones.  Read as utf-8-sig, written back as utf-8.
    parser = configparser.ConfigParser()
    parser.read('TiebaSetting.conf', encoding='utf-8-sig')

    managed_options = (
        ('Setting', 'Sleep', str(X)),
        ('Setting', 'Start', str(Start)),
        ('Setting', 'End', str(End)),
        ('Setting', 'tb', ','.join(keyList)),
        ('Customize', 'Essential', ','.join(EssentialList)),
    )
    for section, option, value in managed_options:
        parser.set(section, option, value)

    with open("TiebaSetting.conf", "w+", encoding='utf-8') as conf_file:
        parser.write(conf_file)

    summary = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
    ]
    return '\n'.join(summary)
402 |
403 |
404 |
--------------------------------------------------------------------------------
/tieba.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import time
4 | import configparser
5 | import TiebaApiUtil
6 | import copy
7 | import re
8 | import datetime
9 | """
10 | 此处用".conf"进行配置
11 | Python 读取写入配置文件 —— ConfigParser
12 | https://www.cnblogs.com/feeland/p/4514771.html
13 |
14 | config = configparser.ConfigParser()
15 | config.read('test.conf')
16 | db_host = config.get("db", "db_port")
17 |
18 | """
19 |
20 |
21 | # 参数配置说明
22 | """
23 | [Setting]
24 | 爬取贴吧
25 | tb:{'国际米兰','linjj','dota','dota2'}
26 |
27 | 爬取默认页数
28 | Start:1
29 | End:4
30 |
31 | [Customize]
32 | 关键词
33 | Essential:{'昨天','今天','明天','后天'}
34 |
35 | """
36 |
37 |
# Load the monitor settings from TiebaSetting.conf.  The file is decoded as
# utf-8-sig rather than plain utf-8, which tolerates a leading byte-order
# mark.
config = configparser.ConfigParser()
config.read('TiebaSetting.conf', encoding='utf-8-sig')

# Forum names to crawl, stored as one comma-separated option.
key = config.get("Setting", "tb")
keyList = key.split(',')

# Keywords searched for in every post, also comma-separated.
Essential = config.get("Customize", "Essential")
EssentialList = Essential.split(',')

# Reference date plus mode flag, written as "YYYY-M-D-flag".
Time = config.get("Setting", "Time")

# Pause handed to time.sleep() at the start of every Main() cycle.
X = config.getint("Setting", "Sleep")

# First and last forum page passed to TiebaApiUtil.GetPage.
Start = config.getint("Setting", "Start")
End = config.getint("Setting", "End")

# Year at import time; TimeLimit uses it for stamps that carry no year.
Year = datetime.datetime.now().year

# [year, month, day, flag] as strings, consumed by TimeLimit.
refer_to = Time.split('-')
58 |
59 | # 信息存取列表
60 | Save = {}
61 |
62 | # 读取文件或前一个爬取列表
63 |
64 | OldSave = {}
65 |
66 | # 新出现的帖子
67 |
68 | NewList = []
69 |
70 |
71 | # 获取ID
def GetId():
    """Collect the thread ids listed on the configured pages of each forum.

    Every name in ``keyList`` is passed to ``TiebaApiUtil.GetPage`` together
    with the configured ``Start`` and ``End`` pages.  The returned string is
    evaluated into a dict whose ``'Page'`` entry is a list of dicts; each
    value of those dicts is a list of thread records carrying an ``'Id'``.

    Returns:
        list: the ``'Id'`` of every thread record, in the order encountered.
    """
    thread_ids = []
    for forum in keyList:
        raw_listing = TiebaApiUtil.GetPage(key=forum, Start=Start, End=End)
        # NOTE(review): eval() runs whatever expression the API layer hands
        # back; if that text can be influenced by scraped content this is a
        # code-execution risk -- consider ast.literal_eval.
        listing = eval(raw_listing)
        for page in listing['Page']:
            for records in page.values():
                thread_ids.extend(record['Id'] for record in records)
    return thread_ids
84 |
85 |
86 | # 存取相关关键词详细信息
87 |
88 | OldKeyText = {}
89 | KeyText = {}
90 |
91 |
92 |
93 |
94 |
95 | # 获取TEXT 以及对比
def GetText(list):
    """Scan every thread in ``list`` for keyword hits and record them.

    For each thread id the full thread is fetched with
    ``TiebaApiUtil.GetTiebaOne`` and evaluated into a sequence of floor
    records.  A floor, and each reply in its ``'FloorInFloor'`` list, counts
    as a hit when ``OneToOne`` finds a keyword in its ``'Text'``.

    Side effects:
        * ``Save[str(id)]`` is incremented once per keyword hit, whether or
          not the hit passes the time filter.
        * ``KeyText[str(id)]`` is set, for every thread with at least one
          keyword hit, to the hits (``Author`` / ``Text`` / ``Time``) whose
          own timestamp passes ``TimeLimit``; the list may be empty.

    Each nested reply is filtered by its own ``'Time'``, not by the time of
    the floor it hangs under.

    Args:
        list: iterable of thread ids, as produced by ``GetId``.
    """
    for thread_id in list:
        save_key = str(thread_id)
        matched = False  # any keyword hit in this thread, filtered or not
        hits = []

        raw_thread = TiebaApiUtil.GetTiebaOne(thread_id)
        # NOTE(review): eval() on text derived from scraped pages; consider
        # ast.literal_eval if the payload is always a plain literal.
        floors = eval(raw_thread)

        for floor in floors:
            try:
                # The floor itself.
                if OneToOne(floor['Text']):
                    matched = True
                    hit = {
                        'Author': floor['Author'],
                        'Text': floor['Text'],
                        'Time': floor['Time'],
                    }
                    if TimeLimit(floor['Time']):
                        print(floor['Time'])
                        hits.append(hit)
                    Save[save_key] = Save.get(save_key, 0) + 1

                # Replies nested under the floor, when there are any.
                nested = floor['FloorInFloor']
                if nested not in ('', None, []):
                    for reply in nested:
                        if OneToOne(reply['Text']):
                            matched = True
                            hit = {
                                'Author': reply['Author'],
                                'Text': reply['Text'],
                                'Time': reply['Time'],
                            }
                            if TimeLimit(reply['Time']):
                                print(reply['Time'])
                                hits.append(hit)
                            Save[save_key] = Save.get(save_key, 0) + 1
            except TypeError:
                # Best effort: a record that cannot be searched or iterated
                # is skipped and the rest of that floor is abandoned.
                pass

        if matched:
            KeyText[save_key] = copy.deepcopy(hits)
168 |
169 |
170 |
171 |
def OneToOne(Text):
    """Return True when ``Text`` contains any keyword from ``EssentialList``.

    When no keyword occurs the function falls off the end, so the result is
    ``None`` (falsy) rather than ``False``.
    """
    if any(keyword in Text for keyword in EssentialList):
        return True
176 |
177 |
178 | '''
179 |
180 | # 信息存取列表
181 | Save = {}
182 |
183 | # 读取文件或前一个爬取列表
184 |
185 | NewSave = {}
186 |
187 | '''
def ComparisonDict():
    """Work out which threads changed since the previous crawl.

    A thread id from ``Save`` is appended to ``NewList`` when it is missing
    from ``OldSave`` or when its hit count differs from the stored one.
    Afterwards ``OldSave`` is replaced by a copy of ``Save`` and ``Save`` is
    emptied, ready for the next cycle.  Both snapshots and the resulting
    ``NewList`` are printed for tracing.

    Unlike a positional scan over ``OldSave``, the membership test also
    works when ``OldSave`` is empty, so the very first hits are reported
    instead of being dropped.
    """
    print('This is OldSave: ' + str(OldSave))
    print('This is Save: ' + str(Save))

    for thread_id, hit_count in Save.items():
        if thread_id not in OldSave or OldSave[thread_id] != hit_count:
            NewList.append(str(thread_id))

    print("This NewList :" + str(NewList))

    # Roll the current snapshot over into the "previous" slot.
    OldSave.clear()
    OldSave.update(copy.deepcopy(Save))
    Save.clear()
213 | # 5800836228 3244759899
214 |
215 | # 程序每次完成循环都要存取一次
216 |
217 | # 判断NewList 是否为空
218 |
219 | NewKey = {}
220 | dict = {}
221 |
def ComparisonDictKey():
    """Collect the keyword hits that were not present in the previous crawl.

    For every thread in ``KeyText``:

    * if the thread is absent from ``OldKeyText`` all of its hits are new;
    * otherwise a hit is new when no stored hit has the same ``Text``,
      ``Author`` and ``Time``.

    New hits are stored in the module-level ``dict`` under the thread id
    (a known thread without new hits gets no entry).  Finally
    ``OldKeyText`` is replaced by a copy of ``KeyText`` and ``KeyText`` is
    emptied.

    A thread whose previous hit list is empty is handled like any other:
    every current hit is reported.
    """
    for thread_id, current_hits in KeyText.items():
        if thread_id not in OldKeyText:
            dict[thread_id] = copy.deepcopy(current_hits)
            continue

        known = [(old['Text'], old['Author'], old['Time'])
                 for old in OldKeyText[thread_id]]
        fresh = []
        for hit in current_hits:
            if (hit['Text'], hit['Author'], hit['Time']) not in known:
                print(hit)
                fresh.append(hit)
        if fresh:
            dict[thread_id] = copy.deepcopy(fresh)

    print('This is Update ' + str(dict))

    # Roll the current snapshot over into the "previous" slot.
    OldKeyText.clear()
    OldKeyText.update(copy.deepcopy(KeyText))
    KeyText.clear()
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
def FileOpen():
    """Load the previous crawl state from disk.

    ``list.tieba`` is merged into ``OldSave`` and ``key.tieba`` into
    ``OldKeyText``.  Both files hold the ``str()`` form written by
    ``FileSave``.
    """
    # NOTE(review): the file contents are parsed with eval(); anything that
    # can write these files can run code here -- consider ast.literal_eval.
    for path, target in (('list.tieba', OldSave), ('key.tieba', OldKeyText)):
        with open(path, 'r', encoding='utf-8') as handle:
            content = handle.read()
        target.update(eval(content))
279 |
def FileSave():
    """Persist the previous-crawl state to disk.

    ``OldSave`` is written to ``list.tieba`` and ``OldKeyText`` to
    ``key.tieba``, each as its ``str()`` representation in UTF-8.
    """
    for path, state in (('list.tieba', OldSave), ('key.tieba', OldKeyText)):
        with open(path, 'w', encoding='utf-8') as handle:
            handle.write(str(state))
286 |
287 |
288 |
289 |
def ini():
    """Create the initial state files when ``list.tieba`` cannot be opened.

    If opening ``list.tieba`` for reading raises ``OSError``, one full crawl
    is performed, its hit counts become ``OldSave`` and the state is written
    with ``FileSave``.  When the file opens fine nothing is done.
    """
    try:
        # Opening is only a probe; the contents are not read here.
        with open('list.tieba', 'r', encoding='utf-8'):
            pass
    except OSError:
        print('初始化')
        GetText(GetId())
        OldSave.update(copy.deepcopy(Save))
        Save.clear()
        FileSave()
303 |
304 |
305 |
def Main():
    """Run one monitoring cycle and return the notification text.

    Sleeps for ``X``, crawls the configured forums, diffs the result against
    the previous cycle, saves the new state and formats the new hits.

    Returns:
        str: the text built by ``dict_string`` from the new hits.
    """
    time.sleep(X)
    print('开始运行')

    # Forget the results of the previous cycle.
    NewList.clear()
    dict.clear()

    # Crawl, then diff against the stored snapshot.
    GetText(GetId())
    ComparisonDict()
    ComparisonDictKey()

    # Save right away so a later failure does not lose the snapshot.
    FileSave()

    print('This is NewList : {}'.format(NewList))
    return dict_string(dict)
328 |
329 |
330 |
331 |
def setting():
    """Return a five-line, human-readable summary of the current settings.

    The lines show the monitored forums, the keywords, the sleep period, the
    start-end page range and the configured reference time, in that order.
    """
    summary = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
        '监控时间:' + str(Time),
    ]
    return '\n'.join(summary)
338 |
def GetSetting():
    """Return the live settings as ``(keyList, EssentialList, X, Start, End)``."""
    current = (keyList, EssentialList, X, Start, End)
    return current
341 | '''
342 | 帖子地址:https://tieba.baidu.com/p/5806299422
343 | 遥远彼方(12:24):
344 |
345 | '''
346 |
# NOTE(review): the tag patterns in the reviewed copy of this function were
# unreadable (string literals broken across lines, one pattern empty), so
# the two patterns below are a reconstruction: drop <img ...> tags and turn
# every <br> variant into a space.  A third tag that used to be stripped
# could not be recovered.  Confirm against real post HTML.
_IMG_TAG_RE = re.compile(r'<\s*img\b[^>]*>', re.I)
_BR_TAG_RE = re.compile(r'<\s*br\s*/?\s*>', re.I)


def dict_string(dict):
    """Format new keyword hits as a plain-text notification.

    Args:
        dict: mapping of thread id (str) to a list of hit records, each with
            ``'Author'``, ``'Time'`` and ``'Text'`` entries.

    Returns:
        str: for every thread a ``帖子地址: <url>`` line followed by one
        ``Author(Time):text`` line per hit and a closing blank line.  Image
        tags are removed from the text and line-break tags become spaces.
        An empty mapping yields ``''``.
    """
    base_url = 'https://tieba.baidu.com/p/'
    sections = []
    for thread_id, posts in dict.items():
        entries = []
        for post in posts:
            text = _IMG_TAG_RE.sub('', str(post['Text']))
            text = _BR_TAG_RE.sub(' ', text)
            entries.append(
                post['Author'] + '(' + post['Time'] + ')' + ':' + text + '\n')
        sections.append(
            '帖子地址: ' + base_url + thread_id + '\n' + ''.join(entries) + '\n')

    return ''.join(sections)
370 |
371 |
372 |
373 |
374 | # 更改配置
375 |
def SetSetting(dict):
    """Apply new monitoring settings and write them to TiebaSetting.conf.

    Args:
        dict: mapping with the keys '监控贴吧列表' (forum names),
            '监控关键词' (keywords), '监控周期(S)' (sleep period),
            '开始页数' (first page) and '结束页数' (last page).  The two
            list values are joined with ',' for the file, so they must be
            iterables of strings.

    Returns:
        str: a four-line summary of the settings now in effect.
    """
    global keyList, EssentialList, X, Start, End

    keyList = dict['监控贴吧列表']
    EssentialList = dict['监控关键词']
    X = dict['监控周期(S)']
    Start = dict['开始页数']
    End = dict['结束页数']

    # Re-read the file so options not managed here are kept, then overwrite
    # the managed ones.  Read as utf-8-sig, written back as utf-8.
    parser = configparser.ConfigParser()
    parser.read('TiebaSetting.conf', encoding='utf-8-sig')

    managed_options = (
        ('Setting', 'Sleep', str(X)),
        ('Setting', 'Start', str(Start)),
        ('Setting', 'End', str(End)),
        ('Setting', 'tb', ','.join(keyList)),
        ('Customize', 'Essential', ','.join(EssentialList)),
    )
    for section, option, value in managed_options:
        parser.set(section, option, value)

    with open("TiebaSetting.conf", "w+", encoding='utf-8') as conf_file:
        parser.write(conf_file)

    summary = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
    ]
    return '\n'.join(summary)
401 |
402 |
403 | # 后前...0/1
404 | # eg 2018-7-3-1
405 | # Year Month Day
406 | # 默认 0000-00-00
407 |
def TimeLimit(time):
    """Tell whether a post timestamp lies on the configured side of the
    reference date.

    The reference comes from ``refer_to`` (``[year, month, day, flag]``):
    with flag ``0`` a stamp passes when it is on or after the reference
    date, with any other flag when it is on or before it.  Dates are
    compared chronologically (year, then month, then day).

    Args:
        time: timestamp text.  Recognised forms are ``MM-DD`` (the year is
            taken from the module-level ``Year``) and ``YYYY-MM-DD``; a
            trailing `` HH:MM`` part is ignored.

    Returns:
        bool: True when the stamp passes the filter; False when it does not
        or when the text is in neither recognised form (for example a bare
        ``HH:MM``).
    """
    reference = (int(refer_to[0]), int(refer_to[1]), int(refer_to[2]))
    keep_on_or_after = int(refer_to[-1]) == 0

    # Only the date part matters; drop anything after the first space.
    fields = time.strip().split(' ')[0].split('-')
    try:
        numbers = [int(field) for field in fields]
    except ValueError:
        return False

    if len(numbers) == 2:
        stamp = (Year, numbers[0], numbers[1])
    elif len(numbers) == 3:
        stamp = (numbers[0], numbers[1], numbers[2])
    else:
        return False

    # Tuple comparison is lexicographic, i.e. chronological here.
    if keep_on_or_after:
        return stamp >= reference
    return stamp <= reference
442 |
443 |
444 |
445 |
446 |
--------------------------------------------------------------------------------