├── TiebaStrat.bat
├── TiebaSetting.conf
├── Sentiment
    ├── TiebaSetting.conf
    ├── README.md
    └── tieba.py
├── README.MD
├── ApiRedme
    ├── README.md
    └── TiebaApiUtil.py
├── myitchat.py
├── myitchat
    └── myitchat.py
├── TiebaApiUtil.py
└── tieba.py


/TiebaStrat.bat:
--------------------------------------------------------------------------------
1 | python .\myitchat.py


--------------------------------------------------------------------------------
/TiebaSetting.conf:
--------------------------------------------------------------------------------
 1 | [Setting]
 2 | sleep = 1
 3 | tb = 林俊杰,周杰伦
 4 | start = 1
 5 | end = 2
 6 | Time = 2018-10-14-1
 7 | [Customize]
 8 | essential =演唱会
 9 | 
10 | 


--------------------------------------------------------------------------------
/Sentiment/TiebaSetting.conf:
--------------------------------------------------------------------------------
 1 | # 配置文件注释
 2 | [Setting]
 3 | 
 4 | Sleep = 18000
 5 | # 贴吧名 贴吧用英文逗号(,)分隔符分割
 6 | tb = 武汉东湖学院,汉口学院,武汉学院
 7 | 
 8 | # 贴吧页数指定爬取 默认起始页Start=1 End = 3
 9 | 
10 | Start = 1
11 | 
12 | End = 10
13 | 
14 | [Customize]
15 | 
16 | # 关键词
17 | 
18 | Essential = 留学,雅思,托福,新东方,新航道,出国
19 | 


--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
  1 | # Tieba API Sentiment
  2 | 
  3 | 贴吧API以及关键词提醒，舆情监控,微信陪聊机器人。
  4 | 
  5 | :smiley::smiley::smiley::smiley::smiley:
  6 | 
  7 | 
  8 | 
  9 | ## Time update
 10 | 
 11 | - 2018/7/3 立项
 12 | - 2018/7/12 完成设计
 13 | - 2018/7/14 完成API部分
 14 | - 2018/7/19 修复API部分，完成关键词监控部分
 15 | - 2018/7/20 关键词监控部分上线，微信监控部分测试...
 16 | - 2018/7/20 修复初始化问题
 17 | - 2018/7/23 更新Sentiment人性化设置，加入全新微信机器人监控(myitchat.py)
 18 | - 2018/7/25 更新myitchat以及Sentiment API，添加配置修改内容
 19 | - 2018/7/25 Beta V0.1 上线 
 20 | - 2018/7/27 修复若干问题,正式版V1.0上线
 21 | - 2018/7/28 修复因访问速度过快引起的连接重置的bug
 22 | - 2018/10/15 增加时间过滤，修复若干BUG。
 23 | - 2019/6/28 web版失效
 24 | - 2020/4/29 web版生效
 25 | 
 26 | 
 27 | ## 相关依赖
 28 | 	python 3+
 29 | 	pip install requests
 30 | 	pip install beautifulsoup4 lxml
 31 | 	pip install itchat
 32 | 
 33 | 
 34 | ## 使用方法
 35 | 
 36 | ### Windows：
 37 | 
 38 | 点击目录下的：
 39 | ***TiebaStrat.bat***
 40 | 
 41 | ![](http://img.lunatic.wang/win.jpg)
 42 | 
 43 | 
 44 | ### Windows/Linux：
 45 | 
 46 | CMD/powershell：***python myitchat.py***
 47 | 
 48 | ![](http://cdn.lunatic.wang/linux.jpg)
 49 | 
 50 | 
 51 | 扫码完成后即可开启贴吧微信助手。
 52 | 
 53 | 
 54 | 
 55 | ### 微信机器人
 56 | 
 57 | ![](http://cdn.lunatic.wang/robet.PNG)
 58 | 
 59 | 
 60 | ### 关键词舆情监控
 61 | 
 62 | 编辑TiebaSetting.conf：
 63 | 
 64 | 	[Setting]
 65 | 	# 休眠周期/秒
 66 | 	sleep = 300
 67 | 	# 监控贴吧
 68 | 	tb = 林俊杰,周杰伦,王力宏
 69 | 	# 开始监控页数
 70 | 	start = 1
 71 | 	# 结束监控页数
 72 | 	end = 3
 73 | 	# 时间控制
 74 | 	Time = 2018-10-14-0
 75 | 	# Time 的最后一位为过滤方式(0/1)  0:过滤Time之前 1:过滤Time之后 
 76 | 	
 77 | 	[Customize]
 78 | 	# 关键词
 79 | 	essential = 演唱会,华语乐坛
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 
 85 | 1. 启动微信扫码登陆
 86 | 2. 输入“开启监控”,即可记录该用户ID并且在之后的监控数据以及配置修改都只有该用户可执行。
 87 | ![](http://cdn.lunatic.wang/wx1.png)
 88 | 3. 输入“修改配置”，复制模板修改并且发送即可修改。
 89 | ![](http://cdn.lunatic.wang/wx5.png)
 90 | 4. 享受结果。
 91 | 
 92 | ![](http://cdn.lunatic.wang/wx6.PNG)
 93 | 
 94 | ----------
 95 | ## 详细思路
 96 | 
 97 | 访问我的博客：
 98 | 
 99 | [贴吧监控助手](http://lunatic.wang/posts/d1eb00bb/)
100 | 
101 | ----------
102 | ## 自定义第三方拓展
103 | 
104 | - 第三方贴吧请以 *TieBa API Util*  拓展
105 | - 舆情功能拓展以及其他功能拓展 *Sentiment API*
106 | - 微信机器人拓展 *myitchat*
107 | 
108 | 以上为建议拓展文件衍生。
109 | 
110 | ----------
111 | ## TieBa API Util 
112 | 
113 | - 获取官方贴吧信息
114 | - 高度自由化的信息收集
115 | - 可制作第三方客户端
116 | - 回复以及楼中楼形式
117 | 
118 | [<font size=4>Tieba API 相关说明</font>](/ApiRedme)
119 | 
120 | ## Sentiment API 
121 | 
122 | - 自定义贴吧监控
123 | - 自定义关键词监控
124 | - 自定义更新周期
125 | - 可定制化的信息收集
126 | - 可作为其他项目工具
127 | 
128 | 
129 | [<font size=4>Sentiment API 相关说明</font>](/Sentiment)
130 | 
131 | ## myitchat
132 | 
133 | - 图灵机器人
134 | - 智能对话
135 | - 微信监控舆情
136 | - 智能化控制
137 | 
138 | [<font size=4>myitchat 相关说明</font>](/myitchat)
139 | 
140 | 
141 | 
142 | ## 其他说明
143 | 
144 | 请勿用于商业用途
145 | 


--------------------------------------------------------------------------------
/ApiRedme/README.md:
--------------------------------------------------------------------------------
  1 | # Tieba API Util 
  2 | 
  3 | :smiley: :smiley: :smiley: :smiley:
  4 | 
  5 | 
  6 | ## TieBa API
  7 | 
  8 | 
  9 | ### 相关依赖
 10 | 	pip install requests
 11 | 	
 12 | 	pip install beautifulsoup4 lxml
 13 | 
 14 | ### 使用方法
 15 | 
 16 | 	import TiebaApiUtil
 17 | 
 18 | 内置两个方法，分别是GetPage/GetTiebaOne。
 19 | 
 20 | 
 21 | ### GetPage
 22 | 
 23 | 调用方法
 24 | 
 25 | 	TiebaApiUtil.GetPage(key,Start,End)
 26 | 
 27 | 参数说明：
 28 | 
 29 | 
 30 | * key：贴吧关键词
 31 | 	
 32 | 需要访问的贴吧关键词，譬如：'国际米兰'。
 33 | 
 34 | * Start：开始页数
 35 | 
 36 | 默认为1，不能为负。
 37 | 
 38 | * End：结束页数
 39 | 
 40 | 默认为3，不能为负。
 41 | 
 42 | 该函数返回Json，返回形式为：
 43 | 
 44 | * key:贴吧名称
 45 | * Page：
 46 | 	* X：贴吧当前页数
 47 | 		* Id：帖子ID
 48 | 		* Title：帖子标题
 49 | 		* Reply：帖子回复数
 50 | 		* Author：帖子作者
 51 | 		* Time：最后回复时间
 52 | 
 53 | 
 54 | <br>
 55 | 
 56 | 	TiebaApiUtil.GetPage(key='国际米兰',Start=1,End=3)
 57 | 如下：
 58 | <br>	
 59 | 
 60 | 	
 61 | 
 62 | 	{
 63 | 		
 64 | 		"key":"国际米兰",
 65 | 		"Page":[
 66 | 				{
 67 | 				"1":[
 68 | 					{
 69 | 						"Id":"5793707221",
 70 | 						"Title":"官方:埃德尔转会江苏苏宁",
 71 | 						"Reply":"102",
 72 | 						"Author":"树欲动而风又止",
 73 | 						"Time":"23:34"
 74 | 					},
 75 | 					{
 76 | 						"Id":"5793703055",
 77 | 						"Title":"国际米兰新闻晚报，7月13日",
 78 | 						"Reply":"254",
 79 | 						"Author":"wyp861025",
 80 | 						"Time":"23:44"
 81 | 					}
 82 | 					]
 83 | 				},
 84 | 				{
 85 | 				"2":[
 86 | 					{
 87 | 						"Id":"5793707221",
 88 | 						"Title":"官方:埃德尔转会江苏苏宁",
 89 | 						"Reply":"102",
 90 | 						"Author":"树欲动而风又止",
 91 | 						"Time":"23:34"
 92 | 					},
 93 | 					{
 94 | 						"Id":"5793703055",
 95 | 						"Title":"国际米兰新闻晚报，7月13日",
 96 | 						"Reply":"254",
 97 | 						"Author":"wyp861025",
 98 | 						"Time":"23:44"
 99 | 					}
100 | 					]
101 | 				}
102 | 				]
103 | 
104 | 	}
105 | 
106 | 
107 | ### GetTiebaOne
108 | 
109 | 使用方法:
110 | 
111 | 	TiebaApiUtil.GetTiebaOne(ID)
112 | 参数说明：
113 | 
114 | * Id:帖子唯一ID
115 | 
116 | 该函数返回Json列表，列表中每一楼（每个元素）的形式为：
117 | 
118 | * Text：内容
119 | * Author：用户
120 | * Time：时间
121 | * FloorInFloor：楼中楼
122 | 	* Text：内容
123 | 	* Author：用户
124 | 	* Time：时间
125 | 
126 | <br>
127 | 
128 | 
129 | 	TiebaApiUtil.GetTiebaOne(5789990094)
130 | 
131 | <br>
132 | 
133 | 
134 | 	{
135 | 		"Text":"<img src=\"http://tb2.bdstatic.com/tb/editor/images/client/image_emoticon25.png\"/>这么容易就爆照的，不是抠脚就是快餐<br/>",
136 | 		"Author":"一涵呦",
137 | 		"Time":"7-10 20:56",
138 | 		"FloorInFloor":[
139 | 				{
140 | 				"Text":"快餐是啥",
141 | 				"Author":"可爱的Hjkjbb",
142 | 				"Time":"14:25"
143 | 				},
144 | 				{
145 | 				"Text":"回复 <a href=\"i?un=言清欢🍒🔯🔯\">言清欢🍒🔯🔯</a> :快餐就是我们平常点的外卖。",
146 | 				"Author":"李坤铭12",
147 | 				"Time":"15:16"
148 | 				},
149 | 				{
150 | 				"Text":"回复 言清欢🍒🔯🔯 ：打一次就走的，不过夜的<img src=\"http://tb2.bdstatic.com/tb/editor/images/client/image_emoticon68.png\"/><img src=\"http://tb2.bdstatic.com/tb/editor/images/client/image_emoticon68.png\"/>",
151 | 				"Author":"啦啦队长15",
152 | 				"Time":"15:24"
153 | 				},
154 | 				{
155 | 				"Text":"回复 <a href=\"i?un=米破是张小恒\">米破是张小恒</a> :你小弟弟就这么粗。",
156 | 				"Author":"让我鸡儿放会假",
157 | 				"Time":"16:47"
158 | 				},
159 | 				{
160 | 				"Text":"卖茶叶的",
161 | 				"Author":"天生的she手",
162 | 				"Time":"20:12"
163 | 				}
164 | 			]
165 | 	}
166 | 
167 | 
168 | 
169 | 
170 | 


--------------------------------------------------------------------------------
/myitchat.py:
--------------------------------------------------------------------------------
  1 | import itchat
  2 | import requests
  3 | import tieba
  4 | import threading
  5 | import json
  6 | import platform
  7 | 
  8 | KEY = 'xxxxxxx'
  9 | 
 10 | T = 0
 11 | 
 12 | def get_response(msg, UserId):
 13 |     # 这里我们就像在“3. 实现最简单的与图灵机器人的交互”中做的一样
 14 |     # 构造了要发送给服务器的数据
 15 |     apiUrl = 'http://openapi.tuling123.com/openapi/api/v2'
 16 | 
 17 | 
 18 |     data = {
 19 |     "reqType": 0,
 20 |     "perception": {
 21 |         "inputText": {
 22 |             "text": msg
 23 |         },
 24 |         "inputImage": {
 25 |             "url": ""
 26 |         },
 27 |         "selfInfo": {
 28 |             "location": {
 29 |                 "city": "",
 30 |                 "province": "",
 31 |                 "street": ""
 32 |             }
 33 |         }
 34 |     },
 35 |     "userInfo": {
 36 |         "apiKey": "42afd1a6112f4a93bbaa83022d980132",
 37 |         "userId": str(UserId)[1:33]
 38 |     }
 39 | }
 40 | 
 41 |     print(data)
 42 |     try:
 43 |         r = requests.post(apiUrl, data=json.dumps(data)).json()
 44 |         # 字典的get方法在字典没有'text'值的时候会返回None而不会抛出异常
 45 | 
 46 |         r = r['results']
 47 |         r = r[0]
 48 |         r = r['values']
 49 |         return r['text']
 50 |     # 为了防止服务器没有正常响应导致程序异常退出，这里用try-except捕获了异常
 51 |     # 如果服务器没能正常交互（返回非json或无法连接），那么就会进入下面的return
 52 |     except:
 53 |         # 将会返回一个None
 54 |         return
 55 | 
 56 | 
 57 | id = ''
 58 | 
 59 | 
 60 | @itchat.msg_register(itchat.content.TEXT)
 61 | def print_content(msg):
 62 |     # print(msg)
 63 |     global id
 64 |     print('id = |'+ id)
 65 |     print(msg['Text'] == '修改配置')
 66 |     print(id == msg['FromUserName'])
 67 |     if msg['Text'] == '开启监控' and (id == '' or id ==None ):
 68 | 
 69 |         # 引用全局变量
 70 |         id = msg['FromUserName']
 71 |         itchat.send_msg('已经开启监控~', toUserName=id)
 72 |         itchat.send_msg(tieba.setting(), toUserName=id)
 73 |         return
 74 | 
 75 |     if msg['Text'] == '修改配置' and id == msg['FromUserName']:
 76 |         Setting = tieba.GetSetting()
 77 |         a = {
 78 |             '监控贴吧列表':Setting[0],
 79 |             '监控关键词':Setting[1],
 80 |             '监控周期(S)':Setting[2],
 81 |             '开始页数':Setting[3],
 82 |             '结束页数': Setting[4]
 83 |         }
 84 |         itchat.send_msg('修改以下列信息，并且将修改后的信息复制发送', toUserName=id)
 85 | 
 86 |         itchat.send_msg(str(a), toUserName=id)
 87 |         return
 88 | 
 89 |     if  '监控贴吧列表' in msg['Text']:
 90 |         global T
 91 |         T = 1
 92 | 
 93 |         son = tieba.SetSetting(eval(msg['Text']))
 94 |         return son
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 |     # 这次对接收信息做一次判断
101 |     sentence = msg['Text']
102 | 
103 |     # 如果用户发送的是YYF则执行刷任务
104 |     print(msg['FromUserName'])
105 | 
106 | 
107 | 
108 |     return get_response(msg['Text'],msg['FromUserName'])
109 | 
def Main():
    """Poll tieba.Main() forever and forward each usable result over WeChat.

    A result is skipped when it is empty (None, [], '' or a single space) or
    longer than 3452 items/characters. Otherwise it is sent to the contact
    stored in the module-level ``id``.

    This function never returns.
    """
    global T
    max_len = 3452  # results longer than this are not sent

    while True:
        C = tieba.Main()
        print('This is myitchat: ' + str(C))
        print(T)

        # Nothing worth sending, or too large to send.
        if not C or C == ' ' or len(C) > max_len:
            continue

        if T == 1:
            T = 0
            print('改变了T: ' + str(T))

        # NOTE(review): T is reset just above, so this check only fails if the
        # message handler sets T again in between -- confirm whether a round
        # should be skipped after a configuration change.
        if T == 0:
            itchat.send_msg('监控到更新的数据  \n \n' + str(C), toUserName=id)
128 | 
129 | 
130 | 
131 | 
132 | 
# Choose the login call according to the host operating system.
sysstr = platform.system()

if sysstr == "Windows":
    itchat.auto_login(hotReload=True)
elif sysstr == "Linux":
    itchat.auto_login(enableCmdQR=2)
else:
    # Other platforms (e.g. macOS) used to skip login entirely, so run()
    # started without a session; fall back to itchat's default login.
    itchat.auto_login()


# blockThread=False keeps itchat.run() from blocking this thread.
itchat.run(blockThread=False)
tieba.ini()
# Pass the function itself as the target. Writing Main() here would run the
# endless monitoring loop in this thread and never create the Thread.
tie = threading.Thread(target=Main)
tie.start()
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/myitchat/myitchat.py:
--------------------------------------------------------------------------------
  1 | import itchat
  2 | import requests
  3 | import tieba
  4 | import threading
  5 | import json
  6 | import platform
  7 | 
  8 | KEY = 'xxxxxxx'
  9 | 
 10 | T = 0
 11 | 
 12 | def get_response(msg, UserId):
 13 |     # 这里我们就像在“3. 实现最简单的与图灵机器人的交互”中做的一样
 14 |     # 构造了要发送给服务器的数据
 15 |     apiUrl = 'http://openapi.tuling123.com/openapi/api/v2'
 16 | 
 17 | 
 18 |     data = {
 19 |     "reqType": 0,
 20 |     "perception": {
 21 |         "inputText": {
 22 |             "text": msg
 23 |         },
 24 |         "inputImage": {
 25 |             "url": ""
 26 |         },
 27 |         "selfInfo": {
 28 |             "location": {
 29 |                 "city": "",
 30 |                 "province": "",
 31 |                 "street": ""
 32 |             }
 33 |         }
 34 |     },
 35 |     "userInfo": {
 36 |         "apiKey": "42afd1a6112f4a93bbaa83022d980132",
 37 |         "userId": str(UserId)[1:33]
 38 |     }
 39 | }
 40 | 
 41 |     print(data)
 42 |     try:
 43 |         r = requests.post(apiUrl, data=json.dumps(data)).json()
 44 |         # 字典的get方法在字典没有'text'值的时候会返回None而不会抛出异常
 45 | 
 46 |         r = r['results']
 47 |         r = r[0]
 48 |         r = r['values']
 49 |         return r['text']
 50 |     # 为了防止服务器没有正常响应导致程序异常退出，这里用try-except捕获了异常
 51 |     # 如果服务器没能正常交互（返回非json或无法连接），那么就会进入下面的return
 52 |     except:
 53 |         # 将会返回一个None
 54 |         return
 55 | 
 56 | 
 57 | id = ''
 58 | 
 59 | 
 60 | @itchat.msg_register(itchat.content.TEXT)
 61 | def print_content(msg):
 62 |     # print(msg)
 63 |     global id
 64 |     print('id = |'+ id)
 65 |     print(msg['Text'] == '修改配置')
 66 |     print(id == msg['FromUserName'])
 67 |     if msg['Text'] == '开启监控' and (id == '' or id ==None ):
 68 | 
 69 |         # 引用全局变量
 70 |         id = msg['FromUserName']
 71 |         itchat.send_msg('已经开启监控~', toUserName=id)
 72 |         itchat.send_msg(tieba.setting(), toUserName=id)
 73 |         return
 74 | 
 75 |     if msg['Text'] == '修改配置' and id == msg['FromUserName']:
 76 |         Setting = tieba.GetSetting()
 77 |         a = {
 78 |             '监控贴吧列表':Setting[0],
 79 |             '监控关键词':Setting[1],
 80 |             '监控周期(S)':Setting[2],
 81 |             '开始页数':Setting[3],
 82 |             '结束页数': Setting[4]
 83 |         }
 84 |         itchat.send_msg('修改以下列信息，并且将修改后的信息复制发送', toUserName=id)
 85 | 
 86 |         itchat.send_msg(str(a), toUserName=id)
 87 |         return
 88 | 
 89 |     if  '监控贴吧列表' in msg['Text']:
 90 |         global T
 91 |         T = 1
 92 | 
 93 |         son = tieba.SetSetting(eval(msg['Text']))
 94 |         return son
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 |     # 这次对接收信息做一次判断
101 |     sentence = msg['Text']
102 | 
103 |     # 如果用户发送的是YYF则执行刷任务
104 |     print(msg['FromUserName'])
105 | 
106 | 
107 | 
108 |     return get_response(msg['Text'],msg['FromUserName'])
109 | 
def Main():
    """Poll tieba.Main() forever and forward each usable result over WeChat.

    A result is skipped when it is empty (None, [], '' or a single space) or
    longer than 3452 items/characters. Otherwise it is sent to the contact
    stored in the module-level ``id``.

    This function never returns.
    """
    global T
    max_len = 3452  # results longer than this are not sent

    while True:
        C = tieba.Main()
        print('This is myitchat: ' + str(C))
        print(T)

        # Nothing worth sending, or too large to send.
        if not C or C == ' ' or len(C) > max_len:
            continue

        if T == 1:
            T = 0
            print('改变了T: ' + str(T))

        # NOTE(review): T is reset just above, so this check only fails if the
        # message handler sets T again in between -- confirm whether a round
        # should be skipped after a configuration change.
        if T == 0:
            itchat.send_msg('监控到更新的数据  \n \n' + str(C), toUserName=id)
128 | 
129 | 
130 | 
131 | 
132 | 
# Choose the login call according to the host operating system.
sysstr = platform.system()

if sysstr == "Windows":
    itchat.auto_login(hotReload=True)
elif sysstr == "Linux":
    itchat.auto_login(enableCmdQR=2)
else:
    # Other platforms (e.g. macOS) used to skip login entirely, so run()
    # started without a session; fall back to itchat's default login.
    itchat.auto_login()


# blockThread=False keeps itchat.run() from blocking this thread.
itchat.run(blockThread=False)
tieba.ini()
# Pass the function itself as the target. Writing Main() here would run the
# endless monitoring loop in this thread and never create the Thread.
tie = threading.Thread(target=Main)
tie.start()
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/Sentiment/README.md:
--------------------------------------------------------------------------------
  1 | # Sentiment API
  2 | 
  3 | ---
  4 | 
  5 | :tokyo_tower: :tokyo_tower: :tokyo_tower: 
  6 | 
  7 | 
  8 | ## Sentiment
  9 | 
 10 | 
 11 | ### 相关依赖
 12 | 
 13 | 	import TiebaApiUtil
 14 | 
 15 | ### 使用方法
 16 | 
 17 | 	import tieba
 18 | 
 19 | 内置方法：
 20 | 
 21 | - GetId
 22 | - GetText
 23 | - OneToOne
 24 | - ComparisonDict()
 25 | - ini()
 26 | - Main()
 27 | - setting()
 28 | - GetSetting() 
 29 | - SetSetting() 
 30 | - dict_string()
 31 | 
 32 | 内置文件:
 33 | 
 34 | - TiebaSetting.conf
 35 | 
 36 | 配置文件相关参数:
 37 | 
 38 | 	[Setting]
 39 | 	
 40 | 	Sleep = 18000
 41 | 	# 贴吧名 贴吧用英文逗号(,)分隔符分割
 42 | 
 43 | 	tb = 国际米兰,AC米兰,皇家马德里
 44 | 	
 45 | 	# 贴吧页数指定爬取 默认起始页Start=1 End = 3
 46 | 	
 47 | 	Start = 1
 48 | 	
 49 | 	End = 10
 50 | 	
 51 | 	[Customize]
 52 | 	
 53 | 	# 关键词
 54 | 	
 55 | 	Essential = C罗,尤文,梅西,卡卡,透
 56 | 
 57 | 	
 58 | 
 59 | ---
 60 | ### GetId
 61 | 
 62 | 
 63 | #### 调用方法
 64 | 
 65 | 
 66 | 	tieba.GetId()
 67 | 
 68 | #### 方法说明
 69 | 
 70 | 该方法获取配置文件信息，获取贴吧列表调用TiebaApiUtil.GetPage()方法。
 71 | 
 72 | 
 73 | #### 返回形式：
 74 | 
 75 | ![](http://cdn.lunatic.wang/tieba6)
 76 | 
 77 | 
 78 | 
 79 | 
 80 | 
 81 | ---
 82 | 
 83 | ### GetText
 84 | 
 85 | #### 使用方法:
 86 | 
 87 | 	tieba.GetText(list)
 88 | 
 89 | #### 参数说明：
 90 | 
 91 | list:获取贴吧页数标题列表，以及比较相关关键词信息
 92 | 
 93 | #### 方法说明
 94 | 
 95 | 通过调用OneToOne()比较关键词,将相关数据存入Save中。
 96 | 
 97 | #### 返回形式
 98 | 
 99 | 
100 | 该方法没有返回值。
101 | 
102 | 
103 | ---
104 | ### OneToOne
105 | 
106 | #### 使用方法:
107 | 
108 | 	tieba.OneToOne(Text)
109 | 
110 | #### 参数说明：
111 | 
112 | Text:用户回复内容
113 | 
114 | #### 方法说明：
115 | 
116 | 将内容与关键词进行比较
117 | 
118 | #### 返回形式
119 | 
120 | 该方法返回布尔值。
121 | 
122 | - True
123 | 	- 比对成功
124 | - False
125 | 	- 比对失败 
126 | 
127 | 
128 | ---
129 | 
130 | ### ComparisonDict
131 | 
132 | #### 使用方法：
133 | 不推荐单独使用
134 | 
135 | #### 参数说明：
136 | 无
137 | 
138 | #### 方法说明：
139 | 该方法用以Save与OldSave进行对比从而达到监控目的
140 | 
141 | #### 返回形式
142 | 
143 | 重新生成新list —> NewList
144 | 
145 | #### NewList
146 | 
147 | 对比差异及结果
148 | 
149 | ---
150 | ### ini
151 | 
152 | #### 使用方法：
153 | tieba.ini()
154 | 
155 | #### 参数说明：
156 | 无
157 | 
158 | #### 方法说明：
159 | 该方法用以程序第一次启动用以初始化，在程序未启动或者未生成数据文件时先行调用该方法。
160 | 
161 | #### 返回形式
162 | 无
163 | 
164 | ---
165 | ### Main
166 | 
167 | #### 使用方法：
168 | 
169 | tieba.Main()
170 | 
171 | #### 参数说明
172 | 无
173 | 
174 | #### 方法说明：
175 | 
176 | 该方法为监控主程序,当程序完成初始化后进行监控
177 | 
178 | #### 返回形式：
179 | 
180 | NewList ——> 该list存放监控数据变化
181 | 
182 | 
183 | 
184 | 
185 | ---
186 | 
187 | ### setting()
188 | 
189 | #### 使用方法：
190 | 
191 | tieba.setting()
192 | 
193 | #### 参数说明
194 | 无
195 | 
196 | #### 方法说明：
197 | 
198 | 该方法查看TiebaSetting.conf配置内容
199 | 
200 | #### 返回形式：
201 | 
202 | String 字符串形式：
203 | 
204 | ![](http://cdn.lunatic.wang/tieba7.jpg)
205 | 
206 | ---
207 | 
208 | ### GetSetting() 
209 | 
210 | #### 使用方法：
211 | 
212 | tieba.GetSetting()
213 | 
214 | #### 参数说明
215 | 无
216 | 
217 | #### 方法说明：
218 | 
219 | 该方法查看TiebaSetting.conf配置内容,以list形式返回
220 | 
221 | #### 返回形式：
222 | 
223 | ![](http://cdn.lunatic.wang/tieba8.jpg)
224 | 
225 | ---
226 | ### SetSetting(dict) 
227 | 
228 | #### 使用方法：
229 | 
230 | tieba.SetSetting(dict)
231 | 
232 | #### 参数说明
233 | dict : 传入修改的配置文件信息
234 | 
235 | #### 方法说明：
236 | 
237 | 该方法动态修改配置文件信息，用户可在程序运行中改变关键词等配置信息。
238 | 
239 | #### 返回形式：
240 | 
241 | 
242 | ![](http://cdn.lunatic.wang/tieba7.jpg)
243 | 
244 | ---
245 | ### dict_string(dict) 
246 | 
247 | #### 使用方法：
248 | 
249 | tieba.dict_string(dict)
250 | 
251 | #### 参数说明
252 | dict :适配关键词的Text等相关dict信息
253 | 
254 | #### 方法说明：
255 | 
256 | 该方法用于去除适配的Text内容中去除HTML信息的图片标签以及其他影响阅读的标签
257 | 
258 | #### 返回形式：
259 | 
260 | > 监控到更新的数据  
261 | >  
262 | > 帖子地址: https://tieba.baidu.com/p/5660167836
263 | 
264 | > 
265 | > 
266 | > 帖子地址: https://tieba.baidu.com/p/5806639338
267 | > 二硕影迷(7-23 19:24):回复 啦啦啦哈哈撒</a :嗯，今年刚录取的
268 | > 
269 | > 卿卿且苧(11:23):回复 二硕影迷</a :每天自己拿桶存水啊，或者买个和垃圾桶大的桶存一宿舍的水
270 | > 
271 | > 
272 | > 帖子地址: https://tieba.baidu.com/p/5805131589
273 | > 萝卜森兔耳德(7-22 07:42):现在马上要录取了，突然想到一个很害怕的情况！我是第二批次，十个学校知道必须专业服从调剂于是我先把那十个勾给打上了然后再写的学校和代码，问一下提交的时候我服从的调剂勾会不会没有啊感觉好吓人啊！  
274 | > 
275 | > 
276 | > 帖子地址: https://tieba.baidu.com/p/5803120565
277 | > GGXHTML(7-20 16:37):山东影制的录取 出来啦吗   </a  
278 | > 
279 | > a37203050(7-22 05:45):回复 GGXHTML</a :出录取了嘛
280 | > 
281 | > 神奇的兔酱(7-22 08:33):回复 a37203050 ：没有出，10个计划山东，最低录取到411分。 应该会追加计划
282 | > 
283 | > GGXHTML(7-22 09:43):这个学习的录取线大概什么时候出啊？
284 | > 
285 | > 
286 | > 帖子地址: https://tieba.baidu.com/p/5808492570
287 | > 熏熏暖风(22:40):本来江夏区算半个郊区，现在还没有地铁只有3种公交可以出去，不过地铁通了就方便了，地铁口离东湖还是有一些距离的，宿舍环境装修了，被称为酒店级宿舍，图书馆还是不错的，市区的话，江夏区的市区大概半个多小时，不过没什么玩的买买一些东西是可以的，洪山区那边公交就50分钟到街道口那边，之后去哪里有地铁就各种方便了，就是放假的时候公交人很多 
288 | > 
289 | > 
290 | > 帖子地址: https://tieba.baidu.com/p/5807674831
291 | > zzhxiannv(23:04):不知道什么预录变成录取 
292 | > 
293 | > 
294 | > 帖子地址: https://tieba.baidu.com/p/5806260556
295 | > 哈欠女神(7-23 10:49):最低528！！（2017年这所学校在海南录取最低分）
296 | > 
297 | > shine包仔(7-23 14:07):回复 泌夫人</a :我觉得还可以，环境不错，宿舍统一四人间，独卫，有热水有空调
298 | 
299 | 
300 | 


--------------------------------------------------------------------------------
/TiebaApiUtil.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import json
  3 | import requests
  4 | from bs4 import BeautifulSoup
  5 | import re
  6 | import copy
  7 | import time
  8 | 
  9 | # 自制贴吧api
 10 | """
 11 | 该api依赖百度贴吧web版
 12 | 
 13 | 贴吧api作为工具类，可制作第三方客户端，极速/个性/无广告
 14 | """
 15 | 
 16 | 
 17 | """
 18 | 模拟三星手机访问
 19 | 虽然不加headers也可正常访问，但是我们还是要严谨
 20 | """
 21 | 
# Request headers passed to every requests.get() call in this module; the
# User-Agent identifies as mobile Chrome on a Samsung SM-G900P (Android 5.0).
headers = {
    'Host': 'tieba.baidu.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
 32 | 
 33 | # 获取某个贴吧页面帖子
 34 | 
 35 | """
 36 | url = http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw=%E6%9E%97%E4%BF%8A%E6%9D%B0&lp=5011&lm=&pn=0
 37 | kw:贴吧关键词
 38 | pn=页数(0,20,40,60...)页数
 39 |         (1,2,3,4)
 40 | """
 41 | 
 42 | """
 43 |  pnf=1 起始页 默认为第一页 Start
 44 |  pne=3 结束页 默认为第三页 End
 45 | """
 46 | def GetPage(key,Start=1,End=3):
 47 |     url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw='
 48 |     url2 = '&lp=5011&lm=&pn='
 49 | 
 50 |     Start = Start*20-20
 51 |     # 防止出现输入0的情况
 52 |     if Start == -20:
 53 |         Start = 0
 54 | 
 55 |     End = End*20
 56 | 
 57 | 
 58 |     ReturnJson = {
 59 |         'key' : key
 60 |     }
 61 |     SuperList = []
 62 |     for i in range(Start,End,20):
 63 | 
 64 |         url = url1 + key + url2 + str(i)
 65 | 
 66 |         # print(url)
 67 |         time.sleep(0.01)
 68 |         GetPageID = requests.get(url = url,headers=headers)
 69 |         if '欢迎创建本吧，与今后来到这里的朋友交流讨论' in GetPageID.text:
 70 |             Error = {
 71 |                 'Error':'改吧尚未建立'
 72 |             }
 73 |             return json.dumps(Error,ensure_ascii=False)
 74 |         Soup = BeautifulSoup(GetPageID.text,'lxml')
 75 |         find = Soup.select('div.i')
 76 | 
 77 |         # 计数 调整标题
 78 |         c = 1
 79 | 
 80 |         '''
 81 |         这里需要添加一个list,这样一页就在一个list中
 82 |         
 83 |         '''
 84 |         SouList = []
 85 | 
 86 |         for x in find:
 87 | 
 88 |             # print(x)
 89 |             pattern = re.compile('kz=(.*?)&.*?">(.*?)</a>.*?回([0-9]\d*)\s(.*?)\s(.*?)</p>',re.S)
 90 |             items = re.findall(pattern, str(x))
 91 |             # print(items)
 92 |             # 这里对标题进行一次修正，防止出现 '1.\xa0' 情况
 93 |             if c < 10:
 94 |                 Title = items[0][1][3:]
 95 |             else:
 96 |                 Title = items[0][1][4:]
 97 | 
 98 |             c += 1
 99 |             # 创建一个Dict
100 | 
101 |             Son = {
102 |                 'Id':items[0][0],
103 |                 'Title':Title,
104 |                 'Reply':items[0][2],
105 |                 'Author':items[0][3],
106 |                 'Time':items[0][4]
107 |             }
108 |             SouList.append(Son)
109 | 
110 |         Page = {
111 |             str(int((i + 20) / 20)): SouList
112 |         }
113 | 
114 | 
115 |         # 这里需要清空list
116 |         """
117 |         引入copy 
118 |         在python 赋值是引入A=B，当清空或者销毁前者B被赋值的变量时，出现复制后的变量A为空，所以这里我们采用copy包中的deepcopy方法而不是copy方法。
119 |         
120 |         参考：https://www.cnblogs.com/koliverpool/p/6791579.html
121 |         """
122 | 
123 |         SuperList.append(copy.deepcopy(Page))
124 |         SouList.clear()
125 |         if not '下一页' in GetPageID.text:
126 |             break
127 |     ReturnJson['Page'] = SuperList
128 | 
129 | 
130 | 
131 |     # 添加 ensure_ascii=False 防止中文乱码
132 |     Result = json.dumps(ReturnJson,ensure_ascii=False)
133 | 
134 |     return Result
135 | 
136 | """
137 | 获取单个页数，该函数只于贴子ID有关
138 | 
139 | http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=4552337163&new_word=&pinf=1_2_0&pn=60&lp=6021
140 | 关键词是
141 | KZ：帖子ID
142 | pn：(0,30,60,90)
143 | (1,2,3,4)
144 | 
145 | """
def GetTiebaOne(ID):
    """Fetch every floor (reply) of one thread, including nested replies.

    Pages are requested 30 floors at a time until a page no longer contains
    a "next page" link.

    Args:
        ID: Thread id, placed in the ``kz`` query parameter.

    Returns:
        A JSON string holding a list of floors. Each floor is an object with
        ``Text``, ``Author``, ``Time`` and ``FloorInFloor``. ``FloorInFloor``
        is ``''`` for the first floor and a list of nested replies (possibly
        empty) for every other floor. When the first page reports that the
        thread does not exist, ``{"Error": "Error"}`` is returned.
    """
    url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz='
    url3 = '&lp=6021'

    # Compiled once; raw strings so \s and \d are regex escapes.
    first_floor_pattern = re.compile(
        r'class="i">1楼.\s(.*?)<table>.*?<span class="g"><a href=".*?">(.*?)</a>.*?class="b">(.*?)</s',
        re.S)
    floor_pattern = re.compile(
        r'class="i">\d*楼.\s(.*?)<table>.*?<span class="g"><a href=".*?">(.*?)</a>.*?class="b">(.*?)</span>.*?href="(.*?)">回复(.*?)</a>',
        re.S)

    FatherList = []

    # pn is a floor offset (0, 30, 60, ...); the upper bound is only a
    # safety cap, the loop normally ends at the last page.
    for page in range(0, 16122330, 30):
        url2 = '&new_word=&pinf=1_2_0&pn=' + str(page)
        url = url1 + str(ID) + url2 + url3

        time.sleep(0.01)
        GetContent = requests.get(url=url, headers=headers)

        # The first page doubles as the existence check, so it is fetched
        # only once.
        if page == 0 and '您要浏览的贴子不存在' in GetContent.text:
            ReturnJ = {
                'Error': 'Error'
            }
            return json.dumps(ReturnJ)

        Soup = BeautifulSoup(GetContent.text, 'lxml')

        for count, OneContent in enumerate(Soup.select('div.i'), 1):
            html = str(OneContent)

            if count == 1 and page == 0:
                # The first floor has no reply link, hence its own pattern.
                items = first_floor_pattern.findall(html)
                if not items:
                    continue
                Text, Author, Time = items[0]
                FatherList.append({
                    'Text': Text,
                    'Author': Author,
                    'Time': Time,
                    'FloorInFloor': ''
                })
                continue

            items = floor_pattern.findall(html)
            if not items:
                continue
            Text, Author, Time, reply_href, reply_label = items[0]

            # The text after "回复" with its first and last character
            # removed; empty when nothing follows the reply link text
            # (presumably a parenthesised nested-reply count -- TODO confirm).
            Floor = reply_label[1:-1]
            FloorInFloor = GetFloorInFloor(url=reply_href) if Floor else []

            FatherList.append({
                'Text': Text,
                'Author': Author,
                'Time': Time,
                'FloorInFloor': FloorInFloor
            })

        if '下一页' not in GetContent.text:
            break

    return json.dumps(FatherList, ensure_ascii=False)
258 | #获取楼中楼
259 | 
def GetFloorInFloor(url):
    """Collect the nested replies behind one floor's reply link.

    Args:
        url: Relative link captured from the floor's reply anchor. HTML
            ``&amp;`` entities in it are converted back to ``&``.

    Returns:
        A list of dicts with ``Text``, ``Author`` and ``Time``, in the order
        they appear on the fetched pages.
    """
    url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/'

    # The link was taken from HTML source, where '&' is written as '&amp;'.
    url2 = url1 + url.replace('&amp;', '&') + '&fpn='

    # Compiled once for all replies on all pages.
    pattern = re.compile(
        '<div class="i">(.*?)<br/><a hre.*?>(.*?)</a>.*?<span class="b">(.*?)</span>',
        re.S)

    ReturnList = []

    # At most 99 pages of nested replies are read.
    for pn in range(1, 100):
        time.sleep(0.01)
        web = requests.get(url=url2 + str(pn), headers=headers)
        Soup = BeautifulSoup(web.text, 'lxml')

        for i in Soup.select('div.i'):
            items = pattern.findall(str(i))
            if not items:
                # Not a reply entry; skip it rather than failing on items[0].
                continue
            Text, Author, Time = items[0]
            ReturnList.append({
                'Text': Text,
                'Author': Author,
                'Time': Time
            })

        # Stop when the page has no "next page" link.
        if '下一页' not in web.text:
            break
    return ReturnList
296 | 
297 | 
298 | 
299 | 
300 | 
301 | 
302 | 


--------------------------------------------------------------------------------
/ApiRedme/TiebaApiUtil.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import json
  3 | import requests
  4 | from bs4 import BeautifulSoup
  5 | import re
  6 | import copy
  7 | import time
  8 | 
  9 | # 自制贴吧api
 10 | """
 11 | 该api依赖百度贴吧web版
 12 | 
 13 | 贴吧api作为工具类，可制作第三方客户端，极速/个性/无广告
 14 | """
 15 | 
 16 | 
 17 | """
 18 | 模拟三星手机访问
 19 | 虽然不加headers也可正常访问，但是我们还是要严谨
 20 | """
 21 | 
# Request headers passed to every requests.get() call in this module; the
# User-Agent identifies as mobile Chrome on a Samsung SM-G900P (Android 5.0).
headers = {
    'Host': 'tieba.baidu.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
 32 | 
 33 | # 获取某个贴吧页面帖子
 34 | 
 35 | """
 36 | url = http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw=%E6%9E%97%E4%BF%8A%E6%9D%B0&lp=5011&lm=&pn=0
 37 | kw:贴吧关键词
 38 | pn=页数(0,20,40,60...)页数
 39 |         (1,2,3,4)
 40 | """
 41 | 
 42 | """
 43 |  pnf=1 起始页 默认为第一页 Start
 44 |  pne=3 结束页 默认为第三页 End
 45 | """
 46 | def GetPage(key,Start=1,End=3):
 47 |     url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw='
 48 |     url2 = '&lp=5011&lm=&pn='
 49 | 
 50 |     Start = Start*20-20
 51 |     # 防止出现输入0的情况
 52 |     if Start == -20:
 53 |         Start = 0
 54 | 
 55 |     End = End*20
 56 | 
 57 | 
 58 |     ReturnJson = {
 59 |         'key' : key
 60 |     }
 61 |     SuperList = []
 62 |     for i in range(Start,End,20):
 63 | 
 64 |         url = url1 + key + url2 + str(i)
 65 | 
 66 |         # print(url)
 67 |         time.sleep(0.01)
 68 |         GetPageID = requests.get(url = url,headers=headers)
 69 |         if '欢迎创建本吧，与今后来到这里的朋友交流讨论' in GetPageID.text:
 70 |             Error = {
 71 |                 'Error':'改吧尚未建立'
 72 |             }
 73 |             return json.dumps(Error,ensure_ascii=False)
 74 |         Soup = BeautifulSoup(GetPageID.text,'lxml')
 75 |         find = Soup.select('div.i')
 76 | 
 77 |         # 计数 调整标题
 78 |         c = 1
 79 | 
 80 |         '''
 81 |         这里需要添加一个list,这样一页就在一个list中
 82 |         
 83 |         '''
 84 |         SouList = []
 85 | 
 86 |         for x in find:
 87 | 
 88 |             # print(x)
 89 |             pattern = re.compile('kz=(.*?)&.*?">(.*?)</a>.*?回([0-9]\d*)\s(.*?)\s(.*?)</p>',re.S)
 90 |             items = re.findall(pattern, str(x))
 91 |             # print(items)
 92 |             # 这里对标题进行一次修正，防止出现 '1.\xa0' 情况
 93 |             if c < 10:
 94 |                 Title = items[0][1][3:]
 95 |             else:
 96 |                 Title = items[0][1][4:]
 97 | 
 98 |             c += 1
 99 |             # 创建一个Dict
100 | 
101 |             Son = {
102 |                 'Id':items[0][0],
103 |                 'Title':Title,
104 |                 'Reply':items[0][2],
105 |                 'Author':items[0][3],
106 |                 'Time':items[0][4]
107 |             }
108 |             SouList.append(Son)
109 | 
110 |         Page = {
111 |             str(int((i + 20) / 20)): SouList
112 |         }
113 | 
114 | 
115 |         # 这里需要清空list
116 |         """
117 |         引入copy 
118 |         在python 赋值是引入A=B，当清空或者销毁前者B被赋值的变量时，出现复制后的变量A为空，所以这里我们采用copy包中的deepcopy方法而不是copy方法。
119 |         
120 |         参考：https://www.cnblogs.com/koliverpool/p/6791579.html
121 |         """
122 | 
123 |         SuperList.append(copy.deepcopy(Page))
124 |         SouList.clear()
125 |         if not '下一页' in GetPageID.text:
126 |             break
127 |     ReturnJson['Page'] = SuperList
128 | 
129 | 
130 | 
131 |     # 添加 ensure_ascii=False 防止中文乱码
132 |     Result = json.dumps(ReturnJson,ensure_ascii=False)
133 | 
134 |     return Result
135 | 
136 | """
137 | 获取单个页数，该函数只于贴子ID有关
138 | 
139 | http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=4552337163&new_word=&pinf=1_2_0&pn=60&lp=6021
140 | 关键词是
141 | KZ：帖子ID
142 | pn：(0,30,60,90)
143 | (1,2,3,4)
144 | 
145 | """
def GetTiebaOne(ID):
    """Fetch every floor of one thread and return them as a JSON string.

    Args:
        ID: Thread id; it is placed in the ``kz=`` query parameter.

    Returns:
        A JSON array with one object per parsed floor::

            {"Text": ..., "Author": ..., "Time": ..., "FloorInFloor": ...}

        ``FloorInFloor`` is ``""`` for the opening post and, for every other
        floor, the list returned by ``GetFloorInFloor`` (empty when the floor
        shows no reply count). If the first page reports that the thread does
        not exist, the string ``{"Error": "Error"}`` is returned instead.
    """
    url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz='
    url3 = '&lp=6021'

    # Compiled once; raw strings keep \s and \d from being read as str escapes.
    first_floor_pattern = re.compile(r'class="i">1楼.\s(.*?)<table>.*?<span class="g"><a href=".*?">(.*?)</a>.*?class="b">(.*?)</s', re.S)
    reply_pattern = re.compile(r'class="i">\d*楼.\s(.*?)<table>.*?<span class="g"><a href=".*?">(.*?)</a>.*?class="b">(.*?)</span>.*?href="(.*?)">回复(.*?)</a>', re.S)

    FatherList = []

    # Thread pages are addressed by offset: pn=0, 30, 60, ...
    for page in range(0,16122330,30):
        url = url1 + str(ID) + '&new_word=&pinf=1_2_0&pn=' + str(page) + url3
        time.sleep(0.01)
        GetContent = requests.get(url=url,headers=headers)

        # The existence check only needs the first page, which is fetched once
        # here rather than once before the loop and again inside it.
        if page == 0 and '您要浏览的贴子不存在' in GetContent.text:
            return json.dumps({'Error': 'Error'})

        Soup = BeautifulSoup(GetContent.text, 'lxml')
        for count, OneContent in enumerate(Soup.select('div.i'), 1):
            html = str(OneContent)

            if count == 1 and page == 0:
                # Opening post: it has no reply link, so it has its own pattern.
                match = first_floor_pattern.search(html)
                if match is None:
                    continue
                text, author, posted = match.groups()
                FatherList.append({
                    'Text': text,
                    'Author': author,
                    'Time': posted,
                    'FloorInFloor': '',
                })
                continue

            match = reply_pattern.search(html)
            if match is None:
                continue
            text, author, posted, sub_url, reply_label = match.groups()

            # reply_label is whatever follows "回复" in the link text; its first
            # and last characters are dropped, and a non-empty remainder means
            # the floor has sub-replies to fetch.
            sub_replies = GetFloorInFloor(url=sub_url) if reply_label[1:-1] else []
            FatherList.append({
                'Text': text,
                'Author': author,
                'Time': posted,
                'FloorInFloor': sub_replies,
            })

        # No "next page" link means this was the last page.
        if '下一页' not in GetContent.text:
            break

    return json.dumps(FatherList,ensure_ascii=False)
258 | #获取楼中楼
259 | 
def GetFloorInFloor(url):
    """Fetch the sub-replies ("floor in floor") behind one reply link.

    Args:
        url: Relative link taken from a floor's reply anchor; HTML-escaped
            ``&amp;`` separators are converted back to ``&`` before use.

    Returns:
        A list of dicts with the keys ``Text``, ``Author`` and ``Time``, one
        per sub-reply that matches the expected markup, in page order.
    """
    url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/'

    # The page number is appended to the ``fpn`` parameter below.
    url2 = url1 + url.replace('&amp;','&') +'&fpn='

    # Compiled once instead of once per sub-reply.
    pattern = re.compile(
        r'<div class="i">(.*?)<br/><a hre.*?>(.*?)</a>.*?<span class="b">(.*?)</span>',
        re.S)

    ReturnList = []

    # At most 99 sub-reply pages are read.
    for pn in range(1,100):
        time.sleep(0.01)
        web = requests.get(url=url2+str(pn),headers=headers)
        Soup = BeautifulSoup(web.text,'lxml')
        for entry in Soup.select('div.i'):
            match = pattern.search(str(entry))
            if match is None:
                # Skip blocks that are not sub-replies instead of failing on items[0].
                continue
            text, author, posted = match.groups()
            ReturnList.append({
                'Text': text,
                'Author': author,
                'Time': posted,
            })

        # No "next page" link means this was the last page.
        if '下一页' not in web.text:
            break
    return ReturnList
296 | 
297 | 
298 | 
299 | 
300 | 
301 | 
302 | 


--------------------------------------------------------------------------------
/Sentiment/tieba.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import time
  4 | import configparser
  5 | import TiebaApiUtil
  6 | import copy
  7 | import re
  8 | 
  9 | """
 10 | 此处用".conf"进行配置
 11 | Python 读取写入配置文件 —— ConfigParser
 12 | https://www.cnblogs.com/feeland/p/4514771.html
 13 | 
 14 |     config = configparser.ConfigParser()
 15 |     config.read('test.conf')
 16 |     db_host = config.get("db", "db_port")
 17 |     
 18 | """
 19 | 
 20 | 
 21 | # 参数配置说明
 22 | """
 23 | [Setting]
 24 | 爬取贴吧
 25 | tb:{'国际米兰','linjj','dota','dota2'}
 26 | 
 27 | 爬取默认页数
 28 | Start：1
 29 | End：4
 30 | 
 31 | [Customize]
 32 | 关键词
 33 | Essential:{'昨天','今天','明天','后天'}
 34 | 
 35 | """
 36 | 
 37 | 
# Parse TiebaSetting.conf (path relative to the working directory) at import time.
config = configparser.ConfigParser()
# The encoding must be utf-8-sig rather than plain utf-8.
config.read('TiebaSetting.conf', encoding='utf-8-sig')
# Comma-separated forum names to monitor.
key = config.get("Setting", "tb")
keyList = key.split(',')
# Comma-separated keywords to look for.
Essential = config.get("Customize", "Essential")
EssentialList = Essential.split(',')

# Sleep period; Main() hands it to time.sleep, so it is in seconds.
X = config.getint("Setting", "Sleep")

# First listing page to crawl.
Start = config.getint("Setting", "Start")
# Last listing page to crawl.
End = config.getint("Setting", "End")

# Keyword-hit count per thread id for the current pass.
Save = {}

# Counts from the previous pass, or loaded from file by FileOpen().

OldSave = {}

# Thread ids that are new or whose hit count changed.

NewList = []
 64 | 
 65 | 
 66 | # 获取ID
 67 | def GetId():
 68 |     IdList = []
 69 |     for name in keyList:
 70 |         test = TiebaApiUtil.GetPage(key=name,Start=Start,End=End)
 71 |         test1 = eval(test)
 72 |         # print(test1)
 73 |         for i in test1['Page']:
 74 |             for x in i.values():
 75 |                 for j in x:
 76 |                     IdList.append(j['Id'])
 77 |     # print(IdList.__len__())
 78 |     return IdList
 79 | 
 80 | 
 81 | # 存取相关关键词详细信息
 82 | 
# Keyword hits from the previous pass, keyed by thread id (see ComparisonDictKey).
OldKeyText = {}
# Keyword hits collected by GetText during the current pass, keyed by thread id.
KeyText = {}
 85 | 
 86 | '''
 87 | {
 88 | 123123123 : [
 89 | {
 90 | }
 91 | ]
 92 | 
 93 | 
 94 | }
 95 | 
 96 | '''
 97 | 
 98 | 
 99 | 
100 | # 获取TEXT 以及对比
def GetText(list):
    """Scan the given threads for the configured keywords.

    Every thread is fetched through ``TiebaApiUtil.GetTiebaOne``. Each floor,
    and each of its sub-replies, whose text contains a keyword (see
    ``OneToOne``) counts as one hit. Results are written to module state:
    ``Save[str(id)]`` is increased by the number of hits and
    ``KeyText[str(id)]`` receives the hits as ``{'Author', 'Text', 'Time'}``
    dicts.

    Args:
        list: Iterable of thread ids. (The name shadows the builtin and is
            kept for compatibility with existing callers.)
    """
    # Function-scope import: json is not among this module's top-level imports.
    import json

    for thread_id in list:
        # GetTiebaOne returns JSON text; parse it as JSON rather than
        # eval()-ing text that contains scraped post bodies.
        floors = json.loads(TiebaApiUtil.GetTiebaOne(thread_id))

        # A missing thread comes back as an {"Error": ...} object, not a list
        # of floors. This used to be absorbed by a blanket `except TypeError`.
        if isinstance(floors, dict):
            continue

        hits = []
        for floor in floors:
            # The floor itself, followed by its sub-replies when it has any
            # ('' and [] both mean "none").
            posts = [floor]
            if floor['FloorInFloor']:
                posts.extend(floor['FloorInFloor'])

            for post in posts:
                if OneToOne(post['Text']):
                    hits.append({
                        'Author': post['Author'],
                        'Text': post['Text'],
                        'Time': post['Time'],
                    })

        if hits:
            thread_key = str(thread_id)
            Save[thread_key] = Save.get(thread_key, 0) + len(hits)
            KeyText[thread_key] = hits
169 | 
170 | 
171 | 
172 | 
def OneToOne(Text):
    """Report whether *Text* contains at least one keyword from ``EssentialList``.

    Returns:
        ``True`` when a keyword is found; otherwise the function falls off
        the end and returns ``None``.
    """
    if any(keyword in Text for keyword in EssentialList):
        return True
177 | 
178 | 
179 | '''
180 | 
181 | # 信息存取列表
182 | Save = {}
183 | 
184 | # 读取文件或前一个爬取列表
185 | 
186 | NewSave = {}
187 | 
188 | '''
def ComparisonDict():
    """Diff this pass's hit counts (``Save``) against the previous pass (``OldSave``).

    The id of every thread that is absent from ``OldSave``, or whose count
    differs from the stored one, is appended to ``NewList``. Afterwards the
    state is rotated: ``OldSave`` becomes a copy of ``Save`` and ``Save`` is
    emptied.
    """
    for thread_id, hits in Save.items():
        # Direct lookup: also reports new threads when OldSave is empty, which
        # the earlier scan over OldSave's keys could not do.
        if thread_id not in OldSave or OldSave[thread_id] != hits:
            NewList.append(str(thread_id))

    OldSave.clear()
    OldSave.update(copy.deepcopy(Save))
    Save.clear()
214 |  # 5800836228     3244759899
215 | 
216 | # 程序每次完成循环都要存取一次
217 | 
218 | # 判断NewList 是否为空
219 | 
# NOTE(review): NewKey is not referenced anywhere else in this module's visible code.
NewKey = {}
# Per-thread updates filled in by ComparisonDictKey and formatted by Main().
# The name shadows the builtin `dict` for the whole module.
dict = {}
222 | 
def ComparisonDictKey():
    """Work out which keyword hits are new since the previous pass.

    For every thread in ``KeyText``:

    * thread unknown to ``OldKeyText``: all of its hits are stored in the
      module-level ``dict`` under the thread id;
    * thread known: only hits whose (Text, Author, Time) triple is absent
      from the previous pass are stored, and only if there is at least one.

    Afterwards ``OldKeyText`` becomes a copy of ``KeyText`` and ``KeyText``
    is emptied.
    """
    for New, hits in KeyText.items():
        if New in OldKeyText:
            # Set membership replaces the positional scan, which reported
            # nothing when the previous hit list was empty.
            seen = {(old['Text'], old['Author'], old['Time']) for old in OldKeyText[New]}
            fresh = []
            for K in hits:
                if (K['Text'], K['Author'], K['Time']) not in seen:
                    print(K)
                    fresh.append(K)
            if fresh:
                dict[New] = copy.deepcopy(fresh)
        else:
            dict[New] = copy.deepcopy(hits)

    print('This is Update '+str(dict))

    OldKeyText.clear()
    OldKeyText.update(copy.deepcopy(KeyText))
    KeyText.clear()
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 
267 | 
268 | 
269 | 
270 | 
271 | 
def FileOpen():
    """Load the state written by ``FileSave`` back into ``OldSave`` and ``OldKeyText``.

    ``list.tieba`` and ``key.tieba`` are read from the working directory.
    Each holds the ``str()`` form of a dict, which is parsed as a Python
    literal.

    Raises:
        OSError: If either file cannot be opened.
        ValueError, SyntaxError: If a file does not contain a plain literal.
    """
    # Function-scope import: ast is not among this module's top-level imports.
    import ast

    # literal_eval accepts only literals, so a tampered state file cannot run
    # code the way eval() would allow.
    with open('list.tieba','r',encoding='utf-8') as file:
        OldSave.update(ast.literal_eval(file.read()))

    with open('key.tieba','r',encoding='utf-8') as file:
        OldKeyText.update(ast.literal_eval(file.read()))
280 | 
def FileSave():
    """Write ``OldSave`` to list.tieba and ``OldKeyText`` to key.tieba.

    Each file is overwritten with the ``str()`` form of its dict, encoded as
    UTF-8, in the working directory.
    """
    for path, state in (('list.tieba', OldSave), ('key.tieba', OldKeyText)):
        with open(path,'w',encoding='utf-8') as file:
            file.write(str(state))
287 | 
288 | 
289 | 
290 | 
def ini():
    """Bootstrap the saved state on first run.

    If list.tieba cannot be opened for reading, one crawl is performed, its
    counts are moved from ``Save`` into ``OldSave`` and the state is written
    with ``FileSave``. If the file opens, nothing else happens.
    """
    try:
        # Only probing that the file can be opened; its content is not read here.
        with open('list.tieba','r',encoding='utf-8'):
            pass
    except OSError:
        print('初始化')
        GetText(GetId())
        OldSave.update(copy.deepcopy(Save))
        Save.clear()
        FileSave()
304 | 
305 | 
306 | 
def Main():
    """Run one monitoring pass and return the formatted update text.

    Sleeps for ``X`` seconds, clears the previous results, crawls, diffs the
    new state against the old one, saves it and returns the updates
    formatted by ``dict_string``.
    """
    time.sleep(X)
    print('开始运行')

    # Drop the results of the previous pass.
    for previous in (NewList, dict):
        previous.clear()

    # Crawl: fills Save and KeyText.
    thread_ids = GetId()
    GetText(thread_ids)

    # Diff against the previous pass, then save so a later failure loses nothing.
    for step in (ComparisonDict, ComparisonDictKey, FileSave):
        step()

    print('This is NewList : ' + str(NewList))
    print(dict)
    return dict_string(dict)
329 | 
330 | 
331 | 
332 | 
def setting():
    """Return the current settings as a four-line, human-readable summary.

    The lines show the monitored forums, the keywords, the sleep period and
    the start-end page range, in that order.
    """
    lines = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
    ]
    return '\n'.join(lines)
339 | 
def GetSetting():
    """Return the live settings as ``(keyList, EssentialList, X, Start, End)``."""
    current = (keyList, EssentialList, X, Start, End)
    return current
342 | '''
343 | 帖子地址：https://tieba.baidu.com/p/5806299422
344 | 遥远彼方(12:24):
345 | 
346 | '''
347 | 
def dict_string(dict):
    """Render per-thread keyword hits as plain text.

    Args:
        dict: Mapping of thread-id string to a list of hits, each a dict with
            the string fields ``Author``, ``Time`` and ``Text``.

    Returns:
        For every thread, a ``帖子地址: <url>`` line followed by one
        ``Author(Time):Text`` entry per hit. Opening ``<a ...>`` tags are
        removed from the text; ``<br/>`` and ``<img .../>`` tags become a
        single space. The text accumulated so far is also printed after each
        thread.
    """
    base_url = 'https://tieba.baidu.com/p/'
    # (pattern, replacement) pairs, applied to each hit's text in this order.
    cleaners = (
        (re.compile('< img .*?"/>', re.I), ''),
        (re.compile('<a h.*?">', re.I), ''),
        (re.compile('<br/>', re.I), ' '),
        (re.compile('<img .*?"/>', re.I), ' '),
    )

    res = ''
    for thread_id in dict:
        thread_url = base_url + thread_id

        entries = []
        for hit in dict[thread_id]:
            text = str(hit['Text'])
            for pattern, replacement in cleaners:
                text = pattern.sub(replacement, text)
            entries.append(hit['Author'] + '(' + hit['Time'] + ')' + ':' + text + '\n\n')

        res = res + '帖子地址: ' + thread_url + '\n' + ''.join(entries) + '\n'
        print(res)
    return res
371 | 
372 | 
373 | 
374 | 
375 | # 更改配置
376 | 
def SetSetting(dict):
    """Apply new settings, persist them to TiebaSetting.conf and return a summary.

    Args:
        dict: Mapping with the keys ``监控贴吧列表`` (list of forum names),
            ``监控关键词`` (list of keywords), ``监控周期(S)`` (sleep period),
            ``开始页数`` and ``结束页数`` (page range).

    Returns:
        The same four-line summary text that ``setting()`` builds.
    """
    global keyList,EssentialList,X,Start,End

    # Update the in-memory settings first.
    keyList = dict['监控贴吧列表']
    EssentialList = dict['监控关键词']
    X = dict['监控周期(S)']
    Start = dict['开始页数']
    End = dict['结束页数']

    # Re-read the file (utf-8-sig, not utf-8) so options not set below are kept.
    parser = configparser.ConfigParser()
    parser.read('TiebaSetting.conf', encoding='utf-8-sig')

    updates = (
        ('Setting', 'Sleep', str(X)),
        ('Setting', 'Start', str(Start)),
        ('Setting', 'End', str(End)),
        ('Setting', 'tb', ','.join(keyList)),
        ('Customize', 'Essential', ','.join(EssentialList)),
    )
    for section, option, value in updates:
        parser.set(section, option, value)

    with open("TiebaSetting.conf", "w+",encoding='utf-8') as f:
        parser.write(f)

    summary = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
    ]
    return '\n'.join(summary)
402 | 
403 | 
404 | 


--------------------------------------------------------------------------------
/tieba.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import time
  4 | import configparser
  5 | import TiebaApiUtil
  6 | import copy
  7 | import re
  8 | import datetime
  9 | """
 10 | 此处用".conf"进行配置
 11 | Python 读取写入配置文件 —— ConfigParser
 12 | https://www.cnblogs.com/feeland/p/4514771.html
 13 | 
 14 |     config = configparser.ConfigParser()
 15 |     config.read('test.conf')
 16 |     db_host = config.get("db", "db_port")
 17 |     
 18 | """
 19 | 
 20 | 
 21 | # 参数配置说明
 22 | """
 23 | [Setting]
 24 | 爬取贴吧
 25 | tb:{'国际米兰','linjj','dota','dota2'}
 26 | 
 27 | 爬取默认页数
 28 | Start：1
 29 | End：4
 30 | 
 31 | [Customize]
 32 | 关键词
 33 | Essential:{'昨天','今天','明天','后天'}
 34 | 
 35 | """
 36 | 
 37 | 
# Parse TiebaSetting.conf (path relative to the working directory) at import time.
config = configparser.ConfigParser()
# The encoding must be utf-8-sig rather than plain utf-8.
config.read('TiebaSetting.conf', encoding='utf-8-sig')
# Comma-separated forum names to monitor.
key = config.get("Setting", "tb")
keyList = key.split(',')
# Comma-separated keywords to look for.
Essential = config.get("Customize", "Essential")
EssentialList = Essential.split(',')
# Date filter setting; split on '-' into refer_to below and read by TimeLimit().
Time = config.get("Setting","Time")
# Sleep period; Main() hands it to time.sleep, so it is in seconds.
X = config.getint("Setting", "Sleep")

# First listing page to crawl.
Start = config.getint("Setting", "Start")
# Last listing page to crawl.
End = config.getint("Setting", "End")


# Calendar year at import time; TimeLimit() uses it for timestamps without a year.
Year = int(datetime.datetime.now().strftime('%Y-%m-%d').split('-')[0])

# Time split into its '-'-separated fields (still strings at this point).
refer_to = Time.split('-')

# Keyword-hit count per thread id for the current pass.
Save = {}

# Counts from the previous pass, or loaded from file by FileOpen().

OldSave = {}

# Thread ids that are new or whose hit count changed.

NewList = []
 69 | 
 70 | 
 71 | # 获取ID
 72 | def GetId():
 73 |     IdList = []
 74 |     for name in keyList:
 75 |         test = TiebaApiUtil.GetPage(key=name,Start=Start,End=End)
 76 |         test1 = eval(test)
 77 |         # print(test1)
 78 |         for i in test1['Page']:
 79 |             for x in i.values():
 80 |                 for j in x:
 81 |                     IdList.append(j['Id'])
 82 |     # print(IdList.__len__())
 83 |     return IdList
 84 | 
 85 | 
 86 | # 存取相关关键词详细信息
 87 | 
# Keyword hits from the previous pass, keyed by thread id (see ComparisonDictKey).
OldKeyText = {}
# Keyword hits collected by GetText during the current pass, keyed by thread id.
KeyText = {}
 90 | 
 91 | 
 92 | 
 93 | 
 94 | 
 95 | # 获取TEXT 以及对比
 96 | def GetText(list):
 97 | 
 98 |     for id in list:
 99 | 
100 |         # 标志  为0则代表无关键词，为1则有关键词
101 |         T = 0
102 | 
103 |         # 存放 某个ID的所有关键回复楼
104 |         KeyList = []
105 | 
106 |         Text = TiebaApiUtil.GetTiebaOne(id)
107 | 
108 |         test1 = eval(Text)
109 |         # print(str(test1))
110 | 
111 |         for i in test1:
112 |         # 检测回复是否有关键词
113 |             try:
114 |                 if OneToOne(i['Text']):
115 |                     # 存放关键词检索
116 |                     T = 1
117 |                     key = {
118 |                         'Author': i['Author'],
119 |                         'Text':i['Text'],
120 |                         'Time':i['Time']
121 |                     }
122 | 
123 |                     # 时间限制
124 |                     if(TimeLimit(i['Time'])):
125 |                         print(i['Time'])
126 |                         KeyList.append(key)
127 | 
128 |                         # ID 在Save中
129 | 
130 |                         if str(id) in Save:
131 |                             # print('旧ID')
132 |                             Save[str(id)] += 1
133 |                         # ID 不在Save中
134 |                         else:
135 |                             # print('出现新ID')
136 |                             Save[str(id)] = 1
137 |                 # 检测楼中楼是否存在
138 |                 if  not (i['FloorInFloor']  == '' or i['FloorInFloor']  == None or i['FloorInFloor']  == []):
139 | 
140 |                     # 存在楼中楼则遍历
141 |                     for f in i['FloorInFloor']:
142 | 
143 |                         if OneToOne(f['Text']):
144 | 
145 |                             # 存放关键词检索
146 |                             T = 1
147 |                             key = {
148 |                                 'Author': f['Author'],
149 |                                 'Text': f['Text'],
150 |                                 'Time': f['Time']
151 |                             }
152 |                             if (TimeLimit(i['Time'])):
153 |                                 print(i['Time'])
154 |                                 KeyList.append(key)
155 | 
156 |                                 if str(id) in Save:
157 |                                     # print('旧楼中楼ID')
158 |                                     Save[str(id)] += 1
159 |                                 else:
160 |                                     # print('新楼中楼ID')
161 |                                     Save[str(id)] = 1
162 |             except TypeError:
163 |                 pass
164 |         if T == 1:
165 |             T = 0
166 |             KeyText[str(id)] = copy.deepcopy(KeyList)
167 |             KeyList.clear()
168 | 
169 | 
170 | 
171 | 
def OneToOne(Text):
    """Report whether *Text* contains at least one keyword from ``EssentialList``.

    Returns:
        ``True`` when a keyword is found; otherwise the function falls off
        the end and returns ``None``.
    """
    if any(keyword in Text for keyword in EssentialList):
        return True
176 | 
177 | 
178 | '''
179 | 
180 | # 信息存取列表
181 | Save = {}
182 | 
183 | # 读取文件或前一个爬取列表
184 | 
185 | NewSave = {}
186 | 
187 | '''
def ComparisonDict():
    """Diff this pass's hit counts (``Save``) against the previous pass (``OldSave``).

    The id of every thread that is absent from ``OldSave``, or whose count
    differs from the stored one, is appended to ``NewList``. Both dicts and
    the resulting list are printed. Afterwards the state is rotated:
    ``OldSave`` becomes a copy of ``Save`` and ``Save`` is emptied.
    """
    print('This is OldSave: '+ str(OldSave))
    print('This is Save: '+ str(Save))

    for thread_id, hits in Save.items():
        # Direct lookup: also reports new threads when OldSave is empty, which
        # the earlier scan over OldSave's keys could not do.
        if thread_id not in OldSave or OldSave[thread_id] != hits:
            NewList.append(str(thread_id))

    print("This NewList :" +str(NewList))
    OldSave.clear()
    OldSave.update(copy.deepcopy(Save))
    Save.clear()
213 |  # 5800836228     3244759899
214 | 
215 | # 程序每次完成循环都要存取一次
216 | 
217 | # 判断NewList 是否为空
218 | 
# NOTE(review): NewKey is not referenced anywhere else in this module's visible code.
NewKey = {}
# Per-thread updates filled in by ComparisonDictKey and formatted by Main().
# The name shadows the builtin `dict` for the whole module.
dict = {}
221 | 
def ComparisonDictKey():
    """Work out which keyword hits are new since the previous pass.

    For every thread in ``KeyText``:

    * thread unknown to ``OldKeyText``: all of its hits are stored in the
      module-level ``dict`` under the thread id;
    * thread known: only hits whose (Text, Author, Time) triple is absent
      from the previous pass are stored, and only if there is at least one.

    Afterwards ``OldKeyText`` becomes a copy of ``KeyText`` and ``KeyText``
    is emptied.
    """
    for New, hits in KeyText.items():
        if New in OldKeyText:
            # Set membership replaces the positional scan, which reported
            # nothing when the previous hit list was empty.
            seen = {(old['Text'], old['Author'], old['Time']) for old in OldKeyText[New]}
            fresh = []
            for K in hits:
                if (K['Text'], K['Author'], K['Time']) not in seen:
                    print(K)
                    fresh.append(K)
            if fresh:
                dict[New] = copy.deepcopy(fresh)
        else:
            dict[New] = copy.deepcopy(hits)

    print('This is Update '+str(dict))

    OldKeyText.clear()
    OldKeyText.update(copy.deepcopy(KeyText))
    KeyText.clear()
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 
267 | 
268 | 
269 | 
270 | 
def FileOpen():
    """Load the state written by ``FileSave`` back into ``OldSave`` and ``OldKeyText``.

    ``list.tieba`` and ``key.tieba`` are read from the working directory.
    Each holds the ``str()`` form of a dict, which is parsed as a Python
    literal.

    Raises:
        OSError: If either file cannot be opened.
        ValueError, SyntaxError: If a file does not contain a plain literal.
    """
    # Function-scope import: ast is not among this module's top-level imports.
    import ast

    # literal_eval accepts only literals, so a tampered state file cannot run
    # code the way eval() would allow.
    with open('list.tieba','r',encoding='utf-8') as file:
        OldSave.update(ast.literal_eval(file.read()))

    with open('key.tieba','r',encoding='utf-8') as file:
        OldKeyText.update(ast.literal_eval(file.read()))
279 | 
def FileSave():
    """Write ``OldSave`` to list.tieba and ``OldKeyText`` to key.tieba.

    Each file is overwritten with the ``str()`` form of its dict, encoded as
    UTF-8, in the working directory.
    """
    for path, state in (('list.tieba', OldSave), ('key.tieba', OldKeyText)):
        with open(path,'w',encoding='utf-8') as file:
            file.write(str(state))
286 | 
287 | 
288 | 
289 | 
def ini():
    """Bootstrap the saved state on first run.

    If list.tieba cannot be opened for reading, one crawl is performed, its
    counts are moved from ``Save`` into ``OldSave`` and the state is written
    with ``FileSave``. If the file opens, nothing else happens.
    """
    try:
        # Only probing that the file can be opened; its content is not read here.
        with open('list.tieba','r',encoding='utf-8'):
            pass
    except OSError:
        print('初始化')
        GetText(GetId())
        OldSave.update(copy.deepcopy(Save))
        Save.clear()
        FileSave()
303 | 
304 | 
305 | 
def Main():
    """Run one monitoring pass and return the formatted update text.

    Sleeps for ``X`` seconds, clears the previous results, crawls, diffs the
    new state against the old one, saves it and returns the updates
    formatted by ``dict_string``.
    """
    time.sleep(X)
    print('开始运行')

    # Drop the results of the previous pass.
    for previous in (NewList, dict):
        previous.clear()

    # Crawl: fills Save and KeyText.
    thread_ids = GetId()
    GetText(thread_ids)

    # Diff against the previous pass, then save so a later failure loses nothing.
    for step in (ComparisonDict, ComparisonDictKey, FileSave):
        step()

    print('This is NewList : ' + str(NewList))
    return dict_string(dict)
328 | 
329 | 
330 | 
331 | 
def setting():
    """Return the current settings as a five-line, human-readable summary.

    The lines show the monitored forums, the keywords, the sleep period, the
    start-end page range and the date-filter setting, in that order.
    """
    lines = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
        '监控时间：' + str(Time),
    ]
    return '\n'.join(lines)
338 | 
def GetSetting():
    """Return the live settings as ``(keyList, EssentialList, X, Start, End)``."""
    current = (keyList, EssentialList, X, Start, End)
    return current
341 | '''
342 | 帖子地址：https://tieba.baidu.com/p/5806299422
343 | 遥远彼方(12:24):
344 | 
345 | '''
346 | 
def dict_string(dict):
    """Render per-thread keyword hits as plain text.

    Args:
        dict: Mapping of thread-id string to a list of hits, each a dict with
            the string fields ``Author``, ``Time`` and ``Text``.

    Returns:
        For every thread, a ``帖子地址: <url>`` line followed by one
        ``Author(Time):Text`` line per hit and a blank line. Opening
        ``<a ...>`` tags are removed from the text; ``<br/>`` and
        ``<img .../>`` tags become a single space.
    """
    base_url = 'https://tieba.baidu.com/p/'
    # (pattern, replacement) pairs, applied to each hit's text in this order.
    cleaners = (
        (re.compile('< img .*?"/>', re.I), ''),
        (re.compile('<a h.*?">', re.I), ''),
        (re.compile('<br/>', re.I), ' '),
        (re.compile('<img .*?"/>', re.I), ' '),
    )

    res = ''
    for thread_id in dict:
        thread_url = base_url + thread_id

        entries = []
        for hit in dict[thread_id]:
            text = str(hit['Text'])
            for pattern, replacement in cleaners:
                text = pattern.sub(replacement, text)
            entries.append(hit['Author'] + '(' + hit['Time'] + ')' + ':' + text + '\n')

        res = res + '帖子地址: ' + thread_url + '\n' + ''.join(entries) + '\n'
    return res
370 | 
371 | 
372 | 
373 | 
374 | # 更改配置
375 | 
def SetSetting(dict):
    """Apply new settings, persist them to TiebaSetting.conf and return a summary.

    Args:
        dict: Mapping with the keys ``监控贴吧列表`` (list of forum names),
            ``监控关键词`` (list of keywords), ``监控周期(S)`` (sleep period),
            ``开始页数`` and ``结束页数`` (page range).

    Returns:
        A four-line summary of forums, keywords, sleep period and page range.
    """
    global keyList,EssentialList,X,Start,End

    # Update the in-memory settings first.
    keyList = dict['监控贴吧列表']
    EssentialList = dict['监控关键词']
    X = dict['监控周期(S)']
    Start = dict['开始页数']
    End = dict['结束页数']

    # Re-read the file (utf-8-sig, not utf-8) so options not set below are kept.
    parser = configparser.ConfigParser()
    parser.read('TiebaSetting.conf', encoding='utf-8-sig')

    updates = (
        ('Setting', 'Sleep', str(X)),
        ('Setting', 'Start', str(Start)),
        ('Setting', 'End', str(End)),
        ('Setting', 'tb', ','.join(keyList)),
        ('Customize', 'Essential', ','.join(EssentialList)),
    )
    for section, option, value in updates:
        parser.set(section, option, value)

    with open("TiebaSetting.conf", "w+",encoding='utf-8') as f:
        parser.write(f)

    summary = [
        '监控贴吧列表: ' + str(keyList),
        '监控关键词: ' + str(EssentialList),
        '监控周期: ' + str(X),
        '开始-终止/页数: ' + str(Start) + '-' + str(End),
    ]
    return '\n'.join(summary)
401 | 
402 | 
403 | # 后前...0/1
404 | # eg 2018-7-3-1
405 | # Year Month Day
406 | # 默认 0000-00-00
407 | 
def TimeLimit(time):
    """Decide whether a post timestamp passes the configured date filter.

    The filter comes from the module-level ``refer_to`` list (the ``Time``
    setting split on '-'): year, month, day and, as its last field, a mode
    flag. Flag 0 accepts dates on or after the reference date; any other
    flag accepts dates on or before it.

    Args:
        time: Timestamp text. ``'M-D ...'`` is taken to be in ``Year``,
            ``'Y-M-D ...'`` is used as given; anything after the first space
            in the day field is ignored. (The name shadows the ``time``
            module inside this function and is kept for callers.)

    Returns:
        ``True`` if the date passes the filter, otherwise ``False``.

    Raises:
        ValueError: If a date field is not an integer.
    """
    reference = (int(refer_to[0]), int(refer_to[1]), int(refer_to[2]))
    keep_on_or_after = int(refer_to[-1]) == 0

    parts = time.split('-')
    if len(parts) == 2:
        posted = (Year, int(parts[0]), int(parts[1].split(' ')[0]))
    elif len(parts) == 3:
        posted = (int(parts[0]), int(parts[1]), int(parts[2].split(' ')[0]))
    elif ':' in time:
        # NOTE(review): a bare clock time such as '12:24' is assumed to mean
        # "posted today" - confirm against the live page format.
        today = datetime.date.today()
        posted = (today.year, today.month, today.day)
    else:
        return False

    # Tuples compare chronologically. Comparing year, month and day
    # separately rejected e.g. Nov 1 as "not after" Oct 14.
    if keep_on_or_after:
        return posted >= reference
    return posted <= reference
442 | 
443 | 
444 | 
445 | 
446 | 


--------------------------------------------------------------------------------