├── TiebaStrat.bat ├── TiebaSetting.conf ├── Sentiment ├── TiebaSetting.conf ├── README.md └── tieba.py ├── README.MD ├── ApiRedme ├── README.md └── TiebaApiUtil.py ├── myitchat.py ├── myitchat └── myitchat.py ├── TiebaApiUtil.py └── tieba.py /TiebaStrat.bat: -------------------------------------------------------------------------------- 1 | python .\myitchat.py -------------------------------------------------------------------------------- /TiebaSetting.conf: -------------------------------------------------------------------------------- 1 | [Setting] 2 | sleep = 1 3 | tb = 林俊杰,周杰伦 4 | start = 1 5 | end = 2 6 | Time = 2018-10-14-1 7 | [Customize] 8 | essential =演唱会 9 | 10 | -------------------------------------------------------------------------------- /Sentiment/TiebaSetting.conf: -------------------------------------------------------------------------------- 1 | # 配置文件注释 2 | [Setting] 3 | 4 | Sleep = 18000 5 | # 贴吧名 贴吧用英文逗号(,)分隔符分割 6 | tb = 武汉东湖学院,汉口学院,武汉学院 7 | 8 | # 贴吧页数指定爬取 默认起始页Start=1 End = 3 9 | 10 | Start = 1 11 | 12 | End = 10 13 | 14 | [Customize] 15 | 16 | # 关键词 17 | 18 | Essential = 留学,雅思,托福,新东方,新航道,出国 19 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # Tieba API Sentiment 2 | 3 | 贴吧API以及关键词提醒,舆情监控,微信陪聊机器人。 4 | 5 | :smiley::smiley::smiley::smiley::smiley: 6 | 7 | 8 | 9 | ## Time update 10 | 11 | - 2018/7/3 立项 12 | - 2018/7/12 完成设计 13 | - 2018/7/14 完成API部分 14 | - 2018/7/19 修复API部分,完成关键词监控部分 15 | - 2018/7/20 关键词监控部分上线,微信监控部分测试... 
16 | - 2018/7/20 修复初始化问题 17 | - 2018/7/23 更新Sentiment人性化设置,加入全新微信机器人监控(myitchat.py) 18 | - 2018/7/25 更新myitchat以及Sentiment API,添加配置修改内容 19 | - 2018/7/25 Beta V0.1 上线 20 | - 2018/7/27 修复若干问题,正式版V1.0上线 21 | - 2018/7/28 修复因访问速度过快引起的连接重置的bug 22 | - 2018/10/15 增加时间过滤,修复若干BUG。 23 | - 2019/6/28 web版失效 24 | - 2020/4/29 web版生效 25 | 26 | 27 | ## 相关依赖 28 | python 3+ 29 | pip install requests 30 | pip install beautifulsoup4 lxml 31 | pip install itchat 32 | 33 | 34 | ## 使用方法 35 | 36 | ### Windows: 37 | 38 | 点击目录下的: 39 | ***TiebaStrat.bat*** 40 | 41 | ![](http://img.lunatic.wang/win.jpg) 42 | 43 | 44 | ### Windows/Linux: 45 | 46 | CMD/powershell:***python myitchat.py*** 47 | 48 | ![](http://cdn.lunatic.wang/linux.jpg) 49 | 50 | 51 | 扫码完成后即可开启贴吧微信助手。 52 | 53 | 54 | 55 | ### 微信机器人 56 | 57 | ![](http://cdn.lunatic.wang/robet.PNG) 58 | 59 | 60 | ### 关键词舆情监控 61 | 62 | 编辑TiebaSetting.conf: 63 | 64 | [Setting] 65 | # 休眠周期/秒 66 | sleep = 300 67 | # 监控贴吧 68 | tb = 林俊杰,周杰伦,王力宏 69 | # 开始监控页数 70 | start = 1 71 | # 结束监控页数 72 | end = 3 73 | # 时间控制 74 | Time = 2018-10-14-0 75 | # 0/1 0:过滤Time之前 1:过滤Time之后 76 | 77 | [Customize] 78 | # 关键词 79 | essential = 演唱会,华语乐坛 80 | 81 | 82 | 83 | 84 | 85 | 1. 启动微信扫码登录 86 | 2. 输入“开启监控”,程序会记录该用户ID;此后监控数据只推送给该用户,配置修改也只有该用户可以执行。 87 | ![](http://cdn.lunatic.wang/wx1.png) 88 | 3. 输入“修改配置”,复制模板修改并且发送即可修改。 89 | ![](http://cdn.lunatic.wang/wx5.png) 90 | 4. 
享受结果。 91 | 92 | ![](http://cdn.lunatic.wang/wx6.PNG) 93 | 94 | ---------- 95 | ## 详细思路 96 | 97 | 访问我的博客: 98 | 99 | [贴吧监控助手](http://lunatic.wang/posts/d1eb00bb/) 100 | 101 | ---------- 102 | ## 自定义第三方拓展 103 | 104 | - 第三方贴吧请以 *TieBa API Util* 拓展 105 | - 舆情功能拓展以及其他功能拓展 *Sentiment API* 106 | - 微信机器人拓展 *myitchat* 107 | 108 | 以上为建议拓展文件衍生。 109 | 110 | ---------- 111 | ## TieBa API Util 112 | 113 | - 获取官方贴吧信息 114 | - 高度自由化的信息收集 115 | - 可制作第三方客户端 116 | - 回复以及楼中楼形式 117 | 118 | [Tieba API 相关说明](/ApiRedme) 119 | 120 | ## Sentiment API 121 | 122 | - 自定义贴吧监控 123 | - 自定义关键词监控 124 | - 自定义更新周期 125 | - 可定制化的信息收集 126 | - 可作为其他项目工具 127 | 128 | 129 | [Sentiment API 相关说明](/Sentiment) 130 | 131 | ## myitchat 132 | 133 | - 图灵机器人 134 | - 智能对话 135 | - 微信监控舆情 136 | - 智能化控制 137 | 138 | [myitchat 相关说明](/myitchat) 139 | 140 | 141 | 142 | ## 其他说明 143 | 144 | 请勿用于商业用途 145 | -------------------------------------------------------------------------------- /ApiRedme/README.md: -------------------------------------------------------------------------------- 1 | # Tieba API Util 2 | 3 | :smiley: :smiley: :smiley: :smiley: 4 | 5 | 6 | ## TieBa API 7 | 8 | 9 | ### 相关依赖 10 | pip install requests 11 | 12 | pip install BeautifulSoup 13 | 14 | ### 使用方法 15 | 16 | import TiebaApiUtil 17 | 18 | 内置两个方法,分别是GetPage/GetTiebaOne。 19 | 20 | 21 | ### GetPage 22 | 23 | 调用方法 24 | 25 | TiebaApiUtil.GetPage(Key,Start,End) 26 | 27 | 参数说明: 28 | 29 | 30 | * key:贴吧关键词 31 | 32 | 需要访问的贴吧关键词,譬如:'国际米兰'。 33 | 34 | * Start:开始页数 35 | 36 | 默认为1,不能为负。 37 | 38 | * End:结束页数 39 | 40 | 默认为3,不能为负。 41 | 42 | 该函数返回Json,返回形式为: 43 | 44 | * key:贴吧名称 45 | * Page: 46 | * X:贴吧当前页数 47 | * Id:帖子ID 48 | * Title:帖子标题 49 | * Reply:帖子回复数 50 | * Author:帖子作者 51 | * Time:最后回复时间 52 | 53 | 54 |
55 | 56 | TiebaApiUtil.GetPage(key='国际米兰',Start=1,End=3) 57 | 如下: 58 |
59 | 60 | 61 | 62 | { 63 | 64 | "key":"国际米兰", 65 | "Page":[ 66 | { 67 | "1":[ 68 | { 69 | "Id":"5793707221", 70 | "Title":"官方:埃德尔转会江苏苏宁", 71 | "Reply":"102", 72 | "Author":"树欲动而风又止", 73 | "Time":"23:34" 74 | }, 75 | { 76 | "Id":"5793703055", 77 | "Title":"国际米兰新闻晚报,7月13日", 78 | "Reply":"254", 79 | "Author":"wyp861025", 80 | "Time":"23:44" 81 | } 82 | ] 83 | }, 84 | { 85 | "2":[ 86 | { 87 | "Id":"5793707221", 88 | "Title":"官方:埃德尔转会江苏苏宁", 89 | "Reply":"102", 90 | "Author":"树欲动而风又止", 91 | "Time":"23:34" 92 | }, 93 | { 94 | "Id":"5793703055", 95 | "Title":"国际米兰新闻晚报,7月13日", 96 | "Reply":"254", 97 | "Author":"wyp861025", 98 | "Time":"23:44" 99 | } 100 | ] 101 | } 102 | ] 103 | 104 | } 105 | 106 | 107 | ### GetTiebaOne 108 | 109 | 使用方法: 110 | 111 | TiebaApiUtil.GetTiebaOne(ID) 112 | 参数说明: 113 | 114 | * Id:帖子唯一ID 115 | 116 | 该函数返回Json,返回形式为: 117 | 118 | * Text:内容 119 | * Author:用户 120 | * Time:时间 121 | * FloorInFloor:楼中楼 122 | * Text:内容 123 | * Author:用户 124 | * Time:时间 125 | 126 |
127 | 128 | 129 | TiebaApiUtil.GetTiebaOne(5789990094) 130 | 131 |
132 | 133 | 134 | { 135 | "Text":"这么容易就爆照的,不是抠脚就是快餐
", 136 | "Author":"一涵呦", 137 | "Time":"7-10 20:56", 138 | "FloorInFloor":[ 139 | { 140 | "Text":"快餐是啥", 141 | "Author":"可爱的Hjkjbb", 142 | "Time":"14:25" 143 | }, 144 | { 145 | "Text":"回复 言清欢🍒🔯🔯 :快餐就是我们平常点的外卖。", 146 | "Author":"李坤铭12", 147 | "Time":"15:16" 148 | }, 149 | { 150 | "Text":"回复 言清欢🍒🔯🔯 :打一次就走的,不过夜的", 151 | "Author":"啦啦队长15", 152 | "Time":"15:24" 153 | }, 154 | { 155 | "Text":"回复 米破是张小恒 :你小弟弟就这么粗。", 156 | "Author":"让我鸡儿放会假", 157 | "Time":"16:47" 158 | }, 159 | { 160 | "Text":"卖茶叶的", 161 | "Author":"天生的she手", 162 | "Time":"20:12" 163 | } 164 | ] 165 | } 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /myitchat.py: -------------------------------------------------------------------------------- 1 | import itchat 2 | import requests 3 | import tieba 4 | import threading 5 | import json 6 | import platform 7 | 8 | KEY = 'xxxxxxx' 9 | 10 | T = 0 11 | 12 | def get_response(msg, UserId): 13 | # 这里我们就像在“3. 实现最简单的与图灵机器人的交互”中做的一样 14 | # 构造了要发送给服务器的数据 15 | apiUrl = 'http://openapi.tuling123.com/openapi/api/v2' 16 | 17 | 18 | data = { 19 | "reqType": 0, 20 | "perception": { 21 | "inputText": { 22 | "text": msg 23 | }, 24 | "inputImage": { 25 | "url": "" 26 | }, 27 | "selfInfo": { 28 | "location": { 29 | "city": "", 30 | "province": "", 31 | "street": "" 32 | } 33 | } 34 | }, 35 | "userInfo": { 36 | "apiKey": "42afd1a6112f4a93bbaa83022d980132", 37 | "userId": str(UserId)[1:33] 38 | } 39 | } 40 | 41 | print(data) 42 | try: 43 | r = requests.post(apiUrl, data=json.dumps(data)).json() 44 | # 字典的get方法在字典没有'text'值的时候会返回None而不会抛出异常 45 | 46 | r = r['results'] 47 | r = r[0] 48 | r = r['values'] 49 | return r['text'] 50 | # 为了防止服务器没有正常响应导致程序异常退出,这里用try-except捕获了异常 51 | # 如果服务器没能正常交互(返回非json或无法连接),那么就会进入下面的return 52 | except: 53 | # 将会返回一个None 54 | return 55 | 56 | 57 | id = '' 58 | 59 | 60 | @itchat.msg_register(itchat.content.TEXT) 61 | def print_content(msg): 62 | # print(msg) 63 | global id 64 | print('id = |'+ 
id) 65 | print(msg['Text'] == '修改配置') 66 | print(id == msg['FromUserName']) 67 | if msg['Text'] == '开启监控' and (id == '' or id ==None ): 68 | 69 | # 引用全局变量 70 | id = msg['FromUserName'] 71 | itchat.send_msg('已经开启监控~', toUserName=id) 72 | itchat.send_msg(tieba.setting(), toUserName=id) 73 | return 74 | 75 | if msg['Text'] == '修改配置' and id == msg['FromUserName']: 76 | Setting = tieba.GetSetting() 77 | a = { 78 | '监控贴吧列表':Setting[0], 79 | '监控关键词':Setting[1], 80 | '监控周期(S)':Setting[2], 81 | '开始页数':Setting[3], 82 | '结束页数': Setting[4] 83 | } 84 | itchat.send_msg('修改以下列信息,并且将修改后的信息复制发送', toUserName=id) 85 | 86 | itchat.send_msg(str(a), toUserName=id) 87 | return 88 | 89 | if '监控贴吧列表' in msg['Text']: 90 | global T 91 | T = 1 92 | 93 | son = tieba.SetSetting(eval(msg['Text'])) 94 | return son 95 | 96 | 97 | 98 | 99 | 100 | # 这次对接收信息做一次判断 101 | sentence = msg['Text'] 102 | 103 | # 如果用户发送的是YYF则执行刷任务 104 | print(msg['FromUserName']) 105 | 106 | 107 | 108 | return get_response(msg['Text'],msg['FromUserName']) 109 | 110 | def Main(): 111 | global T 112 | i = 1 113 | while 1: 114 | 115 | C = tieba.Main() 116 | print('This is myitchat: '+ str(C) ) 117 | print(T) 118 | 119 | if C == None or C == [] or C == ' ' or len(C) > 3452: 120 | continue 121 | if T == 1: 122 | T = 0 123 | print('改变了T: '+ str(T)) 124 | 125 | if T ==0 and i != 0 : 126 | itchat.send_msg('监控到更新的数据 \n \n'+str(C),toUserName=id) 127 | i += 1 128 | 129 | 130 | 131 | 132 | 133 | # 识别系统 134 | sysstr = platform.system() 135 | 136 | if(sysstr =="Windows"): 137 | itchat.auto_login(hotReload=True) 138 | elif sysstr == "Linux": 139 | itchat.auto_login(enableCmdQR=2) 140 | 141 | 142 | # blockThread=False 启用解除block 143 | itchat.run(blockThread=False) 144 | tieba.ini() 145 | tie = threading.Thread(target=Main()) 146 | tie.start() 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /myitchat/myitchat.py: 
# -*- coding: utf-8 -*-
"""WeChat front end for the Tieba keyword monitor.

Logs in to WeChat through itchat, answers ordinary text messages with the
Tuling chatbot, lets the registered user inspect and change the monitor
settings, and forwards whatever ``tieba.Main()`` returns to that user.
"""
import ast
import json
import platform
import threading

import itchat
import requests

import tieba

# Placeholder kept for backward compatibility. get_response() sends the key
# embedded in its own payload, not this value.
KEY = 'xxxxxxx'

# Set to 1 by print_content() after a settings update; Main() resets it to 0.
T = 0

# WeChat UserName of the user who sent '开启监控'; stays '' until someone does.
# NOTE: this shadows the builtin id(); the name is kept because it is a
# module-level global that other code may read.
id = ''


def get_response(msg, UserId):
    """Ask the Tuling chatbot for a reply to ``msg``.

    Args:
        msg: Text to send to the bot.
        UserId: Sender identifier; characters 1..32 of its string form are
            sent as the Tuling ``userId``.

    Returns:
        The reply text, or ``None`` when the request fails or the response
        does not have the expected ``results[0]['values']['text']`` shape.
    """
    apiUrl = 'http://openapi.tuling123.com/openapi/api/v2'

    data = {
        "reqType": 0,
        "perception": {
            "inputText": {
                "text": msg
            },
            "inputImage": {
                "url": ""
            },
            "selfInfo": {
                "location": {
                    "city": "",
                    "province": "",
                    "street": ""
                }
            }
        },
        "userInfo": {
            "apiKey": "42afd1a6112f4a93bbaa83022d980132",
            "userId": str(UserId)[1:33]
        }
    }

    print(data)
    try:
        # A timeout keeps a stalled server from blocking the message handler
        # forever.
        r = requests.post(apiUrl, data=json.dumps(data), timeout=10).json()
        return r['results'][0]['values']['text']
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network failure, non-JSON body, or an unexpected response shape:
        # give no reply rather than crash the handler.
        return None


@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Handle one incoming WeChat text message.

    Commands:
        '开启监控'      registers the sender as the monitoring user (only while
                        nobody is registered) and sends the current settings.
        '修改配置'      sends the registered user an editable settings template.
        text containing '监控贴吧列表'
                        is parsed as the edited template and applied through
                        ``tieba.SetSetting``; accepted from the registered
                        user only.

    Anything else is answered by the chatbot.

    Returns:
        The text itchat should send back to the sender, or ``None`` for no
        reply.
    """
    global id, T
    text = msg['Text']
    sender = msg['FromUserName']

    print('id = |' + id)
    print(text == '修改配置')
    print(id == sender)

    if text == '开启监控' and not id:
        id = sender
        itchat.send_msg('已经开启监控~', toUserName=id)
        itchat.send_msg(tieba.setting(), toUserName=id)
        return None

    if text == '修改配置' and id == sender:
        Setting = tieba.GetSetting()
        a = {
            '监控贴吧列表': Setting[0],
            '监控关键词': Setting[1],
            '监控周期(S)': Setting[2],
            '开始页数': Setting[3],
            '结束页数': Setting[4]
        }
        itchat.send_msg('修改以下列信息,并且将修改后的信息复制发送', toUserName=id)
        itchat.send_msg(str(a), toUserName=id)
        return None

    if '监控贴吧列表' in text and id == sender:
        # SECURITY: the original ran eval() on the raw message text of any
        # sender, which is arbitrary code execution. The template is a plain
        # dict literal, so ast.literal_eval is sufficient, and only the
        # registered user may change the settings.
        try:
            new_settings = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            new_settings = None
        if not isinstance(new_settings, dict):
            return '配置格式错误,请复制模板修改后重新发送'
        T = 1
        return tieba.SetSetting(new_settings)

    print(sender)
    return get_response(text, sender)


def Main():
    """Poll ``tieba.Main()`` forever and forward each result over WeChat.

    Results that are empty, or whose length exceeds 3452, are skipped.
    """
    global T
    while True:
        C = tieba.Main()
        print('This is myitchat: ' + str(C))
        print(T)

        if C is None or C == [] or C == ' ' or len(C) > 3452:
            continue

        if T == 1:
            # Settings were changed since the last round; clear the flag.
            T = 0
            print('改变了T: ' + str(T))

        if T == 0:
            itchat.send_msg('监控到更新的数据 \n \n' + str(C), toUserName=id)


# Pick the login mode by operating system.
sysstr = platform.system()

if sysstr == "Windows":
    itchat.auto_login(hotReload=True)
elif sysstr == "Linux":
    itchat.auto_login(enableCmdQR=2)
else:
    # Other systems previously skipped login entirely; fall back to the
    # default QR login so the calls below have a session to work with.
    itchat.auto_login()

# blockThread=False: itchat listens in the background instead of blocking here.
itchat.run(blockThread=False)
tieba.ini()
# Pass the function itself. The original wrote target=Main(), which ran the
# endless loop right here and never created the thread.
tie = threading.Thread(target=Main)
tie.start()
| 53 | # 关键词 54 | 55 | Essential = C罗,尤文,梅西,卡卡,透 56 | 57 | 58 | 59 | --- 60 | ### GetId 61 | 62 | 63 | #### 调用方法 64 | 65 | 66 | tieba.GetId() 67 | 68 | #### 方法说明 69 | 70 | 该方法获取配置文件信息,获取贴吧列表调用TiebaApiUtil.GetPage()方法。 71 | 72 | 73 | #### 返回形式: 74 | 75 | ![](http://cdn.lunatic.wang/tieba6) 76 | 77 | 78 | 79 | 80 | 81 | --- 82 | 83 | ### GetText 84 | 85 | #### 使用方法: 86 | 87 | tieba.GetText(list) 88 | 89 | #### 参数说明: 90 | 91 | list:获取贴吧页数标题列表,以及比较相关关键词信息 92 | 93 | #### 方法说明 94 | 95 | 通过调用OneToOne()比较关键词,将相关数据存入Save中。 96 | 97 | #### 返回形式 98 | 99 | 100 | 该方法没有返回值。 101 | 102 | 103 | --- 104 | ### OneToOne 105 | 106 | #### 使用方法: 107 | 108 | tieba.OneToOne(Text) 109 | 110 | #### 参数说明: 111 | 112 | Text:用户回复内容 113 | 114 | #### 方法说明: 115 | 116 | 将内容与关键词进行比较 117 | 118 | #### 返回形式 119 | 120 | 该方法返回布尔值。 121 | 122 | - True 123 | - 比对成功 124 | - False 125 | - 比对失败 126 | 127 | 128 | --- 129 | 130 | ### ComparisonDict 131 | 132 | #### 使用方法: 133 | 不推荐单独使用 134 | 135 | #### 参数说明: 136 | 无 137 | 138 | #### 方法说明: 139 | 该方法用以Save与OldSave进行对比从而达到监控目的 140 | 141 | #### 返回形式 142 | 143 | 重新生成新list —> NewList 144 | 145 | #### NewList 146 | 147 | 对比差异及结果 148 | 149 | --- 150 | ### ini 151 | 152 | #### 使用方法: 153 | tieba.ini() 154 | 155 | #### 参数说明: 156 | 无 157 | 158 | #### 方法说明: 159 | 该方法用于程序第一次启动时的初始化,在程序未启动或者未生成数据文件时先行调用该方法。 160 | 161 | #### 返回形式 162 | 无 163 | 164 | --- 165 | ### Main 166 | 167 | #### 使用方法: 168 | 169 | tieba.Main() 170 | 171 | #### 参数说明 172 | 无 173 | 174 | #### 方法说明: 175 | 176 | 该方法为监控主程序,当程序完成初始化后进行监控 177 | 178 | #### 返回形式: 179 | 180 | NewList ——> 该list存放监控数据变化 181 | 182 | 183 | 184 | 185 | --- 186 | 187 | ### setting() 188 | 189 | #### 使用方法: 190 | 191 | tieba.setting() 192 | 193 | #### 参数说明 194 | 无 195 | 196 | #### 方法说明: 197 | 198 | 该方法查看TiebaSetting.conf配置内容 199 | 200 | #### 返回形式: 201 | 202 | String 字符串形式: 203 | 204 | ![](http://cdn.lunatic.wang/tieba7.jpg) 205 | 206 | --- 207 | 208 | ### GetSetting() 209 | 210 | #### 使用方法: 211 | 212 | tieba.GetSetting() 213 | 214 | #### 参数说明 
215 | 无 216 | 217 | #### 方法说明: 218 | 219 | 该方法查看TiebaSetting.conf配置内容,以list形式返回 220 | 221 | #### 返回形式: 222 | 223 | ![](http://cdn.lunatic.wang/tieba8.jpg) 224 | 225 | --- 226 | ### SetSetting(dict) 227 | 228 | #### 使用方法: 229 | 230 | tieba.GetSetting() 231 | 232 | #### 参数说明 233 | dict : 传入修改的配置文件信息 234 | 235 | #### 方法说明: 236 | 237 | 该方法动态修改配置文件信息,用户在程序运行中改变关键词等配置信 238 | 239 | #### 返回形式: 240 | 241 | 242 | ![](http://cdn.lunatic.wang/tieba7.jpg) 243 | 244 | --- 245 | ### dict_string(dict) 246 | 247 | #### 使用方法: 248 | 249 | tieba.dict_string(dict) 250 | 251 | #### 参数说明 252 | dict :适配关键词的Text等相关dict信息 253 | 254 | #### 方法说明: 255 | 256 | 该方法用于去除适配的Text内容中去除HTML信息的图片标签以及其他影响阅读的标签 257 | 258 | #### 返回形式: 259 | 260 | > 监控到更新的数据 261 | > 262 | > 帖子地址: https://tieba.baidu.com/p/5660167836 263 | 264 | > 265 | > 266 | > 帖子地址: https://tieba.baidu.com/p/5806639338 267 | > 二硕影迷(7-23 19:24):回复 啦啦啦哈哈撒 269 | > 卿卿且苧(11:23):回复 二硕影迷 271 | > 272 | > 帖子地址: https://tieba.baidu.com/p/5805131589 273 | > 萝卜森兔耳德(7-22 07:42):现在马上要录取了,突然想到一个很害怕的情况!我是第二批次,十个学校知道必须专业服从调剂于是我先把那十个勾给打上了然后再写的学校和代码,问一下提交的时候我服从的调剂勾会不会没有啊感觉好吓人啊! 274 | > 275 | > 276 | > 帖子地址: https://tieba.baidu.com/p/5803120565 277 | > GGXHTML(7-20 16:37):山东影制的录取 出来啦吗 279 | > a37203050(7-22 05:45):回复 GGXHTML 281 | > 神奇的兔酱(7-22 08:33):回复 a37203050 :没有出,10个计划山东,最低录取到411分。 应该会追加计划 282 | > 283 | > GGXHTML(7-22 09:43):这个学习的录取线大概什么时候出啊? 284 | > 285 | > 286 | > 帖子地址: https://tieba.baidu.com/p/5808492570 287 | > 熏熏暖风(22:40):本来江夏区算半个郊区,现在还没有地铁只有3种公交可以出去,不过地铁通了就方便了,地铁口离东湖还是有一些距离的,宿舍环境装修了,被称为酒店级宿舍,图书馆还是不错的,市区的话,江夏区的市区大概半个多小时,不过没什么玩的买买一些东西是可以的,洪山区那边公交就50分钟到街道口那边,之后去哪里有地铁就各种方便了,就是放假的时候公交人很多 288 | > 289 | > 290 | > 帖子地址: https://tieba.baidu.com/p/5807674831 291 | > zzhxiannv(23:04):不知道什么预录变成录取 292 | > 293 | > 294 | > 帖子地址: https://tieba.baidu.com/p/5806260556 295 | > 哈欠女神(7-23 10:49):最低528!!(2017年这所学校在海南录取最低分) 296 | > 297 | > shine包仔(7-23 14:07):回复 泌夫人(.*?).*?回([0-9]\d*)\s(.*?)\s(.*?)

',re.S) 90 | items = re.findall(pattern, str(x)) 91 | # print(items) 92 | # 这里对标题进行一次修正,防止出现 '1.\xa0' 情况 93 | if c < 10: 94 | Title = items[0][1][3:] 95 | else: 96 | Title = items[0][1][4:] 97 | 98 | c += 1 99 | # 创建一个Dict 100 | 101 | Son = { 102 | 'Id':items[0][0], 103 | 'Title':Title, 104 | 'Reply':items[0][2], 105 | 'Author':items[0][3], 106 | 'Time':items[0][4] 107 | } 108 | SouList.append(Son) 109 | 110 | Page = { 111 | str(int((i + 20) / 20)): SouList 112 | } 113 | 114 | 115 | # 这里需要清空list 116 | """ 117 | 引入copy 118 | 在python 赋值是引入A=B,当清空或者销毁前者B被赋值的变量时,出现复制后的变量A为空,所以这里我们采用copy包中的deepcopy方法而不是copy方法。 119 | 120 | 参考:https://www.cnblogs.com/koliverpool/p/6791579.html 121 | """ 122 | 123 | SuperList.append(copy.deepcopy(Page)) 124 | SouList.clear() 125 | if not '下一页' in GetPageID.text: 126 | break 127 | ReturnJson['Page'] = SuperList 128 | 129 | 130 | 131 | # 添加 ensure_ascii=False 防止中文乱码 132 | Result = json.dumps(ReturnJson,ensure_ascii=False) 133 | 134 | return Result 135 | 136 | """ 137 | 获取单个页数,该函数只于贴子ID有关 138 | 139 | http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=4552337163&new_word=&pinf=1_2_0&pn=60&lp=6021 140 | 关键词是 141 | KZ:帖子ID 142 | pn:(0,30,60,90) 143 | (1,2,3,4) 144 | 145 | """ 146 | def GetTiebaOne(ID): 147 | """ 148 | json 格式 149 | { 150 | 'Text':'balabalabala' 151 | 'Author':'123' 152 | 'FloorInFloor':{ 153 | { 154 | "Text": "balabalabala ", 155 | "Author": "123", 156 | "Time": "uacpayhs 09:34" 157 | },{ 158 | "Text": "回复 ", 159 | "Author": "极限rabbit", 160 | "Time": "uacpayhs 09:34" 161 | }, 162 | } 163 | 164 | } 165 | """ 166 | # 先获取帖子第一页以及帖子回复数 167 | 168 | page = 0 169 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=' 170 | url2 = '&new_word=&pinf=1_2_0&pn='+ str(page) 171 | url3 = '&lp=6021' 172 | url = url1+str(ID)+url2+url3 173 | # print(url) 174 | # 于前处理 175 | time.sleep(0.01) 
176 | GetContent = requests.get(url=url,headers=headers) 177 | # Soup = BeautifulSoup(GetContent.text,'lxml') 178 | 179 | # 获取页数 180 | # SumPage = Soup.select_one('div.h > input[type="text"]').attrs['value'] 181 | 182 | # -------------------------------------------- 183 | # print(str(page)) 184 | Soup = BeautifulSoup(GetContent.text, 'lxml') 185 | findall = Soup.select('div.i') 186 | FatherList = [] 187 | SonDict = {} 188 | 189 | # 异常以及不存在的帖子说明: 190 | 191 | if '您要浏览的贴子不存在' in GetContent.text: 192 | ReturnJ = { 193 | 'Error' : 'Error' 194 | } 195 | return json.dumps(ReturnJ) 196 | # 页数增加 197 | for page in range(0,16122330,30): 198 | # 这里要对页数循环 199 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=' 200 | url2 = '&new_word=&pinf=1_2_0&pn=' + str(page) 201 | url3 = '&lp=6021' 202 | url = url1 + str(ID) + url2 + url3 203 | # print(url) 204 | time.sleep(0.01) 205 | GetContent = requests.get(url=url,headers=headers) 206 | 207 | Soup = BeautifulSoup(GetContent.text, 'lxml') 208 | findall = Soup.select('div.i') 209 | 210 | for OneContent,count in zip(findall,range(1,999)): 211 | 212 | 213 | if count == 1 and page == 0: 214 | 215 | pattern = re.compile('class="i">1楼.\s(.*?).*?(.*?).*?class="b">(.*?)\d*楼.\s(.*?)
.*?(.*?).*?class="b">(.*?).*?href="(.*?)">回复(.*?)', re.S) 228 | items = re.findall(pattern, str(OneContent)) 229 | 230 | if items == [] or items == None or items == '': 231 | continue 232 | Text = items[0][0] 233 | Author = items[0][1] 234 | Time = items[0][2] 235 | 236 | 237 | Floor = items[0][4][1:-1] 238 | # print(items) 239 | FloorInFloor = [] 240 | if not (Floor == '' or Floor == None): 241 | # print(items[0][2]) 242 | FloorInFloor = GetFloorInFloor(url=items[0][3]) 243 | SonDict['Text'] = Text 244 | SonDict['Author'] = items[0][1] 245 | SonDict['Time'] = Time 246 | SonDict['FloorInFloor'] = FloorInFloor 247 | FatherList.append(copy.deepcopy(SonDict)) 248 | FloorInFloor.clear() 249 | SonDict.clear() 250 | 251 | # 这里不能直接判断下一页 252 | 253 | if not '下一页' in GetContent.text: 254 | break 255 | 256 | FatherListJson = json.dumps(FatherList,ensure_ascii=False) 257 | return FatherListJson 258 | #获取楼中楼 259 | 260 | def GetFloorInFloor(url): 261 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/' 262 | 263 | # 这里要做一次替换,因为html的&是&,做一次替换 264 | 265 | # 参数内置默认99 266 | url2 = url1 + url.replace('&','&') +'&fpn=' 267 | 268 | 269 | 270 | # 读取楼中楼信息 271 | 272 | ReturnList = [] 273 | 274 | for pn in range(1,100): 275 | time.sleep(0.01) 276 | web = requests.get(url=url2+str(pn),headers=headers) 277 | Soup = BeautifulSoup(web.text,'lxml') 278 | findall = Soup.select('div.i') 279 | for i in findall: 280 | 281 | pattern = re.compile( 282 | '
(.*?)
(.*?).*?(.*?)', 283 | re.S) 284 | items = re.findall(pattern, str(i)) 285 | Son = { 286 | 'Text':items[0][0], 287 | 'Author':items[0][1], 288 | 'Time':items[0][2] 289 | } 290 | ReturnList.append(Son) 291 | 292 | # 获取楼中楼页数 293 | if not '下一页' in web.text: 294 | break 295 | return ReturnList 296 | 297 | 298 | 299 | 300 | 301 | 302 | -------------------------------------------------------------------------------- /ApiRedme/TiebaApiUtil.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | import copy 7 | import time 8 | 9 | # 自制贴吧api 10 | """ 11 | 该api依赖百度贴吧web版 12 | 13 | 贴吧api作为工具类,可制作第三方客户端,极速/个性/无广告 14 | """ 15 | 16 | 17 | """ 18 | 模拟三星手机访问 19 | 虽然不加headers也可正常访问,但是我们还是要严谨 20 | """ 21 | 22 | headers = { 23 | 'Host': 'tieba.baidu.com', 24 | 'Connection': 'keep-alive', 25 | 'Cache-Control': 'max-age=0', 26 | 'Upgrade-Insecure-Requests': '1', 27 | 'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36', 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 29 | 'Accept-Encoding': 'gzip, deflate', 30 | 'Accept-Language': 'zh-CN,zh;q=0.9' 31 | } 32 | 33 | # 获取某个贴吧页面帖子 34 | 35 | """ 36 | url = http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw=%E6%9E%97%E4%BF%8A%E6%9D%B0&lp=5011&lm=&pn=0 37 | kw:贴吧关键词 38 | pn=页数(0,20,40,60...)页数 39 | (1,2,3,4) 40 | """ 41 | 42 | """ 43 | pnf=1 起始页 默认为第一页 Start 44 | pne=3 结束页 默认为第三页 End 45 | """ 46 | def GetPage(key,Start=1,End=3): 47 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kw=' 48 | url2 = '&lp=5011&lm=&pn=' 49 | 50 | Start = Start*20-20 51 | # 防止出现输入0的情况 52 | if Start == -20: 53 | 
Start = 0 54 | 55 | End = End*20 56 | 57 | 58 | ReturnJson = { 59 | 'key' : key 60 | } 61 | SuperList = [] 62 | for i in range(Start,End,20): 63 | 64 | url = url1 + key + url2 + str(i) 65 | 66 | # print(url) 67 | time.sleep(0.01) 68 | GetPageID = requests.get(url = url,headers=headers) 69 | if '欢迎创建本吧,与今后来到这里的朋友交流讨论' in GetPageID.text: 70 | Error = { 71 | 'Error':'改吧尚未建立' 72 | } 73 | return json.dumps(Error,ensure_ascii=False) 74 | Soup = BeautifulSoup(GetPageID.text,'lxml') 75 | find = Soup.select('div.i') 76 | 77 | # 计数 调整标题 78 | c = 1 79 | 80 | ''' 81 | 这里需要添加一个list,这样一页就在一个list中 82 | 83 | ''' 84 | SouList = [] 85 | 86 | for x in find: 87 | 88 | # print(x) 89 | pattern = re.compile('kz=(.*?)&.*?">(.*?).*?回([0-9]\d*)\s(.*?)\s(.*?)

',re.S) 90 | items = re.findall(pattern, str(x)) 91 | # print(items) 92 | # 这里对标题进行一次修正,防止出现 '1.\xa0' 情况 93 | if c < 10: 94 | Title = items[0][1][3:] 95 | else: 96 | Title = items[0][1][4:] 97 | 98 | c += 1 99 | # 创建一个Dict 100 | 101 | Son = { 102 | 'Id':items[0][0], 103 | 'Title':Title, 104 | 'Reply':items[0][2], 105 | 'Author':items[0][3], 106 | 'Time':items[0][4] 107 | } 108 | SouList.append(Son) 109 | 110 | Page = { 111 | str(int((i + 20) / 20)): SouList 112 | } 113 | 114 | 115 | # 这里需要清空list 116 | """ 117 | 引入copy 118 | 在python 赋值是引入A=B,当清空或者销毁前者B被赋值的变量时,出现复制后的变量A为空,所以这里我们采用copy包中的deepcopy方法而不是copy方法。 119 | 120 | 参考:https://www.cnblogs.com/koliverpool/p/6791579.html 121 | """ 122 | 123 | SuperList.append(copy.deepcopy(Page)) 124 | SouList.clear() 125 | if not '下一页' in GetPageID.text: 126 | break 127 | ReturnJson['Page'] = SuperList 128 | 129 | 130 | 131 | # 添加 ensure_ascii=False 防止中文乱码 132 | Result = json.dumps(ReturnJson,ensure_ascii=False) 133 | 134 | return Result 135 | 136 | """ 137 | 获取单个页数,该函数只于贴子ID有关 138 | 139 | http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=4552337163&new_word=&pinf=1_2_0&pn=60&lp=6021 140 | 关键词是 141 | KZ:帖子ID 142 | pn:(0,30,60,90) 143 | (1,2,3,4) 144 | 145 | """ 146 | def GetTiebaOne(ID): 147 | """ 148 | json 格式 149 | { 150 | 'Text':'balabalabala' 151 | 'Author':'123' 152 | 'FloorInFloor':{ 153 | { 154 | "Text": "balabalabala ", 155 | "Author": "123", 156 | "Time": "uacpayhs 09:34" 157 | },{ 158 | "Text": "回复 ", 159 | "Author": "极限rabbit", 160 | "Time": "uacpayhs 09:34" 161 | }, 162 | } 163 | 164 | } 165 | """ 166 | # 先获取帖子第一页以及帖子回复数 167 | 168 | page = 0 169 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=' 170 | url2 = '&new_word=&pinf=1_2_0&pn='+ str(page) 171 | url3 = '&lp=6021' 172 | url = url1+str(ID)+url2+url3 173 | # print(url) 174 | # 于前处理 175 | time.sleep(0.01) 
176 | GetContent = requests.get(url=url,headers=headers) 177 | # Soup = BeautifulSoup(GetContent.text,'lxml') 178 | 179 | # 获取页数 180 | # SumPage = Soup.select_one('div.h > input[type="text"]').attrs['value'] 181 | 182 | # -------------------------------------------- 183 | # print(str(page)) 184 | Soup = BeautifulSoup(GetContent.text, 'lxml') 185 | findall = Soup.select('div.i') 186 | FatherList = [] 187 | SonDict = {} 188 | 189 | # 异常以及不存在的帖子说明: 190 | 191 | if '您要浏览的贴子不存在' in GetContent.text: 192 | ReturnJ = { 193 | 'Error' : 'Error' 194 | } 195 | return json.dumps(ReturnJ) 196 | # 页数增加 197 | for page in range(0,16122330,30): 198 | # 这里要对页数循环 199 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/m?kz=' 200 | url2 = '&new_word=&pinf=1_2_0&pn=' + str(page) 201 | url3 = '&lp=6021' 202 | url = url1 + str(ID) + url2 + url3 203 | # print(url) 204 | time.sleep(0.01) 205 | GetContent = requests.get(url=url,headers=headers) 206 | 207 | Soup = BeautifulSoup(GetContent.text, 'lxml') 208 | findall = Soup.select('div.i') 209 | 210 | for OneContent,count in zip(findall,range(1,999)): 211 | 212 | 213 | if count == 1 and page == 0: 214 | 215 | pattern = re.compile('class="i">1楼.\s(.*?)
.*?(.*?).*?class="b">(.*?)\d*楼.\s(.*?)
.*?(.*?).*?class="b">(.*?).*?href="(.*?)">回复(.*?)', re.S) 228 | items = re.findall(pattern, str(OneContent)) 229 | 230 | if items == [] or items == None or items == '': 231 | continue 232 | Text = items[0][0] 233 | Author = items[0][1] 234 | Time = items[0][2] 235 | 236 | 237 | Floor = items[0][4][1:-1] 238 | # print(items) 239 | FloorInFloor = [] 240 | if not (Floor == '' or Floor == None): 241 | # print(items[0][2]) 242 | FloorInFloor = GetFloorInFloor(url=items[0][3]) 243 | SonDict['Text'] = Text 244 | SonDict['Author'] = items[0][1] 245 | SonDict['Time'] = Time 246 | SonDict['FloorInFloor'] = FloorInFloor 247 | FatherList.append(copy.deepcopy(SonDict)) 248 | FloorInFloor.clear() 249 | SonDict.clear() 250 | 251 | # 这里不能直接判断下一页 252 | 253 | if not '下一页' in GetContent.text: 254 | break 255 | 256 | FatherListJson = json.dumps(FatherList,ensure_ascii=False) 257 | return FatherListJson 258 | #获取楼中楼 259 | 260 | def GetFloorInFloor(url): 261 | url1 = 'http://tieba.baidu.com/mo/q---9CC3CD881B0FE2BA30F4559A6AF8A941%3AFG%3D1-sz%40320_240%2C-1-3-0--2--wapp_1531379582221_177/' 262 | 263 | # 这里要做一次替换,因为html的&是&,做一次替换 264 | 265 | # 参数内置默认99 266 | url2 = url1 + url.replace('&','&') +'&fpn=' 267 | 268 | 269 | 270 | # 读取楼中楼信息 271 | 272 | ReturnList = [] 273 | 274 | for pn in range(1,100): 275 | time.sleep(0.01) 276 | web = requests.get(url=url2+str(pn),headers=headers) 277 | Soup = BeautifulSoup(web.text,'lxml') 278 | findall = Soup.select('div.i') 279 | for i in findall: 280 | 281 | pattern = re.compile( 282 | '
(.*?)
(.*?).*?(.*?)', 283 | re.S) 284 | items = re.findall(pattern, str(i)) 285 | Son = { 286 | 'Text':items[0][0], 287 | 'Author':items[0][1], 288 | 'Time':items[0][2] 289 | } 290 | ReturnList.append(Son) 291 | 292 | # 获取楼中楼页数 293 | if not '下一页' in web.text: 294 | break 295 | return ReturnList 296 | 297 | 298 | 299 | 300 | 301 | 302 | -------------------------------------------------------------------------------- /Sentiment/tieba.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | import configparser 5 | import TiebaApiUtil 6 | import copy 7 | import re 8 | 9 | """ 10 | 此处用".conf"进行配置 11 | Python 读取写入配置文件 —— ConfigParser 12 | https://www.cnblogs.com/feeland/p/4514771.html 13 | 14 | config = configparser.ConfigParser() 15 | config.read('test.conf') 16 | db_host = config.get("db", "db_port") 17 | 18 | """ 19 | 20 | 21 | # 参数配置说明 22 | """ 23 | [Setting] 24 | 爬取贴吧 25 | tb:{'国际米兰','linjj','dota','dota2'} 26 | 27 | 爬取默认页数 28 | Start:1 29 | End:4 30 | 31 | [Customize] 32 | 关键词 33 | Essential:{'昨天','今天','明天','后天'} 34 | 35 | """ 36 | 37 | 38 | config = configparser.ConfigParser() 39 | # 编码要设置成utf-8-sig而并不是utf-8 40 | config.read('TiebaSetting.conf', encoding='utf-8-sig') 41 | key = config.get("Setting", "tb") 42 | keyList = key.split(',') 43 | Essential = config.get("Customize", "Essential") 44 | EssentialList = Essential.split(',') 45 | 46 | # 休息周期 47 | X = config.getint("Setting", "Sleep") 48 | 49 | # 开始页数 50 | Start = config.getint("Setting", "Start") 51 | # 结束页数 52 | End = config.getint("Setting", "End") 53 | 54 | # 信息存取列表 55 | Save = {} 56 | 57 | # 读取文件或前一个爬取列表 58 | 59 | OldSave = {} 60 | 61 | # 新出现的帖子 62 | 63 | NewList = [] 64 | 65 | 66 | # 获取ID 67 | def GetId(): 68 | IdList = [] 69 | for name in keyList: 70 | test = TiebaApiUtil.GetPage(key=name,Start=Start,End=End) 71 | test1 = eval(test) 72 | # print(test1) 73 | for i in test1['Page']: 74 | for x in i.values(): 75 | for j in x: 76 | 
IdList.append(j['Id']) 77 | # print(IdList.__len__()) 78 | return IdList 79 | 80 | 81 | # 存取相关关键词详细信息 82 | 83 | OldKeyText = {} 84 | KeyText = {} 85 | 86 | ''' 87 | { 88 | 123123123 : [ 89 | { 90 | } 91 | ] 92 | 93 | 94 | } 95 | 96 | ''' 97 | 98 | 99 | 100 | # 获取TEXT 以及对比 101 | def GetText(list): 102 | 103 | for id in list: 104 | 105 | # 标志 为0则代表无关键词,为1则有关键词 106 | T = 0 107 | 108 | # 存放 某个ID的所有关键回复楼 109 | KeyList = [] 110 | 111 | Text = TiebaApiUtil.GetTiebaOne(id) 112 | 113 | test1 = eval(Text) 114 | # print(str(test1)) 115 | 116 | for i in test1: 117 | # 检测回复是否有关键词 118 | try: 119 | if OneToOne(i['Text']): 120 | # 存放关键词检索 121 | T = 1 122 | key = { 123 | 'Author': i['Author'], 124 | 'Text':i['Text'], 125 | 'Time':i['Time'] 126 | } 127 | 128 | KeyList.append(key) 129 | 130 | # ID 在Save中 131 | 132 | if str(id) in Save: 133 | # print('旧ID') 134 | Save[str(id)] += 1 135 | # ID 不在Save中 136 | else: 137 | # print('出现新ID') 138 | Save[str(id)] = 1 139 | # 检测楼中楼是否存在 140 | if not (i['FloorInFloor'] == '' or i['FloorInFloor'] == None or i['FloorInFloor'] == []): 141 | 142 | # 存在楼中楼则遍历 143 | for f in i['FloorInFloor']: 144 | 145 | if OneToOne(f['Text']): 146 | 147 | # 存放关键词检索 148 | T = 1 149 | key = { 150 | 'Author': f['Author'], 151 | 'Text': f['Text'], 152 | 'Time': f['Time'] 153 | } 154 | 155 | KeyList.append(key) 156 | 157 | if str(id) in Save: 158 | # print('旧楼中楼ID') 159 | Save[str(id)] += 1 160 | else: 161 | # print('新楼中楼ID') 162 | Save[str(id)] = 1 163 | except TypeError: 164 | pass 165 | if T == 1: 166 | T = 0 167 | KeyText[str(id)] = copy.deepcopy(KeyList) 168 | KeyList.clear() 169 | 170 | 171 | 172 | 173 | def OneToOne(Text): 174 | for i in EssentialList: 175 | if i in Text: 176 | return True 177 | 178 | 179 | ''' 180 | 181 | # 信息存取列表 182 | Save = {} 183 | 184 | # 读取文件或前一个爬取列表 185 | 186 | NewSave = {} 187 | 188 | ''' 189 | def ComparisonDict(): 190 | # print('This is OldSave: '+ str(OldSave)) 191 | # print('This is Save: '+ str(Save)) 192 | 193 | for x in Save: 194 | 
for i,y in zip(OldSave,range(1,len(OldSave)+1)): 195 | # print(str(len(OldSave)) + ' y '+ str(y)) 196 | if x == i: 197 | # 值相同 198 | # print('KEY相同') 199 | # print(str(x)) 200 | if not Save[x] == OldSave[i]: 201 | # print('值不相同') 202 | NewList.append(str(i)) 203 | break 204 | # if y == len(OldSave): 205 | # # print('y == len(OldSave)') 206 | # # print(x +" "+i) 207 | if y == len(OldSave) and x != i: 208 | # print('test2222') 209 | NewList.append(str(x)) 210 | 211 | OldSave.clear() 212 | OldSave.update(copy.deepcopy(Save)) 213 | Save.clear() 214 | # 5800836228 3244759899 215 | 216 | # 程序每次完成循环都要存取一次 217 | 218 | # 判断NewList 是否为空 219 | 220 | NewKey = {} 221 | dict = {} 222 | 223 | def ComparisonDictKey(): 224 | # print('This is OldKeyText: ' + str(OldKeyText)) 225 | # print('This is KeyText: ' + str(KeyText)) 226 | 227 | 228 | 229 | for New in KeyText: 230 | list = [] 231 | 232 | 233 | # 判断新的KeyText的ID是不是在OldKeyText中,如果在则对比是否有不同 234 | if New in OldKeyText.keys(): 235 | T = 0 236 | for K in KeyText[New]: 237 | 238 | for O,C in zip(OldKeyText[New],range(1,len(OldKeyText[New])+1)): 239 | if K['Text'] == O['Text'] and K['Author'] == O['Author'] and K['Time'] == O['Time']: 240 | break 241 | if C == len(OldKeyText[New]) and (K['Text'] != O['Text'] or K['Author'] != O['Author'] or K['Time'] != O['Time']): 242 | print(K) 243 | list.append(K) 244 | T = 1 245 | if T == 1: 246 | dict[New] = copy.deepcopy(list) 247 | list.clear() 248 | T = 0 249 | else: 250 | dict[New] = copy.deepcopy(KeyText[New]) 251 | 252 | print('This is Update '+str(dict)) 253 | 254 | OldKeyText.clear() 255 | OldKeyText.update(copy.deepcopy(KeyText)) 256 | KeyText.clear() 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | def FileOpen(): 273 | with open('list.tieba','r',encoding='utf-8') as file: 274 | save = file.read() 275 | OldSave.update(eval(save)) 276 | 277 | with open('key.tieba','r',encoding='utf-8') as file: 278 | oldkeytext = file.read() 279 | 
OldKeyText.update(eval(oldkeytext)) 280 | 281 | def FileSave(): 282 | with open('list.tieba','w',encoding='utf-8') as file: 283 | file.write(str(OldSave)) 284 | 285 | with open('key.tieba','w',encoding='utf-8') as file: 286 | file.write(str(OldKeyText)) 287 | 288 | 289 | 290 | 291 | def ini(): 292 | # 初始化 293 | try: 294 | F = open('list.tieba','r',encoding='utf-8') 295 | F.close() 296 | except OSError : 297 | print('初始化') 298 | GetText(GetId()) 299 | OldSave.update(copy.deepcopy(Save)) 300 | Save.clear() 301 | FileSave() 302 | 303 | return 304 | 305 | 306 | 307 | def Main(): 308 | 309 | 310 | time.sleep(X) 311 | print('开始运行') 312 | # 清空对比函数 313 | NewList.clear() 314 | dict.clear() 315 | # 获取Save 316 | GetText(GetId()) 317 | # 对比Save to OldSave 318 | ComparisonDict() 319 | ComparisonDictKey() 320 | # 保存以防错误 321 | FileSave() 322 | print('This is NewList : ' + str(NewList)) 323 | print(dict) 324 | c = dict_string(dict) 325 | 326 | 327 | 328 | return c 329 | 330 | 331 | 332 | 333 | def setting(): 334 | # print(keyList) 335 | # print(EssentialList) 336 | # print(X) 337 | # print('监控贴吧列表: '+ str(keyList) + '\n' + '监控关键词: '+ str(EssentialList) + '\n'+ '监控周期: ' + str(X) + '\n'+'开始-终止/页数: '+ str(Start)+'-'+str(End)) 338 | return '监控贴吧列表: '+ str(keyList) + '\n' + '监控关键词: '+ str(EssentialList) + '\n'+ '监控周期: ' + str(X) + '\n'+'开始-终止/页数: '+ str(Start)+'-'+str(End) 339 | 340 | def GetSetting(): 341 | return keyList,EssentialList,X,Start,End 342 | ''' 343 | 帖子地址:https://tieba.baidu.com/p/5806299422 344 | 遥远彼方(12:24): 345 | 346 | ''' 347 | 348 | def dict_string(dict): 349 | res = '' 350 | url = 'https://tieba.baidu.com/p/' 351 | for one in dict: 352 | IDurl = url + one 353 | 354 | str1 = '' 355 | for i in dict[one]: 356 | 357 | a = re.compile('< img .*?"/>', re.I) 358 | b = re.compile('', re.I) 359 | c = re.compile('
', re.I) 360 | f = re.compile('', re.I) 361 | d = a.sub('', str(i['Text'])) 362 | d = b.sub('', str(d)) 363 | d = c.sub(' ', str(d)) 364 | d = f.sub(' ', str(d)) 365 | 366 | str1 = str1 + i['Author'] + '('+i['Time']+')'+':'+d + '\n\n' 367 | 368 | res = res + '帖子地址: '+IDurl + '\n' + str1 + '\n' 369 | print(res) 370 | return res 371 | 372 | 373 | 374 | 375 | # 更改配置 376 | 377 | def SetSetting(dict): 378 | global keyList,EssentialList,X,Start,End 379 | 380 | keyList = dict['监控贴吧列表'] 381 | EssentialList = dict['监控关键词'] 382 | X = dict['监控周期(S)'] 383 | Start = dict['开始页数'] 384 | End = dict['结束页数'] 385 | 386 | # 写入文件 387 | config = configparser.ConfigParser() 388 | # 编码要设置成utf-8-sig而并不是utf-8 389 | config.read('TiebaSetting.conf', encoding='utf-8-sig') 390 | 391 | config.set('Setting','Sleep',str(X)) 392 | config.set('Setting', 'Start', str(Start)) 393 | config.set('Setting', 'End', str(End)) 394 | config.set('Setting', 'tb', ','.join(keyList)) 395 | config.set('Customize', 'Essential', ','.join(EssentialList)) 396 | 397 | with open("TiebaSetting.conf", "w+",encoding='utf-8') as f: 398 | config.write(f) 399 | 400 | return '监控贴吧列表: ' + str(keyList) + '\n' + '监控关键词: ' + str(EssentialList) + '\n' + '监控周期: ' + str( 401 | X) + '\n' + '开始-终止/页数: ' + str(Start) + '-' + str(End) 402 | 403 | 404 | -------------------------------------------------------------------------------- /tieba.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | import configparser 5 | import TiebaApiUtil 6 | import copy 7 | import re 8 | import datetime 9 | """ 10 | 此处用".conf"进行配置 11 | Python 读取写入配置文件 —— ConfigParser 12 | https://www.cnblogs.com/feeland/p/4514771.html 13 | 14 | config = configparser.ConfigParser() 15 | config.read('test.conf') 16 | db_host = config.get("db", "db_port") 17 | 18 | """ 19 | 20 | 21 | # 参数配置说明 22 | """ 23 | [Setting] 24 | 爬取贴吧 25 | tb:{'国际米兰','linjj','dota','dota2'} 26 | 27 | 爬取默认页数 28 | Start:1 
29 | End:4 30 | 31 | [Customize] 32 | 关键词 33 | Essential:{'昨天','今天','明天','后天'} 34 | 35 | """ 36 | 37 | 38 | config = configparser.ConfigParser() 39 | # 编码要设置成utf-8-sig而并不是utf-8 40 | config.read('TiebaSetting.conf', encoding='utf-8-sig') 41 | key = config.get("Setting", "tb") 42 | keyList = key.split(',') 43 | Essential = config.get("Customize", "Essential") 44 | EssentialList = Essential.split(',') 45 | Time = config.get("Setting","Time") 46 | # 休息周期 47 | X = config.getint("Setting", "Sleep") 48 | 49 | # 开始页数 50 | Start = config.getint("Setting", "Start") 51 | # 结束页数 52 | End = config.getint("Setting", "End") 53 | 54 | 55 | Year = int(datetime.datetime.now().strftime('%Y-%m-%d').split('-')[0]) 56 | 57 | refer_to = Time.split('-') 58 | 59 | # 信息存取列表 60 | Save = {} 61 | 62 | # 读取文件或前一个爬取列表 63 | 64 | OldSave = {} 65 | 66 | # 新出现的帖子 67 | 68 | NewList = [] 69 | 70 | 71 | # 获取ID 72 | def GetId(): 73 | IdList = [] 74 | for name in keyList: 75 | test = TiebaApiUtil.GetPage(key=name,Start=Start,End=End) 76 | test1 = eval(test) 77 | # print(test1) 78 | for i in test1['Page']: 79 | for x in i.values(): 80 | for j in x: 81 | IdList.append(j['Id']) 82 | # print(IdList.__len__()) 83 | return IdList 84 | 85 | 86 | # 存取相关关键词详细信息 87 | 88 | OldKeyText = {} 89 | KeyText = {} 90 | 91 | 92 | 93 | 94 | 95 | # 获取TEXT 以及对比 96 | def GetText(list): 97 | 98 | for id in list: 99 | 100 | # 标志 为0则代表无关键词,为1则有关键词 101 | T = 0 102 | 103 | # 存放 某个ID的所有关键回复楼 104 | KeyList = [] 105 | 106 | Text = TiebaApiUtil.GetTiebaOne(id) 107 | 108 | test1 = eval(Text) 109 | # print(str(test1)) 110 | 111 | for i in test1: 112 | # 检测回复是否有关键词 113 | try: 114 | if OneToOne(i['Text']): 115 | # 存放关键词检索 116 | T = 1 117 | key = { 118 | 'Author': i['Author'], 119 | 'Text':i['Text'], 120 | 'Time':i['Time'] 121 | } 122 | 123 | # 时间限制 124 | if(TimeLimit(i['Time'])): 125 | print(i['Time']) 126 | KeyList.append(key) 127 | 128 | # ID 在Save中 129 | 130 | if str(id) in Save: 131 | # print('旧ID') 132 | Save[str(id)] += 1 133 | # ID 
不在Save中 134 | else: 135 | # print('出现新ID') 136 | Save[str(id)] = 1 137 | # 检测楼中楼是否存在 138 | if not (i['FloorInFloor'] == '' or i['FloorInFloor'] == None or i['FloorInFloor'] == []): 139 | 140 | # 存在楼中楼则遍历 141 | for f in i['FloorInFloor']: 142 | 143 | if OneToOne(f['Text']): 144 | 145 | # 存放关键词检索 146 | T = 1 147 | key = { 148 | 'Author': f['Author'], 149 | 'Text': f['Text'], 150 | 'Time': f['Time'] 151 | } 152 | if (TimeLimit(i['Time'])): 153 | print(i['Time']) 154 | KeyList.append(key) 155 | 156 | if str(id) in Save: 157 | # print('旧楼中楼ID') 158 | Save[str(id)] += 1 159 | else: 160 | # print('新楼中楼ID') 161 | Save[str(id)] = 1 162 | except TypeError: 163 | pass 164 | if T == 1: 165 | T = 0 166 | KeyText[str(id)] = copy.deepcopy(KeyList) 167 | KeyList.clear() 168 | 169 | 170 | 171 | 172 | def OneToOne(Text): 173 | for i in EssentialList: 174 | if i in Text: 175 | return True 176 | 177 | 178 | ''' 179 | 180 | # 信息存取列表 181 | Save = {} 182 | 183 | # 读取文件或前一个爬取列表 184 | 185 | NewSave = {} 186 | 187 | ''' 188 | def ComparisonDict(): 189 | print('This is OldSave: '+ str(OldSave)) 190 | print('This is Save: '+ str(Save)) 191 | 192 | for x in Save: 193 | for i,y in zip(OldSave,range(1,len(OldSave)+1)): 194 | # print(str(len(OldSave)) + ' y '+ str(y)) 195 | if x == i: 196 | # 值相同 197 | # print('KEY相同') 198 | # print(str(x)) 199 | if not Save[x] == OldSave[i]: 200 | # print('值不相同') 201 | NewList.append(str(i)) 202 | break 203 | # if y == len(OldSave): 204 | # # print('y == len(OldSave)') 205 | # # print(x +" "+i) 206 | if y == len(OldSave) and x != i: 207 | # print('test2222') 208 | NewList.append(str(x)) 209 | print("This NewList :" +str(NewList)) 210 | OldSave.clear() 211 | OldSave.update(copy.deepcopy(Save)) 212 | Save.clear() 213 | # 5800836228 3244759899 214 | 215 | # 程序每次完成循环都要存取一次 216 | 217 | # 判断NewList 是否为空 218 | 219 | NewKey = {} 220 | dict = {} 221 | 222 | def ComparisonDictKey(): 223 | # print('This is OldKeyText: ' + str(OldKeyText)) 224 | # print('This is KeyText: ' 
+ str(KeyText)) 225 | 226 | 227 | 228 | for New in KeyText: 229 | list = [] 230 | 231 | 232 | # 判断新的KeyText的ID是不是在OldKeyText中,如果在则对比是否有不同 233 | if New in OldKeyText.keys(): 234 | T = 0 235 | for K in KeyText[New]: 236 | 237 | for O,C in zip(OldKeyText[New],range(1,len(OldKeyText[New])+1)): 238 | if K['Text'] == O['Text'] and K['Author'] == O['Author'] and K['Time'] == O['Time']: 239 | break 240 | if C == len(OldKeyText[New]) and (K['Text'] != O['Text'] or K['Author'] != O['Author'] or K['Time'] != O['Time']): 241 | print(K) 242 | list.append(K) 243 | T = 1 244 | if T == 1: 245 | dict[New] = copy.deepcopy(list) 246 | list.clear() 247 | T = 0 248 | else: 249 | dict[New] = copy.deepcopy(KeyText[New]) 250 | 251 | print('This is Update '+str(dict)) 252 | 253 | OldKeyText.clear() 254 | OldKeyText.update(copy.deepcopy(KeyText)) 255 | KeyText.clear() 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | def FileOpen(): 272 | with open('list.tieba','r',encoding='utf-8') as file: 273 | save = file.read() 274 | OldSave.update(eval(save)) 275 | 276 | with open('key.tieba','r',encoding='utf-8') as file: 277 | oldkeytext = file.read() 278 | OldKeyText.update(eval(oldkeytext)) 279 | 280 | def FileSave(): 281 | with open('list.tieba','w',encoding='utf-8') as file: 282 | file.write(str(OldSave)) 283 | 284 | with open('key.tieba','w',encoding='utf-8') as file: 285 | file.write(str(OldKeyText)) 286 | 287 | 288 | 289 | 290 | def ini(): 291 | # 初始化 292 | try: 293 | F = open('list.tieba','r',encoding='utf-8') 294 | F.close() 295 | except OSError : 296 | print('初始化') 297 | GetText(GetId()) 298 | OldSave.update(copy.deepcopy(Save)) 299 | Save.clear() 300 | FileSave() 301 | 302 | return 303 | 304 | 305 | 306 | def Main(): 307 | 308 | 309 | time.sleep(X) 310 | print('开始运行') 311 | # 清空对比函数 312 | NewList.clear() 313 | dict.clear() 314 | # 获取Save 315 | GetText(GetId()) 316 | # 对比Save to OldSave 317 | ComparisonDict() 318 | ComparisonDictKey() 319 | # 
保存以防错误 320 | FileSave() 321 | print('This is NewList : ' + str(NewList)) 322 | # print(dict) 323 | c = dict_string(dict) 324 | # print("This is c "+ c) 325 | 326 | 327 | return c 328 | 329 | 330 | 331 | 332 | def setting(): 333 | # print(keyList) 334 | # print(EssentialList) 335 | # print(X) 336 | # print('监控贴吧列表: '+ str(keyList) + '\n' + '监控关键词: '+ str(EssentialList) + '\n'+ '监控周期: ' + str(X) + '\n'+'开始-终止/页数: '+ str(Start)+'-'+str(End)) 337 | return '监控贴吧列表: '+ str(keyList) + '\n' + '监控关键词: '+ str(EssentialList) + '\n'+ '监控周期: ' + str(X) + '\n'+'开始-终止/页数: '+str(Start)+'-'+str(End)+ '\n'+ '监控时间:'+str(Time) 338 | 339 | def GetSetting(): 340 | return keyList,EssentialList,X,Start,End 341 | ''' 342 | 帖子地址:https://tieba.baidu.com/p/5806299422 343 | 遥远彼方(12:24): 344 | 345 | ''' 346 | 347 | def dict_string(dict): 348 | res = '' 349 | url = 'https://tieba.baidu.com/p/' 350 | for one in dict: 351 | IDurl = url + one 352 | 353 | str1 = '' 354 | for i in dict[one]: 355 | 356 | a = re.compile('< img .*?"/>', re.I) 357 | b = re.compile('
', re.I) 358 | c = re.compile('
', re.I) 359 | f = re.compile('', re.I) 360 | d = a.sub('', str(i['Text'])) 361 | d = b.sub('', str(d)) 362 | d = c.sub(' ', str(d)) 363 | d = f.sub(' ', str(d)) 364 | 365 | str1 = str1 + i['Author'] + '('+i['Time']+')'+':'+d + '\n' 366 | 367 | res = res + '帖子地址: '+IDurl + '\n' + str1 + '\n' 368 | # print(res) 369 | return res 370 | 371 | 372 | 373 | 374 | # 更改配置 375 | 376 | def SetSetting(dict): 377 | global keyList,EssentialList,X,Start,End 378 | 379 | keyList = dict['监控贴吧列表'] 380 | EssentialList = dict['监控关键词'] 381 | X = dict['监控周期(S)'] 382 | Start = dict['开始页数'] 383 | End = dict['结束页数'] 384 | 385 | # 写入文件 386 | config = configparser.ConfigParser() 387 | # 编码要设置成utf-8-sig而并不是utf-8 388 | config.read('TiebaSetting.conf', encoding='utf-8-sig') 389 | 390 | config.set('Setting','Sleep',str(X)) 391 | config.set('Setting', 'Start', str(Start)) 392 | config.set('Setting', 'End', str(End)) 393 | config.set('Setting', 'tb', ','.join(keyList)) 394 | config.set('Customize', 'Essential', ','.join(EssentialList)) 395 | 396 | with open("TiebaSetting.conf", "w+",encoding='utf-8') as f: 397 | config.write(f) 398 | 399 | return '监控贴吧列表: ' + str(keyList) + '\n' + '监控关键词: ' + str(EssentialList) + '\n' + '监控周期: ' + str( 400 | X) + '\n' + '开始-终止/页数: ' + str(Start) + '-' + str(End) 401 | 402 | 403 | # 后前...0/1 404 | # eg 2018-7-3-1 405 | # Year Month Day 406 | # 默认 0000-00-00 407 | 408 | def TimeLimit(time): 409 | c = time.split('-') 410 | refer_to[0] = int(refer_to[0]) 411 | refer_to[1] = int(refer_to[1]) 412 | refer_to[2] = int(refer_to[2]) 413 | 414 | if(int(refer_to[-1])==0): # 比较前后 415 | 416 | if(len(c)==2): 417 | yue = int(c[0]) 418 | ri = int(c[1].split(' ')[0]) 419 | 420 | if(Year>=refer_to[0] and yue>=refer_to[1] and ri >=refer_to[2]): 421 | return True 422 | return False 423 | if(len(c)==3): 424 | if(int(c[0])>refer_to[0]): 425 | return True 426 | if(int(c[0])>=refer_to[0] and int(c[1])>=refer_to[1] and int(c[2])>=refer_to[2]): 427 | return True 428 | return False 429 | 
else: 430 | if (len(c) == 2): 431 | yue = int(c[0]) 432 | ri = int(c[1].split(' ')[0]) 433 | if (Year <= refer_to[0] and yue <= refer_to[1] and ri <= refer_to[2]): 434 | return True 435 | return False 436 | if (len(c) == 3): 437 | if (int(c[0]) < refer_to[0]): 438 | return True 439 | if (int(c[0]) <= refer_to[0] and int(c[1]) <= refer_to[1] and int(c[2]) <= refer_to[2]): 440 | return True 441 | return False 442 | 443 | 444 | 445 | 446 | --------------------------------------------------------------------------------