├── .env
├── .gitignore
├── .vscode
│   └── setting.json
├── README.md
├── docs
│   ├── index.html
│   └── index.md
├── old
│   ├── Makefile
│   ├── README.md
│   ├── getImg.py
│   ├── progress.py
│   ├── response.js
│   ├── settings.py
│   ├── spider.py
│   └── spider_test.py
└── src
    ├── download_img.py
    ├── emoji.py
    ├── files
    │   ├── imgEmoji.html
    │   ├── unicode.json
    │   ├── unicodeDesc.html
    │   └── unicodeEmoji.html
    ├── main.py
    ├── my.py
    └── util
        ├── Bmob.py
        ├── config.py
        ├── mysql_helper.py
        └── table.py
/.env:
--------------------------------------------------------------------------------
PYTHONPATH=./src
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
weibo
profile.txt
.idea
*.log
imgs
--------------------------------------------------------------------------------
/.vscode/setting.json:
--------------------------------------------------------------------------------
// Place your settings in this file to overwrite default and user settings.

{
    "files.exclude": {
        "**/.git": true,          // this is a default value
        "**/.DS_Store": true,     // this is a default value

        "**/node_modules": true,  // this excludes all folders
                                  // named "node_modules" from
                                  // the explorer tree

        // alternative version
        "node_modules": true,     // this excludes the folder
                                  // only from the root of
                                  // your workspace
        "weibo": true,
        "log": true
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Weibo API

Crawls weibo posts, comments, and emoji via breadth-first search over the following relation, and stores everything it finds in MySQL. The crawler is configured in `config.py`.



## Usage

1. Install MySQL. There is no need to create the tables yourself; the program does it automatically.
2. Fill in the configuration in [config.py](src/util/config.py).
3. Run `python src/main.py` to start crawling.

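After (or during) a run, you can sanity-check the crawl directly in MySQL. Below is a minimal sketch (not part of the project); it assumes the `seq2emoji` database and tables created by [mysql_helper.py](src/util/mysql_helper.py), and that `PYTHONPATH=./src` is set (see `.env`) so that `util.config` resolves.

```python
import mysql.connector

from util.config import config

# Connect with the same settings the crawler uses
db = mysql.connector.connect(**config['mysql']['CONNECTION'], database='seq2emoji')
cur = db.cursor()

# Users still queued vs. users already crawled
for table in ('Crawling', 'Crawled'):
    cur.execute(f'SELECT COUNT(*) FROM {table}')
    print(table, cur.fetchone()[0])

# A few stored posts with their image emoji (saved as a JSON array)
cur.execute('SELECT uid, mid, text, img_emoji FROM Weibo LIMIT 5')
for uid, mid, text, img_emoji in cur.fetchall():
    print(uid, mid, text[:30], img_emoji)
```
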
## References

1. [Requests sessions](https://2.python-requests.org//en/latest/user/advanced/#session-objects)
2. [Convert a cookie string into a dict in one line of code](https://foofish.net/extract_cookie.html)
3. [How to manually set cookies in a requests session](https://blog.csdn.net/mgxcool/article/details/52663382)
4. [Python + MySQL encoding issues](https://stackoverflow.com/a/20349552/8242705)
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
People
😄😆😊😃☺️😏😍😘😚😳😌😆😁😉😜😝😀😗😙😛😴😟😦😧😮😬😕😯😑😒😅😓😥😩😔😞😖😨😰😣😢😭😂😲😱😫😠😡😤😪😋😷😎😵👿😈😐😶😇👽💛💙💜❤️💚💔💓💗💕💞💘💖✨⭐️🌟💫💥💥💢❗️❓❕❔💤💨💦🎶🎵🔥💩💩💩👍👍👎👎👌👊👊✊✌️👋✋✋👐☝️👇👈👉🙌🙏👆👏💪🤘🖕🚶🏃🏃👫👪👬👭💃👯🙆🙅💁🙋👰🙎🙍🙇💑💆💇💅👦👧👩👨👶👵👴👱👲👳👷👮👼👸😺😸😻😽😼🙀😿😹😾👹👺🙈🙉🙊💂💀🐾👄💋💧👂👀👃👅💌👤👥💬💭
Nature
☀️☔️☁️❄️⛄️⚡️🌀🌁🌊🐱🐶🐭🐹🐰🐺🐸🐯🐨🐻🐷🐽🐮🐗🐵🐒🐴🐎🐫🐑🐘🐼🐍🐦🐤🐥🐣🐔🐧🐢🐛🐝🐜🐞🐌🐙🐠🐟🐳🐋🐬🐄🐏🐀🐃🐅🐇🐉🐐🐓🐕🐖🐁🐂🐲🐡🐊🐪🐆🐈🐩🐾💐🌸🌷🍀🌹🌻🌺🍁🍃🍂🌿🍄🌵🌴🌲🌳🌰🌱🌼🌾🐚🌐🌞🌝🌚🌑🌒🌓🌔🌕🌖🌗🌘🌜🌛🌔🌍🌎🌏🌋🌌⛅️
Objects
🎍💝🎎🎒🎓🎏🎆🎇🎐🎑🎃👻🎅🎄🎁🔔🔕🎋🎉🎊🎈🔮💿📀💾📷📹🎥💻📺📱☎️☎️📞📟📠💽📼🔉🔈🔇📢📣⌛️⏳⏰⌚️📻📡➿🔍🔎🔓🔒🔏🔐🔑💡🔦🔆🔅🔌🔋📲✉️📫📮🛀🛁🚿🚽🔧🔩🔨💺💰💴💵💷💶💳💸📧📥📤✉️📨📯📪📬📭📦🚪🚬💣🔫🔪💊💉📄📃📑📊📈📉📜📋📆📅📇📁📂✂️📌📎✒️✏️📏📐📕📗📘📙📓📔📒📚🔖📛🔬🔭📰🏈🏀⚽️⚾️🎾🎱🏉🎳⛳️🚵🚴🏇🏂🏊🏄🎿♠️♥️♣️♦️💎💍🏆🎼🎹🎻👾🎮🃏🎴🎲🎯🀄️🎬📝📝📖🎨🎤🎧🎺🎷🎸👞👡👠💄👢👕👕👔👚👗🎽👖👘👙🎀🎩👑👒👞🌂💼👜👝👛👓🎣☕️🍵🍶🍼🍺🍻🍸🍹🍷🍴🍕🍔🍟🍗🍖🍝🍛🍤🍱🍣🍥🍙🍘🍚🍜🍲🍢🍡🥚🍞🍩🍮🍦🍨🍧🎂🍰🍪🍫🍬🍭🍯🍎🍏🍊🍋🍒🍇🍉🍓🍑🍈🍌🍐🍍🍠🍆🍅🌽
Places
🏠🏡🏫🏢🏣🏥🏦🏪🏩🏨💒⛪️🏬🏤🌇🌆🏯🏰⛺️🏭🗼🗾🗻🌄🌅🌠🗽🌉🎠🌈🎡⛲️🎢🚢🚤⛵️⛵️🚣⚓️🚀✈️🚁🚂🚊🚞🚲🚡🚟🚠🚜🚙🚘🚗🚗🚕🚖🚛🚌🚍🚨🚓🚔🚒🚑🚐🚚🚋🚉🚆🚅🚄🚈🚝🚃🚎🎫⛽️🚦🚥⚠️🚧🔰🏧🎰🚏💈♨️🏁🎌🏮🗿🎪🎭📍🚩🇯🇵🇰🇷🇨🇳🇺🇸🇫🇷🇪🇸🇮🇹🇷🇺🇬🇧🇬🇧🇩🇪
Symbols
1️⃣2️⃣3️⃣4️⃣5️⃣6️⃣7️⃣8️⃣9️⃣🔟🔢0️⃣#️⃣🔣◀️⬇️▶️⬅️🔠🔡🔤↙️↘️➡️⬆️↖️↗️⏬⏫🔽⤵️⤴️↩️↪️↔️↕️🔼🔃🔄⏪⏩ℹ️🆗🔀🔁🔂🆕🔝🆙🆒🆓🆖🎦🈁📶🈹🈴🈺🈯️🈷️🈶🈵🈚️🈸🈳🈲🈂️🚻🚹🚺🚼🚭🅿️♿️🚇🛄🉑🚾🚰🚮㊙️㊗️Ⓜ️🛂🛅🛃🉐🆑🆘🆔🚫🔞📵🚯🚱🚳🚷🚸⛔️✳️❇️✴️💟🆚📳📴💹💱♈️♉️♊️♋️♌️♍️♎️♏️♐️♑️♒️♓️⛎🔯❎🅰️🅱️🆎🅾️💠♻️🔚🔙🔛🔜🕐🕜🕙🕥🕚🕦🕛🕧🕑🕝🕒🕞🕓🕟🕔🕠🕕🕡🕖🕢🕗🕣🕘🕤💲©️®️™️❌❗️‼️⁉️⭕️✖️➕➖➗💮💯✔️☑️🔘🔗➰〰️〽️🔱▪️▫️◾️◽️◼️◻️⬛️⬜️✅🔲🔳⚫️⚪️🔴🔵🔷🔶🔹🔸🔺🔻
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os, logging, time, signal, sys, requests, traceback, json
from util.mysql_helper import *
from collections import deque
from lxml import etree

# Used to look up a user's containerid
INFO_URL = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={}'
# Note that the weibo containerid differs from user to user
WEIBO_URL = 'https://m.weibo.cn/api/container/getIndex?containerid={}&page={}'
LONG_WEIBO_URL = 'https://m.weibo.cn/statuses/extend?id={}'
COMMENT_URL = 'https://m.weibo.cn/api/comments/show?id={}&page={}'
FOLLOWING_URL = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{}&page={}'


class WBSpider():

    def init_logging(self, name='crawling', log_level=logging.INFO):
        file_dir = os.path.dirname(os.path.realpath('__file__')) + "/log"
        # Create the log directory automatically if it does not exist
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        fileh = logging.FileHandler(file_dir + f'/{name}-{logging.getLevelName(log_level)}.log', 'w', encoding='utf-8')
        formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(message)s",
                                      "%Y-%m-%d %H:%M:%S")
        fileh.setFormatter(formatter)

        log = logging.getLogger()  # root logger
        for hdlr in log.handlers[:]:  # remove all old handlers
            log.removeHandler(hdlr)
        log.addHandler(fileh)  # set the new handler
        log.setLevel(log_level)

        return fileh

    def fetch_table(self, table='Crawling'):
        self.MYCURSOR.execute(f'SELECT * FROM {table}')
        columns = [col[0] for col in self.MYCURSOR.description]
        return [dict(zip(columns, row)) for row in self.MYCURSOR.fetchall()]

    def sel_from_table(self, table, key, value):
        self.MYCURSOR.execute(f"SELECT * FROM {table} WHERE {key} = '{value}'")
        columns = [col[0] for col in self.MYCURSOR.description]
        return [dict(zip(columns, row)) for row in self.MYCURSOR.fetchall()]

    def del_from_table(self, table, key, value):
        self.MYCURSOR.execute(f"DELETE FROM {table} WHERE {key} = '{value}'")
        self.MYDB.commit()

    def ins_to_table(self, table, data_dict):
        try:
            columns = ', '.join(data_dict.keys())
            placeholders = ', '.join(['%s'] * len(data_dict))
            sql = "INSERT INTO %s ( %s ) VALUES ( %s )" % (table, columns, placeholders)
            # Lists (e.g. img_emoji) are serialized to JSON strings before insertion
            for key in data_dict.keys():
                if isinstance(data_dict[key], list):
                    data_dict[key] = json.dumps(data_dict[key])
            self.MYCURSOR.execute(sql, list(data_dict.values()))
            self.MYDB.commit()
        except mysql.connector.errors.IntegrityError:
            # Skip duplicate inserts
            pass

    def init_crawl(self):
        # Queue of users waiting to be crawled (breadth-first search)
        self.crawling = deque(self.fetch_table())
        self.crawled = deque(self.fetch_table('Crawled'))

    def save_crawl_to_bmob(self):
        for crawling_item in self.crawling:
            crawling_item.save()
        for crawled_item in self.crawled:
            crawled_item.save()

    def init_session(self):
        self.session = requests.Session()
        cookies_dict = dict([l.split("=", 1) for l in config['weibo']['COOKIE'].split("; ")])
        # https://blog.csdn.net/mgxcool/article/details/52663382
        requests.utils.add_dict_to_cookiejar(self.session.cookies, cookies_dict)

    def init_mysql(self):
        create_db_if_not_exists()
        (self.MYDB, self.MYCURSOR) = create_table_if_not_exists()

    def __init__(self):
        self.init_logging()
        logging.info('Initializing the database...')
        self.init_mysql()
        logging.info('Initializing the crawl queue...')
        self.init_crawl()
        self.init_session()

    def get_data(self, url):
        # Wait a few seconds before every request to avoid being blocked for speed
        time.sleep(config['crawl']['PERIOD'])
        res = self.session.get(url).json()
        # The API answers with this message when we are rate limited
        if 'msg' in res and res['msg'] == '请求过于频繁,歇歇吧':
            logging.warning(f"Requests are too frequent, waiting {config['crawl']['FORBID_PAUSE']} seconds")
            time.sleep(config['crawl']['FORBID_PAUSE'])
            logging.warning('Done waiting, retrying the request')
            return self.get_data(url)
        return res

    def crawl_user_following(self, uid):
        """
        Return all of this user's followings as {'uid': ..., 'uname': ...} dicts.
        """
        try:
            result = []
            cur_page = 1
            while True:
                logging.info(f'Crawling page {cur_page} of the followings of user {uid}')
                # https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_1669879400&page=0
                url = FOLLOWING_URL.format(uid, cur_page)
                data = self.get_data(url)
                if len(data['data']['cards']) == 0:
                    logging.info(f'Finished crawling the followings of user {uid}')
                    logging.info(f'{len(result)} new followings will be added to the queue')
                    return result

                for card in data['data']['cards']:
                    for card_group_item in card['card_group']:
                        # Only card_type 10 entries are actual followings
                        if card_group_item['card_type'] != 10:
                            continue
                        result.append({'uid': card_group_item['user']['id'], 'uname': card_group_item['user']['screen_name']})
                cur_page += 1
        except:
            logging.error('Failed to crawl followings')
            logging.error(traceback.format_exc())
            return []

    def get_weibo_containerid(self, uid):
        data = None  # keep data defined for the error handler below
        try:
            # https://m.weibo.cn/api/container/getIndex?type=uid&value=1669879400
            url = INFO_URL.format(uid)
            data = self.get_data(url)
            return data['data']['tabsInfo']['tabs'][1]['containerid']
        except:
            logging.error('Failed to fetch containerid')
            logging.error(traceback.format_exc())
            logging.error(data)

    def crawl_user_weibo(self, uid):
        """
        Crawl all of this user's weibo posts and store them in the Weibo table.
        """
        data = None
        try:
            containerid = self.get_weibo_containerid(uid)
            cur_page = 1
            while True:
                logging.info(f'Crawling page {cur_page} of the weibo of user {uid}')
                # https://m.weibo.cn/api/container/getIndex?containerid=1076031669879400&page=0
                url = WEIBO_URL.format(containerid, cur_page)
                data = self.get_data(url)
                if len(data['data']['cards']) == 0:
                    logging.info(f'Finished crawling user {uid}')
                    return

                for card in data['data']['cards']:
                    # Ignore ads and other kinds of cards
                    if card["card_type"] != 9:
                        continue
                    mblog = card["mblog"]
                    # Skip retweets
                    if "retweeted_status" in mblog:
                        continue

                    selector = etree.HTML(mblog["text"])
                    a_text = selector.xpath("//a/text()")
                    # Convert the HTML to plain text
                    # See https://www.zybuluo.com/Alston/note/778377
                    text = etree.tostring(selector, method="text", encoding="UTF-8").decode('utf-8')
                    img_emoji = selector.xpath("//span/img/@alt")

                    weibo = {'uid': uid, 'text': text, 'mid': mblog['mid'], 'img_emoji': img_emoji}
                    self.ins_to_table('Weibo', weibo)

                    # Crawl this post's comments
                    self.crawl_weibo_comments(mblog['mid'])

                cur_page += 1
        except:
            logging.error('Failed to crawl weibo')
            logging.error(traceback.format_exc())
            logging.error(data)

    def crawl_weibo_comments(self, mid, max_pages=10):
        """
        Crawl up to max_pages (default 10) pages of comments of one weibo post and store
        them in the Comment table, with mid (the post's unique id) set to the given mid.
        """
        data = None
        try:
            cur_page = 1
            for _ in range(max_pages):
                logging.info(f'Crawling page {cur_page} of the comments of post {mid}')
                # https://m.weibo.cn/api/comments/show?id=4384122253963002&page=0
                url = COMMENT_URL.format(mid, cur_page)
                data = self.get_data(url)
                # '暂无数据' ("no data yet") means there are no more comment pages
                if data.get('msg') == '暂无数据':
                    break
                for comment in data['data']['data']:
                    selector = etree.HTML(comment["text"])
                    cid = comment["id"]
                    text = etree.tostring(selector, method="text", encoding="UTF-8").decode('utf-8')
                    img_emoji = selector.xpath("//span/img/@alt")

                    comment = {'cid': cid, 'mid': mid, 'text': text, 'img_emoji': img_emoji}
                    self.ins_to_table('Comment', comment)

                cur_page += 1
            logging.info(f'Finished crawling post {mid}')
        except:
            logging.error('Failed to crawl comments')
            logging.error(traceback.format_exc())
            logging.error(data)

    def crawl(self, uid):
        """
        Crawl the user identified by uid.
        When finished, return all of the user's followings as {'uid': ..., 'uname': ...} dicts.
        """
        self.crawl_user_weibo(uid)
        return self.crawl_user_following(uid)

    def startBFS(self):
        """
        Start crawling (breadth-first search).
        """
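        # The Crawling table/queue is the BFS frontier; the Crawled table is the
        # visited set, checked below before a following is enqueued. Both live in
        # MySQL, so an interrupted crawl resumes where it left off.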
        # In theory this loop terminates; in practice it never does
        while len(self.crawling) > 0:
            crawling_user = self.crawling.popleft()
            adj_arr = self.crawl(crawling_user['uid'])
            if adj_arr is None:
                logging.error('Abnormal termination')
                sys.exit(-1)
            self.del_from_table('Crawling', 'uid', crawling_user['uid'])
            logging.info(f"{crawling_user['uid']}-{crawling_user['uname']} removed from the Crawling queue and database")
            self.ins_to_table('Crawled', crawling_user)
            logging.info(f'{crawling_user["uid"]}-{crawling_user["uname"]} added to the Crawled queue and database')
            # Followings that have not been crawled yet
            for v in adj_arr:
                if len(self.sel_from_table('Crawled', 'uid', v['uid'])) == 0:
                    crawling_user_new = {'uid': v['uid'], 'uname': v['uname']}
                    self.ins_to_table('Crawling', crawling_user_new)
                    self.crawling.append(crawling_user_new)
                    logging.info(f"{v['uid']}-{v['uname']} added to the Crawling queue and database")


def signal_handler(sig, frame):
    print('You pressed Ctrl+C!')
    sys.exit(0)


if __name__ == "__main__":
    signal.signal(signal.SIGINT, signal_handler)

    spider = WBSpider()
    spider.startBFS()
--------------------------------------------------------------------------------
/src/my.py:
--------------------------------------------------------------------------------
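# One-off helper: create the seq2emoji database and its tables without starting a crawl.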
from util.mysql_helper import *

create_db_if_not_exists()
create_table_if_not_exists()
--------------------------------------------------------------------------------
/src/util/Bmob.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
'''
Created on 2015-07-02

@author: RobinTang

https://github.com/sintrb/Bmob-Py

'''
import json
import copy
import functools
import requests
from urllib import parse
from .config import *


def _urljoin(func):
    @functools.wraps(func)
    def _wrapper(self, resource_path, *args, **kwargs):
        url = self.apiurl + '/' + resource_path
        return func(self, url, *args, **kwargs)
    return _wrapper


def urlencode(params):
    if isinstance(params, dict):
        return parse.urlencode(params)
    elif isinstance(params, list):
        return parse.quote(''.join(params))
    else:
        return parse.quote(params)


class BmobSDK(object):
    '''
    BmobSDK is created with an Application ID and REST API Key. You can set up a shared default Application with the BmobSDK.setup() method.
    '''
    context = None

    def __init__(self, appid, restkey, apiurl='http://api.bmob.cn/1/classes'):
        super(BmobSDK, self).__init__()
        self.appid = appid
        self.restkey = restkey
        self.apiurl = apiurl
        self._http_headers = {
            "X-Bmob-Application-Id": self.appid,
            "X-Bmob-REST-API-Key": self.restkey,
            "Content-Type": "application/json"}
        # https://stackoverflow.com/questions/24873927/python-requests-module-and-connection-reuse
        # Reuse connections to avoid exceptions caused by too many open connections
        # https://2.python-requests.org//en/latest/user/advanced/#session-objects
        self.session = requests.Session()

    @_urljoin
    def get(self, url):
        return self.session.get(url, headers=self._http_headers)

    @_urljoin
    def post(self, url, **kwargs):
        return self.session.post(url, json=kwargs.get('data'), headers=self._http_headers)

    @_urljoin
    def put(self, url, **kwargs):
        return self.session.put(url, json=kwargs.get('data'), headers=self._http_headers)

    @_urljoin
    def delete(self, url, **kwargs):
        return self.session.delete(url, headers=self._http_headers)

    @staticmethod
    def setup(appid, restkey):
        BmobSDK.context = BmobSDK(appid, restkey)


class Query(object):
    '''
    Bmob Query
    '''
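    # Usage sketch (assuming BmobSDK.setup(appid, restkey) has been called and a
    # BmobModel subclass such as Weibo from util/table.py exists):
    #   q = Query(Weibo).w_eq('uid', '123').limit(10)
    #   for item in q:        # iterating triggers exec_query()
    #       print(item.text)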

    def __init__(self, clz, context=None):
        super(Query, self).__init__()
        if not context:
            context = BmobSDK.context
        if not context:
            raise BaseException("No BmobSDK context set up!")
        self.context = context
        self.clz = clz
        self.q = {}
        self.w = {}  # where
        self.items = None

    def copy(self):
        q = Query(self.clz, self.context)
        q.q = copy.deepcopy(self.q)
        q.w = copy.deepcopy(self.w)
        return q

    def get_urlencode(self):
        if self.w:
            self.q['where'] = json.dumps(self.w)
        elif 'where' in self.q:
            del self.q['where']
        return urlencode(self.q)

    def order(self, o):
        self.q['order'] = o
        return self.copy()

    def limit(self, l):
        self.q['limit'] = l
        return self.copy()

    def skip(self, s):
        self.q['skip'] = s
        return self.copy()

    def count(self):
        if self.items is not None:
            return len(self.items)
        else:
            self.limit(0)
            self.q['count'] = 1
            path = '/'.join([self.clz.__name__, '?' + self.get_urlencode()])
            return self.context.get(path).json()['count']

    def get_kw(self, k):
        if k in self.w:
            return self.w[k]
        else:
            self.w[k] = {}
            return self.w[k]

    def w_eq(self, k, v):
        '''equal'''
        self.w[k] = v
        return self.copy()

    def w_lt(self, k, v):
        '''less than'''
        self.get_kw(k)['$lt'] = v
        return self.copy()

    def w_lte(self, k, v):
        '''less than or equal'''
        self.get_kw(k)['$lte'] = v
        return self.copy()

    def w_gt(self, k, v):
        '''greater than'''
        self.get_kw(k)['$gt'] = v
        return self.copy()

    def w_gte(self, k, v):
        '''greater than or equal'''
        self.get_kw(k)['$gte'] = v
        return self.copy()

    def w_ne(self, k, v):
        '''not equal'''
        self.get_kw(k)['$ne'] = v
        return self.copy()

    def w_in(self, k, v):
        '''in'''
        self.get_kw(k)['$in'] = v
        return self.copy()

    def w_nin(self, k, v):
        '''not in'''
        self.get_kw(k)['$nin'] = v
        return self.copy()

    def w_exists(self, k, v):
        self.get_kw(k)['$exists'] = v
        return self.copy()

    def w_select(self, k, v):
        self.get_kw(k)['$select'] = v
        return self.copy()

    def w_dontSelect(self, k, v):
        self.get_kw(k)['$dontSelect'] = v
        return self.copy()

    def w_all(self, k, v):
        self.get_kw(k)['$all'] = v
        return self.copy()

    def w_regex(self, k, v):
        self.get_kw(k)['$regex'] = v
        return self.copy()

    def exec_query(self):
        rs = []
        path = '/'.join([self.clz.__name__, '?' + self.get_urlencode()])
        for r in self.context.get(path).json()['results']:
            rs.append(self.clz(**r))
        self.items = rs
        return self.items

    def first(self):
        q = self.copy()
        q.limit(1)
        rs = q.exec_query()
        return len(rs) and rs[0] or None

    def __getslice__(self, s, e):
        if self.items is None:
            self.exec_query()
        return self.items[s:e]

    def __iter__(self):
        if self.items is None:
            self.exec_query()
        return iter(self.items)

    def __getitem__(self, k):
        if self.items is None:
            self.exec_query()
        return self.items.__getitem__(k)

    def __len__(self):
        return self.count()


class BmobModel(object):
    '''
    Basic Bmob model; all other Bmob models must inherit from this class.
    '''

    def __init__(self, context=None, objectId=None, **kwargs):
        super(BmobModel, self).__init__()
        # check objectId
        if isinstance(context, str):
            objectId = context
            context = None

        if not context:
            context = BmobSDK.context
        if not context:
            raise BaseException("No BmobSDK context set up!")
        self.context = context
        self.objectId = objectId
        if self.objectId:
            # get object by id
            path = '/'.join([self.get_modelname(), self.objectId])
            for k, v in self.context.get(path).json().items():
                setattr(self, k, v)
        else:
            for k, v in kwargs.items():
                setattr(self, k, v)

    def get_attrs(self):
        return [k for k in type(self).__dict__ if not k.startswith('__')]

    def get_dict(self):
        ks = self.get_attrs()
        clz = type(self)
        tps = [type(v) for v in [1, 1, 1.0, '1', (1, 2), [1, 2], {'1': '1'}, {1, 2}]]
        return dict([(k, type(getattr(clz, k))(getattr(self, k)))
                     for k in ks if type(getattr(clz, k)) in tps])

    def get_modelname(self):
        return type(self).__name__

    def save(self):
        data = self.get_dict()
        if self.objectId:
            path = '/'.join([self.get_modelname(), self.objectId])
            for k, v in self.context.put(path, data=data).json().items():
                setattr(self, k, v)
        else:
            path = self.get_modelname()
            for k, v in self.context.post(path, data=data).json().items():
                setattr(self, k, v)

    def delete(self):
        if self.objectId:
            path = '/'.join([self.get_modelname(), self.objectId])
            res = self.context.delete(path).json()['msg'] == 'ok'
            if res:
                self.objectId = None
            return res
        else:
            return True

    def query(self):
        return Query(type(self))
--------------------------------------------------------------------------------
/src/util/config.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

config = {
    # Bmob settings
    'bmob': {
        'APP_ID': '366372322020724a39d8de5ccd61eeaa',
        'REST_API_KEY': '40de9f3e91287703e695fe1f6b94393a',
    },
    # Weibo settings
    'weibo': {
        # How to get the Cookie: go to m.weibo.cn, open the full text of a post with
        # many comments, and scroll down a few pages; the request headers shown in
        # Chrome's Network panel will then contain the Cookie.
        # Note: m.weibo.cn is special in that viewing posts does not require logging
        # in, but viewing comments does.
        # For example, open https://m.weibo.cn/detail/4389138709375153 directly and
        # scroll through a few comments; the Cookie appears in the request headers
        # under the Network panel's XHR tab.
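        # The cookie string below is consumed by init_session() in src/main.py, which
        # splits it into a dict and attaches it to the requests session:
        #   cookies_dict = dict(l.split('=', 1) for l in COOKIE.split('; '))
        #   requests.utils.add_dict_to_cookiejar(session.cookies, cookies_dict)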
        'COOKIE': 'ALF=1564734503; SCF=AuUY2ywPv1KKDsxqBgngDXYn7XTsKn_5p4iBblRihSxO8mUlZ5DB13iaxpPOY50QQzi_qq8HXRkR0NEl6MjJ-Ts.; SUB=_2A25wGOOGDeRhGeFP4lcU9SfJzD-IHXVT4o3OrDV6PUJbktBeLW_RkW1NQO_UfEy6P_rwgaJHDE-0R3sOldFws7cD; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5aERPgdESA6l4AaFTr3jGy5JpX5K-hUgL.FoMp1K-fSK.fS0e2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM7SKnRe0eRe0z0; SUHB=0laVFNbqkGjoCM; _T_WM=68656738488; WEIBOCN_FROM=1110106030; MLOGIN=1; XSRF-TOKEN=aa01d1; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D1076031669879400%26uicode%3D20000061%26fid%3D4389138709375153%26oid%3D4389138709375153'
    },
    'mysql': {
        'CONNECTION': {
            'host': "localhost",
            'user': 'upupming',
            'charset': 'utf8mb4'
        }
    },
    'crawl': {
        # Used to seed the crawl queue
        'START_USER': '2803301701',
        # Wait PERIOD seconds between any two requests
        'PERIOD': 4,
        # After getting blocked, wait 5 minutes before requesting again
        'FORBID_PAUSE': 300
    }
}
--------------------------------------------------------------------------------
/src/util/mysql_helper.py:
--------------------------------------------------------------------------------
import mysql.connector
from .config import *

MYDB = mysql.connector.connect(**config['mysql']['CONNECTION'])
MYCURSOR = MYDB.cursor()


def create_db_if_not_exists():
    MYCURSOR.execute('create database if not exists seq2emoji')


def create_table_if_not_exists():
    """
    Return (MYDB, MYCURSOR) connected to the database that holds the tables.
    """
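    # Table overview:
    #   Crawling - BFS frontier: users discovered but not yet crawled
    #   Crawled  - users whose posts and comments have been stored
    #   Weibo    - one row per original (non-retweet) post
    #   Comment  - comments, keyed by cid and linked to a post via mid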
    global MYDB, MYCURSOR
    MYDB = mysql.connector.connect(**config['mysql']['CONNECTION'], database='seq2emoji')
    MYCURSOR = MYDB.cursor()

    # Crawling
    MYCURSOR.execute("""
        create table if not exists Crawling (
            uid varchar(255) not null unique,
            uname varchar(255) CHARACTER SET utf8mb4 collate utf8mb4_unicode_520_ci,
            primary key (uid)
        )
    """)
    MYCURSOR.execute("SELECT * FROM Crawling")
    myresult = MYCURSOR.fetchall()
    # If no users are queued for crawling, seed the queue with the start user
    if len(myresult) == 0:
        MYCURSOR.execute('insert into Crawling (uid, uname) values (%s, %s)', (config['crawl']['START_USER'], None))
        MYDB.commit()
    # Crawled
    MYCURSOR.execute("""
        create table if not exists Crawled (
            uid varchar(255) not null unique,
            uname varchar(255) CHARACTER SET utf8mb4 collate utf8mb4_unicode_520_ci,
            primary key (uid)
        )
    """)
    # Weibo
    MYCURSOR.execute("""
        create table if not exists Weibo (
            uid varchar(255),
            mid varchar(255) not null unique,
            text text CHARACTER SET utf8mb4 collate utf8mb4_unicode_520_ci,
            img_emoji json,
            primary key (mid)
        )
    """)
    # Comment
    MYCURSOR.execute("""
        create table if not exists Comment (
            mid varchar(255) not null,
            cid varchar(255) not null unique,
            text text CHARACTER SET utf8mb4 collate utf8mb4_unicode_520_ci,
            img_emoji json,
            primary key (cid)
        )
    """)

    return (MYDB, MYCURSOR)
--------------------------------------------------------------------------------
/src/util/table.py:
--------------------------------------------------------------------------------
from .Bmob import BmobSDK, BmobModel

class Emoji(BmobModel):
    desc = ''
    content = ''

class Crawling(BmobModel):
    uid = 0
    uname = ''

class Crawled(BmobModel):
    uid = 0
    uname = ''

class Weibo(BmobModel):
    mid = ''
    text = ''
    img_emoji = []

class Comment(BmobModel):
    cid = ''
    mid = ''
    text = ''
    img_emoji = []
--------------------------------------------------------------------------------