├── .github ├── cookie.png └── weibospider.png ├── requirements.txt ├── weibospider ├── spiders │ ├── __init__.py │ ├── tweet_by_tweet_id.py │ ├── repost.py │ ├── follower.py │ ├── fan.py │ ├── user.py │ ├── comment.py │ ├── tweet_by_user_id.py │ ├── tweet_by_keyword.py │ └── common.py ├── cookie.txt ├── middlewares.py ├── settings.py ├── pipelines.py └── run_spider.py ├── LICENSE └── README.md /.github/cookie.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nghuyong/WeiboSpider/HEAD/.github/cookie.png -------------------------------------------------------------------------------- /.github/weibospider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nghuyong/WeiboSpider/HEAD/.github/weibospider.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==2.5.1 2 | python_dateutil 3 | cryptography==36.0.2 4 | pyOpenSSL==22.0.0 5 | Twisted==22.10.0 6 | -------------------------------------------------------------------------------- /weibospider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /weibospider/cookie.txt: -------------------------------------------------------------------------------- 1 | OUTFOX_SEARCH_USER_ID_NCOO=1892334049.0611136; _T_WM=5d0d245ef51ec62db21a50916b1e384b; SCF=AkceVOKbLcdJ3i5GuWKRwRkI-sTF8ozuM9kLPqvaNmzIbMz84C_D9mrcfkUDZ5USvQLxxcNSHMeGPwtuCmRzkQA.; SUB=_2A25OUHI9DeRhGeBN6VUX9SvEzT-IHXVtux51rDV6PUJbktAKLVCmkW1NRJ24IFM3w0MsIksejOBZJEcAgJZZobMv; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWYACoMUZFHDoS6U9MYf.vu5NHD95Qce0zNSo-f1hq0Ws4DqcjzdJUQUPLadJMt; SSOLoginState=1666450029 -------------------------------------------------------------------------------- /weibospider/middlewares.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class IPProxyMiddleware(object): 5 | """ 6 | 代理IP中间件 7 | """ 8 | 9 | @staticmethod 10 | def fetch_proxy(): 11 | """ 12 | 获取一个代理IP 13 | """ 14 | # You need to rewrite this function if you want to add proxy pool 15 | # the function should return an ip in the format of "ip:port" like "12.34.1.4:9090" 16 | return None 17 | 18 | def process_request(self, request, spider): 19 | """ 20 | 将代理IP添加到request请求中 21 | """ 22 | proxy_data = self.fetch_proxy() 23 | if proxy_data: 24 | current_proxy = f'http://{proxy_data}' 25 | spider.logger.debug(f"current proxy:{current_proxy}") 26 | request.meta['proxy'] = current_proxy 27 | -------------------------------------------------------------------------------- /weibospider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'spider' 4 | 5 | SPIDER_MODULES = ['spiders'] 6 | NEWSPIDER_MODULE = 'spiders' 7 | 8 | ROBOTSTXT_OBEY = False 9 | 10 | with open('cookie.txt', 'rt', encoding='utf-8') as f: 11 | cookie = f.read().strip() 12 | DEFAULT_REQUEST_HEADERS = { 13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0', 14 | 'Cookie': 
cookie 15 | } 16 | 17 | CONCURRENT_REQUESTS = 16 18 | 19 | DOWNLOAD_DELAY = 1 20 | 21 | DOWNLOADER_MIDDLEWARES = { 22 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None, 23 | 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None, 24 | 'middlewares.IPProxyMiddleware': 100, 25 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 101, 26 | } 27 | 28 | ITEM_PIPELINES = { 29 | 'pipelines.JsonWriterPipeline': 300, 30 | } 31 | -------------------------------------------------------------------------------- /weibospider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import json 4 | import os.path 5 | import time 6 | 7 | 8 | class JsonWriterPipeline(object): 9 | """ 10 | 写入json文件的pipline 11 | """ 12 | 13 | def __init__(self): 14 | self.file = None 15 | if not os.path.exists('../output'): 16 | os.mkdir('../output') 17 | 18 | def process_item(self, item, spider): 19 | """ 20 | 处理item 21 | """ 22 | if not self.file: 23 | now = datetime.datetime.now() 24 | file_name = spider.name + "_" + now.strftime("%Y%m%d%H%M%S") + '.jsonl' 25 | self.file = open(f'../output/{file_name}', 'wt', encoding='utf-8') 26 | item['crawl_time'] = int(time.time()) 27 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 28 | self.file.write(line) 29 | self.file.flush() 30 | return item 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 HuYong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /weibospider/spiders/tweet_by_tweet_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_tweet_info, parse_long_tweet 12 | 13 | 14 | class TweetSpiderByTweetID(Spider): 15 | """ 16 | 用户推文ID采集推文 17 | """ 18 | name = "tweet_spider_by_tweet_id" 19 | base_url = "https://weibo.cn" 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | tweet_ids = ['LqlZNhJFm'] 27 | for tweet_id in tweet_ids: 28 | url = f"https://weibo.com/ajax/statuses/show?id={tweet_id}" 29 | yield Request(url, callback=self.parse) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | item = parse_tweet_info(data) 37 | if item['isLongText']: 38 | url = "https://weibo.com/ajax/statuses/longtext?id=" + item['mblogid'] 39 | yield Request(url, callback=parse_long_tweet, meta={'item': item}) 40 | yield item 41 | -------------------------------------------------------------------------------- /weibospider/run_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2019-12-07 21:27 7 | """ 8 | import os 9 | import sys 10 | from scrapy.crawler import CrawlerProcess 11 | from scrapy.utils.project import get_project_settings 12 | from spiders.tweet_by_user_id import TweetSpiderByUserID 13 | from spiders.tweet_by_keyword import TweetSpiderByKeyword 14 | from spiders.tweet_by_tweet_id import TweetSpiderByTweetID 15 | from spiders.comment import CommentSpider 16 | from spiders.follower import FollowerSpider 17 | from spiders.user import UserSpider 18 | from spiders.fan import FanSpider 19 | from spiders.repost import RepostSpider 20 | 21 | if __name__ == '__main__': 22 | mode = sys.argv[1] 23 | os.environ['SCRAPY_SETTINGS_MODULE'] = 'settings' 24 | settings = get_project_settings() 25 | process = CrawlerProcess(settings) 26 | mode_to_spider = { 27 | 'comment': CommentSpider, 28 | 'fan': FanSpider, 29 | 'follow': FollowerSpider, 30 | 'user': UserSpider, 31 | 'repost': RepostSpider, 32 | 'tweet_by_tweet_id': TweetSpiderByTweetID, 33 | 'tweet_by_user_id': TweetSpiderByUserID, 34 | 'tweet_by_keyword': TweetSpiderByKeyword, 35 | } 36 | process.crawl(mode_to_spider[mode]) 37 | # the script will block here until the crawling is finished 38 | process.start() 39 | -------------------------------------------------------------------------------- /weibospider/spiders/repost.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_tweet_info, url_to_mid 12 | 13 | 14 | class RepostSpider(Spider): 15 | """ 16 | 微博转发数据采集 17 | """ 18 | name = "repost" 19 | 20 | def start_requests(self): 21 | """ 22 | 爬虫入口 23 | """ 24 | # 这里tweet_ids可替换成实际待采集的数据 25 | tweet_ids = ['Mb15BDYR0'] 26 | for tweet_id in tweet_ids: 27 | mid = url_to_mid(tweet_id) 28 | 
url = f"https://weibo.com/ajax/statuses/repostTimeline?id={mid}&page=1&moduleID=feed&count=10" 29 | yield Request(url, callback=self.parse, meta={'page_num': 1, 'mid': mid}) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | for tweet in data['data']: 37 | item = parse_tweet_info(tweet) 38 | yield item 39 | if data['data']: 40 | mid, page_num = response.meta['mid'], response.meta['page_num'] 41 | page_num += 1 42 | url = f"https://weibo.com/ajax/statuses/repostTimeline?id={mid}&page={page_num}&moduleID=feed&count=10" 43 | yield Request(url, callback=self.parse, meta={'page_num': page_num, 'mid': mid}) 44 | -------------------------------------------------------------------------------- /weibospider/spiders/follower.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.comment import parse_user_info 12 | 13 | 14 | class FollowerSpider(Spider): 15 | """ 16 | 微博关注数据采集 17 | """ 18 | name = "follower" 19 | base_url = 'https://weibo.com/ajax/friendships/friends' 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | user_ids = ['1087770692'] 27 | for user_id in user_ids: 28 | url = self.base_url + f"?page=1&uid={user_id}" 29 | yield Request(url, callback=self.parse, meta={'user': user_id, 'page_num': 1}) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | for user in data['users']: 37 | item = dict() 38 | item['fan_id'] = response.meta['user'] 39 | item['follower_info'] = parse_user_info(user) 40 | item['_id'] = response.meta['user'] + '_' + item['follower_info']['_id'] 41 | yield item 42 | if data['users']: 43 | response.meta['page_num'] += 1 44 | url = self.base_url + f"?page={response.meta['page_num']}&uid={response.meta['user']}" 45 | yield Request(url, callback=self.parse, meta=response.meta) 46 | -------------------------------------------------------------------------------- /weibospider/spiders/fan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.comment import parse_user_info 12 | 13 | 14 | class FanSpider(Spider): 15 | """ 16 | 微博粉丝数据采集 17 | """ 18 | name = "fan" 19 | base_url = 'https://weibo.com/ajax/friendships/friends' 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | user_ids = ['1087770692'] 27 | for user_id in user_ids: 28 | url = self.base_url + f"?relate=fans&page=1&uid={user_id}&type=fans" 29 | yield Request(url, callback=self.parse, meta={'user': user_id, 'page_num': 1}) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | for user in data['users']: 37 | item = dict() 38 | item['follower_id'] = response.meta['user'] 39 | item['fan_info'] = parse_user_info(user) 40 | item['_id'] = response.meta['user'] + '_' + item['fan_info']['_id'] 41 | yield item 42 | if data['users']: 43 | response.meta['page_num'] += 1 44 | url = self.base_url + 
f"?relate=fans&page={response.meta['page_num']}&uid={response.meta['user']}&type=fans" 45 | yield Request(url, callback=self.parse, meta=response.meta) 46 | -------------------------------------------------------------------------------- /weibospider/spiders/user.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_user_info 12 | 13 | 14 | class UserSpider(Spider): 15 | """ 16 | 微博用户信息爬虫 17 | """ 18 | name = "user_spider" 19 | base_url = "https://weibo.cn" 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | user_ids = ['1749127163'] 27 | urls = [f'https://weibo.com/ajax/profile/info?uid={user_id}' for user_id in user_ids] 28 | for url in urls: 29 | yield Request(url, callback=self.parse) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | item = parse_user_info(data['data']['user']) 37 | url = f"https://weibo.com/ajax/profile/detail?uid={item['_id']}" 38 | yield Request(url, callback=self.parse_detail, meta={'item': item}) 39 | 40 | @staticmethod 41 | def parse_detail(response): 42 | """ 43 | 解析详细数据 44 | """ 45 | item = response.meta['item'] 46 | data = json.loads(response.text)['data'] 47 | item['birthday'] = data.get('birthday', '') 48 | if 'created_at' not in item: 49 | item['created_at'] = data.get('created_at', '') 50 | item['desc_text'] = data.get('desc_text', '') 51 | item['ip_location'] = data.get('ip_location', '') 52 | item['sunshine_credit'] = data.get('sunshine_credit', {}).get('level', '') 53 | item['label_desc'] = [label['name'] for label in data.get('label_desc', [])] 54 | if 'company' in data: 55 | item['company'] = data['company'] 56 | if 'education' in data: 57 | item['education'] = data['education'] 58 | yield item 59 | -------------------------------------------------------------------------------- /weibospider/spiders/comment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_user_info, parse_time, url_to_mid 12 | 13 | 14 | class CommentSpider(Spider): 15 | """ 16 | 微博评论数据采集 17 | """ 18 | name = "comment" 19 | 20 | def start_requests(self): 21 | """ 22 | 爬虫入口 23 | """ 24 | # 这里tweet_ids可替换成实际待采集的数据 25 | tweet_ids = ['Mb15BDYR0'] 26 | for tweet_id in tweet_ids: 27 | mid = url_to_mid(tweet_id) 28 | url = f"https://weibo.com/ajax/statuses/buildComments?" 
\ 29 | f"is_reload=1&id={mid}&is_show_bulletin=2&is_mix=0&count=20" 30 | yield Request(url, callback=self.parse, meta={'source_url': url}) 31 | 32 | def parse(self, response, **kwargs): 33 | """ 34 | 网页解析 35 | """ 36 | data = json.loads(response.text) 37 | for comment_info in data['data']: 38 | item = self.parse_comment(comment_info) 39 | yield item 40 | # 解析二级评论 41 | if 'more_info' in comment_info: 42 | url = f"https://weibo.com/ajax/statuses/buildComments?is_reload=1&id={comment_info['id']}" \ 43 | f"&is_show_bulletin=2&is_mix=1&fetch_level=1&max_id=0&count=100" 44 | yield Request(url, callback=self.parse, priority=20) 45 | if data.get('max_id', 0) != 0 and 'fetch_level=1' not in response.url: 46 | url = response.meta['source_url'] + '&max_id=' + str(data['max_id']) 47 | yield Request(url, callback=self.parse, meta=response.meta) 48 | 49 | @staticmethod 50 | def parse_comment(data): 51 | """ 52 | 解析comment 53 | """ 54 | item = dict() 55 | item['created_at'] = parse_time(data['created_at']) 56 | item['_id'] = data['id'] 57 | item['like_counts'] = data['like_counts'] 58 | item['ip_location'] = data.get('source', '') 59 | item['content'] = data['text_raw'] 60 | item['comment_user'] = parse_user_info(data['user']) 61 | if 'reply_comment' in data: 62 | item['reply_comment'] = { 63 | '_id': data['reply_comment']['id'], 64 | 'text': data['reply_comment']['text'], 65 | 'user': parse_user_info(data['reply_comment']['user']), 66 | } 67 | return item 68 | -------------------------------------------------------------------------------- /weibospider/spiders/tweet_by_user_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import datetime 9 | import json 10 | import re 11 | 12 | from scrapy import Spider 13 | from scrapy.http import Request 14 | from spiders.common import parse_tweet_info, parse_long_tweet 15 | 16 | 17 | class TweetSpiderByUserID(Spider): 18 | """ 19 | 用户推文数据采集 20 | """ 21 | name = "tweet_spider_by_user_id" 22 | 23 | def start_requests(self): 24 | """ 25 | 爬虫入口 26 | """ 27 | # 这里user_ids可替换成实际待采集的数据 28 | user_ids = ['1087770692'] 29 | # 这里的时间替换成实际需要的时间段,如果要采集用户全部推文 is_crawl_specific_time_span 设置为False 30 | is_crawl_specific_time_span = True 31 | start_time = datetime.datetime(year=2022, month=1, day=1) 32 | end_time = datetime.datetime(year=2023, month=1, day=1) 33 | for user_id in user_ids: 34 | url = f"https://weibo.com/ajax/statuses/searchProfile?uid={user_id}&page=1&hasori=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1&hasret=1" 35 | if not is_crawl_specific_time_span: 36 | yield Request(url, callback=self.parse, meta={'user_id': user_id, 'page_num': 1}) 37 | else: 38 | # 切分成10天进行 39 | tmp_start_time = start_time 40 | while tmp_start_time <= end_time: 41 | tmp_end_time = tmp_start_time + datetime.timedelta(days=10) 42 | tmp_end_time = min(tmp_end_time, end_time) 43 | tmp_url = url + f'&starttime={int(tmp_start_time.timestamp())}&endtime={int(tmp_end_time.timestamp())}' 44 | yield Request(tmp_url, callback=self.parse, meta={'user_id': user_id, 'page_num': 1}) 45 | tmp_start_time = tmp_end_time + datetime.timedelta(days=1) 46 | 47 | def parse(self, response, **kwargs): 48 | """ 49 | 网页解析 50 | """ 51 | data = json.loads(response.text) 52 | tweets = data['data']['list'] 53 | for tweet in tweets: 54 | item = parse_tweet_info(tweet) 55 | del item['user'] 56 | if item['isLongText']: 57 | url = 
"https://weibo.com/ajax/statuses/longtext?id=" + item['mblogid'] 58 | yield Request(url, callback=parse_long_tweet, meta={'item': item}) 59 | else: 60 | yield item 61 | if tweets: 62 | user_id, page_num = response.meta['user_id'], response.meta['page_num'] 63 | url = response.url.replace(f'page={page_num}', f'page={page_num + 1}') 64 | yield Request(url, callback=self.parse, meta={'user_id': user_id, 'page_num': page_num + 1}) 65 | -------------------------------------------------------------------------------- /weibospider/spiders/tweet_by_keyword.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: rightyonghu 5 | Created Time: 2022/10/22 6 | """ 7 | import datetime 8 | import json 9 | import re 10 | from scrapy import Spider, Request 11 | from spiders.common import parse_tweet_info, parse_long_tweet 12 | 13 | 14 | class TweetSpiderByKeyword(Spider): 15 | """ 16 | 关键词搜索采集 17 | """ 18 | name = "tweet_spider_by_keyword" 19 | base_url = "https://s.weibo.com/" 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里keywords可替换成实际待采集的数据 26 | keywords = ['丽江'] 27 | # 这里的时间可替换成实际需要的时间段 28 | start_time = datetime.datetime(year=2022, month=10, day=1, hour=0) 29 | end_time = datetime.datetime(year=2022, month=10, day=7, hour=23) 30 | # 是否按照小时进行切分,数据量更大; 对于非热门关键词**不需要**按照小时切分 31 | is_split_by_hour = True 32 | for keyword in keywords: 33 | if not is_split_by_hour: 34 | _start_time = start_time.strftime("%Y-%m-%d-%H") 35 | _end_time = end_time.strftime("%Y-%m-%d-%H") 36 | url = f"https://s.weibo.com/weibo?q={keyword}×cope=custom%3A{_start_time}%3A{_end_time}&page=1" 37 | yield Request(url, callback=self.parse, meta={'keyword': keyword}) 38 | else: 39 | time_cur = start_time 40 | while time_cur < end_time: 41 | _start_time = time_cur.strftime("%Y-%m-%d-%H") 42 | _end_time = (time_cur + datetime.timedelta(hours=1)).strftime("%Y-%m-%d-%H") 43 | url = f"https://s.weibo.com/weibo?q={keyword}×cope=custom%3A{_start_time}%3A{_end_time}&page=1" 44 | yield Request(url, callback=self.parse, meta={'keyword': keyword}) 45 | time_cur = time_cur + datetime.timedelta(hours=1) 46 | 47 | def parse(self, response, **kwargs): 48 | """ 49 | 网页解析 50 | """ 51 | html = response.text 52 | if '
抱歉,未找到相关结果。
' in html: 53 | self.logger.info(f'no search result. url: {response.url}') 54 | return 55 | tweets_infos = re.findall('
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
<img src=".github/weibospider.png" alt="WeiboSpider"/>

# WeiboSpider
A continuously maintained Sina Weibo crawler 🚀🚀🚀
## Features

- Built on the new weibo.com APIs, which expose the richest set of fields
- Multiple crawl modes: user profiles, tweets, fans, followings, reposts, comments, and keyword search
- The core code is only about 100 lines, easy to read and quick to customize for your own needs

## Quick start

### Clone && install

```bash
git clone https://github.com/nghuyong/WeiboSpider.git --depth 1
cd WeiboSpider
pip install -r requirements.txt
```

### Replace the cookie

Visit [https://weibo.com/](https://weibo.com/), log in to your account, open the browser's developer tools, and refresh the page.



In the Network panel, select the `weibo.com` request and copy its Cookie value. Edit `weibospider/cookie.txt` and replace its content with the cookie you just copied.
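Before launching a full crawl, you can sanity-check the cookie against the same profile endpoint that `spiders/user.py` requests. The snippet below is only a sketch: it assumes the `requests` package (not listed in `requirements.txt`) and reuses the example uid that appears in `spiders/user.py` and in the sample output further down.

```python
# Sketch: verify that the cookie in weibospider/cookie.txt is accepted by weibo.com.
# Assumes `pip install requests`; run from the repository root.
import requests

with open('weibospider/cookie.txt', 'rt', encoding='utf-8') as f:
    cookie = f.read().strip()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Cookie': cookie,
}
# The same endpoint spiders/user.py starts from; 1749127163 is the example uid used there.
resp = requests.get('https://weibo.com/ajax/profile/info?uid=1749127163',
                    headers=headers, allow_redirects=False)
print(resp.status_code)                       # 200 means the request was not bounced to login
print('user' in resp.json().get('data', {}))  # True if profile data came back
```

If the cookie has expired you will typically get a redirect or an error payload instead of the profile JSON; repeat the steps above to grab a fresh cookie.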
### Add proxy IPs (optional)

Override the [fetch_proxy](./weibospider/middlewares.py#L6) method; it has to return a proxy IP. See [this comment](https://github.com/nghuyong/WeiboSpider/issues/124#issuecomment-654335439) for a reference implementation.

Recommended proxy: Swiftproxy ([link](https://www.swiftproxy.net/?ref=hy)). **Sign up to get 500 MB of free test traffic; the discount code "GHB5" gives an extra 10% off.**
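As a reference, here is a minimal sketch of a rewritten `fetch_proxy`. The pool endpoint (`http://127.0.0.1:5010/get`) is a placeholder for whatever proxy service you actually use; the only contract is that the method returns an address in the `ip:port` form (e.g. `12.34.1.4:9090`) or `None` to send the request without a proxy.

```python
# weibospider/middlewares.py -- sketch of a rewritten IPProxyMiddleware.fetch_proxy.
# The pool URL below is hypothetical; plug in your own provider.
import requests


class IPProxyMiddleware(object):

    @staticmethod
    def fetch_proxy():
        try:
            resp = requests.get('http://127.0.0.1:5010/get', timeout=3)
            proxy = resp.text.strip()
            return proxy or None  # None -> the request goes out without a proxy
        except requests.RequestException:
            return None

    def process_request(self, request, spider):
        # unchanged from the repository version
        proxy_data = self.fetch_proxy()
        if proxy_data:
            current_proxy = f'http://{proxy_data}'
            spider.logger.debug(f"current proxy:{current_proxy}")
            request.meta['proxy'] = current_proxy
```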
## Running the spiders

Rewrite the `start_requests` method of the spiders in `./weibospider/spiders/*` according to what you actually want to crawl.

Crawled data is written to the `output` directory, in files named `{spider.name}_{datetime}.jsonl`.
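For example, to crawl the profiles of user IDs kept in a text file instead of the hard-coded list, the `start_requests` method in `spiders/user.py` could be rewritten along these lines (a sketch; `user_ids.txt`, one uid per line, is an assumed input file, and the rest of `UserSpider` stays unchanged):

```python
# spiders/user.py -- sketch of a customized UserSpider.start_requests.
# `Request` is already imported at the top of that module.
def start_requests(self):
    """Read target uids from a file instead of hard-coding them."""
    with open('user_ids.txt', 'rt', encoding='utf-8') as f:
        user_ids = [line.strip() for line in f if line.strip()]
    for user_id in user_ids:
        url = f'https://weibo.com/ajax/profile/info?uid={user_id}'
        yield Request(url, callback=self.parse)
```

Each line of an output file is one JSON object, so the results can be loaded with a plain loop over `json.loads` or with `pandas.read_json(path, lines=True)`.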
### User profiles

```bash
cd weibospider
python run_spider.py user
```

```json
{
  "crawl_time": 1666863485,
  "_id": "1749127163",
  "avatar_hd": "https://tvax4.sinaimg.cn/crop.0.0.1080.1080.1024/001Un9Srly8h3fpj11yjyj60u00u0q7f02.jpg?KID=imgbed,tva&Expires=1666874283&ssig=a%2FMfgFzvRo",
  "nick_name": "雷军",
  "verified": true,
  "description": "小米董事长,金山软件董事长。业余爱好是天使投资。",
  "followers_count": 22756103,
  "friends_count": 1373,
  "statuses_count": 14923,
  "gender": "m",
  "location": "北京 海淀区",
  "mbrank": 7,
  "mbtype": 12,
  "verified_type": 0,
  "verified_reason": "小米创办人,董事长兼CEO;金山软件董事长;天使投资人。",
  "birthday": "",
  "created_at": "2010-05-31 23:07:59",
  "desc_text": "小米创办人,董事长兼CEO;金山软件董事长;天使投资人。",
  "ip_location": "IP属地:北京",
  "sunshine_credit": "信用极好",
  "label_desc": [
    "V指数 财经 75.30分",
    "热门财经博主 数据飙升",
    "昨日发博3,阅读数100万+,互动数1.9万",
    "视频累计播放量9819.3万",
    "群友 3132"
  ],
  "company": "金山软件",
  "education": {
    "school": "武汉大学"
  }
}
```

### A user's fans

```bash
python run_spider.py fan
```

```json
{
  "crawl_time": 1666863563,
  "_id": "1087770692_5968044974",
  "follower_id": "1087770692",
  "fan_info": {
    "_id": "5968044974",
    "avatar_hd": "https://tvax1.sinaimg.cn/default/images/default_avatar_male_180.gif?KID=imgbed,tva&Expires=1666874363&ssig=UuzaeK437R",
    "nick_name": "用户5968044974",
    "verified": false,
    "description": "",
    "followers_count": 0,
    "friends_count": 195,
    "statuses_count": 9,
    "gender": "m",
    "location": "其他",
    "mbrank": 0,
    "mbtype": 0,
    "credit_score": 80,
    "created_at": "2016-06-25 22:30:13"
  }
}
...
```

### A user's followings

```bash
python run_spider.py follow
```

```json
{
  "crawl_time": 1666863679,
  "_id": "1087770692_7083568088",
  "fan_id": "1087770692",
  "follower_info": {
    "_id": "7083568088",
    "avatar_hd": "https://tvax4.sinaimg.cn/crop.0.0.1080.1080.1024/007JnVEcly8gyqd9jadjlj30u00u0gpn.jpg?KID=imgbed,tva&Expires=1666874479&ssig=9zhfeMPLzr",
    "nick_name": "蒋昀霖",
    "verified": true,
    "description": "工作请联系:lijialun@kpictures.cn",
    "followers_count": 329216,
    "friends_count": 58,
    "statuses_count": 342,
    "gender": "m",
    "location": "北京",
    "mbrank": 6,
    "mbtype": 12,
    "credit_score": 80,
    "created_at": "2019-04-17 16:25:43",
    "verified_type": 0,
    "verified_reason": "东申未来 演员"
  }
}
...
```

### Tweet comments

```bash
python run_spider.py comment
```

```json
{
  "crawl_time": 1666863805,
  "_id": 4826279188108038,
  "created_at": "2022-10-19 13:41:29",
  "like_counts": 1,
  "ip_location": "来自河南",
  "content": "五周年快乐呀,请坤哥哥继续保持这份热爱,奔赴下一场山海",
  "comment_user": {
    "_id": "2380967841",
    "avatar_hd": "https://tvax4.sinaimg.cn/crop.0.0.888.888.1024/002B8iv7ly8gv647ipgxvj60oo0oojtk02.jpg?KID=imgbed,tva&Expires=1666874604&ssig=%2FdGaaIRkhf",
    "nick_name": "流年执念的二瓜娇",
    "verified": false,
    "description": "蓝桉已遇释怀鸟,不爱万物唯爱你。",
    "followers_count": 238,
    "friends_count": 1655,
    "statuses_count": 12546,
    "gender": "f",
    "location": "河南",
    "mbrank": 6,
    "mbtype": 11
  }
}
...
```

### Tweet reposts

```bash
python run_spider.py repost
```

```json
{
  "_id": "4826312651310475",
  "mblogid": "Mb2vL5uUH",
  "created_at": "2022-10-19 15:54:27",
  "geo": null,
  "ip_location": "发布于 德国",
  "reposts_count": 0,
  "comments_count": 0,
  "attitudes_count": 0,
  "source": "iPhone客户端",
  "content": "共享[鼓掌][太开心][鼓掌]五周年快乐!//@陈坤:#山下学堂五周年# 五年, 感谢同行。",
  "pic_urls": [],
  "pic_num": 0,
  "user": {
    "_id": "2717869081",
    "avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.160.160.1024/a1ff6419ly8gz1xoq9oolj204g04g745.jpg?KID=imgbed,tva&Expires=1666876939&ssig=Cl93CLjdB%2F",
    "nick_name": "YuFeeC",
    "verified": false,
    "mbrank": 0,
    "mbtype": 0
  },
  "url": "https://weibo.com/2717869081/Mb2vL5uUH",
  "crawl_time": 1666866139
}
...
```
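Both the `comment` and `repost` spiders take an alphanumeric `mblogid` such as `Mb15BDYR0` and convert it to the numeric `mid` expected by the ajax API, using `url_to_mid` from `spiders/common.py`. For reference, that conversion is the base-62 scheme Weibo uses in tweet URLs; the sketch below illustrates the idea and may differ in detail from the project's own implementation.

```python
# Sketch of the mblogid -> mid conversion (Weibo's base-62 URL encoding).
ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


def url_to_mid(mblogid: str) -> int:
    """Decode an mblogid like 'Mb15BDYR0' into the numeric mid used by the ajax API."""
    mid = ''
    # Working from the right, every 4 base-62 characters encode 7 decimal digits of the mid.
    for end in range(len(mblogid), 0, -4):
        chunk = mblogid[max(end - 4, 0):end]
        num = 0
        for ch in chunk:
            num = num * 62 + ALPHABET.index(ch)
        mid = (str(num) if end <= 4 else str(num).zfill(7)) + mid
    return int(mid)


print(url_to_mid('Mb15BDYR0'))  # e.g. 4826257999528718 under this scheme
```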
### Tweets by tweet ID

```bash
python run_spider.py tweet_by_tweet_id
```

```json
{
  "_id": "4762810834227120",
  "mblogid": "LqlZNhJFm",
  "created_at": "2022-04-27 10:20:54",
  "geo": null,
  "ip_location": null,
  "reposts_count": 1890,
  "comments_count": 1924,
  "attitudes_count": 12167,
  "source": "三星Galaxy S22 Ultra",
  "content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
  "pic_urls": [],
  "pic_num": 0,
  "isLongText": false,
  "user": {
    "_id": "1087770692",
    "avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.1080.1080.1024/40d61044ly8gbhxwgy419j20u00u0goc.jpg?KID=imgbed,tva&Expires=1682768013&ssig=r1QurGoc2L",
    "nick_name": "陈坤",
    "verified": true,
    "mbrank": 7,
    "mbtype": 12,
    "verified_type": 0
  },
  "video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1682760813&ssig=26udcPSXFJ&KID=unistore,video",
  "url": "https://weibo.com/1087770692/LqlZNhJFm",
  "crawl_time": 1682757213
}
...
```

### Tweets by user ID

```bash
python run_spider.py tweet_by_user_id
```

```json
{
  "crawl_time": 1666864583,
  "_id": "4762810834227120",
  "mblogid": "LqlZNhJFm",
  "created_at": "2022-04-27 10:20:54",
  "geo": null,
  "ip_location": null,
  "reposts_count": 1907,
  "comments_count": 1924,
  "attitudes_count": 12169,
  "source": "三星Galaxy S22 Ultra",
  "content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
  "pic_urls": [],
  "pic_num": 0,
  "video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1666868183&ssig=RlIeOt286i&KID=unistore,video",
  "url": "https://weibo.com/1087770692/LqlZNhJFm"
}
...
```

### Tweets by keyword

```bash
python run_spider.py tweet_by_keyword
```

```json
{
  "crawl_time": 1666869049,
  "keyword": "丽江",
  "_id": "4829255386537989",
  "mblogid": "Mch46rqPr",
  "created_at": "2022-10-27 18:47:50",
  "geo": {
    "type": "Point",
    "coordinates": [
      26.962427,
      100.248299
    ],
    "detail": {
      "poiid": "B2094251D06FAAF44299",
      "title": "山野文创旅拍圣地",
      "type": "checkin",
      "spot_type": "0"
    }
  },
  "ip_location": "发布于 云南",
  "reposts_count": 0,
  "comments_count": 0,
  "attitudes_count": 1,
  "source": "iPhone1314iPhone客户端",
  "content": "丽江小漾日出\n推出户外移动餐桌\n接受私人定制\n让美食融入美景心情自然美丽了!\n#小众宝藏旅行地##超出片的艺术街区# ",
  "pic_urls": [
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k1a56c4oj234022onph",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19eb2kxj22ts1vvb2a",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k1a0wzglj22ua1w7hdw",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19wsafnj231x21a7wj",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19jd1xkj22oh1sbkjo",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19mma74j22ru1ukx6q",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19tf1bfj234022oe85",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19pk37pj234022okjm",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19g6nzfj20wi0lo7my"
  ],
  "pic_num": 9,
  "user": {
    "_id": "1259570181",
    "avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.1080.1080.1024/4b138405ly8gzfkfikyqvj20u00u0ag1.jpg?KID=imgbed,tva&Expires=1666879848&ssig=6PUDG5RonQ",
    "nick_name": "飞鸟与鱼",
    "verified": true,
    "mbrank": 7,
    "mbtype": 12,
    "verified_type": 0
  },
  "url": "https://weibo.com/1259570181/Mch46rqPr"
}
...
```

## Changelog

- 2024.02: support crawling the read count of your own tweets [#313](https://github.com/nghuyong/WeiboSpider/issues/313)
- 2024.02: support crawling video play counts [#315](https://github.com/nghuyong/WeiboSpider/issues/315)
- 2024.01: support tracing a repost back to the original tweet [#314](https://github.com/nghuyong/WeiboSpider/issues/314)
- 2023.12: support crawling second-level (reply) comments of a tweet [#302](https://github.com/nghuyong/WeiboSpider/issues/302)
- 2023.12: support crawling a user's tweets within a specified time span [#308](https://github.com/nghuyong/WeiboSpider/issues/308)
- 2023.04: support crawling tweets by tweet ID [#272](https://github.com/nghuyong/WeiboSpider/issues/272)
- 2022.11: support fetching more than 1200 pages of search results per day for a single keyword [#257](https://github.com/nghuyong/WeiboSpider/issues/257)
- 2022.11: support fetching the full text of long tweets
- 2022.11: keyword search supports a specified time range
- 2022.10: add IP location to crawled users, tweets, and comments
- 2022.10: rebuild the project on top of weibo.com

## Citation

```
@inproceedings{hu-etal-2020-weibo,
    title = "{W}eibo-{COV}: A Large-Scale {COVID}-19 Social Media Dataset from {W}eibo",
    author = "Hu, Yong  and
      Huang, Heyan  and
      Chen, Anfan  and
      Mao, Xian-Ling",
    booktitle = "Proceedings of the 1st Workshop on {NLP} for {COVID}-19 (Part 2) at {EMNLP} 2020",
    month = dec,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2020.nlpcovid19-2.34",
    doi = "10.18653/v1/2020.nlpcovid19-2.34",
}
```

## Related work

- We have also built the very large-scale Weibo-COV dataset, free to apply for, containing 20 million active Weibo users and 60 million tweets; see [here](https://github.com/nghuyong/weibo-public-opinion-datasets).
--------------------------------------------------------------------------------