├── .github ├── cookie.png └── weibospider.png ├── requirements.txt ├── weibospider ├── spiders │ ├── __init__.py │ ├── tweet_by_tweet_id.py │ ├── repost.py │ ├── follower.py │ ├── fan.py │ ├── user.py │ ├── comment.py │ ├── tweet_by_user_id.py │ ├── tweet_by_keyword.py │ └── common.py ├── cookie.txt ├── middlewares.py ├── settings.py ├── pipelines.py └── run_spider.py ├── LICENSE └── README.md /.github/cookie.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nghuyong/WeiboSpider/HEAD/.github/cookie.png -------------------------------------------------------------------------------- /.github/weibospider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nghuyong/WeiboSpider/HEAD/.github/weibospider.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==2.5.1 2 | python_dateutil 3 | cryptography==36.0.2 4 | pyOpenSSL==22.0.0 5 | Twisted==22.10.0 6 | -------------------------------------------------------------------------------- /weibospider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /weibospider/cookie.txt: -------------------------------------------------------------------------------- 1 | OUTFOX_SEARCH_USER_ID_NCOO=1892334049.0611136; _T_WM=5d0d245ef51ec62db21a50916b1e384b; SCF=AkceVOKbLcdJ3i5GuWKRwRkI-sTF8ozuM9kLPqvaNmzIbMz84C_D9mrcfkUDZ5USvQLxxcNSHMeGPwtuCmRzkQA.; SUB=_2A25OUHI9DeRhGeBN6VUX9SvEzT-IHXVtux51rDV6PUJbktAKLVCmkW1NRJ24IFM3w0MsIksejOBZJEcAgJZZobMv; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWYACoMUZFHDoS6U9MYf.vu5NHD95Qce0zNSo-f1hq0Ws4DqcjzdJUQUPLadJMt; SSOLoginState=1666450029 -------------------------------------------------------------------------------- /weibospider/middlewares.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class IPProxyMiddleware(object): 5 | """ 6 | 代理IP中间件 7 | """ 8 | 9 | @staticmethod 10 | def fetch_proxy(): 11 | """ 12 | 获取一个代理IP 13 | """ 14 | # You need to rewrite this function if you want to add proxy pool 15 | # the function should return an ip in the format of "ip:port" like "12.34.1.4:9090" 16 | return None 17 | 18 | def process_request(self, request, spider): 19 | """ 20 | 将代理IP添加到request请求中 21 | """ 22 | proxy_data = self.fetch_proxy() 23 | if proxy_data: 24 | current_proxy = f'http://{proxy_data}' 25 | spider.logger.debug(f"current proxy:{current_proxy}") 26 | request.meta['proxy'] = current_proxy 27 | -------------------------------------------------------------------------------- /weibospider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'spider' 4 | 5 | SPIDER_MODULES = ['spiders'] 6 | NEWSPIDER_MODULE = 'spiders' 7 | 8 | ROBOTSTXT_OBEY = False 9 | 10 | with open('cookie.txt', 'rt', encoding='utf-8') as f: 11 | cookie = f.read().strip() 12 | DEFAULT_REQUEST_HEADERS = { 13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0', 14 | 'Cookie': 
cookie 15 | } 16 | 17 | CONCURRENT_REQUESTS = 16 18 | 19 | DOWNLOAD_DELAY = 1 20 | 21 | DOWNLOADER_MIDDLEWARES = { 22 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None, 23 | 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None, 24 | 'middlewares.IPProxyMiddleware': 100, 25 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 101, 26 | } 27 | 28 | ITEM_PIPELINES = { 29 | 'pipelines.JsonWriterPipeline': 300, 30 | } 31 | -------------------------------------------------------------------------------- /weibospider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import json 4 | import os.path 5 | import time 6 | 7 | 8 | class JsonWriterPipeline(object): 9 | """ 10 | 写入json文件的pipline 11 | """ 12 | 13 | def __init__(self): 14 | self.file = None 15 | if not os.path.exists('../output'): 16 | os.mkdir('../output') 17 | 18 | def process_item(self, item, spider): 19 | """ 20 | 处理item 21 | """ 22 | if not self.file: 23 | now = datetime.datetime.now() 24 | file_name = spider.name + "_" + now.strftime("%Y%m%d%H%M%S") + '.jsonl' 25 | self.file = open(f'../output/{file_name}', 'wt', encoding='utf-8') 26 | item['crawl_time'] = int(time.time()) 27 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 28 | self.file.write(line) 29 | self.file.flush() 30 | return item 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 HuYong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /weibospider/spiders/tweet_by_tweet_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_tweet_info, parse_long_tweet 12 | 13 | 14 | class TweetSpiderByTweetID(Spider): 15 | """ 16 | 用户推文ID采集推文 17 | """ 18 | name = "tweet_spider_by_tweet_id" 19 | base_url = "https://weibo.cn" 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | tweet_ids = ['LqlZNhJFm'] 27 | for tweet_id in tweet_ids: 28 | url = f"https://weibo.com/ajax/statuses/show?id={tweet_id}" 29 | yield Request(url, callback=self.parse) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | item = parse_tweet_info(data) 37 | if item['isLongText']: 38 | url = "https://weibo.com/ajax/statuses/longtext?id=" + item['mblogid'] 39 | yield Request(url, callback=parse_long_tweet, meta={'item': item}) 40 | yield item 41 | -------------------------------------------------------------------------------- /weibospider/run_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2019-12-07 21:27 7 | """ 8 | import os 9 | import sys 10 | from scrapy.crawler import CrawlerProcess 11 | from scrapy.utils.project import get_project_settings 12 | from spiders.tweet_by_user_id import TweetSpiderByUserID 13 | from spiders.tweet_by_keyword import TweetSpiderByKeyword 14 | from spiders.tweet_by_tweet_id import TweetSpiderByTweetID 15 | from spiders.comment import CommentSpider 16 | from spiders.follower import FollowerSpider 17 | from spiders.user import UserSpider 18 | from spiders.fan import FanSpider 19 | from spiders.repost import RepostSpider 20 | 21 | if __name__ == '__main__': 22 | mode = sys.argv[1] 23 | os.environ['SCRAPY_SETTINGS_MODULE'] = 'settings' 24 | settings = get_project_settings() 25 | process = CrawlerProcess(settings) 26 | mode_to_spider = { 27 | 'comment': CommentSpider, 28 | 'fan': FanSpider, 29 | 'follow': FollowerSpider, 30 | 'user': UserSpider, 31 | 'repost': RepostSpider, 32 | 'tweet_by_tweet_id': TweetSpiderByTweetID, 33 | 'tweet_by_user_id': TweetSpiderByUserID, 34 | 'tweet_by_keyword': TweetSpiderByKeyword, 35 | } 36 | process.crawl(mode_to_spider[mode]) 37 | # the script will block here until the crawling is finished 38 | process.start() 39 | -------------------------------------------------------------------------------- /weibospider/spiders/repost.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_tweet_info, url_to_mid 12 | 13 | 14 | class RepostSpider(Spider): 15 | """ 16 | 微博转发数据采集 17 | """ 18 | name = "repost" 19 | 20 | def start_requests(self): 21 | """ 22 | 爬虫入口 23 | """ 24 | # 这里tweet_ids可替换成实际待采集的数据 25 | tweet_ids = ['Mb15BDYR0'] 26 | for tweet_id in tweet_ids: 27 | mid = url_to_mid(tweet_id) 28 | 
url = f"https://weibo.com/ajax/statuses/repostTimeline?id={mid}&page=1&moduleID=feed&count=10" 29 | yield Request(url, callback=self.parse, meta={'page_num': 1, 'mid': mid}) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | for tweet in data['data']: 37 | item = parse_tweet_info(tweet) 38 | yield item 39 | if data['data']: 40 | mid, page_num = response.meta['mid'], response.meta['page_num'] 41 | page_num += 1 42 | url = f"https://weibo.com/ajax/statuses/repostTimeline?id={mid}&page={page_num}&moduleID=feed&count=10" 43 | yield Request(url, callback=self.parse, meta={'page_num': page_num, 'mid': mid}) 44 | -------------------------------------------------------------------------------- /weibospider/spiders/follower.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.comment import parse_user_info 12 | 13 | 14 | class FollowerSpider(Spider): 15 | """ 16 | 微博关注数据采集 17 | """ 18 | name = "follower" 19 | base_url = 'https://weibo.com/ajax/friendships/friends' 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | user_ids = ['1087770692'] 27 | for user_id in user_ids: 28 | url = self.base_url + f"?page=1&uid={user_id}" 29 | yield Request(url, callback=self.parse, meta={'user': user_id, 'page_num': 1}) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | for user in data['users']: 37 | item = dict() 38 | item['fan_id'] = response.meta['user'] 39 | item['follower_info'] = parse_user_info(user) 40 | item['_id'] = response.meta['user'] + '_' + item['follower_info']['_id'] 41 | yield item 42 | if data['users']: 43 | response.meta['page_num'] += 1 44 | url = self.base_url + f"?page={response.meta['page_num']}&uid={response.meta['user']}" 45 | yield Request(url, callback=self.parse, meta=response.meta) 46 | -------------------------------------------------------------------------------- /weibospider/spiders/fan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.comment import parse_user_info 12 | 13 | 14 | class FanSpider(Spider): 15 | """ 16 | 微博粉丝数据采集 17 | """ 18 | name = "fan" 19 | base_url = 'https://weibo.com/ajax/friendships/friends' 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | user_ids = ['1087770692'] 27 | for user_id in user_ids: 28 | url = self.base_url + f"?relate=fans&page=1&uid={user_id}&type=fans" 29 | yield Request(url, callback=self.parse, meta={'user': user_id, 'page_num': 1}) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | for user in data['users']: 37 | item = dict() 38 | item['follower_id'] = response.meta['user'] 39 | item['fan_info'] = parse_user_info(user) 40 | item['_id'] = response.meta['user'] + '_' + item['fan_info']['_id'] 41 | yield item 42 | if data['users']: 43 | response.meta['page_num'] += 1 44 | url = self.base_url + 
f"?relate=fans&page={response.meta['page_num']}&uid={response.meta['user']}&type=fans" 45 | yield Request(url, callback=self.parse, meta=response.meta) 46 | -------------------------------------------------------------------------------- /weibospider/spiders/user.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_user_info 12 | 13 | 14 | class UserSpider(Spider): 15 | """ 16 | 微博用户信息爬虫 17 | """ 18 | name = "user_spider" 19 | base_url = "https://weibo.cn" 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里user_ids可替换成实际待采集的数据 26 | user_ids = ['1749127163'] 27 | urls = [f'https://weibo.com/ajax/profile/info?uid={user_id}' for user_id in user_ids] 28 | for url in urls: 29 | yield Request(url, callback=self.parse) 30 | 31 | def parse(self, response, **kwargs): 32 | """ 33 | 网页解析 34 | """ 35 | data = json.loads(response.text) 36 | item = parse_user_info(data['data']['user']) 37 | url = f"https://weibo.com/ajax/profile/detail?uid={item['_id']}" 38 | yield Request(url, callback=self.parse_detail, meta={'item': item}) 39 | 40 | @staticmethod 41 | def parse_detail(response): 42 | """ 43 | 解析详细数据 44 | """ 45 | item = response.meta['item'] 46 | data = json.loads(response.text)['data'] 47 | item['birthday'] = data.get('birthday', '') 48 | if 'created_at' not in item: 49 | item['created_at'] = data.get('created_at', '') 50 | item['desc_text'] = data.get('desc_text', '') 51 | item['ip_location'] = data.get('ip_location', '') 52 | item['sunshine_credit'] = data.get('sunshine_credit', {}).get('level', '') 53 | item['label_desc'] = [label['name'] for label in data.get('label_desc', [])] 54 | if 'company' in data: 55 | item['company'] = data['company'] 56 | if 'education' in data: 57 | item['education'] = data['education'] 58 | yield item 59 | -------------------------------------------------------------------------------- /weibospider/spiders/comment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import json 9 | from scrapy import Spider 10 | from scrapy.http import Request 11 | from spiders.common import parse_user_info, parse_time, url_to_mid 12 | 13 | 14 | class CommentSpider(Spider): 15 | """ 16 | 微博评论数据采集 17 | """ 18 | name = "comment" 19 | 20 | def start_requests(self): 21 | """ 22 | 爬虫入口 23 | """ 24 | # 这里tweet_ids可替换成实际待采集的数据 25 | tweet_ids = ['Mb15BDYR0'] 26 | for tweet_id in tweet_ids: 27 | mid = url_to_mid(tweet_id) 28 | url = f"https://weibo.com/ajax/statuses/buildComments?" 
\ 29 | f"is_reload=1&id={mid}&is_show_bulletin=2&is_mix=0&count=20" 30 | yield Request(url, callback=self.parse, meta={'source_url': url}) 31 | 32 | def parse(self, response, **kwargs): 33 | """ 34 | 网页解析 35 | """ 36 | data = json.loads(response.text) 37 | for comment_info in data['data']: 38 | item = self.parse_comment(comment_info) 39 | yield item 40 | # 解析二级评论 41 | if 'more_info' in comment_info: 42 | url = f"https://weibo.com/ajax/statuses/buildComments?is_reload=1&id={comment_info['id']}" \ 43 | f"&is_show_bulletin=2&is_mix=1&fetch_level=1&max_id=0&count=100" 44 | yield Request(url, callback=self.parse, priority=20) 45 | if data.get('max_id', 0) != 0 and 'fetch_level=1' not in response.url: 46 | url = response.meta['source_url'] + '&max_id=' + str(data['max_id']) 47 | yield Request(url, callback=self.parse, meta=response.meta) 48 | 49 | @staticmethod 50 | def parse_comment(data): 51 | """ 52 | 解析comment 53 | """ 54 | item = dict() 55 | item['created_at'] = parse_time(data['created_at']) 56 | item['_id'] = data['id'] 57 | item['like_counts'] = data['like_counts'] 58 | item['ip_location'] = data.get('source', '') 59 | item['content'] = data['text_raw'] 60 | item['comment_user'] = parse_user_info(data['user']) 61 | if 'reply_comment' in data: 62 | item['reply_comment'] = { 63 | '_id': data['reply_comment']['id'], 64 | 'text': data['reply_comment']['text'], 65 | 'user': parse_user_info(data['reply_comment']['user']), 66 | } 67 | return item 68 | -------------------------------------------------------------------------------- /weibospider/spiders/tweet_by_user_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: nghuyong 5 | Mail: nghuyong@163.com 6 | Created Time: 2020/4/14 7 | """ 8 | import datetime 9 | import json 10 | import re 11 | 12 | from scrapy import Spider 13 | from scrapy.http import Request 14 | from spiders.common import parse_tweet_info, parse_long_tweet 15 | 16 | 17 | class TweetSpiderByUserID(Spider): 18 | """ 19 | 用户推文数据采集 20 | """ 21 | name = "tweet_spider_by_user_id" 22 | 23 | def start_requests(self): 24 | """ 25 | 爬虫入口 26 | """ 27 | # 这里user_ids可替换成实际待采集的数据 28 | user_ids = ['1087770692'] 29 | # 这里的时间替换成实际需要的时间段,如果要采集用户全部推文 is_crawl_specific_time_span 设置为False 30 | is_crawl_specific_time_span = True 31 | start_time = datetime.datetime(year=2022, month=1, day=1) 32 | end_time = datetime.datetime(year=2023, month=1, day=1) 33 | for user_id in user_ids: 34 | url = f"https://weibo.com/ajax/statuses/searchProfile?uid={user_id}&page=1&hasori=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1&hasret=1" 35 | if not is_crawl_specific_time_span: 36 | yield Request(url, callback=self.parse, meta={'user_id': user_id, 'page_num': 1}) 37 | else: 38 | # 切分成10天进行 39 | tmp_start_time = start_time 40 | while tmp_start_time <= end_time: 41 | tmp_end_time = tmp_start_time + datetime.timedelta(days=10) 42 | tmp_end_time = min(tmp_end_time, end_time) 43 | tmp_url = url + f'&starttime={int(tmp_start_time.timestamp())}&endtime={int(tmp_end_time.timestamp())}' 44 | yield Request(tmp_url, callback=self.parse, meta={'user_id': user_id, 'page_num': 1}) 45 | tmp_start_time = tmp_end_time + datetime.timedelta(days=1) 46 | 47 | def parse(self, response, **kwargs): 48 | """ 49 | 网页解析 50 | """ 51 | data = json.loads(response.text) 52 | tweets = data['data']['list'] 53 | for tweet in tweets: 54 | item = parse_tweet_info(tweet) 55 | del item['user'] 56 | if item['isLongText']: 57 | url = 
"https://weibo.com/ajax/statuses/longtext?id=" + item['mblogid'] 58 | yield Request(url, callback=parse_long_tweet, meta={'item': item}) 59 | else: 60 | yield item 61 | if tweets: 62 | user_id, page_num = response.meta['user_id'], response.meta['page_num'] 63 | url = response.url.replace(f'page={page_num}', f'page={page_num + 1}') 64 | yield Request(url, callback=self.parse, meta={'user_id': user_id, 'page_num': page_num + 1}) 65 | -------------------------------------------------------------------------------- /weibospider/spiders/tweet_by_keyword.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Author: rightyonghu 5 | Created Time: 2022/10/22 6 | """ 7 | import datetime 8 | import json 9 | import re 10 | from scrapy import Spider, Request 11 | from spiders.common import parse_tweet_info, parse_long_tweet 12 | 13 | 14 | class TweetSpiderByKeyword(Spider): 15 | """ 16 | 关键词搜索采集 17 | """ 18 | name = "tweet_spider_by_keyword" 19 | base_url = "https://s.weibo.com/" 20 | 21 | def start_requests(self): 22 | """ 23 | 爬虫入口 24 | """ 25 | # 这里keywords可替换成实际待采集的数据 26 | keywords = ['丽江'] 27 | # 这里的时间可替换成实际需要的时间段 28 | start_time = datetime.datetime(year=2022, month=10, day=1, hour=0) 29 | end_time = datetime.datetime(year=2022, month=10, day=7, hour=23) 30 | # 是否按照小时进行切分,数据量更大; 对于非热门关键词**不需要**按照小时切分 31 | is_split_by_hour = True 32 | for keyword in keywords: 33 | if not is_split_by_hour: 34 | _start_time = start_time.strftime("%Y-%m-%d-%H") 35 | _end_time = end_time.strftime("%Y-%m-%d-%H") 36 | url = f"https://s.weibo.com/weibo?q={keyword}×cope=custom%3A{_start_time}%3A{_end_time}&page=1" 37 | yield Request(url, callback=self.parse, meta={'keyword': keyword}) 38 | else: 39 | time_cur = start_time 40 | while time_cur < end_time: 41 | _start_time = time_cur.strftime("%Y-%m-%d-%H") 42 | _end_time = (time_cur + datetime.timedelta(hours=1)).strftime("%Y-%m-%d-%H") 43 | url = f"https://s.weibo.com/weibo?q={keyword}×cope=custom%3A{_start_time}%3A{_end_time}&page=1" 44 | yield Request(url, callback=self.parse, meta={'keyword': keyword}) 45 | time_cur = time_cur + datetime.timedelta(hours=1) 46 | 47 | def parse(self, response, **kwargs): 48 | """ 49 | 网页解析 50 | """ 51 | html = response.text 52 | if '
抱歉,未找到相关结果。
' in html: 53 | self.logger.info(f'no search result. url: {response.url}') 54 | return 55 | tweets_infos = re.findall('
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
<img src=".github/weibospider.png" alt="WeiboSpider"/>

# WeiboSpider
A continuously maintained Sina Weibo crawler 🚀🚀🚀
## Features

- Built on the new weibo.com APIs, which expose the richest set of fields
- Multiple crawl modes: user profiles, tweets, fans, followings, reposts, comments, and keyword search
- The core code is only about 100 lines, easy to read and quick to customize for your own needs

## Quick start

### Clone && install

```bash
git clone https://github.com/nghuyong/WeiboSpider.git --depth 1
cd WeiboSpider
pip install -r requirements.txt
```

### Replace the cookie

Visit [https://weibo.com/](https://weibo.com/), log in to your account, open the browser's developer tools, and refresh the page.



In the Network panel, select the `weibo.com` request and copy its Cookie value. Edit `weibospider/cookie.txt` and replace its content with the cookie you just copied.
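Before launching a full crawl, you can sanity-check the cookie against the same profile endpoint that `spiders/user.py` requests. The snippet below is only a sketch: it assumes the `requests` package (not listed in `requirements.txt`) and reuses the example uid that appears in `spiders/user.py` and in the sample output further down.

```python
# Sketch: verify that the cookie in weibospider/cookie.txt is accepted by weibo.com.
# Assumes `pip install requests`; run from the repository root.
import requests

with open('weibospider/cookie.txt', 'rt', encoding='utf-8') as f:
    cookie = f.read().strip()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Cookie': cookie,
}
# The same endpoint spiders/user.py starts from; 1749127163 is the example uid used there.
resp = requests.get('https://weibo.com/ajax/profile/info?uid=1749127163',
                    headers=headers, allow_redirects=False)
print(resp.status_code)                       # 200 means the request was not bounced to login
print('user' in resp.json().get('data', {}))  # True if profile data came back
```

If the cookie has expired you will typically get a redirect or an error payload instead of the profile JSON; repeat the steps above to grab a fresh cookie.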
### Add proxy IPs (optional)

Override the [fetch_proxy](./weibospider/middlewares.py#L6) method; it has to return a proxy IP. See [this comment](https://github.com/nghuyong/WeiboSpider/issues/124#issuecomment-654335439) for a reference implementation.

Recommended proxy: Swiftproxy ([link](https://www.swiftproxy.net/?ref=hy)). **Sign up to get 500 MB of free test traffic; the discount code "GHB5" gives an extra 10% off.**
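As a reference, here is a minimal sketch of a rewritten `fetch_proxy`. The pool endpoint (`http://127.0.0.1:5010/get`) is a placeholder for whatever proxy service you actually use; the only contract is that the method returns an address in the `ip:port` form (e.g. `12.34.1.4:9090`) or `None` to send the request without a proxy.

```python
# weibospider/middlewares.py -- sketch of a rewritten IPProxyMiddleware.fetch_proxy.
# The pool URL below is hypothetical; plug in your own provider.
import requests


class IPProxyMiddleware(object):

    @staticmethod
    def fetch_proxy():
        try:
            resp = requests.get('http://127.0.0.1:5010/get', timeout=3)
            proxy = resp.text.strip()
            return proxy or None  # None -> the request goes out without a proxy
        except requests.RequestException:
            return None

    def process_request(self, request, spider):
        # unchanged from the repository version
        proxy_data = self.fetch_proxy()
        if proxy_data:
            current_proxy = f'http://{proxy_data}'
            spider.logger.debug(f"current proxy:{current_proxy}")
            request.meta['proxy'] = current_proxy
```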
## Running the spiders

Rewrite the `start_requests` method of the spiders in `./weibospider/spiders/*` according to what you actually want to crawl.

Crawled data is written to the `output` directory, in files named `{spider.name}_{datetime}.jsonl`.
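For example, to crawl the profiles of user IDs kept in a text file instead of the hard-coded list, the `start_requests` method in `spiders/user.py` could be rewritten along these lines (a sketch; `user_ids.txt`, one uid per line, is an assumed input file, and the rest of `UserSpider` stays unchanged):

```python
# spiders/user.py -- sketch of a customized UserSpider.start_requests.
# `Request` is already imported at the top of that module.
def start_requests(self):
    """Read target uids from a file instead of hard-coding them."""
    with open('user_ids.txt', 'rt', encoding='utf-8') as f:
        user_ids = [line.strip() for line in f if line.strip()]
    for user_id in user_ids:
        url = f'https://weibo.com/ajax/profile/info?uid={user_id}'
        yield Request(url, callback=self.parse)
```

Each line of an output file is one JSON object, so the results can be loaded with a plain loop over `json.loads` or with `pandas.read_json(path, lines=True)`.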
### User profiles

```bash
cd weibospider
python run_spider.py user
```

```json
{
  "crawl_time": 1666863485,
  "_id": "1749127163",
  "avatar_hd": "https://tvax4.sinaimg.cn/crop.0.0.1080.1080.1024/001Un9Srly8h3fpj11yjyj60u00u0q7f02.jpg?KID=imgbed,tva&Expires=1666874283&ssig=a%2FMfgFzvRo",
  "nick_name": "雷军",
  "verified": true,
  "description": "小米董事长,金山软件董事长。业余爱好是天使投资。",
  "followers_count": 22756103,
  "friends_count": 1373,
  "statuses_count": 14923,
  "gender": "m",
  "location": "北京 海淀区",
  "mbrank": 7,
  "mbtype": 12,
  "verified_type": 0,
  "verified_reason": "小米创办人,董事长兼CEO;金山软件董事长;天使投资人。",
  "birthday": "",
  "created_at": "2010-05-31 23:07:59",
  "desc_text": "小米创办人,董事长兼CEO;金山软件董事长;天使投资人。",
  "ip_location": "IP属地:北京",
  "sunshine_credit": "信用极好",
  "label_desc": [
    "V指数 财经 75.30分",
    "热门财经博主 数据飙升",
    "昨日发博3,阅读数100万+,互动数1.9万",
    "视频累计播放量9819.3万",
    "群友 3132"
  ],
  "company": "金山软件",
  "education": {
    "school": "武汉大学"
  }
}
```

### A user's fans

```bash
python run_spider.py fan
```

```json
{
  "crawl_time": 1666863563,
  "_id": "1087770692_5968044974",
  "follower_id": "1087770692",
  "fan_info": {
    "_id": "5968044974",
    "avatar_hd": "https://tvax1.sinaimg.cn/default/images/default_avatar_male_180.gif?KID=imgbed,tva&Expires=1666874363&ssig=UuzaeK437R",
    "nick_name": "用户5968044974",
    "verified": false,
    "description": "",
    "followers_count": 0,
    "friends_count": 195,
    "statuses_count": 9,
    "gender": "m",
    "location": "其他",
    "mbrank": 0,
    "mbtype": 0,
    "credit_score": 80,
    "created_at": "2016-06-25 22:30:13"
  }
}
...
```

### A user's followings

```bash
python run_spider.py follow
```

```json
{
  "crawl_time": 1666863679,
  "_id": "1087770692_7083568088",
  "fan_id": "1087770692",
  "follower_info": {
    "_id": "7083568088",
    "avatar_hd": "https://tvax4.sinaimg.cn/crop.0.0.1080.1080.1024/007JnVEcly8gyqd9jadjlj30u00u0gpn.jpg?KID=imgbed,tva&Expires=1666874479&ssig=9zhfeMPLzr",
    "nick_name": "蒋昀霖",
    "verified": true,
    "description": "工作请联系:lijialun@kpictures.cn",
    "followers_count": 329216,
    "friends_count": 58,
    "statuses_count": 342,
    "gender": "m",
    "location": "北京",
    "mbrank": 6,
    "mbtype": 12,
    "credit_score": 80,
    "created_at": "2019-04-17 16:25:43",
    "verified_type": 0,
    "verified_reason": "东申未来 演员"
  }
}
...
```

### Tweet comments

```bash
python run_spider.py comment
```

```json
{
  "crawl_time": 1666863805,
  "_id": 4826279188108038,
  "created_at": "2022-10-19 13:41:29",
  "like_counts": 1,
  "ip_location": "来自河南",
  "content": "五周年快乐呀,请坤哥哥继续保持这份热爱,奔赴下一场山海",
  "comment_user": {
    "_id": "2380967841",
    "avatar_hd": "https://tvax4.sinaimg.cn/crop.0.0.888.888.1024/002B8iv7ly8gv647ipgxvj60oo0oojtk02.jpg?KID=imgbed,tva&Expires=1666874604&ssig=%2FdGaaIRkhf",
    "nick_name": "流年执念的二瓜娇",
    "verified": false,
    "description": "蓝桉已遇释怀鸟,不爱万物唯爱你。",
    "followers_count": 238,
    "friends_count": 1655,
    "statuses_count": 12546,
    "gender": "f",
    "location": "河南",
    "mbrank": 6,
    "mbtype": 11
  }
}
...
```

### Tweet reposts

```bash
python run_spider.py repost
```

```json
{
  "_id": "4826312651310475",
  "mblogid": "Mb2vL5uUH",
  "created_at": "2022-10-19 15:54:27",
  "geo": null,
  "ip_location": "发布于 德国",
  "reposts_count": 0,
  "comments_count": 0,
  "attitudes_count": 0,
  "source": "iPhone客户端",
  "content": "共享[鼓掌][太开心][鼓掌]五周年快乐!//@陈坤:#山下学堂五周年# 五年, 感谢同行。",
  "pic_urls": [],
  "pic_num": 0,
  "user": {
    "_id": "2717869081",
    "avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.160.160.1024/a1ff6419ly8gz1xoq9oolj204g04g745.jpg?KID=imgbed,tva&Expires=1666876939&ssig=Cl93CLjdB%2F",
    "nick_name": "YuFeeC",
    "verified": false,
    "mbrank": 0,
    "mbtype": 0
  },
  "url": "https://weibo.com/2717869081/Mb2vL5uUH",
  "crawl_time": 1666866139
}
...
```
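Both the `comment` and `repost` spiders take an alphanumeric `mblogid` such as `Mb15BDYR0` and convert it to the numeric `mid` expected by the ajax API, using `url_to_mid` from `spiders/common.py`. For reference, that conversion is the base-62 scheme Weibo uses in tweet URLs; the sketch below illustrates the idea and may differ in detail from the project's own implementation.

```python
# Sketch of the mblogid -> mid conversion (Weibo's base-62 URL encoding).
ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


def url_to_mid(mblogid: str) -> int:
    """Decode an mblogid like 'Mb15BDYR0' into the numeric mid used by the ajax API."""
    mid = ''
    # Working from the right, every 4 base-62 characters encode 7 decimal digits of the mid.
    for end in range(len(mblogid), 0, -4):
        chunk = mblogid[max(end - 4, 0):end]
        num = 0
        for ch in chunk:
            num = num * 62 + ALPHABET.index(ch)
        mid = (str(num) if end <= 4 else str(num).zfill(7)) + mid
    return int(mid)


print(url_to_mid('Mb15BDYR0'))  # e.g. 4826257999528718 under this scheme
```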
### Tweets by tweet ID

```bash
python run_spider.py tweet_by_tweet_id
```

```json
{
  "_id": "4762810834227120",
  "mblogid": "LqlZNhJFm",
  "created_at": "2022-04-27 10:20:54",
  "geo": null,
  "ip_location": null,
  "reposts_count": 1890,
  "comments_count": 1924,
  "attitudes_count": 12167,
  "source": "三星Galaxy S22 Ultra",
  "content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
  "pic_urls": [],
  "pic_num": 0,
  "isLongText": false,
  "user": {
    "_id": "1087770692",
    "avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.1080.1080.1024/40d61044ly8gbhxwgy419j20u00u0goc.jpg?KID=imgbed,tva&Expires=1682768013&ssig=r1QurGoc2L",
    "nick_name": "陈坤",
    "verified": true,
    "mbrank": 7,
    "mbtype": 12,
    "verified_type": 0
  },
  "video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1682760813&ssig=26udcPSXFJ&KID=unistore,video",
  "url": "https://weibo.com/1087770692/LqlZNhJFm",
  "crawl_time": 1682757213
}
...
```

### Tweets by user ID

```bash
python run_spider.py tweet_by_user_id
```

```json
{
  "crawl_time": 1666864583,
  "_id": "4762810834227120",
  "mblogid": "LqlZNhJFm",
  "created_at": "2022-04-27 10:20:54",
  "geo": null,
  "ip_location": null,
  "reposts_count": 1907,
  "comments_count": 1924,
  "attitudes_count": 12169,
  "source": "三星Galaxy S22 Ultra",
  "content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
  "pic_urls": [],
  "pic_num": 0,
  "video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1666868183&ssig=RlIeOt286i&KID=unistore,video",
  "url": "https://weibo.com/1087770692/LqlZNhJFm"
}
...
```

### Tweets by keyword

```bash
python run_spider.py tweet_by_keyword
```

```json
{
  "crawl_time": 1666869049,
  "keyword": "丽江",
  "_id": "4829255386537989",
  "mblogid": "Mch46rqPr",
  "created_at": "2022-10-27 18:47:50",
  "geo": {
    "type": "Point",
    "coordinates": [
      26.962427,
      100.248299
    ],
    "detail": {
      "poiid": "B2094251D06FAAF44299",
      "title": "山野文创旅拍圣地",
      "type": "checkin",
      "spot_type": "0"
    }
  },
  "ip_location": "发布于 云南",
  "reposts_count": 0,
  "comments_count": 0,
  "attitudes_count": 1,
  "source": "iPhone1314iPhone客户端",
  "content": "丽江小漾日出\n推出户外移动餐桌\n接受私人定制\n让美食融入美景心情自然美丽了!\n#小众宝藏旅行地##超出片的艺术街区# ",
  "pic_urls": [
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k1a56c4oj234022onph",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19eb2kxj22ts1vvb2a",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k1a0wzglj22ua1w7hdw",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19wsafnj231x21a7wj",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19jd1xkj22oh1sbkjo",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19mma74j22ru1ukx6q",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19tf1bfj234022oe85",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19pk37pj234022okjm",
    "https://wx1.sinaimg.cn/orj960/4b138405gy1h7k19g6nzfj20wi0lo7my"
  ],
  "pic_num": 9,
  "user": {
    "_id": "1259570181",
    "avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.1080.1080.1024/4b138405ly8gzfkfikyqvj20u00u0ag1.jpg?KID=imgbed,tva&Expires=1666879848&ssig=6PUDG5RonQ",
    "nick_name": "飞鸟与鱼",
    "verified": true,
    "mbrank": 7,
    "mbtype": 12,
    "verified_type": 0
  },
  "url": "https://weibo.com/1259570181/Mch46rqPr"
}
...
```

## Changelog

- 2024.02: support crawling the read count of your own tweets [#313](https://github.com/nghuyong/WeiboSpider/issues/313)
- 2024.02: support crawling video play counts [#315](https://github.com/nghuyong/WeiboSpider/issues/315)
- 2024.01: support tracing a repost back to the original tweet [#314](https://github.com/nghuyong/WeiboSpider/issues/314)
- 2023.12: support crawling second-level (reply) comments of a tweet [#302](https://github.com/nghuyong/WeiboSpider/issues/302)
- 2023.12: support crawling a user's tweets within a specified time span [#308](https://github.com/nghuyong/WeiboSpider/issues/308)
- 2023.04: support crawling tweets by tweet ID [#272](https://github.com/nghuyong/WeiboSpider/issues/272)
- 2022.11: support fetching more than 1200 pages of search results per day for a single keyword [#257](https://github.com/nghuyong/WeiboSpider/issues/257)
- 2022.11: support fetching the full text of long tweets
- 2022.11: keyword search supports a specified time range
- 2022.10: add IP location to crawled users, tweets, and comments
- 2022.10: rebuild the project on top of weibo.com

## Citation

```
@inproceedings{hu-etal-2020-weibo,
    title = "{W}eibo-{COV}: A Large-Scale {COVID}-19 Social Media Dataset from {W}eibo",
    author = "Hu, Yong  and
      Huang, Heyan  and
      Chen, Anfan  and
      Mao, Xian-Ling",
    booktitle = "Proceedings of the 1st Workshop on {NLP} for {COVID}-19 (Part 2) at {EMNLP} 2020",
    month = dec,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2020.nlpcovid19-2.34",
    doi = "10.18653/v1/2020.nlpcovid19-2.34",
}
```

## Related work

- We have also built the very large-scale Weibo-COV dataset, free to apply for, containing 20 million active Weibo users and 60 million tweets; see [here](https://github.com/nghuyong/weibo-public-opinion-datasets).
--------------------------------------------------------------------------------