├── bbsspider
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── const.py
│   │   ├── bbstopten.py
│   │   └── bbsarticle.py
│   ├── items.py
│   ├── pipelines.py
│   └── settings.py
├── examples
│   ├── 1858656.png
│   └── 3208312.png
├── scrapy.cfg
└── README.md

/bbsspider/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/examples/1858656.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuemingzhu/BYR-HeadImgs/HEAD/examples/1858656.png
--------------------------------------------------------------------------------
/examples/3208312.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuemingzhu/BYR-HeadImgs/HEAD/examples/3208312.png
--------------------------------------------------------------------------------
/bbsspider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = bbsspider.settings

[deploy]
#url = http://localhost:6800/
project = bbsspider
--------------------------------------------------------------------------------
/bbsspider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ArtItem(scrapy.Item):
    url = scrapy.Field()          # thread URL with the query string stripped
    avatarUrls = scrapy.Field()   # avatar image URLs found on one page of the thread
    userName = scrapy.Field()     # user IDs of the repliers on that page
--------------------------------------------------------------------------------
/bbsspider/spiders/const.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
URL = 'http://bbs.byr.cn'
ALLOW_DOMAINS = ['bbs.byr.cn']
HEADERS = {'User-Agent': 'Mozilla/5.0', 'Host': 'bbs.byr.cn', 'X-Requested-With': 'XMLHttpRequest',
           'Connection': 'keep-alive'}
# Replace id and passwd with your own username and password; nothing else needs to change.
# Remember to blank out the password before committing. =. =
LOGIN_DATA = {'id': 'fuxuemingzhu', 'passwd': '*********', 'mode': '0', 'CookieDate': '0'}
# Whether to remove duplicate avatars from the collage; set to False to keep every occurrence
removeDuplicate = True
--------------------------------------------------------------------------------
/bbsspider/pipelines.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImagePipeline(ImagesPipeline):  # subclass ImagesPipeline to download the avatars

    def get_media_requests(self, item, info):  # override ImagesPipeline.get_media_requests
        '''
        As described in the media pipeline workflow, the pipeline receives the
        file URLs carried by the item and downloads them. To do that, we
        override get_media_requests() and return one Request per image URL.
        '''
        for image_url in item['avatarUrls']:
            yield scrapy.Request(image_url)

    def file_path(self, request, response=None, info=None):
        # Store each avatar under its original file name (the last URL path segment)
        image_guid = request.url.split('/')[-1]
        return '%s' % image_guid
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BYR BBS user avatar collage

Build, with a single command, a collage of the avatars of everyone who replied in a BYR BBS top-ten post or in any thread you choose.

Deduplicating user avatars when generating the collage is now supported.

A collage without user deduplication is shown below; each row of avatars corresponds to one page of the thread:
![collage without deduplication][1]
The image above is built from all the user avatars in this thread: 你有一个研一小哥哥,请查收~ https://bbs.byr.cn/#!article/Friends/1858656

A collage with user deduplication is shown below; the order still follows the order of the replies:
![collage with deduplication][2]
The image above comes from: 十大合影~ https://bbs.byr.cn/#!article/Picture/3208312

To support the BYR "koi" lottery event, the crawler can also collect the IDs of every user who replied to an article. It is used the same way as crawling an article; when the run finishes, a file named ``all_users_ids_time=<crawl time>.txt`` is written to the project root, mapping floor numbers to user IDs. A sample:

```txt
本爬虫的爬取时间是:2018-10-13 10:02:18
0,wu111137
1,z574690129
2,liangkeng
3,troubadour
4,bloodsmail
5,yqyqyqyqyqy
6,Alison
7,zc199102
8,XingXudong
......
```

# Requirements

Please install the following yourself:

1. python3

2. scrapy

3. PIL

# Usage

## Configure your username and password

Fill in your BYR BBS **username and password** in bbsspider/spiders/const.py.

## Crawl the avatar collage and all reply IDs for given articles

1. Set article_urls in bbsspider/spiders/bbsarticle.py to the list of article URLs you want to process (see the snippet after this section).

2. From the project root (the directory containing scrapy.cfg), run:


```bash
scrapy crawl article
```

This generates the avatar collage for each listed thread and saves all reply IDs in the project root.
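For reference, ``article_urls`` is just a plain Python list on the ``Article`` spider class; a minimal sketch of the edit (the URL below is the placeholder the repository ships with — substitute your own thread links):

```python
# bbsspider/spiders/bbsarticle.py, inside the Article spider class
article_urls = [
    'https://bbs.byr.cn/#!article/Talking/6053294',  # placeholder; replace with your own thread URLs
]
```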
## Crawl the top-ten posts

From the project root (the directory containing scrapy.cfg), run:

```bash
scrapy crawl topten
```

This automatically crawls the ten current top-ten articles and generates an avatar collage for each.

## Output and avatar deduplication

After the run, the collages are written to the headImages directory, each named after the number at the end of its thread URL.

Note: the removeDuplicate variable in bbsspider/spiders/const.py controls whether user avatars are deduplicated: True removes duplicates, False does not; by default no deduplication is done. The crawled floor-to-user-ID mapping is never deduplicated.

# Disclaimer

1. This is a hobby project and the code quality is not high. Feel free to use it; discussion is welcome.

2. All of the code runs locally on your own machine; I do not collect forum passwords.

3. Thanks to the earlier forum crawler this builds on: https://github.com/buptbill220/bbsspider


[1]: https://raw.githubusercontent.com/fuxuemingzhu/BYR-HeadImgs/master/examples/1858656.png
[2]: https://github.com/fuxuemingzhu/BYR-HeadImgs/blob/592ae4a7683bbfcc200d5e03410d07c207624795/examples/3208312.png?raw=true
--------------------------------------------------------------------------------
/bbsspider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for bbsspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bbsspider'

SPIDER_MODULES = ['bbsspider.spiders']
NEWSPIDER_MODULE = 'bbsspider.spiders'
# Image download: store avatars under images/ and run items through the custom ImagePipeline
IMAGES_STORE = 'images/'
ITEM_PIPELINES = {
    'bbsspider.pipelines.ImagePipeline': 1,
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'bbsspider (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
LOG_LEVEL = 'INFO'
RETRY_ENABLED = False
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'bbsspider.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'bbsspider.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'bbsspider.pipelines.SomePipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
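For reference, a minimal sketch of how IMAGES_STORE above combines with ImagePipeline.file_path() to decide where each avatar lands on disk — the same images/<file name> path the spiders later read the thumbnails back from. The avatar URL below is purely hypothetical:

```python
# Hypothetical avatar URL, for illustration only
url = 'http://bbs.byr.cn/img/face/8/example.png'

# ImagePipeline.file_path() keeps just the last path segment...
file_name = url.split('/')[-1]        # 'example.png'

# ...and ImagesPipeline prefixes it with IMAGES_STORE from settings.py
local_path = 'images/' + file_name    # 'images/example.png'
print(local_path)
```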
/bbsspider/spiders/bbstopten.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8

from bbsspider.items import ArtItem
import bbsspider.spiders.const as const
import requests
import re
from collections import defaultdict
from scrapy import signals
# Legacy signal hookup; scrapy.xlib was removed in Scrapy 2.0, where
# crawler.signals.connect() (via from_crawler) should be used instead.
from scrapy.xlib.pydispatch import dispatcher
import scrapy
import PIL.Image as Image
import math
import os


class TopTen(scrapy.Spider):
    name = 'topten'
    start_urls = const.ALLOW_DOMAINS
    headers = const.HEADERS
    all_articles = defaultdict(list)  # thread URL -> avatar URLs, in reply order

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print(self.all_articles)
        with open('all_articles_users.txt', 'w') as f:
            f.write(str(dict(self.all_articles)))
        if not os.path.exists('./headImages'):
            os.mkdir('./headImages')
        for art, users in self.all_articles.items():
            users_len = len(users)
            rows = 133  # grid pitch: 125 px thumbnail plus spacing
            toImage = Image.new('RGBA', (133 * 10, 133 * math.ceil(users_len / 10)), (255, 255, 255))
            ys = int(math.ceil(users_len / 10))
            for y in range(0, ys):
                for x in range(0, 10):
                    if 10 * y + x >= users_len:
                        break
                    fname = "images/%s" % users[10 * y + x].split('/')[-1]
                    try:
                        img = Image.open(fname)
                        img.thumbnail((125, 125))
                        toImage.paste(img, (x * rows, y * rows))
                    except OSError:
                        print("missing image file %s" % fname)
            toImage.save('./headImages/%s.png' % art.split('/')[-1])
            print(art + '\tsave image success!')

    def parse(self, response):
        cur_page_url = response.url
        avatarUrls = response.css('div.b-content table.article div.a-u-img ::attr(src)').extract()
        motherurl = cur_page_url.split('?')[0]
        if const.removeDuplicate:
            for avatarUrl in avatarUrls:
                if avatarUrl not in self.all_articles[motherurl]:
                    self.all_articles[motherurl].append(avatarUrl)
        else:
            self.all_articles[motherurl].extend(avatarUrls)
        item = ArtItem()
        item['url'] = motherurl
        item['avatarUrls'] = avatarUrls
        print(item)
        yield item
        sel_page = response.css('div.t-pre ul.pagination li ol')
        cur_page_num = sel_page.css('li.page-select > a::text').extract()
        page_list_num = sel_page.css('li.page-normal > a::text').extract()
        page_list_url = sel_page.css('li.page-normal > a::attr(href)').extract()
        print('cur page is %s' % cur_page_num[0])
        if len(page_list_url) > len(page_list_num):
            pre_page_num = '%d' % (int(cur_page_num[0]) - 1)
            page_list_num.insert(0, pre_page_num)
        for idx, num in enumerate(page_list_num):
            print('%d,%s,%s' % (idx, page_list_num[idx], page_list_url[idx]))
            if page_list_num[idx] == '>>':
                next_url = response.urljoin(page_list_url[idx])
                print('crawl next article page [%s]' % next_url)
                yield scrapy.Request(next_url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                     callback=self.parse)

    def start_requests(self):
        # Log in first; the cookiejar meta key keeps the session for all later requests
        return [scrapy.FormRequest("http://bbs.byr.cn/user/ajax_login.json",
                                   formdata=const.LOGIN_DATA,
                                   meta={'cookiejar': 1},
                                   headers=self.headers,
                                   callback=self.get_topten)]

    def get_topten(self, response):
        topten_url = 'https://bbs.byr.cn/rss/topten'
        top_req = requests.get(topten_url)
        top_req.encoding = 'gb2312'
        # Pull every <link> element out of the RSS feed: the channel link plus the ten article links
        pattern = re.compile('<link>(.*)</link>')
        start_urls = pattern.findall(top_req.text)
        print(start_urls)
        if len(start_urls) != 11:
            return
        for url in start_urls[1:]:
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                 callback=self.parse)
--------------------------------------------------------------------------------
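For reference, a standalone sketch of the same tiling arithmetic spider_closed uses to lay out a collage — ten avatars per row, a 133 px grid pitch, 125 px thumbnails — so it can be tried outside of Scrapy. The function name and the example paths are made up for illustration:

```python
import math
import PIL.Image as Image


def build_collage(image_paths, out_path, per_row=10, pitch=133, thumb=125):
    """Paste the images onto a white canvas, per_row per line, pitch pixels apart."""
    rows = math.ceil(len(image_paths) / per_row)
    canvas = Image.new('RGBA', (pitch * per_row, pitch * rows), (255, 255, 255))
    for idx, path in enumerate(image_paths):
        y, x = divmod(idx, per_row)        # row and column of this avatar
        try:
            img = Image.open(path)
            img.thumbnail((thumb, thumb))  # shrink in place, keeping the aspect ratio
            canvas.paste(img, (x * pitch, y * pitch))
        except OSError:
            print('missing image file %s' % path)
    canvas.save(out_path)


# e.g. build_collage(['images/a.png', 'images/b.png'], 'headImages/demo.png')
```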
/bbsspider/spiders/bbsarticle.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8

from bbsspider.items import ArtItem
import bbsspider.spiders.const as const
from collections import defaultdict
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
import scrapy
import PIL.Image as Image
import math
import os
import time  # used to timestamp the user-id output file


class Article(scrapy.Spider):
    name = 'article'
    start_urls = const.ALLOW_DOMAINS
    article_urls = ['https://bbs.byr.cn/#!article/Talking/6053294']
    headers = const.HEADERS
    all_articles = defaultdict(list)  # thread URL -> avatar URLs, in reply order
    all_users = list()                # user IDs, in reply (floor) order

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print(self.all_articles)
        with open('all_articles_users.txt', 'w') as f:
            f.write(str(dict(self.all_articles)))
        if not os.path.exists('./headImages'):
            os.mkdir('./headImages')
        for art, users in self.all_articles.items():
            users_len = len(users)
            rows = 133  # grid pitch: 125 px thumbnail plus spacing
            toImage = Image.new('RGBA', (133 * 10, 133 * math.ceil(users_len / 10)), (255, 255, 255))
            ys = int(math.ceil(users_len / 10))
            for y in range(0, ys):
                for x in range(0, 10):
                    if 10 * y + x >= users_len:
                        break
                    fname = "images/%s" % users[10 * y + x].split('/')[-1]
                    try:
                        img = Image.open(fname)
                        img.thumbnail((125, 125))
                        toImage.paste(img, (x * rows, y * rows))
                    except OSError:
                        print("missing image file %s" % fname)
            toImage.save('./headImages/%s.png' % art.split('/')[-1])
            print(art + '\tsave image success!')
        print(self.all_users)
        crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        filename = 'all_users_ids_time=' + crawl_time + '.txt'
        with open(filename, 'w') as f:
            f.write("本爬虫的爬取时间是:" + crawl_time + '\n')
            for i, user in enumerate(self.all_users):
                f.write(str(i) + ',' + user + '\n')
        print(filename + '\tsave user ids success!')
    def parse(self, response):
        cur_page_url = response.url
        avatarUrls = response.css('div.b-content table.article div.a-u-img ::attr(src)').extract()
        userName = response.css('div.b-content table.article span.a-u-name a::text').extract()
        motherurl = cur_page_url.split('?')[0]
        if const.removeDuplicate:
            for avatarUrl in avatarUrls:
                if avatarUrl not in self.all_articles[motherurl]:
                    self.all_articles[motherurl].append(avatarUrl)
        else:
            self.all_articles[motherurl].extend(avatarUrls)
        self.all_users.extend(userName)
        item = ArtItem()
        item['url'] = motherurl
        item['avatarUrls'] = avatarUrls
        item['userName'] = userName
        print(item)
        yield item
        sel_page = response.css('div.t-pre ul.pagination li ol')
        cur_page_num = sel_page.css('li.page-select > a::text').extract()
        page_list_num = sel_page.css('li.page-normal > a::text').extract()
        page_list_url = sel_page.css('li.page-normal > a::attr(href)').extract()
        print('cur page is %s' % cur_page_num[0])
        if len(page_list_url) > len(page_list_num):
            pre_page_num = '%d' % (int(cur_page_num[0]) - 1)
            page_list_num.insert(0, pre_page_num)
        for idx, num in enumerate(page_list_num):
            print('%d,%s,%s' % (idx, page_list_num[idx], page_list_url[idx]))
            if page_list_num[idx] == '>>':
                next_url = response.urljoin(page_list_url[idx])
                print('crawl next article page [%s]' % next_url)
                yield scrapy.Request(next_url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                     callback=self.parse)

    def start_requests(self):
        # Log in first; the cookiejar meta key keeps the session for all later requests
        return [scrapy.FormRequest("http://bbs.byr.cn/user/ajax_login.json",
                                   formdata=const.LOGIN_DATA,
                                   meta={'cookiejar': 1},
                                   headers=self.headers,
                                   callback=self.get_articles)]

    def get_articles(self, response):
        # Drop the '#!' fragment marker so the URLs point at the server-rendered pages
        self.article_urls = [url.replace('#!', '') for url in self.article_urls]
        for url in self.article_urls:
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                 callback=self.parse)
--------------------------------------------------------------------------------