├── bbsspider
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── const.py
│   │   ├── bbstopten.py
│   │   └── bbsarticle.py
│   ├── items.py
│   ├── pipelines.py
│   └── settings.py
├── examples
│   ├── 1858656.png
│   └── 3208312.png
├── scrapy.cfg
└── README.md

/bbsspider/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/examples/1858656.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuemingzhu/BYR-HeadImgs/HEAD/examples/1858656.png
--------------------------------------------------------------------------------
/examples/3208312.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuemingzhu/BYR-HeadImgs/HEAD/examples/3208312.png
--------------------------------------------------------------------------------
/bbsspider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = bbsspider.settings

[deploy]
#url = http://localhost:6800/
project = bbsspider
--------------------------------------------------------------------------------
/bbsspider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ArtItem(scrapy.Item):
    url = scrapy.Field()          # thread URL with the query string stripped
    avatarUrls = scrapy.Field()   # avatar image URLs found on one page of the thread
    userName = scrapy.Field()     # user IDs of the repliers on that page
--------------------------------------------------------------------------------
/bbsspider/spiders/const.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
URL = 'http://bbs.byr.cn'
ALLOW_DOMAINS = ['bbs.byr.cn']
HEADERS = {'User-Agent': 'Mozilla/5.0', 'Host': 'bbs.byr.cn', 'X-Requested-With': 'XMLHttpRequest',
           'Connection': 'keep-alive'}
# Replace id and passwd with your own username and password; nothing else needs to change.
# Remember to blank out the password before committing. =. =
LOGIN_DATA = {'id': 'fuxuemingzhu', 'passwd': '*********', 'mode': '0', 'CookieDate': '0'}
# Whether to remove duplicate avatars from the collage; set to False to keep every occurrence
removeDuplicate = True
--------------------------------------------------------------------------------
/bbsspider/pipelines.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImagePipeline(ImagesPipeline):  # subclass ImagesPipeline to download the avatars

    def get_media_requests(self, item, info):  # override ImagesPipeline.get_media_requests
        '''
        As described in the media pipeline workflow, the pipeline receives the
        file URLs carried by the item and downloads them. To do that, we
        override get_media_requests() and return one Request per image URL.
        '''
        for image_url in item['avatarUrls']:
            yield scrapy.Request(image_url)

    def file_path(self, request, response=None, info=None):
        # Store each avatar under its original file name (the last URL path segment)
        image_guid = request.url.split('/')[-1]
        return '%s' % image_guid
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BYR BBS user avatar collage

Build, with a single command, a collage of the avatars of everyone who replied in a BYR BBS top-ten post or in any thread you choose.

Deduplicating user avatars when generating the collage is now supported.

A collage without user deduplication is shown below; each row of avatars corresponds to one page of the thread:
![collage without deduplication][1]
The image above is built from all the user avatars in this thread: 你有一个研一小哥哥,请查收~ https://bbs.byr.cn/#!article/Friends/1858656

A collage with user deduplication is shown below; the order still follows the order of the replies:
![collage with deduplication][2]
The image above comes from: 十大合影~ https://bbs.byr.cn/#!article/Picture/3208312

To support the BYR "koi" lottery event, the crawler can also collect the IDs of every user who replied to an article. It is used the same way as crawling an article; when the run finishes, a file named ``all_users_ids_time=<crawl time>.txt`` is written to the project root, mapping floor numbers to user IDs. A sample:

```txt
本爬虫的爬取时间是:2018-10-13 10:02:18
0,wu111137
1,z574690129
2,liangkeng
3,troubadour
4,bloodsmail
5,yqyqyqyqyqy
6,Alison
7,zc199102
8,XingXudong
......
```

# Requirements

Please install the following yourself:

1. python3

2. scrapy

3. PIL

# Usage

## Configure your username and password

Fill in your BYR BBS **username and password** in bbsspider/spiders/const.py.

## Crawl the avatar collage and all reply IDs for given articles

1. Set article_urls in bbsspider/spiders/bbsarticle.py to the list of article URLs you want to process (see the snippet after this section).

2. From the project root (the directory containing scrapy.cfg), run:


```bash
scrapy crawl article
```

This generates the avatar collage for each listed thread and saves all reply IDs in the project root.
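For reference, ``article_urls`` is just a plain Python list on the ``Article`` spider class; a minimal sketch of the edit (the URL below is the placeholder the repository ships with — substitute your own thread links):

```python
# bbsspider/spiders/bbsarticle.py, inside the Article spider class
article_urls = [
    'https://bbs.byr.cn/#!article/Talking/6053294',  # placeholder; replace with your own thread URLs
]
```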
## Crawl the top-ten posts

From the project root (the directory containing scrapy.cfg), run:

```bash
scrapy crawl topten
```

This automatically crawls the ten current top-ten articles and generates an avatar collage for each.

## Output and avatar deduplication

After the run, the collages are written to the headImages directory, each named after the number at the end of its thread URL.

Note: the removeDuplicate variable in bbsspider/spiders/const.py controls whether user avatars are deduplicated: True removes duplicates, False does not; by default no deduplication is done. The crawled floor-to-user-ID mapping is never deduplicated.

# Disclaimer

1. This is a hobby project and the code quality is not high. Feel free to use it; discussion is welcome.

2. All of the code runs locally on your own machine; I do not collect forum passwords.

3. Thanks to the earlier forum crawler this builds on: https://github.com/buptbill220/bbsspider


[1]: https://raw.githubusercontent.com/fuxuemingzhu/BYR-HeadImgs/master/examples/1858656.png
[2]: https://github.com/fuxuemingzhu/BYR-HeadImgs/blob/592ae4a7683bbfcc200d5e03410d07c207624795/examples/3208312.png?raw=true
--------------------------------------------------------------------------------
/bbsspider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for bbsspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bbsspider'

SPIDER_MODULES = ['bbsspider.spiders']
NEWSPIDER_MODULE = 'bbsspider.spiders'
# Image download: store avatars under images/ and run items through the custom ImagePipeline
IMAGES_STORE = 'images/'
ITEM_PIPELINES = {
    'bbsspider.pipelines.ImagePipeline': 1,
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'bbsspider (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
LOG_LEVEL = 'INFO'
RETRY_ENABLED = False
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'bbsspider.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'bbsspider.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'bbsspider.pipelines.SomePipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
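For reference, a minimal sketch of how IMAGES_STORE above combines with ImagePipeline.file_path() to decide where each avatar lands on disk — the same images/<file name> path the spiders later read the thumbnails back from. The avatar URL below is purely hypothetical:

```python
# Hypothetical avatar URL, for illustration only
url = 'http://bbs.byr.cn/img/face/8/example.png'

# ImagePipeline.file_path() keeps just the last path segment...
file_name = url.split('/')[-1]        # 'example.png'

# ...and ImagesPipeline prefixes it with IMAGES_STORE from settings.py
local_path = 'images/' + file_name    # 'images/example.png'
print(local_path)
```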
/bbsspider/spiders/bbstopten.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8

from bbsspider.items import ArtItem
import bbsspider.spiders.const as const
import requests
import re
from collections import defaultdict
from scrapy import signals
# Legacy signal hookup; scrapy.xlib was removed in Scrapy 2.0, where
# crawler.signals.connect() (via from_crawler) should be used instead.
from scrapy.xlib.pydispatch import dispatcher
import scrapy
import PIL.Image as Image
import math
import os


class TopTen(scrapy.Spider):
    name = 'topten'
    start_urls = const.ALLOW_DOMAINS
    headers = const.HEADERS
    all_articles = defaultdict(list)  # thread URL -> avatar URLs, in reply order

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print(self.all_articles)
        with open('all_articles_users.txt', 'w') as f:
            f.write(str(dict(self.all_articles)))
        if not os.path.exists('./headImages'):
            os.mkdir('./headImages')
        for art, users in self.all_articles.items():
            users_len = len(users)
            rows = 133  # grid pitch: 125 px thumbnail plus spacing
            toImage = Image.new('RGBA', (133 * 10, 133 * math.ceil(users_len / 10)), (255, 255, 255))
            ys = int(math.ceil(users_len / 10))
            for y in range(0, ys):
                for x in range(0, 10):
                    if 10 * y + x >= users_len:
                        break
                    fname = "images/%s" % users[10 * y + x].split('/')[-1]
                    try:
                        img = Image.open(fname)
                        img.thumbnail((125, 125))
                        toImage.paste(img, (x * rows, y * rows))
                    except OSError:
                        print("missing image file %s" % fname)
            toImage.save('./headImages/%s.png' % art.split('/')[-1])
            print(art + '\tsave image success!')

    def parse(self, response):
        cur_page_url = response.url
        avatarUrls = response.css('div.b-content table.article div.a-u-img ::attr(src)').extract()
        motherurl = cur_page_url.split('?')[0]
        if const.removeDuplicate:
            for avatarUrl in avatarUrls:
                if avatarUrl not in self.all_articles[motherurl]:
                    self.all_articles[motherurl].append(avatarUrl)
        else:
            self.all_articles[motherurl].extend(avatarUrls)
        item = ArtItem()
        item['url'] = motherurl
        item['avatarUrls'] = avatarUrls
        print(item)
        yield item
        sel_page = response.css('div.t-pre ul.pagination li ol')
        cur_page_num = sel_page.css('li.page-select > a::text').extract()
        page_list_num = sel_page.css('li.page-normal > a::text').extract()
        page_list_url = sel_page.css('li.page-normal > a::attr(href)').extract()
        print('cur page is %s' % cur_page_num[0])
        if len(page_list_url) > len(page_list_num):
            pre_page_num = '%d' % (int(cur_page_num[0]) - 1)
            page_list_num.insert(0, pre_page_num)
        for idx, num in enumerate(page_list_num):
            print('%d,%s,%s' % (idx, page_list_num[idx], page_list_url[idx]))
            if page_list_num[idx] == '>>':
                next_url = response.urljoin(page_list_url[idx])
                print('crawl next article page [%s]' % next_url)
                yield scrapy.Request(next_url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                     callback=self.parse)

    def start_requests(self):
        # Log in first; the cookiejar meta key keeps the session for all later requests
        return [scrapy.FormRequest("http://bbs.byr.cn/user/ajax_login.json",
                                   formdata=const.LOGIN_DATA,
                                   meta={'cookiejar': 1},
                                   headers=self.headers,
                                   callback=self.get_topten)]

    def get_topten(self, response):
        topten_url = 'https://bbs.byr.cn/rss/topten'
        top_req = requests.get(topten_url)
        top_req.encoding = 'gb2312'
        # Pull every <link> element out of the RSS feed: the channel link plus the ten article links
        pattern = re.compile('<link>(.*)</link>')
        start_urls = pattern.findall(top_req.text)
        print(start_urls)
        if len(start_urls) != 11:
            return
        for url in start_urls[1:]:
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                 callback=self.parse)
--------------------------------------------------------------------------------
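For reference, a standalone sketch of the same tiling arithmetic spider_closed uses to lay out a collage — ten avatars per row, a 133 px grid pitch, 125 px thumbnails — so it can be tried outside of Scrapy. The function name and the example paths are made up for illustration:

```python
import math
import PIL.Image as Image


def build_collage(image_paths, out_path, per_row=10, pitch=133, thumb=125):
    """Paste the images onto a white canvas, per_row per line, pitch pixels apart."""
    rows = math.ceil(len(image_paths) / per_row)
    canvas = Image.new('RGBA', (pitch * per_row, pitch * rows), (255, 255, 255))
    for idx, path in enumerate(image_paths):
        y, x = divmod(idx, per_row)        # row and column of this avatar
        try:
            img = Image.open(path)
            img.thumbnail((thumb, thumb))  # shrink in place, keeping the aspect ratio
            canvas.paste(img, (x * pitch, y * pitch))
        except OSError:
            print('missing image file %s' % path)
    canvas.save(out_path)


# e.g. build_collage(['images/a.png', 'images/b.png'], 'headImages/demo.png')
```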
/bbsspider/spiders/bbsarticle.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8

from bbsspider.items import ArtItem
import bbsspider.spiders.const as const
from collections import defaultdict
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
import scrapy
import PIL.Image as Image
import math
import os
import time  # used to timestamp the user-id output file


class Article(scrapy.Spider):
    name = 'article'
    start_urls = const.ALLOW_DOMAINS
    article_urls = ['https://bbs.byr.cn/#!article/Talking/6053294']
    headers = const.HEADERS
    all_articles = defaultdict(list)  # thread URL -> avatar URLs, in reply order
    all_users = list()                # user IDs, in reply (floor) order

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print(self.all_articles)
        with open('all_articles_users.txt', 'w') as f:
            f.write(str(dict(self.all_articles)))
        if not os.path.exists('./headImages'):
            os.mkdir('./headImages')
        for art, users in self.all_articles.items():
            users_len = len(users)
            rows = 133  # grid pitch: 125 px thumbnail plus spacing
            toImage = Image.new('RGBA', (133 * 10, 133 * math.ceil(users_len / 10)), (255, 255, 255))
            ys = int(math.ceil(users_len / 10))
            for y in range(0, ys):
                for x in range(0, 10):
                    if 10 * y + x >= users_len:
                        break
                    fname = "images/%s" % users[10 * y + x].split('/')[-1]
                    try:
                        img = Image.open(fname)
                        img.thumbnail((125, 125))
                        toImage.paste(img, (x * rows, y * rows))
                    except OSError:
                        print("missing image file %s" % fname)
            toImage.save('./headImages/%s.png' % art.split('/')[-1])
            print(art + '\tsave image success!')
        print(self.all_users)
        crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        filename = 'all_users_ids_time=' + crawl_time + '.txt'
        with open(filename, 'w') as f:
            f.write("本爬虫的爬取时间是:" + crawl_time + '\n')
            for i, user in enumerate(self.all_users):
                f.write(str(i) + ',' + user + '\n')
        print(filename + '\tsave user ids success!')
    def parse(self, response):
        cur_page_url = response.url
        avatarUrls = response.css('div.b-content table.article div.a-u-img ::attr(src)').extract()
        userName = response.css('div.b-content table.article span.a-u-name a::text').extract()
        motherurl = cur_page_url.split('?')[0]
        if const.removeDuplicate:
            for avatarUrl in avatarUrls:
                if avatarUrl not in self.all_articles[motherurl]:
                    self.all_articles[motherurl].append(avatarUrl)
        else:
            self.all_articles[motherurl].extend(avatarUrls)
        self.all_users.extend(userName)
        item = ArtItem()
        item['url'] = motherurl
        item['avatarUrls'] = avatarUrls
        item['userName'] = userName
        print(item)
        yield item
        sel_page = response.css('div.t-pre ul.pagination li ol')
        cur_page_num = sel_page.css('li.page-select > a::text').extract()
        page_list_num = sel_page.css('li.page-normal > a::text').extract()
        page_list_url = sel_page.css('li.page-normal > a::attr(href)').extract()
        print('cur page is %s' % cur_page_num[0])
        if len(page_list_url) > len(page_list_num):
            pre_page_num = '%d' % (int(cur_page_num[0]) - 1)
            page_list_num.insert(0, pre_page_num)
        for idx, num in enumerate(page_list_num):
            print('%d,%s,%s' % (idx, page_list_num[idx], page_list_url[idx]))
            if page_list_num[idx] == '>>':
                next_url = response.urljoin(page_list_url[idx])
                print('crawl next article page [%s]' % next_url)
                yield scrapy.Request(next_url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                     callback=self.parse)

    def start_requests(self):
        # Log in first; the cookiejar meta key keeps the session for all later requests
        return [scrapy.FormRequest("http://bbs.byr.cn/user/ajax_login.json",
                                   formdata=const.LOGIN_DATA,
                                   meta={'cookiejar': 1},
                                   headers=self.headers,
                                   callback=self.get_articles)]

    def get_articles(self, response):
        # Drop the '#!' fragment marker so the URLs point at the server-rendered pages
        self.article_urls = [url.replace('#!', '') for url in self.article_urls]
        for url in self.article_urls:
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers,
                                 callback=self.parse)
--------------------------------------------------------------------------------