├── .gitignore
├── SpiderUtil.py
├── config.py
├── TiebaSpider.py
├── post.py
├── README.md
├── spider.py
├── TiebaUtil.py
└── LICENSE
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/SpiderUtil.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # spider 爬虫操作工具类
5 | # version 0.1.0
6 | # author:changwei [867597730@qq.com]
7 | # website: https://github.com/cw1997
8 | # date:2017-06-12 23:20:04
9 |
10 | # runtime env:windows 7 64bit + python 2.7.10
11 |
12 |
13 | import sys
14 | reload(sys)
15 | sys.setdefaultencoding('utf8')
16 |
17 | from TiebaSpider import *
18 | from TiebaUtil import *
19 |
class SpiderUtil(object):
    """Helper for forum metadata and for tidying scraped user details.

    The forum record is fetched once in ``__init__`` through
    ``TiebaUtil.getForum`` and kept on the instance as ``self.forum``.
    """
    # Class-level default only; every instance replaces it in __init__.
    forum = {}

    def __init__(self, forum_name):
        """Fetch and cache the forum record for ``forum_name``."""
        super(SpiderUtil, self).__init__()
        self.forum_name = forum_name
        tieba_util = TiebaUtil()
        self.forum = tieba_util.getForum(forum_name=forum_name)

    def getFidByFname(self):
        """Return the forum id (``forum.id``) from the cached forum record."""
        return self.forum['forum']['id']

    def getMemberNumByFname(self):
        """Return ``forum.member_num`` from the cached forum record."""
        return self.forum['forum']['member_num']

    def getRankNumByFname(self):
        """Return what ``TiebaSpider.fetchRankNumByForum`` scrapes for this forum.

        spider.py converts the result with ``int()`` and uses it as the
        number of members listed on the ranking pages.
        """
        tieba_spider = TiebaSpider()
        return tieba_spider.fetchRankNumByForum(self.forum_name)

    @staticmethod
    def formatDetail(detail):
        """Normalise a scraped user-detail dict in place and return it.

        When ``detail`` has a ``'birthday'`` entry such as ``u'1997年6月12日'``
        it is replaced by ``{'year': ..., 'month': ..., 'day': ...}`` (values
        stay strings). A birthday that does not split into exactly three
        parts is replaced by an empty dict. Other keys are left untouched.
        """
        if 'birthday' in detail:
            # Turn "Y年M月D日" into "Y-M-D" and split on the dashes.
            parts = (detail['birthday'].replace(' ', '')
                     .replace(u'年', '-')
                     .replace(u'月', '-')
                     .replace(u'日', '')
                     .split('-'))
            birthday = {}
            if len(parts) == 3:
                birthday['year'], birthday['month'], birthday['day'] = parts
            detail['birthday'] = birthday
        return detail
64 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # config 配置文件
5 | # version 0.1.0
6 | # author:changwei [867597730@qq.com]
7 | # website: https://github.com/cw1997
8 | # date:2017-06-12 23:39:45
9 |
10 | # runtime env:windows 7 64bit + python 2.7.10
11 |
config = {}

# ---------- MongoDB ---------- #
# Only MongoDB is supported for persistent storage for now; the spider accesses it through pymongo.
# Security note: if the database has no password, make sure MongoDB is not reachable from the public network.
# On a cloud host with a classic network, set firewall / inbound rules for the MongoDB port so the database cannot be accessed illegally and used to compromise the host.
# If MongoDB must be reachable from outside, set a password on the database.
# MongoDB host / IP to connect to
config['ip'] = 'localhost'
# MongoDB port
config['port'] = 27017
# MongoDB database name
config['database'] = 'tieba_birthday_spider'
# MongoDB username (leave empty when authentication is not enabled, which is the default)
config['username'] = ''
# MongoDB password (leave empty when there is no password, which is the default)
config['password'] = ''

# ---------- network ---------- #
# Timeout for requests-library HTTP calls (unit: seconds)
config['timeout'] = 6

# ---------- spider ---------- #
# Name of the forum to crawl, without the trailing "吧" (e.g. to crawl 昌维吧 enter just 昌维)
config['forum_name'] = '昌维'
# Number of worker threads; tune to your network and hardware
config['thread_num'] = 50
# Page numbers to crawl. One page holds 20 user records. range() can generate consecutive pages; pages start at 1, so the first range argument must not be below 1.
config['pages'] = range(1, 1000)

# ---------- post ---------- #
# BDUSS values of the posting accounts (usually 192 characters). Several accounts are kept in a list and used in turn to avoid captchas; replace the sample values below with your own.
# Search online for how to obtain a BDUSS.
config['bduss'] = [
    'bduss1',
    'bduss2',
    'bduss3',
    'bduss4',
    'bduss5'
]
# Pause between two posts (unit: seconds). The longer the pause, the less likely a captcha appears.
# If many members of the forum have their birthday on the same day (e.g. a forum with millions of followers,
# where by the pigeonhole principle over a thousand people may share a birthday), shorten the pause and add more posting accounts so posts go out quickly
# while the interval between two posts from the same account stays as long as possible.
config['wait_time'] = 60
# Thread id: for a thread URL such as https://tieba.baidu.com/p/4296390791 the thread id is 4296390791
config['post_thread_id'] = '4296390791'
# Forum whose members receive the wishes; e.g. to greet every member of 昌维吧 whose birthday is today, enter "昌维"
config['post_forum_name'] = '昌维'
# Wish phrases; one is picked at random and appended to the end of each reply
config['wish'] = [
    '新的一岁祝你顺顺利利,每天开心。',
    '愿这特殊的日子里,你的每时每刻都充满欢乐。',
    '愿你在新的一岁,一切的快乐,一切的幸福,一切的温馨,一切的好运永远围绕在你身边。',
    '因为你的降临,这一天成了一个美丽的日子,从此世界,便多了一抹诱人的色彩。',
    '岁月总是愈来愈短,生日总是愈来愈快,友情总是愈来愈浓,我的祝福也就愈来愈深,愿你的每一天都如画一样的美丽。'
]
69 |
70 |
--------------------------------------------------------------------------------
/TiebaSpider.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Tieba-Spider 贴吧信息爬虫类
5 | # version 0.1.0
6 | # author:changwei [867597730@qq.com]
7 | # website: https://github.com/cw1997
8 | # 2017-06-12 06:05:11
9 |
10 |
11 | import sys
12 | reload(sys)
13 | sys.setdefaultencoding('utf8')
14 |
15 | # import bs4
16 | import requests
17 | from bs4 import BeautifulSoup
18 |
19 | from config import *
20 |
21 | # from tieba import TiebaUtil
22 |
class TiebaSpider(object):
    """Scraper for Tieba ranking pages and Baidu user profile pages.

    Pages are downloaded with ``requests`` (timeout taken from
    ``config['timeout']``) and parsed with BeautifulSoup using the parser
    named by ``html_parser``.
    """
    html_parser = "html.parser"  # or html5lib

    # Label shown on the profile page -> key used in the detail dict.
    _PROFILE_KEYS = {
        u"个人简介": 'introduce',
        u"性别": 'gender',
        u"出生地": 'homeplace',
        u"血型": 'blood_type',
        u"生日": 'birthday',
        u"居住地": 'address',
    }

    def __init__(self):
        pass

    def fetchMembersByForum(self, forum_name, page=1):
        """Return the member names listed on one ranking page of a forum.

        Returns an empty list when the request fails, the same "nothing
        fetched" signal ``fetchDetailByUsername`` uses, so a network error
        does not propagate into (and kill) a worker thread.
        """
        url = 'http://tieba.baidu.com/f/like/furank'
        params = {'kw': str(forum_name), 'pn': str(page), 'ie': 'utf-8'}
        try:
            r = requests.get(url, params, timeout=config['timeout'])
        except Exception:
            # Deliberate best effort: callers treat an empty result as a
            # failed fetch.
            return []
        soup = BeautifulSoup(r.content, self.html_parser)
        members_soup = soup.find_all(
            "a", {"class": ["drl_item_name_top", "drl_item_name_nor"]})
        return [m.get_text() for m in members_soup]

    def fetchDetailByUsername(self, username):
        """Return the profile fields scraped for ``username``.

        The result always contains ``'username'`` on success, plus whichever
        of ``introduce``, ``gender``, ``homeplace``, ``blood_type``,
        ``birthday`` and ``address`` appear on the page. An empty dict is
        returned when the request fails.
        """
        detail = {}
        url = 'https://www.baidu.com/p/%s/detail?ie=utf-8' % username
        try:
            r = requests.get(url, timeout=config['timeout'])
        except Exception:
            # Deliberate best effort: an empty dict tells the caller to retry.
            return detail
        detail['username'] = username
        soup = BeautifulSoup(r.content, self.html_parser)
        attr_soup = soup.find_all("span", {"class": "profile-attr"})
        cnt_soup = soup.find_all("span", {"class": "profile-cnt"})
        # Labels and values are paired by position; zip stops at the shorter
        # list (the original comment notes there are fewer values than labels).
        for attr_tag, cnt_tag in zip(attr_soup, cnt_soup):
            key = self._PROFILE_KEYS.get(attr_tag.get_text())
            if key is not None:
                detail[key] = cnt_tag.get_text()
        return detail

    def fetchRankNumByForum(self, forum_name):
        """Return the text of the first ``span.drl_info_txt_gray`` on page 1
        of the forum ranking.

        spider.py converts this value with ``int()`` and uses it as the
        member total.

        Raises:
            ValueError: if the page does not contain that span.
        """
        url = 'http://tieba.baidu.com/f/like/furank'
        params = {'kw': str(forum_name), 'pn': 1, 'ie': 'utf-8'}
        r = requests.get(url, params, timeout=config['timeout'])
        soup = BeautifulSoup(r.content, self.html_parser)
        num_tag = soup.find('span', {'class': 'drl_info_txt_gray'})
        if num_tag is None:
            raise ValueError(
                'span.drl_info_txt_gray not found on the ranking page of forum %r'
                % (forum_name,))
        return num_tag.get_text()
92 |
--------------------------------------------------------------------------------
/post.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # post 祝福贴发送
5 | # version 0.1.0
6 | # author:changwei [867597730@qq.com]
7 | # website: https://github.com/cw1997
8 | # date:2017-06-13 00:07:18
9 |
10 | # runtime env:windows 7 64bit + python 2.7.10
11 |
12 | import time
13 | import random
14 | import sys
15 | reload(sys)
16 | sys.setdefaultencoding('utf8')
17 |
18 | import pymongo
19 |
20 | from TiebaUtil import *
21 | from config import *
22 |
23 |
def main():
    """Post a birthday reply for every stored member whose birthday is today.

    Steps: check that at least one configured BDUSS can log in, look up the
    forum id stored by spider.py, load today's birthday members from MongoDB
    and post one reply per member, rotating through the BDUSS accounts and
    sleeping ``config['wait_time']`` seconds between posts.

    Exits with status 1 when no account can log in or when the forum has no
    record in the ``forum`` collection.
    """
    tieba_util = TiebaUtil()
    bduss_list = config['bduss']
    bduss_list_num = len(bduss_list)
    # Stop right away when none of the configured BDUSS values is valid.
    if not any(tieba_util.login(bduss) for bduss in bduss_list):
        sys.exit(1)

    client = pymongo.MongoClient(config['ip'], config['port'])
    db = client.get_database(config['database'])
    mongodb_username = config['username']
    mongodb_password = config['password']
    if mongodb_username != '':
        db.authenticate(mongodb_username, mongodb_password)
    forum_collection = db['forum']
    forum_result = forum_collection.find_one({'forum_name': config['post_forum_name']})
    if forum_result is None:
        # The forum record is written by spider.py; without it there is no
        # member collection to read.
        sys.exit('forum not found in database, run spider.py first')
    forum_id = forum_result['forum_id']

    # Read the date once so year/month/day cannot straddle midnight.
    now = time.localtime()
    y = now.tm_year
    # Stored month/day strings have no leading zero ("6", not "06").
    m = str(now.tm_mon)
    d = str(now.tm_mday)

    member_collection = db[forum_id]
    find = {'birthday.month': m, 'birthday.day': d}
    # Load everything up front: the loop sleeps between posts, and a live
    # cursor left idle for that long may be timed out by the server.
    member_list = list(member_collection.find(find))
    member_count = len(member_list)

    post_thread_id = config['post_thread_id']

    # Running index used to rotate through bduss_list.
    i = 0
    wait_time = config['wait_time']
    for member in member_list:
        content = buildContent(member, member_count, y)
        # Try each account at most once, starting from the next one in turn;
        # give up instead of spinning forever when all of them are rejected.
        for _ in range(bduss_list_num):
            bduss = bduss_list[i % bduss_list_num]
            i = i + 1
            if tieba_util.login(bduss):
                break
        else:
            sys.exit('no BDUSS is able to log in any more')
        tieba_util.addReply(post_thread_id, content)
        time.sleep(wait_time)
79 |
80 |
def buildContent(member, member_count, now_year):
    """Build the reply text for one member; edit to customise the template.

    Args:
        member: stored member document; ``username`` and ``birthday.year``
            are required, ``gender`` is optional.
        member_count: number of members whose birthday is today.
        now_year: current year as an int, used to compute the age.

    Returns:
        The reply text, ending with a wish picked at random from
        ``config['wish']`` (nothing is appended when that list is empty).
    """
    # A profile without a gender entry falls back to the neutral wording
    # instead of raising KeyError.
    member_gender = member.get(u'gender')
    member_username = member[u'username']
    member_birthday_year = int(member[u'birthday'][u'year'])
    if member_gender == u'男':
        gender = '帅帅哒的汉纸'
    elif member_gender == u'女':
        gender = '美美哒的妹纸'
    else:
        gender = '吧友'
    wishes = config['wish']
    wish = random.choice(wishes) if wishes else ''
    content = '祝@%s %s生日快乐哈!今天是%s,也是你%d岁的生日,在%s吧内共有%s位吧友和你同样幸运的降临在了这个神奇的日子' \
              ',让我们一起祝福他们破壳日快乐!!!%s' % \
              (member_username, gender, time.strftime('%Y年%m月%d日'),
               now_year - member_birthday_year, config['post_forum_name'],
               member_count, wish)
    return content
100 |
101 |
# Run the posting job only when this file is executed as a script.
if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tieba-Birthday-Spider
2 | 百度贴吧生日爬虫,可抓取贴吧内吧友生日,并且在对应日期自动发送祝福
3 |
4 | # 更新日志
5 | - Version 0.1.0 (2017-06-13 16:00:30) 首次提交代码
6 |
7 | # 执行环境
8 | Python 2.7 64bit + MongoDB (请确保环境为64位,防止爬虫中的queue容量过大导致在32位环境下内存不足的异常发生)
9 |
10 | # 项目依赖包
11 | - pymongo
12 | - beautifulsoup4(代码中通过 `from bs4 import BeautifulSoup` 导入)
13 | - requests
14 |
15 | # 使用方法
16 |
17 | 1. 使用pip或者其他方式正确安装好上述项目依赖包
18 | 2. 启动MongoDB
19 | 3. 配置config.py中各项参数
20 | 4. 启动spider.py进行生日等数据抓取
21 | 5. 运行post.py测试是否能正常发送生日祝福贴
22 | 6. 配置cron规则,让post.py能够每天定时运行,并且保证MongoDB服务一直保持开启状态
23 |
24 | # 文件说明
25 |
26 | | 文件名称 | 文件说明 | 备注 |
27 | | -------------- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
28 | | config.py | 配置信息 | 内部附有注释,请正确配置。 如果有任何问题或者认为注释有描述不清需要改进的地方欢迎提issue与我讨论。为了防止频繁发帖导致您的百度帐号被误判为SPAM而封号,请尽可能注册几个小号,并且升级到合适的等级以防出现验证码。后期我们会考虑增加接入打码平台。 |
29 | | spider.py | 爬虫主文件 | 在config.py中正确配置好相关参数后,先启动MongoDB服务,然后可直接执行该文件 ,抓取的信息将直接存储在MongoDB中。 |
30 | | post.py        | 定时发帖主文件                   | 执行该文件将会按照配置文件中设置好的参数,将指定贴吧内当天过生日的吧友信息提取出来,并且向指定帖子中发送生日祝福。如果需要定时发送,请将该文件加入cron规则,例如crontab规则 `0 0 * * *` 表示在每日0点0分自动执行该脚本,并且保证MongoDB服务一直保持开启状态。如果需要自定义祝福帖内容模版,请参照main函数中的buildContent函数调用点以及相关注释自行修改post.py下的buildContent函数。 |
31 | | TiebaSpider.py | 部分贴吧信息抓取方法 | 默认使用内置的html_parser作为BeautifulSoup的html_parser,用户可以自行修改TiebaSpider类的属性成员html_parser来使用其他html_parser,比如说html5lib。该类使用requests模块进行http通信。 |
32 | | TiebaUtil.py | 部分贴吧发帖回帖以及登录检测模块 | 该类使用urllib2模块进行http通信。 |
33 | | SpiderUtil.py | 爬虫助手函数 | 用于整理抓取到的信息,或者获取一些特殊元数据。 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 | # 程序特点
43 | - 使用threading多线程库+Queue队列,性能高效
44 | - MongoDB持久化存储爬虫内容,适合抓取内容结构随时可变的场景
45 |
46 | # 成功案例
47 | - [BUG吧][1]
48 | - [武汉船舶职业技术学院吧][2]
49 |
50 | # 交流QQ群
51 | - [点击加百度贴吧闲聊群1:255258140][3]
52 | - [点击加编程开发交流群2:578165753][4]
53 |
54 |
55 | 
56 |
57 |
58 | [1]: https://tieba.baidu.com/p/3999225388
59 | [2]: https://tieba.baidu.com/p/4013743860
60 | [3]: https://jq.qq.com/?_wv=1027&k=4AO35rV
61 | [4]: https://jq.qq.com/?_wv=1027&k=4AO3qTM
62 |
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # spider 爬虫主文件
5 | # version 0.1.0
6 | # author:changwei [867597730@qq.com]
7 | # website: https://github.com/cw1997
8 | # date:2017-06-12 05:09:53
9 |
10 | # runtime env:windows 7 64bit + python 2.7.10
11 |
12 |
13 | import threading
14 | import Queue
15 | import time
16 | import sys
17 | reload(sys)
18 | sys.setdefaultencoding('utf8')
19 |
20 | import pymongo
21 |
22 | from SpiderUtil import *
23 | from config import *
24 |
25 |
class ForumWorker(threading.Thread):
    """Worker thread that turns ranking-page numbers into member names.

    Takes page numbers from ``forum_page_queue``, fetches the member names
    on that page through ``TiebaSpider`` and puts each name on
    ``member_queue``. A page that yields no members is put back on the
    queue to be retried.
    """

    def __init__(self, forum, forum_page_queue, member_queue):
        threading.Thread.__init__(self)
        self.forum_name = forum['forum_name']
        self.forum_page_queue = forum_page_queue
        self.member_queue = member_queue
        self.tieba_spider = TiebaSpider()

    def run(self):
        """Process pages forever; the thread is expected to run as a daemon."""
        while True:
            page = self.forum_page_queue.get()
            try:
                self._process(page)
            except Exception as e:
                # Keep the worker alive; a dead worker would stop draining
                # the queue.
                print('unexpected error at page : ' + str(page) + ' : ' + repr(e))
            finally:
                # Always balance get() with task_done(), otherwise
                # forum_page_queue.join() in main() never returns.
                self.forum_page_queue.task_done()

    def _process(self, page):
        """Fetch one page and enqueue its members, or re-queue the page."""
        print('fetch members at page : ' + str(page))
        try:
            members = self.tieba_spider.fetchMembersByForum(self.forum_name, page)
        except Exception as e:
            print('fetch members failed at page : ' + str(page) + ' : ' + repr(e))
            members = []
        if not members:
            # Nothing fetched: treat it as a failed fetch and retry later.
            self.forum_page_queue.put(page)
            return
        for username in members:
            self.member_queue.put(username)
            print('put the member to member_queue : ' + username)
53 |
54 |
class MemberWorker(threading.Thread):
    """Worker thread that fetches and stores the profile of each member.

    Takes usernames from ``member_queue``, fetches the profile details
    through ``TiebaSpider``, tags them with the forum and inserts the
    formatted record into ``member_collection``. A username whose fetch
    returns nothing is put back on the queue to be retried.
    """

    def __init__(self, forum, member_queue, member_collection):
        threading.Thread.__init__(self)
        self.forum = forum
        self.member_queue = member_queue
        self.member_collection = member_collection
        self.tieba_spider = TiebaSpider()

    def run(self):
        """Process usernames forever; the thread is expected to run as a daemon."""
        while True:
            username = self.member_queue.get()
            try:
                self._process(username)
            except Exception as e:
                # Log and move on (the record is skipped) so that one bad
                # user or a failed insert does not kill the worker.
                print('failed to process user : ' + username + ' : ' + repr(e))
            finally:
                # Always balance get() with task_done(), otherwise
                # member_queue.join() in main() never returns.
                self.member_queue.task_done()

    def _process(self, username):
        """Fetch one profile and store it, or re-queue the username."""
        print('fetch user infomation by username : ' + username)
        detail = self.tieba_spider.fetchDetailByUsername(username)
        if len(detail) == 0:
            # Empty dict means the fetch failed: retry later.
            self.member_queue.put(username)
            return
        print('store user infomation to the database : ' + username)
        detail['forum'] = self.forum
        self.member_collection.insert(SpiderUtil.formatDetail(detail))
80 |
81 |
def main():
    """Crawl the configured forum and store its members' profiles in MongoDB.

    Phase 1 runs ``ForumWorker`` threads that turn ranking pages into
    usernames; phase 2 runs ``MemberWorker`` threads that fetch each profile
    and insert it into the collection named after the forum id. The forum
    name/id pair is also recorded in the ``forum`` collection, which post.py
    reads.
    """
    forum_name = config['forum_name']
    # The forum id is looked up automatically; nothing to configure here.
    util = SpiderUtil(forum_name)
    forum_id = util.getFidByFname()
    thread_num = config['thread_num']
    pages = config['pages']

    client = pymongo.MongoClient(config['ip'], config['port'])
    db = client.get_database(config['database'])
    mongodb_username = config['username']
    mongodb_password = config['password']
    if mongodb_username != '':
        db.authenticate(mongodb_username, mongodb_password)
    member_collection = db[forum_id]
    forum_collection = db['forum']
    # No concurrency control here; to be strict, add a unique index:
    # createIndex({forum_name:1},{unique:true})
    forum_result = forum_collection.count({'forum_name': forum_name, 'forum_id': forum_id})
    if forum_result == 0:
        forum_collection.insert({'forum_name': forum_name, 'forum_id': forum_id})

    forum_page_queue = Queue.Queue()
    member_queue = Queue.Queue()

    member_number = int(util.getRankNumByFname())
    print('total member : %d' % member_number)
    # 20 members per page, so the last page is ceil(member_number / 20).
    # e.g. 196 members -> 10 pages, 200 members -> 10 pages (not 11).
    # Queueing a page past the end would be re-queued forever by the
    # workers, because an empty page is treated as a failed fetch.
    last_page = (member_number + 19) // 20
    for page in pages:
        page = int(page)
        if 1 <= page <= last_page:
            forum_page_queue.put(page)

    forum = {}
    forum['forum_id'] = forum_id
    forum['forum_name'] = forum_name

    pages_num = forum_page_queue.qsize()
    task_start_time = time.time()
    for i in range(thread_num):
        t = ForumWorker(forum, forum_page_queue, member_queue)
        # Daemon threads: the workers loop forever and must not keep the
        # process alive after main() returns.
        t.daemon = True
        t.start()
    forum_page_queue.join()
    print('fetch member complete in : ' + str(time.time() - task_start_time) + ' s')

    member_num = member_queue.qsize()
    task_start_time = time.time()
    for i in range(thread_num):
        t = MemberWorker(forum, member_queue, member_collection)
        t.daemon = True
        t.start()
    member_queue.join()

    print('------------------------------------------------------------')
    print('member_number : %d' % member_num)
    print('pages_number : %d' % pages_num)
    print('fetch member detail complete in : ' + str(time.time() - task_start_time) + ' s')
    print('------------------------------------------------------------')
    print('thanks for using the tieba birthday spider.')
    print('code by changwei [867597730@qq.com] https://github.com/cw1997.')
142 |
143 |
# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
146 |
147 |
148 |
--------------------------------------------------------------------------------
/TiebaUtil.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Tieba-Util 贴吧操作工具类
5 | # version 0.1.0
6 | # author:changwei [867597730@qq.com]
7 | # website: https://github.com/cw1997
8 | # 2017-06-12 04:25:16
9 |
10 | import urllib2
11 | import json
12 | import hashlib
13 | import sys
14 | reload(sys)
15 | sys.setdefaultencoding('utf8')
16 |
class TiebaUtil(object):
    """Small client for a few Baidu Tieba HTTP endpoints, built on urllib2.

    Covers the BDUSS login check, posting a reply, and thread/forum lookups
    through the signed "Android client" endpoints.
    """
    # Cookie string ('BDUSS=<token>') accepted by the last successful
    # login(); empty until then.
    bduss = ''

    _BDUSS_PREFIX = 'BDUSS='
    # Suffix appended to the concatenated fields before hashing the sign.
    _SIGN_SALT = 'tiebaclient!!!'
    _TBS_URL = 'http://tieba.baidu.com/dc/common/tbs'

    def __init__(self):
        pass

    def login(self, bduss):
        """Check a BDUSS token and remember it when it is logged in.

        Returns True (and stores the cookie in ``self.bduss``) when the tbs
        endpoint reports ``is_login == 1``, otherwise False. Network and
        JSON errors propagate to the caller.
        """
        cookie = self._BDUSS_PREFIX + str(bduss)
        json_ret = self._fetchTbsInfo(cookie)
        if json_ret['is_login'] == 1:
            self.bduss = cookie
            return True
        return False

    def addReply(self, thread_id, content):
        """Post ``content`` as a reply to ``thread_id``; return the raw response body."""
        forum = self.getForumByTid(thread_id)
        kw = str(forum['forum_name'])
        fid = str(forum['forum_id'])
        tid = str(thread_id)
        tbs = str(self._getTbs())
        content = str(content)
        # NOTE(review): the fields are joined without URL-encoding, so a
        # literal '&' or '=' in content would corrupt the form body.
        post_data = [
            'ie=utf-8',
            'kw=' + kw,
            'fid=' + fid,
            'tid=' + tid,
            'tbs=' + tbs,
            'content=' + content
        ]
        final_post_data = "&".join(post_data)
        return self._request('http://tieba.baidu.com/f/commit/post/add',
                             final_post_data, self._buildHeaders(self.bduss))

    def getThread(self, thread_id=4296390791, page=1, limit=30):
        """Return the decoded JSON for one page of a thread."""
        tid = str(thread_id)
        page = str(page)
        limit = str(limit)
        post_data = [
            'kz=' + tid,
            'pn=' + page,
            'q_type=2',
            'rn=' + limit,
            'with_floor=1'
        ]
        return self._postByAndroidClient('http://c.tieba.baidu.com/c/f/pb/page', post_data)

    def getForumByTid(self, thread_id):
        """Return ``{'forum_id': ..., 'forum_name': ...}`` for the forum a thread belongs to."""
        ret = self.getThread(thread_id=thread_id)
        forum = {}
        # NOTE(review): values are encoded as gb2312 although addReply sends
        # ie=utf-8 — confirm this is what the endpoint expects.
        forum['forum_id'] = ret['forum']['id'].encode('gb2312')
        forum['forum_name'] = ret['forum']['name'].encode('gb2312')
        return forum

    def getForum(self, forum_name='', forum_id=0, page=1, result_number=1):
        """Return the decoded JSON of the forum page for ``forum_name``.

        ``forum_id`` is accepted for compatibility but is not sent.
        """
        kw = str(forum_name)
        pn = str(page)
        rn = str(result_number)
        url = 'http://c.tieba.baidu.com/c/f/frs/page'
        post_data = [
            "kw=" + kw,
            "pn=" + pn,
            "q_type=2",
            "rn=" + rn,
            "with_group=1"
        ]
        return self._postByAndroidClient(url, post_data)

    def _getTbs(self):
        """Return the tbs value (Tieba's CSRF token) for the current cookie."""
        return self._fetchTbsInfo(self.bduss)['tbs']

    def _postByAndroidClient(self, url='', post_data=None, headers=None):
        """POST signed client-protocol fields to ``url``; return the decoded JSON.

        ``post_data`` is a list of ``'key=value'`` strings; ``headers``
        entries override the default request headers.
        """
        final_post_data = "&".join(self._buildSignedPostData(post_data))
        request_headers = self._buildHeaders(self.bduss, headers)
        content = self._request(url, final_post_data, request_headers)
        return json.loads(content)

    def _getSignByPostData(self, post_data):
        """Return the md5 hex digest of the concatenated fields plus the salt."""
        raw = "".join(post_data) + self._SIGN_SALT
        # md5 needs bytes; only text (unicode) strings need encoding.
        if not isinstance(raw, bytes):
            raw = raw.encode('utf-8')
        return hashlib.md5(raw).hexdigest()

    def _buildSignedPostData(self, post_data):
        """Return the client fields followed by ``post_data`` and the ``sign`` field."""
        fields = [
            # self.bduss already carries the 'BDUSS=' prefix after login();
            # prefix the bare token so it is not sent as 'BDUSS=BDUSS=...'.
            self._BDUSS_PREFIX + self._getRawBduss(),
            '_client_id=wappc_1396611108603_817',
            '_client_type=2',
            '_client_version=5.7.0',
            '_phone_imei=642b43b58d21b7a5814e1fd41b08e2a6',
            'from=tieba'
        ] + list(post_data or [])
        fields.append("sign=" + self._getSignByPostData(fields))
        return fields

    def _getRawBduss(self):
        """Return ``self.bduss`` without its leading 'BDUSS=' prefix, if any."""
        if self.bduss.startswith(self._BDUSS_PREFIX):
            return self.bduss[len(self._BDUSS_PREFIX):]
        return self.bduss

    def _buildHeaders(self, cookie, extra=None):
        """Return the default request headers with ``cookie``, updated by ``extra``."""
        headers = {'Referer': 'http://tieba.baidu.com/',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
                   'Connection': 'keep-alive',
                   'cookie': cookie}
        if extra:
            headers.update(extra)
        return headers

    def _fetchTbsInfo(self, cookie):
        """POST to the tbs endpoint with ``cookie``; return the decoded JSON."""
        content = self._request(self._TBS_URL, '', self._buildHeaders(cookie))
        return json.loads(content)

    def _request(self, url, data, headers):
        """Send a POST request and return the response body, closing the response."""
        response = urllib2.urlopen(urllib2.Request(url, data, headers))
        try:
            return response.read()
        finally:
            response.close()
163 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------