├── .gitignore
├── LICENSE
├── README.md
├── readme..txt
├── src
├── .gitignore
├── Sina_spider3
│   ├── __init__.py
│   ├── chromedriver.exe
│   ├── cleanRedis
│   ├── cookies.py
│   ├── items.py
│   ├── middleware.py
│   ├── pipelines.py
│   ├── scrapy_redis
│   │   ├── __init__.py
│   │   ├── connection.py
│   │   ├── dupefilter.py
│   │   ├── pipelines.py
│   │   ├── queue.py
│   │   ├── scheduler.py
│   │   ├── spiders.py
│   │   └── tests.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── sinaSpider.py
│   ├── user_agents.py
│   ├── weiboID.py
│   └── yumdama.py
├── chromedriver.exe
├── cookies(try).txt
├── launch.py
├── mysql.py
├── pipelines.py
├── readme..txt
├── scrapy.cfg
├── sql语句.sql
├── sql语句2.sql
└── 用到的工具.txt
├── visio制图
├── E-R图_Tweets.vsdx
├── E-R图_information.vsdx
├── E-R图_relationships.vsdx
└── 数据库概念模型E-R图.vsdx
├── 宋少忠_毕业论文终稿查重版陈巍瑜_大雅详细报告.pdf
├── 开题答辩报告
├── 开题报告1稿.docx
└── 开题报告2稿.docx
├── 毕业论文终稿.doc
├── 毕业设计微博json数据.rar
├── 毕设答辩pt.pptx
├── 论文二稿
├── readme..txt
├── 摘要与关键字.docx
├── 第一章
│   └── 论文初稿_绪论.docx
├── 第七章
│   └── 第7章结论与展望.docx
├── 第三章
│   ├── 3.1_需求.docx
│   ├── 3.2.1_非关系型数据库mongodb及其搭建.docx
│   ├── 3.2.4_redis简介及其搭建.docx
│   ├── 3.3_Scrapy框架.docx
│   └── 3.4_Srcapy+redis架构.docx
├── 第二章
│   ├── 2.1_爬虫的分类与作用.docx
│   ├── 2.2_http协议.docx
│   ├── 2.3_rebots协议.docx
│   └── 2.4_爬虫搜索策略-防止环路的出现.docx
├── 第五章
│   └── 第五章测试.docx
├── 第六章
│   ├── 6.1_数据模型.docx
│   └── 6.2_数据分析.docx
├── 第四章
│   ├── 4.1_微博移动版web分析.docx
│   ├── 4.2_User-agent伪装.docx
│   ├── 4.3_信息过滤规则-正则表达式.docx
│   ├── 4.4_查重.docx
│   ├── 4.5_反爬技术.docx
│   └── 4.6_Cookie池.docx
├── 致谢.docx
├── 草稿.docx
├── 论文初稿_参考文献.docx
├── 论文初稿_目录.docx
└── 题目.docx
├── 论文初稿
├── 第一章
│   └── 论文初稿_绪论.docx
├── 第三章
│   ├── 3.1.1_非关系型数据库mongodb及其搭建.docx
│   ├── 3.1.4_redis简介及其搭建.docx
│   ├── 3.2_Scrapy框架.docx
│   └── 3.3_Srcapy+redis架构.docx
├── 第二章
│   ├── 2.1_爬虫的分类与作用.docx
│   ├── 2.2_http协议.docx
│   ├── 2.3_rebots协议.docx
│   ├── 2.4_微博移动版web分析.docx
│   ├── 2.5_User-agent伪装.docx
│   └── 2.6_信息过滤规则-正则表达式.docx
├── 第五章
│   ├── 5.1_数据模型.docx
│   └── 5.2_数据分析.docx
├── 第六章
│   └── 论文初稿_总结与展望.docx
├── 第四章
│   ├── 4.1_爬虫搜索策略-防止环路的出现.docx
│   ├── 4.2_查重.docx
│   ├── 4.3_反爬技术.docx
│   └── 4.4_Cookie池.docx
├── 论文初稿_参考文献.docx
├── 论文初稿_目录.docx
└── 附录
│   └── 环境.txt
├── 论文改一.docx
└── 论文改二.docx
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Python爬取网易音乐的网络爬虫 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed_Web_Spider 2 | (本科毕业设计)基于网络爬虫的数据分析系统的实现: 用 python2.7+Scrapy-Redis 分布式架构下的网络爬虫,用 json 编码+Cookies 池+搜索策略BFS+破解验证码+布隆过滤器+对抗AJAX, Redis 放于内存中去重队列并且实现断点继爬而 Mongodb 做磁盘持久化,数据采集微博移动版 web 用户信息关系数据等等共 400w 条数据. 
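For reference, the Redis-side de-duplication mentioned above is implemented as a per-user-id bitmap rather than a classic multi-hash Bloom filter: every crawled Weibo uid is mapped to one bit under the `SinaSpider:dupefilter*` keys (see `src/Sina_spider3/scrapy_redis/dupefilter.py` and the `FILTER_*` options in `settings.py`), which is what lets the visited set live in Redis memory and survive a restart together with `SCHEDULER_PERSIST = True`. A minimal sketch of that idea, assuming a local Redis instance and the project's key prefix:

```python
import redis

# One bit per user id, split across 4-billion-bit bitmaps the same way the
# project's RFPDupeFilter does it (getbit/setbit keyed on the numeric uid).
r = redis.Redis('localhost', 6379, db=0)   # FILTER_HOST / FILTER_PORT / FILTER_DB

def seen(uid, prefix='SinaSpider:dupefilter'):
    """Return True if this uid was already crawled; otherwise mark it and return False."""
    uid = int(uid)
    key = '%s%d' % (prefix, uid // 4000000000)   # -> dupefilter0, dupefilter1, ...
    offset = uid % 4000000000
    if r.getbit(key, offset):
        return True
    r.setbit(key, offset, 1)   # read and write are not atomic here; fine for a sketch
    return False
```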
3 | -------------------------------------------------------------------------------- /readme..txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/readme..txt -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask instance folder 58 | instance/ 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | /.idea 85 | /.setting 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # Created by .ignore support plugin (hsz.mobi) 94 | -------------------------------------------------------------------------------- /src/Sina_spider3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/Sina_spider3/__init__.py -------------------------------------------------------------------------------- /src/Sina_spider3/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/Sina_spider3/chromedriver.exe -------------------------------------------------------------------------------- /src/Sina_spider3/cleanRedis: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | # ------------------------------------------ 3 | # 作用:清空Redis数据,重新跑数据时用。 4 | # 5 | # ------------------------------------------ 6 | 7 | import settings 8 | import redis 9 | 10 | if __name__ == '__main__': 11 | try: 12 | rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT, settings.REDIS_DB) 13 | except Exception: 14 | rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT) 15 | 16 | try: 17 | rconn_filter = redis.Redis(settings.FILTER_HOST, settings.FILTER_PORT, settings.FILTER_DB) 18 | except Exception: 19 | try: 20 | rconn_filter = 
redis.Redis(settings.FILTER_HOST, settings.FILTER_PORT) 21 | except Exception: 22 | rconn_filter = None 23 | 24 | if rconn: 25 | if 'SinaSpider:requests' in rconn.keys(): 26 | rconn.delete('SinaSpider:requests') 27 | 28 | if rconn_filter: 29 | if 'SinaSpider:dupefilter0' in rconn.keys(): 30 | rconn.delete('SinaSpider:dupefilter0') 31 | if 'SinaSpider:dupefilter1' in rconn.keys(): 32 | rconn.delete('SinaSpider:dupefilter1') 33 | 34 | print 'Finish!' 35 | -------------------------------------------------------------------------------- /src/Sina_spider3/cookies.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | # ------------------------------------------ 4 | ''' 5 | import base64 6 | import os 7 | import requests 8 | import time 9 | import json 10 | from selenium import webdriver 11 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 12 | import logging 13 | from yumdama import identify 14 | 15 | IDENTIFY = 1 # 验证码输入方式: 1:看截图aa.png,手动输入 2:云打码 16 | COOKIE_GETWAY = 0 # 0 代表从https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18) 获取cookie # 1 代表从https://weibo.cn/login/获取Cookie 17 | dcap = dict(DesiredCapabilities.PHANTOMJS) # PhantomJS需要使用老版手机的user-agent,不然验证码会无法通过 18 | dcap["phantomjs.page.settings.userAgent"] = ( 19 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 20 | ) 21 | logger = logging.getLogger(__name__) 22 | logging.getLogger("selenium").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING,太烦人 23 | 24 | """ 25 | 输入你的微博账号和密码,可去淘宝买,一元5个。 26 | 建议买几十个,实际生产建议100+,微博反爬得厉害,太频繁了会出现302转移。 27 | """ 28 | myWeiBo = [ 29 | ('13467408430', 'aogan571'), 30 | ('15774109579','bbx4768'), 31 | 32 | ('17877727541','bbx3464'), 33 | 34 | ('15898562769','bbx8712'), 35 | 36 | ('18407320608','bbx2145'), 37 | 38 | ('15973424313','bbx8431'), 39 | 40 | ('13762898341','bbx3186'), 41 | 42 | ('18374112533','bbx9829'), 43 | 44 | ('15274883774','bbx8748'), 45 | 46 | ('13873384591','bbx7247'), 47 | 48 | ('13974708834','bbx2579'), 49 | 50 | ('18474777738','bbx3957'), 51 | 52 | ('18397779843','bbx4491'), 53 | 54 | ('15197752390','bbx1831'), 55 | 56 | ('15273563186','bbx9756') 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | ] 72 | 73 | 74 | def getCookie(account, password): 75 | if COOKIE_GETWAY == 0: 76 | return get_cookie_from_login_sina_com_cn(account, password) 77 | elif COOKIE_GETWAY ==1: 78 | return get_cookie_from_weibo_cn(account, password) 79 | else: 80 | logger.error("COOKIE_GETWAY Error!") 81 | 82 | def get_cookie_from_login_sina_com_cn(account, password): 83 | """ 获取一个账号的Cookie """ 84 | loginURL = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)" 85 | username = base64.b64encode(account.encode("utf-8")).decode("utf-8") 86 | postData = { 87 | "entry": "sso", 88 | "gateway": "1", 89 | "from": "null", 90 | "savestate": "30", 91 | "useticket": "0", 92 | "pagerefer": "", 93 | "vsnf": "1", 94 | "su": username, 95 | "service": "sso", 96 | "sp": password, 97 | "sr": "1440*900", 98 | "encoding": "UTF-8", 99 | "cdult": "3", 100 | "domain": "sina.com.cn", 101 | "prelt": "0", 102 | "returntype": "TEXT", 103 | } 104 | session = requests.Session() 105 | r = session.post(loginURL, data=postData) 106 | jsonStr = r.content.decode("gbk") 107 | info = json.loads(jsonStr) 108 | if info["retcode"] == "0": 109 | logger.warning("Get Cookie Success!( Account:%s )" % account) 110 | cookie = 
session.cookies.get_dict() 111 | return json.dumps(cookie) 112 | else: 113 | logger.warning("Failed!( Reason:%s )" % info["reason"]) 114 | return "" 115 | 116 | 117 | def get_cookie_from_weibo_cn(account, password): 118 | """ 获取一个账号的Cookie """ 119 | try: 120 | browser = webdriver.PhantomJS(desired_capabilities=dcap) 121 | browser.get("https://weibo.cn/login/") 122 | time.sleep(1) 123 | 124 | failure = 0 125 | while "微博" in browser.title and failure < 5: 126 | failure += 1 127 | browser.save_screenshot("aa.png") 128 | username = browser.find_element_by_name("mobile") 129 | username.clear() 130 | username.send_keys(account) 131 | 132 | psd = browser.find_element_by_xpath('//input[@type="password"]') 133 | psd.clear() 134 | psd.send_keys(password) 135 | try: 136 | code = browser.find_element_by_name("code") 137 | code.clear() 138 | if IDENTIFY == 1: 139 | code_txt = raw_input("请查看路径下新生成的aa.png,然后输入验证码:") # 手动输入验证码 140 | else: 141 | from PIL import Image 142 | img = browser.find_element_by_xpath('//form[@method="post"]/div/img[@alt="请打开图片显示"]') 143 | x = img.location["x"] 144 | y = img.location["y"] 145 | im = Image.open("aa.png") 146 | im.crop((x, y, 100 + x, y + 22)).save("ab.png") # 剪切出验证码 147 | code_txt = identify() # 验证码打码平台识别 148 | code.send_keys(code_txt) 149 | except Exception, e: 150 | pass 151 | 152 | #commit = browser.find_element_by_name("submit") 153 | commit = browser.find_element_by_xpath('//a[@id="loginAction"]') 154 | 155 | commit.click() 156 | time.sleep(3) 157 | if "我的首页" not in browser.title: 158 | time.sleep(4) 159 | if '未激活微博' in browser.page_source: 160 | print '账号未开通微博' 161 | return {} 162 | 163 | cookie = {} 164 | if "我的首页" in browser.title: 165 | for elem in browser.get_cookies(): 166 | cookie[elem["name"]] = elem["value"] 167 | logger.warning("Get Cookie Success!( Account:%s )" % account) 168 | return json.dumps(cookie) 169 | except Exception, e: 170 | logger.warning("Failed %s!" % account) 171 | return "" 172 | finally: 173 | try: 174 | browser.quit() 175 | except Exception, e: 176 | pass 177 | 178 | 179 | def initCookie(rconn, spiderName): 180 | """ 获取所有账号的Cookies,存入Redis。如果Redis已有该账号的Cookie,则不再获取。 """ 181 | for weibo in myWeiBo: 182 | if rconn.get("%s:Cookies:%s--%s" % (spiderName, weibo[0], weibo[1])) is None: # 'SinaSpider:Cookies:账号--密码',为None即不存在。 183 | cookie = getCookie(weibo[0], weibo[1]) 184 | if len(cookie) > 0: 185 | rconn.set("%s:Cookies:%s--%s" % (spiderName, weibo[0], weibo[1]), cookie) 186 | cookieNum = "".join(rconn.keys()).count("SinaSpider:Cookies") 187 | logger.warning("The num of the cookies is %s" % cookieNum) 188 | if cookieNum == 0: 189 | logger.warning('Stopping...') 190 | os.system("pause") 191 | 192 | 193 | def updateCookie(accountText, rconn, spiderName): 194 | """ 更新一个账号的Cookie """ 195 | account = accountText.split("--")[0] 196 | password = accountText.split("--")[1] 197 | cookie = getCookie(account, password) 198 | if len(cookie) > 0: 199 | logger.warning("The cookie of %s has been updated successfully!" % account) 200 | rconn.set("%s:Cookies:%s" % (spiderName, accountText), cookie) 201 | else: 202 | logger.warning("The cookie of %s updated failed! Remove it!" 
% accountText) 203 | removeCookie(accountText, rconn, spiderName) 204 | 205 | 206 | def removeCookie(accountText, rconn, spiderName): 207 | """ 删除某个账号的Cookie """ 208 | rconn.delete("%s:Cookies:%s" % (spiderName, accountText)) 209 | cookieNum = "".join(rconn.keys()).count("SinaSpider:Cookies") 210 | logger.warning("The num of the cookies left is %s" % cookieNum) 211 | if cookieNum == 0: 212 | logger.warning("Stopping...") 213 | os.system("pause") 214 | 215 | 216 | # encoding=utf-8 217 | 218 | ''' 219 | import base64 220 | import requests 221 | import sys 222 | import time 223 | from selenium import webdriver 224 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 225 | from selenium.webdriver import ActionChains 226 | from selenium.webdriver.common.by import By 227 | from selenium.webdriver.support.ui import WebDriverWait 228 | from selenium.webdriver.support import expected_conditions as EC 229 | import logging 230 | from yumdama import identify 231 | import json 232 | 233 | reload(sys) 234 | sys.setdefaultencoding('utf-8') 235 | IDENTIFY = 1 # 验证码输入方式: 1:看截图aa.png,手动输入 2:云打码 236 | COOKIE_GETWAY =2 # 0 从https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18) 获取cookie # 1 从https://weibo.cn/login/获取Cookie # 2 使用chromedriver获取 237 | dcap = dict(DesiredCapabilities.PHANTOMJS) # PhantomJS需要使用老版手机的user-agent,不然验证码会无法通过 238 | dcap["phantomjs.page.settings.userAgent"] = ( 239 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 240 | ) 241 | logger = logging.getLogger(__name__) 242 | logging.getLogger("selenium").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING,太烦人 243 | 244 | 245 | """ 246 | 输入你的微博账号和密码,可去淘宝买。 247 | 建议买几十个,微博限制的严,太频繁了会出现302转移。 248 | 或者你也可以把时间间隔调大点。 249 | """ 250 | 251 | myWeiBo = [ 252 | # {'no': '18342808545','psw':'989527tx'}, 这些被封号了 253 | # {'no': '13655458602','psw':'943715tx'}, 254 | # {'no': '15529387149','psw':'222449sl'}, 255 | # {'no': '15273181439','psw':'222449sl'}, 256 | # {'no': '18476497826','psw':'22222a'}, 257 | # {'no': '18475447137','psw':'222449sl'} 258 | # {'no':'18101496480','psw':'325928lg'}, 259 | # {'no':'clab5570@163.com','psw':'7flxtedsnd'}, 260 | # {'no':'17163474885','psw':'216147vz'}, 261 | #{'no':'17084824243','psw':'951554vz'}, 262 | # {'no':'17162241240','psw':'247851vz'}, 263 | # {'no':'15836164273','psw':'897765vz'}, 264 | # {'no':'17162241495','psw':'114831vz'}, 265 | 266 | 267 | 268 | 269 | #测试 270 | 271 | #{'no':'15874173914','psw':'bbx8514'}, 272 | 273 | #{'no':'15774109579','psw':'bbx4768'}, 274 | 275 | #{'no':'17877727541','psw':'bbx3464'}, 276 | 277 | {'no':'15898562769','psw':'bbx8712'}, 278 | 279 | {'no':'18407320608','psw':'bbx2145'}, 280 | 281 | {'no':'15973424313','psw':'bbx8431'}, 282 | 283 | #{'no':'13762898341','psw':'bbx3186'}, 284 | 285 | #{'no':'18374112533','psw':'bbx9829'}, 286 | 287 | #{'no':'15274883774','psw':'bbx8748'}, 288 | 289 | #{'no':'13873384591','psw':'bbx7247'}, 290 | 291 | #{'no':'13974708834','psw':'bbx2579'}, 292 | 293 | #{'no':'18474777738','psw':'bbx3957'}, 294 | 295 | #{'no':'18397779843','psw':'bbx4491'}, 296 | 297 | #{'no':'15197752390','psw':'bbx1831'}, 298 | 299 | #{'no':'15273563186','psw':'bbx9756'}, 300 | 301 | 302 | 303 | 304 | 305 | ] 306 | 307 | def getCookie(account, password): 308 | if COOKIE_GETWAY == 0: 309 | return get_cookie_from_login_sina_com_cn(account, password) 310 | elif COOKIE_GETWAY == 1: 311 | return get_cookie_from_weibo_cn(account, password) 312 | elif 
COOKIE_GETWAY == 2: 313 | return get_cookie_from_weibo(account, password) 314 | else: 315 | logger.error("COOKIE_GETWAY Error!") 316 | 317 | 318 | def get_cookie_from_login_sina_com_cn(account, password): 319 | """ 获取一个账号的Cookie """ 320 | loginURL = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)" 321 | username = base64.b64encode(account.encode("utf-8")).decode("utf-8") 322 | postData = { 323 | "entry": "sso", 324 | "gateway": "1", 325 | "from": "null", 326 | "savestate": "30", 327 | "useticket": "0", 328 | "pagerefer": "", 329 | "vsnf": "1", 330 | "su": username, 331 | "service": "sso", 332 | "sp": password, 333 | "sr": "1440*900", 334 | "encoding": "UTF-8", 335 | "cdult": "3", 336 | "domain": "sina.com.cn", 337 | "prelt": "0", 338 | "returntype": "TEXT", 339 | } 340 | session = requests.Session() 341 | r = session.post(loginURL, data=postData) 342 | jsonStr = r.content.decode("gbk") 343 | info = json.loads(jsonStr) 344 | if info["retcode"] == "0": 345 | logger.warning("Get Cookie Success!( Account:%s )" % account) 346 | cookie = session.cookies.get_dict() 347 | return json.dumps(cookie) 348 | else: 349 | logger.warning("Failed!( Reason:%s )" % info["reason"]) 350 | return "" 351 | 352 | 353 | def get_cookie_from_weibo_cn(account, password): 354 | """ 获取一个账号的Cookie """ 355 | try: 356 | browser = webdriver.PhantomJS(executable_path='E:\\phantomjs\\bin\\phantomjs.exe',desired_capabilities=dcap) 357 | browser.get("https://weibo.cn/login/") 358 | time.sleep(1) 359 | 360 | failure = 0 361 | while "微博" in browser.title and failure < 5: 362 | failure += 1 363 | browser.save_screenshot("aa.png") 364 | username = browser.find_element_by_name("mobile") 365 | username.clear() 366 | username.send_keys(account) 367 | 368 | psd = browser.find_element_by_xpath('//input[@type="password"]') 369 | psd.clear() 370 | psd.send_keys(password) 371 | try: 372 | code = browser.find_element_by_name("code") 373 | code.clear() 374 | if IDENTIFY == 1: 375 | code_txt = raw_input("请查看路径下新生成的aa.png,然后输入验证码:") # 手动输入验证码 376 | else: 377 | from PIL import Image 378 | img = browser.find_element_by_xpath('//form[@method="post"]/div/img[@alt="请打开图片显示"]') 379 | x = img.location["x"] 380 | y = img.location["y"] 381 | im = Image.open("aa.png") 382 | im.crop((x, y, 100 + x, y + 22)).save("ab.png") # 剪切出验证码 383 | code_txt = identify() # 验证码打码平台识别 384 | code.send_keys(code_txt) 385 | except Exception, e: 386 | pass 387 | 388 | commit = browser.find_element_by_name("submit") 389 | commit.click() 390 | time.sleep(3) 391 | if "我的首页" not in browser.title: 392 | time.sleep(4) 393 | if '未激活微博' in browser.page_source: 394 | print '账号未开通微博' 395 | return {} 396 | 397 | cookie = {} 398 | if "我的首页" in browser.title: 399 | for elem in browser.get_cookies(): 400 | cookie[elem["name"]] = elem["value"] 401 | logger.warning("Get Cookie Success!( Account:%s )" % account) 402 | return json.dumps(cookie) 403 | except Exception, e: 404 | logger.warning("Failed %s!" 
% account) 405 | logger.warning(e) 406 | return "" 407 | finally: 408 | try: 409 | browser.quit() 410 | except Exception, e: 411 | pass 412 | 413 | 414 | def get_cookie_from_weibo(username, password): 415 | driver = webdriver.Chrome() 416 | driver.get('https://weibo.cn//login/') 417 | time.sleep(10) 418 | assert u"微博" in driver.title 419 | login_link = driver.find_element_by_link_text(u'登录') 420 | ActionChains(driver).move_to_element(login_link).click().perform() 421 | login_name = WebDriverWait(driver, 10).until( 422 | EC.visibility_of_element_located((By.ID, "loginName")) 423 | ) 424 | login_password = driver.find_element_by_id("loginPassword") 425 | login_name.send_keys(username) 426 | login_password.send_keys(password) 427 | login_button = driver.find_element_by_id("loginAction") 428 | login_button.click() #自动按下登陆 429 | WebDriverWait(driver, 30).until(EC.title_is(u"我的首页")) #等待过了验证到首页 430 | cookie = driver.get_cookies() 431 | driver.close() 432 | return json.dumps(cookie) 433 | 434 | 435 | def getCookies(weibo): 436 | """ 获取Cookies """ 437 | cookies = [] 438 | for elem in weibo: 439 | account = elem['no'] 440 | password = elem['psw'] 441 | cookie = getCookie(account, password) 442 | # logger.warning(type(cookie)) 443 | if cookie != None: 444 | cookies.append(cookie) 445 | 446 | return cookies 447 | 448 | 449 | cookies = getCookies(myWeiBo) 450 | # logger.warning(type(cookies)) 451 | # logger.warning(cookies) 452 | logger.warning("Get Cookies Finish!( Num:%d)" % len(cookies)) 453 | 454 | 455 | 456 | 457 | 458 | -------------------------------------------------------------------------------- /src/Sina_spider3/items.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | from scrapy import Item, Field 4 | 5 | 6 | class InformationItem(Item): 7 | """ 个人信息 """ 8 | _id = Field() # 用户ID 9 | NickName = Field() # 昵称 10 | Gender = Field() # 性别 11 | Province = Field() # 所在省 12 | City = Field() # 所在城市 13 | BriefIntroduction = Field() # 简介 14 | Birthday = Field() # 生日 15 | Num_Tweets = Field() # 微博数 16 | Num_Follows = Field() # 关注数 17 | Num_Fans = Field() # 粉丝数 18 | SexOrientation = Field() # 性取向 19 | Sentiment = Field() # 感情状况 20 | VIPlevel = Field() # 会员等级 21 | Authentication = Field() # 认证 22 | URL = Field() # 首页链接 23 | 24 | 25 | class TweetsItem(Item): 26 | """ 微博信息 """ 27 | _id = Field() # 用户ID-微博ID 28 | ID = Field() # 用户ID 29 | Content = Field() # 微博内容 30 | PubTime = Field() # 发表时间 31 | Co_oridinates = Field() # 定位坐标 32 | Tools = Field() # 发表工具/平台 33 | Like = Field() # 点赞数 34 | Comment = Field() # 评论数 35 | Transfer = Field() # 转载数 36 | 37 | 38 | class RelationshipsItem(Item): 39 | """ 用户关系,只保留与关注的关系 """ 40 | Host1 = Field() 41 | Host2 = Field() # 被关注者的ID 42 | -------------------------------------------------------------------------------- /src/Sina_spider3/middleware.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | # ------------------------------------------ 3 | ''' 4 | import os 5 | import random 6 | import redis 7 | import json 8 | import logging 9 | from user_agents import agents 10 | from cookies import initCookie, updateCookie, removeCookie 11 | #from cookies import cookies 12 | from scrapy.exceptions import IgnoreRequest 13 | from scrapy.utils.response import response_status_message 14 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class UserAgentMiddleware(object): 20 | """ 换User-Agent """ 
21 | 22 | def process_request(self, request, spider): 23 | agent = random.choice(agents) 24 | request.headers["User-Agent"] = agent 25 | 26 | 27 | class CookiesMiddleware(RetryMiddleware): 28 | """ 维护Cookie """ 29 | 30 | def __init__(self, settings, crawler): 31 | RetryMiddleware.__init__(self, settings) 32 | self.rconn = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379))) 33 | initCookie(self.rconn, crawler.spider.name) 34 | 35 | @classmethod 36 | def from_crawler(cls, crawler): 37 | return cls(crawler.settings, crawler) 38 | 39 | def process_request(self, request, spider): 40 | redisKeys = self.rconn.keys() 41 | while len(redisKeys) > 0: 42 | elem = random.choice(redisKeys) 43 | if "SinaSpider:Cookies" in elem: 44 | cookie = json.loads(self.rconn.get(elem)) 45 | request.cookies = cookie 46 | request.meta["accountText"] = elem.split("Cookies:")[-1] 47 | break 48 | else: 49 | redisKeys.remove(elem) 50 | 51 | def process_response(self, request, response, spider): 52 | if response.status in [300, 301, 302, 303]: 53 | try: 54 | redirect_url = response.headers["location"] 55 | if "login.weibo" in redirect_url or "login.sina" in redirect_url: # Cookie失效 56 | logger.warning("One Cookie need to be updating...") 57 | updateCookie(request.meta['accountText'], self.rconn, spider.name) 58 | elif "weibo.cn/security" in redirect_url: # 账号被限 59 | logger.warning("One Account is locked! Remove it!") 60 | removeCookie(request.meta["accountText"], self.rconn, spider.name) 61 | elif "weibo.cn/pub" in redirect_url: 62 | logger.warning( 63 | "Redirect to 'http://weibo.cn/pub'!( Account:%s )" % request.meta["accountText"].split("--")[0]) 64 | reason = response_status_message(response.status) 65 | return self._retry(request, reason, spider) or response # 重试 66 | except Exception, e: 67 | raise IgnoreRequest 68 | elif response.status in [403, 414]: 69 | logger.error("%s! Stopping..." 
% response.status) 70 | os.system("pause") 71 | else: 72 | return response''' 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | # encoding=utf-8 81 | import random 82 | from cookies import cookies 83 | from user_agents import agents 84 | # from getproxies import proxies 85 | import json 86 | 87 | 88 | class UserAgentMiddleware(object): 89 | """ 换User-Agent """ 90 | 91 | def process_request(self, request, spider): 92 | agent = random.choice(agents) 93 | request.headers.setdefault("User-Agent", agent) 94 | # request.headers["User-Agent"] = agent 95 | 96 | 97 | class CookiesMiddleware(object): 98 | """ 换Cookie """ 99 | 100 | def process_request(self, request, spider): 101 | cookie = json.loads(random.choice(cookies)) 102 | # print cookie 103 | request.cookies = cookie 104 | 105 | 106 | # class ProxyMiddleware(object): 107 | # """ 获取开放IP """ 108 | # def process_request(self, request, spider): 109 | # url = "http://ip.chinaz.com/getip.aspx" 110 | # while True: 111 | # proxy = random.choice(proxies) 112 | # ip = proxy.strip().split("\t") 113 | # proxy_host = "http://"+ip[0]+":"+ip[1] 114 | # proxy_temp = {"http":proxy_host} 115 | # res = urllib.urlopen(url,proxies=proxy_temp).read() 116 | # request.meta['proxy'] = "http://" + proxy['ip_port'] 117 | # try: 118 | # pass -------------------------------------------------------------------------------- /src/Sina_spider3/pipelines.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import pymongo 3 | from items import InformationItem, TweetsItem, RelationshipsItem 4 | import MySQLdb 5 | 6 | class MysqlDBPipeline(object): 7 | def __init__(self): 8 | self.count = 1 9 | self.conn = MySQLdb.connect( 10 | host='localhost', 11 | port=3306, 12 | user='root', 13 | #这里填写密码 14 | passwd='chen960212', 15 | db='sinaweibo', 16 | charset='utf8', 17 | ) 18 | self.cur = self.conn.cursor() 19 | 20 | def process_item(self, item, spider): 21 | """ 判断item的类型,并作相应的处理,再入数据库 """ 22 | if isinstance(item, RelationshipsItem): 23 | try: 24 | print("***********at beginning of saving**********") 25 | print(dict(item)) 26 | sql = '' 27 | sql+=str('INSERT INTO SinaWeibo.relationship (`Host1`,`Host2`) ') 28 | sql+=str(' Values(\'' ) 29 | sql+=str(item['Host1']) 30 | print(sql) 31 | sql+=str('\', \'') 32 | sql+=str(item['Host2']) 33 | sql+=str('\')') 34 | print("*********** SQL SYNTAX *********** ") 35 | print(''.join(sql)) 36 | self.cur.execute(sql) 37 | self.conn.commit() 38 | print("saved") 39 | self.count = self.count +1 40 | print(self.count) 41 | except Exception: 42 | pass 43 | elif isinstance(item, TweetsItem): 44 | try: 45 | print("***********at beginning of saving**********") 46 | 47 | sql = '' 48 | sql+=str('INSERT INTO SinaWeibo.tweets (`weibo_id`,`User_id`,`Content`,`Pubtime`,`Coordinates`,`Tools`,`Likes`,`Comments`,`Transfers`) ') 49 | sql+=str(' Values(\'' ) 50 | sql+=str(item['_id']) 51 | 52 | sql+=str('\', \'') 53 | sql+=str(item['ID']) 54 | sql+=str('\', \'') 55 | sql+=str(item['Content']) 56 | sql+=str('\', \'') 57 | sql+=str(item['PubTime']) 58 | 59 | sql+=str('\', \'') 60 | sql+=str(item['Co_oridinates']) 61 | sql+=str('\', \'') 62 | 63 | sql+=str(item['Tools']) 64 | print(sql) 65 | sql+=str('\', \'') 66 | sql+=str(item['Like']) 67 | sql+=str('\', \'') 68 | sql+=str(item['Comment']) 69 | sql+=str('\', \'') 70 | sql+=str(item['Transfer']) 71 | sql+=str('\')') 72 | print("*********** SQL SYNTAX *********** ") 73 | print(''.join(sql)) 74 | self.cur.execute(sql) 75 | self.conn.commit() 76 | print("saved") 77 | self.count 
= self.count +1 78 | print(self.count) 79 | except Exception: 80 | pass 81 | elif isinstance(item, InformationItem): 82 | try: 83 | print("***********at beginning of saving**********") 84 | 85 | sql = '' 86 | sql+=str('INSERT INTO SinaWeibo.information (`User_id`,`NickName`,`Gender`,`Province`,`City`,`BriefIntroduction`,`Birthday`,`Num_Tweets`,`Num_Follows`,`Num_Fans`,`SexOrientation`,`Sentiment`,`VIPlevel`,`Authentication`,`URL`) ') 87 | sql+=str(' Values(\'' ) 88 | sql+=str(item['_id']) 89 | 90 | sql+=str('\', \'') 91 | sql+=str(item['NickName']) 92 | sql+=str('\', \'') 93 | sql+=str(item['Gender']) 94 | sql+=str('\', \'') 95 | sql+=str(item['Province']) 96 | 97 | sql+=str('\', \'') 98 | sql+=str(item['City']) 99 | sql+=str('\', \'') 100 | sql+=str(item['BriefIntroduction']) 101 | sql+=str('\', \'') 102 | print(sql) 103 | sql+=str(item['Birthday']) 104 | sql+=str('\', \'') 105 | sql+=str(item['Num_Tweets']) 106 | 107 | sql+=str('\', \'') 108 | sql+=str(item['Num_Follows']) 109 | sql+=str('\', \'') 110 | sql+=str(item['Num_Fans']) 111 | sql+=str('\', \'') 112 | 113 | sql+=str(item['SexOrientation']) 114 | sql+=str('\', \'') 115 | sql+=str(item['Sentiment']) 116 | 117 | sql+=str('\', \'') 118 | sql+=str(item['VIPlevel']) 119 | sql+=str('\', \'') 120 | sql+=str(item['Authentication']) 121 | sql+=str('\', \'') 122 | sql+=str(item['URL']) 123 | sql+=str('\')') 124 | 125 | print("*********** SQL SYNTAX *********** ") 126 | print(''.join(sql)) 127 | self.cur.execute(sql) 128 | self.conn.commit() 129 | print("saved") 130 | self.count = self.count +1 131 | print(self.count) 132 | except Exception: 133 | pass 134 | 135 | ##在Java开发中,Dao连接会对内存溢出,需要定时断开重连,这里不清楚是否需要,先加上了 136 | if self.count == 1000: 137 | print("try reconnecting") 138 | self.count = 0 139 | self.cur.close() 140 | self.conn.close() 141 | self.conn = MySQLdb.connect( 142 | host='localhost', 143 | port=3306, 144 | user='root', 145 | passwd='chen960212', 146 | db='sinaweibo', 147 | charset='utf8', 148 | ) 149 | self.cur = self.conn.cursor() 150 | print("reconnect") 151 | 152 | return item 153 | 154 | 155 | 156 | class MongoDBPipeline(object): 157 | def __init__(self): 158 | clinet = pymongo.MongoClient("localhost", 27017) 159 | db = clinet["Spider_Sina_weibo"] 160 | self.Information = db["Information"] 161 | self.Tweets = db["Tweets"] 162 | self.Relationships = db["Relationships"] 163 | 164 | def process_item(self, item, spider): 165 | """ 判断item的类型,并作相应的处理,再入数据库 """ 166 | if isinstance(item, RelationshipsItem): 167 | try: 168 | self.Relationships.insert(dict(item)) 169 | except Exception: 170 | pass 171 | elif isinstance(item, TweetsItem): 172 | try: 173 | self.Tweets.insert(dict(item)) 174 | except Exception: 175 | pass 176 | elif isinstance(item, InformationItem): 177 | try: 178 | self.Information.insert(dict(item)) 179 | except Exception: 180 | pass 181 | return item 182 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/Sina_spider3/scrapy_redis/__init__.py -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | # Default values. 
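# Note: the REDIS_* defaults below back the shared request/seed queue
# (from_settings), while the FILTER_* defaults back the de-duplication
# bitmaps (from_settings_filter). The two groups may point at different
# Redis instances; they mirror the REDIS_*/FILTER_* options in settings.py.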
4 | REDIS_URL = None 5 | REDIS_HOST = 'localhost' 6 | REDIS_PORT = 6379 7 | 8 | FILTER_URL = None 9 | FILTER_HOST = 'localhost' 10 | FILTER_PORT = 6379 11 | FILTER_DB = 0 12 | 13 | 14 | def from_settings(settings): 15 | url = settings.get('REDIS_URL', REDIS_URL) 16 | host = settings.get('REDIS_HOST', REDIS_HOST) 17 | port = settings.get('REDIS_PORT', REDIS_PORT) 18 | 19 | # REDIS_URL takes precedence over host/port specification. 20 | if url: 21 | return redis.from_url(url) 22 | else: 23 | return redis.Redis(host=host, port=port) 24 | 25 | 26 | def from_settings_filter(settings): 27 | url = settings.get('FILTER_URL', FILTER_URL) 28 | host = settings.get('FILTER_HOST', FILTER_HOST) 29 | port = settings.get('FILTER_PORT', FILTER_PORT) 30 | db = settings.get('FILTER_DB', FILTER_DB) 31 | 32 | if url: 33 | return redis.from_url(url) 34 | else: 35 | return redis.Redis(host=host, port=port, db=db) 36 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | 6 | from . import connection 7 | 8 | 9 | class RFPDupeFilter(BaseDupeFilter): 10 | """Redis-based request duplication filter""" 11 | 12 | def __init__(self, server, key): 13 | """Initialize duplication filter 14 | 15 | Parameters 16 | ---------- 17 | server : Redis instance 18 | key : str 19 | Where to store fingerprints 20 | """ 21 | self.server = server 22 | self.key = key 23 | 24 | @classmethod 25 | def from_settings(cls, settings): 26 | server = connection.from_settings_filter(settings) 27 | key = "dupefilter:%s" % int(time.time()) 28 | return cls(server, key) 29 | 30 | @classmethod 31 | def from_crawler(cls, crawler): 32 | return cls.from_settings(crawler.settings) 33 | 34 | def request_seen(self, request): 35 | uid = re.findall('(\d+)/info', request.url) 36 | if uid: 37 | uid = int(uid[0]) 38 | isExist = self.server.getbit(self.key + str(uid / 4000000000), uid % 4000000000) 39 | if isExist == 1: 40 | return True 41 | else: 42 | self.server.setbit(self.key + str(uid / 4000000000), uid % 4000000000, 1) 43 | return False 44 | 45 | def close(self, reason): 46 | """Delete data on close. Called by scrapy's scheduler""" 47 | self.clear() 48 | 49 | def clear(self): 50 | """Clears fingerprints data""" 51 | self.server.delete(self.key) 52 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.serialize import ScrapyJSONEncoder 2 | from twisted.internet.threads import deferToThread 3 | 4 | from . 
import connection 5 | 6 | 7 | class RedisPipeline(object): 8 | """Pushes serialized item into a redis list/queue""" 9 | 10 | def __init__(self, server): 11 | self.server = server 12 | self.encoder = ScrapyJSONEncoder() 13 | 14 | @classmethod 15 | def from_settings(cls, settings): 16 | server = connection.from_settings(settings) 17 | return cls(server) 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | return cls.from_settings(crawler.settings) 22 | 23 | def process_item(self, item, spider): 24 | return deferToThread(self._process_item, item, spider) 25 | 26 | def _process_item(self, item, spider): 27 | key = self.item_key(item, spider) 28 | data = self.encoder.encode(item) 29 | self.server.rpush(key, data) 30 | return item 31 | 32 | def item_key(self, item, spider): 33 | """Returns redis key based on given spider""" 34 | return "%s:items" % spider.name 35 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | from scrapy.http import Request 3 | 4 | try: 5 | import cPickle as pickle 6 | except ImportError: 7 | import pickle 8 | 9 | 10 | class Base(object): 11 | """Per-spider queue/stack base class""" 12 | 13 | def __init__(self, server, spider, key, queue_name): 14 | """Initialize per-spider redis queue. 15 | 16 | Parameters: 17 | server -- redis connection 18 | spider -- spider instance 19 | key -- key for this queue (e.g. "%(spider)s:queue") 20 | """ 21 | self.server = server 22 | self.spider = spider 23 | self.key = key % {'spider': queue_name} 24 | 25 | def _encode_request(self, request): 26 | """Encode a request object""" 27 | return pickle.dumps(request_to_dict(request, self.spider), protocol=-1) 28 | 29 | def _decode_request(self, encoded_request): 30 | """Decode an request previously encoded""" 31 | return request_from_dict(pickle.loads(encoded_request), self.spider) 32 | 33 | def __len__(self): 34 | """Return the length of the queue""" 35 | raise NotImplementedError 36 | 37 | def push(self, request): 38 | """Push a request""" 39 | raise NotImplementedError 40 | 41 | def pop(self, timeout=0): 42 | """Pop a request""" 43 | raise NotImplementedError 44 | 45 | def clear(self): 46 | """Clear queue/stack""" 47 | self.server.delete(self.key) 48 | 49 | 50 | class SpiderQueue(Base): 51 | """Per-spider FIFO queue""" 52 | 53 | def __len__(self): 54 | """Return the length of the queue""" 55 | return self.server.llen(self.key) 56 | 57 | def push(self, request): 58 | """Push a request""" 59 | self.server.lpush(self.key, self._encode_request(request)) 60 | 61 | def pop(self, timeout=0): 62 | """Pop a request""" 63 | if timeout > 0: 64 | data = self.server.brpop(self.key, timeout) 65 | if isinstance(data, tuple): 66 | data = data[1] 67 | else: 68 | data = self.server.rpop(self.key) 69 | if data: 70 | return self._decode_request(data) 71 | 72 | 73 | class SpiderPriorityQueue(Base): 74 | """Per-spider priority queue abstraction using redis' sorted set""" 75 | 76 | def __len__(self): 77 | """Return the length of the queue""" 78 | return self.server.zcard(self.key) 79 | 80 | def push(self, request): 81 | """Push a request""" 82 | data = self._encode_request(request) 83 | pairs = {data: -request.priority} 84 | self.server.zadd(self.key, **pairs) 85 | 86 | def pop(self, timeout=0): 87 | """ 88 | Pop a request 89 | timeout not support in this queue class 90 | """ 91 | # 
use atomic range/remove using multi/exec 92 | pipe = self.server.pipeline() 93 | pipe.multi() 94 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 95 | results, count = pipe.execute() 96 | if results: 97 | return self._decode_request(results[0]) 98 | 99 | 100 | class SpiderSimpleQueue(Base): 101 | """ url + callback """ 102 | 103 | def __len__(self): 104 | """Return the length of the queue""" 105 | return self.server.llen(self.key) 106 | 107 | def push(self, request): 108 | """Push a request""" 109 | self.server.lpush(self.key, request.url[16:]) 110 | 111 | def pop(self, timeout=0): 112 | """Pop a request""" 113 | if timeout > 0: 114 | url = self.server.brpop(self.key, timeout=timeout) 115 | if isinstance(url, tuple): 116 | url = url[1] 117 | else: 118 | url = self.server.rpop(self.key) 119 | if url: 120 | try: 121 | if "/follow" in url or "/fans" in url: 122 | cb = getattr(self.spider, "parse_relationship") 123 | elif "/profile" in url: 124 | cb = getattr(self.spider, "parse_tweets") 125 | elif "/info" in url: 126 | cb = getattr(self.spider, "parse_information") 127 | else: 128 | raise ValueError("Method not found in: %s( URL:%s )" % (self.spider, url)) 129 | return Request(url="https://weibo.cn%s" % url, callback=cb) 130 | except AttributeError: 131 | raise ValueError("Method not found in: %s( URL:%s )" % (self.spider, url)) 132 | 133 | 134 | class SpiderStack(Base): 135 | """Per-spider stack""" 136 | 137 | def __len__(self): 138 | """Return the length of the stack""" 139 | return self.server.llen(self.key) 140 | 141 | def push(self, request): 142 | """Push a request""" 143 | self.server.lpush(self.key, self._encode_request(request)) 144 | 145 | def pop(self, timeout=0): 146 | """Pop a request""" 147 | if timeout > 0: 148 | data = self.server.blpop(self.key, timeout) 149 | if isinstance(data, tuple): 150 | data = data[1] 151 | else: 152 | data = self.server.lpop(self.key) 153 | 154 | if data: 155 | return self._decode_request(data) 156 | 157 | 158 | __all__ = ['SpiderQueue', 'SpiderPriorityQueue', 'SpiderSimpleQueue', 'SpiderStack'] 159 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | 3 | from . import connection 4 | from .dupefilter import RFPDupeFilter 5 | 6 | 7 | # default values 8 | SCHEDULER_PERSIST = False 9 | QUEUE_KEY = '%(spider)s:requests' 10 | QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue' 11 | DUPEFILTER_KEY = '%(spider)s:dupefilter' 12 | IDLE_BEFORE_CLOSE = 0 13 | 14 | 15 | class Scheduler(object): 16 | """Redis-based scheduler""" 17 | 18 | def __init__(self, server, server_filter, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, queue_name): 19 | """Initialize scheduler. 
20 | 21 | Parameters 22 | ---------- 23 | server : Redis instance 24 | persist : bool 25 | queue_key : str 26 | queue_cls : queue class 27 | dupefilter_key : str 28 | idle_before_close : int 29 | """ 30 | self.server = server 31 | self.server_filter = server_filter 32 | self.persist = persist 33 | self.queue_key = queue_key 34 | self.queue_cls = queue_cls 35 | self.dupefilter_key = dupefilter_key 36 | self.idle_before_close = idle_before_close 37 | self.queue_name = queue_name 38 | self.stats = None 39 | 40 | def __len__(self): 41 | return len(self.queue) 42 | 43 | @classmethod 44 | def from_settings(cls, settings): 45 | persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST) 46 | queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY) 47 | queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS)) 48 | queue_name = settings.get('REDIS_QUEUE_NAME', None) 49 | dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY) 50 | idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE) 51 | server = connection.from_settings(settings) 52 | server_filter = connection.from_settings_filter(settings) 53 | return cls(server, server_filter, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, queue_name) 54 | 55 | @classmethod 56 | def from_crawler(cls, crawler): 57 | instance = cls.from_settings(crawler.settings) 58 | # FIXME: for now, stats are only supported from this constructor 59 | instance.stats = crawler.stats 60 | return instance 61 | 62 | def open(self, spider): 63 | self.spider = spider 64 | self.queue = self.queue_cls(self.server, spider, self.queue_key, (self.queue_name if self.queue_name else spider.name)) 65 | self.df = RFPDupeFilter(self.server_filter, self.dupefilter_key % {'spider': (self.queue_name if self.queue_name else spider.name)}) 66 | if self.idle_before_close < 0: 67 | self.idle_before_close = 0 68 | # notice if there are requests already in the queue to resume the crawl 69 | if len(self.queue): 70 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 71 | 72 | def close(self, reason): 73 | if not self.persist: 74 | self.df.clear() 75 | self.queue.clear() 76 | 77 | def enqueue_request(self, request): 78 | if not request.dont_filter and self.df.request_seen(request): 79 | return 80 | if self.stats: 81 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 82 | self.queue.push(request) 83 | 84 | def next_request(self): 85 | block_pop_timeout = self.idle_before_close 86 | request = self.queue.pop(block_pop_timeout) 87 | if request and self.stats: 88 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 89 | return request 90 | 91 | def has_pending_requests(self): 92 | return len(self) > 0 93 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider, signals 2 | from scrapy.exceptions import DontCloseSpider 3 | 4 | from . import connection 5 | 6 | 7 | class RedisMixin(object): 8 | """Mixin class to implement reading urls from a redis queue.""" 9 | redis_key = None # use default ':start_urls' 10 | 11 | def setup_redis(self): 12 | """Setup redis connection and idle signal. 13 | 14 | This should be called after the spider has set its crawler object. 
15 | """ 16 | if not self.redis_key: 17 | self.redis_key = '%s:start_urls' % self.name 18 | 19 | self.server = connection.from_settings(self.crawler.settings) 20 | # idle signal is called when the spider has no requests left, 21 | # that's when we will schedule new requests from redis queue 22 | self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 23 | self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 24 | self.log("Reading URLs from redis list '%s'" % self.redis_key) 25 | 26 | def next_request(self): 27 | """Returns a request to be scheduled or none.""" 28 | url = self.server.lpop(self.redis_key) 29 | if url: 30 | return self.make_requests_from_url(url) 31 | 32 | def schedule_next_request(self): 33 | """Schedules a request if available""" 34 | req = self.next_request() 35 | if req: 36 | self.crawler.engine.crawl(req, spider=self) 37 | 38 | def spider_idle(self): 39 | """Schedules a request if available, otherwise waits.""" 40 | self.schedule_next_request() 41 | raise DontCloseSpider 42 | 43 | def item_scraped(self, *args, **kwargs): 44 | """Avoids waiting for the spider to idle before scheduling the next request""" 45 | self.schedule_next_request() 46 | 47 | 48 | class RedisSpider(RedisMixin, Spider): 49 | """Spider that reads urls from redis queue when idle.""" 50 | 51 | def _set_crawler(self, crawler): 52 | super(RedisSpider, self)._set_crawler(crawler) 53 | self.setup_redis() 54 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mock 4 | import redis 5 | 6 | from scrapy import Request, Spider 7 | from unittest import TestCase 8 | 9 | from . 
import connection 10 | from .dupefilter import RFPDupeFilter 11 | from .queue import SpiderQueue, SpiderPriorityQueue, SpiderStack 12 | from .scheduler import Scheduler 13 | 14 | 15 | # allow test settings from environment 16 | REDIS_HOST = os.environ.get('REDIST_HOST', 'localhost') 17 | REDIS_PORT = int(os.environ.get('REDIS_PORT', 6379)) 18 | 19 | 20 | class RedisTestMixin(object): 21 | 22 | @property 23 | def server(self): 24 | if not hasattr(self, '_redis'): 25 | self._redis = redis.Redis(REDIS_HOST, REDIS_PORT) 26 | return self._redis 27 | 28 | def clear_keys(self, prefix): 29 | keys = self.server.keys(prefix + '*') 30 | if keys: 31 | self.server.delete(*keys) 32 | 33 | 34 | class DupeFilterTest(RedisTestMixin, TestCase): 35 | 36 | def setUp(self): 37 | self.key = 'scrapy_redis:tests:dupefilter:' 38 | self.df = RFPDupeFilter(self.server, self.key) 39 | 40 | def tearDown(self): 41 | self.clear_keys(self.key) 42 | 43 | def test_dupe_filter(self): 44 | req = Request('http://example.com') 45 | 46 | self.assertFalse(self.df.request_seen(req)) 47 | self.assertTrue(self.df.request_seen(req)) 48 | 49 | self.df.close('nothing') 50 | 51 | 52 | class QueueTestMixin(RedisTestMixin): 53 | 54 | queue_cls = None 55 | 56 | def setUp(self): 57 | self.spider = Spider('myspider') 58 | self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name 59 | self.q = self.queue_cls(self.server, Spider('myspider'), self.key) 60 | 61 | def tearDown(self): 62 | self.clear_keys(self.key) 63 | 64 | def test_clear(self): 65 | self.assertEqual(len(self.q), 0) 66 | 67 | for i in range(10): 68 | # XXX: can't use same url for all requests as SpiderPriorityQueue 69 | # uses redis' set implemention and we will end with only one 70 | # request in the set and thus failing the test. It should be noted 71 | # that when using SpiderPriorityQueue it acts as a request 72 | # duplication filter whenever the serielized requests are the same. 73 | # This might be unwanted on repetitive requests to the same page 74 | # even with dont_filter=True flag. 
75 | req = Request('http://example.com/?page=%s' % i) 76 | self.q.push(req) 77 | self.assertEqual(len(self.q), 10) 78 | 79 | self.q.clear() 80 | self.assertEqual(len(self.q), 0) 81 | 82 | 83 | class SpiderQueueTest(QueueTestMixin, TestCase): 84 | 85 | queue_cls = SpiderQueue 86 | 87 | def test_queue(self): 88 | req1 = Request('http://example.com/page1') 89 | req2 = Request('http://example.com/page2') 90 | 91 | self.q.push(req1) 92 | self.q.push(req2) 93 | 94 | out1 = self.q.pop() 95 | out2 = self.q.pop() 96 | 97 | self.assertEqual(out1.url, req1.url) 98 | self.assertEqual(out2.url, req2.url) 99 | 100 | 101 | class SpiderPriorityQueueTest(QueueTestMixin, TestCase): 102 | 103 | queue_cls = SpiderPriorityQueue 104 | 105 | def test_queue(self): 106 | req1 = Request('http://example.com/page1', priority=100) 107 | req2 = Request('http://example.com/page2', priority=50) 108 | req3 = Request('http://example.com/page2', priority=200) 109 | 110 | self.q.push(req1) 111 | self.q.push(req2) 112 | self.q.push(req3) 113 | 114 | out1 = self.q.pop() 115 | out2 = self.q.pop() 116 | out3 = self.q.pop() 117 | 118 | self.assertEqual(out1.url, req3.url) 119 | self.assertEqual(out2.url, req1.url) 120 | self.assertEqual(out3.url, req2.url) 121 | 122 | 123 | class SpiderStackTest(QueueTestMixin, TestCase): 124 | 125 | queue_cls = SpiderStack 126 | 127 | def test_queue(self): 128 | req1 = Request('http://example.com/page1') 129 | req2 = Request('http://example.com/page2') 130 | 131 | self.q.push(req1) 132 | self.q.push(req2) 133 | 134 | out1 = self.q.pop() 135 | out2 = self.q.pop() 136 | 137 | self.assertEqual(out1.url, req2.url) 138 | self.assertEqual(out2.url, req1.url) 139 | 140 | 141 | class SchedulerTest(RedisTestMixin, TestCase): 142 | 143 | def setUp(self): 144 | self.persist = False 145 | self.key_prefix = 'scrapy_redis:tests:' 146 | self.queue_key = self.key_prefix + '%(spider)s:requests' 147 | self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter' 148 | self.idle_before_close = 0 149 | self.scheduler = Scheduler(self.server, self.persist, self.queue_key, 150 | SpiderQueue, self.dupefilter_key, 151 | self.idle_before_close) 152 | self.spider = Spider('myspider') 153 | 154 | def tearDown(self): 155 | self.clear_keys(self.key_prefix) 156 | 157 | def test_scheduler(self): 158 | # default no persist 159 | self.assertFalse(self.scheduler.persist) 160 | 161 | self.scheduler.open(self.spider) 162 | self.assertEqual(len(self.scheduler), 0) 163 | 164 | req = Request('http://example.com') 165 | self.scheduler.enqueue_request(req) 166 | self.assertTrue(self.scheduler.has_pending_requests()) 167 | self.assertEqual(len(self.scheduler), 1) 168 | 169 | # dupefilter in action 170 | self.scheduler.enqueue_request(req) 171 | self.assertEqual(len(self.scheduler), 1) 172 | 173 | out = self.scheduler.next_request() 174 | self.assertEqual(out.url, req.url) 175 | 176 | self.assertFalse(self.scheduler.has_pending_requests()) 177 | self.assertEqual(len(self.scheduler), 0) 178 | 179 | self.scheduler.close('finish') 180 | 181 | def test_scheduler_persistent(self): 182 | # TODO: Improve this test to avoid the need to check for log messages. 
183 | self.spider.log = mock.Mock(spec=self.spider.log) 184 | 185 | self.scheduler.persist = True 186 | self.scheduler.open(self.spider) 187 | 188 | self.assertEqual(self.spider.log.call_count, 0) 189 | 190 | self.scheduler.enqueue_request(Request('http://example.com/page1')) 191 | self.scheduler.enqueue_request(Request('http://example.com/page2')) 192 | 193 | self.assertTrue(self.scheduler.has_pending_requests()) 194 | self.scheduler.close('finish') 195 | 196 | self.scheduler.open(self.spider) 197 | self.spider.log.assert_has_calls([ 198 | mock.call("Resuming crawl (2 requests scheduled)"), 199 | ]) 200 | self.assertEqual(len(self.scheduler), 2) 201 | 202 | self.scheduler.persist = False 203 | self.scheduler.close('finish') 204 | 205 | self.assertEqual(len(self.scheduler), 0) 206 | 207 | 208 | class ConnectionTest(TestCase): 209 | 210 | # We can get a connection from just REDIS_URL. 211 | def test_redis_url(self): 212 | settings = dict( 213 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 214 | ) 215 | 216 | server = connection.from_settings(settings) 217 | connect_args = server.connection_pool.connection_kwargs 218 | 219 | self.assertEqual(connect_args['host'], 'localhost') 220 | self.assertEqual(connect_args['port'], 9001) 221 | self.assertEqual(connect_args['password'], 'bar') 222 | self.assertEqual(connect_args['db'], 42) 223 | 224 | # We can get a connection from REDIS_HOST/REDIS_PORT. 225 | def test_redis_host_port(self): 226 | settings = dict( 227 | REDIS_HOST = 'localhost', 228 | REDIS_PORT = 9001 229 | ) 230 | 231 | server = connection.from_settings(settings) 232 | connect_args = server.connection_pool.connection_kwargs 233 | 234 | self.assertEqual(connect_args['host'], 'localhost') 235 | self.assertEqual(connect_args['port'], 9001) 236 | 237 | # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT. 238 | def test_redis_url_precedence(self): 239 | settings = dict( 240 | REDIS_HOST = 'baz', 241 | REDIS_PORT = 1337, 242 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 243 | ) 244 | 245 | server = connection.from_settings(settings) 246 | connect_args = server.connection_pool.connection_kwargs 247 | 248 | self.assertEqual(connect_args['host'], 'localhost') 249 | self.assertEqual(connect_args['port'], 9001) 250 | self.assertEqual(connect_args['password'], 'bar') 251 | self.assertEqual(connect_args['db'], 42) 252 | 253 | # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None. 254 | def test_redis_host_port_fallback(self): 255 | settings = dict( 256 | REDIS_HOST = 'baz', 257 | REDIS_PORT = 1337, 258 | REDIS_URL = None 259 | ) 260 | 261 | server = connection.from_settings(settings) 262 | connect_args = server.connection_pool.connection_kwargs 263 | 264 | self.assertEqual(connect_args['host'], 'baz') 265 | self.assertEqual(connect_args['port'], 1337) 266 | 267 | # We use default values for REDIS_HOST/REDIS_PORT. 
268 |     def test_redis_default(self):
269 |         settings = dict()
270 | 
271 |         server = connection.from_settings(settings)
272 |         connect_args = server.connection_pool.connection_kwargs
273 | 
274 |         self.assertEqual(connect_args['host'], 'localhost')
275 |         self.assertEqual(connect_args['port'], 6379)
276 | 
--------------------------------------------------------------------------------
/src/Sina_spider3/settings.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | 
3 | 
4 | BOT_NAME = ['Sina_spider3']
5 | 
6 | SPIDER_MODULES = ['Sina_spider3.spiders']
7 | NEWSPIDER_MODULE = 'Sina_spider3.spiders'
8 | 
9 | DOWNLOADER_MIDDLEWARES = {
10 |     "Sina_spider3.middleware.UserAgentMiddleware": 401,
11 |     "Sina_spider3.middleware.CookiesMiddleware": 402,
12 | }
13 | ITEM_PIPELINES = {
14 |     "Sina_spider3.pipelines.MongoDBPipeline": 403,
15 | }
16 | 
17 | SCHEDULER = 'Sina_spider3.scrapy_redis.scheduler.Scheduler'
18 | SCHEDULER_PERSIST = True
19 | SCHEDULER_QUEUE_CLASS = 'Sina_spider3.scrapy_redis.queue.SpiderSimpleQueue'
20 | 
21 | # Seed (request) queue connection info
22 | REDIS_URL = None
23 | REDIS_HOST = 'localhost'
24 | REDIS_PORT = 6379
25 | 
26 | # Deduplication (dupefilter) queue connection info
27 | FILTER_URL = None
28 | FILTER_HOST = 'localhost'
29 | FILTER_PORT = 6379
30 | FILTER_DB = 0
31 | 
32 | DOWNLOAD_DELAY = 10 # delay between requests, in seconds
33 | # LOG_LEVEL = 'INFO' # log level
34 | CONCURRENT_REQUESTS = 1 # Scrapy default is 16
35 | # CONCURRENT_ITEMS = 1
36 | # CONCURRENT_REQUESTS_PER_IP = 1
37 | REDIRECT_ENABLED = False
38 | 
--------------------------------------------------------------------------------
/src/Sina_spider3/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
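The ConnectionTest cases in tests.py above pin down how a Redis client is built from these settings: REDIS_URL takes precedence when set, otherwise REDIS_HOST/REDIS_PORT are used, and localhost:6379 is the final fallback. The short sketch below only illustrates that precedence rule; it is not the project's own scrapy_redis/connection.py (whose source is not listed here) and it assumes the redis-py package is available.

import redis  # redis-py, assumed installed alongside scrapy-redis

DEFAULT_HOST = 'localhost'  # mirrors the fallback verified by test_redis_default
DEFAULT_PORT = 6379

def from_settings(settings):
    """Build a Redis client, preferring REDIS_URL over REDIS_HOST/REDIS_PORT."""
    url = settings.get('REDIS_URL')
    if url:  # e.g. 'redis://foo:bar@localhost:9001/42' as in test_redis_url
        return redis.from_url(url)
    return redis.StrictRedis(host=settings.get('REDIS_HOST', DEFAULT_HOST),
                             port=settings.get('REDIS_PORT', DEFAULT_PORT))

# No command is issued here, so this runs without a live server:
server = from_settings({'REDIS_HOST': 'localhost', 'REDIS_PORT': 6379})
print(server.connection_pool.connection_kwargs)  # includes 'host': 'localhost', 'port': 6379

In settings.py above, REDIS_URL is left as None, so the crawler falls back to REDIS_HOST/REDIS_PORT, i.e. the local Redis instance.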
5 | -------------------------------------------------------------------------------- /src/Sina_spider3/spiders/sinaSpider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | # 3 | import sys 4 | import logging 5 | import datetime 6 | import requests 7 | import re 8 | from lxml import etree 9 | from Sina_spider3.weiboID import weiboID 10 | from Sina_spider3.scrapy_redis.spiders import RedisSpider 11 | from scrapy.selector import Selector 12 | from scrapy.http import Request 13 | from Sina_spider3.items import TweetsItem, InformationItem, RelationshipsItem 14 | 15 | reload(sys) 16 | sys.setdefaultencoding('utf8') 17 | 18 | 19 | class Spider(RedisSpider): 20 | name = "SinaSpider" 21 | host = "https://weibo.cn" 22 | redis_key = "SinaSpider:start_urls" 23 | start_urls = list(set(weiboID)) 24 | logging.getLogger("requests").setLevel(logging.WARNING) # 将requests的日志级别设成WARNING 25 | 26 | def start_requests(self): 27 | for uid in self.start_urls: 28 | yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information) 29 | 30 | def parse_information(self, response): 31 | """ 抓取个人信息 """ 32 | informationItem = InformationItem() 33 | selector = Selector(response) 34 | ID = re.findall('(\d+)/info', response.url)[0] 35 | try: 36 | text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract()) # 获取标签里的所有text() 37 | nickname = re.findall('昵称[::]?(.*?);'.decode('utf8'), text1) 38 | gender = re.findall('性别[::]?(.*?);'.decode('utf8'), text1) 39 | place = re.findall('地区[::]?(.*?);'.decode('utf8'), text1) 40 | briefIntroduction = re.findall('简介[::]?(.*?);'.decode('utf8'), text1) 41 | birthday = re.findall('生日[::]?(.*?);'.decode('utf8'), text1) 42 | sexOrientation = re.findall('性取向[::]?(.*?);'.decode('utf8'), text1) 43 | sentiment = re.findall('感情状况[::]?(.*?);'.decode('utf8'), text1) 44 | vipLevel = re.findall('会员等级[::]?(.*?);'.decode('utf8'), text1) 45 | authentication = re.findall('认证[::]?(.*?);'.decode('utf8'), text1) 46 | url = re.findall('互联网[::]?(.*?);'.decode('utf8'), text1) 47 | 48 | informationItem["_id"] = ID 49 | if nickname and nickname[0]: 50 | informationItem["NickName"] = nickname[0].replace(u"\xa0", "") 51 | if gender and gender[0]: 52 | informationItem["Gender"] = gender[0].replace(u"\xa0", "") 53 | if place and place[0]: 54 | place = place[0].replace(u"\xa0", "").split(" ") 55 | informationItem["Province"] = place[0] 56 | if len(place) > 1: 57 | informationItem["City"] = place[1] 58 | if briefIntroduction and briefIntroduction[0]: 59 | informationItem["BriefIntroduction"] = briefIntroduction[0].replace(u"\xa0", "") 60 | if birthday and birthday[0]: 61 | try: 62 | birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d") 63 | informationItem["Birthday"] = birthday - datetime.timedelta(hours=8) 64 | except Exception: 65 | informationItem['Birthday'] = birthday[0] # 有可能是星座,而非时间 66 | if sexOrientation and sexOrientation[0]: 67 | if sexOrientation[0].replace(u"\xa0", "") == gender[0]: 68 | informationItem["SexOrientation"] = "同性恋" 69 | else: 70 | informationItem["SexOrientation"] = "异性恋" 71 | if sentiment and sentiment[0]: 72 | informationItem["Sentiment"] = sentiment[0].replace(u"\xa0", "") 73 | if vipLevel and vipLevel[0]: 74 | informationItem["VIPlevel"] = vipLevel[0].replace(u"\xa0", "") 75 | if authentication and authentication[0]: 76 | informationItem["Authentication"] = authentication[0].replace(u"\xa0", "") 77 | if url: 78 | informationItem["URL"] = url[0] 79 | 80 | try: 81 | urlothers = 
"https://weibo.cn/attgroup/opening?uid=%s" % ID 82 | r = requests.get(urlothers, cookies=response.request.cookies, timeout=5) 83 | if r.status_code == 200: 84 | selector = etree.HTML(r.content) 85 | texts = ";".join(selector.xpath('//body//div[@class="tip2"]/a//text()')) 86 | if texts: 87 | num_tweets = re.findall('微博\[(\d+)\]'.decode('utf8'), texts) 88 | num_follows = re.findall('关注\[(\d+)\]'.decode('utf8'), texts) 89 | num_fans = re.findall('粉丝\[(\d+)\]'.decode('utf8'), texts) 90 | if num_tweets: 91 | informationItem["Num_Tweets"] = int(num_tweets[0]) 92 | if num_follows: 93 | informationItem["Num_Follows"] = int(num_follows[0]) 94 | if num_fans: 95 | informationItem["Num_Fans"] = int(num_fans[0]) 96 | except Exception, e: 97 | pass 98 | except Exception, e: 99 | pass 100 | else: 101 | yield informationItem 102 | yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID, callback=self.parse_tweets, dont_filter=True) 103 | yield Request(url="https://weibo.cn/%s/follow" % ID, callback=self.parse_relationship, dont_filter=True) 104 | yield Request(url="https://weibo.cn/%s/fans" % ID, callback=self.parse_relationship, dont_filter=True) 105 | 106 | def parse_tweets(self, response): 107 | """ 抓取微博数据 """ 108 | selector = Selector(response) 109 | ID = re.findall('(\d+)/profile', response.url)[0] 110 | divs = selector.xpath('body/div[@class="c" and @id]') 111 | for div in divs: 112 | try: 113 | tweetsItems = TweetsItem() 114 | id = div.xpath('@id').extract_first() # 微博ID 115 | content = div.xpath('div/span[@class="ctt"]//text()').extract() # 微博内容 116 | cooridinates = div.xpath('div/a/@href').extract() # 定位坐标 117 | like = re.findall('赞\[(\d+)\]'.decode('utf8'), div.extract()) # 点赞数 118 | transfer = re.findall('转发\[(\d+)\]'.decode('utf8'), div.extract()) # 转载数 119 | comment = re.findall('评论\[(\d+)\]'.decode('utf8'), div.extract()) # 评论数 120 | others = div.xpath('div/span[@class="ct"]/text()').extract() # 求时间和使用工具(手机或平台) 121 | 122 | tweetsItems["_id"] = ID + "-" + id 123 | tweetsItems["ID"] = ID 124 | if content: 125 | tweetsItems["Content"] = " ".join(content).strip('[位置]'.decode('utf8')) # 去掉最后的"[位置]" 126 | if cooridinates: 127 | cooridinates = re.findall('center=([\d.,]+)', cooridinates[0]) 128 | if cooridinates: 129 | tweetsItems["Co_oridinates"] = cooridinates[0] 130 | if like: 131 | tweetsItems["Like"] = int(like[0]) 132 | if transfer: 133 | tweetsItems["Transfer"] = int(transfer[0]) 134 | if comment: 135 | tweetsItems["Comment"] = int(comment[0]) 136 | if others: 137 | others = others[0].split('来自'.decode('utf8')) 138 | tweetsItems["PubTime"] = others[0].replace(u"\xa0", "") 139 | if len(others) == 2: 140 | tweetsItems["Tools"] = others[1].replace(u"\xa0", "") 141 | yield tweetsItems 142 | except Exception, e: 143 | pass 144 | 145 | url_next = selector.xpath('body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'.decode('utf8')).extract() 146 | if url_next: 147 | yield Request(url=self.host + url_next[0], callback=self.parse_tweets, dont_filter=True) 148 | 149 | def parse_relationship(self, response): 150 | """ 打开url爬取里面的个人ID """ 151 | selector = Selector(response) 152 | if "/follow" in response.url: 153 | ID = re.findall('(\d+)/follow', response.url)[0] 154 | flag = True 155 | else: 156 | ID = re.findall('(\d+)/fans', response.url)[0] 157 | flag = False 158 | urls = selector.xpath('//a[text()="关注他" or text()="关注她"]/@href'.decode('utf')).extract() 159 | uids = re.findall('uid=(\d+)', ";".join(urls), re.S) 160 | for uid in uids: 161 | relationshipsItem = 
RelationshipsItem() 162 | relationshipsItem["Host1"] = ID if flag else uid 163 | relationshipsItem["Host2"] = uid if flag else ID 164 | yield relationshipsItem 165 | yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information) 166 | 167 | next_url = selector.xpath('//a[text()="下页"]/@href'.decode('utf8')).extract() 168 | if next_url: 169 | yield Request(url=self.host + next_url[0], callback=self.parse_relationship, dont_filter=True) 170 | -------------------------------------------------------------------------------- /src/Sina_spider3/user_agents.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | """ User-Agents """ 4 | agents = [ 5 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 6 | "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)", 7 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5", 8 | "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9", 9 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7", 10 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14", 11 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14", 12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20", 13 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27", 14 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1", 15 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2", 16 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7", 17 | "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre", 18 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10", 19 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)", 20 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5", 21 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)", 22 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 23 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 24 | "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0", 25 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2", 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1", 27 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre", 28 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )", 29 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)", 30 | "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a", 31 | "Mozilla/2.02E (Win95; U)", 32 | "Mozilla/3.01Gold (Win95; I)", 33 | "Mozilla/4.8 [en] (Windows NT 5.1; U)", 34 | "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) 
Gecko Netscape/7.1 (ax)", 35 | "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 36 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0", 37 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 38 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 39 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 40 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 41 | "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 42 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 43 | "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 44 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 45 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 46 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3", 47 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 48 | "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 49 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1", 50 | "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 51 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 52 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 53 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 54 | "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 55 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 56 | "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3", 57 | "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 58 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 
59 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 60 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 61 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 62 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 63 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 64 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 65 | "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 66 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 67 | ] 68 | -------------------------------------------------------------------------------- /src/Sina_spider3/weiboID.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | """ 初始的待爬队列 """ 4 | weiboID = [ 5 | '1797054534', '2509414473', '2611478681', '5861859392', '2011086863', '5127716917', '1259110474', '5850775634', '1886437464', 6 | '3187474530', '2191982701', '1940562032', '5874450550', '1337925752', '2081079420', '5664530558', '3493173952', '1202806915', 7 | '1864507535', '2032640064', '5585682587', '3083673764', '5342109866', '5878685868', '5728706733', '2103050415', '5876752562', 8 | '3138085045', '5775974583', '1879400644', '2417139911', '5836619975', '5353816265', '5219508427', '1766613205', '2480158031', 9 | '5660754163', '2456764664', '3637354755', '1940087047', '5508473104', '1004454162', '2930327837', '1874608417', '5379621155', 10 | '1720664360', '2714280233', '3769073964', '5624119596', '2754904375', '5710151998', '5331042630', '5748179271', '2146132305', 11 | '2313896275', '3193618787', '5743059299', '1742930277', '5310538088', '1794474362', '2798510462', '3480076671', '5678653833', 12 | '5743657357', '5460191980', '1734164880', '5876988653', '5678031258', '5860163996', '1496924574', '5878970110', '1679704482', 13 | '1142210982', '3628925351', '1196397981', '1747485107', '5675893172', '5438521785', '2192269762', '1992614343', '5878686155', 14 | '2407186895', '5559116241', '2528477652', '1295950295', '5038203354', '3659276765', '2126733792', '5878350307', '2761179623', 15 | '5484511719', '5825708520', '1578230251', '5878686190', '5810946551', '3833070073', '1795047931', '5855789570', '3580125714', 16 | '5709578773', '5236539926', '2907633071', '1709244961', '5405450788', '3251257895', '5054538290', '2713199161', '5698445883', 17 | '1784537661', '3195290182', '1824506454', '5738766939', '5565915740', '5336031840', '5098775138', '5685568105', '1774289524', 18 | '2932662914', '5433223957', '2680044311', '1111523983', '5067889432', '5878686362', '2844992161', '3878314663', '1766548141', 19 | '5763269297', '5878383287', '5235499706', '5876375670', '5866447563', '5129945819', '1704116960', '1929380581', '1223762662', 20 | '1193476843', '2899591923', '5162099453', '5072151301', '5385741066', '5411455765', '2685535005', '2297905950', '1216766752', 21 | '5838668577', 
'5359133478', '3077460103', '5577802539', '5862392623', '1786700611', '1259258694', '1845191497', '1731838797', 22 | '1740301135', '2816074584', '1217733467', '5345035105', '5050827618', '5486257001', '5767857005', '2050605943', '5733778298', 23 | '1914725244', '5872583558', '5604377483', '1253491601', '5554922386', '3170223002', '5662737311', '3217179555', '1538163622', 24 | '5304533928', '5644198830', '1896650227', '5298774966', '2795873213', '1834378177', '5769651141', '2656256971', '5876433869', 25 | '1826792401', '3002246100', '3082519511', '5780366296', '5704696797', '5204108258', '2090615793', '1739746131', '1378010100', 26 | '5741331445', '2376442895', '3638486041', '5781365789', '1827234850', '5703214121', '1855398955', '1227908142', '5703820334', 27 | ] 28 | -------------------------------------------------------------------------------- /src/Sina_spider3/yumdama.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import httplib, mimetypes, urlparse, json, time 3 | 4 | ###################################################################### 5 | 6 | # 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html 7 | # 所有函数请查询 http://www.yundama.com/apidoc 8 | 9 | # 1. http://www.yundama.com/index/reg/developer 注册开发者账号 10 | # 2. http://www.yundama.com/developer/myapp 添加新软件 11 | # 3. 使用添加的软件ID和密钥进行开发,享受丰厚分成 12 | 13 | # 用户名 14 | username = 'XXXXXX' 15 | 16 | # 密码 17 | password = 'XXXXXX' 18 | 19 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 20 | appid = 1 21 | 22 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 23 | appkey = 'XXXXXX' 24 | 25 | # 图片文件 26 | filename = 'ab.png' 27 | 28 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 29 | codetype = 1004 30 | 31 | # 超时时间,秒 32 | timeout = 60 33 | 34 | 35 | ###################################################################### 36 | 37 | class YDMHttp: 38 | apiurl = 'http://api.yundama.com/api.php' 39 | 40 | username = '' 41 | password = '' 42 | appid = '' 43 | appkey = '' 44 | 45 | def __init__(self, username, password, appid, appkey): 46 | self.username = username 47 | self.password = password 48 | self.appid = str(appid) 49 | self.appkey = appkey 50 | 51 | def request(self, fields, files=[]): 52 | try: 53 | response = post_url(self.apiurl, fields, files) 54 | response = json.loads(response) 55 | except Exception as e: 56 | response = None 57 | return response 58 | 59 | def balance(self): 60 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 61 | 'appkey': self.appkey} 62 | response = self.request(data) 63 | if (response): 64 | if (response['ret'] and response['ret'] < 0): 65 | return response['ret'] 66 | else: 67 | return response['balance'] 68 | else: 69 | return -9001 70 | 71 | def login(self): 72 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 73 | 'appkey': self.appkey} 74 | response = self.request(data) 75 | if (response): 76 | if (response['ret'] and response['ret'] < 0): 77 | return response['ret'] 78 | else: 79 | return response['uid'] 80 | else: 81 | return -9001 82 | 83 | def upload(self, filename, codetype, timeout): 84 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 85 | 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 86 | file = {'file': filename} 87 | response = self.request(data, file) 88 | if (response): 89 | if (response['ret'] and response['ret'] < 0): 90 | 
return response['ret'] 91 | else: 92 | return response['cid'] 93 | else: 94 | return -9001 95 | 96 | def result(self, cid): 97 | data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 98 | 'appkey': self.appkey, 'cid': str(cid)} 99 | response = self.request(data) 100 | return response and response['text'] or '' 101 | 102 | def decode(self, filename, codetype, timeout): 103 | cid = self.upload(filename, codetype, timeout) 104 | if (cid > 0): 105 | for i in range(0, timeout): 106 | result = self.result(cid) 107 | if (result != ''): 108 | return cid, result 109 | else: 110 | time.sleep(1) 111 | return -3003, '' 112 | else: 113 | return cid, '' 114 | 115 | 116 | ###################################################################### 117 | 118 | def post_url(url, fields, files=[]): 119 | urlparts = urlparse.urlsplit(url) 120 | return post_multipart(urlparts[1], urlparts[2], fields, files) 121 | 122 | 123 | def post_multipart(host, selector, fields, files): 124 | content_type, body = encode_multipart_formdata(fields, files) 125 | h = httplib.HTTP(host) 126 | h.putrequest('POST', selector) 127 | h.putheader('Host', host) 128 | h.putheader('Content-Type', content_type) 129 | h.putheader('Content-Length', str(len(body))) 130 | h.endheaders() 131 | h.send(body) 132 | errcode, errmsg, headers = h.getreply() 133 | return h.file.read() 134 | 135 | 136 | def encode_multipart_formdata(fields, files=[]): 137 | BOUNDARY = 'WebKitFormBoundaryJKrptX8yPbuAJLBQ' 138 | CRLF = '\r\n' 139 | L = [] 140 | for field in fields: 141 | key = field 142 | value = fields[key] 143 | L.append('--' + BOUNDARY) 144 | L.append('Content-Disposition: form-data; name="%s"' % key) 145 | L.append('') 146 | L.append(value) 147 | for field in files: 148 | key = field 149 | filepath = files[key] 150 | L.append('--' + BOUNDARY) 151 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filepath)) 152 | L.append('Content-Type: %s' % get_content_type(filepath)) 153 | L.append('') 154 | L.append(open(filepath, 'rb').read()) 155 | L.append('--' + BOUNDARY + '--') 156 | L.append('') 157 | body = CRLF.join(L) 158 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 159 | return content_type, body 160 | 161 | 162 | def get_content_type(filename): 163 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 164 | 165 | 166 | ###################################################################### 167 | 168 | 169 | def identify(): 170 | if (username == 'username'): 171 | print '请设置好相关参数再测试' 172 | else: 173 | # 初始化 174 | yundama = YDMHttp(username, password, appid, appkey) 175 | 176 | # 登陆云打码 177 | uid = yundama.login(); 178 | # print 'uid: %s' % uid 179 | 180 | # 查询余额 181 | balance = yundama.balance(); 182 | # print 'balance: %s' % balance 183 | 184 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 185 | cid, result = yundama.decode(filename, codetype, timeout); 186 | # print 'cid: %s, result: %s' % (cid, result) 187 | return result 188 | -------------------------------------------------------------------------------- /src/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/chromedriver.exe -------------------------------------------------------------------------------- /src/cookies(try).txt: -------------------------------------------------------------------------------- 1 | ['{"SUB": 
"_2A253J_OfDeRhGeBK6lYS8y_NyjuIHXVUVWJXrDV_PUNbm9BeLU77kW9NR8utTT0yzIaoF5HGd-EmRCrfeBXzKM7Q", 2 | "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt", 3 | "ALF": "1543812943", 4 | "SCF": "Ag6Ni5azmMQ957nRhDnSGJmVHxiHUapdd61kG2zCA9K9KPjccT2UsglYOS8Q8AEo0toYwdOe_F9aMaVIp3FLogA.", 5 | "ALC": "ac%3D27%26bt%3D1512276943%26cv%3D5.0%26et%3D1543812943%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D05f090c8e2e71b7d774d498801c8df66", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==", 6 | "tgc": "TGT-NjQxNDMzMTExNw==-1512276943-gz-82D6EDC638D32EF9BFFEC6DAA8FBAE03-1", 7 | "LT": "1512276943"}', 8 | '{"SUB": "_2A253J_OfDeRhGeBK6VAQ8C3FzDmIHXVUVWJXrDV_PUNbm9BeLRf3kW9NR8utfC-xU6BDb6Et1jLT7Q9nEMXvUbDL", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt", "ALF": "1543812944", "SCF": "AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3c4bd7iQCwA_8N_QyL5ZWWcbjXfIUDptjuZiDo7Oq2j9E.", "ALC": "ac%3D27%26bt%3D1512276944%26cv%3D5.0%26et%3D1543812944%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D2%26es%3Db1b7b23a680b8271df5b3f8ff0978986", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==", "tgc": "TGT-NjQyMjEwMzk3NQ==-1512276943-gz-31834B2CBDB9E531BFC2FF211A10A8C4-1", "LT": "1512276944"}'] 9 | 10 | {"SUB": "_2A253J_TsDeRhGeBK6lYS8y_NyjuIHXVUVWEkrDV_PUNbm9BeLXLZkW9NR8utTQQCqTejxWGdgx4RHgURbRfeKnlU", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt", "ALF": "1543813180", "SCF": "At5EcfdKs8RgVrq2WrBz31C3JbisCJmAnfw_7tmYLzucchfl2npE6hG6bxG7aGDtKb5dWqhxB1u5jydhLJAFvGQ.", "ALC": "ac%3D27%26bt%3D1512277180%26cv%3D5.0%26et%3D1543813180%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3Dde87590dae6f45a12243530dbf1f209d", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==", "tgc": "TGT-NjQxNDMzMTExNw==-1512277180-gz-1C384532703AE9AA3CF8E0C9D3C78530-1", "LT": "1512277180"} 11 | 12 | [{'SUB': '_2A253J96tDeRhGeBK6lYS8y_NyjuIHXVUVLdlrDV_PUNbm9BeLUSikW9NR8utTU46f9Z1Wf4a_FrrtdzGVO6eYg2J', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt', 'ALF': '1543823997', 'SCF': 'AvtM_MHAGrJD8WT8psqmc_baZii63aFFNnjUs0Kx1UZ7UxFjSWW9MODX8k2SK8S_S_4EexNVt9CwATDlx7_MaCk.', 'ALC': 'ac%3D27%26bt%3D1512287997%26cv%3D5.0%26et%3D1543823997%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D7aede323af92903f037bee59c6a2497b', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==', 'tgc': 'TGT-NjQxNDMzMTExNw==-1512287997-gz-82E1CAFE0818D0316258D0028E4781D9-1', 'LT': '1512287997'}, 13 | {'SUB': '_2A253J96uDeRhGeBK6VAQ8C3FzDmIHXVUVLdmrDV_PUNbm9BeLUnZkW9NR8utfJ8o12XYCwhV8uoqegvx8lLCjceC', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt', 'ALF': '1543823998', 'SCF': 'AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3c1vGL6dX62yPY_77XZzbZ4hzGG7RPsnTsGpOb0Ybr0o4.', 'ALC': 
'ac%3D27%26bt%3D1512287998%26cv%3D5.0%26et%3D1543823998%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D8ff4ae6bd2388a48f842a6da4cd4a07d', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==', 'tgc': 'TGT-NjQyMjEwMzk3NQ==-1512287998-gz-2E50C7C8D61D64155895A7D27ACBB3B0-1', 'LT': '1512287998'}] 14 | 15 | [{'SUB': '_2A253J8CyDeRhGeBK6lYS8y_NyjuIHXVUVLV6rDV_PUNbm9BeLWbNkW9NR8utTR2RqV0cpDAVZdu20M3Lsv62gF6H', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt', 'ALF': '1543824482', 'SCF': 'AvtM_MHAGrJD8WT8psqmc_baZii63aFFNnjUs0Kx1UZ7M8uryPp95dGAZ5dbY7nVglXtwSWFN4UbDwVyOwLKXlc.', 'ALC': 'ac%3D27%26bt%3D1512288482%26cv%3D5.0%26et%3D1543824482%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3Dda69c114d39f358c2ee0e1b023148b0b', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==', 'tgc': 'TGT-NjQxNDMzMTExNw==-1512288482-gz-796C0D50F4566A5F8BB38982D7D415EB-1', 'LT': '1512288482'}, 16 | {'SUB': '_2A253J8CzDeRhGeBK6VAQ8C3FzDmIHXVUVLV7rDV_PUNbm9BeLUbhkW9NR8utfFOAJ5M8AwzMq-OZPJ9nwOF6MOys', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt', 'ALF': '1543824483', 'SCF': 'AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3cI4SOLSi7zj_q6Q-sRCkvQ1flnpEiR49tBIINTa8-XGY.', 'ALC': 'ac%3D27%26bt%3D1512288483%26cv%3D5.0%26et%3D1543824483%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D07719a22f45017ae94e27314d78d8b77', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==', 'tgc': 'TGT-NjQyMjEwMzk3NQ==-1512288483-gz-98F65A3B74E854C842A1D6F7BA5C266B-1', 'LT': '1512288483'}] 17 | 18 | ['{"SUB": "_2A253IKJtDeRhGeBK6lYS8y_NyjuIHXVUV5SlrDV_PUNbm9BeLVL4kW9NR8utTZIZ3GE_eZIFKU8JXhMGiyLZOHUa", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt", "ALF": "1543898557", "SCF": "AvtM_MHAGrJD8WT8psqmc_baZii63aFFNnjUs0Kx1UZ7_GE9xQnS9YxrxtZ3QJfz5vilaQuQI2Rp1Y96oeNfPRo.", "ALC": "ac%3D27%26bt%3D1512362557%26cv%3D5.0%26et%3D1543898557%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D44ff4bd6c3e5f54404f51281fa6e2fe2", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==", "tgc": "TGT-NjQxNDMzMTExNw==-1512362557-gz-A15094AC1DA33790F11FA13BF4C2DBC1-1", "LT": "1512362557"}', '{"SUB": "_2A253IKJuDeRhGeBK6VAQ8C3FzDmIHXVUV5SmrDV_PUNbm9BeLVPNkW9NR8utfCIEKBaKXbjbPLQXwGCjAl9X_Rx3", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt", "ALF": "1543898558", "SCF": "AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3cUDGokgFmPgwrICSJaq-JrP7DN9tHw_8WnP71flX0Vu4.", "ALC": "ac%3D27%26bt%3D1512362558%26cv%3D5.0%26et%3D1543898558%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D22cea20a5941a2f3ffdbd8f8d756dcc8", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==", "tgc": "TGT-NjQyMjEwMzk3NQ==-1512362558-gz-4CE45D4E434B23FDDBBD66722625BA5A-1", "LT": "1512362558"}', '{"SUB": "_2A253IKJvDeRhGeNJ61UY8SzFzz-IHXVUV5SnrDV_PUNbm9BeLWT6kW9NS-ueMidblNF3-Xvz7h5b0SRkUCYNevb8", "SUBP": 
"0033WrSXqPxfM725Ws9jqgMF55529P9D9WhQYAFbpdVSPmb1zZRc_y5o5NHD95QfS05N1K2E1KB0Ws4Dqcjci--fiKyFi-isi--Ni-82iK.ci--Ni-82iK.pi--fiKnfi-iFi--Ri-z7iKy8i--ciK.ci-2f", "ALF": "1543898559", "SCF": "AiPttH0k24V2BSsqaUeK1grOIZ5cZZMUbngtrx787lf10y_gUiHtPH6_1wYTYiYU9gZ0wtjb3357yOHGS0ZUW70.", "ALC": "ac%3D27%26bt%3D1512362559%26cv%3D5.0%26et%3D1543898559%26ic%3D-611539878%26scf%3D%26uid%3D5707912943%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D4657943575c6abc01e10b0bad733a3dd", "sso_info": "v02m6alo5qztKWRk5SljoSIpZCkmKWRk5ylkJSQpY6TmKWRk5ylkJSQpY6ThKWRk5SljoOUpZCkiKWRk6ClkKOApY6ElKWRk5iljpOYpZCTlKadlqWkj5OUt4yDnLmMk4i5jYOMwA==", "tgc": "TGT-NTcwNzkxMjk0Mw==-1512362559-gz-D0E6B9753936773DCDB242BC365DB1CD-1", "LT": "1512362559"}', '{"SUB": "_2A253IKIQDeRhGeNJ61sU8y7NzTSIHXVUV5TYrDV_PUNbm9BeLVrEkW9NS-gN80urLJV2RAiHaLuXpLXAOkkZgE6E", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhI7S.CQz7ZCvg1.r4fXrmW5NHD95QfS054SKe7eKqRWs4Dqcjci--fi-8si-iWi--ciKL2iKn7i--Xi-zRiKyWi--fiKL2iKL2i--Xi-iFi-2fi--Ri-2ciKnc", "ALF": "1543898560", "SCF": "Au2fEGreoPmIQTogBV7H6aKO_QuHq6N0O4jF6CLzGnXOrGYFMTbrhzKDiwWFHTwsR08C2dinckeHrOLXkZgARxg.", "ALC": "ac%3D27%26bt%3D1512362560%26cv%3D5.0%26et%3D1543898560%26ic%3D-611539878%26scf%3D%26uid%3D5709530168%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D89ad3e1b6ef03c5773d35ee42a6fd23e", "sso_info": "v02m6alo5qztKWRk5SlkJSYpZCkhKWRk5iljpSQpY6DgKWRk5ClkKOgpY6EhKWRk5SljpSQpY6UkKWRk5ClkKSIpZCTlKWRk6ClkJOYpY6DmKadlqWkj5OUt4yDpLWMs4CxjaOgwA==", "tgc": "TGT-NTcwOTUzMDE2OA==-1512362560-gz-BBAA64432EB0BB53614714271076317E-1", "LT": "1512362560"}', '{"SUB": "_2A253IKIRDeRhGeNJ6VYS-CjEyDyIHXVUV5TZrDV_PUNbm9BeLVb3kW9NS_Amb56o_iwCfT4LciqjB2egwTyGlOOa", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WWaJu4.SN-wOGkOfNOlLGni5NHD95QfS0zXe0nc1he7Ws4Dqcjci--ciKL8i-zci--Ri-88i-24i--ci-8hi-2Ei--fiK.ciKLhi--ciK.Ri-8si--4iKL2iKL8", "ALF": "1543898561", "SCF": "AvFTp3DMCcec05kPPAss21MmHrXolkjrDf7ZZdz5O9_jLhgz5JjWNh8HVo6JNI9p9Xl7fphmaq_Yr5YMcTMj8d0.", "ALC": "ac%3D19%26bt%3D1512362561%26cv%3D5.0%26et%3D1543898561%26ic%3D-611539878%26scf%3D%26uid%3D5724386830%26vf%3D1%26vs%3D0%26vt%3D4%26es%3De9f4b38125173257023bd932942cde44", "sso_info": "v02m6alo5qztKWRk5iljpSUpZCjmKWRk6ClkJSUpZCTpKWRk5ilkJSMpZCTiKWRk5SljpOYpY6UjKWRk5iljpOgpZCUmKWRk6SljpSQpY6UlKadlqWkj5OUt4yjkLOOg5i4jLOAwA==", "tgc": "TGT-NTcyNDM4NjgzMA==-1512362561-gz-4CB2B2AA3612BABE7DC1422ACC113933-1", "LT": "1512362561"}', '{"SUB": "_2A253IKISDeRhGeNJ6lIV9y3KyTqIHXVUV5TarDV_PUNbm9BeLVTMkW9NS-jWtQFQa3jT016M--usQjNo35yKKpbN", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4AbLNhGKLG81j0opnQ1iX5NHD95QfS027ShM0SozcWs4Dqcjci--Ri-z7iKLhi--fi-82iKL2i--ci-z4iK.ci--ciKLhi-8hi--ciKL2iKysi--ciKyWi-ih", "ALF": "1543898562", "SCF": "AnUo8TBZrAHEYykaGzvHxD4_Vmse4peKtNi0sDOfXy8zDRlBQspGQIaBIfWmUaoFS738RoE9ROl3nriS2FAAETY.", "ALC": "ac%3D27%26bt%3D1512362562%26cv%3D5.0%26et%3D1543898562%26ic%3D-611539878%26scf%3D%26uid%3D5710473626%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D1f0656c8485bd122d6da6d8bcf291877", "sso_info": "v02m6alo5qztKWRk6ClkKOApY6UjKWRk5SlkJSQpY6UkKWRk5ilkKOkpY6TmKWRk5iljpSMpZCUjKWRk5iljpSQpY6EmKWRk5iljoSEpZCkjKadlqWkj5OUt4yTgLSNs4y2jKOYwA==", "tgc": "TGT-NTcxMDQ3MzYyNg==-1512362562-gz-F0A9EC765F0AE8EFEE62ECB62C8F4E11-1", "LT": "1512362562"}'] -------------------------------------------------------------------------------- /src/launch.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute("scrapy crawl SinaSpider".split()) 4 | -------------------------------------------------------------------------------- 
/src/mysql.py: -------------------------------------------------------------------------------- 1 | import MySQLdb 2 | conn=MySQLdb.connect(host='localhost', 3 | user='root', 4 | passwd='chen960212', 5 | db='sinaweibo') 6 | cursor = conn.cursor() 7 | cursor.execute ("SELECT VERSION()") 8 | row = cursor.fetchone () 9 | print "server version:", row[0] 10 | cursor.close() 11 | conn.close() 12 | 13 | ''' 14 | #Python 2.7.6 (default, Nov 10 2013, 19:24:24) [MSC v.1500 64 bit (AMD64)] on win32 15 | import MySQLdb 16 | conn = MySQLdb.connect(host='localhost',port=3306,user='root',passwd='chen960212',db='test') 17 | cur = conn.cursor() 18 | cur.execute('select `title`, `text` from `entries` limit 10') 19 | 2L 20 | cur.fetchall() 21 | (('bokeyuan', 'bokeyuan text...'), ('google translate', 'google translate text...')) 22 | cur.close() 23 | conn.close() 24 | ''' 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /src/pipelines.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | # __________________________________________ 4 | # 增加了向Mysql数据库中保存pipeline 5 | # 需要有MysqlDB,同时修改Spider文件,增加Item类所有变量的if else的返回值,使得可以标准化存储 6 | # Updated by Charles Yan 7 | # Date:2017.1.4 8 | # Added Mysql insert method 9 | # ------------------------------------------ 10 | 11 | import pymongo 12 | from Sina_spider3.items import InformationItem, TweetsItem, RelationshipsItem 13 | import MySQLdb 14 | 15 | 16 | 17 | class MysqlDBPipleline(object): 18 | def __init__(self): 19 | self.count = 1 20 | self.conn = MySQLdb.connect( 21 | host='localhost', 22 | port=3306, 23 | user='root', 24 | #这里填写密码 25 | passwd='chen960212', 26 | db='sinaweibo', 27 | charset='utf8', 28 | ) 29 | self.cur = self.conn.cursor() 30 | 31 | def process_item(self, item, spider): 32 | """ 判断item的类型,并作相应的处理,再入数据库 """ 33 | if isinstance(item, RelationshipsItem): 34 | try: 35 | print("***********at beginning of saving**********") 36 | print(dict(item)) 37 | sql = '' 38 | sql+=str('INSERT INTO SinaWeibo.Relationship (`Host1`,`Host2`) ') 39 | sql+=str(' Values(\'' ) 40 | sql+=str(item['Host1']) 41 | print(sql) 42 | sql+=str('\', \'') 43 | sql+=str(item['Host2']) 44 | sql+=str('\')') 45 | print("*********** SQL SYNTAX *********** ") 46 | print(''.join(sql)) 47 | self.cur.execute(sql) 48 | self.conn.commit() 49 | print("saved") 50 | self.count = self.count +1 51 | print(self.count) 52 | except Exception: 53 | pass 54 | elif isinstance(item, TweetsItem): 55 | try: 56 | print("***********at beginning of saving**********") 57 | 58 | sql = '' 59 | sql+=str('INSERT INTO SinaWeibo.Tweets (`weibo_id`,`User_id`,`Content`,`Pubtime`,`Coordinates`,`Tools`,`Likes`,`Comments`,`Transfers`) ') 60 | sql+=str(' Values(\'' ) 61 | sql+=str(item['_id']) 62 | 63 | sql+=str('\', \'') 64 | sql+=str(item['ID']) 65 | sql+=str('\', \'') 66 | sql+=str(item['Content']) 67 | sql+=str('\', \'') 68 | sql+=str(item['PubTime']) 69 | 70 | sql+=str('\', \'') 71 | 72 | sql+=str(item['Co_oridinates']) 73 | 74 | sql+=str('\', \'') 75 | sql+=str(item['Tools']) 76 | print(sql) 77 | sql+=str('\', \'') 78 | sql+=str(item['Like']) 79 | sql+=str('\', \'') 80 | sql+=str(item['Comment']) 81 | sql+=str('\', \'') 82 | sql+=str(item['Transfer']) 83 | sql+=str('\')') 84 | print("*********** SQL SYNTAX *********** ") 85 | print(''.join(sql)) 86 | self.cur.execute(sql) 87 | self.conn.commit() 88 | print("saved") 89 | self.count = self.count +1 90 | print(self.count) 91 | except Exception: 92 | pass 93 | 
elif isinstance(item, InformationItem): 94 | try: 95 | print("***********at beginning of saving**********") 96 | 97 | sql = '' 98 | sql+=str('INSERT INTO SinaWeibo.Information (`User_id`,`NickName`,`Gender`,`Province`,`City`,`BriefIntroduction`,`Birthday`,`Num_Tweets`,`Num_Follows`,`Num_Fans`,`SexOrientation`,`Sentiment`,`VIPlevel`,`Authentication`,`URL`) ') 99 | sql+=str(' Values(\'' ) 100 | sql+=str(item['_id']) 101 | 102 | sql+=str('\', \'') 103 | sql+=str(item['NickName']) 104 | sql+=str('\', \'') 105 | sql+=str(item['Gender']) 106 | sql+=str('\', \'') 107 | sql+=str(item['Province']) 108 | 109 | sql+=str('\', \'') 110 | sql+=str(item['City']) 111 | sql+=str('\', \'') 112 | sql+=str(item['BriefIntroduction']) 113 | sql+=str('\', \'') 114 | print(sql) 115 | sql+=str(item['Birthday']) 116 | sql+=str('\', \'') 117 | sql+=str(item['Num_Tweets']) 118 | 119 | sql+=str('\', \'') 120 | sql+=str(item['Num_Follows']) 121 | sql+=str('\', \'') 122 | sql+=str(item['Num_Fans']) 123 | sql+=str('\', \'') 124 | 125 | sql+=str(item['SexOrientation']) 126 | sql+=str('\', \'') 127 | sql+=str(item['Sentiment']) 128 | 129 | sql+=str('\', \'') 130 | sql+=str(item['VIPlevel']) 131 | sql+=str('\', \'') 132 | sql+=str(item['Authentication']) 133 | sql+=str('\', \'') 134 | sql+=str(item['URL']) 135 | sql+=str('\')') 136 | 137 | print("*********** SQL SYNTAX *********** ") 138 | print(''.join(sql)) 139 | self.cur.execute(sql) 140 | self.conn.commit() 141 | print("saved") 142 | self.count = self.count +1 143 | print(self.count) 144 | except Exception: 145 | pass 146 | 147 | ##在Java开发中,Dao连接会对内存溢出,需要定时断开重连,这里不清楚是否需要,先加上了 148 | if self.count == 1000: 149 | print("try reconnecting") 150 | self.count = 0 151 | self.cur.close() 152 | self.conn.close() 153 | self.conn = MySQLdb.connect( 154 | host='localhost', 155 | port=3306, 156 | user='root', 157 | passwd='***', 158 | db='SinaWeibo', 159 | charset='utf8', 160 | ) 161 | self.cur = self.conn.cursor() 162 | print("reconnect") 163 | 164 | return item 165 | 166 | 167 | 168 | class MongoDBPipleline(object): 169 | def __init__(self): 170 | clinet = pymongo.MongoClient("localhost", 27017) 171 | db = clinet["Sina"] 172 | self.Information = db["Information"] 173 | self.Tweets = db["Tweets"] 174 | self.Relationships = db["Relationships"] 175 | 176 | def process_item(self, item, spider): 177 | """ 判断item的类型,并作相应的处理,再入数据库 """ 178 | if isinstance(item, RelationshipsItem): 179 | try: 180 | self.Relationships.insert(dict(item)) 181 | except Exception: 182 | pass 183 | elif isinstance(item, TweetsItem): 184 | try: 185 | self.Tweets.insert(dict(item)) 186 | except Exception: 187 | pass 188 | elif isinstance(item, InformationItem): 189 | try: 190 | self.Information.insert(dict(item)) 191 | except Exception: 192 | pass 193 | return item 194 | -------------------------------------------------------------------------------- /src/readme..txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/readme..txt -------------------------------------------------------------------------------- /src/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Sina_spider3.settings 8 | 9 | [deploy] 10 | #url 
= http://localhost:6800/ 11 | project = Sina_spider3 12 | -------------------------------------------------------------------------------- /src/sql语句.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/sql语句.sql -------------------------------------------------------------------------------- /src/sql语句2.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `relationship` ( 2 | `Host1` VARCHAR(50) NULL DEFAULT NULL, 3 | `Host2` VARCHAR(50) NULL DEFAULT NULL 4 | ) 5 | COLLATE='utf8_general_ci' 6 | ENGINE=InnoDB 7 | ; 8 | SHOW CREATE TABLE `sinaweibo`.`relationship`; 9 | SHOW CREATE TABLE `sinaweibo`.`tweets`; 10 | SHOW CREATE TABLE `sinaweibo`.`relationship`; 11 | SHOW CREATE TABLE `sinaweibo`.`information`; 12 | 13 | 14 | 15 | CREATE TABLE `information` ( 16 | `User_id` VARCHAR(50) NULL DEFAULT NULL, 17 | `NickName` VARCHAR(50) NULL DEFAULT NULL, 18 | `Gender` VARCHAR(50) NULL DEFAULT NULL, 19 | `Province` VARCHAR(50) NULL DEFAULT NULL, 20 | `City` VARCHAR(50) NULL DEFAULT NULL, 21 | `BriefIntroduction` VARCHAR(50) NULL DEFAULT NULL, 22 | `Birthday` VARCHAR(50) NULL DEFAULT NULL, 23 | `Num_Tweets` VARCHAR(50) NULL DEFAULT NULL, 24 | `Num_Follows` VARCHAR(50) NULL DEFAULT NULL, 25 | `Num_Fans` VARCHAR(50) NULL DEFAULT NULL, 26 | `SexOrientation` VARCHAR(50) NULL DEFAULT NULL, 27 | `Sentiment` VARCHAR(50) NULL DEFAULT NULL, 28 | `VIPlevel` VARCHAR(50) NULL DEFAULT NULL, 29 | `Authentication` VARCHAR(50) NULL DEFAULT NULL, 30 | `URL` VARCHAR(50) NULL DEFAULT NULL 31 | ) 32 | COLLATE='utf8_general_ci' 33 | ENGINE=InnoDB 34 | ; 35 | 36 | 37 | CREATE TABLE `tweets` ( 38 | `weibo_id` VARCHAR(50) NULL DEFAULT NULL, 39 | `User_id` VARCHAR(50) NULL DEFAULT NULL, 40 | `Content` VARCHAR(50) NULL DEFAULT NULL, 41 | `Pubtime` VARCHAR(50) NULL DEFAULT NULL, 42 | `Coordinates` VARCHAR(50) NULL DEFAULT NULL, 43 | `Tools` VARCHAR(50) NULL DEFAULT NULL, 44 | `Likes` INT(11) NULL DEFAULT NULL, 45 | `Comments` INT(11) NULL DEFAULT NULL, 46 | `Transfers` INT(11) NULL DEFAULT NULL 47 | ) 48 | COLLATE='utf8_general_ci' 49 | ENGINE=InnoDB 50 | ; 51 | -------------------------------------------------------------------------------- /src/用到的工具.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/用到的工具.txt -------------------------------------------------------------------------------- /visio制图/E-R图_Tweets.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/E-R图_Tweets.vsdx -------------------------------------------------------------------------------- /visio制图/E-R图_information.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/E-R图_information.vsdx -------------------------------------------------------------------------------- /visio制图/E-R图_relationships.vsdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/E-R图_relationships.vsdx -------------------------------------------------------------------------------- /visio制图/数据库概念模型E-R图.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/数据库概念模型E-R图.vsdx -------------------------------------------------------------------------------- /宋少忠_毕业论文终稿查重版陈巍瑜_大雅详细报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/宋少忠_毕业论文终稿查重版陈巍瑜_大雅详细报告.pdf -------------------------------------------------------------------------------- /开题答辩报告/开题报告1稿.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/开题答辩报告/开题报告1稿.docx -------------------------------------------------------------------------------- /开题答辩报告/开题报告2稿.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/开题答辩报告/开题报告2稿.docx -------------------------------------------------------------------------------- /毕业论文终稿.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/毕业论文终稿.doc -------------------------------------------------------------------------------- /毕业设计微博json数据.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/毕业设计微博json数据.rar -------------------------------------------------------------------------------- /毕设答辩pt.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/毕设答辩pt.pptx -------------------------------------------------------------------------------- /论文二稿/readme..txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/readme..txt -------------------------------------------------------------------------------- /论文二稿/摘要与关键字.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/摘要与关键字.docx -------------------------------------------------------------------------------- /论文二稿/第一章/论文初稿_绪论.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第一章/论文初稿_绪论.docx -------------------------------------------------------------------------------- /论文二稿/第七章/第7章结论与展望.docx: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第七章/第7章结论与展望.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.1_需求.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.1_需求.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.2.1_非关系型数据库mongodb及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.2.1_非关系型数据库mongodb及其搭建.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.2.4_redis简介及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.2.4_redis简介及其搭建.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.3_Scrapy框架.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.3_Scrapy框架.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.4_Srcapy+redis架构.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.4_Srcapy+redis架构.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.1_爬虫的分类与作用.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.1_爬虫的分类与作用.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.2_http协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.2_http协议.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.3_rebots协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.3_rebots协议.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.4_爬虫搜索策略-防止环路的出现.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.4_爬虫搜索策略-防止环路的出现.docx
--------------------------------------------------------------------------------
/论文二稿/第五章/第五章测试.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第五章/第五章测试.docx
--------------------------------------------------------------------------------
/论文二稿/第六章/6.1_数据模型.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第六章/6.1_数据模型.docx
--------------------------------------------------------------------------------
/论文二稿/第六章/6.2_数据分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第六章/6.2_数据分析.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.1_微博移动版web分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.1_微博移动版web分析.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.2_User-agent伪装.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.2_User-agent伪装.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.3_信息过滤规则-正则表达式.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.3_信息过滤规则-正则表达式.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.4_查重.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.4_查重.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.5_反爬技术.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.5_反爬技术.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.6_Cookie池.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.6_Cookie池.docx
--------------------------------------------------------------------------------
/论文二稿/致谢.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/致谢.docx
--------------------------------------------------------------------------------
/论文二稿/草稿.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/草稿.docx
--------------------------------------------------------------------------------
/论文二稿/论文初稿_参考文献.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/论文初稿_参考文献.docx
--------------------------------------------------------------------------------
/论文二稿/论文初稿_目录.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/论文初稿_目录.docx
--------------------------------------------------------------------------------
/论文二稿/题目.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/题目.docx
--------------------------------------------------------------------------------
/论文初稿/第一章/论文初稿_绪论.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第一章/论文初稿_绪论.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.1.1_非关系型数据库mongodb及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.1.1_非关系型数据库mongodb及其搭建.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.1.4_redis简介及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.1.4_redis简介及其搭建.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.2_Scrapy框架.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.2_Scrapy框架.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.3_Srcapy+redis架构.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.3_Srcapy+redis架构.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.1_爬虫的分类与作用.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.1_爬虫的分类与作用.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.2_http协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.2_http协议.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.3_rebots协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.3_rebots协议.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.4_微博移动版web分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.4_微博移动版web分析.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.5_User-agent伪装.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.5_User-agent伪装.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.6_信息过滤规则-正则表达式.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.6_信息过滤规则-正则表达式.docx
--------------------------------------------------------------------------------
/论文初稿/第五章/5.1_数据模型.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第五章/5.1_数据模型.docx
--------------------------------------------------------------------------------
/论文初稿/第五章/5.2_数据分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第五章/5.2_数据分析.docx
--------------------------------------------------------------------------------
/论文初稿/第六章/论文初稿_总结与展望.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第六章/论文初稿_总结与展望.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.1_爬虫搜索策略-防止环路的出现.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.1_爬虫搜索策略-防止环路的出现.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.2_查重.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.2_查重.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.3_反爬技术.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.3_反爬技术.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.4_Cookie池.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.4_Cookie池.docx
--------------------------------------------------------------------------------
/论文初稿/论文初稿_参考文献.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/论文初稿_参考文献.docx
--------------------------------------------------------------------------------
/论文初稿/论文初稿_目录.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/论文初稿_目录.docx
--------------------------------------------------------------------------------
/论文初稿/附录/环境.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/附录/环境.txt
--------------------------------------------------------------------------------
/论文改一.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文改一.docx
--------------------------------------------------------------------------------
/论文改二.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文改二.docx
--------------------------------------------------------------------------------