├── .gitignore
├── LICENSE
├── README.md
├── readme..txt
├── src
├── .gitignore
├── Sina_spider3
│   ├── __init__.py
│   ├── chromedriver.exe
│   ├── cleanRedis
│   ├── cookies.py
│   ├── items.py
│   ├── middleware.py
│   ├── pipelines.py
│   ├── scrapy_redis
│   │   ├── __init__.py
│   │   ├── connection.py
│   │   ├── dupefilter.py
│   │   ├── pipelines.py
│   │   ├── queue.py
│   │   ├── scheduler.py
│   │   ├── spiders.py
│   │   └── tests.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── sinaSpider.py
│   ├── user_agents.py
│   ├── weiboID.py
│   └── yumdama.py
├── chromedriver.exe
├── cookies(try).txt
├── launch.py
├── mysql.py
├── pipelines.py
├── readme..txt
├── scrapy.cfg
├── sql语句.sql
├── sql语句2.sql
└── 用到的工具.txt
├── visio制图
├── E-R图_Tweets.vsdx
├── E-R图_information.vsdx
├── E-R图_relationships.vsdx
└── 数据库概念模型E-R图.vsdx
├── 宋少忠_毕业论文终稿查重版陈巍瑜_大雅详细报告.pdf
├── 开题答辩报告
├── 开题报告1稿.docx
└── 开题报告2稿.docx
├── 毕业论文终稿.doc
├── 毕业设计微博json数据.rar
├── 毕设答辩pt.pptx
├── 论文二稿
├── readme..txt
├── 摘要与关键字.docx
├── 第一章
│   └── 论文初稿_绪论.docx
├── 第七章
│   └── 第7章结论与展望.docx
├── 第三章
│   ├── 3.1_需求.docx
│   ├── 3.2.1_非关系型数据库mongodb及其搭建.docx
│   ├── 3.2.4_redis简介及其搭建.docx
│   ├── 3.3_Scrapy框架.docx
│   └── 3.4_Srcapy+redis架构.docx
├── 第二章
│   ├── 2.1_爬虫的分类与作用.docx
│   ├── 2.2_http协议.docx
│   ├── 2.3_rebots协议.docx
│   └── 2.4_爬虫搜索策略-防止环路的出现.docx
├── 第五章
│   └── 第五章测试.docx
├── 第六章
│   ├── 6.1_数据模型.docx
│   └── 6.2_数据分析.docx
├── 第四章
│   ├── 4.1_微博移动版web分析.docx
│   ├── 4.2_User-agent伪装.docx
│   ├── 4.3_信息过滤规则-正则表达式.docx
│   ├── 4.4_查重.docx
│   ├── 4.5_反爬技术.docx
│   └── 4.6_Cookie池.docx
├── 致谢.docx
├── 草稿.docx
├── 论文初稿_参考文献.docx
├── 论文初稿_目录.docx
└── 题目.docx
├── 论文初稿
├── 第一章
│   └── 论文初稿_绪论.docx
├── 第三章
│   ├── 3.1.1_非关系型数据库mongodb及其搭建.docx
│   ├── 3.1.4_redis简介及其搭建.docx
│   ├── 3.2_Scrapy框架.docx
│   └── 3.3_Srcapy+redis架构.docx
├── 第二章
│   ├── 2.1_爬虫的分类与作用.docx
│   ├── 2.2_http协议.docx
│   ├── 2.3_rebots协议.docx
│   ├── 2.4_微博移动版web分析.docx
│   ├── 2.5_User-agent伪装.docx
│   └── 2.6_信息过滤规则-正则表达式.docx
├── 第五章
│   ├── 5.1_数据模型.docx
│   └── 5.2_数据分析.docx
├── 第六章
│   └── 论文初稿_总结与展望.docx
├── 第四章
│   ├── 4.1_爬虫搜索策略-防止环路的出现.docx
│   ├── 4.2_查重.docx
│   ├── 4.3_反爬技术.docx
│   └── 4.4_Cookie池.docx
├── 论文初稿_参考文献.docx
├── 论文初稿_目录.docx
└── 附录
│   └── 环境.txt
├── 论文改一.docx
└── 论文改二.docx
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Python爬取网易音乐的网络爬虫 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed_Web_Spider 2 | (本科毕业设计)基于网络爬虫的数据分析系统的实现: 用 python2.7+Scrapy-Redis 分布式架构下的网络爬虫,用 json 编码+Cookies 池+搜索策略BFS+破解验证码+布隆过滤器+对抗AJAX, Redis 放于内存中去重队列并且实现断点继爬而 Mongodb 做磁盘持久化,数据采集微博移动版 web 用户信息关系数据等等共 400w 条数据. 
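For reference, the Redis-side de-duplication mentioned above is implemented as a per-user-id bitmap rather than a classic multi-hash Bloom filter: every crawled Weibo uid is mapped to one bit under the `SinaSpider:dupefilter*` keys (see `src/Sina_spider3/scrapy_redis/dupefilter.py` and the `FILTER_*` options in `settings.py`), which is what lets the visited set live in Redis memory and survive a restart together with `SCHEDULER_PERSIST = True`. A minimal sketch of that idea, assuming a local Redis instance and the project's key prefix:

```python
import redis

# One bit per user id, split across 4-billion-bit bitmaps the same way the
# project's RFPDupeFilter does it (getbit/setbit keyed on the numeric uid).
r = redis.Redis('localhost', 6379, db=0)   # FILTER_HOST / FILTER_PORT / FILTER_DB

def seen(uid, prefix='SinaSpider:dupefilter'):
    """Return True if this uid was already crawled; otherwise mark it and return False."""
    uid = int(uid)
    key = '%s%d' % (prefix, uid // 4000000000)   # -> dupefilter0, dupefilter1, ...
    offset = uid % 4000000000
    if r.getbit(key, offset):
        return True
    r.setbit(key, offset, 1)   # read and write are not atomic here; fine for a sketch
    return False
```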
3 | -------------------------------------------------------------------------------- /readme..txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/readme..txt -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask instance folder 58 | instance/ 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | /.idea 85 | /.setting 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # Created by .ignore support plugin (hsz.mobi) 94 | -------------------------------------------------------------------------------- /src/Sina_spider3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/Sina_spider3/__init__.py -------------------------------------------------------------------------------- /src/Sina_spider3/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/Sina_spider3/chromedriver.exe -------------------------------------------------------------------------------- /src/Sina_spider3/cleanRedis: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | # ------------------------------------------ 3 | # 作用:清空Redis数据,重新跑数据时用。 4 | # 5 | # ------------------------------------------ 6 | 7 | import settings 8 | import redis 9 | 10 | if __name__ == '__main__': 11 | try: 12 | rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT, settings.REDIS_DB) 13 | except Exception: 14 | rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT) 15 | 16 | try: 17 | rconn_filter = redis.Redis(settings.FILTER_HOST, settings.FILTER_PORT, settings.FILTER_DB) 18 | except Exception: 19 | try: 20 | rconn_filter = 
redis.Redis(settings.FILTER_HOST, settings.FILTER_PORT) 21 | except Exception: 22 | rconn_filter = None 23 | 24 | if rconn: 25 | if 'SinaSpider:requests' in rconn.keys(): 26 | rconn.delete('SinaSpider:requests') 27 | 28 | if rconn_filter: 29 | if 'SinaSpider:dupefilter0' in rconn.keys(): 30 | rconn.delete('SinaSpider:dupefilter0') 31 | if 'SinaSpider:dupefilter1' in rconn.keys(): 32 | rconn.delete('SinaSpider:dupefilter1') 33 | 34 | print 'Finish!' 35 | -------------------------------------------------------------------------------- /src/Sina_spider3/cookies.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | # ------------------------------------------ 4 | ''' 5 | import base64 6 | import os 7 | import requests 8 | import time 9 | import json 10 | from selenium import webdriver 11 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 12 | import logging 13 | from yumdama import identify 14 | 15 | IDENTIFY = 1 # 验证码输入方式: 1:看截图aa.png,手动输入 2:云打码 16 | COOKIE_GETWAY = 0 # 0 代表从https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18) 获取cookie # 1 代表从https://weibo.cn/login/获取Cookie 17 | dcap = dict(DesiredCapabilities.PHANTOMJS) # PhantomJS需要使用老版手机的user-agent,不然验证码会无法通过 18 | dcap["phantomjs.page.settings.userAgent"] = ( 19 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 20 | ) 21 | logger = logging.getLogger(__name__) 22 | logging.getLogger("selenium").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING,太烦人 23 | 24 | """ 25 | 输入你的微博账号和密码,可去淘宝买,一元5个。 26 | 建议买几十个,实际生产建议100+,微博反爬得厉害,太频繁了会出现302转移。 27 | """ 28 | myWeiBo = [ 29 | ('13467408430', 'aogan571'), 30 | ('15774109579','bbx4768'), 31 | 32 | ('17877727541','bbx3464'), 33 | 34 | ('15898562769','bbx8712'), 35 | 36 | ('18407320608','bbx2145'), 37 | 38 | ('15973424313','bbx8431'), 39 | 40 | ('13762898341','bbx3186'), 41 | 42 | ('18374112533','bbx9829'), 43 | 44 | ('15274883774','bbx8748'), 45 | 46 | ('13873384591','bbx7247'), 47 | 48 | ('13974708834','bbx2579'), 49 | 50 | ('18474777738','bbx3957'), 51 | 52 | ('18397779843','bbx4491'), 53 | 54 | ('15197752390','bbx1831'), 55 | 56 | ('15273563186','bbx9756') 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | ] 72 | 73 | 74 | def getCookie(account, password): 75 | if COOKIE_GETWAY == 0: 76 | return get_cookie_from_login_sina_com_cn(account, password) 77 | elif COOKIE_GETWAY ==1: 78 | return get_cookie_from_weibo_cn(account, password) 79 | else: 80 | logger.error("COOKIE_GETWAY Error!") 81 | 82 | def get_cookie_from_login_sina_com_cn(account, password): 83 | """ 获取一个账号的Cookie """ 84 | loginURL = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)" 85 | username = base64.b64encode(account.encode("utf-8")).decode("utf-8") 86 | postData = { 87 | "entry": "sso", 88 | "gateway": "1", 89 | "from": "null", 90 | "savestate": "30", 91 | "useticket": "0", 92 | "pagerefer": "", 93 | "vsnf": "1", 94 | "su": username, 95 | "service": "sso", 96 | "sp": password, 97 | "sr": "1440*900", 98 | "encoding": "UTF-8", 99 | "cdult": "3", 100 | "domain": "sina.com.cn", 101 | "prelt": "0", 102 | "returntype": "TEXT", 103 | } 104 | session = requests.Session() 105 | r = session.post(loginURL, data=postData) 106 | jsonStr = r.content.decode("gbk") 107 | info = json.loads(jsonStr) 108 | if info["retcode"] == "0": 109 | logger.warning("Get Cookie Success!( Account:%s )" % account) 110 | cookie = 
session.cookies.get_dict() 111 | return json.dumps(cookie) 112 | else: 113 | logger.warning("Failed!( Reason:%s )" % info["reason"]) 114 | return "" 115 | 116 | 117 | def get_cookie_from_weibo_cn(account, password): 118 | """ 获取一个账号的Cookie """ 119 | try: 120 | browser = webdriver.PhantomJS(desired_capabilities=dcap) 121 | browser.get("https://weibo.cn/login/") 122 | time.sleep(1) 123 | 124 | failure = 0 125 | while "微博" in browser.title and failure < 5: 126 | failure += 1 127 | browser.save_screenshot("aa.png") 128 | username = browser.find_element_by_name("mobile") 129 | username.clear() 130 | username.send_keys(account) 131 | 132 | psd = browser.find_element_by_xpath('//input[@type="password"]') 133 | psd.clear() 134 | psd.send_keys(password) 135 | try: 136 | code = browser.find_element_by_name("code") 137 | code.clear() 138 | if IDENTIFY == 1: 139 | code_txt = raw_input("请查看路径下新生成的aa.png,然后输入验证码:") # 手动输入验证码 140 | else: 141 | from PIL import Image 142 | img = browser.find_element_by_xpath('//form[@method="post"]/div/img[@alt="请打开图片显示"]') 143 | x = img.location["x"] 144 | y = img.location["y"] 145 | im = Image.open("aa.png") 146 | im.crop((x, y, 100 + x, y + 22)).save("ab.png") # 剪切出验证码 147 | code_txt = identify() # 验证码打码平台识别 148 | code.send_keys(code_txt) 149 | except Exception, e: 150 | pass 151 | 152 | #commit = browser.find_element_by_name("submit") 153 | commit = browser.find_element_by_xpath('//a[@id="loginAction"]') 154 | 155 | commit.click() 156 | time.sleep(3) 157 | if "我的首页" not in browser.title: 158 | time.sleep(4) 159 | if '未激活微博' in browser.page_source: 160 | print '账号未开通微博' 161 | return {} 162 | 163 | cookie = {} 164 | if "我的首页" in browser.title: 165 | for elem in browser.get_cookies(): 166 | cookie[elem["name"]] = elem["value"] 167 | logger.warning("Get Cookie Success!( Account:%s )" % account) 168 | return json.dumps(cookie) 169 | except Exception, e: 170 | logger.warning("Failed %s!" % account) 171 | return "" 172 | finally: 173 | try: 174 | browser.quit() 175 | except Exception, e: 176 | pass 177 | 178 | 179 | def initCookie(rconn, spiderName): 180 | """ 获取所有账号的Cookies,存入Redis。如果Redis已有该账号的Cookie,则不再获取。 """ 181 | for weibo in myWeiBo: 182 | if rconn.get("%s:Cookies:%s--%s" % (spiderName, weibo[0], weibo[1])) is None: # 'SinaSpider:Cookies:账号--密码',为None即不存在。 183 | cookie = getCookie(weibo[0], weibo[1]) 184 | if len(cookie) > 0: 185 | rconn.set("%s:Cookies:%s--%s" % (spiderName, weibo[0], weibo[1]), cookie) 186 | cookieNum = "".join(rconn.keys()).count("SinaSpider:Cookies") 187 | logger.warning("The num of the cookies is %s" % cookieNum) 188 | if cookieNum == 0: 189 | logger.warning('Stopping...') 190 | os.system("pause") 191 | 192 | 193 | def updateCookie(accountText, rconn, spiderName): 194 | """ 更新一个账号的Cookie """ 195 | account = accountText.split("--")[0] 196 | password = accountText.split("--")[1] 197 | cookie = getCookie(account, password) 198 | if len(cookie) > 0: 199 | logger.warning("The cookie of %s has been updated successfully!" % account) 200 | rconn.set("%s:Cookies:%s" % (spiderName, accountText), cookie) 201 | else: 202 | logger.warning("The cookie of %s updated failed! Remove it!" 
% accountText) 203 | removeCookie(accountText, rconn, spiderName) 204 | 205 | 206 | def removeCookie(accountText, rconn, spiderName): 207 | """ 删除某个账号的Cookie """ 208 | rconn.delete("%s:Cookies:%s" % (spiderName, accountText)) 209 | cookieNum = "".join(rconn.keys()).count("SinaSpider:Cookies") 210 | logger.warning("The num of the cookies left is %s" % cookieNum) 211 | if cookieNum == 0: 212 | logger.warning("Stopping...") 213 | os.system("pause") 214 | 215 | 216 | # encoding=utf-8 217 | 218 | ''' 219 | import base64 220 | import requests 221 | import sys 222 | import time 223 | from selenium import webdriver 224 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 225 | from selenium.webdriver import ActionChains 226 | from selenium.webdriver.common.by import By 227 | from selenium.webdriver.support.ui import WebDriverWait 228 | from selenium.webdriver.support import expected_conditions as EC 229 | import logging 230 | from yumdama import identify 231 | import json 232 | 233 | reload(sys) 234 | sys.setdefaultencoding('utf-8') 235 | IDENTIFY = 1 # 验证码输入方式: 1:看截图aa.png,手动输入 2:云打码 236 | COOKIE_GETWAY =2 # 0 从https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18) 获取cookie # 1 从https://weibo.cn/login/获取Cookie # 2 使用chromedriver获取 237 | dcap = dict(DesiredCapabilities.PHANTOMJS) # PhantomJS需要使用老版手机的user-agent,不然验证码会无法通过 238 | dcap["phantomjs.page.settings.userAgent"] = ( 239 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 240 | ) 241 | logger = logging.getLogger(__name__) 242 | logging.getLogger("selenium").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING,太烦人 243 | 244 | 245 | """ 246 | 输入你的微博账号和密码,可去淘宝买。 247 | 建议买几十个,微博限制的严,太频繁了会出现302转移。 248 | 或者你也可以把时间间隔调大点。 249 | """ 250 | 251 | myWeiBo = [ 252 | # {'no': '18342808545','psw':'989527tx'}, 这些被封号了 253 | # {'no': '13655458602','psw':'943715tx'}, 254 | # {'no': '15529387149','psw':'222449sl'}, 255 | # {'no': '15273181439','psw':'222449sl'}, 256 | # {'no': '18476497826','psw':'22222a'}, 257 | # {'no': '18475447137','psw':'222449sl'} 258 | # {'no':'18101496480','psw':'325928lg'}, 259 | # {'no':'clab5570@163.com','psw':'7flxtedsnd'}, 260 | # {'no':'17163474885','psw':'216147vz'}, 261 | #{'no':'17084824243','psw':'951554vz'}, 262 | # {'no':'17162241240','psw':'247851vz'}, 263 | # {'no':'15836164273','psw':'897765vz'}, 264 | # {'no':'17162241495','psw':'114831vz'}, 265 | 266 | 267 | 268 | 269 | #测试 270 | 271 | #{'no':'15874173914','psw':'bbx8514'}, 272 | 273 | #{'no':'15774109579','psw':'bbx4768'}, 274 | 275 | #{'no':'17877727541','psw':'bbx3464'}, 276 | 277 | {'no':'15898562769','psw':'bbx8712'}, 278 | 279 | {'no':'18407320608','psw':'bbx2145'}, 280 | 281 | {'no':'15973424313','psw':'bbx8431'}, 282 | 283 | #{'no':'13762898341','psw':'bbx3186'}, 284 | 285 | #{'no':'18374112533','psw':'bbx9829'}, 286 | 287 | #{'no':'15274883774','psw':'bbx8748'}, 288 | 289 | #{'no':'13873384591','psw':'bbx7247'}, 290 | 291 | #{'no':'13974708834','psw':'bbx2579'}, 292 | 293 | #{'no':'18474777738','psw':'bbx3957'}, 294 | 295 | #{'no':'18397779843','psw':'bbx4491'}, 296 | 297 | #{'no':'15197752390','psw':'bbx1831'}, 298 | 299 | #{'no':'15273563186','psw':'bbx9756'}, 300 | 301 | 302 | 303 | 304 | 305 | ] 306 | 307 | def getCookie(account, password): 308 | if COOKIE_GETWAY == 0: 309 | return get_cookie_from_login_sina_com_cn(account, password) 310 | elif COOKIE_GETWAY == 1: 311 | return get_cookie_from_weibo_cn(account, password) 312 | elif 
COOKIE_GETWAY == 2: 313 | return get_cookie_from_weibo(account, password) 314 | else: 315 | logger.error("COOKIE_GETWAY Error!") 316 | 317 | 318 | def get_cookie_from_login_sina_com_cn(account, password): 319 | """ 获取一个账号的Cookie """ 320 | loginURL = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)" 321 | username = base64.b64encode(account.encode("utf-8")).decode("utf-8") 322 | postData = { 323 | "entry": "sso", 324 | "gateway": "1", 325 | "from": "null", 326 | "savestate": "30", 327 | "useticket": "0", 328 | "pagerefer": "", 329 | "vsnf": "1", 330 | "su": username, 331 | "service": "sso", 332 | "sp": password, 333 | "sr": "1440*900", 334 | "encoding": "UTF-8", 335 | "cdult": "3", 336 | "domain": "sina.com.cn", 337 | "prelt": "0", 338 | "returntype": "TEXT", 339 | } 340 | session = requests.Session() 341 | r = session.post(loginURL, data=postData) 342 | jsonStr = r.content.decode("gbk") 343 | info = json.loads(jsonStr) 344 | if info["retcode"] == "0": 345 | logger.warning("Get Cookie Success!( Account:%s )" % account) 346 | cookie = session.cookies.get_dict() 347 | return json.dumps(cookie) 348 | else: 349 | logger.warning("Failed!( Reason:%s )" % info["reason"]) 350 | return "" 351 | 352 | 353 | def get_cookie_from_weibo_cn(account, password): 354 | """ 获取一个账号的Cookie """ 355 | try: 356 | browser = webdriver.PhantomJS(executable_path='E:\\phantomjs\\bin\\phantomjs.exe',desired_capabilities=dcap) 357 | browser.get("https://weibo.cn/login/") 358 | time.sleep(1) 359 | 360 | failure = 0 361 | while "微博" in browser.title and failure < 5: 362 | failure += 1 363 | browser.save_screenshot("aa.png") 364 | username = browser.find_element_by_name("mobile") 365 | username.clear() 366 | username.send_keys(account) 367 | 368 | psd = browser.find_element_by_xpath('//input[@type="password"]') 369 | psd.clear() 370 | psd.send_keys(password) 371 | try: 372 | code = browser.find_element_by_name("code") 373 | code.clear() 374 | if IDENTIFY == 1: 375 | code_txt = raw_input("请查看路径下新生成的aa.png,然后输入验证码:") # 手动输入验证码 376 | else: 377 | from PIL import Image 378 | img = browser.find_element_by_xpath('//form[@method="post"]/div/img[@alt="请打开图片显示"]') 379 | x = img.location["x"] 380 | y = img.location["y"] 381 | im = Image.open("aa.png") 382 | im.crop((x, y, 100 + x, y + 22)).save("ab.png") # 剪切出验证码 383 | code_txt = identify() # 验证码打码平台识别 384 | code.send_keys(code_txt) 385 | except Exception, e: 386 | pass 387 | 388 | commit = browser.find_element_by_name("submit") 389 | commit.click() 390 | time.sleep(3) 391 | if "我的首页" not in browser.title: 392 | time.sleep(4) 393 | if '未激活微博' in browser.page_source: 394 | print '账号未开通微博' 395 | return {} 396 | 397 | cookie = {} 398 | if "我的首页" in browser.title: 399 | for elem in browser.get_cookies(): 400 | cookie[elem["name"]] = elem["value"] 401 | logger.warning("Get Cookie Success!( Account:%s )" % account) 402 | return json.dumps(cookie) 403 | except Exception, e: 404 | logger.warning("Failed %s!" 
% account) 405 | logger.warning(e) 406 | return "" 407 | finally: 408 | try: 409 | browser.quit() 410 | except Exception, e: 411 | pass 412 | 413 | 414 | def get_cookie_from_weibo(username, password): 415 | driver = webdriver.Chrome() 416 | driver.get('https://weibo.cn//login/') 417 | time.sleep(10) 418 | assert u"微博" in driver.title 419 | login_link = driver.find_element_by_link_text(u'登录') 420 | ActionChains(driver).move_to_element(login_link).click().perform() 421 | login_name = WebDriverWait(driver, 10).until( 422 | EC.visibility_of_element_located((By.ID, "loginName")) 423 | ) 424 | login_password = driver.find_element_by_id("loginPassword") 425 | login_name.send_keys(username) 426 | login_password.send_keys(password) 427 | login_button = driver.find_element_by_id("loginAction") 428 | login_button.click() #自动按下登陆 429 | WebDriverWait(driver, 30).until(EC.title_is(u"我的首页")) #等待过了验证到首页 430 | cookie = driver.get_cookies() 431 | driver.close() 432 | return json.dumps(cookie) 433 | 434 | 435 | def getCookies(weibo): 436 | """ 获取Cookies """ 437 | cookies = [] 438 | for elem in weibo: 439 | account = elem['no'] 440 | password = elem['psw'] 441 | cookie = getCookie(account, password) 442 | # logger.warning(type(cookie)) 443 | if cookie != None: 444 | cookies.append(cookie) 445 | 446 | return cookies 447 | 448 | 449 | cookies = getCookies(myWeiBo) 450 | # logger.warning(type(cookies)) 451 | # logger.warning(cookies) 452 | logger.warning("Get Cookies Finish!( Num:%d)" % len(cookies)) 453 | 454 | 455 | 456 | 457 | 458 | -------------------------------------------------------------------------------- /src/Sina_spider3/items.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | from scrapy import Item, Field 4 | 5 | 6 | class InformationItem(Item): 7 | """ 个人信息 """ 8 | _id = Field() # 用户ID 9 | NickName = Field() # 昵称 10 | Gender = Field() # 性别 11 | Province = Field() # 所在省 12 | City = Field() # 所在城市 13 | BriefIntroduction = Field() # 简介 14 | Birthday = Field() # 生日 15 | Num_Tweets = Field() # 微博数 16 | Num_Follows = Field() # 关注数 17 | Num_Fans = Field() # 粉丝数 18 | SexOrientation = Field() # 性取向 19 | Sentiment = Field() # 感情状况 20 | VIPlevel = Field() # 会员等级 21 | Authentication = Field() # 认证 22 | URL = Field() # 首页链接 23 | 24 | 25 | class TweetsItem(Item): 26 | """ 微博信息 """ 27 | _id = Field() # 用户ID-微博ID 28 | ID = Field() # 用户ID 29 | Content = Field() # 微博内容 30 | PubTime = Field() # 发表时间 31 | Co_oridinates = Field() # 定位坐标 32 | Tools = Field() # 发表工具/平台 33 | Like = Field() # 点赞数 34 | Comment = Field() # 评论数 35 | Transfer = Field() # 转载数 36 | 37 | 38 | class RelationshipsItem(Item): 39 | """ 用户关系,只保留与关注的关系 """ 40 | Host1 = Field() 41 | Host2 = Field() # 被关注者的ID 42 | -------------------------------------------------------------------------------- /src/Sina_spider3/middleware.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | # ------------------------------------------ 3 | ''' 4 | import os 5 | import random 6 | import redis 7 | import json 8 | import logging 9 | from user_agents import agents 10 | from cookies import initCookie, updateCookie, removeCookie 11 | #from cookies import cookies 12 | from scrapy.exceptions import IgnoreRequest 13 | from scrapy.utils.response import response_status_message 14 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class UserAgentMiddleware(object): 20 | """ 换User-Agent """ 
21 | 22 | def process_request(self, request, spider): 23 | agent = random.choice(agents) 24 | request.headers["User-Agent"] = agent 25 | 26 | 27 | class CookiesMiddleware(RetryMiddleware): 28 | """ 维护Cookie """ 29 | 30 | def __init__(self, settings, crawler): 31 | RetryMiddleware.__init__(self, settings) 32 | self.rconn = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379))) 33 | initCookie(self.rconn, crawler.spider.name) 34 | 35 | @classmethod 36 | def from_crawler(cls, crawler): 37 | return cls(crawler.settings, crawler) 38 | 39 | def process_request(self, request, spider): 40 | redisKeys = self.rconn.keys() 41 | while len(redisKeys) > 0: 42 | elem = random.choice(redisKeys) 43 | if "SinaSpider:Cookies" in elem: 44 | cookie = json.loads(self.rconn.get(elem)) 45 | request.cookies = cookie 46 | request.meta["accountText"] = elem.split("Cookies:")[-1] 47 | break 48 | else: 49 | redisKeys.remove(elem) 50 | 51 | def process_response(self, request, response, spider): 52 | if response.status in [300, 301, 302, 303]: 53 | try: 54 | redirect_url = response.headers["location"] 55 | if "login.weibo" in redirect_url or "login.sina" in redirect_url: # Cookie失效 56 | logger.warning("One Cookie need to be updating...") 57 | updateCookie(request.meta['accountText'], self.rconn, spider.name) 58 | elif "weibo.cn/security" in redirect_url: # 账号被限 59 | logger.warning("One Account is locked! Remove it!") 60 | removeCookie(request.meta["accountText"], self.rconn, spider.name) 61 | elif "weibo.cn/pub" in redirect_url: 62 | logger.warning( 63 | "Redirect to 'http://weibo.cn/pub'!( Account:%s )" % request.meta["accountText"].split("--")[0]) 64 | reason = response_status_message(response.status) 65 | return self._retry(request, reason, spider) or response # 重试 66 | except Exception, e: 67 | raise IgnoreRequest 68 | elif response.status in [403, 414]: 69 | logger.error("%s! Stopping..." 
% response.status) 70 | os.system("pause") 71 | else: 72 | return response''' 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | # encoding=utf-8 81 | import random 82 | from cookies import cookies 83 | from user_agents import agents 84 | # from getproxies import proxies 85 | import json 86 | 87 | 88 | class UserAgentMiddleware(object): 89 | """ 换User-Agent """ 90 | 91 | def process_request(self, request, spider): 92 | agent = random.choice(agents) 93 | request.headers.setdefault("User-Agent", agent) 94 | # request.headers["User-Agent"] = agent 95 | 96 | 97 | class CookiesMiddleware(object): 98 | """ 换Cookie """ 99 | 100 | def process_request(self, request, spider): 101 | cookie = json.loads(random.choice(cookies)) 102 | # print cookie 103 | request.cookies = cookie 104 | 105 | 106 | # class ProxyMiddleware(object): 107 | # """ 获取开放IP """ 108 | # def process_request(self, request, spider): 109 | # url = "http://ip.chinaz.com/getip.aspx" 110 | # while True: 111 | # proxy = random.choice(proxies) 112 | # ip = proxy.strip().split("\t") 113 | # proxy_host = "http://"+ip[0]+":"+ip[1] 114 | # proxy_temp = {"http":proxy_host} 115 | # res = urllib.urlopen(url,proxies=proxy_temp).read() 116 | # request.meta['proxy'] = "http://" + proxy['ip_port'] 117 | # try: 118 | # pass -------------------------------------------------------------------------------- /src/Sina_spider3/pipelines.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import pymongo 3 | from items import InformationItem, TweetsItem, RelationshipsItem 4 | import MySQLdb 5 | 6 | class MysqlDBPipeline(object): 7 | def __init__(self): 8 | self.count = 1 9 | self.conn = MySQLdb.connect( 10 | host='localhost', 11 | port=3306, 12 | user='root', 13 | #这里填写密码 14 | passwd='chen960212', 15 | db='sinaweibo', 16 | charset='utf8', 17 | ) 18 | self.cur = self.conn.cursor() 19 | 20 | def process_item(self, item, spider): 21 | """ 判断item的类型,并作相应的处理,再入数据库 """ 22 | if isinstance(item, RelationshipsItem): 23 | try: 24 | print("***********at beginning of saving**********") 25 | print(dict(item)) 26 | sql = '' 27 | sql+=str('INSERT INTO SinaWeibo.relationship (`Host1`,`Host2`) ') 28 | sql+=str(' Values(\'' ) 29 | sql+=str(item['Host1']) 30 | print(sql) 31 | sql+=str('\', \'') 32 | sql+=str(item['Host2']) 33 | sql+=str('\')') 34 | print("*********** SQL SYNTAX *********** ") 35 | print(''.join(sql)) 36 | self.cur.execute(sql) 37 | self.conn.commit() 38 | print("saved") 39 | self.count = self.count +1 40 | print(self.count) 41 | except Exception: 42 | pass 43 | elif isinstance(item, TweetsItem): 44 | try: 45 | print("***********at beginning of saving**********") 46 | 47 | sql = '' 48 | sql+=str('INSERT INTO SinaWeibo.tweets (`weibo_id`,`User_id`,`Content`,`Pubtime`,`Coordinates`,`Tools`,`Likes`,`Comments`,`Transfers`) ') 49 | sql+=str(' Values(\'' ) 50 | sql+=str(item['_id']) 51 | 52 | sql+=str('\', \'') 53 | sql+=str(item['ID']) 54 | sql+=str('\', \'') 55 | sql+=str(item['Content']) 56 | sql+=str('\', \'') 57 | sql+=str(item['PubTime']) 58 | 59 | sql+=str('\', \'') 60 | sql+=str(item['Co_oridinates']) 61 | sql+=str('\', \'') 62 | 63 | sql+=str(item['Tools']) 64 | print(sql) 65 | sql+=str('\', \'') 66 | sql+=str(item['Like']) 67 | sql+=str('\', \'') 68 | sql+=str(item['Comment']) 69 | sql+=str('\', \'') 70 | sql+=str(item['Transfer']) 71 | sql+=str('\')') 72 | print("*********** SQL SYNTAX *********** ") 73 | print(''.join(sql)) 74 | self.cur.execute(sql) 75 | self.conn.commit() 76 | print("saved") 77 | self.count 
= self.count +1 78 | print(self.count) 79 | except Exception: 80 | pass 81 | elif isinstance(item, InformationItem): 82 | try: 83 | print("***********at beginning of saving**********") 84 | 85 | sql = '' 86 | sql+=str('INSERT INTO SinaWeibo.information (`User_id`,`NickName`,`Gender`,`Province`,`City`,`BriefIntroduction`,`Birthday`,`Num_Tweets`,`Num_Follows`,`Num_Fans`,`SexOrientation`,`Sentiment`,`VIPlevel`,`Authentication`,`URL`) ') 87 | sql+=str(' Values(\'' ) 88 | sql+=str(item['_id']) 89 | 90 | sql+=str('\', \'') 91 | sql+=str(item['NickName']) 92 | sql+=str('\', \'') 93 | sql+=str(item['Gender']) 94 | sql+=str('\', \'') 95 | sql+=str(item['Province']) 96 | 97 | sql+=str('\', \'') 98 | sql+=str(item['City']) 99 | sql+=str('\', \'') 100 | sql+=str(item['BriefIntroduction']) 101 | sql+=str('\', \'') 102 | print(sql) 103 | sql+=str(item['Birthday']) 104 | sql+=str('\', \'') 105 | sql+=str(item['Num_Tweets']) 106 | 107 | sql+=str('\', \'') 108 | sql+=str(item['Num_Follows']) 109 | sql+=str('\', \'') 110 | sql+=str(item['Num_Fans']) 111 | sql+=str('\', \'') 112 | 113 | sql+=str(item['SexOrientation']) 114 | sql+=str('\', \'') 115 | sql+=str(item['Sentiment']) 116 | 117 | sql+=str('\', \'') 118 | sql+=str(item['VIPlevel']) 119 | sql+=str('\', \'') 120 | sql+=str(item['Authentication']) 121 | sql+=str('\', \'') 122 | sql+=str(item['URL']) 123 | sql+=str('\')') 124 | 125 | print("*********** SQL SYNTAX *********** ") 126 | print(''.join(sql)) 127 | self.cur.execute(sql) 128 | self.conn.commit() 129 | print("saved") 130 | self.count = self.count +1 131 | print(self.count) 132 | except Exception: 133 | pass 134 | 135 | ##在Java开发中,Dao连接会对内存溢出,需要定时断开重连,这里不清楚是否需要,先加上了 136 | if self.count == 1000: 137 | print("try reconnecting") 138 | self.count = 0 139 | self.cur.close() 140 | self.conn.close() 141 | self.conn = MySQLdb.connect( 142 | host='localhost', 143 | port=3306, 144 | user='root', 145 | passwd='chen960212', 146 | db='sinaweibo', 147 | charset='utf8', 148 | ) 149 | self.cur = self.conn.cursor() 150 | print("reconnect") 151 | 152 | return item 153 | 154 | 155 | 156 | class MongoDBPipeline(object): 157 | def __init__(self): 158 | clinet = pymongo.MongoClient("localhost", 27017) 159 | db = clinet["Spider_Sina_weibo"] 160 | self.Information = db["Information"] 161 | self.Tweets = db["Tweets"] 162 | self.Relationships = db["Relationships"] 163 | 164 | def process_item(self, item, spider): 165 | """ 判断item的类型,并作相应的处理,再入数据库 """ 166 | if isinstance(item, RelationshipsItem): 167 | try: 168 | self.Relationships.insert(dict(item)) 169 | except Exception: 170 | pass 171 | elif isinstance(item, TweetsItem): 172 | try: 173 | self.Tweets.insert(dict(item)) 174 | except Exception: 175 | pass 176 | elif isinstance(item, InformationItem): 177 | try: 178 | self.Information.insert(dict(item)) 179 | except Exception: 180 | pass 181 | return item 182 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/Sina_spider3/scrapy_redis/__init__.py -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | # Default values. 
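# Note: the REDIS_* defaults below back the shared request/seed queue
# (from_settings), while the FILTER_* defaults back the de-duplication
# bitmaps (from_settings_filter). The two groups may point at different
# Redis instances; they mirror the REDIS_*/FILTER_* options in settings.py.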
4 | REDIS_URL = None 5 | REDIS_HOST = 'localhost' 6 | REDIS_PORT = 6379 7 | 8 | FILTER_URL = None 9 | FILTER_HOST = 'localhost' 10 | FILTER_PORT = 6379 11 | FILTER_DB = 0 12 | 13 | 14 | def from_settings(settings): 15 | url = settings.get('REDIS_URL', REDIS_URL) 16 | host = settings.get('REDIS_HOST', REDIS_HOST) 17 | port = settings.get('REDIS_PORT', REDIS_PORT) 18 | 19 | # REDIS_URL takes precedence over host/port specification. 20 | if url: 21 | return redis.from_url(url) 22 | else: 23 | return redis.Redis(host=host, port=port) 24 | 25 | 26 | def from_settings_filter(settings): 27 | url = settings.get('FILTER_URL', FILTER_URL) 28 | host = settings.get('FILTER_HOST', FILTER_HOST) 29 | port = settings.get('FILTER_PORT', FILTER_PORT) 30 | db = settings.get('FILTER_DB', FILTER_DB) 31 | 32 | if url: 33 | return redis.from_url(url) 34 | else: 35 | return redis.Redis(host=host, port=port, db=db) 36 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | 6 | from . import connection 7 | 8 | 9 | class RFPDupeFilter(BaseDupeFilter): 10 | """Redis-based request duplication filter""" 11 | 12 | def __init__(self, server, key): 13 | """Initialize duplication filter 14 | 15 | Parameters 16 | ---------- 17 | server : Redis instance 18 | key : str 19 | Where to store fingerprints 20 | """ 21 | self.server = server 22 | self.key = key 23 | 24 | @classmethod 25 | def from_settings(cls, settings): 26 | server = connection.from_settings_filter(settings) 27 | key = "dupefilter:%s" % int(time.time()) 28 | return cls(server, key) 29 | 30 | @classmethod 31 | def from_crawler(cls, crawler): 32 | return cls.from_settings(crawler.settings) 33 | 34 | def request_seen(self, request): 35 | uid = re.findall('(\d+)/info', request.url) 36 | if uid: 37 | uid = int(uid[0]) 38 | isExist = self.server.getbit(self.key + str(uid / 4000000000), uid % 4000000000) 39 | if isExist == 1: 40 | return True 41 | else: 42 | self.server.setbit(self.key + str(uid / 4000000000), uid % 4000000000, 1) 43 | return False 44 | 45 | def close(self, reason): 46 | """Delete data on close. Called by scrapy's scheduler""" 47 | self.clear() 48 | 49 | def clear(self): 50 | """Clears fingerprints data""" 51 | self.server.delete(self.key) 52 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.serialize import ScrapyJSONEncoder 2 | from twisted.internet.threads import deferToThread 3 | 4 | from . 
import connection 5 | 6 | 7 | class RedisPipeline(object): 8 | """Pushes serialized item into a redis list/queue""" 9 | 10 | def __init__(self, server): 11 | self.server = server 12 | self.encoder = ScrapyJSONEncoder() 13 | 14 | @classmethod 15 | def from_settings(cls, settings): 16 | server = connection.from_settings(settings) 17 | return cls(server) 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | return cls.from_settings(crawler.settings) 22 | 23 | def process_item(self, item, spider): 24 | return deferToThread(self._process_item, item, spider) 25 | 26 | def _process_item(self, item, spider): 27 | key = self.item_key(item, spider) 28 | data = self.encoder.encode(item) 29 | self.server.rpush(key, data) 30 | return item 31 | 32 | def item_key(self, item, spider): 33 | """Returns redis key based on given spider""" 34 | return "%s:items" % spider.name 35 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | from scrapy.http import Request 3 | 4 | try: 5 | import cPickle as pickle 6 | except ImportError: 7 | import pickle 8 | 9 | 10 | class Base(object): 11 | """Per-spider queue/stack base class""" 12 | 13 | def __init__(self, server, spider, key, queue_name): 14 | """Initialize per-spider redis queue. 15 | 16 | Parameters: 17 | server -- redis connection 18 | spider -- spider instance 19 | key -- key for this queue (e.g. "%(spider)s:queue") 20 | """ 21 | self.server = server 22 | self.spider = spider 23 | self.key = key % {'spider': queue_name} 24 | 25 | def _encode_request(self, request): 26 | """Encode a request object""" 27 | return pickle.dumps(request_to_dict(request, self.spider), protocol=-1) 28 | 29 | def _decode_request(self, encoded_request): 30 | """Decode an request previously encoded""" 31 | return request_from_dict(pickle.loads(encoded_request), self.spider) 32 | 33 | def __len__(self): 34 | """Return the length of the queue""" 35 | raise NotImplementedError 36 | 37 | def push(self, request): 38 | """Push a request""" 39 | raise NotImplementedError 40 | 41 | def pop(self, timeout=0): 42 | """Pop a request""" 43 | raise NotImplementedError 44 | 45 | def clear(self): 46 | """Clear queue/stack""" 47 | self.server.delete(self.key) 48 | 49 | 50 | class SpiderQueue(Base): 51 | """Per-spider FIFO queue""" 52 | 53 | def __len__(self): 54 | """Return the length of the queue""" 55 | return self.server.llen(self.key) 56 | 57 | def push(self, request): 58 | """Push a request""" 59 | self.server.lpush(self.key, self._encode_request(request)) 60 | 61 | def pop(self, timeout=0): 62 | """Pop a request""" 63 | if timeout > 0: 64 | data = self.server.brpop(self.key, timeout) 65 | if isinstance(data, tuple): 66 | data = data[1] 67 | else: 68 | data = self.server.rpop(self.key) 69 | if data: 70 | return self._decode_request(data) 71 | 72 | 73 | class SpiderPriorityQueue(Base): 74 | """Per-spider priority queue abstraction using redis' sorted set""" 75 | 76 | def __len__(self): 77 | """Return the length of the queue""" 78 | return self.server.zcard(self.key) 79 | 80 | def push(self, request): 81 | """Push a request""" 82 | data = self._encode_request(request) 83 | pairs = {data: -request.priority} 84 | self.server.zadd(self.key, **pairs) 85 | 86 | def pop(self, timeout=0): 87 | """ 88 | Pop a request 89 | timeout not support in this queue class 90 | """ 91 | # 
use atomic range/remove using multi/exec 92 | pipe = self.server.pipeline() 93 | pipe.multi() 94 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 95 | results, count = pipe.execute() 96 | if results: 97 | return self._decode_request(results[0]) 98 | 99 | 100 | class SpiderSimpleQueue(Base): 101 | """ url + callback """ 102 | 103 | def __len__(self): 104 | """Return the length of the queue""" 105 | return self.server.llen(self.key) 106 | 107 | def push(self, request): 108 | """Push a request""" 109 | self.server.lpush(self.key, request.url[16:]) 110 | 111 | def pop(self, timeout=0): 112 | """Pop a request""" 113 | if timeout > 0: 114 | url = self.server.brpop(self.key, timeout=timeout) 115 | if isinstance(url, tuple): 116 | url = url[1] 117 | else: 118 | url = self.server.rpop(self.key) 119 | if url: 120 | try: 121 | if "/follow" in url or "/fans" in url: 122 | cb = getattr(self.spider, "parse_relationship") 123 | elif "/profile" in url: 124 | cb = getattr(self.spider, "parse_tweets") 125 | elif "/info" in url: 126 | cb = getattr(self.spider, "parse_information") 127 | else: 128 | raise ValueError("Method not found in: %s( URL:%s )" % (self.spider, url)) 129 | return Request(url="https://weibo.cn%s" % url, callback=cb) 130 | except AttributeError: 131 | raise ValueError("Method not found in: %s( URL:%s )" % (self.spider, url)) 132 | 133 | 134 | class SpiderStack(Base): 135 | """Per-spider stack""" 136 | 137 | def __len__(self): 138 | """Return the length of the stack""" 139 | return self.server.llen(self.key) 140 | 141 | def push(self, request): 142 | """Push a request""" 143 | self.server.lpush(self.key, self._encode_request(request)) 144 | 145 | def pop(self, timeout=0): 146 | """Pop a request""" 147 | if timeout > 0: 148 | data = self.server.blpop(self.key, timeout) 149 | if isinstance(data, tuple): 150 | data = data[1] 151 | else: 152 | data = self.server.lpop(self.key) 153 | 154 | if data: 155 | return self._decode_request(data) 156 | 157 | 158 | __all__ = ['SpiderQueue', 'SpiderPriorityQueue', 'SpiderSimpleQueue', 'SpiderStack'] 159 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | 3 | from . import connection 4 | from .dupefilter import RFPDupeFilter 5 | 6 | 7 | # default values 8 | SCHEDULER_PERSIST = False 9 | QUEUE_KEY = '%(spider)s:requests' 10 | QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue' 11 | DUPEFILTER_KEY = '%(spider)s:dupefilter' 12 | IDLE_BEFORE_CLOSE = 0 13 | 14 | 15 | class Scheduler(object): 16 | """Redis-based scheduler""" 17 | 18 | def __init__(self, server, server_filter, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, queue_name): 19 | """Initialize scheduler. 
20 | 21 | Parameters 22 | ---------- 23 | server : Redis instance 24 | persist : bool 25 | queue_key : str 26 | queue_cls : queue class 27 | dupefilter_key : str 28 | idle_before_close : int 29 | """ 30 | self.server = server 31 | self.server_filter = server_filter 32 | self.persist = persist 33 | self.queue_key = queue_key 34 | self.queue_cls = queue_cls 35 | self.dupefilter_key = dupefilter_key 36 | self.idle_before_close = idle_before_close 37 | self.queue_name = queue_name 38 | self.stats = None 39 | 40 | def __len__(self): 41 | return len(self.queue) 42 | 43 | @classmethod 44 | def from_settings(cls, settings): 45 | persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST) 46 | queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY) 47 | queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS)) 48 | queue_name = settings.get('REDIS_QUEUE_NAME', None) 49 | dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY) 50 | idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE) 51 | server = connection.from_settings(settings) 52 | server_filter = connection.from_settings_filter(settings) 53 | return cls(server, server_filter, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, queue_name) 54 | 55 | @classmethod 56 | def from_crawler(cls, crawler): 57 | instance = cls.from_settings(crawler.settings) 58 | # FIXME: for now, stats are only supported from this constructor 59 | instance.stats = crawler.stats 60 | return instance 61 | 62 | def open(self, spider): 63 | self.spider = spider 64 | self.queue = self.queue_cls(self.server, spider, self.queue_key, (self.queue_name if self.queue_name else spider.name)) 65 | self.df = RFPDupeFilter(self.server_filter, self.dupefilter_key % {'spider': (self.queue_name if self.queue_name else spider.name)}) 66 | if self.idle_before_close < 0: 67 | self.idle_before_close = 0 68 | # notice if there are requests already in the queue to resume the crawl 69 | if len(self.queue): 70 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 71 | 72 | def close(self, reason): 73 | if not self.persist: 74 | self.df.clear() 75 | self.queue.clear() 76 | 77 | def enqueue_request(self, request): 78 | if not request.dont_filter and self.df.request_seen(request): 79 | return 80 | if self.stats: 81 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 82 | self.queue.push(request) 83 | 84 | def next_request(self): 85 | block_pop_timeout = self.idle_before_close 86 | request = self.queue.pop(block_pop_timeout) 87 | if request and self.stats: 88 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 89 | return request 90 | 91 | def has_pending_requests(self): 92 | return len(self) > 0 93 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider, signals 2 | from scrapy.exceptions import DontCloseSpider 3 | 4 | from . import connection 5 | 6 | 7 | class RedisMixin(object): 8 | """Mixin class to implement reading urls from a redis queue.""" 9 | redis_key = None # use default ':start_urls' 10 | 11 | def setup_redis(self): 12 | """Setup redis connection and idle signal. 13 | 14 | This should be called after the spider has set its crawler object. 
15 | """ 16 | if not self.redis_key: 17 | self.redis_key = '%s:start_urls' % self.name 18 | 19 | self.server = connection.from_settings(self.crawler.settings) 20 | # idle signal is called when the spider has no requests left, 21 | # that's when we will schedule new requests from redis queue 22 | self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 23 | self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 24 | self.log("Reading URLs from redis list '%s'" % self.redis_key) 25 | 26 | def next_request(self): 27 | """Returns a request to be scheduled or none.""" 28 | url = self.server.lpop(self.redis_key) 29 | if url: 30 | return self.make_requests_from_url(url) 31 | 32 | def schedule_next_request(self): 33 | """Schedules a request if available""" 34 | req = self.next_request() 35 | if req: 36 | self.crawler.engine.crawl(req, spider=self) 37 | 38 | def spider_idle(self): 39 | """Schedules a request if available, otherwise waits.""" 40 | self.schedule_next_request() 41 | raise DontCloseSpider 42 | 43 | def item_scraped(self, *args, **kwargs): 44 | """Avoids waiting for the spider to idle before scheduling the next request""" 45 | self.schedule_next_request() 46 | 47 | 48 | class RedisSpider(RedisMixin, Spider): 49 | """Spider that reads urls from redis queue when idle.""" 50 | 51 | def _set_crawler(self, crawler): 52 | super(RedisSpider, self)._set_crawler(crawler) 53 | self.setup_redis() 54 | -------------------------------------------------------------------------------- /src/Sina_spider3/scrapy_redis/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mock 4 | import redis 5 | 6 | from scrapy import Request, Spider 7 | from unittest import TestCase 8 | 9 | from . 
import connection 10 | from .dupefilter import RFPDupeFilter 11 | from .queue import SpiderQueue, SpiderPriorityQueue, SpiderStack 12 | from .scheduler import Scheduler 13 | 14 | 15 | # allow test settings from environment 16 | REDIS_HOST = os.environ.get('REDIST_HOST', 'localhost') 17 | REDIS_PORT = int(os.environ.get('REDIS_PORT', 6379)) 18 | 19 | 20 | class RedisTestMixin(object): 21 | 22 | @property 23 | def server(self): 24 | if not hasattr(self, '_redis'): 25 | self._redis = redis.Redis(REDIS_HOST, REDIS_PORT) 26 | return self._redis 27 | 28 | def clear_keys(self, prefix): 29 | keys = self.server.keys(prefix + '*') 30 | if keys: 31 | self.server.delete(*keys) 32 | 33 | 34 | class DupeFilterTest(RedisTestMixin, TestCase): 35 | 36 | def setUp(self): 37 | self.key = 'scrapy_redis:tests:dupefilter:' 38 | self.df = RFPDupeFilter(self.server, self.key) 39 | 40 | def tearDown(self): 41 | self.clear_keys(self.key) 42 | 43 | def test_dupe_filter(self): 44 | req = Request('http://example.com') 45 | 46 | self.assertFalse(self.df.request_seen(req)) 47 | self.assertTrue(self.df.request_seen(req)) 48 | 49 | self.df.close('nothing') 50 | 51 | 52 | class QueueTestMixin(RedisTestMixin): 53 | 54 | queue_cls = None 55 | 56 | def setUp(self): 57 | self.spider = Spider('myspider') 58 | self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name 59 | self.q = self.queue_cls(self.server, Spider('myspider'), self.key) 60 | 61 | def tearDown(self): 62 | self.clear_keys(self.key) 63 | 64 | def test_clear(self): 65 | self.assertEqual(len(self.q), 0) 66 | 67 | for i in range(10): 68 | # XXX: can't use same url for all requests as SpiderPriorityQueue 69 | # uses redis' set implemention and we will end with only one 70 | # request in the set and thus failing the test. It should be noted 71 | # that when using SpiderPriorityQueue it acts as a request 72 | # duplication filter whenever the serielized requests are the same. 73 | # This might be unwanted on repetitive requests to the same page 74 | # even with dont_filter=True flag. 
75 | req = Request('http://example.com/?page=%s' % i) 76 | self.q.push(req) 77 | self.assertEqual(len(self.q), 10) 78 | 79 | self.q.clear() 80 | self.assertEqual(len(self.q), 0) 81 | 82 | 83 | class SpiderQueueTest(QueueTestMixin, TestCase): 84 | 85 | queue_cls = SpiderQueue 86 | 87 | def test_queue(self): 88 | req1 = Request('http://example.com/page1') 89 | req2 = Request('http://example.com/page2') 90 | 91 | self.q.push(req1) 92 | self.q.push(req2) 93 | 94 | out1 = self.q.pop() 95 | out2 = self.q.pop() 96 | 97 | self.assertEqual(out1.url, req1.url) 98 | self.assertEqual(out2.url, req2.url) 99 | 100 | 101 | class SpiderPriorityQueueTest(QueueTestMixin, TestCase): 102 | 103 | queue_cls = SpiderPriorityQueue 104 | 105 | def test_queue(self): 106 | req1 = Request('http://example.com/page1', priority=100) 107 | req2 = Request('http://example.com/page2', priority=50) 108 | req3 = Request('http://example.com/page2', priority=200) 109 | 110 | self.q.push(req1) 111 | self.q.push(req2) 112 | self.q.push(req3) 113 | 114 | out1 = self.q.pop() 115 | out2 = self.q.pop() 116 | out3 = self.q.pop() 117 | 118 | self.assertEqual(out1.url, req3.url) 119 | self.assertEqual(out2.url, req1.url) 120 | self.assertEqual(out3.url, req2.url) 121 | 122 | 123 | class SpiderStackTest(QueueTestMixin, TestCase): 124 | 125 | queue_cls = SpiderStack 126 | 127 | def test_queue(self): 128 | req1 = Request('http://example.com/page1') 129 | req2 = Request('http://example.com/page2') 130 | 131 | self.q.push(req1) 132 | self.q.push(req2) 133 | 134 | out1 = self.q.pop() 135 | out2 = self.q.pop() 136 | 137 | self.assertEqual(out1.url, req2.url) 138 | self.assertEqual(out2.url, req1.url) 139 | 140 | 141 | class SchedulerTest(RedisTestMixin, TestCase): 142 | 143 | def setUp(self): 144 | self.persist = False 145 | self.key_prefix = 'scrapy_redis:tests:' 146 | self.queue_key = self.key_prefix + '%(spider)s:requests' 147 | self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter' 148 | self.idle_before_close = 0 149 | self.scheduler = Scheduler(self.server, self.persist, self.queue_key, 150 | SpiderQueue, self.dupefilter_key, 151 | self.idle_before_close) 152 | self.spider = Spider('myspider') 153 | 154 | def tearDown(self): 155 | self.clear_keys(self.key_prefix) 156 | 157 | def test_scheduler(self): 158 | # default no persist 159 | self.assertFalse(self.scheduler.persist) 160 | 161 | self.scheduler.open(self.spider) 162 | self.assertEqual(len(self.scheduler), 0) 163 | 164 | req = Request('http://example.com') 165 | self.scheduler.enqueue_request(req) 166 | self.assertTrue(self.scheduler.has_pending_requests()) 167 | self.assertEqual(len(self.scheduler), 1) 168 | 169 | # dupefilter in action 170 | self.scheduler.enqueue_request(req) 171 | self.assertEqual(len(self.scheduler), 1) 172 | 173 | out = self.scheduler.next_request() 174 | self.assertEqual(out.url, req.url) 175 | 176 | self.assertFalse(self.scheduler.has_pending_requests()) 177 | self.assertEqual(len(self.scheduler), 0) 178 | 179 | self.scheduler.close('finish') 180 | 181 | def test_scheduler_persistent(self): 182 | # TODO: Improve this test to avoid the need to check for log messages. 
183 | self.spider.log = mock.Mock(spec=self.spider.log) 184 | 185 | self.scheduler.persist = True 186 | self.scheduler.open(self.spider) 187 | 188 | self.assertEqual(self.spider.log.call_count, 0) 189 | 190 | self.scheduler.enqueue_request(Request('http://example.com/page1')) 191 | self.scheduler.enqueue_request(Request('http://example.com/page2')) 192 | 193 | self.assertTrue(self.scheduler.has_pending_requests()) 194 | self.scheduler.close('finish') 195 | 196 | self.scheduler.open(self.spider) 197 | self.spider.log.assert_has_calls([ 198 | mock.call("Resuming crawl (2 requests scheduled)"), 199 | ]) 200 | self.assertEqual(len(self.scheduler), 2) 201 | 202 | self.scheduler.persist = False 203 | self.scheduler.close('finish') 204 | 205 | self.assertEqual(len(self.scheduler), 0) 206 | 207 | 208 | class ConnectionTest(TestCase): 209 | 210 | # We can get a connection from just REDIS_URL. 211 | def test_redis_url(self): 212 | settings = dict( 213 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 214 | ) 215 | 216 | server = connection.from_settings(settings) 217 | connect_args = server.connection_pool.connection_kwargs 218 | 219 | self.assertEqual(connect_args['host'], 'localhost') 220 | self.assertEqual(connect_args['port'], 9001) 221 | self.assertEqual(connect_args['password'], 'bar') 222 | self.assertEqual(connect_args['db'], 42) 223 | 224 | # We can get a connection from REDIS_HOST/REDIS_PORT. 225 | def test_redis_host_port(self): 226 | settings = dict( 227 | REDIS_HOST = 'localhost', 228 | REDIS_PORT = 9001 229 | ) 230 | 231 | server = connection.from_settings(settings) 232 | connect_args = server.connection_pool.connection_kwargs 233 | 234 | self.assertEqual(connect_args['host'], 'localhost') 235 | self.assertEqual(connect_args['port'], 9001) 236 | 237 | # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT. 238 | def test_redis_url_precedence(self): 239 | settings = dict( 240 | REDIS_HOST = 'baz', 241 | REDIS_PORT = 1337, 242 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 243 | ) 244 | 245 | server = connection.from_settings(settings) 246 | connect_args = server.connection_pool.connection_kwargs 247 | 248 | self.assertEqual(connect_args['host'], 'localhost') 249 | self.assertEqual(connect_args['port'], 9001) 250 | self.assertEqual(connect_args['password'], 'bar') 251 | self.assertEqual(connect_args['db'], 42) 252 | 253 | # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None. 254 | def test_redis_host_port_fallback(self): 255 | settings = dict( 256 | REDIS_HOST = 'baz', 257 | REDIS_PORT = 1337, 258 | REDIS_URL = None 259 | ) 260 | 261 | server = connection.from_settings(settings) 262 | connect_args = server.connection_pool.connection_kwargs 263 | 264 | self.assertEqual(connect_args['host'], 'baz') 265 | self.assertEqual(connect_args['port'], 1337) 266 | 267 | # We use default values for REDIS_HOST/REDIS_PORT. 
268 |     def test_redis_default(self):
269 |         settings = dict()
270 | 
271 |         server = connection.from_settings(settings)
272 |         connect_args = server.connection_pool.connection_kwargs
273 | 
274 |         self.assertEqual(connect_args['host'], 'localhost')
275 |         self.assertEqual(connect_args['port'], 6379)
276 | 
--------------------------------------------------------------------------------
/src/Sina_spider3/settings.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | 
3 | 
4 | BOT_NAME = ['Sina_spider3']
5 | 
6 | SPIDER_MODULES = ['Sina_spider3.spiders']
7 | NEWSPIDER_MODULE = 'Sina_spider3.spiders'
8 | 
9 | DOWNLOADER_MIDDLEWARES = {
10 |     "Sina_spider3.middleware.UserAgentMiddleware": 401,
11 |     "Sina_spider3.middleware.CookiesMiddleware": 402,
12 | }
13 | ITEM_PIPELINES = {
14 |     "Sina_spider3.pipelines.MongoDBPipeline": 403,
15 | }
16 | 
17 | SCHEDULER = 'Sina_spider3.scrapy_redis.scheduler.Scheduler'
18 | SCHEDULER_PERSIST = True
19 | SCHEDULER_QUEUE_CLASS = 'Sina_spider3.scrapy_redis.queue.SpiderSimpleQueue'
20 | 
21 | # Seed (request) queue connection info
22 | REDIS_URL = None
23 | REDIS_HOST = 'localhost'
24 | REDIS_PORT = 6379
25 | 
26 | # Deduplication (dupefilter) queue connection info
27 | FILTER_URL = None
28 | FILTER_HOST = 'localhost'
29 | FILTER_PORT = 6379
30 | FILTER_DB = 0
31 | 
32 | DOWNLOAD_DELAY = 10 # delay between requests, in seconds
33 | # LOG_LEVEL = 'INFO' # log level
34 | CONCURRENT_REQUESTS = 1 # Scrapy default is 16
35 | # CONCURRENT_ITEMS = 1
36 | # CONCURRENT_REQUESTS_PER_IP = 1
37 | REDIRECT_ENABLED = False
38 | 
--------------------------------------------------------------------------------
/src/Sina_spider3/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
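The ConnectionTest cases in tests.py above pin down how a Redis client is built from these settings: REDIS_URL takes precedence when set, otherwise REDIS_HOST/REDIS_PORT are used, and localhost:6379 is the final fallback. The short sketch below only illustrates that precedence rule; it is not the project's own scrapy_redis/connection.py (whose source is not listed here) and it assumes the redis-py package is available.

import redis  # redis-py, assumed installed alongside scrapy-redis

DEFAULT_HOST = 'localhost'  # mirrors the fallback verified by test_redis_default
DEFAULT_PORT = 6379

def from_settings(settings):
    """Build a Redis client, preferring REDIS_URL over REDIS_HOST/REDIS_PORT."""
    url = settings.get('REDIS_URL')
    if url:  # e.g. 'redis://foo:bar@localhost:9001/42' as in test_redis_url
        return redis.from_url(url)
    return redis.StrictRedis(host=settings.get('REDIS_HOST', DEFAULT_HOST),
                             port=settings.get('REDIS_PORT', DEFAULT_PORT))

# No command is issued here, so this runs without a live server:
server = from_settings({'REDIS_HOST': 'localhost', 'REDIS_PORT': 6379})
print(server.connection_pool.connection_kwargs)  # includes 'host': 'localhost', 'port': 6379

In settings.py above, REDIS_URL is left as None, so the crawler falls back to REDIS_HOST/REDIS_PORT, i.e. the local Redis instance.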
5 | -------------------------------------------------------------------------------- /src/Sina_spider3/spiders/sinaSpider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | # 3 | import sys 4 | import logging 5 | import datetime 6 | import requests 7 | import re 8 | from lxml import etree 9 | from Sina_spider3.weiboID import weiboID 10 | from Sina_spider3.scrapy_redis.spiders import RedisSpider 11 | from scrapy.selector import Selector 12 | from scrapy.http import Request 13 | from Sina_spider3.items import TweetsItem, InformationItem, RelationshipsItem 14 | 15 | reload(sys) 16 | sys.setdefaultencoding('utf8') 17 | 18 | 19 | class Spider(RedisSpider): 20 | name = "SinaSpider" 21 | host = "https://weibo.cn" 22 | redis_key = "SinaSpider:start_urls" 23 | start_urls = list(set(weiboID)) 24 | logging.getLogger("requests").setLevel(logging.WARNING) # 将requests的日志级别设成WARNING 25 | 26 | def start_requests(self): 27 | for uid in self.start_urls: 28 | yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information) 29 | 30 | def parse_information(self, response): 31 | """ 抓取个人信息 """ 32 | informationItem = InformationItem() 33 | selector = Selector(response) 34 | ID = re.findall('(\d+)/info', response.url)[0] 35 | try: 36 | text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract()) # 获取标签里的所有text() 37 | nickname = re.findall('昵称[::]?(.*?);'.decode('utf8'), text1) 38 | gender = re.findall('性别[::]?(.*?);'.decode('utf8'), text1) 39 | place = re.findall('地区[::]?(.*?);'.decode('utf8'), text1) 40 | briefIntroduction = re.findall('简介[::]?(.*?);'.decode('utf8'), text1) 41 | birthday = re.findall('生日[::]?(.*?);'.decode('utf8'), text1) 42 | sexOrientation = re.findall('性取向[::]?(.*?);'.decode('utf8'), text1) 43 | sentiment = re.findall('感情状况[::]?(.*?);'.decode('utf8'), text1) 44 | vipLevel = re.findall('会员等级[::]?(.*?);'.decode('utf8'), text1) 45 | authentication = re.findall('认证[::]?(.*?);'.decode('utf8'), text1) 46 | url = re.findall('互联网[::]?(.*?);'.decode('utf8'), text1) 47 | 48 | informationItem["_id"] = ID 49 | if nickname and nickname[0]: 50 | informationItem["NickName"] = nickname[0].replace(u"\xa0", "") 51 | if gender and gender[0]: 52 | informationItem["Gender"] = gender[0].replace(u"\xa0", "") 53 | if place and place[0]: 54 | place = place[0].replace(u"\xa0", "").split(" ") 55 | informationItem["Province"] = place[0] 56 | if len(place) > 1: 57 | informationItem["City"] = place[1] 58 | if briefIntroduction and briefIntroduction[0]: 59 | informationItem["BriefIntroduction"] = briefIntroduction[0].replace(u"\xa0", "") 60 | if birthday and birthday[0]: 61 | try: 62 | birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d") 63 | informationItem["Birthday"] = birthday - datetime.timedelta(hours=8) 64 | except Exception: 65 | informationItem['Birthday'] = birthday[0] # 有可能是星座,而非时间 66 | if sexOrientation and sexOrientation[0]: 67 | if sexOrientation[0].replace(u"\xa0", "") == gender[0]: 68 | informationItem["SexOrientation"] = "同性恋" 69 | else: 70 | informationItem["SexOrientation"] = "异性恋" 71 | if sentiment and sentiment[0]: 72 | informationItem["Sentiment"] = sentiment[0].replace(u"\xa0", "") 73 | if vipLevel and vipLevel[0]: 74 | informationItem["VIPlevel"] = vipLevel[0].replace(u"\xa0", "") 75 | if authentication and authentication[0]: 76 | informationItem["Authentication"] = authentication[0].replace(u"\xa0", "") 77 | if url: 78 | informationItem["URL"] = url[0] 79 | 80 | try: 81 | urlothers = 
"https://weibo.cn/attgroup/opening?uid=%s" % ID 82 | r = requests.get(urlothers, cookies=response.request.cookies, timeout=5) 83 | if r.status_code == 200: 84 | selector = etree.HTML(r.content) 85 | texts = ";".join(selector.xpath('//body//div[@class="tip2"]/a//text()')) 86 | if texts: 87 | num_tweets = re.findall('微博\[(\d+)\]'.decode('utf8'), texts) 88 | num_follows = re.findall('关注\[(\d+)\]'.decode('utf8'), texts) 89 | num_fans = re.findall('粉丝\[(\d+)\]'.decode('utf8'), texts) 90 | if num_tweets: 91 | informationItem["Num_Tweets"] = int(num_tweets[0]) 92 | if num_follows: 93 | informationItem["Num_Follows"] = int(num_follows[0]) 94 | if num_fans: 95 | informationItem["Num_Fans"] = int(num_fans[0]) 96 | except Exception, e: 97 | pass 98 | except Exception, e: 99 | pass 100 | else: 101 | yield informationItem 102 | yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID, callback=self.parse_tweets, dont_filter=True) 103 | yield Request(url="https://weibo.cn/%s/follow" % ID, callback=self.parse_relationship, dont_filter=True) 104 | yield Request(url="https://weibo.cn/%s/fans" % ID, callback=self.parse_relationship, dont_filter=True) 105 | 106 | def parse_tweets(self, response): 107 | """ 抓取微博数据 """ 108 | selector = Selector(response) 109 | ID = re.findall('(\d+)/profile', response.url)[0] 110 | divs = selector.xpath('body/div[@class="c" and @id]') 111 | for div in divs: 112 | try: 113 | tweetsItems = TweetsItem() 114 | id = div.xpath('@id').extract_first() # 微博ID 115 | content = div.xpath('div/span[@class="ctt"]//text()').extract() # 微博内容 116 | cooridinates = div.xpath('div/a/@href').extract() # 定位坐标 117 | like = re.findall('赞\[(\d+)\]'.decode('utf8'), div.extract()) # 点赞数 118 | transfer = re.findall('转发\[(\d+)\]'.decode('utf8'), div.extract()) # 转载数 119 | comment = re.findall('评论\[(\d+)\]'.decode('utf8'), div.extract()) # 评论数 120 | others = div.xpath('div/span[@class="ct"]/text()').extract() # 求时间和使用工具(手机或平台) 121 | 122 | tweetsItems["_id"] = ID + "-" + id 123 | tweetsItems["ID"] = ID 124 | if content: 125 | tweetsItems["Content"] = " ".join(content).strip('[位置]'.decode('utf8')) # 去掉最后的"[位置]" 126 | if cooridinates: 127 | cooridinates = re.findall('center=([\d.,]+)', cooridinates[0]) 128 | if cooridinates: 129 | tweetsItems["Co_oridinates"] = cooridinates[0] 130 | if like: 131 | tweetsItems["Like"] = int(like[0]) 132 | if transfer: 133 | tweetsItems["Transfer"] = int(transfer[0]) 134 | if comment: 135 | tweetsItems["Comment"] = int(comment[0]) 136 | if others: 137 | others = others[0].split('来自'.decode('utf8')) 138 | tweetsItems["PubTime"] = others[0].replace(u"\xa0", "") 139 | if len(others) == 2: 140 | tweetsItems["Tools"] = others[1].replace(u"\xa0", "") 141 | yield tweetsItems 142 | except Exception, e: 143 | pass 144 | 145 | url_next = selector.xpath('body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'.decode('utf8')).extract() 146 | if url_next: 147 | yield Request(url=self.host + url_next[0], callback=self.parse_tweets, dont_filter=True) 148 | 149 | def parse_relationship(self, response): 150 | """ 打开url爬取里面的个人ID """ 151 | selector = Selector(response) 152 | if "/follow" in response.url: 153 | ID = re.findall('(\d+)/follow', response.url)[0] 154 | flag = True 155 | else: 156 | ID = re.findall('(\d+)/fans', response.url)[0] 157 | flag = False 158 | urls = selector.xpath('//a[text()="关注他" or text()="关注她"]/@href'.decode('utf')).extract() 159 | uids = re.findall('uid=(\d+)', ";".join(urls), re.S) 160 | for uid in uids: 161 | relationshipsItem = 
RelationshipsItem() 162 | relationshipsItem["Host1"] = ID if flag else uid 163 | relationshipsItem["Host2"] = uid if flag else ID 164 | yield relationshipsItem 165 | yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information) 166 | 167 | next_url = selector.xpath('//a[text()="下页"]/@href'.decode('utf8')).extract() 168 | if next_url: 169 | yield Request(url=self.host + next_url[0], callback=self.parse_relationship, dont_filter=True) 170 | -------------------------------------------------------------------------------- /src/Sina_spider3/user_agents.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | """ User-Agents """ 4 | agents = [ 5 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 6 | "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)", 7 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5", 8 | "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9", 9 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7", 10 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14", 11 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14", 12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20", 13 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27", 14 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1", 15 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2", 16 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7", 17 | "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre", 18 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10", 19 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)", 20 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5", 21 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)", 22 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 23 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 24 | "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0", 25 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2", 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1", 27 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre", 28 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )", 29 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)", 30 | "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a", 31 | "Mozilla/2.02E (Win95; U)", 32 | "Mozilla/3.01Gold (Win95; I)", 33 | "Mozilla/4.8 [en] (Windows NT 5.1; U)", 34 | "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) 
Gecko Netscape/7.1 (ax)", 35 | "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 36 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0", 37 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 38 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 39 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 40 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 41 | "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 42 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 43 | "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 44 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 45 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 46 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3", 47 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 48 | "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 49 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1", 50 | "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 51 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 52 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 53 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 54 | "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 55 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 56 | "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3", 57 | "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 58 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 
59 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 60 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 61 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 62 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 63 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 64 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 65 | "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 66 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 67 | ] 68 | -------------------------------------------------------------------------------- /src/Sina_spider3/weiboID.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | """ 初始的待爬队列 """ 4 | weiboID = [ 5 | '1797054534', '2509414473', '2611478681', '5861859392', '2011086863', '5127716917', '1259110474', '5850775634', '1886437464', 6 | '3187474530', '2191982701', '1940562032', '5874450550', '1337925752', '2081079420', '5664530558', '3493173952', '1202806915', 7 | '1864507535', '2032640064', '5585682587', '3083673764', '5342109866', '5878685868', '5728706733', '2103050415', '5876752562', 8 | '3138085045', '5775974583', '1879400644', '2417139911', '5836619975', '5353816265', '5219508427', '1766613205', '2480158031', 9 | '5660754163', '2456764664', '3637354755', '1940087047', '5508473104', '1004454162', '2930327837', '1874608417', '5379621155', 10 | '1720664360', '2714280233', '3769073964', '5624119596', '2754904375', '5710151998', '5331042630', '5748179271', '2146132305', 11 | '2313896275', '3193618787', '5743059299', '1742930277', '5310538088', '1794474362', '2798510462', '3480076671', '5678653833', 12 | '5743657357', '5460191980', '1734164880', '5876988653', '5678031258', '5860163996', '1496924574', '5878970110', '1679704482', 13 | '1142210982', '3628925351', '1196397981', '1747485107', '5675893172', '5438521785', '2192269762', '1992614343', '5878686155', 14 | '2407186895', '5559116241', '2528477652', '1295950295', '5038203354', '3659276765', '2126733792', '5878350307', '2761179623', 15 | '5484511719', '5825708520', '1578230251', '5878686190', '5810946551', '3833070073', '1795047931', '5855789570', '3580125714', 16 | '5709578773', '5236539926', '2907633071', '1709244961', '5405450788', '3251257895', '5054538290', '2713199161', '5698445883', 17 | '1784537661', '3195290182', '1824506454', '5738766939', '5565915740', '5336031840', '5098775138', '5685568105', '1774289524', 18 | '2932662914', '5433223957', '2680044311', '1111523983', '5067889432', '5878686362', '2844992161', '3878314663', '1766548141', 19 | '5763269297', '5878383287', '5235499706', '5876375670', '5866447563', '5129945819', '1704116960', '1929380581', '1223762662', 20 | '1193476843', '2899591923', '5162099453', '5072151301', '5385741066', '5411455765', '2685535005', '2297905950', '1216766752', 21 | '5838668577', 
'5359133478', '3077460103', '5577802539', '5862392623', '1786700611', '1259258694', '1845191497', '1731838797', 22 | '1740301135', '2816074584', '1217733467', '5345035105', '5050827618', '5486257001', '5767857005', '2050605943', '5733778298', 23 | '1914725244', '5872583558', '5604377483', '1253491601', '5554922386', '3170223002', '5662737311', '3217179555', '1538163622', 24 | '5304533928', '5644198830', '1896650227', '5298774966', '2795873213', '1834378177', '5769651141', '2656256971', '5876433869', 25 | '1826792401', '3002246100', '3082519511', '5780366296', '5704696797', '5204108258', '2090615793', '1739746131', '1378010100', 26 | '5741331445', '2376442895', '3638486041', '5781365789', '1827234850', '5703214121', '1855398955', '1227908142', '5703820334', 27 | ] 28 | -------------------------------------------------------------------------------- /src/Sina_spider3/yumdama.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import httplib, mimetypes, urlparse, json, time 3 | 4 | ###################################################################### 5 | 6 | # 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html 7 | # 所有函数请查询 http://www.yundama.com/apidoc 8 | 9 | # 1. http://www.yundama.com/index/reg/developer 注册开发者账号 10 | # 2. http://www.yundama.com/developer/myapp 添加新软件 11 | # 3. 使用添加的软件ID和密钥进行开发,享受丰厚分成 12 | 13 | # 用户名 14 | username = 'XXXXXX' 15 | 16 | # 密码 17 | password = 'XXXXXX' 18 | 19 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 20 | appid = 1 21 | 22 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 23 | appkey = 'XXXXXX' 24 | 25 | # 图片文件 26 | filename = 'ab.png' 27 | 28 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 29 | codetype = 1004 30 | 31 | # 超时时间,秒 32 | timeout = 60 33 | 34 | 35 | ###################################################################### 36 | 37 | class YDMHttp: 38 | apiurl = 'http://api.yundama.com/api.php' 39 | 40 | username = '' 41 | password = '' 42 | appid = '' 43 | appkey = '' 44 | 45 | def __init__(self, username, password, appid, appkey): 46 | self.username = username 47 | self.password = password 48 | self.appid = str(appid) 49 | self.appkey = appkey 50 | 51 | def request(self, fields, files=[]): 52 | try: 53 | response = post_url(self.apiurl, fields, files) 54 | response = json.loads(response) 55 | except Exception as e: 56 | response = None 57 | return response 58 | 59 | def balance(self): 60 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 61 | 'appkey': self.appkey} 62 | response = self.request(data) 63 | if (response): 64 | if (response['ret'] and response['ret'] < 0): 65 | return response['ret'] 66 | else: 67 | return response['balance'] 68 | else: 69 | return -9001 70 | 71 | def login(self): 72 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 73 | 'appkey': self.appkey} 74 | response = self.request(data) 75 | if (response): 76 | if (response['ret'] and response['ret'] < 0): 77 | return response['ret'] 78 | else: 79 | return response['uid'] 80 | else: 81 | return -9001 82 | 83 | def upload(self, filename, codetype, timeout): 84 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 85 | 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 86 | file = {'file': filename} 87 | response = self.request(data, file) 88 | if (response): 89 | if (response['ret'] and response['ret'] < 0): 90 | 
return response['ret'] 91 | else: 92 | return response['cid'] 93 | else: 94 | return -9001 95 | 96 | def result(self, cid): 97 | data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 98 | 'appkey': self.appkey, 'cid': str(cid)} 99 | response = self.request(data) 100 | return response and response['text'] or '' 101 | 102 | def decode(self, filename, codetype, timeout): 103 | cid = self.upload(filename, codetype, timeout) 104 | if (cid > 0): 105 | for i in range(0, timeout): 106 | result = self.result(cid) 107 | if (result != ''): 108 | return cid, result 109 | else: 110 | time.sleep(1) 111 | return -3003, '' 112 | else: 113 | return cid, '' 114 | 115 | 116 | ###################################################################### 117 | 118 | def post_url(url, fields, files=[]): 119 | urlparts = urlparse.urlsplit(url) 120 | return post_multipart(urlparts[1], urlparts[2], fields, files) 121 | 122 | 123 | def post_multipart(host, selector, fields, files): 124 | content_type, body = encode_multipart_formdata(fields, files) 125 | h = httplib.HTTP(host) 126 | h.putrequest('POST', selector) 127 | h.putheader('Host', host) 128 | h.putheader('Content-Type', content_type) 129 | h.putheader('Content-Length', str(len(body))) 130 | h.endheaders() 131 | h.send(body) 132 | errcode, errmsg, headers = h.getreply() 133 | return h.file.read() 134 | 135 | 136 | def encode_multipart_formdata(fields, files=[]): 137 | BOUNDARY = 'WebKitFormBoundaryJKrptX8yPbuAJLBQ' 138 | CRLF = '\r\n' 139 | L = [] 140 | for field in fields: 141 | key = field 142 | value = fields[key] 143 | L.append('--' + BOUNDARY) 144 | L.append('Content-Disposition: form-data; name="%s"' % key) 145 | L.append('') 146 | L.append(value) 147 | for field in files: 148 | key = field 149 | filepath = files[key] 150 | L.append('--' + BOUNDARY) 151 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filepath)) 152 | L.append('Content-Type: %s' % get_content_type(filepath)) 153 | L.append('') 154 | L.append(open(filepath, 'rb').read()) 155 | L.append('--' + BOUNDARY + '--') 156 | L.append('') 157 | body = CRLF.join(L) 158 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 159 | return content_type, body 160 | 161 | 162 | def get_content_type(filename): 163 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 164 | 165 | 166 | ###################################################################### 167 | 168 | 169 | def identify(): 170 | if (username == 'username'): 171 | print '请设置好相关参数再测试' 172 | else: 173 | # 初始化 174 | yundama = YDMHttp(username, password, appid, appkey) 175 | 176 | # 登陆云打码 177 | uid = yundama.login(); 178 | # print 'uid: %s' % uid 179 | 180 | # 查询余额 181 | balance = yundama.balance(); 182 | # print 'balance: %s' % balance 183 | 184 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 185 | cid, result = yundama.decode(filename, codetype, timeout); 186 | # print 'cid: %s, result: %s' % (cid, result) 187 | return result 188 | -------------------------------------------------------------------------------- /src/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/chromedriver.exe -------------------------------------------------------------------------------- /src/cookies(try).txt: -------------------------------------------------------------------------------- 1 | ['{"SUB": 
"_2A253J_OfDeRhGeBK6lYS8y_NyjuIHXVUVWJXrDV_PUNbm9BeLU77kW9NR8utTT0yzIaoF5HGd-EmRCrfeBXzKM7Q", 2 | "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt", 3 | "ALF": "1543812943", 4 | "SCF": "Ag6Ni5azmMQ957nRhDnSGJmVHxiHUapdd61kG2zCA9K9KPjccT2UsglYOS8Q8AEo0toYwdOe_F9aMaVIp3FLogA.", 5 | "ALC": "ac%3D27%26bt%3D1512276943%26cv%3D5.0%26et%3D1543812943%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D05f090c8e2e71b7d774d498801c8df66", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==", 6 | "tgc": "TGT-NjQxNDMzMTExNw==-1512276943-gz-82D6EDC638D32EF9BFFEC6DAA8FBAE03-1", 7 | "LT": "1512276943"}', 8 | '{"SUB": "_2A253J_OfDeRhGeBK6VAQ8C3FzDmIHXVUVWJXrDV_PUNbm9BeLRf3kW9NR8utfC-xU6BDb6Et1jLT7Q9nEMXvUbDL", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt", "ALF": "1543812944", "SCF": "AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3c4bd7iQCwA_8N_QyL5ZWWcbjXfIUDptjuZiDo7Oq2j9E.", "ALC": "ac%3D27%26bt%3D1512276944%26cv%3D5.0%26et%3D1543812944%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D2%26es%3Db1b7b23a680b8271df5b3f8ff0978986", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==", "tgc": "TGT-NjQyMjEwMzk3NQ==-1512276943-gz-31834B2CBDB9E531BFC2FF211A10A8C4-1", "LT": "1512276944"}'] 9 | 10 | {"SUB": "_2A253J_TsDeRhGeBK6lYS8y_NyjuIHXVUVWEkrDV_PUNbm9BeLXLZkW9NR8utTQQCqTejxWGdgx4RHgURbRfeKnlU", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt", "ALF": "1543813180", "SCF": "At5EcfdKs8RgVrq2WrBz31C3JbisCJmAnfw_7tmYLzucchfl2npE6hG6bxG7aGDtKb5dWqhxB1u5jydhLJAFvGQ.", "ALC": "ac%3D27%26bt%3D1512277180%26cv%3D5.0%26et%3D1543813180%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3Dde87590dae6f45a12243530dbf1f209d", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==", "tgc": "TGT-NjQxNDMzMTExNw==-1512277180-gz-1C384532703AE9AA3CF8E0C9D3C78530-1", "LT": "1512277180"} 11 | 12 | [{'SUB': '_2A253J96tDeRhGeBK6lYS8y_NyjuIHXVUVLdlrDV_PUNbm9BeLUSikW9NR8utTU46f9Z1Wf4a_FrrtdzGVO6eYg2J', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt', 'ALF': '1543823997', 'SCF': 'AvtM_MHAGrJD8WT8psqmc_baZii63aFFNnjUs0Kx1UZ7UxFjSWW9MODX8k2SK8S_S_4EexNVt9CwATDlx7_MaCk.', 'ALC': 'ac%3D27%26bt%3D1512287997%26cv%3D5.0%26et%3D1543823997%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D7aede323af92903f037bee59c6a2497b', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==', 'tgc': 'TGT-NjQxNDMzMTExNw==-1512287997-gz-82E1CAFE0818D0316258D0028E4781D9-1', 'LT': '1512287997'}, 13 | {'SUB': '_2A253J96uDeRhGeBK6VAQ8C3FzDmIHXVUVLdmrDV_PUNbm9BeLUnZkW9NR8utfJ8o12XYCwhV8uoqegvx8lLCjceC', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt', 'ALF': '1543823998', 'SCF': 'AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3c1vGL6dX62yPY_77XZzbZ4hzGG7RPsnTsGpOb0Ybr0o4.', 'ALC': 
'ac%3D27%26bt%3D1512287998%26cv%3D5.0%26et%3D1543823998%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D8ff4ae6bd2388a48f842a6da4cd4a07d', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==', 'tgc': 'TGT-NjQyMjEwMzk3NQ==-1512287998-gz-2E50C7C8D61D64155895A7D27ACBB3B0-1', 'LT': '1512287998'}] 14 | 15 | [{'SUB': '_2A253J8CyDeRhGeBK6lYS8y_NyjuIHXVUVLV6rDV_PUNbm9BeLWbNkW9NR8utTR2RqV0cpDAVZdu20M3Lsv62gF6H', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt', 'ALF': '1543824482', 'SCF': 'AvtM_MHAGrJD8WT8psqmc_baZii63aFFNnjUs0Kx1UZ7M8uryPp95dGAZ5dbY7nVglXtwSWFN4UbDwVyOwLKXlc.', 'ALC': 'ac%3D27%26bt%3D1512288482%26cv%3D5.0%26et%3D1543824482%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D2%26es%3Dda69c114d39f358c2ee0e1b023148b0b', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==', 'tgc': 'TGT-NjQxNDMzMTExNw==-1512288482-gz-796C0D50F4566A5F8BB38982D7D415EB-1', 'LT': '1512288482'}, 16 | {'SUB': '_2A253J8CzDeRhGeBK6VAQ8C3FzDmIHXVUVLV7rDV_PUNbm9BeLUbhkW9NR8utfFOAJ5M8AwzMq-OZPJ9nwOF6MOys', 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt', 'ALF': '1543824483', 'SCF': 'AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3cI4SOLSi7zj_q6Q-sRCkvQ1flnpEiR49tBIINTa8-XGY.', 'ALC': 'ac%3D27%26bt%3D1512288483%26cv%3D5.0%26et%3D1543824483%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D2%26es%3D07719a22f45017ae94e27314d78d8b77', 'sso_info': 'v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==', 'tgc': 'TGT-NjQyMjEwMzk3NQ==-1512288483-gz-98F65A3B74E854C842A1D6F7BA5C266B-1', 'LT': '1512288483'}] 17 | 18 | ['{"SUB": "_2A253IKJtDeRhGeBK6lYS8y_NyjuIHXVUV5SlrDV_PUNbm9BeLVL4kW9NR8utTZIZ3GE_eZIFKU8JXhMGiyLZOHUa", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhAnc96kpmQ5izHQ-M0.hf95NHD95QcSh2Xe0epeK2NWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBpShe0eK2pS7tt", "ALF": "1543898557", "SCF": "AvtM_MHAGrJD8WT8psqmc_baZii63aFFNnjUs0Kx1UZ7_GE9xQnS9YxrxtZ3QJfz5vilaQuQI2Rp1Y96oeNfPRo.", "ALC": "ac%3D27%26bt%3D1512362557%26cv%3D5.0%26et%3D1543898557%26ic%3D-611539878%26scf%3D%26uid%3D6414331117%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D44ff4bd6c3e5f54404f51281fa6e2fe2", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4S0jLOMsYyThLeJp5WpmYO0to2DhLSMs4yxjJOEtw==", "tgc": "TGT-NjQxNDMzMTExNw==-1512362557-gz-A15094AC1DA33790F11FA13BF4C2DBC1-1", "LT": "1512362557"}', '{"SUB": "_2A253IKJuDeRhGeBK6VAQ8C3FzDmIHXVUV5SmrDV_PUNbm9BeLVPNkW9NR8utfCIEKBaKXbjbPLQXwGCjAl9X_Rx3", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5.-ZSvErZGOQ1.QU_-o.Ac5NHD95QcShzEeK501KMfWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSoBEeo27e0.NSBtt", "ALF": "1543898558", "SCF": "AvavFy_Cd-s791I3uWbUZrMHiaCPKBbK44RrNftotY3cUDGokgFmPgwrICSJaq-JrP7DN9tHw_8WnP71flX0Vu4.", "ALC": "ac%3D27%26bt%3D1512362558%26cv%3D5.0%26et%3D1543898558%26ic%3D-611539878%26scf%3D%26uid%3D6422103975%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D22cea20a5941a2f3ffdbd8f8d756dcc8", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaNg4iyjJOAs46TnLWJp5WpmYO0to2DiLKMk4CzjpOctQ==", "tgc": "TGT-NjQyMjEwMzk3NQ==-1512362558-gz-4CE45D4E434B23FDDBBD66722625BA5A-1", "LT": "1512362558"}', '{"SUB": "_2A253IKJvDeRhGeNJ61UY8SzFzz-IHXVUV5SnrDV_PUNbm9BeLWT6kW9NS-ueMidblNF3-Xvz7h5b0SRkUCYNevb8", "SUBP": 
"0033WrSXqPxfM725Ws9jqgMF55529P9D9WhQYAFbpdVSPmb1zZRc_y5o5NHD95QfS05N1K2E1KB0Ws4Dqcjci--fiKyFi-isi--Ni-82iK.ci--Ni-82iK.pi--fiKnfi-iFi--Ri-z7iKy8i--ciK.ci-2f", "ALF": "1543898559", "SCF": "AiPttH0k24V2BSsqaUeK1grOIZ5cZZMUbngtrx787lf10y_gUiHtPH6_1wYTYiYU9gZ0wtjb3357yOHGS0ZUW70.", "ALC": "ac%3D27%26bt%3D1512362559%26cv%3D5.0%26et%3D1543898559%26ic%3D-611539878%26scf%3D%26uid%3D5707912943%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D4657943575c6abc01e10b0bad733a3dd", "sso_info": "v02m6alo5qztKWRk5SljoSIpZCkmKWRk5ylkJSQpY6TmKWRk5ylkJSQpY6ThKWRk5SljoOUpZCkiKWRk6ClkKOApY6ElKWRk5iljpOYpZCTlKadlqWkj5OUt4yDnLmMk4i5jYOMwA==", "tgc": "TGT-NTcwNzkxMjk0Mw==-1512362559-gz-D0E6B9753936773DCDB242BC365DB1CD-1", "LT": "1512362559"}', '{"SUB": "_2A253IKIQDeRhGeNJ61sU8y7NzTSIHXVUV5TYrDV_PUNbm9BeLVrEkW9NS-gN80urLJV2RAiHaLuXpLXAOkkZgE6E", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WhI7S.CQz7ZCvg1.r4fXrmW5NHD95QfS054SKe7eKqRWs4Dqcjci--fi-8si-iWi--ciKL2iKn7i--Xi-zRiKyWi--fiKL2iKL2i--Xi-iFi-2fi--Ri-2ciKnc", "ALF": "1543898560", "SCF": "Au2fEGreoPmIQTogBV7H6aKO_QuHq6N0O4jF6CLzGnXOrGYFMTbrhzKDiwWFHTwsR08C2dinckeHrOLXkZgARxg.", "ALC": "ac%3D27%26bt%3D1512362560%26cv%3D5.0%26et%3D1543898560%26ic%3D-611539878%26scf%3D%26uid%3D5709530168%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D89ad3e1b6ef03c5773d35ee42a6fd23e", "sso_info": "v02m6alo5qztKWRk5SlkJSYpZCkhKWRk5iljpSQpY6DgKWRk5ClkKOgpY6EhKWRk5SljpSQpY6UkKWRk5ClkKSIpZCTlKWRk6ClkJOYpY6DmKadlqWkj5OUt4yDpLWMs4CxjaOgwA==", "tgc": "TGT-NTcwOTUzMDE2OA==-1512362560-gz-BBAA64432EB0BB53614714271076317E-1", "LT": "1512362560"}', '{"SUB": "_2A253IKIRDeRhGeNJ6VYS-CjEyDyIHXVUV5TZrDV_PUNbm9BeLVb3kW9NS_Amb56o_iwCfT4LciqjB2egwTyGlOOa", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WWaJu4.SN-wOGkOfNOlLGni5NHD95QfS0zXe0nc1he7Ws4Dqcjci--ciKL8i-zci--Ri-88i-24i--ci-8hi-2Ei--fiK.ciKLhi--ciK.Ri-8si--4iKL2iKL8", "ALF": "1543898561", "SCF": "AvFTp3DMCcec05kPPAss21MmHrXolkjrDf7ZZdz5O9_jLhgz5JjWNh8HVo6JNI9p9Xl7fphmaq_Yr5YMcTMj8d0.", "ALC": "ac%3D19%26bt%3D1512362561%26cv%3D5.0%26et%3D1543898561%26ic%3D-611539878%26scf%3D%26uid%3D5724386830%26vf%3D1%26vs%3D0%26vt%3D4%26es%3De9f4b38125173257023bd932942cde44", "sso_info": "v02m6alo5qztKWRk5iljpSUpZCjmKWRk6ClkJSUpZCTpKWRk5ilkJSMpZCTiKWRk5SljpOYpY6UjKWRk5iljpOgpZCUmKWRk6SljpSQpY6UlKadlqWkj5OUt4yjkLOOg5i4jLOAwA==", "tgc": "TGT-NTcyNDM4NjgzMA==-1512362561-gz-4CB2B2AA3612BABE7DC1422ACC113933-1", "LT": "1512362561"}', '{"SUB": "_2A253IKISDeRhGeNJ6lIV9y3KyTqIHXVUV5TarDV_PUNbm9BeLVTMkW9NS-jWtQFQa3jT016M--usQjNo35yKKpbN", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4AbLNhGKLG81j0opnQ1iX5NHD95QfS027ShM0SozcWs4Dqcjci--Ri-z7iKLhi--fi-82iKL2i--ci-z4iK.ci--ciKLhi-8hi--ciKL2iKysi--ciKyWi-ih", "ALF": "1543898562", "SCF": "AnUo8TBZrAHEYykaGzvHxD4_Vmse4peKtNi0sDOfXy8zDRlBQspGQIaBIfWmUaoFS738RoE9ROl3nriS2FAAETY.", "ALC": "ac%3D27%26bt%3D1512362562%26cv%3D5.0%26et%3D1543898562%26ic%3D-611539878%26scf%3D%26uid%3D5710473626%26vf%3D1%26vs%3D0%26vt%3D4%26es%3D1f0656c8485bd122d6da6d8bcf291877", "sso_info": "v02m6alo5qztKWRk6ClkKOApY6UjKWRk5SlkJSQpY6UkKWRk5ilkKOkpY6TmKWRk5iljpSMpZCUjKWRk5iljpSQpY6EmKWRk5iljoSEpZCkjKadlqWkj5OUt4yTgLSNs4y2jKOYwA==", "tgc": "TGT-NTcxMDQ3MzYyNg==-1512362562-gz-F0A9EC765F0AE8EFEE62ECB62C8F4E11-1", "LT": "1512362562"}'] -------------------------------------------------------------------------------- /src/launch.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute("scrapy crawl SinaSpider".split()) 4 | -------------------------------------------------------------------------------- 
/src/mysql.py: -------------------------------------------------------------------------------- 1 | import MySQLdb 2 | conn=MySQLdb.connect(host='localhost', 3 | user='root', 4 | passwd='chen960212', 5 | db='sinaweibo') 6 | cursor = conn.cursor() 7 | cursor.execute ("SELECT VERSION()") 8 | row = cursor.fetchone () 9 | print "server version:", row[0] 10 | cursor.close() 11 | conn.close() 12 | 13 | ''' 14 | #Python 2.7.6 (default, Nov 10 2013, 19:24:24) [MSC v.1500 64 bit (AMD64)] on win32 15 | import MySQLdb 16 | conn = MySQLdb.connect(host='localhost',port=3306,user='root',passwd='chen960212',db='test') 17 | cur = conn.cursor() 18 | cur.execute('select `title`, `text` from `entries` limit 10') 19 | 2L 20 | cur.fetchall() 21 | (('bokeyuan', 'bokeyuan text...'), ('google translate', 'google translate text...')) 22 | cur.close() 23 | conn.close() 24 | ''' 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /src/pipelines.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | # __________________________________________ 4 | # 增加了向Mysql数据库中保存pipeline 5 | # 需要有MysqlDB,同时修改Spider文件,增加Item类所有变量的if else的返回值,使得可以标准化存储 6 | # Updated by Charles Yan 7 | # Date:2017.1.4 8 | # Added Mysql insert method 9 | # ------------------------------------------ 10 | 11 | import pymongo 12 | from Sina_spider3.items import InformationItem, TweetsItem, RelationshipsItem 13 | import MySQLdb 14 | 15 | 16 | 17 | class MysqlDBPipleline(object): 18 | def __init__(self): 19 | self.count = 1 20 | self.conn = MySQLdb.connect( 21 | host='localhost', 22 | port=3306, 23 | user='root', 24 | #这里填写密码 25 | passwd='chen960212', 26 | db='sinaweibo', 27 | charset='utf8', 28 | ) 29 | self.cur = self.conn.cursor() 30 | 31 | def process_item(self, item, spider): 32 | """ 判断item的类型,并作相应的处理,再入数据库 """ 33 | if isinstance(item, RelationshipsItem): 34 | try: 35 | print("***********at beginning of saving**********") 36 | print(dict(item)) 37 | sql = '' 38 | sql+=str('INSERT INTO SinaWeibo.Relationship (`Host1`,`Host2`) ') 39 | sql+=str(' Values(\'' ) 40 | sql+=str(item['Host1']) 41 | print(sql) 42 | sql+=str('\', \'') 43 | sql+=str(item['Host2']) 44 | sql+=str('\')') 45 | print("*********** SQL SYNTAX *********** ") 46 | print(''.join(sql)) 47 | self.cur.execute(sql) 48 | self.conn.commit() 49 | print("saved") 50 | self.count = self.count +1 51 | print(self.count) 52 | except Exception: 53 | pass 54 | elif isinstance(item, TweetsItem): 55 | try: 56 | print("***********at beginning of saving**********") 57 | 58 | sql = '' 59 | sql+=str('INSERT INTO SinaWeibo.Tweets (`weibo_id`,`User_id`,`Content`,`Pubtime`,`Coordinates`,`Tools`,`Likes`,`Comments`,`Transfers`) ') 60 | sql+=str(' Values(\'' ) 61 | sql+=str(item['_id']) 62 | 63 | sql+=str('\', \'') 64 | sql+=str(item['ID']) 65 | sql+=str('\', \'') 66 | sql+=str(item['Content']) 67 | sql+=str('\', \'') 68 | sql+=str(item['PubTime']) 69 | 70 | sql+=str('\', \'') 71 | 72 | sql+=str(item['Co_oridinates']) 73 | 74 | sql+=str('\', \'') 75 | sql+=str(item['Tools']) 76 | print(sql) 77 | sql+=str('\', \'') 78 | sql+=str(item['Like']) 79 | sql+=str('\', \'') 80 | sql+=str(item['Comment']) 81 | sql+=str('\', \'') 82 | sql+=str(item['Transfer']) 83 | sql+=str('\')') 84 | print("*********** SQL SYNTAX *********** ") 85 | print(''.join(sql)) 86 | self.cur.execute(sql) 87 | self.conn.commit() 88 | print("saved") 89 | self.count = self.count +1 90 | print(self.count) 91 | except Exception: 92 | pass 93 | 
elif isinstance(item, InformationItem): 94 | try: 95 | print("***********at beginning of saving**********") 96 | 97 | sql = '' 98 | sql+=str('INSERT INTO SinaWeibo.Information (`User_id`,`NickName`,`Gender`,`Province`,`City`,`BriefIntroduction`,`Birthday`,`Num_Tweets`,`Num_Follows`,`Num_Fans`,`SexOrientation`,`Sentiment`,`VIPlevel`,`Authentication`,`URL`) ') 99 | sql+=str(' Values(\'' ) 100 | sql+=str(item['_id']) 101 | 102 | sql+=str('\', \'') 103 | sql+=str(item['NickName']) 104 | sql+=str('\', \'') 105 | sql+=str(item['Gender']) 106 | sql+=str('\', \'') 107 | sql+=str(item['Province']) 108 | 109 | sql+=str('\', \'') 110 | sql+=str(item['City']) 111 | sql+=str('\', \'') 112 | sql+=str(item['BriefIntroduction']) 113 | sql+=str('\', \'') 114 | print(sql) 115 | sql+=str(item['Birthday']) 116 | sql+=str('\', \'') 117 | sql+=str(item['Num_Tweets']) 118 | 119 | sql+=str('\', \'') 120 | sql+=str(item['Num_Follows']) 121 | sql+=str('\', \'') 122 | sql+=str(item['Num_Fans']) 123 | sql+=str('\', \'') 124 | 125 | sql+=str(item['SexOrientation']) 126 | sql+=str('\', \'') 127 | sql+=str(item['Sentiment']) 128 | 129 | sql+=str('\', \'') 130 | sql+=str(item['VIPlevel']) 131 | sql+=str('\', \'') 132 | sql+=str(item['Authentication']) 133 | sql+=str('\', \'') 134 | sql+=str(item['URL']) 135 | sql+=str('\')') 136 | 137 | print("*********** SQL SYNTAX *********** ") 138 | print(''.join(sql)) 139 | self.cur.execute(sql) 140 | self.conn.commit() 141 | print("saved") 142 | self.count = self.count +1 143 | print(self.count) 144 | except Exception: 145 | pass 146 | 147 | ##在Java开发中,Dao连接会对内存溢出,需要定时断开重连,这里不清楚是否需要,先加上了 148 | if self.count == 1000: 149 | print("try reconnecting") 150 | self.count = 0 151 | self.cur.close() 152 | self.conn.close() 153 | self.conn = MySQLdb.connect( 154 | host='localhost', 155 | port=3306, 156 | user='root', 157 | passwd='***', 158 | db='SinaWeibo', 159 | charset='utf8', 160 | ) 161 | self.cur = self.conn.cursor() 162 | print("reconnect") 163 | 164 | return item 165 | 166 | 167 | 168 | class MongoDBPipleline(object): 169 | def __init__(self): 170 | clinet = pymongo.MongoClient("localhost", 27017) 171 | db = clinet["Sina"] 172 | self.Information = db["Information"] 173 | self.Tweets = db["Tweets"] 174 | self.Relationships = db["Relationships"] 175 | 176 | def process_item(self, item, spider): 177 | """ 判断item的类型,并作相应的处理,再入数据库 """ 178 | if isinstance(item, RelationshipsItem): 179 | try: 180 | self.Relationships.insert(dict(item)) 181 | except Exception: 182 | pass 183 | elif isinstance(item, TweetsItem): 184 | try: 185 | self.Tweets.insert(dict(item)) 186 | except Exception: 187 | pass 188 | elif isinstance(item, InformationItem): 189 | try: 190 | self.Information.insert(dict(item)) 191 | except Exception: 192 | pass 193 | return item 194 | -------------------------------------------------------------------------------- /src/readme..txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/readme..txt -------------------------------------------------------------------------------- /src/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Sina_spider3.settings 8 | 9 | [deploy] 10 | #url 
= http://localhost:6800/ 11 | project = Sina_spider3 12 | -------------------------------------------------------------------------------- /src/sql语句.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/sql语句.sql -------------------------------------------------------------------------------- /src/sql语句2.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `relationship` ( 2 | `Host1` VARCHAR(50) NULL DEFAULT NULL, 3 | `Host2` VARCHAR(50) NULL DEFAULT NULL 4 | ) 5 | COLLATE='utf8_general_ci' 6 | ENGINE=InnoDB 7 | ; 8 | SHOW CREATE TABLE `sinaweibo`.`relationship`; 9 | SHOW CREATE TABLE `sinaweibo`.`tweets`; 10 | SHOW CREATE TABLE `sinaweibo`.`relationship`; 11 | SHOW CREATE TABLE `sinaweibo`.`information`; 12 | 13 | 14 | 15 | CREATE TABLE `information` ( 16 | `User_id` VARCHAR(50) NULL DEFAULT NULL, 17 | `NickName` VARCHAR(50) NULL DEFAULT NULL, 18 | `Gender` VARCHAR(50) NULL DEFAULT NULL, 19 | `Province` VARCHAR(50) NULL DEFAULT NULL, 20 | `City` VARCHAR(50) NULL DEFAULT NULL, 21 | `BriefIntroduction` VARCHAR(50) NULL DEFAULT NULL, 22 | `Birthday` VARCHAR(50) NULL DEFAULT NULL, 23 | `Num_Tweets` VARCHAR(50) NULL DEFAULT NULL, 24 | `Num_Follows` VARCHAR(50) NULL DEFAULT NULL, 25 | `Num_Fans` VARCHAR(50) NULL DEFAULT NULL, 26 | `SexOrientation` VARCHAR(50) NULL DEFAULT NULL, 27 | `Sentiment` VARCHAR(50) NULL DEFAULT NULL, 28 | `VIPlevel` VARCHAR(50) NULL DEFAULT NULL, 29 | `Authentication` VARCHAR(50) NULL DEFAULT NULL, 30 | `URL` VARCHAR(50) NULL DEFAULT NULL 31 | ) 32 | COLLATE='utf8_general_ci' 33 | ENGINE=InnoDB 34 | ; 35 | 36 | 37 | CREATE TABLE `tweets` ( 38 | `weibo_id` VARCHAR(50) NULL DEFAULT NULL, 39 | `User_id` VARCHAR(50) NULL DEFAULT NULL, 40 | `Content` VARCHAR(50) NULL DEFAULT NULL, 41 | `Pubtime` VARCHAR(50) NULL DEFAULT NULL, 42 | `Coordinates` VARCHAR(50) NULL DEFAULT NULL, 43 | `Tools` VARCHAR(50) NULL DEFAULT NULL, 44 | `Likes` INT(11) NULL DEFAULT NULL, 45 | `Comments` INT(11) NULL DEFAULT NULL, 46 | `Transfers` INT(11) NULL DEFAULT NULL 47 | ) 48 | COLLATE='utf8_general_ci' 49 | ENGINE=InnoDB 50 | ; 51 | -------------------------------------------------------------------------------- /src/用到的工具.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/src/用到的工具.txt -------------------------------------------------------------------------------- /visio制图/E-R图_Tweets.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/E-R图_Tweets.vsdx -------------------------------------------------------------------------------- /visio制图/E-R图_information.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/E-R图_information.vsdx -------------------------------------------------------------------------------- /visio制图/E-R图_relationships.vsdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/E-R图_relationships.vsdx -------------------------------------------------------------------------------- /visio制图/数据库概念模型E-R图.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/visio制图/数据库概念模型E-R图.vsdx -------------------------------------------------------------------------------- /宋少忠_毕业论文终稿查重版陈巍瑜_大雅详细报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/宋少忠_毕业论文终稿查重版陈巍瑜_大雅详细报告.pdf -------------------------------------------------------------------------------- /开题答辩报告/开题报告1稿.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/开题答辩报告/开题报告1稿.docx -------------------------------------------------------------------------------- /开题答辩报告/开题报告2稿.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/开题答辩报告/开题报告2稿.docx -------------------------------------------------------------------------------- /毕业论文终稿.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/毕业论文终稿.doc -------------------------------------------------------------------------------- /毕业设计微博json数据.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/毕业设计微博json数据.rar -------------------------------------------------------------------------------- /毕设答辩pt.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/毕设答辩pt.pptx -------------------------------------------------------------------------------- /论文二稿/readme..txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/readme..txt -------------------------------------------------------------------------------- /论文二稿/摘要与关键字.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/摘要与关键字.docx -------------------------------------------------------------------------------- /论文二稿/第一章/论文初稿_绪论.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第一章/论文初稿_绪论.docx -------------------------------------------------------------------------------- /论文二稿/第七章/第7章结论与展望.docx: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第七章/第7章结论与展望.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.1_需求.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.1_需求.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.2.1_非关系型数据库mongodb及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.2.1_非关系型数据库mongodb及其搭建.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.2.4_redis简介及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.2.4_redis简介及其搭建.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.3_Scrapy框架.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.3_Scrapy框架.docx
--------------------------------------------------------------------------------
/论文二稿/第三章/3.4_Srcapy+redis架构.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第三章/3.4_Srcapy+redis架构.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.1_爬虫的分类与作用.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.1_爬虫的分类与作用.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.2_http协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.2_http协议.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.3_rebots协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.3_rebots协议.docx
--------------------------------------------------------------------------------
/论文二稿/第二章/2.4_爬虫搜索策略-防止环路的出现.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第二章/2.4_爬虫搜索策略-防止环路的出现.docx
--------------------------------------------------------------------------------
/论文二稿/第五章/第五章测试.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第五章/第五章测试.docx
--------------------------------------------------------------------------------
/论文二稿/第六章/6.1_数据模型.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第六章/6.1_数据模型.docx
--------------------------------------------------------------------------------
/论文二稿/第六章/6.2_数据分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第六章/6.2_数据分析.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.1_微博移动版web分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.1_微博移动版web分析.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.2_User-agent伪装.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.2_User-agent伪装.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.3_信息过滤规则-正则表达式.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.3_信息过滤规则-正则表达式.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.4_查重.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.4_查重.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.5_反爬技术.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.5_反爬技术.docx
--------------------------------------------------------------------------------
/论文二稿/第四章/4.6_Cookie池.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/第四章/4.6_Cookie池.docx
--------------------------------------------------------------------------------
/论文二稿/致谢.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/致谢.docx
--------------------------------------------------------------------------------
/论文二稿/草稿.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/草稿.docx
--------------------------------------------------------------------------------
/论文二稿/论文初稿_参考文献.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/论文初稿_参考文献.docx
--------------------------------------------------------------------------------
/论文二稿/论文初稿_目录.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/论文初稿_目录.docx
--------------------------------------------------------------------------------
/论文二稿/题目.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文二稿/题目.docx
--------------------------------------------------------------------------------
/论文初稿/第一章/论文初稿_绪论.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第一章/论文初稿_绪论.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.1.1_非关系型数据库mongodb及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.1.1_非关系型数据库mongodb及其搭建.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.1.4_redis简介及其搭建.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.1.4_redis简介及其搭建.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.2_Scrapy框架.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.2_Scrapy框架.docx
--------------------------------------------------------------------------------
/论文初稿/第三章/3.3_Srcapy+redis架构.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第三章/3.3_Srcapy+redis架构.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.1_爬虫的分类与作用.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.1_爬虫的分类与作用.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.2_http协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.2_http协议.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.3_rebots协议.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.3_rebots协议.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.4_微博移动版web分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.4_微博移动版web分析.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.5_User-agent伪装.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.5_User-agent伪装.docx
--------------------------------------------------------------------------------
/论文初稿/第二章/2.6_信息过滤规则-正则表达式.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第二章/2.6_信息过滤规则-正则表达式.docx
--------------------------------------------------------------------------------
/论文初稿/第五章/5.1_数据模型.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第五章/5.1_数据模型.docx
--------------------------------------------------------------------------------
/论文初稿/第五章/5.2_数据分析.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第五章/5.2_数据分析.docx
--------------------------------------------------------------------------------
/论文初稿/第六章/论文初稿_总结与展望.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第六章/论文初稿_总结与展望.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.1_爬虫搜索策略-防止环路的出现.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.1_爬虫搜索策略-防止环路的出现.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.2_查重.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.2_查重.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.3_反爬技术.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.3_反爬技术.docx
--------------------------------------------------------------------------------
/论文初稿/第四章/4.4_Cookie池.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/第四章/4.4_Cookie池.docx
--------------------------------------------------------------------------------
/论文初稿/论文初稿_参考文献.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/论文初稿_参考文献.docx
--------------------------------------------------------------------------------
/论文初稿/论文初稿_目录.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/论文初稿_目录.docx
--------------------------------------------------------------------------------
/论文初稿/附录/环境.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文初稿/附录/环境.txt
--------------------------------------------------------------------------------
/论文改一.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文改一.docx
--------------------------------------------------------------------------------
/论文改二.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiyu666/Graduation_Design-Distributed_Web_Spider/a0fbe38b1dbab61217fc3253571aaf09f0a9dd59/论文改二.docx
--------------------------------------------------------------------------------