├── README.md
├── demos_carprice
│   └── __init__.py
├── demos_dangdang
│   ├── README.md
│   ├── __init__.py
│   ├── book_fetcher.py
│   ├── book_parser.py
│   └── book_saver.py
├── demos_doubanmovies
│   ├── __init__.py
│   ├── movie_fetcher.py
│   └── movie_parser.py
├── demos_nbastats
│   ├── __init__.py
│   └── nba_main.py
├── demos_taobao
│   ├── chromedriver72
│   └── taobao.py
├── demos_weibo
│   ├── __init__.py
│   ├── weibo_login.py
│   ├── weibo_search.py
│   └── weibo_user.py
├── demos_weixin
│   ├── __init__.py
│   └── weixin_public.py
├── demos_yundama
│   ├── __init__.py
│   └── yundama.py
├── demos_zhihu
│   ├── __init__.py
│   └── zhihu_login.py
├── otherfiles
│   ├── Dockerfile
│   └── pylint.conf
└── test_demos.py

/README.md:
--------------------------------------------------------------------------------
1 | # PSpiderDemos
2 | demos based on PSpider
3 | 
4 | Since PSpider is updated constantly, these demos are not guaranteed to keep working!
5 | 
--------------------------------------------------------------------------------
/demos_carprice/__init__.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | 
3 | """
4 | Crawl car prices from different car websites
5 | """
6 | 
--------------------------------------------------------------------------------
/demos_dangdang/README.md:
--------------------------------------------------------------------------------
1 | ## Crawling Dangdang children's book data
2 | 
3 | ### Crawl all book links
4 | 
5 | Crawl with `key = "lists"`; the crawled links are stored directly in the database.
6 | 
7 | ### Crawl the detailed information of each book page
8 | 
9 | - Crawl with `key = "detail"`.
10 | - On a Dangdang book page, sections such as the content recommendation, media reviews and author introduction are only rendered by `javascript` once they scroll into view, so when crawling with `selenium + PhantomJS` the window has to be made large and the page given time to finish loading.
11 | - A `selenium` `driver` is single-threaded, and opening and closing a `driver` for every fetch is too expensive, so I modified the framework to accept a `list` of `fetcher` instances when the `spider` is initialized.
12 | 
13 | ### Files
14 | 
15 | - `demos_dangdang`: holds the three classes `fetcher`, `parser` and `saver`
16 | - `dangdang_book.py`: fetches the links and then the detail pages in two steps (merged into test_demos.py)
17 | 
--------------------------------------------------------------------------------
/demos_dangdang/__init__.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | 
3 | """
4 | Crawl all book titles from Dangdang's children's section
5 | """
6 | 
7 | from .book_fetcher import BookFetcher
8 | from .book_parser import BookParser
9 | from .book_saver import BookSaver
10 | 
--------------------------------------------------------------------------------
/demos_dangdang/book_fetcher.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import logging 5 | import requests.adapters 6 | from selenium import webdriver 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | from selenium.webdriver.common.by import By 10 | import sys 11 | requests.packages.urllib3.disable_warnings() 12 | 13 | 14 | class BookFetcher(spider.Fetcher): 15 | 16 | def __init__(self): 17 | spider.Fetcher.__init__(self, normal_max_repeat=3, normal_sleep_time=0, critical_max_repeat=3, critical_sleep_time=0) 18 | self.driver = webdriver.PhantomJS(service_args=['--load-images=no']) 19 | self.driver.set_window_size(1120, 2000) 20 | return 21 | 22 | def clear_session(self): 23 | self.driver.delete_all_cookies() 24 | return 25 | 26 | def driver_quit(self): 27 | self.driver.quit() 28 | return 29 | 30 | def url_fetch(self, url, keys, critical, fetch_repeat): 31 | try: 32 | logging.warning("-------------------------------") 33 | if keys[0] == "detail": 34 | logging.warning("fetch %s", url) 35 | x_str = "//*[@id='detail'][contains(@isloaded, '1')]" 36 | self.driver.get(url) 37 | element_present
= EC.presence_of_element_located((By.XPATH, x_str)) 38 | WebDriverWait(self.driver, 60).until(element_present) 39 | except: 40 | logging.warning("Unexpected error: %s", sys.exc_info()[0]) 41 | #self.clear_session() 42 | return 0, "" 43 | response = self.driver.page_source 44 | if not response: 45 | logging.warning("not response %s", response) 46 | return 0, "" 47 | logging.warning("fetch done!") 48 | return 1, response 49 | -------------------------------------------------------------------------------- /demos_dangdang/book_parser.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import re 5 | import urllib 6 | import logging 7 | from bs4 import BeautifulSoup 8 | 9 | 10 | class BookParser(spider.Parser): 11 | 12 | def __init__(self): 13 | spider.Parser.__init__(self) 14 | self.contents_need = ['isbn', 'pic', 'title', 'con_reco', 'comment', 'brand', 'series', 'author', 'author_origin', 'author_country', 'translator', 'publicator', 'author_prize', 'book_prize', 'raw_title', 'age', 'responsibility', 'lan', 'words', 'size', 'binding', 'pub_date', 'pub_times', 'pages', 'price', 'editor_reco', 'media_reco', 'author_intro', 'review_num', 'dangdang_rank'] 15 | 16 | def clean_str(self, in_str): 17 | clean = re.compile('<.*?>') 18 | clean_text = re.sub(clean, '', in_str) 19 | return clean_text 20 | 21 | def getdetail_descripe(self, soup): 22 | p_info = soup.find(id="detail_describe") 23 | series = "" 24 | isbn = "" 25 | pub_times = "" 26 | pages = "" 27 | words = "" 28 | pub_date = "" 29 | size = "" 30 | binding = "" 31 | if p_info: 32 | li_list = p_info.findAll("li") 33 | if li_list: 34 | try: 35 | for item in li_list: 36 | item_str = str(item.get_text().strip()) 37 | if item_str.find("版 次") > -1: 38 | pub_times = item_str.replace("版 次:", "") 39 | continue 40 | if item_str.find("页 数") > -1: 41 | pages = item_str.replace("页 数:", "") 42 | continue 43 | if item_str.find("字 数") > -1: 44 | words = item_str.replace("字 数:", "") 45 | continue 46 | if item_str.find("印刷时间") > -1: 47 | pub_date = item_str.replace("印刷时间:", "") 48 | continue 49 | if item_str.find("开 本") > -1: 50 | size = item_str.replace("开 本:", "") 51 | continue 52 | if item_str.find("包 装") > -1: 53 | binding = item_str.replace("包 装:", "") 54 | continue 55 | if item_str.find("国际标准书号ISBN") > -1: 56 | isbn = item_str.replace("国际标准书号ISBN:", "") 57 | continue 58 | if item_str.find("丛书名") > -1: 59 | series = item_str.replace("丛书名:", "") 60 | # print [series, isbn, pub_times, pages, words, pub_date, size, binding] 61 | return [series, isbn, pub_times, pages, words, pub_date, size, binding] 62 | except: 63 | pass 64 | return ["", "", "", "", "", "", "", ""] 65 | return ["", "", "", "", "", "", "", ""] 66 | else: 67 | return ["", "", "", "", "", "", "", ""] 68 | 69 | def get_title(self, soup): 70 | name_tag = soup.find("div", class_="name_info") 71 | if name_tag: 72 | title_tag = name_tag.find("h1") 73 | if title_tag: 74 | try: 75 | title = title_tag.get_text().strip() 76 | return str(title) 77 | except: 78 | return "" 79 | else: 80 | return "" 81 | else: 82 | return "" 83 | 84 | def get_comment(self, soup): 85 | name_tag = soup.find("div", class_="name_info") 86 | if name_tag: 87 | content_tag = name_tag.find("h2") 88 | if content_tag: 89 | try: 90 | content = content_tag.get_text().strip() 91 | return str(content) 92 | except: 93 | return "" 94 | else: 95 | return "" 96 | else: 97 | return "" 98 | 99 | def get_content(self, soup): 100 | content_tag = 
soup.find(id="content") 101 | if content_tag: 102 | dsc_tag = content_tag.find(id="content-textarea") 103 | if not dsc_tag: 104 | dsc_tag = content_tag.find("div", class_="descrip") 105 | try: 106 | content = dsc_tag.get_text().strip() 107 | return self.clean_str(str(content)) 108 | except: 109 | return "" 110 | else: 111 | return "" 112 | 113 | def get_country(self, soup): 114 | name_tag = soup.find(id="author") 115 | name = "" 116 | if name_tag: 117 | name = str(name_tag.get_text()).strip().replace("作者:", "") 118 | pattern1 = re.compile(r'.*\【(.+?)\】.*') 119 | pattern2 = re.compile(r'.*\[(.+?)\].*') 120 | pattern3 = re.compile(r'.*\((.+?)\).*') 121 | pattern_list = [pattern1, pattern2, pattern3] 122 | for pattern in pattern_list: 123 | match = pattern.match(name) 124 | if match: 125 | return match.group(1) 126 | return "" 127 | 128 | def get_author_and_trans(self, soup): 129 | name_tag = soup.find(id="author") 130 | author = "" 131 | trans = "" 132 | previous = "" 133 | if name_tag: 134 | for item_l in name_tag.contents: 135 | if type(item_l).__name__ == "NavigableString": 136 | content = str(item_l.string).strip() 137 | if content.find("译") > -1: 138 | if content.find("编译") > -1: 139 | content.replace("编译", "") 140 | else: 141 | content.replace("译", "") 142 | trans = previous 143 | previous = "" 144 | else: 145 | if content.find("作者") > -1: 146 | previous = previous + content.replace("作者:", "") 147 | else: 148 | if content and content != ",": 149 | author = author + previous + content 150 | previous = "" 151 | else: 152 | previous = previous + " " + content 153 | else: 154 | content = str(item_l.get_text()).strip() 155 | previous = previous + content 156 | author = author + previous 157 | return [author, trans] 158 | else: 159 | return ["", ""] 160 | 161 | def get_publicator(self, soup): 162 | pub_tag = soup.find("span", {"dd_name": "出版社"}) 163 | if pub_tag: 164 | pub_a = pub_tag.find("a") 165 | if pub_a: 166 | return str(pub_a.get_text()).strip() 167 | return "" 168 | else: 169 | return "" 170 | 171 | def get_price(self, soup): 172 | o_price_tag = soup.find(id="original-price") 173 | if o_price_tag: 174 | return str(o_price_tag.get_text()).strip() 175 | price_tag = soup.find(id="dd-price") 176 | if price_tag: 177 | return str(price_tag.get_text()).strip() 178 | else: 179 | return "" 180 | 181 | def get_editor_reco(self, soup): 182 | abstract_tag = soup.find(id="abstract") 183 | editor_reco = "" 184 | if abstract_tag: 185 | reco_tag = abstract_tag.find(id="abstract-all") 186 | if reco_tag: 187 | editor_reco = str(reco_tag.get_text()).strip() 188 | if not editor_reco: 189 | dis_tag = abstract_tag.find("div", class_="descrip") 190 | if dis_tag: 191 | editor_reco = str(dis_tag.get_text()).strip() 192 | return editor_reco 193 | 194 | def get_media_reco(self, soup): 195 | content_tag = soup.find(id="mediaFeedback") 196 | if content_tag: 197 | media_tag = content_tag.find(id="mediaFeedback-textarea") 198 | if not media_tag: 199 | media_tag = content_tag.find("div", class_="descrip") 200 | try: 201 | content = media_tag.get_text().strip() 202 | return self.clean_str(str(content)) 203 | except: 204 | return "" 205 | else: 206 | return "" 207 | 208 | def get_author_intro(self, soup): 209 | content_tag = soup.find(id="authorIntroduction") 210 | if content_tag: 211 | media_tag = content_tag.find(id="authorIntroduction-textarea") 212 | if not media_tag: 213 | media_tag = content_tag.find("div", class_="descrip") 214 | try: 215 | content = media_tag.get_text().strip() 216 | return 
self.clean_str(str(content)) 217 | except: 218 | return "" 219 | else: 220 | return "" 221 | 222 | def get_reviws_number(self, soup): 223 | num_tag = soup.find(id="comm_num_down") 224 | if num_tag: 225 | return str(num_tag.get_text()).strip() 226 | else: 227 | return "" 228 | 229 | def get_rank(self, soup): 230 | rank_tag = soup.find("span", {"dd_name": "图书排行榜排名"}) 231 | if rank_tag: 232 | return str(rank_tag.get_text()).strip() 233 | else: 234 | return "" 235 | 236 | def get_pic(self, soup, lines_map): 237 | lines_map[self.contents_need[1]] = "miss" 238 | img_tag = soup.find(id="main-img-slider") 239 | img_url = [] 240 | if img_tag: 241 | img_lists = img_tag.findAll("a") 242 | if img_lists: 243 | for img_li in img_lists: 244 | if str(img_li["data-imghref"]) not in img_url: 245 | img_url.append(str(img_li["data-imghref"])) 246 | lines_map[self.contents_need[1]] = ','.join(img_url) 247 | 248 | 249 | def init_map(self): 250 | lines_map = {} 251 | c_len = len(self.contents_need) 252 | for i in range(0, c_len): 253 | lines_map[self.contents_need[i]] = '' 254 | # lines_map['age'] = age[option] 255 | return lines_map 256 | 257 | def write_to_line(self, lines_map): 258 | line = [] 259 | c_len = len(self.contents_need) 260 | for i in range(0, c_len): 261 | line.append(lines_map[self.contents_need[i]]) 262 | return line 263 | 264 | def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content): 265 | url_list, save_list = [], [] 266 | soup = BeautifulSoup(content, "lxml") 267 | if keys[0] == "lists": 268 | ul_tag = soup.find("ul", class_="list_aa listimg") 269 | if not ul_tag: 270 | return 1, url_list, save_list 271 | lists_tag = ul_tag.findAll("a", class_="pic") 272 | if not lists_tag: 273 | return 1, url_list, save_list 274 | for link in lists_tag: 275 | save_list.append([str(link["href"]), str(link["title"])]) 276 | elif keys[0] == "detail": 277 | lines_map = self.init_map() 278 | [series, isbn, pub_times, pages, words, pub_date, size, binding] = self.getdetail_descripe(soup) 279 | lines_map[self.contents_need[6]] = series 280 | lines_map[self.contents_need[0]] = isbn 281 | lines_map[self.contents_need[22]] = pub_times 282 | lines_map[self.contents_need[23]] = pages 283 | lines_map[self.contents_need[18]] = words 284 | lines_map[self.contents_need[21]] = pub_date 285 | lines_map[self.contents_need[19]] = size 286 | lines_map[self.contents_need[20]] = binding 287 | lines_map[self.contents_need[2]] = self.get_title(soup) 288 | lines_map[self.contents_need[3]] = self.get_content(soup) 289 | lines_map[self.contents_need[4]] = self.get_comment(soup) 290 | [author, translator] = self.get_author_and_trans(soup) 291 | lines_map[self.contents_need[7]] = author 292 | lines_map[self.contents_need[10]] = translator 293 | lines_map[self.contents_need[9]] = self.get_country(soup) 294 | lines_map[self.contents_need[11]] = self.get_publicator(soup) 295 | lines_map[self.contents_need[24]] = self.get_price(soup) 296 | lines_map[self.contents_need[25]] = self.get_editor_reco(soup) 297 | lines_map[self.contents_need[26]] = self.get_media_reco(soup) 298 | lines_map[self.contents_need[27]] = self.get_author_intro(soup) 299 | lines_map[self.contents_need[28]] = self.get_reviws_number(soup) 300 | lines_map[self.contents_need[29]] = self.get_rank(soup) 301 | self.get_pic(soup, lines_map) 302 | item = self.write_to_line(lines_map) 303 | item.append(url) 304 | #logging.warning(item) 305 | save_list.append(item) 306 | return 1, url_list, save_list 307 | 
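To make the numeric indexing in `htm_parse` above easier to follow: `init_map` creates a dict keyed by the names in `contents_need`, the helper methods fill it, `write_to_line` flattens it back into a list in `contents_need` order, the page url is appended last, and `BookSaver.item_save` (next file) binds that list positionally to the same column names plus `link`. A minimal sketch of this ordering contract, with made-up values, purely for illustration:

```python
# Illustration only, not part of the demo: the parser-to-saver ordering contract.
contents_need = ["isbn", "pic", "title"]               # first three names of BookParser.contents_need
lines_map = {name: "" for name in contents_need}       # what init_map() builds
lines_map["isbn"] = "9781234567890"                    # hypothetical parsed values
lines_map["title"] = "some picture book"
item = [lines_map[name] for name in contents_need]     # what write_to_line() returns
item.append("http://product.dangdang.com/12345.html")  # url appended at the end of htm_parse(), hypothetical
print(item)  # positions must line up with the column list in BookSaver's INSERT (plus link last)
```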
-------------------------------------------------------------------------------- /demos_dangdang/book_saver.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import pymysql 5 | 6 | 7 | class BookSaver(spider.Saver): 8 | 9 | def __init__(self): 10 | spider.Saver.__init__(self) 11 | self.conn = pymysql.connect(host="localhost", user="username", password="password", db="dangdang_book", charset="utf8") 12 | self.cursor = self.conn.cursor() 13 | self.conn.autocommit(1) 14 | return 15 | 16 | def item_save(self, url, keys, item): 17 | ''' 18 | self.cursor.execute("insert into t_doubanmovies (m_url, m_name, m_year, m_imgurl, m_director, m_writer, m_actors, " 19 | "m_genre, m_country, m_language, m_release, m_season, m_jishu, m_length, m_alias, m_website, m_dbsite, " 20 | "m_imdb, m_score, m_comment, m_starpercent)" 21 | " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", 22 | [i.strip() if i is not None else "" for i in item]) 23 | ''' 24 | if keys[0] == "lists": 25 | self.cursor.execute("insert into book_urls (url, title) values(%s, %s);", [i.strip() if i is not None else "" for i in item]) 26 | elif keys[0] == "detail": 27 | 28 | self.cursor.execute( 29 | "insert into book_detail (isbn, pic, title, con_reco, comment, brand, series, " 30 | "author, author_origin, author_country, translator, publicator, author_prize, book_prize, raw_title, age, responsibility, " 31 | "lan, words, size, binding, pub_date, pub_times, pages, price, editor_reco, media_reco, author_intro, review_num, dangdang_rank, link)" 32 | " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", 33 | [i.strip() if i is not None else "" for i in item]) 34 | 35 | return True 36 | -------------------------------------------------------------------------------- /demos_doubanmovies/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 抓取豆瓣电影的全部数据 5 | """ 6 | 7 | from .movie_fetcher import MovieFetcher 8 | from .movie_parser import MovieParser 9 | -------------------------------------------------------------------------------- /demos_doubanmovies/movie_fetcher.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import random 4 | import string 5 | import spider 6 | import logging 7 | import requests 8 | import requests.adapters 9 | requests.packages.urllib3.disable_warnings() 10 | 11 | 12 | class MovieFetcher(spider.Fetcher): 13 | 14 | def __init__(self): 15 | spider.Fetcher.__init__(self, max_repeat=3, sleep_time=0) 16 | 17 | self.session = requests.Session() 18 | self.session.mount('https://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)) 19 | self.clear_session() 20 | return 21 | 22 | def clear_session(self): 23 | self.session.headers.clear() 24 | self.session.cookies.clear() 25 | self.session.headers = { 26 | "User-Agent": spider.make_random_useragent("pc"), 27 | "Host": "movie.douban.com", 28 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 29 | "Accept-Encoding": "gzip, deflate, sdch, br", 30 | "Accept-Language": "zh-CN, zh; q=0.8, en; q=0.6", 31 | "Cookie": "bid=%s" % "".join(random.sample(string.ascii_letters + string.digits, 11)) 32 | } 33 | return 34 | 35 | def url_fetch(self, url, keys, repeat): 36 | 
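        # (added note) fetch a movie.douban.com page through the shared Session; on any
        # non-200 status the random "bid" cookie is rotated via clear_session() and
        # raise_for_status() is raised so the caller can retry the url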
resp = self.session.get(url, allow_redirects=False, verify=False, timeout=5) 37 | if resp.status_code == 200: 38 | return 1, resp.text 39 | logging.warning("Fetcher change cookie: %s", resp.status_code) 40 | self.clear_session() 41 | resp.raise_for_status() 42 | return 1, resp.text 43 | -------------------------------------------------------------------------------- /demos_doubanmovies/movie_parser.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | class MovieParser(spider.Parser): 8 | 9 | def htm_parse(self, priority, url, keys, deep, content): 10 | url_list, save_list = [], [] 11 | soup = BeautifulSoup(content, "html5lib") 12 | 13 | if keys[0] == "index": 14 | # 获取列表页中所有的电影页面Url 15 | div_movies = soup.find_all("a", class_="nbg", title=True) 16 | url_list.extend([(item.get("href"), ("detail", keys[1]), 0) for item in div_movies]) 17 | 18 | # 获取列表页的下一页 19 | next_page = soup.find("span", class_="next") 20 | if next_page: 21 | next_page_a = next_page.find("a") 22 | if next_page_a: 23 | url_list.append((next_page_a.get("href"), ("index", keys[1]), 1)) 24 | else: 25 | content = soup.find("div", id="content") 26 | 27 | # 标题 28 | name_and_year = [item.get_text() for item in content.find("h1").find_all("span")] 29 | name, year = name_and_year if len(name_and_year) == 2 else (name_and_year[0], "") 30 | movie = [url, name.strip(), year.strip("()")] 31 | 32 | # 左边 33 | content_left = soup.find("div", class_="subject clearfix") 34 | 35 | nbg_soup = content_left.find("a", class_="nbgnbg").find("img") 36 | movie.append(nbg_soup.get("src") if nbg_soup else "") 37 | 38 | info = content_left.find("div", id="info").get_text() 39 | info_dict = dict([line.strip().split(":", 1) for line in info.strip().split("\n") if line.strip().find(":") > 0]) 40 | 41 | movie.append(info_dict.get("导演", "").replace("\t", " ")) 42 | movie.append(info_dict.get("编剧", "").replace("\t", " ")) 43 | movie.append(info_dict.get("主演", "").replace("\t", " ")) 44 | 45 | movie.append(info_dict.get("类型", "").replace("\t", " ")) 46 | movie.append(info_dict.get("制片国家/地区", "").replace("\t", " ")) 47 | movie.append(info_dict.get("语言", "").replace("\t", " ")) 48 | 49 | movie.append(info_dict.get("上映日期", "").replace("\t", " ") if "上映日期" in info_dict else info_dict.get("首播", "").replace("\t", " ")) 50 | movie.append(info_dict.get("季数", "").replace("\t", " ")) 51 | movie.append(info_dict.get("集数", "").replace("\t", " ")) 52 | movie.append(info_dict.get("片长", "").replace("\t", " ") if "片长" in info_dict else info_dict.get("单集片长", "").replace("\t", " ")) 53 | 54 | movie.append(info_dict.get("又名", "").replace("\t", " ")) 55 | movie.append(info_dict.get("官方网站", "").replace("\t", " ")) 56 | movie.append(info_dict.get("官方小站", "").replace("\t", " ")) 57 | movie.append(info_dict.get("IMDb链接", "").replace("\t", " ")) 58 | 59 | # 右边 60 | content_right = soup.find("div", class_="rating_wrap clearbox") 61 | if content_right: 62 | movie.append(content_right.find("strong", class_="ll rating_num").get_text()) 63 | 64 | rating_people = content_right.find("a", class_="rating_people") 65 | movie.append(rating_people.find("span").get_text() if rating_people else "") 66 | 67 | rating_per_list = [item.get_text() for item in content_right.find_all("span", class_="rating_per")] 68 | movie.append(", ".join(rating_per_list)) 69 | else: 70 | movie.extend(["", "", ""]) 71 | 72 | assert len(movie) == 21, "length of movie is invalid" 73 | 
save_list.append(movie) 74 | return 1, url_list, save_list 75 | -------------------------------------------------------------------------------- /demos_nbastats/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 从NBA官网http://stats.nba.com/获取球员数据 5 | """ 6 | -------------------------------------------------------------------------------- /demos_nbastats/nba_main.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import requests 5 | 6 | # NBA球员索引URL 7 | url_player_index = "http://stats.nba.com/stats/commonallplayers?IsOnlyCurrentSeason=1&LeagueID=00&Season=2016-17" 8 | 9 | # NBA球员统计数据URL,传递参数PlayerID和PerMode("PerGame", "Totals") 10 | url_player_stats = "http://stats.nba.com/stats/playercareerstats?LeagueID=00&PlayerID=%s&PerMode=%s" 11 | 12 | 13 | # 定义抓取过程 14 | class NBAFetcher(spider.Fetcher): 15 | 16 | def url_fetch(self, url, keys, critical, fetch_repeat): 17 | """ 18 | 这里只需要重写url_fetch函数,参数含义及返回结果见框架 19 | """ 20 | headers = {"User-Agent": spider.make_random_useragent("pc"), "Accept-Encoding": "gzip"} 21 | response = requests.get(url, headers=headers, timeout=10) 22 | return 1, (response.json(), ) 23 | 24 | 25 | # 定义解析过程 26 | class NBAParser(spider.Parser): 27 | 28 | def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content): 29 | """ 30 | 这里只需要重写htm_parse函数,参数含义及返回结果见框架 31 | """ 32 | url_list, saver_list = [], [] 33 | if keys[0] == "index": 34 | # 解析索引页 35 | content_json = content[0] 36 | 37 | # 解析所有的球员 38 | for item in content_json["resultSets"][0]["rowSet"]: 39 | # 这里放入url_list的item为(url, keys, critical, priority), 注意这里keys的用法 40 | url_list.append((url_player_stats % (item[0], "Totals"), ("Totals", item[2]), True, 0)) 41 | url_list.append((url_player_stats % (item[0], "PerGame"), ("PerGame", item[2]), True, 0)) 42 | else: 43 | # 解析球员数据页 44 | content_json = content[0] 45 | 46 | # 解析球员数据 47 | saver_list = content_json["resultSets"][0]["rowSet"] 48 | return 1, url_list, saver_list 49 | 50 | 51 | # 定义保存过程 52 | class NBASaver(spider.Saver): 53 | 54 | def __init__(self, file_name_total, file_name_pergame): 55 | """ 56 | 构造函数,重写的目的是为了添加表头,并且不同的数据源写入到不同的文件 57 | """ 58 | spider.Saver.__init__(self) 59 | 60 | # 打开文件,并写入表头 61 | self.save_pipe_total = open(file_name_total, "w", encoding="utf-8") 62 | self.save_pipe_total.write("\t".join(["PLAYER_NAME", "PLAYER_ID", "SEASON_ID", "LEAGUE_ID", "TEAM_ID", "TEAM_ABBREVIATION", "PLAYER_AGE", 63 | "GP", "GS", "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", 64 | "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]) + "\n") 65 | self.save_pipe_pergame = open(file_name_pergame, "w", encoding="utf-8") 66 | self.save_pipe_pergame.write("\t".join(["PLAYER_NAME", "PLAYER_ID", "SEASON_ID", "LEAGUE_ID", "TEAM_ID", "TEAM_ABBREVIATION", "PLAYER_AGE", 67 | "GP", "GS", "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", 68 | "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]) + "\n") 69 | return 70 | 71 | def item_save(self, url, keys, item): 72 | """ 73 | 这里只需要重写item_save函数,参数含义及返回结果见框架 74 | """ 75 | if keys[0] == "Totals": 76 | self.save_pipe_total.write("\t".join([keys[1]] + [str(i) for i in item]) + "\n") 77 | elif keys[0] == "PerGame": 78 | self.save_pipe_pergame.write("\t".join([keys[1]] + [str(i) for i in item]) + "\n") 79 | else: 80 | return False 81 | return True 82 | 83 | 84 | 
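# (Added commentary, not part of the original file) How the pieces above fit together:
# the start url is queued with keys=("index",); NBAParser turns every player row of the
# index response into two stats urls with keys=("Totals", player_name) and
# ("PerGame", player_name); NBASaver then uses keys[0] to pick the output file and
# keys[1] to prepend the player name to each saved row. The main block below wires the
# three classes into a spider.WebSpider and fetches with 10 threads.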
if __name__ == "__main__": 85 | """ 86 | main流程 87 | """ 88 | # 初始化fetcher, parser和saver 89 | fetcher = NBAFetcher(critical_max_repeat=3, critical_sleep_time=0) 90 | parser = NBAParser(max_deep=-1, max_repeat=3) 91 | saver = NBASaver(file_name_total="nba_total.txt", file_name_pergame="nba_pergame.txt") 92 | 93 | # 初始化爬虫, 并传入初始Url 94 | nba_spider = spider.WebSpider(fetcher, parser, saver, url_filter=None) 95 | nba_spider.set_start_url(url_player_index, ("index",), critical=True) 96 | 97 | # 开启10个线程抓取数据 98 | nba_spider.start_work_and_wait_done(fetcher_num=10, is_over=True) 99 | 100 | exit() 101 | -------------------------------------------------------------------------------- /demos_taobao/chromedriver72: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xianhu/PSpiderDemos/ac4aa44b17eff223ebee059603bbcd4a8d28926e/demos_taobao/chromedriver72 -------------------------------------------------------------------------------- /demos_taobao/taobao.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 模拟登陆淘宝,并抓取商品 5 | """ 6 | 7 | import time 8 | from urllib.parse import quote 9 | from pyquery import PyQuery 10 | from selenium import webdriver 11 | from selenium.webdriver.common.by import By 12 | from selenium.common.exceptions import TimeoutException 13 | from selenium.webdriver.support.wait import WebDriverWait 14 | from selenium.webdriver.support import expected_conditions 15 | from selenium.webdriver import ActionChains 16 | 17 | 18 | # webdriver 19 | option = webdriver.ChromeOptions() 20 | # option.add_argument("--proxy-server=127.0.0.1:9000") 21 | option.add_experimental_option("excludeSwitches", ["enable-automation"]) 22 | # option.add_argument("--headless") 23 | browser = webdriver.Chrome("./chromedriver72", options=option) 24 | 25 | 26 | def login(name, password): 27 | """ 28 | 登陆 29 | """ 30 | url = "https://login.taobao.com/member/login.jhtml" 31 | browser.get(url) 32 | try: 33 | browser.find_element_by_css_selector("div.login-switch #J_Quick2Static").click() 34 | except Exception as excep: 35 | print(excep) 36 | 37 | # 输入用户名密码 38 | browser.find_element_by_id("TPL_username_1").send_keys(name) 39 | browser.find_element_by_id("TPL_password_1").send_keys(password) 40 | time.sleep(1) 41 | 42 | try: 43 | # 拖动滑块 44 | slider = browser.find_element_by_css_selector("#nc_1_n1z") 45 | action = ActionChains(browser) 46 | action.drag_and_drop_by_offset(slider, 500, 0).perform() 47 | time.sleep(3) 48 | except Exception as excep: 49 | print(excep) 50 | 51 | time.sleep(2) 52 | browser.find_element_by_id("J_SubmitStatic").click() 53 | return 54 | 55 | 56 | def index_page(page, key): 57 | print("正在爬去第", page, "页") 58 | try: 59 | browser.get("https://s.taobao.com/search?q="+quote(key)) 60 | try: 61 | slider2 = browser.find_element_by_css_selector("#nc_1__scale_text span.nc-lang-cnt") 62 | action2 = ActionChains(browser) 63 | action2.drag_and_drop_by_offset(slider2, 500, 0).perform() 64 | time.sleep(5) 65 | except Exception as excep: 66 | print(excep) 67 | 68 | input1 = wait.until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager div.form > input"))) 69 | submit = wait.until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager div.form > span.btn.J_Submit"))) 70 | input1.clear() 71 | input1.send_keys(page) 72 | submit.click() 73 | 74 | wait.until(expected_conditions.text_to_be_present_in_element((By.CSS_SELECTOR, 
"#mainsrp-pager li.item.active > span"), str(page))) 75 | wait.until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, ".m-itemlist .items .item"))) 76 | 77 | # 获取商品 78 | doc = PyQuery(browser.page_source) 79 | items = doc("#mainsrp-itemlist .items .item").items() 80 | for item in items: 81 | product = { 82 | "image": item.find(".pic .img").attr("data-src"), 83 | "price": item.find(".price").text(), 84 | "deal": item.find(".deal-cnt").text(), 85 | "title": item.find(".title").text(), 86 | "shop": item.find(".shop").text(), 87 | "location": item.find(".location").text(), 88 | } 89 | print(product) 90 | except TimeoutException: 91 | index_page(page, key) 92 | except Exception as excep: 93 | print(excep) 94 | return 95 | 96 | 97 | if __name__ == "__main__": 98 | wait = WebDriverWait(browser, 10) 99 | login("username", "password") 100 | # 抓取商品 101 | for x in range(2, 10): 102 | time.sleep(5) 103 | index_page(x, "python") 104 | browser.close() 105 | -------------------------------------------------------------------------------- /demos_weibo/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 获取微博数据 5 | """ 6 | 7 | from .weibo_user import WeiBoUser 8 | from .weibo_search import WeiBoSearch 9 | -------------------------------------------------------------------------------- /demos_weibo/weibo_login.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import re 4 | import rsa 5 | import ssl 6 | import time 7 | import json 8 | import queue 9 | import base64 10 | import spider 11 | import logging 12 | import binascii 13 | import urllib.parse 14 | ssl._create_default_https_context = ssl._create_unverified_context 15 | 16 | 17 | class WeiBoLogin(object): 18 | """ 19 | class of WeiBoLogin, to login weibo.com 20 | """ 21 | 22 | def __init__(self): 23 | """ 24 | constructor 25 | """ 26 | self.user_name = None 27 | self.pass_word = None 28 | self.user_uniqueid = None 29 | self.user_nick = None 30 | 31 | self.cookie_jar, self.opener = None, None 32 | self.yundama = spider.YunDaMa("", "") 33 | return 34 | 35 | def login(self, user_name, pass_word, proxies=None): 36 | """ 37 | login weibo.com, return True or False 38 | """ 39 | self.user_name = user_name 40 | self.pass_word = pass_word 41 | self.user_uniqueid = None 42 | self.user_nick = None 43 | 44 | self.cookie_jar, self.opener = spider.make_cookiejar_opener(is_cookie=True, proxies=proxies) 45 | self.opener.addheaders = spider.make_headers( 46 | user_agent="pc", 47 | host="weibo.com", 48 | referer="http://weibo.com/", 49 | accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 50 | accept_encoding="gzip, deflate", 51 | accept_language="zh-CN,zh;q=0.8" 52 | ).items() 53 | self.opener.open("http://weibo.com/login.php") 54 | 55 | # get json data 56 | s_user_name = self.get_username() 57 | json_data = self.get_json_data(su_value=s_user_name) 58 | if not json_data: 59 | return False 60 | s_pass_word = self.get_password(json_data["servertime"], json_data["nonce"], json_data["pubkey"]) 61 | 62 | # make post_dict 63 | post_dict = { 64 | "entry": "weibo", 65 | "gateway": "1", 66 | "from": "", 67 | "savestate": "7", 68 | "userticket": "1", 69 | "vsnf": "1", 70 | "service": "miniblog", 71 | "encoding": "UTF-8", 72 | "pwencode": "rsa2", 73 | "sr": "1280*800", 74 | "prelt": "529", 75 | "url": 
"http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack", 76 | "rsakv": json_data["rsakv"], 77 | "servertime": json_data["servertime"], 78 | "nonce": json_data["nonce"], 79 | "su": s_user_name, 80 | "sp": s_pass_word, 81 | "returntype": "TEXT", 82 | } 83 | 84 | # get captcha code 85 | if json_data["showpin"] == 1: 86 | url = "http://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(time.time()), json_data["pcid"]) 87 | cid, code = self.yundama.get_captcha(self.opener.open(url).read(), "captcha.jpeg", "image/jpeg", codetype="1005") 88 | if not code: 89 | return False 90 | else: 91 | post_dict["pcid"] = json_data["pcid"] 92 | post_dict["door"] = code 93 | 94 | # login weibo.com 95 | login_url_1 = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)&_=%d" % int(time.time()) 96 | json_data_1 = json.loads(spider.get_html_content(self.opener.open(login_url_1, data=spider.make_post_data(post_dict)))) 97 | if json_data_1["retcode"] == "0": 98 | # callback 99 | post_dict = { 100 | "callback": "sinaSSOController.callbackLoginStatus", 101 | "ticket": json_data_1["ticket"], 102 | "ssosavestate": int(time.time()), 103 | "client": "ssologin.js(v1.4.18)", 104 | "_": int(time.time()*1000), 105 | } 106 | login_url_2 = "https://passport.weibo.com/wbsso/login?" + urllib.parse.urlencode(post_dict) 107 | html_data = spider.get_html_content(self.opener.open(login_url_2), charset="gbk") 108 | json_data_2 = json.loads(re.search("\((?P.*)\)", html_data).group("result")) 109 | if json_data_2["result"] is True: 110 | self.user_uniqueid = json_data_2["userinfo"]["uniqueid"] 111 | self.user_nick = json_data_2["userinfo"]["displayname"] 112 | logging.warning("WeiBoLogin succeed: %s", json_data_2) 113 | else: 114 | logging.warning("WeiBoLogin failed: %s", json_data_2) 115 | else: 116 | logging.warning("WeiBoLogin failed: %s", json_data_1) 117 | return True if self.user_uniqueid and self.user_nick else False 118 | 119 | def get_username(self): 120 | """ 121 | get legal username 122 | """ 123 | username_quote = urllib.parse.quote_plus(self.user_name) 124 | username_base64 = base64.b64encode(username_quote.encode("utf-8")) 125 | return username_base64.decode("utf-8") 126 | 127 | def get_json_data(self, su_value): 128 | """ 129 | get the value of "servertime", "nonce", "pubkey", "rsakv" and "showpin", etc 130 | """ 131 | post_data = urllib.parse.urlencode({ 132 | "entry": "weibo", 133 | "callback": "sinaSSOController.preloginCallBack", 134 | "rsakt": "mod", 135 | "checkpin": "1", 136 | "client": "ssologin.js(v1.4.18)", 137 | "su": su_value, 138 | "_": int(time.time()*1000), 139 | }) 140 | 141 | try: 142 | response = self.opener.open('http://login.sina.com.cn/sso/prelogin.php?'+post_data) 143 | data = spider.get_html_content(response, charset="utf-8") 144 | json_data = json.loads(re.search("\((?P.*)\)", data).group("data")) 145 | except Exception as excep: 146 | json_data = {} 147 | logging.error("WeiBoLogin get_json_data error: %s", excep) 148 | 149 | logging.debug("WeiBoLogin get_json_data: %s", json_data) 150 | return json_data 151 | 152 | def get_password(self, servertime, nonce, pubkey): 153 | """ 154 | get legal password, encrypt file: http://i.sso.sina.com.cn/js/ssologin.js 155 | """ 156 | string = (str(servertime) + '\t' + str(nonce) + '\n' + str(self.pass_word)).encode("utf-8") 157 | public_key = rsa.PublicKey(int(pubkey, 16), int("10001", 16)) 158 | password = rsa.encrypt(string, public_key) 159 | password = binascii.b2a_hex(password) 160 | return 
password.decode() 161 | 162 | 163 | class WeiBoBase(WeiBoLogin): 164 | """ 165 | class of WeiBoBase, as a base class 166 | """ 167 | def __init__(self, users_pair=None): 168 | """ 169 | constructor, users_pair: [(u1, p1), (u2, p2), ...] 170 | """ 171 | WeiBoLogin.__init__(self) 172 | 173 | self.users_pair = users_pair 174 | self.users_index = 0 175 | 176 | self.base_url = "http://weibo.com/" 177 | self.header_re = re.compile("\$CONFIG\[[\'\"](?P[\w]+?)[\'\"]\]=[\'\"](?P[\w]*?)[\'\"]") 178 | 179 | self.fetch_queue = queue.Queue() # unfetched url queue (url, keys, repeat) 180 | self.saved_set = set() # saved url or other id 181 | 182 | self.current_page = 1 # current page which is fetching 183 | self.max_repeat = 5 # maxinum repeat time 184 | self.out_file = None # output file 185 | self.out_list = [] # output list 186 | self.out_length = 0 # output length 187 | return 188 | 189 | def re_login(self): 190 | """ 191 | login repeat according to self.users_index 192 | """ 193 | user_name, pass_word = self.users_pair[self.users_index % len(self.users_pair)] 194 | if not self.login(user_name, pass_word): 195 | exit() 196 | self.users_index += 1 197 | return 198 | -------------------------------------------------------------------------------- /demos_weibo/weibo_search.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import re 4 | import bs4 5 | import time 6 | import json 7 | import spider 8 | import logging 9 | import datetime 10 | import urllib.parse 11 | from .weibo_login import WeiBoBase 12 | 13 | 14 | class WeiBoSearch(WeiBoBase): 15 | """ 16 | class of WeiBoSearch 17 | """ 18 | 19 | def __init__(self, users_pair): 20 | """ 21 | constructor 22 | """ 23 | WeiBoBase.__init__(self, users_pair=users_pair) 24 | 25 | # parameters which are needed in this class 26 | self.search_url = "http://s.weibo.com/weibo/" 27 | self.fetch_keys = tuple() # type: (A, B, ...) 
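        # (added note) fetch_keys/fetch_timescope/fetch_type are set by fetch_search_weibo()
        # and combined by update_fetch_queue() into the s.weibo.com search url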
28 | self.fetch_timescope = None # type: custom:2016-04-01-0:2016-04-02 29 | self.fetch_type = None # type: typeall=1, xsort=hot, scope=ori 30 | return 31 | 32 | def update_fetch_queue(self): 33 | """ 34 | update fetch queue 35 | """ 36 | search_url = self.search_url + urllib.parse.quote(urllib.parse.quote(" ".join(self.fetch_keys))) 37 | search_url += "&%s&suball=1×cope=%s&page=%d" % (self.fetch_type, self.fetch_timescope, self.current_page) 38 | self.fetch_queue.put(item=(search_url, "search", 0)) 39 | return 40 | 41 | def fetch_search_weibo(self, fetch_keys, fetch_timescope, fetch_type="typeall=1", out_file=None): 42 | """ 43 | fetch search weibo 44 | """ 45 | assert fetch_type in ["typeall=1", "xsort=hot", "scope=ori"] 46 | self.re_login() if not self.user_uniqueid else 0 47 | 48 | # base class variables 49 | self.fetch_queue.queue.clear() 50 | self.saved_set.clear() 51 | self.current_page = 1 52 | self.out_file = out_file 53 | self.out_list = [] 54 | self.out_length = 0 55 | 56 | # this class variables 57 | self.fetch_keys = fetch_keys 58 | self.fetch_timescope = fetch_timescope 59 | self.fetch_type = fetch_type 60 | 61 | # update fetch queue 62 | self.update_fetch_queue() 63 | 64 | while self.fetch_queue.qsize() > 0: 65 | url, keys, repeat = self.fetch_queue.get() 66 | logging.debug("WeiBoSearch: keys=%s, repeat=%s, url=%s", keys, repeat, url) 67 | 68 | try: 69 | html_all = spider.get_html_content(self.opener.open(url)) 70 | for sc in re.findall("", html_all): 71 | json_data = json.loads(sc) 72 | 73 | if json_data.get("pid") == "pl_common_sassfilter": 74 | self.check_anti_by_captcha(json_data["html"]) 75 | self.update_fetch_queue() 76 | break 77 | 78 | if json_data.get("pid") == "pl_weibo_direct": 79 | self.parse_search_weibo_page(json_data["html"]) 80 | break 81 | except Exception as excep: 82 | if repeat < self.max_repeat: 83 | self.fetch_queue.put(item=(url, keys, repeat+1)) 84 | else: 85 | logging.error("WeiBoSearch error: %s, url=%s", excep, url) 86 | return 87 | 88 | def parse_search_weibo_page(self, html): 89 | """ 90 | parse search weibo page 91 | """ 92 | soup = bs4.BeautifulSoup(html, "html.parser") 93 | 94 | if soup.find("div", class_="search_noresult"): 95 | logging.warning("WeiBoSearch: no result") 96 | return 97 | 98 | for item in soup.find_all("div", attrs={"action-type": "feed_list_item", "mid": True}): 99 | weibo_id = item.get("mid") 100 | 101 | soup_info = item.find("a", class_="W_textb", date=True) 102 | weibo_url = soup_info.get("href") 103 | weibo_date = datetime.datetime.fromtimestamp(int(soup_info.get("date")) / 1000.0) 104 | weibo_content = spider.get_string_strip(item.find("p", class_="comment_txt").get_text()) 105 | 106 | soup_user = item.find("img", class_="W_face_radius") 107 | weibo_user = soup_user.get("alt") 108 | _, querys = spider.get_url_params("http://xxx.com/?" 
+ soup_user.get("usercard")) 109 | weibo_user_id = querys["id"] if "id" in querys else "" 110 | 111 | weibo_list = (weibo_id, weibo_url, weibo_date, weibo_user_id, weibo_user, weibo_content) 112 | if self.out_file: 113 | self.out_file.write("\t".join(map(str, weibo_list)) + "\n") 114 | else: 115 | self.out_list.append(weibo_list) 116 | self.out_length += 1 117 | logging.debug("WeiBoSearch: current_page=%s, out_length=%s", self.current_page, self.out_length) 118 | 119 | if soup.find("a", class_="page next S_txt1 S_line1", href=True): 120 | self.current_page += 1 121 | self.update_fetch_queue() 122 | return 123 | 124 | def check_anti_by_captcha(self, html): 125 | """ 126 | check anti-spider by captcha 127 | """ 128 | soup = bs4.BeautifulSoup(html, "html.parser") 129 | 130 | cid, code = None, None 131 | while not code: 132 | captcha_url = soup.find("img", attrs={"node-type": "yzm_img"}).get("src") 133 | response = self.opener.open(spider.get_url_legal(captcha_url, self.search_url)) 134 | cid, code = self.yundama.get_captcha(response.read(), "captcha.jpeg", "image/jpeg", codetype="1004") 135 | 136 | verified_url = "http://s.weibo.com/ajax/pincode/verified?__rnd=%d" % int(time.time() * 1000) 137 | post_data = spider.make_post_data({ 138 | "secode": code, 139 | "type": "sass", 140 | "pageid": "weibo", 141 | "_t": 0 142 | }) 143 | temp = json.loads(spider.get_html_content(self.opener.open(verified_url, data=post_data))) 144 | if temp["code"] == "100000": 145 | logging.warning("WeiBoSearch anti-spider succeed") 146 | else: 147 | logging.warning("WeiBoSearch anti-spider failed") 148 | self.yundama.report(cid) if cid else 0 149 | return 150 | -------------------------------------------------------------------------------- /demos_weibo/weibo_user.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import re 4 | import bs4 5 | import sys 6 | import json 7 | import time 8 | import spider 9 | import random 10 | import logging 11 | import datetime 12 | import urllib.parse 13 | from .weibo_login import WeiBoBase 14 | 15 | 16 | class WeiBoUser(WeiBoBase): 17 | """ 18 | class of WeiBoUser 19 | """ 20 | 21 | def __init__(self, users_pair): 22 | """ 23 | constructor 24 | """ 25 | WeiBoBase.__init__(self, users_pair=users_pair) 26 | 27 | # parameters which are needed in this class 28 | self.bar_url = "http://weibo.com/p/aj/v6/mblog/mbloglist?" 
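        # (added note) AJAX endpoint used by fetch_user_weibos() below to page through the
        # lazily loaded "pagebar" blocks of a profile feed (queued with keys == "page_bar")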
29 | self.html_re = re.compile("") 30 | return 31 | 32 | def fetch_user_from_id(self, user_id): 33 | """ 34 | fetch user data from user_id 35 | :return user_name, [user_page_follows, user_page_fans, user_page_weibos], [follows, fans, weibos] 36 | """ 37 | self.re_login() if not self.user_uniqueid else 0 38 | 39 | user_url_base = "http://weibo.com/%s/profile" % user_id.strip() 40 | user_name, user_pages, user_counts = None, [], [] 41 | 42 | repeat_time = 0 43 | while repeat_time <= self.max_repeat: 44 | logging.debug("WeiBoUser repeat: repeat_time=%d" % repeat_time) if repeat_time > 0 else 0 45 | html_all = spider.get_html_content(self.opener.open(user_url_base, timeout=5)) 46 | 47 | header_dict = {key: value for key, value in self.header_re.findall(html_all)} 48 | if ("uid" not in header_dict) or ("nick" not in header_dict): 49 | repeat_time += 1 50 | continue 51 | 52 | if ("onick" not in header_dict) or (header_dict["onick"] == header_dict["nick"]): 53 | repeat_time += 1 54 | continue 55 | 56 | for sc_string in self.html_re.findall(html_all): 57 | json_data = json.loads(sc_string) 58 | if json_data["domid"] == "Pl_Core_T8CustomTriColumn__3" and "html" in json_data: 59 | soup = bs4.BeautifulSoup(json_data["html"], "html.parser") 60 | a_soup_list = soup.find_all("a", class_="S_txt1") 61 | user_pages = [a_soup.get("href") for a_soup in a_soup_list] 62 | user_counts = [int(spider.get_string_num(a_soup.get_text())) for a_soup in a_soup_list] 63 | user_name = header_dict["onick"] 64 | break 65 | 66 | if user_name: 67 | break 68 | 69 | repeat_time += 1 70 | # return result 71 | logging.warning("WeiBoUser fetch_user_from_id: user_id=%s, user_name=%s" % (user_id, user_name)) 72 | return user_name, user_pages, user_counts 73 | 74 | def fetch_user_weibos(self, user_url, key_dict, file_out=sys.stdout, sleep_time=0): 75 | """ 76 | fetch user weibo, user_url like: http://weibo.com/p/1005051750270991/home?parameters 77 | :param key_dict: {"mod": "data", "is_all": 1} 78 | :param key_dict: {"stat_date": "201512", "is_all": 1} 79 | :param key_dict: { 80 | "is_ori": 1, "is_forward": 1, "is_text": 1, "is_pic": 1, "is_video": 1, "is_music": 1, "is_article": 1, 81 | "key_word": "a b", "start_time": "2016-06-01", "end_time": "2016-06-04", "is_search": 1, "is_searchadv": 1 82 | } 83 | """ 84 | self.re_login() if not self.user_uniqueid else 0 85 | 86 | self.fetch_queue.queue.clear() 87 | self.current_page = 1 88 | self.file_out = file_out 89 | 90 | # get the start url 91 | url_main, _ = spider.get_url_params(user_url, is_unique_values=True) 92 | self.fetch_queue.put((url_main+"?"+urllib.parse.urlencode(key_dict), "page_index", 0)) 93 | 94 | # get data from url 95 | while self.fetch_queue.qsize() > 0: 96 | time.sleep(random.randint(0, sleep_time)) if sleep_time > 0 else 0 97 | url, keys, repeat = self.fetch_queue.get() 98 | 99 | try: 100 | html_all = spider.get_html_content(self.opener.open(url, timeout=5)) 101 | main, querys = spider.get_url_params(url, is_unique_values=True) 102 | 103 | if keys == "page_index": 104 | logging.warning("WeiBoUser index: repeat=%d, page=%d, url=%s" % (repeat, self.current_page, url)) 105 | 106 | header_dict = {key: value for key, value in self.header_re.findall(html_all)} 107 | for sc_string in self.html_re.findall(html_all): 108 | json_data = json.loads(sc_string) 109 | if json_data.get("ns") == "pl.content.homeFeed.index" and \ 110 | json_data["domid"].startswith("Pl_Official_MyProfileFeed"): 111 | # get index data 112 | weibo_count, is_loading, next_page = 
self.parse_user_weibo_page(json_data["html"]) 113 | if is_loading: 114 | # pagebar 0 and 1 115 | post_dict = { 116 | "id": querys.get("id", header_dict["page_id"]), 117 | "domain": querys.get("domain", header_dict["domain"]), 118 | "domain_op": querys.get("domain_op", header_dict["domain"]), 119 | "pre_page": querys.get("page", 1), 120 | "page": querys.get("page", 1), 121 | "pagebar": 0, 122 | "feed_type": 0, 123 | "ajwvr": 6, 124 | "__rnd": int(time.time() * 1000) 125 | } 126 | post_dict.update(key_dict) 127 | self.fetch_queue.put((self.bar_url+urllib.parse.urlencode(post_dict), "page_bar", 0)) 128 | break 129 | 130 | elif keys == "page_bar": 131 | logging.warning("WeiBoUser bar=%s: page=%d url=%s" % (querys["pagebar"], self.current_page, url)) 132 | 133 | # get bar data 134 | weibo_count, is_loading, next_page = self.parse_user_weibo_page(json.loads(html_all)["data"]) 135 | if is_loading: 136 | querys["pagebar"] = 1 137 | self.fetch_queue.put((self.bar_url+urllib.parse.urlencode(querys), "page_bar", 0)) 138 | 139 | if next_page: 140 | self.current_page += 1 141 | _temp = next_page.get("href") 142 | self.fetch_queue.put((url_main+_temp[_temp.find("?"):], "page_index", 0)) 143 | 144 | except Exception as e: 145 | if repeat < self.max_repeat: 146 | self.fetch_queue.put((url, keys, repeat+1)) 147 | else: 148 | logging.error("WeiBoUser error: error=%s, url=%s" % (str(e), url)) 149 | return 150 | 151 | def parse_user_weibo_page(self, html): 152 | """ 153 | parse user weibo page, return weibo_count, is_loading, next_page_soup 154 | """ 155 | # check frequence 156 | if html.find("你搜的太频繁了") > 0: 157 | logging.warning("WeiBoUser frequence warning: re_login!") 158 | self.re_login() 159 | assert False 160 | 161 | soup = bs4.BeautifulSoup(html, "html.parser") 162 | weibo_count, is_loading, next_page_soup = 0, False, soup.find("a", class_="page next S_txt1 S_line1") 163 | 164 | # check weibo number 165 | count_soup = soup.find("em", class_="W_fb S_spetxt") 166 | if (not count_soup) or int(count_soup.get_text()) > 0: 167 | for weibo_soup in soup.find_all("div", class_=re.compile("WB_cardwrap"), mid=True): 168 | weibo_count += 1 169 | 170 | weibo_id = weibo_soup.get("mid") 171 | if weibo_id in self.saved_set: 172 | continue 173 | 174 | # user information -- user_name and user_href 175 | user_div = weibo_soup.find("div", class_="WB_info") 176 | user_name = user_div.find("a", usercard=True).get_text() 177 | assert user_name, "WeiBoUser error: user_name is null!" 
178 | 179 | # content information -- content_date 180 | date_div = weibo_soup.find("div", class_="WB_from S_txt2") 181 | content_date = datetime.datetime.fromtimestamp(int(date_div.find("a", date=True).get("date")) / 1000.0) 182 | 183 | # content information -- content and expand_users 184 | content_div = weibo_soup.find("div", class_="WB_text W_f14") 185 | content = spider.get_string_strip(content_div.get_text()) 186 | self.file_out.write("\t".join([user_name, str(content_date), content]) + "\n") 187 | 188 | # expand information 189 | expand_weibo = weibo_soup.find("div", class_="WB_feed_expand") 190 | if expand_weibo and (not expand_weibo.find("div", class_="WB_empty")): 191 | expand_user_div = expand_weibo.find("div", class_="WB_info") 192 | expand_user_name = expand_user_div.find("a", usercard=True).get_text().strip("@") 193 | 194 | # expand_date_div = expand_weibo.find("div", class_="WB_from S_txt2") 195 | expand_content = spider.get_string_strip(expand_weibo.find("div", class_="WB_text").get_text()) 196 | self.file_out.write("\t".join([user_name, "ex_c", expand_user_name, expand_content]) + "\n") 197 | 198 | self.saved_set.add(weibo_id) 199 | # if is_loading 200 | is_loading = True if html.rfind("正在加载中") > 0 else False 201 | 202 | logging.debug("WeiBoUser: weibo_count=%d, weibo_all=%d, is_loading=%s, next_page=%s" % 203 | (weibo_count, len(self.saved_set), str(is_loading), str(bool(next_page_soup)))) 204 | return weibo_count, is_loading, next_page_soup 205 | -------------------------------------------------------------------------------- /demos_weixin/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 获取微信公众号数据 5 | """ 6 | 7 | from .weixin_public import WeiXinPublic 8 | -------------------------------------------------------------------------------- /demos_weixin/weixin_public.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import sys 4 | import time 5 | import json 6 | import spider 7 | import logging 8 | import urllib.parse 9 | import urllib.request 10 | from queue import Queue 11 | from bs4 import BeautifulSoup 12 | 13 | 14 | class WeiXinPublic(object): 15 | """ 16 | class of WeiXinPublic 17 | """ 18 | 19 | def __init__(self, max_repeat=5): 20 | """ 21 | constructor 22 | """ 23 | self.base_url = "http://weixin.sogou.com/" 24 | self.base_url_gzhjs = "http://weixin.sogou.com/gzhjs?" 25 | self.base_url_weixin = "http://weixin.sogou.com/weixin?" 
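        # (added note) sogou weixin search endpoint; reset_this_class() appends the query
        # string built from fetch_type (1: search a public account, 2: search articles)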
26 | self.base_url_antispider = "http://weixin.sogou.com/antispider/" 27 | self.base_url_weixinqq = "http://mp.weixin.qq.com/" 28 | 29 | self.fetch_queue = Queue() # unfetched url queue (url, keys, repeat) 30 | self.saved_set = set() # saved url or other id 31 | self.current_page = 1 # current page which is fetching 32 | self.max_repeat = max_repeat # maxinum repeat time 33 | 34 | self.arts_key = None # key words for fetching articals 35 | self.user_id = None # user id, not the open_id; None if fetch_type is 2 36 | self.search_keys = None # search keys, (key, others) 37 | 38 | self.fetch_type = 1 # fetch type, 1: public_user, 2: public_artical 39 | self.fetch_tsn = 0 # fetch tsn, 0: all, 1: one day, 2: one week, 3: one month 40 | 41 | self.cookie_jar, self.opener = spider.make_cookiejar_opener() 42 | self.opener.addheaders = spider.make_headers( 43 | user_agent="pc", 44 | host="weixin.sogou.com", 45 | accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 46 | accept_encoding="gzip, deflate", 47 | accept_language="zh-CN" 48 | ).items() 49 | 50 | # prepare to identify the captcha, and reset this class 51 | self.yundama = spider.YunDaMa("qixianhu", "mimaMIMA123456") 52 | self.file_out = None 53 | return 54 | 55 | def fetch_user(self, user_id, file_out=sys.stdout): 56 | """ 57 | fetch user 58 | """ 59 | self.file_out = file_out 60 | self.user_id = user_id 61 | self.search_keys = ("user_search", user_id) 62 | 63 | self.fetch_type = 1 64 | self.fetch_tsn = 0 65 | 66 | self.reset_this_class() 67 | self.work() 68 | return 69 | 70 | def fetch_arts(self, arts_key, fetch_tsn=0, file_out=sys.stdout): 71 | """ 72 | fetch articles 73 | """ 74 | self.file_out = file_out 75 | self.arts_key = arts_key 76 | self.search_keys = ("arts_search", arts_key) 77 | 78 | self.fetch_type = 2 79 | self.fetch_tsn = fetch_tsn 80 | 81 | self.reset_this_class() 82 | self.work() 83 | return 84 | 85 | def reset_this_class(self): 86 | """ 87 | reset this class 88 | """ 89 | post_dict = { 90 | "type": self.fetch_type, 91 | "query": self.arts_key if self.fetch_type == 2 else self.user_id, 92 | "ie": "utf-8", 93 | "_sug_": "n", 94 | "_sug_type_": "", 95 | "t": int(time.time() * 1000) 96 | } 97 | if self.fetch_type == 2: 98 | post_dict["tsn"] = self.fetch_tsn 99 | post_dict["page"] = self.current_page 100 | post_data = urllib.parse.urlencode(post_dict) 101 | 102 | self.fetch_queue.queue.clear() 103 | self.fetch_queue.put(item=(self.base_url_weixin+post_data, self.search_keys, 0)) 104 | logging.debug("WeiXinPublic reset_this_class success: current_page=%d" % self.current_page) 105 | return 106 | 107 | def work(self): 108 | """ 109 | process of fetching and parsing 110 | """ 111 | while self.fetch_queue.qsize() > 0: 112 | url, keys, repeat = self.fetch_queue.get() 113 | logging.debug("WeiXinPublic work: keys=%s, repeat=%d, url=%s" % (str(keys), repeat, url)) 114 | try: 115 | response = self.opener.open(url, timeout=5) 116 | if keys[0] == "user_search": 117 | self.parse_user_search(url, keys, response) 118 | 119 | if keys[0] == "user_arts": 120 | self.parse_user_arts(url, keys, response) 121 | 122 | if keys[0] == "arts_search": 123 | self.parse_arts_search(url, keys, response) 124 | 125 | if keys[0] == "get_art": 126 | self.parse_get_art(url, keys, response) 127 | except Exception as excep: 128 | if repeat < self.max_repeat: 129 | self.fetch_queue.put(item=(url, keys, repeat+1)) 130 | else: 131 | logging.error("WeiXinPublic work: error=%s, url=%s" % (str(excep), url)) 132 | return 133 | 134 | def 
parse_user_search(self, url, keys, response): 135 | """ 136 | parser, keys: ("user_search", user_id) 137 | """ 138 | soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser") 139 | if not self.check_anti_by_captcha(soup): 140 | self.reset_this_class() 141 | return 142 | 143 | user_name = "" 144 | for user_item in soup.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item"): 145 | if user_item.find("label", attrs={"name": "em_weixinhao"}).get_text() == self.user_id: 146 | user_name = user_item.find("div", class_="txt-box").find("h3").get_text() 147 | self.fetch_queue.put(item=(user_item.get("href"), ("user_arts", self.user_id, user_name), 0)) 148 | logging.debug("WeiXinPublic parse_user_search: user_name=%s" % user_name) 149 | return 150 | 151 | def parse_user_arts(self, url, keys, response): 152 | """ 153 | parser, keys: ("user_arts", user_id, user_name) 154 | """ 155 | html = spider.get_html_content(response, charset="utf-8") 156 | json_data = spider.get_json_data(html, "msgList = '(?P\{[\w\W]+?\})'") 157 | if json_data: 158 | for item in json_data.get("list", []): 159 | item_url = spider.get_url_legal(item["app_msg_ext_info"]["content_url"][1:], self.base_url_weixinqq).replace("&", "&") 160 | self.fetch_queue.put(item=(item_url, ("get_art", None, keys[1], keys[2]), 0)) 161 | for subitem in item["app_msg_ext_info"]["multi_app_msg_item_list"]: 162 | subitem_url = spider.get_url_legal(subitem["content_url"][1:], self.base_url_weixinqq).replace("&", "&") 163 | self.fetch_queue.put(item=(subitem_url, ("get_art", None, keys[1], keys[2]), 0)) 164 | logging.debug("WeiXinPublic parse_user_arts: len(fetch_queue)=%d" % self.fetch_queue.qsize()) 165 | return 166 | 167 | def parse_arts_search(self, url, keys, response): 168 | """ 169 | parser, keys: ("arts_search", arts_key) 170 | """ 171 | _, querys = spider.get_url_params(url) 172 | self.current_page = int(querys["page"][0]) if "page" in querys else self.current_page 173 | logging.debug("WeiXinPublic parse_arts_search: update current page, current_page=%d" % self.current_page) 174 | 175 | soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser") 176 | if not self.check_anti_by_captcha(soup): 177 | self.reset_this_class() 178 | return 179 | 180 | # current page 181 | for art_soup in soup.find_all("div", class_="txt-box"): 182 | art_url = spider.get_url_legal(art_soup.find("a").get("href"), base_url=url) 183 | user_openid = art_soup.find("a", id="weixin_account").get("i") 184 | user_name = art_soup.find("a", id="weixin_account").get("title") 185 | self.fetch_queue.put(item=(art_url, ("get_art", keys[1], user_openid, user_name), 0)) 186 | 187 | # next page 188 | next_page = soup.find("a", id="sogou_next") 189 | if next_page: 190 | next_page_url = spider.get_url_legal(next_page.get("href"), base_url=url) 191 | self.fetch_queue.put(item=(next_page_url, keys, 0)) 192 | return 193 | 194 | def parse_get_art(self, url, keys, response): 195 | """ 196 | parser, keys: ("get_art", None or arts_key, user_id or user_openid, user_name) 197 | """ 198 | soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser") 199 | 200 | _, querys = spider.get_url_params(url) 201 | s_title = spider.get_string_strip(soup.title.string) 202 | s_date = soup.find("em", id="post-date").get_text() 203 | self.file_out.write("\t".join([s_title, s_date, str(keys[1:])]) + "\n") 204 | 205 | self.saved_set.add(keys[2] + s_date + s_title) 206 | logging.debug("WeiXinPublic parse_get_art: len(saved_set)=%d" % 
len(self.saved_set)) 207 | return 208 | 209 | def check_anti_by_captcha(self, soup): 210 | """ 211 | check anti-spider by captcha 212 | :return: 1 (captcha absent or solved, can continue), 0 (request should be repeated) 213 | """ 214 | if not soup.find("img", id="seccodeImage"): 215 | return 1 216 | 217 | while 1: 218 | cid, code = None, None 219 | while not code: 220 | captcha_url = soup.find("img", id="seccodeImage").get("src") 221 | response = self.opener.open(spider.get_url_legal(captcha_url, self.base_url_antispider)) 222 | cid, code = self.yundama.get_captcha("captcha.jpeg", response.read(), "image/jpeg", codetype="1006") 223 | 224 | post_data = urllib.parse.urlencode({ 225 | "c": code, 226 | "r": soup.find("input", id="from").get("value"), 227 | "v": 5 228 | }).encode() 229 | response = self.opener.open("http://weixin.sogou.com/antispider/thank.php", data=post_data) 230 | 231 | json_data = json.loads(spider.get_html_content(response, charset="utf-8")) 232 | if json_data["msg"].find("解封成功") >= 0: 233 | snuid = json_data["id"] 234 | self.cookie_jar.set_cookie(spider.make_cookie(name="SNUID", value=snuid, domain="weixin.sogou.com")) 235 | 236 | post_dict = { 237 | "uigs_productid": "webapp", 238 | "type": "antispider", 239 | "subtype": "", 240 | "domain": "weixin", 241 | "suv": "", 242 | "snuid": snuid, 243 | "t": int(time.time() * 1000) 244 | } 245 | for cookie in self.cookie_jar: 246 | if cookie.name == "SUV": 247 | post_dict["suv"] = cookie.value 248 | 249 | post_dict["subtype"] = "0_seccodeInputSuccess" 250 | post_dict["t"] = int(time.time() * 1000) 251 | self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict)) 252 | 253 | post_dict["subtype"] = "close_refresh" 254 | post_dict["t"] = int(time.time() * 1000) 255 | self.opener.open("http://pb.sogou.com/pv.gif?"
+ urllib.parse.urlencode(post_dict)) 256 | break 257 | else: 258 | self.yundama.report(cid=cid) if cid else 0 259 | logging.warning("WeiXinPublic check_anti_by_captcha: anti-spider success!") 260 | return 0 261 | 262 | 263 | if __name__ == '__main__': 264 | logging.basicConfig(level=logging.DEBUG, format="%(asctime)s\t%(levelname)s\t%(message)s") 265 | 266 | weixin = WeiXinPublic() 267 | # weixin.fetch_user(user_id="diyCRT") 268 | # weixin.fetch_arts("北京国安", fetch_tsn=0) 269 | exit() 270 | -------------------------------------------------------------------------------- /demos_yundama/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 封装关于YunDaMa的接口,识别验证码 5 | """ 6 | 7 | from .yundama import YunDaMa 8 | -------------------------------------------------------------------------------- /demos_yundama/yundama.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | yundama.py by xianhu 5 | """ 6 | 7 | import time 8 | import spider 9 | import logging 10 | import requests 11 | 12 | 13 | class YunDaMa(object): 14 | """ 15 | class of YunDaMa, to identify captcha by yundama.com 16 | """ 17 | 18 | def __init__(self, user_name, pass_word, appid=None, appkey=None): 19 | """ 20 | constructor 21 | """ 22 | self.base_url = "http://api.yundama.com/api.php" 23 | self.base_headers = { 24 | "User-Agent": spider.make_random_useragent("pc"), 25 | "Host": "api.yundama.com", 26 | "Referer": "http://www.yundama.com/download/YDMHttp.html", 27 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 28 | "Accept-Language": "zh-CN,zh;q=0.8", 29 | "Origin": "http://www.yundama.com", 30 | } 31 | 32 | self.user_name = user_name 33 | self.pass_word = pass_word 34 | 35 | self.appid = "1" if not appid else appid 36 | self.appkey = "22cc5376925e9387a23cf797cb9ba745" if not appkey else appkey 37 | return 38 | 39 | def get_captcha(self, file_name, file_bytes, file_type="image/jpeg", codetype="1000", repeat=10): 40 | """ 41 | get captcha result(cid, code), based on file_name, file_bytes, file_type 42 | :key: http://www.yundama.com/apidoc/YDM_ErrorCode.html 43 | :param codetype: http://www.yundama.com/price.html 44 | """ 45 | cid = self.upload(file_name, file_bytes, file_type, codetype) 46 | if not cid: 47 | return None, None 48 | while repeat > 0: 49 | code = self.result(cid) 50 | if code: 51 | return cid, code 52 | repeat -= 1 53 | time.sleep(2) 54 | return cid, None 55 | 56 | def upload(self, file_name, file_bytes, file_type, codetype): 57 | """ 58 | upload image file, return cid or None 59 | """ 60 | post_data = { 61 | "username": self.user_name, 62 | "password": self.pass_word, 63 | "codetype": codetype, 64 | "appid": self.appid, 65 | "appkey": self.appkey, 66 | "timeout": 60, 67 | "method": "upload", 68 | } 69 | files = {"file": (file_name, file_bytes, file_type)} 70 | try: 71 | response = requests.post(self.base_url, data=post_data, headers=self.base_headers, files=files) 72 | json_data = response.json() 73 | except Exception as excep: 74 | json_data = {"ret": -1, "errMsg": excep} 75 | logging.warning("YunDaMa upload %s: %s", "succeed" if json_data["ret"] == 0 else "failed", json_data) 76 | return json_data.get("cid", "") 77 | 78 | def result(self, cid): 79 | """ 80 | get result from cid, return code or None 81 | """ 82 | try: 83 | response = requests.get(self.base_url+("?cid=%d&method=result" % cid), headers=self.base_headers) 84 | 
json_data = response.json() 85 | except Exception as excep: 86 | json_data = {"ret": -1, "errMsg": excep} 87 | logging.warning("YunDaMa result %s: %s", "succeed" if json_data["ret"] == 0 else "failed", json_data) 88 | return json_data.get("text", "") 89 | 90 | 91 | if __name__ == '__main__': 92 | ydm = YunDaMa("username", "password") 93 | cid_t, code_t = ydm.get_captcha("captcha.jpeg", requests.get("http://www.yundama.com/index/captcha").content) 94 | print(cid_t, code_t) 95 | if cid_t and (not code_t): 96 | ydm.result(cid_t) 97 | -------------------------------------------------------------------------------- /demos_zhihu/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 获取知乎数据 5 | """ 6 | -------------------------------------------------------------------------------- /demos_zhihu/zhihu_login.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | define login_zhihu to login zhihu.com, just as a demonstration 5 | """ 6 | 7 | import re 8 | import time 9 | import json 10 | import logging 11 | import urllib.parse 12 | import urllib.request 13 | import http.cookiejar 14 | 15 | 16 | def login_zhihu(user_name, pass_word): 17 | """ 18 | login zhihu.com, just as a demonstration 19 | """ 20 | cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()) 21 | urllib.request.install_opener(urllib.request.build_opener(cookie_handler)) 22 | 23 | # get _xsrf 24 | response = urllib.request.urlopen("http://www.zhihu.com") 25 | data = response.read().decode("utf-8") 26 | _xsrf = re.search("name=\"_xsrf\" value=\"(?P<value>.*)\"", data).group("value") 27 | 28 | # get captcha 29 | response = urllib.request.urlopen('http://www.zhihu.com/captcha.gif?r=%d&type=login' % int(time.time() * 1000)) 30 | with open('captcha.jpg', 'wb') as file_image: 31 | file_image.write(response.read()) 32 | captcha = input("input the captcha: ") 33 | 34 | # login 35 | url = "http://www.zhihu.com/login/email" 36 | headers = { 37 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:41.0) Gecko/20100101 Firefox/41.0", 38 | "Referer": "http://www.zhihu.com/" 39 | } 40 | post_data = urllib.parse.urlencode({ 41 | "_xsrf": _xsrf, 42 | "email": user_name, 43 | "password": pass_word, 44 | "captcha": captcha, 45 | "remember_me": "true" 46 | }).encode() 47 | response = urllib.request.urlopen(urllib.request.Request(url, data=post_data, headers=headers)) 48 | result = json.loads(response.read().decode("utf-8")) 49 | 50 | if result["r"] == 0: 51 | logging.warning("login zhihu success!") 52 | return True 53 | logging.error("login zhihu failed! %s" % str(result)) 54 | return False 55 | -------------------------------------------------------------------------------- /otherfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile by xianhu: build a docker image for spider or flask 2 | # usage: docker build -t user/centos:v06 .
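# one possible way to try the resulting image (the tag follows the build example above; adjust as needed):
# usage: docker run -it user/centos:v06 /bin/bash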
3 | 4 | FROM centos:6.8 5 | 6 | MAINTAINER xianhu 7 | 8 | # change system environments 9 | ENV LANG en_US.UTF-8 10 | ENV LC_ALL en_US.UTF-8 11 | 12 | # change system local time 13 | RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime 14 | 15 | # update yum and install something 16 | RUN yum update -y 17 | RUN yum install -y xz 18 | RUN yum install -y vim 19 | RUN yum install -y git 20 | RUN yum install -y wget 21 | RUN yum install -y crontabs 22 | RUN yum install -y gcc 23 | RUN yum install -y make 24 | RUN yum install -y zlib-devel 25 | RUN yum install -y openssl-devel 26 | RUN yum clean all 27 | 28 | # restart crontab service 29 | RUN service crond restart 30 | 31 | # download python3 32 | WORKDIR /root/ 33 | RUN wget https://www.python.org/ftp/python/3.5.3/Python-3.5.3.tar.xz 34 | RUN tar -xf Python-3.5.3.tar.xz 35 | 36 | # install python3 37 | WORKDIR /root/Python-3.5.3 38 | RUN ./configure 39 | RUN make install 40 | RUN make clean 41 | RUN make distclean 42 | 43 | # install libs of python3 44 | ADD ./Dockerfile_requirements.txt /root/ 45 | WORKDIR /root/ 46 | RUN pip3 install --upgrade pip 47 | RUN pip3 install -r Dockerfile_requirements.txt 48 | RUN rm -rf /root/* 49 | 50 | # change python to python3 51 | RUN ln -sf /usr/local/bin/python3 /usr/bin/python 52 | RUN ln -sf /usr/bin/python2.6 /usr/bin/python2 53 | 54 | # change /usr/bin/yum 55 | RUN sed -i 's/usr\/bin\/python/usr\/bin\/python2/g' /usr/bin/yum 56 | 57 | # cmd command 58 | CMD /bin/bash 59 | -------------------------------------------------------------------------------- /otherfiles/pylint.conf: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=CVS 13 | 14 | # Add files or directories matching the regex patterns to the blacklist. The 15 | # regex matches against base names, not paths. 16 | ignore-patterns= 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=yes 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | # Use multiple processes to speed up Pylint. 26 | jobs=1 27 | 28 | # Allow loading of arbitrary C extensions. Extensions are imported into the 29 | # active Python interpreter and may run arbitrary code. 30 | unsafe-load-any-extension=no 31 | 32 | # A comma-separated list of package or module names from where C extensions may 33 | # be loaded. Extensions are loading into the active Python interpreter and may 34 | # run arbitrary code 35 | extension-pkg-whitelist= 36 | 37 | # Allow optimization of some AST trees. This will activate a peephole AST 38 | # optimizer, which will apply various small optimizations. For instance, it can 39 | # be used to obtain the result of joining multiple strings with the addition 40 | # operator. Joining a lot of strings can lead to a maximum recursion error in 41 | # Pylint and this flag can prevent that. It has one side effect, the resulting 42 | # AST will be different than the one from reality. This option is deprecated 43 | # and it will be removed in Pylint 2.0. 44 | optimize-ast=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. 
Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 51 | confidence= 52 | 53 | # Enable the message, report, category or checker with the given id(s). You can 54 | # either give multiple identifier separated by comma (,) or put this option 55 | # multiple time (only on the command line, not in the configuration file where 56 | # it should appear only once). See also the "--disable" option for examples. 57 | #enable= 58 | 59 | # Disable the message, report, category or checker with the given id(s). You 60 | # can either give multiple identifiers separated by comma (,) or put this 61 | # option multiple times (only on the command line, not in the configuration 62 | # file where it should appear only once).You can also use "--disable=all" to 63 | # disable everything first and then reenable specific checks. For example, if 64 | # you want to run only the similarities checker, you can use "--disable=all 65 | # --enable=similarities". If you want to run only the classes checker, but have 66 | # no Warning level messages displayed, use"--disable=all --enable=classes 67 | # --disable=W" 68 | disable=backtick,basestring-builtin,zip-builtin-not-iterating,old-ne-operator,dict-view-method,input-builtin,unichr-builtin,raw_input-builtin,xrange-builtin,parameter-unpacking,unicode-builtin,reduce-builtin,old-raise-syntax,raising-string,print-statement,delslice-method,next-method-called,dict-iter-method,standarderror-builtin,buffer-builtin,intern-builtin,long-builtin,nonzero-method,hex-method,oct-method,range-builtin-not-iterating,coerce-builtin,useless-suppression,setslice-method,indexing-exception,execfile-builtin,getslice-method,import-star-module-level,metaclass-assignment,map-builtin-not-iterating,unpacking-in-except,using-cmp-argument,long-suffix,round-builtin,old-octal-literal,file-builtin,apply-builtin,reload-builtin,old-division,filter-builtin-not-iterating,no-absolute-import,cmp-method,suppressed-message,coerce-method,cmp-builtin 69 | 70 | 71 | [REPORTS] 72 | 73 | # Set the output format. Available formats are text, parseable, colorized, msvs 74 | # (visual studio) and html. You can also give a reporter class, eg 75 | # mypackage.mymodule.MyReporterClass. 76 | output-format=text 77 | 78 | # Put messages in a separate file for each module / package specified on the 79 | # command line instead of printing them on stdout. Reports (if any) will be 80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated 81 | # and it will be removed in Pylint 2.0. 82 | files-output=no 83 | 84 | # Tells whether to display a full report or only the messages 85 | reports=yes 86 | 87 | # Python expression which should return a note less than 10 (10 is the highest 88 | # note). You have access to the variables errors warning, statement which 89 | # respectively contain the number of errors / warnings messages and the total 90 | # number of statements analyzed. This is used by the global evaluation report 91 | # (RP0004). 92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 93 | 94 | # Template used to display messages. This is a python new-style format string 95 | # used to format the message information. 
See doc for all details 96 | #msg-template= 97 | 98 | 99 | [BASIC] 100 | 101 | # Good variable names which should always be accepted, separated by a comma 102 | good-names=i,j,k,ex,Run,_ 103 | 104 | # Bad variable names which should always be refused, separated by a comma 105 | bad-names=foo,bar,baz,toto,tutu,tata 106 | 107 | # Colon-delimited sets of names that determine each other's naming style when 108 | # the name regexes allow several styles. 109 | name-group= 110 | 111 | # Include a hint for the correct naming format with invalid-name 112 | include-naming-hint=no 113 | 114 | # List of decorators that produce properties, such as abc.abstractproperty. Add 115 | # to this list to register other decorators that produce valid properties. 116 | property-classes=abc.abstractproperty 117 | 118 | # Regular expression matching correct module names 119 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 120 | 121 | # Naming hint for module names 122 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 123 | 124 | # Regular expression matching correct function names 125 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 126 | 127 | # Naming hint for function names 128 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 129 | 130 | # Regular expression matching correct attribute names 131 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 132 | 133 | # Naming hint for attribute names 134 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 135 | 136 | # Regular expression matching correct constant names 137 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 138 | 139 | # Naming hint for constant names 140 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 141 | 142 | # Regular expression matching correct inline iteration names 143 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 144 | 145 | # Naming hint for inline iteration names 146 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 147 | 148 | # Regular expression matching correct argument names 149 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 150 | 151 | # Naming hint for argument names 152 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 153 | 154 | # Regular expression matching correct class names 155 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 156 | 157 | # Naming hint for class names 158 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 159 | 160 | # Regular expression matching correct class attribute names 161 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 162 | 163 | # Naming hint for class attribute names 164 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 165 | 166 | # Regular expression matching correct variable names 167 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 168 | 169 | # Naming hint for variable names 170 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 171 | 172 | # Regular expression matching correct method names 173 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 174 | 175 | # Naming hint for method names 176 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 177 | 178 | # Regular expression which should only match function or class names that do 179 | # not require a docstring. 180 | no-docstring-rgx=^_ 181 | 182 | # Minimum line length for functions/classes that require docstrings, shorter 183 | # ones are exempt. 184 | docstring-min-length=-1 185 | 186 | 187 | [ELIF] 188 | 189 | # Maximum number of nested blocks for function / method body 190 | max-nested-blocks=5 191 | 192 | 193 | [FORMAT] 194 | 195 | # Maximum number of characters on a single line. 196 | max-line-length=200 197 | 198 | # Regexp for a line that is allowed to be longer than the limit. 
199 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 200 | 201 | # Allow the body of an if to be on the same line as the test if there is no 202 | # else. 203 | single-line-if-stmt=no 204 | 205 | # List of optional constructs for which whitespace checking is disabled. `dict- 206 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 207 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 208 | # `empty-line` allows space-only lines. 209 | no-space-check=trailing-comma,dict-separator 210 | 211 | # Maximum number of lines in a module 212 | max-module-lines=1000 213 | 214 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 215 | # tab). 216 | indent-string=' ' 217 | 218 | # Number of spaces of indent required inside a hanging or continued line. 219 | indent-after-paren=4 220 | 221 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 222 | expected-line-ending-format= 223 | 224 | 225 | [LOGGING] 226 | 227 | # Logging modules to check that the string format arguments are in logging 228 | # function parameter format 229 | logging-modules=logging 230 | 231 | 232 | [MISCELLANEOUS] 233 | 234 | # List of note tags to take in consideration, separated by a comma. 235 | notes=FIXME,XXX,TODO 236 | 237 | 238 | [SIMILARITIES] 239 | 240 | # Minimum lines number of a similarity. 241 | min-similarity-lines=4 242 | 243 | # Ignore comments when computing similarities. 244 | ignore-comments=yes 245 | 246 | # Ignore docstrings when computing similarities. 247 | ignore-docstrings=yes 248 | 249 | # Ignore imports when computing similarities. 250 | ignore-imports=no 251 | 252 | 253 | [SPELLING] 254 | 255 | # Spelling dictionary name. Available dictionaries: none. To make it working 256 | # install python-enchant package. 257 | spelling-dict= 258 | 259 | # List of comma separated words that should not be checked. 260 | spelling-ignore-words= 261 | 262 | # A path to a file that contains private dictionary; one word per line. 263 | spelling-private-dict-file= 264 | 265 | # Tells whether to store unknown words to indicated private dictionary in 266 | # --spelling-private-dict-file option instead of raising a message. 267 | spelling-store-unknown-words=no 268 | 269 | 270 | [TYPECHECK] 271 | 272 | # Tells whether missing members accessed in mixin class should be ignored. A 273 | # mixin class is detected if its name ends with "mixin" (case insensitive). 274 | ignore-mixin-members=yes 275 | 276 | # List of module names for which member attributes should not be checked 277 | # (useful for modules/projects where namespaces are manipulated during runtime 278 | # and thus existing member attributes cannot be deduced by static analysis. It 279 | # supports qualified module names, as well as Unix pattern matching. 280 | ignored-modules=Levenshtein 281 | 282 | # List of class names for which member attributes should not be checked (useful 283 | # for classes with dynamically set attributes). This supports the use of 284 | # qualified names. 285 | ignored-classes=optparse.Values,thread._local,_thread._local 286 | 287 | # List of members which are set dynamically and missed by pylint inference 288 | # system, and so shouldn't trigger E1101 when accessed. Python regular 289 | # expressions are accepted. 290 | generated-members= 291 | 292 | # List of decorators that produce context managers, such as 293 | # contextlib.contextmanager. Add to this list to register other decorators that 294 | # produce valid context managers.
295 | contextmanager-decorators=contextlib.contextmanager 296 | 297 | 298 | [VARIABLES] 299 | 300 | # Tells whether we should check for unused import in __init__ files. 301 | init-import=no 302 | 303 | # A regular expression matching the name of dummy variables (i.e. expectedly 304 | # not used). 305 | dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy 306 | 307 | # List of additional names supposed to be defined in builtins. Remember that 308 | # you should avoid to define new builtins when possible. 309 | additional-builtins= 310 | 311 | # List of strings which can identify a callback function by name. A callback 312 | # name must start or end with one of those strings. 313 | callbacks=cb_,_cb 314 | 315 | # List of qualified module names which can have objects that can redefine 316 | # builtins. 317 | redefining-builtins-modules=six.moves,future.builtins 318 | 319 | 320 | [CLASSES] 321 | 322 | # List of method names used to declare (i.e. assign) instance attributes. 323 | defining-attr-methods=__init__,__new__,setUp 324 | 325 | # List of valid names for the first argument in a class method. 326 | valid-classmethod-first-arg=cls 327 | 328 | # List of valid names for the first argument in a metaclass class method. 329 | valid-metaclass-classmethod-first-arg=mcs 330 | 331 | # List of member names, which should be excluded from the protected access 332 | # warning. 333 | exclude-protected=_asdict,_fields,_replace,_source,_make 334 | 335 | 336 | [DESIGN] 337 | 338 | # Maximum number of arguments for function / method 339 | max-args=10 340 | 341 | # Argument names that match this expression will be ignored. Default to name 342 | # with leading underscore 343 | ignored-argument-names=_.* 344 | 345 | # Maximum number of locals for function / method body 346 | max-locals=30 347 | 348 | # Maximum number of return / yield for function / method body 349 | max-returns=10 350 | 351 | # Maximum number of branch for function / method body 352 | max-branches=30 353 | 354 | # Maximum number of statements in function / method body 355 | max-statements=100 356 | 357 | # Maximum number of parents for a class (see R0901). 358 | max-parents=10 359 | 360 | # Maximum number of attributes for a class (see R0902). 361 | max-attributes=50 362 | 363 | # Minimum number of public methods for a class (see R0903). 364 | min-public-methods=1 365 | 366 | # Maximum number of public methods for a class (see R0904). 367 | max-public-methods=30 368 | 369 | # Maximum number of boolean expressions in a if statement 370 | max-bool-expr=10 371 | 372 | 373 | [IMPORTS] 374 | 375 | # Deprecated modules which should not be used, separated by a comma 376 | deprecated-modules=optparse 377 | 378 | # Create a graph of every (i.e. internal and external) dependencies in the 379 | # given file (report RP0402 must not be disabled) 380 | import-graph= 381 | 382 | # Create a graph of external dependencies in the given file (report RP0402 must 383 | # not be disabled) 384 | ext-import-graph= 385 | 386 | # Create a graph of internal dependencies in the given file (report RP0402 must 387 | # not be disabled) 388 | int-import-graph= 389 | 390 | # Force import order to recognize a module as part of the standard 391 | # compatibility libraries. 392 | known-standard-library= 393 | 394 | # Force import order to recognize a module as part of a third party library. 395 | known-third-party=enchant 396 | 397 | # Analyse import fallback blocks. 
This can be used to support both Python 2 and 398 | # 3 compatible code, which means that the block might have code that exists 399 | # only in one or another interpreter, leading to false positives when analysed. 400 | analyse-fallback-blocks=no 401 | 402 | 403 | [EXCEPTIONS] 404 | 405 | # Exceptions that will emit a warning when being caught. Defaults to 406 | # "Exception" 407 | overgeneral-exceptions= 408 | -------------------------------------------------------------------------------- /test_demos.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | test_demos.py by xianhu 5 | """ 6 | 7 | import re 8 | import spider 9 | import pymysql 10 | import logging 11 | import requests 12 | from bs4 import BeautifulSoup 13 | from demos_doubanmovies import MovieFetcher, MovieParser 14 | from demos_dangdang import BookFetcher, BookParser, BookSaver 15 | 16 | 17 | def get_douban_movies(): 18 | """ 19 | 测试豆瓣电影爬虫 20 | """ 21 | headers = { 22 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36", 23 | "Host": "movie.douban.com", 24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 25 | "Accept-Encoding": "gzip, deflate, sdch, br", 26 | "Accept-Language": "zh-CN, zh; q=0.8, en; q=0.6", 27 | "Cache-Control": "max-age=0", 28 | "Connection": "keep-alive", 29 | "Upgrade-Insecure-Requests": "1", 30 | "Cookie": "bid=Pd48iLTpsf8" 31 | } 32 | 33 | # 获取初始url 34 | all_urls = set() 35 | 36 | resp = requests.get("https://movie.douban.com/tag/", headers=headers, verify=False) 37 | assert resp.status_code == 200, resp.status_code 38 | 39 | soup = BeautifulSoup(resp.text, "html5lib") 40 | a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) 41 | all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) 42 | 43 | resp = requests.get("https://movie.douban.com/tag/?view=cloud", headers=headers, verify=False) 44 | assert resp.status_code == 200, resp.status_code 45 | 46 | soup = BeautifulSoup(resp.text, "html5lib") 47 | a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) 48 | all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) 49 | logging.warning("all urls: %s", len(all_urls)) 50 | 51 | # 构造爬虫 52 | dou_spider = spider.WebSpider(MovieFetcher(), MovieParser(max_deep=-1), spider.Saver(), spider.UrlFilter()) 53 | for tag, url in all_urls: 54 | dou_spider.set_start_url(url, ("index", tag), priority=1) 55 | dou_spider.start_work_and_wait_done(fetcher_num=20) 56 | return 57 | 58 | 59 | def get_dangdang_books(): 60 | """ 61 | 测试当当网爬虫 62 | """ 63 | fetcher_number = 10 64 | fetcher_list = [] 65 | for i in range(fetcher_number): 66 | fetcher_list.append(BookFetcher()) 67 | parser = BookParser() 68 | saver = BookSaver() 69 | dang_spider = spider.WebSpider(fetcher_list, parser, saver, None) 70 | 71 | # 获取所有链接并存入数据库,由于时间太长,因此抓取链接和信息分开进行 72 | url_prefix_list = ["http://category.dangdang.com/pg{}-cp01.41.43.05.00.00.html", "http://category.dangdang.com/pg{}-cp01.41.59.00.00.00.html"] 73 | 74 | for url_prefix in url_prefix_list: 75 | for i in range(100): 76 | url = url_prefix.format(i) 77 | dang_spider.set_start_url(url, ("lists",), priority=1) 78 | dang_spider.start_work_and_wait_done(fetcher_num=fetcher_number) 79 | 80 | # 开始抓取所有的详细信息 81 | dang_spider = spider.WebSpider(fetcher_list, 
parser, saver, None) 82 | conn = pymysql.connect(host="localhost", user="username", password="password", db="dangdang_book", charset="utf8") 83 | cursor = conn.cursor() 84 | conn.autocommit(1) 85 | cursor.execute("select url from book_urls;") 86 | url_list = [item[0] for item in cursor.fetchall()] 87 | 88 | for url in url_list: 89 | dang_spider.set_start_url(url, ("detail",), priority=1) 90 | 91 | dang_spider.start_work_and_wait_done(fetcher_num=fetcher_number) 92 | for f_er in fetcher_list: 93 | f_er.driver_quit() 94 | return 95 | 96 | 97 | def get_car_price(): 98 | """ 99 | 测试汽车价格爬虫 100 | """ 101 | return 102 | 103 | 104 | if __name__ == "__main__": 105 | logging.basicConfig(level=logging.WARNING, format="%(asctime)s\t%(levelname)s\t%(message)s") 106 | # get_douban_movies() 107 | # get_dangdang_books() 108 | get_car_price() 109 | --------------------------------------------------------------------------------
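Note: get_car_price() in test_demos.py is still an empty stub. A minimal sketch of how it might be wired up, mirroring get_douban_movies() and get_dangdang_books() above; CarFetcher, CarParser, CarSaver and the start URL are hypothetical placeholders (demos_carprice does not provide them yet), so treat this as an assumption-laden outline rather than working code.

# hypothetical sketch of get_car_price(), assuming demos_carprice supplies classes analogous to the dangdang demo
def get_car_price_sketch():
    """
    sketch: fetch car prices from car-price sites (抓取不同汽车网站的汽车价格)
    """
    # CarFetcher / CarParser / CarSaver are assumed to follow the same interfaces as BookFetcher / BookParser / BookSaver
    car_spider = spider.WebSpider(CarFetcher(), CarParser(max_deep=1), CarSaver(), spider.UrlFilter())
    # placeholder start url: replace with a real car-price list page
    car_spider.set_start_url("http://example.com/car_price/index.html", ("index",), priority=1)
    car_spider.start_work_and_wait_done(fetcher_num=10)
    return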