├── README.md
├── demos_carprice
│   └── __init__.py
├── demos_dangdang
│   ├── README.md
│   ├── __init__.py
│   ├── book_fetcher.py
│   ├── book_parser.py
│   └── book_saver.py
├── demos_doubanmovies
│   ├── __init__.py
│   ├── movie_fetcher.py
│   └── movie_parser.py
├── demos_nbastats
│   ├── __init__.py
│   └── nba_main.py
├── demos_taobao
│   ├── chromedriver72
│   └── taobao.py
├── demos_weibo
│   ├── __init__.py
│   ├── weibo_login.py
│   ├── weibo_search.py
│   └── weibo_user.py
├── demos_weixin
│   ├── __init__.py
│   └── weixin_public.py
├── demos_yundama
│   ├── __init__.py
│   └── yundama.py
├── demos_zhihu
│   ├── __init__.py
│   └── zhihu_login.py
├── otherfiles
│   ├── Dockerfile
│   └── pylint.conf
└── test_demos.py

/README.md:
--------------------------------------------------------------------------------
1 | # PSpiderDemos
2 | demos based on PSpider
3 | 
4 | Since PSpider is updated constantly, these demos are not guaranteed to keep working!
5 | 
--------------------------------------------------------------------------------
/demos_carprice/__init__.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | 
3 | """
4 | Crawl car prices from different car websites
5 | """
6 | 
--------------------------------------------------------------------------------
/demos_dangdang/README.md:
--------------------------------------------------------------------------------
1 | ## Crawling Dangdang children's book data
2 | 
3 | ### Crawl all book links
4 | 
5 | Crawl with `key = "lists"`; the crawled links are stored directly in the database.
6 | 
7 | ### Crawl the detailed information of each book page
8 | 
9 | - Crawl with `key = "detail"`.
10 | - On a Dangdang book page, sections such as the content recommendation, media reviews and author introduction are only rendered by `javascript` once they scroll into view, so when crawling with `selenium + PhantomJS` the window has to be made large and the page given time to finish loading.
11 | - A `selenium` `driver` is single-threaded, and opening and closing a `driver` for every fetch is too expensive, so I modified the framework to accept a `list` of `fetcher` instances when the `spider` is initialized.
12 | 
13 | ### Files
14 | 
15 | - `demos_dangdang`: holds the three classes `fetcher`, `parser` and `saver`
16 | - `dangdang_book.py`: fetches the links and then the detail pages in two steps (merged into test_demos.py)
17 | 
--------------------------------------------------------------------------------
/demos_dangdang/__init__.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | 
3 | """
4 | Crawl all book titles from Dangdang's children's section
5 | """
6 | 
7 | from .book_fetcher import BookFetcher
8 | from .book_parser import BookParser
9 | from .book_saver import BookSaver
10 | 
--------------------------------------------------------------------------------
/demos_dangdang/book_fetcher.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import logging 5 | import requests.adapters 6 | from selenium import webdriver 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | from selenium.webdriver.common.by import By 10 | import sys 11 | requests.packages.urllib3.disable_warnings() 12 | 13 | 14 | class BookFetcher(spider.Fetcher): 15 | 16 | def __init__(self): 17 | spider.Fetcher.__init__(self, normal_max_repeat=3, normal_sleep_time=0, critical_max_repeat=3, critical_sleep_time=0) 18 | self.driver = webdriver.PhantomJS(service_args=['--load-images=no']) 19 | self.driver.set_window_size(1120, 2000) 20 | return 21 | 22 | def clear_session(self): 23 | self.driver.delete_all_cookies() 24 | return 25 | 26 | def driver_quit(self): 27 | self.driver.quit() 28 | return 29 | 30 | def url_fetch(self, url, keys, critical, fetch_repeat): 31 | try: 32 | logging.warning("-------------------------------") 33 | if keys[0] == "detail": 34 | logging.warning("fetch %s", url) 35 | x_str = "//*[@id='detail'][contains(@isloaded, '1')]" 36 | self.driver.get(url) 37 | element_present
= EC.presence_of_element_located((By.XPATH, x_str)) 38 | WebDriverWait(self.driver, 60).until(element_present) 39 | except: 40 | logging.warning("Unexpected error: %s", sys.exc_info()[0]) 41 | #self.clear_session() 42 | return 0, "" 43 | response = self.driver.page_source 44 | if not response: 45 | logging.warning("not response %s", response) 46 | return 0, "" 47 | logging.warning("fetch done!") 48 | return 1, response 49 | -------------------------------------------------------------------------------- /demos_dangdang/book_parser.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import re 5 | import urllib 6 | import logging 7 | from bs4 import BeautifulSoup 8 | 9 | 10 | class BookParser(spider.Parser): 11 | 12 | def __init__(self): 13 | spider.Parser.__init__(self) 14 | self.contents_need = ['isbn', 'pic', 'title', 'con_reco', 'comment', 'brand', 'series', 'author', 'author_origin', 'author_country', 'translator', 'publicator', 'author_prize', 'book_prize', 'raw_title', 'age', 'responsibility', 'lan', 'words', 'size', 'binding', 'pub_date', 'pub_times', 'pages', 'price', 'editor_reco', 'media_reco', 'author_intro', 'review_num', 'dangdang_rank'] 15 | 16 | def clean_str(self, in_str): 17 | clean = re.compile('<.*?>') 18 | clean_text = re.sub(clean, '', in_str) 19 | return clean_text 20 | 21 | def getdetail_descripe(self, soup): 22 | p_info = soup.find(id="detail_describe") 23 | series = "" 24 | isbn = "" 25 | pub_times = "" 26 | pages = "" 27 | words = "" 28 | pub_date = "" 29 | size = "" 30 | binding = "" 31 | if p_info: 32 | li_list = p_info.findAll("li") 33 | if li_list: 34 | try: 35 | for item in li_list: 36 | item_str = str(item.get_text().strip()) 37 | if item_str.find("版 次") > -1: 38 | pub_times = item_str.replace("版 次:", "") 39 | continue 40 | if item_str.find("页 数") > -1: 41 | pages = item_str.replace("页 数:", "") 42 | continue 43 | if item_str.find("字 数") > -1: 44 | words = item_str.replace("字 数:", "") 45 | continue 46 | if item_str.find("印刷时间") > -1: 47 | pub_date = item_str.replace("印刷时间:", "") 48 | continue 49 | if item_str.find("开 本") > -1: 50 | size = item_str.replace("开 本:", "") 51 | continue 52 | if item_str.find("包 装") > -1: 53 | binding = item_str.replace("包 装:", "") 54 | continue 55 | if item_str.find("国际标准书号ISBN") > -1: 56 | isbn = item_str.replace("国际标准书号ISBN:", "") 57 | continue 58 | if item_str.find("丛书名") > -1: 59 | series = item_str.replace("丛书名:", "") 60 | # print [series, isbn, pub_times, pages, words, pub_date, size, binding] 61 | return [series, isbn, pub_times, pages, words, pub_date, size, binding] 62 | except: 63 | pass 64 | return ["", "", "", "", "", "", "", ""] 65 | return ["", "", "", "", "", "", "", ""] 66 | else: 67 | return ["", "", "", "", "", "", "", ""] 68 | 69 | def get_title(self, soup): 70 | name_tag = soup.find("div", class_="name_info") 71 | if name_tag: 72 | title_tag = name_tag.find("h1") 73 | if title_tag: 74 | try: 75 | title = title_tag.get_text().strip() 76 | return str(title) 77 | except: 78 | return "" 79 | else: 80 | return "" 81 | else: 82 | return "" 83 | 84 | def get_comment(self, soup): 85 | name_tag = soup.find("div", class_="name_info") 86 | if name_tag: 87 | content_tag = name_tag.find("h2") 88 | if content_tag: 89 | try: 90 | content = content_tag.get_text().strip() 91 | return str(content) 92 | except: 93 | return "" 94 | else: 95 | return "" 96 | else: 97 | return "" 98 | 99 | def get_content(self, soup): 100 | content_tag = 
soup.find(id="content") 101 | if content_tag: 102 | dsc_tag = content_tag.find(id="content-textarea") 103 | if not dsc_tag: 104 | dsc_tag = content_tag.find("div", class_="descrip") 105 | try: 106 | content = dsc_tag.get_text().strip() 107 | return self.clean_str(str(content)) 108 | except: 109 | return "" 110 | else: 111 | return "" 112 | 113 | def get_country(self, soup): 114 | name_tag = soup.find(id="author") 115 | name = "" 116 | if name_tag: 117 | name = str(name_tag.get_text()).strip().replace("作者:", "") 118 | pattern1 = re.compile(r'.*\【(.+?)\】.*') 119 | pattern2 = re.compile(r'.*\[(.+?)\].*') 120 | pattern3 = re.compile(r'.*\((.+?)\).*') 121 | pattern_list = [pattern1, pattern2, pattern3] 122 | for pattern in pattern_list: 123 | match = pattern.match(name) 124 | if match: 125 | return match.group(1) 126 | return "" 127 | 128 | def get_author_and_trans(self, soup): 129 | name_tag = soup.find(id="author") 130 | author = "" 131 | trans = "" 132 | previous = "" 133 | if name_tag: 134 | for item_l in name_tag.contents: 135 | if type(item_l).__name__ == "NavigableString": 136 | content = str(item_l.string).strip() 137 | if content.find("译") > -1: 138 | if content.find("编译") > -1: 139 | content.replace("编译", "") 140 | else: 141 | content.replace("译", "") 142 | trans = previous 143 | previous = "" 144 | else: 145 | if content.find("作者") > -1: 146 | previous = previous + content.replace("作者:", "") 147 | else: 148 | if content and content != ",": 149 | author = author + previous + content 150 | previous = "" 151 | else: 152 | previous = previous + " " + content 153 | else: 154 | content = str(item_l.get_text()).strip() 155 | previous = previous + content 156 | author = author + previous 157 | return [author, trans] 158 | else: 159 | return ["", ""] 160 | 161 | def get_publicator(self, soup): 162 | pub_tag = soup.find("span", {"dd_name": "出版社"}) 163 | if pub_tag: 164 | pub_a = pub_tag.find("a") 165 | if pub_a: 166 | return str(pub_a.get_text()).strip() 167 | return "" 168 | else: 169 | return "" 170 | 171 | def get_price(self, soup): 172 | o_price_tag = soup.find(id="original-price") 173 | if o_price_tag: 174 | return str(o_price_tag.get_text()).strip() 175 | price_tag = soup.find(id="dd-price") 176 | if price_tag: 177 | return str(price_tag.get_text()).strip() 178 | else: 179 | return "" 180 | 181 | def get_editor_reco(self, soup): 182 | abstract_tag = soup.find(id="abstract") 183 | editor_reco = "" 184 | if abstract_tag: 185 | reco_tag = abstract_tag.find(id="abstract-all") 186 | if reco_tag: 187 | editor_reco = str(reco_tag.get_text()).strip() 188 | if not editor_reco: 189 | dis_tag = abstract_tag.find("div", class_="descrip") 190 | if dis_tag: 191 | editor_reco = str(dis_tag.get_text()).strip() 192 | return editor_reco 193 | 194 | def get_media_reco(self, soup): 195 | content_tag = soup.find(id="mediaFeedback") 196 | if content_tag: 197 | media_tag = content_tag.find(id="mediaFeedback-textarea") 198 | if not media_tag: 199 | media_tag = content_tag.find("div", class_="descrip") 200 | try: 201 | content = media_tag.get_text().strip() 202 | return self.clean_str(str(content)) 203 | except: 204 | return "" 205 | else: 206 | return "" 207 | 208 | def get_author_intro(self, soup): 209 | content_tag = soup.find(id="authorIntroduction") 210 | if content_tag: 211 | media_tag = content_tag.find(id="authorIntroduction-textarea") 212 | if not media_tag: 213 | media_tag = content_tag.find("div", class_="descrip") 214 | try: 215 | content = media_tag.get_text().strip() 216 | return 
self.clean_str(str(content)) 217 | except: 218 | return "" 219 | else: 220 | return "" 221 | 222 | def get_reviws_number(self, soup): 223 | num_tag = soup.find(id="comm_num_down") 224 | if num_tag: 225 | return str(num_tag.get_text()).strip() 226 | else: 227 | return "" 228 | 229 | def get_rank(self, soup): 230 | rank_tag = soup.find("span", {"dd_name": "图书排行榜排名"}) 231 | if rank_tag: 232 | return str(rank_tag.get_text()).strip() 233 | else: 234 | return "" 235 | 236 | def get_pic(self, soup, lines_map): 237 | lines_map[self.contents_need[1]] = "miss" 238 | img_tag = soup.find(id="main-img-slider") 239 | img_url = [] 240 | if img_tag: 241 | img_lists = img_tag.findAll("a") 242 | if img_lists: 243 | for img_li in img_lists: 244 | if str(img_li["data-imghref"]) not in img_url: 245 | img_url.append(str(img_li["data-imghref"])) 246 | lines_map[self.contents_need[1]] = ','.join(img_url) 247 | 248 | 249 | def init_map(self): 250 | lines_map = {} 251 | c_len = len(self.contents_need) 252 | for i in range(0, c_len): 253 | lines_map[self.contents_need[i]] = '' 254 | # lines_map['age'] = age[option] 255 | return lines_map 256 | 257 | def write_to_line(self, lines_map): 258 | line = [] 259 | c_len = len(self.contents_need) 260 | for i in range(0, c_len): 261 | line.append(lines_map[self.contents_need[i]]) 262 | return line 263 | 264 | def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content): 265 | url_list, save_list = [], [] 266 | soup = BeautifulSoup(content, "lxml") 267 | if keys[0] == "lists": 268 | ul_tag = soup.find("ul", class_="list_aa listimg") 269 | if not ul_tag: 270 | return 1, url_list, save_list 271 | lists_tag = ul_tag.findAll("a", class_="pic") 272 | if not lists_tag: 273 | return 1, url_list, save_list 274 | for link in lists_tag: 275 | save_list.append([str(link["href"]), str(link["title"])]) 276 | elif keys[0] == "detail": 277 | lines_map = self.init_map() 278 | [series, isbn, pub_times, pages, words, pub_date, size, binding] = self.getdetail_descripe(soup) 279 | lines_map[self.contents_need[6]] = series 280 | lines_map[self.contents_need[0]] = isbn 281 | lines_map[self.contents_need[22]] = pub_times 282 | lines_map[self.contents_need[23]] = pages 283 | lines_map[self.contents_need[18]] = words 284 | lines_map[self.contents_need[21]] = pub_date 285 | lines_map[self.contents_need[19]] = size 286 | lines_map[self.contents_need[20]] = binding 287 | lines_map[self.contents_need[2]] = self.get_title(soup) 288 | lines_map[self.contents_need[3]] = self.get_content(soup) 289 | lines_map[self.contents_need[4]] = self.get_comment(soup) 290 | [author, translator] = self.get_author_and_trans(soup) 291 | lines_map[self.contents_need[7]] = author 292 | lines_map[self.contents_need[10]] = translator 293 | lines_map[self.contents_need[9]] = self.get_country(soup) 294 | lines_map[self.contents_need[11]] = self.get_publicator(soup) 295 | lines_map[self.contents_need[24]] = self.get_price(soup) 296 | lines_map[self.contents_need[25]] = self.get_editor_reco(soup) 297 | lines_map[self.contents_need[26]] = self.get_media_reco(soup) 298 | lines_map[self.contents_need[27]] = self.get_author_intro(soup) 299 | lines_map[self.contents_need[28]] = self.get_reviws_number(soup) 300 | lines_map[self.contents_need[29]] = self.get_rank(soup) 301 | self.get_pic(soup, lines_map) 302 | item = self.write_to_line(lines_map) 303 | item.append(url) 304 | #logging.warning(item) 305 | save_list.append(item) 306 | return 1, url_list, save_list 307 | 
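To make the numeric indexing in `htm_parse` above easier to follow: `init_map` creates a dict keyed by the names in `contents_need`, the helper methods fill it, `write_to_line` flattens it back into a list in `contents_need` order, the page url is appended last, and `BookSaver.item_save` (next file) binds that list positionally to the same column names plus `link`. A minimal sketch of this ordering contract, with made-up values, purely for illustration:

```python
# Illustration only, not part of the demo: the parser-to-saver ordering contract.
contents_need = ["isbn", "pic", "title"]               # first three names of BookParser.contents_need
lines_map = {name: "" for name in contents_need}       # what init_map() builds
lines_map["isbn"] = "9781234567890"                    # hypothetical parsed values
lines_map["title"] = "some picture book"
item = [lines_map[name] for name in contents_need]     # what write_to_line() returns
item.append("http://product.dangdang.com/12345.html")  # url appended at the end of htm_parse(), hypothetical
print(item)  # positions must line up with the column list in BookSaver's INSERT (plus link last)
```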
-------------------------------------------------------------------------------- /demos_dangdang/book_saver.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import pymysql 5 | 6 | 7 | class BookSaver(spider.Saver): 8 | 9 | def __init__(self): 10 | spider.Saver.__init__(self) 11 | self.conn = pymysql.connect(host="localhost", user="username", password="password", db="dangdang_book", charset="utf8") 12 | self.cursor = self.conn.cursor() 13 | self.conn.autocommit(1) 14 | return 15 | 16 | def item_save(self, url, keys, item): 17 | ''' 18 | self.cursor.execute("insert into t_doubanmovies (m_url, m_name, m_year, m_imgurl, m_director, m_writer, m_actors, " 19 | "m_genre, m_country, m_language, m_release, m_season, m_jishu, m_length, m_alias, m_website, m_dbsite, " 20 | "m_imdb, m_score, m_comment, m_starpercent)" 21 | " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", 22 | [i.strip() if i is not None else "" for i in item]) 23 | ''' 24 | if keys[0] == "lists": 25 | self.cursor.execute("insert into book_urls (url, title) values(%s, %s);", [i.strip() if i is not None else "" for i in item]) 26 | elif keys[0] == "detail": 27 | 28 | self.cursor.execute( 29 | "insert into book_detail (isbn, pic, title, con_reco, comment, brand, series, " 30 | "author, author_origin, author_country, translator, publicator, author_prize, book_prize, raw_title, age, responsibility, " 31 | "lan, words, size, binding, pub_date, pub_times, pages, price, editor_reco, media_reco, author_intro, review_num, dangdang_rank, link)" 32 | " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", 33 | [i.strip() if i is not None else "" for i in item]) 34 | 35 | return True 36 | -------------------------------------------------------------------------------- /demos_doubanmovies/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 抓取豆瓣电影的全部数据 5 | """ 6 | 7 | from .movie_fetcher import MovieFetcher 8 | from .movie_parser import MovieParser 9 | -------------------------------------------------------------------------------- /demos_doubanmovies/movie_fetcher.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import random 4 | import string 5 | import spider 6 | import logging 7 | import requests 8 | import requests.adapters 9 | requests.packages.urllib3.disable_warnings() 10 | 11 | 12 | class MovieFetcher(spider.Fetcher): 13 | 14 | def __init__(self): 15 | spider.Fetcher.__init__(self, max_repeat=3, sleep_time=0) 16 | 17 | self.session = requests.Session() 18 | self.session.mount('https://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)) 19 | self.clear_session() 20 | return 21 | 22 | def clear_session(self): 23 | self.session.headers.clear() 24 | self.session.cookies.clear() 25 | self.session.headers = { 26 | "User-Agent": spider.make_random_useragent("pc"), 27 | "Host": "movie.douban.com", 28 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 29 | "Accept-Encoding": "gzip, deflate, sdch, br", 30 | "Accept-Language": "zh-CN, zh; q=0.8, en; q=0.6", 31 | "Cookie": "bid=%s" % "".join(random.sample(string.ascii_letters + string.digits, 11)) 32 | } 33 | return 34 | 35 | def url_fetch(self, url, keys, repeat): 36 | 
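        # (added note) fetch a movie.douban.com page through the shared Session; on any
        # non-200 status the random "bid" cookie is rotated via clear_session() and
        # raise_for_status() is raised so the caller can retry the url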
resp = self.session.get(url, allow_redirects=False, verify=False, timeout=5) 37 | if resp.status_code == 200: 38 | return 1, resp.text 39 | logging.warning("Fetcher change cookie: %s", resp.status_code) 40 | self.clear_session() 41 | resp.raise_for_status() 42 | return 1, resp.text 43 | -------------------------------------------------------------------------------- /demos_doubanmovies/movie_parser.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | class MovieParser(spider.Parser): 8 | 9 | def htm_parse(self, priority, url, keys, deep, content): 10 | url_list, save_list = [], [] 11 | soup = BeautifulSoup(content, "html5lib") 12 | 13 | if keys[0] == "index": 14 | # 获取列表页中所有的电影页面Url 15 | div_movies = soup.find_all("a", class_="nbg", title=True) 16 | url_list.extend([(item.get("href"), ("detail", keys[1]), 0) for item in div_movies]) 17 | 18 | # 获取列表页的下一页 19 | next_page = soup.find("span", class_="next") 20 | if next_page: 21 | next_page_a = next_page.find("a") 22 | if next_page_a: 23 | url_list.append((next_page_a.get("href"), ("index", keys[1]), 1)) 24 | else: 25 | content = soup.find("div", id="content") 26 | 27 | # 标题 28 | name_and_year = [item.get_text() for item in content.find("h1").find_all("span")] 29 | name, year = name_and_year if len(name_and_year) == 2 else (name_and_year[0], "") 30 | movie = [url, name.strip(), year.strip("()")] 31 | 32 | # 左边 33 | content_left = soup.find("div", class_="subject clearfix") 34 | 35 | nbg_soup = content_left.find("a", class_="nbgnbg").find("img") 36 | movie.append(nbg_soup.get("src") if nbg_soup else "") 37 | 38 | info = content_left.find("div", id="info").get_text() 39 | info_dict = dict([line.strip().split(":", 1) for line in info.strip().split("\n") if line.strip().find(":") > 0]) 40 | 41 | movie.append(info_dict.get("导演", "").replace("\t", " ")) 42 | movie.append(info_dict.get("编剧", "").replace("\t", " ")) 43 | movie.append(info_dict.get("主演", "").replace("\t", " ")) 44 | 45 | movie.append(info_dict.get("类型", "").replace("\t", " ")) 46 | movie.append(info_dict.get("制片国家/地区", "").replace("\t", " ")) 47 | movie.append(info_dict.get("语言", "").replace("\t", " ")) 48 | 49 | movie.append(info_dict.get("上映日期", "").replace("\t", " ") if "上映日期" in info_dict else info_dict.get("首播", "").replace("\t", " ")) 50 | movie.append(info_dict.get("季数", "").replace("\t", " ")) 51 | movie.append(info_dict.get("集数", "").replace("\t", " ")) 52 | movie.append(info_dict.get("片长", "").replace("\t", " ") if "片长" in info_dict else info_dict.get("单集片长", "").replace("\t", " ")) 53 | 54 | movie.append(info_dict.get("又名", "").replace("\t", " ")) 55 | movie.append(info_dict.get("官方网站", "").replace("\t", " ")) 56 | movie.append(info_dict.get("官方小站", "").replace("\t", " ")) 57 | movie.append(info_dict.get("IMDb链接", "").replace("\t", " ")) 58 | 59 | # 右边 60 | content_right = soup.find("div", class_="rating_wrap clearbox") 61 | if content_right: 62 | movie.append(content_right.find("strong", class_="ll rating_num").get_text()) 63 | 64 | rating_people = content_right.find("a", class_="rating_people") 65 | movie.append(rating_people.find("span").get_text() if rating_people else "") 66 | 67 | rating_per_list = [item.get_text() for item in content_right.find_all("span", class_="rating_per")] 68 | movie.append(", ".join(rating_per_list)) 69 | else: 70 | movie.extend(["", "", ""]) 71 | 72 | assert len(movie) == 21, "length of movie is invalid" 73 | 
save_list.append(movie) 74 | return 1, url_list, save_list 75 | -------------------------------------------------------------------------------- /demos_nbastats/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 从NBA官网http://stats.nba.com/获取球员数据 5 | """ 6 | -------------------------------------------------------------------------------- /demos_nbastats/nba_main.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import spider 4 | import requests 5 | 6 | # NBA球员索引URL 7 | url_player_index = "http://stats.nba.com/stats/commonallplayers?IsOnlyCurrentSeason=1&LeagueID=00&Season=2016-17" 8 | 9 | # NBA球员统计数据URL,传递参数PlayerID和PerMode("PerGame", "Totals") 10 | url_player_stats = "http://stats.nba.com/stats/playercareerstats?LeagueID=00&PlayerID=%s&PerMode=%s" 11 | 12 | 13 | # 定义抓取过程 14 | class NBAFetcher(spider.Fetcher): 15 | 16 | def url_fetch(self, url, keys, critical, fetch_repeat): 17 | """ 18 | 这里只需要重写url_fetch函数,参数含义及返回结果见框架 19 | """ 20 | headers = {"User-Agent": spider.make_random_useragent("pc"), "Accept-Encoding": "gzip"} 21 | response = requests.get(url, headers=headers, timeout=10) 22 | return 1, (response.json(), ) 23 | 24 | 25 | # 定义解析过程 26 | class NBAParser(spider.Parser): 27 | 28 | def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content): 29 | """ 30 | 这里只需要重写htm_parse函数,参数含义及返回结果见框架 31 | """ 32 | url_list, saver_list = [], [] 33 | if keys[0] == "index": 34 | # 解析索引页 35 | content_json = content[0] 36 | 37 | # 解析所有的球员 38 | for item in content_json["resultSets"][0]["rowSet"]: 39 | # 这里放入url_list的item为(url, keys, critical, priority), 注意这里keys的用法 40 | url_list.append((url_player_stats % (item[0], "Totals"), ("Totals", item[2]), True, 0)) 41 | url_list.append((url_player_stats % (item[0], "PerGame"), ("PerGame", item[2]), True, 0)) 42 | else: 43 | # 解析球员数据页 44 | content_json = content[0] 45 | 46 | # 解析球员数据 47 | saver_list = content_json["resultSets"][0]["rowSet"] 48 | return 1, url_list, saver_list 49 | 50 | 51 | # 定义保存过程 52 | class NBASaver(spider.Saver): 53 | 54 | def __init__(self, file_name_total, file_name_pergame): 55 | """ 56 | 构造函数,重写的目的是为了添加表头,并且不同的数据源写入到不同的文件 57 | """ 58 | spider.Saver.__init__(self) 59 | 60 | # 打开文件,并写入表头 61 | self.save_pipe_total = open(file_name_total, "w", encoding="utf-8") 62 | self.save_pipe_total.write("\t".join(["PLAYER_NAME", "PLAYER_ID", "SEASON_ID", "LEAGUE_ID", "TEAM_ID", "TEAM_ABBREVIATION", "PLAYER_AGE", 63 | "GP", "GS", "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", 64 | "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]) + "\n") 65 | self.save_pipe_pergame = open(file_name_pergame, "w", encoding="utf-8") 66 | self.save_pipe_pergame.write("\t".join(["PLAYER_NAME", "PLAYER_ID", "SEASON_ID", "LEAGUE_ID", "TEAM_ID", "TEAM_ABBREVIATION", "PLAYER_AGE", 67 | "GP", "GS", "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", 68 | "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]) + "\n") 69 | return 70 | 71 | def item_save(self, url, keys, item): 72 | """ 73 | 这里只需要重写item_save函数,参数含义及返回结果见框架 74 | """ 75 | if keys[0] == "Totals": 76 | self.save_pipe_total.write("\t".join([keys[1]] + [str(i) for i in item]) + "\n") 77 | elif keys[0] == "PerGame": 78 | self.save_pipe_pergame.write("\t".join([keys[1]] + [str(i) for i in item]) + "\n") 79 | else: 80 | return False 81 | return True 82 | 83 | 84 | 
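# (Added commentary, not part of the original file) How the pieces above fit together:
# the start url is queued with keys=("index",); NBAParser turns every player row of the
# index response into two stats urls with keys=("Totals", player_name) and
# ("PerGame", player_name); NBASaver then uses keys[0] to pick the output file and
# keys[1] to prepend the player name to each saved row. The main block below wires the
# three classes into a spider.WebSpider and fetches with 10 threads.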
if __name__ == "__main__": 85 | """ 86 | main流程 87 | """ 88 | # 初始化fetcher, parser和saver 89 | fetcher = NBAFetcher(critical_max_repeat=3, critical_sleep_time=0) 90 | parser = NBAParser(max_deep=-1, max_repeat=3) 91 | saver = NBASaver(file_name_total="nba_total.txt", file_name_pergame="nba_pergame.txt") 92 | 93 | # 初始化爬虫, 并传入初始Url 94 | nba_spider = spider.WebSpider(fetcher, parser, saver, url_filter=None) 95 | nba_spider.set_start_url(url_player_index, ("index",), critical=True) 96 | 97 | # 开启10个线程抓取数据 98 | nba_spider.start_work_and_wait_done(fetcher_num=10, is_over=True) 99 | 100 | exit() 101 | -------------------------------------------------------------------------------- /demos_taobao/chromedriver72: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xianhu/PSpiderDemos/ac4aa44b17eff223ebee059603bbcd4a8d28926e/demos_taobao/chromedriver72 -------------------------------------------------------------------------------- /demos_taobao/taobao.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 模拟登陆淘宝,并抓取商品 5 | """ 6 | 7 | import time 8 | from urllib.parse import quote 9 | from pyquery import PyQuery 10 | from selenium import webdriver 11 | from selenium.webdriver.common.by import By 12 | from selenium.common.exceptions import TimeoutException 13 | from selenium.webdriver.support.wait import WebDriverWait 14 | from selenium.webdriver.support import expected_conditions 15 | from selenium.webdriver import ActionChains 16 | 17 | 18 | # webdriver 19 | option = webdriver.ChromeOptions() 20 | # option.add_argument("--proxy-server=127.0.0.1:9000") 21 | option.add_experimental_option("excludeSwitches", ["enable-automation"]) 22 | # option.add_argument("--headless") 23 | browser = webdriver.Chrome("./chromedriver72", options=option) 24 | 25 | 26 | def login(name, password): 27 | """ 28 | 登陆 29 | """ 30 | url = "https://login.taobao.com/member/login.jhtml" 31 | browser.get(url) 32 | try: 33 | browser.find_element_by_css_selector("div.login-switch #J_Quick2Static").click() 34 | except Exception as excep: 35 | print(excep) 36 | 37 | # 输入用户名密码 38 | browser.find_element_by_id("TPL_username_1").send_keys(name) 39 | browser.find_element_by_id("TPL_password_1").send_keys(password) 40 | time.sleep(1) 41 | 42 | try: 43 | # 拖动滑块 44 | slider = browser.find_element_by_css_selector("#nc_1_n1z") 45 | action = ActionChains(browser) 46 | action.drag_and_drop_by_offset(slider, 500, 0).perform() 47 | time.sleep(3) 48 | except Exception as excep: 49 | print(excep) 50 | 51 | time.sleep(2) 52 | browser.find_element_by_id("J_SubmitStatic").click() 53 | return 54 | 55 | 56 | def index_page(page, key): 57 | print("正在爬去第", page, "页") 58 | try: 59 | browser.get("https://s.taobao.com/search?q="+quote(key)) 60 | try: 61 | slider2 = browser.find_element_by_css_selector("#nc_1__scale_text span.nc-lang-cnt") 62 | action2 = ActionChains(browser) 63 | action2.drag_and_drop_by_offset(slider2, 500, 0).perform() 64 | time.sleep(5) 65 | except Exception as excep: 66 | print(excep) 67 | 68 | input1 = wait.until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager div.form > input"))) 69 | submit = wait.until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager div.form > span.btn.J_Submit"))) 70 | input1.clear() 71 | input1.send_keys(page) 72 | submit.click() 73 | 74 | wait.until(expected_conditions.text_to_be_present_in_element((By.CSS_SELECTOR, 
"#mainsrp-pager li.item.active > span"), str(page))) 75 | wait.until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, ".m-itemlist .items .item"))) 76 | 77 | # 获取商品 78 | doc = PyQuery(browser.page_source) 79 | items = doc("#mainsrp-itemlist .items .item").items() 80 | for item in items: 81 | product = { 82 | "image": item.find(".pic .img").attr("data-src"), 83 | "price": item.find(".price").text(), 84 | "deal": item.find(".deal-cnt").text(), 85 | "title": item.find(".title").text(), 86 | "shop": item.find(".shop").text(), 87 | "location": item.find(".location").text(), 88 | } 89 | print(product) 90 | except TimeoutException: 91 | index_page(page, key) 92 | except Exception as excep: 93 | print(excep) 94 | return 95 | 96 | 97 | if __name__ == "__main__": 98 | wait = WebDriverWait(browser, 10) 99 | login("username", "password") 100 | # 抓取商品 101 | for x in range(2, 10): 102 | time.sleep(5) 103 | index_page(x, "python") 104 | browser.close() 105 | -------------------------------------------------------------------------------- /demos_weibo/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 获取微博数据 5 | """ 6 | 7 | from .weibo_user import WeiBoUser 8 | from .weibo_search import WeiBoSearch 9 | -------------------------------------------------------------------------------- /demos_weibo/weibo_login.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import re 4 | import rsa 5 | import ssl 6 | import time 7 | import json 8 | import queue 9 | import base64 10 | import spider 11 | import logging 12 | import binascii 13 | import urllib.parse 14 | ssl._create_default_https_context = ssl._create_unverified_context 15 | 16 | 17 | class WeiBoLogin(object): 18 | """ 19 | class of WeiBoLogin, to login weibo.com 20 | """ 21 | 22 | def __init__(self): 23 | """ 24 | constructor 25 | """ 26 | self.user_name = None 27 | self.pass_word = None 28 | self.user_uniqueid = None 29 | self.user_nick = None 30 | 31 | self.cookie_jar, self.opener = None, None 32 | self.yundama = spider.YunDaMa("", "") 33 | return 34 | 35 | def login(self, user_name, pass_word, proxies=None): 36 | """ 37 | login weibo.com, return True or False 38 | """ 39 | self.user_name = user_name 40 | self.pass_word = pass_word 41 | self.user_uniqueid = None 42 | self.user_nick = None 43 | 44 | self.cookie_jar, self.opener = spider.make_cookiejar_opener(is_cookie=True, proxies=proxies) 45 | self.opener.addheaders = spider.make_headers( 46 | user_agent="pc", 47 | host="weibo.com", 48 | referer="http://weibo.com/", 49 | accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 50 | accept_encoding="gzip, deflate", 51 | accept_language="zh-CN,zh;q=0.8" 52 | ).items() 53 | self.opener.open("http://weibo.com/login.php") 54 | 55 | # get json data 56 | s_user_name = self.get_username() 57 | json_data = self.get_json_data(su_value=s_user_name) 58 | if not json_data: 59 | return False 60 | s_pass_word = self.get_password(json_data["servertime"], json_data["nonce"], json_data["pubkey"]) 61 | 62 | # make post_dict 63 | post_dict = { 64 | "entry": "weibo", 65 | "gateway": "1", 66 | "from": "", 67 | "savestate": "7", 68 | "userticket": "1", 69 | "vsnf": "1", 70 | "service": "miniblog", 71 | "encoding": "UTF-8", 72 | "pwencode": "rsa2", 73 | "sr": "1280*800", 74 | "prelt": "529", 75 | "url": 
"http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack", 76 | "rsakv": json_data["rsakv"], 77 | "servertime": json_data["servertime"], 78 | "nonce": json_data["nonce"], 79 | "su": s_user_name, 80 | "sp": s_pass_word, 81 | "returntype": "TEXT", 82 | } 83 | 84 | # get captcha code 85 | if json_data["showpin"] == 1: 86 | url = "http://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(time.time()), json_data["pcid"]) 87 | cid, code = self.yundama.get_captcha(self.opener.open(url).read(), "captcha.jpeg", "image/jpeg", codetype="1005") 88 | if not code: 89 | return False 90 | else: 91 | post_dict["pcid"] = json_data["pcid"] 92 | post_dict["door"] = code 93 | 94 | # login weibo.com 95 | login_url_1 = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)&_=%d" % int(time.time()) 96 | json_data_1 = json.loads(spider.get_html_content(self.opener.open(login_url_1, data=spider.make_post_data(post_dict)))) 97 | if json_data_1["retcode"] == "0": 98 | # callback 99 | post_dict = { 100 | "callback": "sinaSSOController.callbackLoginStatus", 101 | "ticket": json_data_1["ticket"], 102 | "ssosavestate": int(time.time()), 103 | "client": "ssologin.js(v1.4.18)", 104 | "_": int(time.time()*1000), 105 | } 106 | login_url_2 = "https://passport.weibo.com/wbsso/login?" + urllib.parse.urlencode(post_dict) 107 | html_data = spider.get_html_content(self.opener.open(login_url_2), charset="gbk") 108 | json_data_2 = json.loads(re.search("\((?P.*)\)", html_data).group("result")) 109 | if json_data_2["result"] is True: 110 | self.user_uniqueid = json_data_2["userinfo"]["uniqueid"] 111 | self.user_nick = json_data_2["userinfo"]["displayname"] 112 | logging.warning("WeiBoLogin succeed: %s", json_data_2) 113 | else: 114 | logging.warning("WeiBoLogin failed: %s", json_data_2) 115 | else: 116 | logging.warning("WeiBoLogin failed: %s", json_data_1) 117 | return True if self.user_uniqueid and self.user_nick else False 118 | 119 | def get_username(self): 120 | """ 121 | get legal username 122 | """ 123 | username_quote = urllib.parse.quote_plus(self.user_name) 124 | username_base64 = base64.b64encode(username_quote.encode("utf-8")) 125 | return username_base64.decode("utf-8") 126 | 127 | def get_json_data(self, su_value): 128 | """ 129 | get the value of "servertime", "nonce", "pubkey", "rsakv" and "showpin", etc 130 | """ 131 | post_data = urllib.parse.urlencode({ 132 | "entry": "weibo", 133 | "callback": "sinaSSOController.preloginCallBack", 134 | "rsakt": "mod", 135 | "checkpin": "1", 136 | "client": "ssologin.js(v1.4.18)", 137 | "su": su_value, 138 | "_": int(time.time()*1000), 139 | }) 140 | 141 | try: 142 | response = self.opener.open('http://login.sina.com.cn/sso/prelogin.php?'+post_data) 143 | data = spider.get_html_content(response, charset="utf-8") 144 | json_data = json.loads(re.search("\((?P.*)\)", data).group("data")) 145 | except Exception as excep: 146 | json_data = {} 147 | logging.error("WeiBoLogin get_json_data error: %s", excep) 148 | 149 | logging.debug("WeiBoLogin get_json_data: %s", json_data) 150 | return json_data 151 | 152 | def get_password(self, servertime, nonce, pubkey): 153 | """ 154 | get legal password, encrypt file: http://i.sso.sina.com.cn/js/ssologin.js 155 | """ 156 | string = (str(servertime) + '\t' + str(nonce) + '\n' + str(self.pass_word)).encode("utf-8") 157 | public_key = rsa.PublicKey(int(pubkey, 16), int("10001", 16)) 158 | password = rsa.encrypt(string, public_key) 159 | password = binascii.b2a_hex(password) 160 | return 
password.decode() 161 | 162 | 163 | class WeiBoBase(WeiBoLogin): 164 | """ 165 | class of WeiBoBase, as a base class 166 | """ 167 | def __init__(self, users_pair=None): 168 | """ 169 | constructor, users_pair: [(u1, p1), (u2, p2), ...] 170 | """ 171 | WeiBoLogin.__init__(self) 172 | 173 | self.users_pair = users_pair 174 | self.users_index = 0 175 | 176 | self.base_url = "http://weibo.com/" 177 | self.header_re = re.compile("\$CONFIG\[[\'\"](?P[\w]+?)[\'\"]\]=[\'\"](?P[\w]*?)[\'\"]") 178 | 179 | self.fetch_queue = queue.Queue() # unfetched url queue (url, keys, repeat) 180 | self.saved_set = set() # saved url or other id 181 | 182 | self.current_page = 1 # current page which is fetching 183 | self.max_repeat = 5 # maxinum repeat time 184 | self.out_file = None # output file 185 | self.out_list = [] # output list 186 | self.out_length = 0 # output length 187 | return 188 | 189 | def re_login(self): 190 | """ 191 | login repeat according to self.users_index 192 | """ 193 | user_name, pass_word = self.users_pair[self.users_index % len(self.users_pair)] 194 | if not self.login(user_name, pass_word): 195 | exit() 196 | self.users_index += 1 197 | return 198 | -------------------------------------------------------------------------------- /demos_weibo/weibo_search.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import re 4 | import bs4 5 | import time 6 | import json 7 | import spider 8 | import logging 9 | import datetime 10 | import urllib.parse 11 | from .weibo_login import WeiBoBase 12 | 13 | 14 | class WeiBoSearch(WeiBoBase): 15 | """ 16 | class of WeiBoSearch 17 | """ 18 | 19 | def __init__(self, users_pair): 20 | """ 21 | constructor 22 | """ 23 | WeiBoBase.__init__(self, users_pair=users_pair) 24 | 25 | # parameters which are needed in this class 26 | self.search_url = "http://s.weibo.com/weibo/" 27 | self.fetch_keys = tuple() # type: (A, B, ...) 
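        # (added note) fetch_keys/fetch_timescope/fetch_type are set by fetch_search_weibo()
        # and combined by update_fetch_queue() into the s.weibo.com search url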
28 | self.fetch_timescope = None # type: custom:2016-04-01-0:2016-04-02 29 | self.fetch_type = None # type: typeall=1, xsort=hot, scope=ori 30 | return 31 | 32 | def update_fetch_queue(self): 33 | """ 34 | update fetch queue 35 | """ 36 | search_url = self.search_url + urllib.parse.quote(urllib.parse.quote(" ".join(self.fetch_keys))) 37 | search_url += "&%s&suball=1×cope=%s&page=%d" % (self.fetch_type, self.fetch_timescope, self.current_page) 38 | self.fetch_queue.put(item=(search_url, "search", 0)) 39 | return 40 | 41 | def fetch_search_weibo(self, fetch_keys, fetch_timescope, fetch_type="typeall=1", out_file=None): 42 | """ 43 | fetch search weibo 44 | """ 45 | assert fetch_type in ["typeall=1", "xsort=hot", "scope=ori"] 46 | self.re_login() if not self.user_uniqueid else 0 47 | 48 | # base class variables 49 | self.fetch_queue.queue.clear() 50 | self.saved_set.clear() 51 | self.current_page = 1 52 | self.out_file = out_file 53 | self.out_list = [] 54 | self.out_length = 0 55 | 56 | # this class variables 57 | self.fetch_keys = fetch_keys 58 | self.fetch_timescope = fetch_timescope 59 | self.fetch_type = fetch_type 60 | 61 | # update fetch queue 62 | self.update_fetch_queue() 63 | 64 | while self.fetch_queue.qsize() > 0: 65 | url, keys, repeat = self.fetch_queue.get() 66 | logging.debug("WeiBoSearch: keys=%s, repeat=%s, url=%s", keys, repeat, url) 67 | 68 | try: 69 | html_all = spider.get_html_content(self.opener.open(url)) 70 | for sc in re.findall("", html_all): 71 | json_data = json.loads(sc) 72 | 73 | if json_data.get("pid") == "pl_common_sassfilter": 74 | self.check_anti_by_captcha(json_data["html"]) 75 | self.update_fetch_queue() 76 | break 77 | 78 | if json_data.get("pid") == "pl_weibo_direct": 79 | self.parse_search_weibo_page(json_data["html"]) 80 | break 81 | except Exception as excep: 82 | if repeat < self.max_repeat: 83 | self.fetch_queue.put(item=(url, keys, repeat+1)) 84 | else: 85 | logging.error("WeiBoSearch error: %s, url=%s", excep, url) 86 | return 87 | 88 | def parse_search_weibo_page(self, html): 89 | """ 90 | parse search weibo page 91 | """ 92 | soup = bs4.BeautifulSoup(html, "html.parser") 93 | 94 | if soup.find("div", class_="search_noresult"): 95 | logging.warning("WeiBoSearch: no result") 96 | return 97 | 98 | for item in soup.find_all("div", attrs={"action-type": "feed_list_item", "mid": True}): 99 | weibo_id = item.get("mid") 100 | 101 | soup_info = item.find("a", class_="W_textb", date=True) 102 | weibo_url = soup_info.get("href") 103 | weibo_date = datetime.datetime.fromtimestamp(int(soup_info.get("date")) / 1000.0) 104 | weibo_content = spider.get_string_strip(item.find("p", class_="comment_txt").get_text()) 105 | 106 | soup_user = item.find("img", class_="W_face_radius") 107 | weibo_user = soup_user.get("alt") 108 | _, querys = spider.get_url_params("http://xxx.com/?" 
+ soup_user.get("usercard")) 109 | weibo_user_id = querys["id"] if "id" in querys else "" 110 | 111 | weibo_list = (weibo_id, weibo_url, weibo_date, weibo_user_id, weibo_user, weibo_content) 112 | if self.out_file: 113 | self.out_file.write("\t".join(map(str, weibo_list)) + "\n") 114 | else: 115 | self.out_list.append(weibo_list) 116 | self.out_length += 1 117 | logging.debug("WeiBoSearch: current_page=%s, out_length=%s", self.current_page, self.out_length) 118 | 119 | if soup.find("a", class_="page next S_txt1 S_line1", href=True): 120 | self.current_page += 1 121 | self.update_fetch_queue() 122 | return 123 | 124 | def check_anti_by_captcha(self, html): 125 | """ 126 | check anti-spider by captcha 127 | """ 128 | soup = bs4.BeautifulSoup(html, "html.parser") 129 | 130 | cid, code = None, None 131 | while not code: 132 | captcha_url = soup.find("img", attrs={"node-type": "yzm_img"}).get("src") 133 | response = self.opener.open(spider.get_url_legal(captcha_url, self.search_url)) 134 | cid, code = self.yundama.get_captcha(response.read(), "captcha.jpeg", "image/jpeg", codetype="1004") 135 | 136 | verified_url = "http://s.weibo.com/ajax/pincode/verified?__rnd=%d" % int(time.time() * 1000) 137 | post_data = spider.make_post_data({ 138 | "secode": code, 139 | "type": "sass", 140 | "pageid": "weibo", 141 | "_t": 0 142 | }) 143 | temp = json.loads(spider.get_html_content(self.opener.open(verified_url, data=post_data))) 144 | if temp["code"] == "100000": 145 | logging.warning("WeiBoSearch anti-spider succeed") 146 | else: 147 | logging.warning("WeiBoSearch anti-spider failed") 148 | self.yundama.report(cid) if cid else 0 149 | return 150 | -------------------------------------------------------------------------------- /demos_weibo/weibo_user.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import re 4 | import bs4 5 | import sys 6 | import json 7 | import time 8 | import spider 9 | import random 10 | import logging 11 | import datetime 12 | import urllib.parse 13 | from .weibo_login import WeiBoBase 14 | 15 | 16 | class WeiBoUser(WeiBoBase): 17 | """ 18 | class of WeiBoUser 19 | """ 20 | 21 | def __init__(self, users_pair): 22 | """ 23 | constructor 24 | """ 25 | WeiBoBase.__init__(self, users_pair=users_pair) 26 | 27 | # parameters which are needed in this class 28 | self.bar_url = "http://weibo.com/p/aj/v6/mblog/mbloglist?" 
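        # (added note) AJAX endpoint used by fetch_user_weibos() below to page through the
        # lazily loaded "pagebar" blocks of a profile feed (queued with keys == "page_bar")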
29 | self.html_re = re.compile("") 30 | return 31 | 32 | def fetch_user_from_id(self, user_id): 33 | """ 34 | fetch user data from user_id 35 | :return user_name, [user_page_follows, user_page_fans, user_page_weibos], [follows, fans, weibos] 36 | """ 37 | self.re_login() if not self.user_uniqueid else 0 38 | 39 | user_url_base = "http://weibo.com/%s/profile" % user_id.strip() 40 | user_name, user_pages, user_counts = None, [], [] 41 | 42 | repeat_time = 0 43 | while repeat_time <= self.max_repeat: 44 | logging.debug("WeiBoUser repeat: repeat_time=%d" % repeat_time) if repeat_time > 0 else 0 45 | html_all = spider.get_html_content(self.opener.open(user_url_base, timeout=5)) 46 | 47 | header_dict = {key: value for key, value in self.header_re.findall(html_all)} 48 | if ("uid" not in header_dict) or ("nick" not in header_dict): 49 | repeat_time += 1 50 | continue 51 | 52 | if ("onick" not in header_dict) or (header_dict["onick"] == header_dict["nick"]): 53 | repeat_time += 1 54 | continue 55 | 56 | for sc_string in self.html_re.findall(html_all): 57 | json_data = json.loads(sc_string) 58 | if json_data["domid"] == "Pl_Core_T8CustomTriColumn__3" and "html" in json_data: 59 | soup = bs4.BeautifulSoup(json_data["html"], "html.parser") 60 | a_soup_list = soup.find_all("a", class_="S_txt1") 61 | user_pages = [a_soup.get("href") for a_soup in a_soup_list] 62 | user_counts = [int(spider.get_string_num(a_soup.get_text())) for a_soup in a_soup_list] 63 | user_name = header_dict["onick"] 64 | break 65 | 66 | if user_name: 67 | break 68 | 69 | repeat_time += 1 70 | # return result 71 | logging.warning("WeiBoUser fetch_user_from_id: user_id=%s, user_name=%s" % (user_id, user_name)) 72 | return user_name, user_pages, user_counts 73 | 74 | def fetch_user_weibos(self, user_url, key_dict, file_out=sys.stdout, sleep_time=0): 75 | """ 76 | fetch user weibo, user_url like: http://weibo.com/p/1005051750270991/home?parameters 77 | :param key_dict: {"mod": "data", "is_all": 1} 78 | :param key_dict: {"stat_date": "201512", "is_all": 1} 79 | :param key_dict: { 80 | "is_ori": 1, "is_forward": 1, "is_text": 1, "is_pic": 1, "is_video": 1, "is_music": 1, "is_article": 1, 81 | "key_word": "a b", "start_time": "2016-06-01", "end_time": "2016-06-04", "is_search": 1, "is_searchadv": 1 82 | } 83 | """ 84 | self.re_login() if not self.user_uniqueid else 0 85 | 86 | self.fetch_queue.queue.clear() 87 | self.current_page = 1 88 | self.file_out = file_out 89 | 90 | # get the start url 91 | url_main, _ = spider.get_url_params(user_url, is_unique_values=True) 92 | self.fetch_queue.put((url_main+"?"+urllib.parse.urlencode(key_dict), "page_index", 0)) 93 | 94 | # get data from url 95 | while self.fetch_queue.qsize() > 0: 96 | time.sleep(random.randint(0, sleep_time)) if sleep_time > 0 else 0 97 | url, keys, repeat = self.fetch_queue.get() 98 | 99 | try: 100 | html_all = spider.get_html_content(self.opener.open(url, timeout=5)) 101 | main, querys = spider.get_url_params(url, is_unique_values=True) 102 | 103 | if keys == "page_index": 104 | logging.warning("WeiBoUser index: repeat=%d, page=%d, url=%s" % (repeat, self.current_page, url)) 105 | 106 | header_dict = {key: value for key, value in self.header_re.findall(html_all)} 107 | for sc_string in self.html_re.findall(html_all): 108 | json_data = json.loads(sc_string) 109 | if json_data.get("ns") == "pl.content.homeFeed.index" and \ 110 | json_data["domid"].startswith("Pl_Official_MyProfileFeed"): 111 | # get index data 112 | weibo_count, is_loading, next_page = 
self.parse_user_weibo_page(json_data["html"]) 113 | if is_loading: 114 | # pagebar 0 and 1 115 | post_dict = { 116 | "id": querys.get("id", header_dict["page_id"]), 117 | "domain": querys.get("domain", header_dict["domain"]), 118 | "domain_op": querys.get("domain_op", header_dict["domain"]), 119 | "pre_page": querys.get("page", 1), 120 | "page": querys.get("page", 1), 121 | "pagebar": 0, 122 | "feed_type": 0, 123 | "ajwvr": 6, 124 | "__rnd": int(time.time() * 1000) 125 | } 126 | post_dict.update(key_dict) 127 | self.fetch_queue.put((self.bar_url+urllib.parse.urlencode(post_dict), "page_bar", 0)) 128 | break 129 | 130 | elif keys == "page_bar": 131 | logging.warning("WeiBoUser bar=%s: page=%d url=%s" % (querys["pagebar"], self.current_page, url)) 132 | 133 | # get bar data 134 | weibo_count, is_loading, next_page = self.parse_user_weibo_page(json.loads(html_all)["data"]) 135 | if is_loading: 136 | querys["pagebar"] = 1 137 | self.fetch_queue.put((self.bar_url+urllib.parse.urlencode(querys), "page_bar", 0)) 138 | 139 | if next_page: 140 | self.current_page += 1 141 | _temp = next_page.get("href") 142 | self.fetch_queue.put((url_main+_temp[_temp.find("?"):], "page_index", 0)) 143 | 144 | except Exception as e: 145 | if repeat < self.max_repeat: 146 | self.fetch_queue.put((url, keys, repeat+1)) 147 | else: 148 | logging.error("WeiBoUser error: error=%s, url=%s" % (str(e), url)) 149 | return 150 | 151 | def parse_user_weibo_page(self, html): 152 | """ 153 | parse user weibo page, return weibo_count, is_loading, next_page_soup 154 | """ 155 | # check frequence 156 | if html.find("你搜的太频繁了") > 0: 157 | logging.warning("WeiBoUser frequence warning: re_login!") 158 | self.re_login() 159 | assert False 160 | 161 | soup = bs4.BeautifulSoup(html, "html.parser") 162 | weibo_count, is_loading, next_page_soup = 0, False, soup.find("a", class_="page next S_txt1 S_line1") 163 | 164 | # check weibo number 165 | count_soup = soup.find("em", class_="W_fb S_spetxt") 166 | if (not count_soup) or int(count_soup.get_text()) > 0: 167 | for weibo_soup in soup.find_all("div", class_=re.compile("WB_cardwrap"), mid=True): 168 | weibo_count += 1 169 | 170 | weibo_id = weibo_soup.get("mid") 171 | if weibo_id in self.saved_set: 172 | continue 173 | 174 | # user information -- user_name and user_href 175 | user_div = weibo_soup.find("div", class_="WB_info") 176 | user_name = user_div.find("a", usercard=True).get_text() 177 | assert user_name, "WeiBoUser error: user_name is null!" 
178 | 179 | # content information -- content_date 180 | date_div = weibo_soup.find("div", class_="WB_from S_txt2") 181 | content_date = datetime.datetime.fromtimestamp(int(date_div.find("a", date=True).get("date")) / 1000.0) 182 | 183 | # content information -- content and expand_users 184 | content_div = weibo_soup.find("div", class_="WB_text W_f14") 185 | content = spider.get_string_strip(content_div.get_text()) 186 | self.file_out.write("\t".join([user_name, str(content_date), content]) + "\n") 187 | 188 | # expand information 189 | expand_weibo = weibo_soup.find("div", class_="WB_feed_expand") 190 | if expand_weibo and (not expand_weibo.find("div", class_="WB_empty")): 191 | expand_user_div = expand_weibo.find("div", class_="WB_info") 192 | expand_user_name = expand_user_div.find("a", usercard=True).get_text().strip("@") 193 | 194 | # expand_date_div = expand_weibo.find("div", class_="WB_from S_txt2") 195 | expand_content = spider.get_string_strip(expand_weibo.find("div", class_="WB_text").get_text()) 196 | self.file_out.write("\t".join([user_name, "ex_c", expand_user_name, expand_content]) + "\n") 197 | 198 | self.saved_set.add(weibo_id) 199 | # if is_loading 200 | is_loading = True if html.rfind("正在加载中") > 0 else False 201 | 202 | logging.debug("WeiBoUser: weibo_count=%d, weibo_all=%d, is_loading=%s, next_page=%s" % 203 | (weibo_count, len(self.saved_set), str(is_loading), str(bool(next_page_soup)))) 204 | return weibo_count, is_loading, next_page_soup 205 | -------------------------------------------------------------------------------- /demos_weixin/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 获取微信公众号数据 5 | """ 6 | 7 | from .weixin_public import WeiXinPublic 8 | -------------------------------------------------------------------------------- /demos_weixin/weixin_public.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | import sys 4 | import time 5 | import json 6 | import spider 7 | import logging 8 | import urllib.parse 9 | import urllib.request 10 | from queue import Queue 11 | from bs4 import BeautifulSoup 12 | 13 | 14 | class WeiXinPublic(object): 15 | """ 16 | class of WeiXinPublic 17 | """ 18 | 19 | def __init__(self, max_repeat=5): 20 | """ 21 | constructor 22 | """ 23 | self.base_url = "http://weixin.sogou.com/" 24 | self.base_url_gzhjs = "http://weixin.sogou.com/gzhjs?" 25 | self.base_url_weixin = "http://weixin.sogou.com/weixin?" 
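        # (added note) sogou weixin search endpoint; reset_this_class() appends the query
        # string built from fetch_type (1: search a public account, 2: search articles)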
26 | self.base_url_antispider = "http://weixin.sogou.com/antispider/" 27 | self.base_url_weixinqq = "http://mp.weixin.qq.com/" 28 | 29 | self.fetch_queue = Queue() # unfetched url queue (url, keys, repeat) 30 | self.saved_set = set() # saved url or other id 31 | self.current_page = 1 # current page which is fetching 32 | self.max_repeat = max_repeat # maxinum repeat time 33 | 34 | self.arts_key = None # key words for fetching articals 35 | self.user_id = None # user id, not the open_id; None if fetch_type is 2 36 | self.search_keys = None # search keys, (key, others) 37 | 38 | self.fetch_type = 1 # fetch type, 1: public_user, 2: public_artical 39 | self.fetch_tsn = 0 # fetch tsn, 0: all, 1: one day, 2: one week, 3: one month 40 | 41 | self.cookie_jar, self.opener = spider.make_cookiejar_opener() 42 | self.opener.addheaders = spider.make_headers( 43 | user_agent="pc", 44 | host="weixin.sogou.com", 45 | accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 46 | accept_encoding="gzip, deflate", 47 | accept_language="zh-CN" 48 | ).items() 49 | 50 | # prepare to identify the captcha, and reset this class 51 | self.yundama = spider.YunDaMa("qixianhu", "mimaMIMA123456") 52 | self.file_out = None 53 | return 54 | 55 | def fetch_user(self, user_id, file_out=sys.stdout): 56 | """ 57 | fetch user 58 | """ 59 | self.file_out = file_out 60 | self.user_id = user_id 61 | self.search_keys = ("user_search", user_id) 62 | 63 | self.fetch_type = 1 64 | self.fetch_tsn = 0 65 | 66 | self.reset_this_class() 67 | self.work() 68 | return 69 | 70 | def fetch_arts(self, arts_key, fetch_tsn=0, file_out=sys.stdout): 71 | """ 72 | fetch articles 73 | """ 74 | self.file_out = file_out 75 | self.arts_key = arts_key 76 | self.search_keys = ("arts_search", arts_key) 77 | 78 | self.fetch_type = 2 79 | self.fetch_tsn = fetch_tsn 80 | 81 | self.reset_this_class() 82 | self.work() 83 | return 84 | 85 | def reset_this_class(self): 86 | """ 87 | reset this class 88 | """ 89 | post_dict = { 90 | "type": self.fetch_type, 91 | "query": self.arts_key if self.fetch_type == 2 else self.user_id, 92 | "ie": "utf-8", 93 | "_sug_": "n", 94 | "_sug_type_": "", 95 | "t": int(time.time() * 1000) 96 | } 97 | if self.fetch_type == 2: 98 | post_dict["tsn"] = self.fetch_tsn 99 | post_dict["page"] = self.current_page 100 | post_data = urllib.parse.urlencode(post_dict) 101 | 102 | self.fetch_queue.queue.clear() 103 | self.fetch_queue.put(item=(self.base_url_weixin+post_data, self.search_keys, 0)) 104 | logging.debug("WeiXinPublic reset_this_class success: current_page=%d" % self.current_page) 105 | return 106 | 107 | def work(self): 108 | """ 109 | process of fetching and parsing 110 | """ 111 | while self.fetch_queue.qsize() > 0: 112 | url, keys, repeat = self.fetch_queue.get() 113 | logging.debug("WeiXinPublic work: keys=%s, repeat=%d, url=%s" % (str(keys), repeat, url)) 114 | try: 115 | response = self.opener.open(url, timeout=5) 116 | if keys[0] == "user_search": 117 | self.parse_user_search(url, keys, response) 118 | 119 | if keys[0] == "user_arts": 120 | self.parse_user_arts(url, keys, response) 121 | 122 | if keys[0] == "arts_search": 123 | self.parse_arts_search(url, keys, response) 124 | 125 | if keys[0] == "get_art": 126 | self.parse_get_art(url, keys, response) 127 | except Exception as excep: 128 | if repeat < self.max_repeat: 129 | self.fetch_queue.put(item=(url, keys, repeat+1)) 130 | else: 131 | logging.error("WeiXinPublic work: error=%s, url=%s" % (str(excep), url)) 132 | return 133 | 134 | def 
parse_user_search(self, url, keys, response): 135 | """ 136 | parser, keys: ("user_search", user_id) 137 | """ 138 | soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser") 139 | if not self.check_anti_by_captcha(soup): 140 | self.reset_this_class() 141 | return 142 | 143 | user_name = "" 144 | for user_item in soup.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item"): 145 | if user_item.find("label", attrs={"name": "em_weixinhao"}).get_text() == self.user_id: 146 | user_name = user_item.find("div", class_="txt-box").find("h3").get_text() 147 | self.fetch_queue.put(item=(user_item.get("href"), ("user_arts", self.user_id, user_name), 0)) 148 | logging.debug("WeiXinPublic parse_user_search: user_name=%s" % user_name) 149 | return 150 | 151 | def parse_user_arts(self, url, keys, response): 152 | """ 153 | parser, keys: ("user_arts", user_id, user_name) 154 | """ 155 | html = spider.get_html_content(response, charset="utf-8") 156 | json_data = spider.get_json_data(html, "msgList = '(?P\{[\w\W]+?\})'") 157 | if json_data: 158 | for item in json_data.get("list", []): 159 | item_url = spider.get_url_legal(item["app_msg_ext_info"]["content_url"][1:], self.base_url_weixinqq).replace("&", "&") 160 | self.fetch_queue.put(item=(item_url, ("get_art", None, keys[1], keys[2]), 0)) 161 | for subitem in item["app_msg_ext_info"]["multi_app_msg_item_list"]: 162 | subitem_url = spider.get_url_legal(subitem["content_url"][1:], self.base_url_weixinqq).replace("&", "&") 163 | self.fetch_queue.put(item=(subitem_url, ("get_art", None, keys[1], keys[2]), 0)) 164 | logging.debug("WeiXinPublic parse_user_arts: len(fetch_queue)=%d" % self.fetch_queue.qsize()) 165 | return 166 | 167 | def parse_arts_search(self, url, keys, response): 168 | """ 169 | parser, keys: ("arts_search", arts_key) 170 | """ 171 | _, querys = spider.get_url_params(url) 172 | self.current_page = int(querys["page"][0]) if "page" in querys else self.current_page 173 | logging.debug("WeiXinPublic parse_arts_search: update current page, current_page=%d" % self.current_page) 174 | 175 | soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser") 176 | if not self.check_anti_by_captcha(soup): 177 | self.reset_this_class() 178 | return 179 | 180 | # current page 181 | for art_soup in soup.find_all("div", class_="txt-box"): 182 | art_url = spider.get_url_legal(art_soup.find("a").get("href"), base_url=url) 183 | user_openid = art_soup.find("a", id="weixin_account").get("i") 184 | user_name = art_soup.find("a", id="weixin_account").get("title") 185 | self.fetch_queue.put(item=(art_url, ("get_art", keys[1], user_openid, user_name), 0)) 186 | 187 | # next page 188 | next_page = soup.find("a", id="sogou_next") 189 | if next_page: 190 | next_page_url = spider.get_url_legal(next_page.get("href"), base_url=url) 191 | self.fetch_queue.put(item=(next_page_url, keys, 0)) 192 | return 193 | 194 | def parse_get_art(self, url, keys, response): 195 | """ 196 | parser, keys: ("get_art", None or arts_key, user_id or user_openid, user_name) 197 | """ 198 | soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser") 199 | 200 | _, querys = spider.get_url_params(url) 201 | s_title = spider.get_string_strip(soup.title.string) 202 | s_date = soup.find("em", id="post-date").get_text() 203 | self.file_out.write("\t".join([s_title, s_date, str(keys[1:])]) + "\n") 204 | 205 | self.saved_set.add(keys[2] + s_date + s_title) 206 | logging.debug("WeiXinPublic parse_get_art: len(saved_set)=%d" % 
len(self.saved_set)) 207 | return 208 | 209 | def check_anti_by_captcha(self, soup): 210 | """ 211 | check anti-spider by captcha 212 | :return: 1 (captcha absent or solved, can continue), 0 (request should be repeated) 213 | """ 214 | if not soup.find("img", id="seccodeImage"): 215 | return 1 216 | 217 | while 1: 218 | cid, code = None, None 219 | while not code: 220 | captcha_url = soup.find("img", id="seccodeImage").get("src") 221 | response = self.opener.open(spider.get_url_legal(captcha_url, self.base_url_antispider)) 222 | cid, code = self.yundama.get_captcha("captcha.jpeg", response.read(), "image/jpeg", codetype="1006") 223 | 224 | post_data = urllib.parse.urlencode({ 225 | "c": code, 226 | "r": soup.find("input", id="from").get("value"), 227 | "v": 5 228 | }).encode() 229 | response = self.opener.open("http://weixin.sogou.com/antispider/thank.php", data=post_data) 230 | 231 | json_data = json.loads(spider.get_html_content(response, charset="utf-8")) 232 | if json_data["msg"].find("解封成功") >= 0: 233 | snuid = json_data["id"] 234 | self.cookie_jar.set_cookie(spider.make_cookie(name="SNUID", value=snuid, domain="weixin.sogou.com")) 235 | 236 | post_dict = { 237 | "uigs_productid": "webapp", 238 | "type": "antispider", 239 | "subtype": "", 240 | "domain": "weixin", 241 | "suv": "", 242 | "snuid": snuid, 243 | "t": int(time.time() * 1000) 244 | } 245 | for cookie in self.cookie_jar: 246 | if cookie.name == "SUV": 247 | post_dict["suv"] = cookie.value 248 | 249 | post_dict["subtype"] = "0_seccodeInputSuccess" 250 | post_dict["t"] = int(time.time() * 1000) 251 | self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict)) 252 | 253 | post_dict["subtype"] = "close_refresh" 254 | post_dict["t"] = int(time.time() * 1000) 255 | self.opener.open("http://pb.sogou.com/pv.gif?"
+ urllib.parse.urlencode(post_dict)) 256 | break 257 | else: 258 | self.yundama.report(cid=cid) if cid else 0 259 | logging.warning("WeiXinPublic check_anti_by_captcha: anti-spider success!") 260 | return 0 261 | 262 | 263 | if __name__ == '__main__': 264 | logging.basicConfig(level=logging.DEBUG, format="%(asctime)s\t%(levelname)s\t%(message)s") 265 | 266 | weixin = WeiXinPublic() 267 | # weixin.fetch_user(user_id="diyCRT") 268 | # weixin.fetch_arts("北京国安", fetch_tsn=0) 269 | exit() 270 | -------------------------------------------------------------------------------- /demos_yundama/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 封装关于YunDaMa的接口,识别验证码 5 | """ 6 | 7 | from .yundama import YunDaMa 8 | -------------------------------------------------------------------------------- /demos_yundama/yundama.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | yundama.py by xianhu 5 | """ 6 | 7 | import time 8 | import spider 9 | import logging 10 | import requests 11 | 12 | 13 | class YunDaMa(object): 14 | """ 15 | class of YunDaMa, to identify captcha by yundama.com 16 | """ 17 | 18 | def __init__(self, user_name, pass_word, appid=None, appkey=None): 19 | """ 20 | constructor 21 | """ 22 | self.base_url = "http://api.yundama.com/api.php" 23 | self.base_headers = { 24 | "User-Agent": spider.make_random_useragent("pc"), 25 | "Host": "api.yundama.com", 26 | "Referer": "http://www.yundama.com/download/YDMHttp.html", 27 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 28 | "Accept-Language": "zh-CN,zh;q=0.8", 29 | "Origin": "http://www.yundama.com", 30 | } 31 | 32 | self.user_name = user_name 33 | self.pass_word = pass_word 34 | 35 | self.appid = "1" if not appid else appid 36 | self.appkey = "22cc5376925e9387a23cf797cb9ba745" if not appkey else appkey 37 | return 38 | 39 | def get_captcha(self, file_name, file_bytes, file_type="image/jpeg", codetype="1000", repeat=10): 40 | """ 41 | get captcha result(cid, code), based on file_name, file_bytes, file_type 42 | :key: http://www.yundama.com/apidoc/YDM_ErrorCode.html 43 | :param codetype: http://www.yundama.com/price.html 44 | """ 45 | cid = self.upload(file_name, file_bytes, file_type, codetype) 46 | if not cid: 47 | return None, None 48 | while repeat > 0: 49 | code = self.result(cid) 50 | if code: 51 | return cid, code 52 | repeat -= 1 53 | time.sleep(2) 54 | return cid, None 55 | 56 | def upload(self, file_name, file_bytes, file_type, codetype): 57 | """ 58 | upload image file, return cid or None 59 | """ 60 | post_data = { 61 | "username": self.user_name, 62 | "password": self.pass_word, 63 | "codetype": codetype, 64 | "appid": self.appid, 65 | "appkey": self.appkey, 66 | "timeout": 60, 67 | "method": "upload", 68 | } 69 | files = {"file": (file_name, file_bytes, file_type)} 70 | try: 71 | response = requests.post(self.base_url, data=post_data, headers=self.base_headers, files=files) 72 | json_data = response.json() 73 | except Exception as excep: 74 | json_data = {"ret": -1, "errMsg": excep} 75 | logging.warning("YunDaMa upload %s: %s", "succeed" if json_data["ret"] == 0 else "failed", json_data) 76 | return json_data.get("cid", "") 77 | 78 | def result(self, cid): 79 | """ 80 | get result from cid, return code or None 81 | """ 82 | try: 83 | response = requests.get(self.base_url+("?cid=%d&method=result" % cid), headers=self.base_headers) 84 | 
json_data = response.json() 85 | except Exception as excep: 86 | json_data = {"ret": -1, "errMsg": excep} 87 | logging.warning("YunDaMa result %s: %s", "succeed" if json_data["ret"] == 0 else "failed", json_data) 88 | return json_data.get("text", "") 89 | 90 | 91 | if __name__ == '__main__': 92 | ydm = YunDaMa("username", "password") 93 | cid_t, code_t = ydm.get_captcha("captcha.jpeg", requests.get("http://www.yundama.com/index/captcha").content) 94 | print(cid_t, code_t) 95 | if cid_t and (not code_t): 96 | ydm.result(cid_t) 97 | -------------------------------------------------------------------------------- /demos_zhihu/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | 获取知乎数据 5 | """ 6 | -------------------------------------------------------------------------------- /demos_zhihu/zhihu_login.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | define login_zhihu to login zhihu.com, just as a demonstration 5 | """ 6 | 7 | import re 8 | import time 9 | import json 10 | import logging 11 | import urllib.parse 12 | import urllib.request 13 | import http.cookiejar 14 | 15 | 16 | def login_zhihu(user_name, pass_word): 17 | """ 18 | login zhihu.com, just as a demonstration 19 | """ 20 | cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()) 21 | urllib.request.install_opener(urllib.request.build_opener(cookie_handler)) 22 | 23 | # get _xsrf 24 | response = urllib.request.urlopen("http://www.zhihu.com") 25 | data = response.read().decode("utf-8") 26 | _xsrf = re.search("name=\"_xsrf\" value=\"(?P<value>.*)\"", data).group("value") 27 | 28 | # get captcha 29 | response = urllib.request.urlopen('http://www.zhihu.com/captcha.gif?r=%d&type=login' % int(time.time() * 1000)) 30 | with open('captcha.jpg', 'wb') as file_image: 31 | file_image.write(response.read()) 32 | captcha = input("input the captcha: ") 33 | 34 | # login 35 | url = "http://www.zhihu.com/login/email" 36 | headers = { 37 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:41.0) Gecko/20100101 Firefox/41.0", 38 | "Referer": "http://www.zhihu.com/" 39 | } 40 | post_data = urllib.parse.urlencode({ 41 | "_xsrf": _xsrf, 42 | "email": user_name, 43 | "password": pass_word, 44 | "captcha": captcha, 45 | "remember_me": "true" 46 | }).encode() 47 | response = urllib.request.urlopen(urllib.request.Request(url, data=post_data, headers=headers)) 48 | result = json.loads(response.read().decode("utf-8")) 49 | 50 | if result["r"] == 0: 51 | logging.warning("login zhihu success!") 52 | return True 53 | logging.error("login zhihu failed! %s" % str(result)) 54 | return False 55 | -------------------------------------------------------------------------------- /otherfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile by xianhu: build a docker image for spider or flask 2 | # usage: docker build -t user/centos:v06 .
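# one possible way to try the resulting image (the tag follows the build example above; adjust as needed):
# usage: docker run -it user/centos:v06 /bin/bash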
3 | 4 | FROM centos:6.8 5 | 6 | MAINTAINER xianhu 7 | 8 | # change system environments 9 | ENV LANG en_US.UTF-8 10 | ENV LC_ALL en_US.UTF-8 11 | 12 | # change system local time 13 | RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime 14 | 15 | # update yum and install something 16 | RUN yum update -y 17 | RUN yum install -y xz 18 | RUN yum install -y vim 19 | RUN yum install -y git 20 | RUN yum install -y wget 21 | RUN yum install -y crontabs 22 | RUN yum install -y gcc 23 | RUN yum install -y make 24 | RUN yum install -y zlib-devel 25 | RUN yum install -y openssl-devel 26 | RUN yum clean all 27 | 28 | # restart crontab service 29 | RUN service crond restart 30 | 31 | # download python3 32 | WORKDIR /root/ 33 | RUN wget https://www.python.org/ftp/python/3.5.3/Python-3.5.3.tar.xz 34 | RUN tar -xf Python-3.5.3.tar.xz 35 | 36 | # install python3 37 | WORKDIR /root/Python-3.5.3 38 | RUN ./configure 39 | RUN make install 40 | RUN make clean 41 | RUN make distclean 42 | 43 | # install libs of python3 44 | ADD ./Dockerfile_requirements.txt /root/ 45 | WORKDIR /root/ 46 | RUN pip3 install --upgrade pip 47 | RUN pip3 install -r Dockerfile_requirements.txt 48 | RUN rm -rf /root/* 49 | 50 | # change python to python3 51 | RUN ln -sf /usr/local/bin/python3 /usr/bin/python 52 | RUN ln -sf /usr/bin/python2.6 /usr/bin/python2 53 | 54 | # change /usr/bin/yum 55 | RUN sed -i 's/usr\/bin\/python/usr\/bin\/python2/g' /usr/bin/yum 56 | 57 | # cmd command 58 | CMD /bin/bash 59 | -------------------------------------------------------------------------------- /otherfiles/pylint.conf: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=CVS 13 | 14 | # Add files or directories matching the regex patterns to the blacklist. The 15 | # regex matches against base names, not paths. 16 | ignore-patterns= 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=yes 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | # Use multiple processes to speed up Pylint. 26 | jobs=1 27 | 28 | # Allow loading of arbitrary C extensions. Extensions are imported into the 29 | # active Python interpreter and may run arbitrary code. 30 | unsafe-load-any-extension=no 31 | 32 | # A comma-separated list of package or module names from where C extensions may 33 | # be loaded. Extensions are loading into the active Python interpreter and may 34 | # run arbitrary code 35 | extension-pkg-whitelist= 36 | 37 | # Allow optimization of some AST trees. This will activate a peephole AST 38 | # optimizer, which will apply various small optimizations. For instance, it can 39 | # be used to obtain the result of joining multiple strings with the addition 40 | # operator. Joining a lot of strings can lead to a maximum recursion error in 41 | # Pylint and this flag can prevent that. It has one side effect, the resulting 42 | # AST will be different than the one from reality. This option is deprecated 43 | # and it will be removed in Pylint 2.0. 44 | optimize-ast=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. 
Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 51 | confidence= 52 | 53 | # Enable the message, report, category or checker with the given id(s). You can 54 | # either give multiple identifier separated by comma (,) or put this option 55 | # multiple time (only on the command line, not in the configuration file where 56 | # it should appear only once). See also the "--disable" option for examples. 57 | #enable= 58 | 59 | # Disable the message, report, category or checker with the given id(s). You 60 | # can either give multiple identifiers separated by comma (,) or put this 61 | # option multiple times (only on the command line, not in the configuration 62 | # file where it should appear only once).You can also use "--disable=all" to 63 | # disable everything first and then reenable specific checks. For example, if 64 | # you want to run only the similarities checker, you can use "--disable=all 65 | # --enable=similarities". If you want to run only the classes checker, but have 66 | # no Warning level messages displayed, use"--disable=all --enable=classes 67 | # --disable=W" 68 | disable=backtick,basestring-builtin,zip-builtin-not-iterating,old-ne-operator,dict-view-method,input-builtin,unichr-builtin,raw_input-builtin,xrange-builtin,parameter-unpacking,unicode-builtin,reduce-builtin,old-raise-syntax,raising-string,print-statement,delslice-method,next-method-called,dict-iter-method,standarderror-builtin,buffer-builtin,intern-builtin,long-builtin,nonzero-method,hex-method,oct-method,range-builtin-not-iterating,coerce-builtin,useless-suppression,setslice-method,indexing-exception,execfile-builtin,getslice-method,import-star-module-level,metaclass-assignment,map-builtin-not-iterating,unpacking-in-except,using-cmp-argument,long-suffix,round-builtin,old-octal-literal,file-builtin,apply-builtin,reload-builtin,old-division,filter-builtin-not-iterating,no-absolute-import,cmp-method,suppressed-message,coerce-method,cmp-builtin 69 | 70 | 71 | [REPORTS] 72 | 73 | # Set the output format. Available formats are text, parseable, colorized, msvs 74 | # (visual studio) and html. You can also give a reporter class, eg 75 | # mypackage.mymodule.MyReporterClass. 76 | output-format=text 77 | 78 | # Put messages in a separate file for each module / package specified on the 79 | # command line instead of printing them on stdout. Reports (if any) will be 80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated 81 | # and it will be removed in Pylint 2.0. 82 | files-output=no 83 | 84 | # Tells whether to display a full report or only the messages 85 | reports=yes 86 | 87 | # Python expression which should return a note less than 10 (10 is the highest 88 | # note). You have access to the variables errors warning, statement which 89 | # respectively contain the number of errors / warnings messages and the total 90 | # number of statements analyzed. This is used by the global evaluation report 91 | # (RP0004). 92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 93 | 94 | # Template used to display messages. This is a python new-style format string 95 | # used to format the message information. 
See doc for all details 96 | #msg-template= 97 | 98 | 99 | [BASIC] 100 | 101 | # Good variable names which should always be accepted, separated by a comma 102 | good-names=i,j,k,ex,Run,_ 103 | 104 | # Bad variable names which should always be refused, separated by a comma 105 | bad-names=foo,bar,baz,toto,tutu,tata 106 | 107 | # Colon-delimited sets of names that determine each other's naming style when 108 | # the name regexes allow several styles. 109 | name-group= 110 | 111 | # Include a hint for the correct naming format with invalid-name 112 | include-naming-hint=no 113 | 114 | # List of decorators that produce properties, such as abc.abstractproperty. Add 115 | # to this list to register other decorators that produce valid properties. 116 | property-classes=abc.abstractproperty 117 | 118 | # Regular expression matching correct module names 119 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 120 | 121 | # Naming hint for module names 122 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 123 | 124 | # Regular expression matching correct function names 125 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 126 | 127 | # Naming hint for function names 128 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 129 | 130 | # Regular expression matching correct attribute names 131 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 132 | 133 | # Naming hint for attribute names 134 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 135 | 136 | # Regular expression matching correct constant names 137 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 138 | 139 | # Naming hint for constant names 140 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 141 | 142 | # Regular expression matching correct inline iteration names 143 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 144 | 145 | # Naming hint for inline iteration names 146 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 147 | 148 | # Regular expression matching correct argument names 149 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 150 | 151 | # Naming hint for argument names 152 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 153 | 154 | # Regular expression matching correct class names 155 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 156 | 157 | # Naming hint for class names 158 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 159 | 160 | # Regular expression matching correct class attribute names 161 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 162 | 163 | # Naming hint for class attribute names 164 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 165 | 166 | # Regular expression matching correct variable names 167 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 168 | 169 | # Naming hint for variable names 170 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 171 | 172 | # Regular expression matching correct method names 173 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 174 | 175 | # Naming hint for method names 176 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 177 | 178 | # Regular expression which should only match function or class names that do 179 | # not require a docstring. 180 | no-docstring-rgx=^_ 181 | 182 | # Minimum line length for functions/classes that require docstrings, shorter 183 | # ones are exempt. 184 | docstring-min-length=-1 185 | 186 | 187 | [ELIF] 188 | 189 | # Maximum number of nested blocks for function / method body 190 | max-nested-blocks=5 191 | 192 | 193 | [FORMAT] 194 | 195 | # Maximum number of characters on a single line. 196 | max-line-length=200 197 | 198 | # Regexp for a line that is allowed to be longer than the limit. 
199 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 200 | 201 | # Allow the body of an if to be on the same line as the test if there is no 202 | # else. 203 | single-line-if-stmt=no 204 | 205 | # List of optional constructs for which whitespace checking is disabled. `dict- 206 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 207 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 208 | # `empty-line` allows space-only lines. 209 | no-space-check=trailing-comma,dict-separator 210 | 211 | # Maximum number of lines in a module 212 | max-module-lines=1000 213 | 214 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 215 | # tab). 216 | indent-string=' ' 217 | 218 | # Number of spaces of indent required inside a hanging or continued line. 219 | indent-after-paren=4 220 | 221 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 222 | expected-line-ending-format= 223 | 224 | 225 | [LOGGING] 226 | 227 | # Logging modules to check that the string format arguments are in logging 228 | # function parameter format 229 | logging-modules=logging 230 | 231 | 232 | [MISCELLANEOUS] 233 | 234 | # List of note tags to take in consideration, separated by a comma. 235 | notes=FIXME,XXX,TODO 236 | 237 | 238 | [SIMILARITIES] 239 | 240 | # Minimum lines number of a similarity. 241 | min-similarity-lines=4 242 | 243 | # Ignore comments when computing similarities. 244 | ignore-comments=yes 245 | 246 | # Ignore docstrings when computing similarities. 247 | ignore-docstrings=yes 248 | 249 | # Ignore imports when computing similarities. 250 | ignore-imports=no 251 | 252 | 253 | [SPELLING] 254 | 255 | # Spelling dictionary name. Available dictionaries: none. To make it working 256 | # install python-enchant package. 257 | spelling-dict= 258 | 259 | # List of comma separated words that should not be checked. 260 | spelling-ignore-words= 261 | 262 | # A path to a file that contains private dictionary; one word per line. 263 | spelling-private-dict-file= 264 | 265 | # Tells whether to store unknown words to indicated private dictionary in 266 | # --spelling-private-dict-file option instead of raising a message. 267 | spelling-store-unknown-words=no 268 | 269 | 270 | [TYPECHECK] 271 | 272 | # Tells whether missing members accessed in mixin class should be ignored. A 273 | # mixin class is detected if its name ends with "mixin" (case insensitive). 274 | ignore-mixin-members=yes 275 | 276 | # List of module names for which member attributes should not be checked 277 | # (useful for modules/projects where namespaces are manipulated during runtime 278 | # and thus existing member attributes cannot be deduced by static analysis. It 279 | # supports qualified module names, as well as Unix pattern matching. 280 | ignored-modules=Levenshtein 281 | 282 | # List of class names for which member attributes should not be checked (useful 283 | # for classes with dynamically set attributes). This supports the use of 284 | # qualified names. 285 | ignored-classes=optparse.Values,thread._local,_thread._local 286 | 287 | # List of members which are set dynamically and missed by pylint inference 288 | # system, and so shouldn't trigger E1101 when accessed. Python regular 289 | # expressions are accepted. 290 | generated-members= 291 | 292 | # List of decorators that produce context managers, such as 293 | # contextlib.contextmanager. Add to this list to register other decorators that 294 | # produce valid context managers.
295 | contextmanager-decorators=contextlib.contextmanager 296 | 297 | 298 | [VARIABLES] 299 | 300 | # Tells whether we should check for unused import in __init__ files. 301 | init-import=no 302 | 303 | # A regular expression matching the name of dummy variables (i.e. expectedly 304 | # not used). 305 | dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy 306 | 307 | # List of additional names supposed to be defined in builtins. Remember that 308 | # you should avoid to define new builtins when possible. 309 | additional-builtins= 310 | 311 | # List of strings which can identify a callback function by name. A callback 312 | # name must start or end with one of those strings. 313 | callbacks=cb_,_cb 314 | 315 | # List of qualified module names which can have objects that can redefine 316 | # builtins. 317 | redefining-builtins-modules=six.moves,future.builtins 318 | 319 | 320 | [CLASSES] 321 | 322 | # List of method names used to declare (i.e. assign) instance attributes. 323 | defining-attr-methods=__init__,__new__,setUp 324 | 325 | # List of valid names for the first argument in a class method. 326 | valid-classmethod-first-arg=cls 327 | 328 | # List of valid names for the first argument in a metaclass class method. 329 | valid-metaclass-classmethod-first-arg=mcs 330 | 331 | # List of member names, which should be excluded from the protected access 332 | # warning. 333 | exclude-protected=_asdict,_fields,_replace,_source,_make 334 | 335 | 336 | [DESIGN] 337 | 338 | # Maximum number of arguments for function / method 339 | max-args=10 340 | 341 | # Argument names that match this expression will be ignored. Default to name 342 | # with leading underscore 343 | ignored-argument-names=_.* 344 | 345 | # Maximum number of locals for function / method body 346 | max-locals=30 347 | 348 | # Maximum number of return / yield for function / method body 349 | max-returns=10 350 | 351 | # Maximum number of branch for function / method body 352 | max-branches=30 353 | 354 | # Maximum number of statements in function / method body 355 | max-statements=100 356 | 357 | # Maximum number of parents for a class (see R0901). 358 | max-parents=10 359 | 360 | # Maximum number of attributes for a class (see R0902). 361 | max-attributes=50 362 | 363 | # Minimum number of public methods for a class (see R0903). 364 | min-public-methods=1 365 | 366 | # Maximum number of public methods for a class (see R0904). 367 | max-public-methods=30 368 | 369 | # Maximum number of boolean expressions in a if statement 370 | max-bool-expr=10 371 | 372 | 373 | [IMPORTS] 374 | 375 | # Deprecated modules which should not be used, separated by a comma 376 | deprecated-modules=optparse 377 | 378 | # Create a graph of every (i.e. internal and external) dependencies in the 379 | # given file (report RP0402 must not be disabled) 380 | import-graph= 381 | 382 | # Create a graph of external dependencies in the given file (report RP0402 must 383 | # not be disabled) 384 | ext-import-graph= 385 | 386 | # Create a graph of internal dependencies in the given file (report RP0402 must 387 | # not be disabled) 388 | int-import-graph= 389 | 390 | # Force import order to recognize a module as part of the standard 391 | # compatibility libraries. 392 | known-standard-library= 393 | 394 | # Force import order to recognize a module as part of a third party library. 395 | known-third-party=enchant 396 | 397 | # Analyse import fallback blocks. 
This can be used to support both Python 2 and 398 | # 3 compatible code, which means that the block might have code that exists 399 | # only in one or another interpreter, leading to false positives when analysed. 400 | analyse-fallback-blocks=no 401 | 402 | 403 | [EXCEPTIONS] 404 | 405 | # Exceptions that will emit a warning when being caught. Defaults to 406 | # "Exception" 407 | overgeneral-exceptions= 408 | -------------------------------------------------------------------------------- /test_demos.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | test_demos.py by xianhu 5 | """ 6 | 7 | import re 8 | import spider 9 | import pymysql 10 | import logging 11 | import requests 12 | from bs4 import BeautifulSoup 13 | from demos_doubanmovies import MovieFetcher, MovieParser 14 | from demos_dangdang import BookFetcher, BookParser, BookSaver 15 | 16 | 17 | def get_douban_movies(): 18 | """ 19 | 测试豆瓣电影爬虫 20 | """ 21 | headers = { 22 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36", 23 | "Host": "movie.douban.com", 24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 25 | "Accept-Encoding": "gzip, deflate, sdch, br", 26 | "Accept-Language": "zh-CN, zh; q=0.8, en; q=0.6", 27 | "Cache-Control": "max-age=0", 28 | "Connection": "keep-alive", 29 | "Upgrade-Insecure-Requests": "1", 30 | "Cookie": "bid=Pd48iLTpsf8" 31 | } 32 | 33 | # 获取初始url 34 | all_urls = set() 35 | 36 | resp = requests.get("https://movie.douban.com/tag/", headers=headers, verify=False) 37 | assert resp.status_code == 200, resp.status_code 38 | 39 | soup = BeautifulSoup(resp.text, "html5lib") 40 | a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) 41 | all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) 42 | 43 | resp = requests.get("https://movie.douban.com/tag/?view=cloud", headers=headers, verify=False) 44 | assert resp.status_code == 200, resp.status_code 45 | 46 | soup = BeautifulSoup(resp.text, "html5lib") 47 | a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) 48 | all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) 49 | logging.warning("all urls: %s", len(all_urls)) 50 | 51 | # 构造爬虫 52 | dou_spider = spider.WebSpider(MovieFetcher(), MovieParser(max_deep=-1), spider.Saver(), spider.UrlFilter()) 53 | for tag, url in all_urls: 54 | dou_spider.set_start_url(url, ("index", tag), priority=1) 55 | dou_spider.start_work_and_wait_done(fetcher_num=20) 56 | return 57 | 58 | 59 | def get_dangdang_books(): 60 | """ 61 | 测试当当网爬虫 62 | """ 63 | fetcher_number = 10 64 | fetcher_list = [] 65 | for i in range(fetcher_number): 66 | fetcher_list.append(BookFetcher()) 67 | parser = BookParser() 68 | saver = BookSaver() 69 | dang_spider = spider.WebSpider(fetcher_list, parser, saver, None) 70 | 71 | # 获取所有链接并存入数据库,由于时间太长,因此抓取链接和信息分开进行 72 | url_prefix_list = ["http://category.dangdang.com/pg{}-cp01.41.43.05.00.00.html", "http://category.dangdang.com/pg{}-cp01.41.59.00.00.00.html"] 73 | 74 | for url_prefix in url_prefix_list: 75 | for i in range(100): 76 | url = url_prefix.format(i) 77 | dang_spider.set_start_url(url, ("lists",), priority=1) 78 | dang_spider.start_work_and_wait_done(fetcher_num=fetcher_number) 79 | 80 | # 开始抓取所有的详细信息 81 | dang_spider = spider.WebSpider(fetcher_list, 
parser, saver, None) 82 | conn = pymysql.connect(host="localhost", user="username", password="password", db="dangdang_book", charset="utf8") 83 | cursor = conn.cursor() 84 | conn.autocommit(1) 85 | cursor.execute("select url from book_urls;") 86 | url_list = [item[0] for item in cursor.fetchall()] 87 | 88 | for url in url_list: 89 | dang_spider.set_start_url(url, ("detail",), priority=1) 90 | 91 | dang_spider.start_work_and_wait_done(fetcher_num=fetcher_number) 92 | for f_er in fetcher_list: 93 | f_er.driver_quit() 94 | return 95 | 96 | 97 | def get_car_price(): 98 | """ 99 | 测试汽车价格爬虫 100 | """ 101 | return 102 | 103 | 104 | if __name__ == "__main__": 105 | logging.basicConfig(level=logging.WARNING, format="%(asctime)s\t%(levelname)s\t%(message)s") 106 | # get_douban_movies() 107 | # get_dangdang_books() 108 | get_car_price() 109 | --------------------------------------------------------------------------------
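Note: get_car_price() in test_demos.py is still an empty stub. A minimal sketch of how it might be wired up, mirroring get_douban_movies() and get_dangdang_books() above; CarFetcher, CarParser, CarSaver and the start URL are hypothetical placeholders (demos_carprice does not provide them yet), so treat this as an assumption-laden outline rather than working code.

# hypothetical sketch of get_car_price(), assuming demos_carprice supplies classes analogous to the dangdang demo
def get_car_price_sketch():
    """
    sketch: fetch car prices from car-price sites (抓取不同汽车网站的汽车价格)
    """
    # CarFetcher / CarParser / CarSaver are assumed to follow the same interfaces as BookFetcher / BookParser / BookSaver
    car_spider = spider.WebSpider(CarFetcher(), CarParser(max_deep=1), CarSaver(), spider.UrlFilter())
    # placeholder start url: replace with a real car-price list page
    car_spider.set_start_url("http://example.com/car_price/index.html", ("index",), priority=1)
    car_spider.start_work_and_wait_done(fetcher_num=10)
    return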