├── README.md
└── multi_selenium.py

/README.md:
--------------------------------------------------------------------------------
# multi-selenium-in-scrapy

A middleware that adds concurrency to Selenium-backed crawling while requiring only minimal changes to your spider.

Many sites with sophisticated anti-scraping measures can only be crawled with Selenium plus a headless browser, but Selenium is not concurrency-friendly: once a Scrapy project starts driving Selenium, the asynchronous advantage is lost and efficiency drops sharply. The multi-selenium middleware exploits the browser's own concurrency by loading pages in multiple tabs, combining JavaScript-driven navigation with Scrapy's native asynchronous scheduling to achieve pseudo-concurrency for Selenium and greatly improve the efficiency of crawling complex sites.

Documentation
-------------

### Usage

1. Set `CONCURRENT_REQUESTS` in `settings.py` and enable the middleware (see the settings sketch below).

2. Make a small change to your spider.

If the page is not ready, the middleware returns a response with status 202 instead of blocking the whole process, so the spider can re-enqueue the request and check again later. Add a condition like this to your callback:

    def parse(self, response):
        if response.status == 200:
            # the page finished loading: parse it as usual
            ...
        elif response.status == 202:
            # the page is still loading: re-enqueue and check again later
            yield scrapy.Request(response.url, callback=self.parse, dont_filter=True)

Remember to add `dont_filter=True` to the retry request; otherwise Scrapy's duplicate filter will drop it. De-duplication of URLs that have already been served is handled by the middleware itself via `IgnoreRequest`.
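For reference, a minimal `settings.py` sketch; the module path `myproject.multi_selenium` and the priority `543` are assumptions here, so adjust them to wherever you place the file:

    # settings.py -- module path and priority below are assumptions, not part of this repo
    CONCURRENT_REQUESTS = 8  # the middleware opens one browser tab per concurrent request

    DOWNLOADER_MIDDLEWARES = {
        'myproject.multi_selenium.SeleniumMiddleware': 543,
    }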
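Putting both steps together, a minimal end-to-end spider sketch; the start URL and the CSS selector are placeholders for illustration, not part of this repo:

    import scrapy

    class DemoSpider(scrapy.Spider):
        name = 'demo'
        start_urls = ['http://quotes.toscrape.com/']  # placeholder URL

        def parse(self, response):
            if response.status == 200:
                # the browser tab finished loading: extract data as usual
                for text in response.css('.quote .text::text').extract():
                    yield {'text': text}
            elif response.status == 202:
                # the tab is still loading: poll again through Scrapy's scheduler
                yield scrapy.Request(response.url, callback=self.parse, dont_filter=True)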
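### Notes

The code below targets Selenium 3 (the `executable_path` and `firefox_options` keyword arguments) and hardcodes the author's geckodriver path. Selenium 4 removed both keyword arguments; on a current Selenium the constructor call would look roughly like this sketch (the driver path is an assumption):

    # Selenium 4 variant (sketch only; this repo's code uses the Selenium 3 API)
    from selenium import webdriver
    from selenium.webdriver.firefox.service import Service

    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    browser = webdriver.Firefox(service=Service('/path/to/geckodriver'), options=options)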
--------------------------------------------------------------------------------


/multi_selenium.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from scrapy.http import HtmlResponse
from scrapy.exceptions import IgnoreRequest
from queue import Queue
from scrapy.utils.project import get_project_settings
import time


class SeleniumMiddleware(object):
    def __init__(self):
        # Initialize the headless browser
        # (Selenium 3 API; adjust the geckodriver path to your machine)
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')
        self.browser = webdriver.Firefox(
            executable_path=r'C:\Users\su\Desktop\geckodriver-v0.19.1-win64\geckodriver.exe',
            firefox_options=options)

        # Read the concurrency level from the project settings
        settings = get_project_settings()
        concurrent_requests = settings.getint('CONCURRENT_REQUESTS')

        # Open one tab per concurrent request (the browser starts with a single tab)
        while len(self.browser.window_handles) < concurrent_requests:
            self.browser.execute_script('''window.open("","_blank");''')

        # Queue of free window handles; a handle is checked out while its tab loads a page
        self.handle_queue = Queue(maxsize=concurrent_requests)
        for handle in self.browser.window_handles:
            self.handle_queue.put(handle)

        # Per-URL state: {'status': 'waiting'|'done', 'handle': window_handle}
        self.requests = {}

    def process_request(self, request, spider):
        result = self.requests.get(request.url)
        if result is None:
            # New URL: try to check out a free tab; if none is free,
            # tell the spider to come back later with a 202 response
            if self.handle_queue.empty():
                return HtmlResponse(url=request.url, request=request, encoding='utf-8', status=202)
            handle = self.handle_queue.get()

            # Start loading the URL in that tab via JavaScript
            # (note: assumes the URL contains no single quotes)
            self.browser.switch_to.window(handle)
            js = r"location.href='%s';" % request.url
            self.browser.execute_script(js)

            # Give the navigation a moment to start: right after assigning
            # location.href, document.readyState still reports 'complete'
            # for the previous page
            time.sleep(1)

            # Mark the URL as in flight
            self.requests[request.url] = {'status': 'waiting', 'handle': handle}

            return HtmlResponse(url=request.url, request=request, encoding='utf-8', status=202)

        elif result['status'] == 'waiting':
            # Switch to the tab and check the page status via JavaScript
            handle = result['handle']
            self.browser.switch_to.window(handle)
            document_status = self.browser.execute_script("return document.readyState;")

            if document_status == 'complete':
                # Page is ready: return the tab to the free queue and hand
                # the rendered HTML to the spider
                self.requests[request.url]['status'] = 'done'
                self.handle_queue.put(handle)
                return HtmlResponse(url=request.url, body=self.browser.page_source,
                                    request=request, encoding='utf-8', status=200)
            else:
                return HtmlResponse(url=request.url, request=request, encoding='utf-8', status=202)

        elif result['status'] == 'done':
            # URL already served: the middleware does its own de-duplication
            raise IgnoreRequest

    def __del__(self):
        # Quit the browser when the middleware object is destroyed
        self.browser.quit()
--------------------------------------------------------------------------------