├── .project
├── README.md
├── scrapy.cfg
└── znxin
    ├── __init__.py
    ├── items.py
    ├── main.py
    ├── output.xml
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── spider.py

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>znxin</name>
	<comment></comment>
	<projects>
	</projects>
	<buildSpec>
		<buildCommand>
			<name>org.python.pydev.PyDevBuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.python.pydev.pythonNature</nature>
	</natures>
</projectDescription>

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scrapy with PhantomJS+Selenium
A simple spider implemented with Scrapy, Selenium and PhantomJS.
Supports logging in, loading dynamic content, mouse movement and clicking, and window handling.
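To run it, execute `znxin/main.py`, which invokes `scrapy crawl testspider` through `scrapy.cmdline`.

A minimal sketch of the core pattern (illustrative names and URL, not the real spider — that lives in `znxin/spiders/spider.py`; assumes Scrapy, Selenium and a PhantomJS binary are installed):

```python
import scrapy
from selenium import webdriver

class RenderedSpider(scrapy.Spider):
    name = 'rendered'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # Scrapy schedules the request; PhantomJS then renders the
        # JavaScript-heavy page and the result is yielded as an item.
        driver = webdriver.PhantomJS()  # assumes phantomjs is on the PATH
        try:
            driver.get(response.url)
            yield {'name': driver.title, 'html': driver.page_source}
        finally:
            driver.quit()
```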
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = znxin.settings

[deploy]
#url = http://localhost:6800/
project = znxin

--------------------------------------------------------------------------------
/znxin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yswhynot/scrapy_phantomjs/cec3d99e552ae5e11058ad744e4be535fee7b33a/znxin/__init__.py

--------------------------------------------------------------------------------
/znxin/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZnxinItem(scrapy.Item):
    name = scrapy.Field()  # page title, used as the output file name
    html = scrapy.Field()  # full rendered page source

--------------------------------------------------------------------------------
/znxin/main.py:
--------------------------------------------------------------------------------
'''
Created on Oct 23, 2015

@author: v-shayi
'''
import scrapy.cmdline

def main():
    # Run from the project root (where scrapy.cfg lives) so Scrapy can
    # locate the settings. Uncomment the first line to export items to
    # output.xml instead.
    # scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'testspider', '-o', 'output.xml', '-t', 'xml'])
    scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'testspider'])

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/znxin/output.xml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/znxin/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import datetime
import os

class ZnxinPipeline(object):

    def __init__(self):
        # One directory per crawl day, e.g. /downloads/data/zhulianwang/2015-10-23
        self.directory = '/downloads/data/zhulianwang/%s' % str(datetime.date.today())
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

    def process_item(self, item, spider):
        # Write the rendered page to <directory>/<page title>.html
        with open('%s/%s.html' % (self.directory, item['name']), 'w') as f:
            f.write(item['html'].encode('utf-8'))
        return item

--------------------------------------------------------------------------------
/znxin/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for znxin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'znxin'

SPIDER_MODULES = ['znxin.spiders']
NEWSPIDER_MODULE = 'znxin.spiders'

USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)"
AUTOTHROTTLE_ENABLED = True


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'znxin (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'znxin.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'znxin.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'znxin.pipelines.ZnxinPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
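A quick, hand-run illustration of the item/pipeline wiring configured above (a sketch, not a file in this repo; assumes /downloads/data is writable by the current user):

# --- illustrative sketch, not part of the repo ---
from znxin.items import ZnxinItem
from znxin.pipelines import ZnxinPipeline

# Push a fake item through the pipeline by hand.
item = ZnxinItem(name='demo', html=u'<html><body>hello</body></html>')
pipeline = ZnxinPipeline()         # creates /downloads/data/zhulianwang/<today>/
pipeline.process_item(item, None)  # writes <today>/demo.html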
--------------------------------------------------------------------------------
/znxin/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/znxin/spiders/spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
'''
Created on Oct 22, 2015

@author: v-shayi
'''
from time import sleep

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

from znxin.items import ZnxinItem

indexURL = 'http://hqb.nxin.com/index.shtml'

class SpiderSpider(scrapy.Spider):
    name = 'testspider'
    start_urls = ['http://z.nxin.com/Home/Index']

    def parse(self, response):
        self.logger.info('response url: ' + response.url)

        # Hand over to PhantomJS for the JavaScript-heavy pages.
        self.logger.info('starting phantomjs')
        dr = webdriver.PhantomJS('C:\\Users\\v-shayi\\Software\\phantomjs\\phantomjs-2.0.0-windows\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe')

        self.logger.info('retrieving index URL')
        dr.get(indexURL)
        sleep(10)
        self.logger.info('driver url: ' + dr.current_url)

        # Log in by filling in the form and submitting it.
        form = dr.find_element_by_xpath('//form[@id="loginAlert"]')
        dr.find_element_by_xpath('//input[@id="username"]').send_keys('yswhynot')
        dr.find_element_by_xpath('//input[@id="loginPassword"]').send_keys('hello123')
        form.submit()
        sleep(12)
        self.logger.info('driver url after login: ' + dr.current_url)

        next_element = dr.find_element_by_xpath('//a[@class="right red_d"]')
        next_element.click()
        sleep(10)
        self.logger.info('driver url after click: ' + dr.current_url)

        # The click opens a new window; switch the driver to it.
        handles = dr.window_handles
        dr.switch_to_window(handles[1])
        self.logger.info('driver url after switching tab: ' + dr.current_url)

        item = ZnxinItem()
        item['name'] = dr.title
        item['html'] = dr.page_source
        yield item

        # Enumerate every (area, district) index pair up front, since each
        # click reloads the page and stales the live element references.
        sel = Selector(text=dr.page_source)
        areas = sel.xpath('//div[@id="so_baojia_list"]/div')
        region_elements = []
        for i, area in enumerate(areas):
            districts = area.xpath('.//dd/a')
            region_elements.extend([(i, j) for j in range(len(districts))])

        for region_element in region_elements:
            # Hover over the area entry so its district submenu is displayed.
            areas = dr.find_elements_by_xpath('//div[@id="area"]/ul/li')
            hov = ActionChains(dr).move_to_element(areas[region_element[0]])
            hov.perform()

            districts = dr.find_elements_by_xpath('//div[@id="so_baojia_list"]/div[contains(@style, "display")]//dd/a')
            sleep(0.1)
            districts[region_element[1]].click()
            sleep(8)
            self.logger.info('current title: ' + dr.title)

            item = ZnxinItem()
            item['name'] = dr.title
            item['html'] = dr.page_source
            yield item

        dr.quit()

        # Alternative approach (unused): force the submenus visible with
        # JavaScript instead of hovering, then click each district link.
        # areas = dr.find_elements_by_xpath('//div[@id="so_baojia_list"]/div[@class="layer"]')
        # for area in areas:
        #     js = 'arguments[0].style.height="auto"; arguments[0].style.display="block";'
        #     dr.execute_script(js, area)
        #
        # area_lists = dr.find_elements_by_xpath('//div[@id="area"]/ul/li')
        # for area_list in area_lists:
        #     js = 'arguments[0].setAttribute("class", "active");'
        #     dr.execute_script(js, area_list)
        #
        # districts = dr.find_elements_by_xpath('//dd/a')
        # for district in districts:
        #     district.click()
        #     self.logger.info('current title: ' + dr.title)
        #     item = ZnxinItem()

--------------------------------------------------------------------------------
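For reference, the hover-then-click interaction at the core of spider.py, reduced to a self-contained sketch (not part of the repo; the URL and locators are illustrative, and phantomjs is assumed to be on the PATH):

# --- illustrative sketch, not part of the repo ---
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.PhantomJS()     # assumes phantomjs is on the PATH
driver.get('http://example.com/')  # illustrative URL

# Hovering over a menu entry makes the page display its hidden submenu...
menu_items = driver.find_elements_by_xpath('//div[@id="area"]/ul/li')
ActionChains(driver).move_to_element(menu_items[0]).perform()
sleep(0.1)  # brief pause so the submenu's style change takes effect

# ...after which the links inside the now-visible submenu can be clicked.
links = driver.find_elements_by_xpath('//div[contains(@style, "display")]//dd/a')
if links:
    links[0].click()

driver.quit()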