├── .project
├── README.md
├── scrapy.cfg
└── znxin
├── __init__.py
├── items.py
├── main.py
├── output.xml
├── pipelines.py
├── settings.py
└── spiders
├── __init__.py
└── spider.py
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 |     <name>znxin</name>
 4 |     <comment></comment>
 5 |     <projects>
 6 |     </projects>
 7 |     <buildSpec>
 8 |         <buildCommand>
 9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy with PhantomJS+Selenium
2 | Simple spider implemented with Scrapy, Selenium and PhantomJS.
3 | It demonstrates login, dynamic content loading, mouse movement and clicking, and window handling.
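4 | 
5 | ## Running
6 | 
7 | Assuming Scrapy 1.x, Selenium 2.x and PhantomJS are installed (the PhantomJS
8 | binary path is hard-coded in `znxin/spiders/spider.py`), start the crawl from
9 | the project root with:
10 | 
11 | ```
12 | scrapy crawl testspider
13 | ```
14 | 
15 | Running `znxin/main.py` issues the same command through `scrapy.cmdline`.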
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = znxin.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = znxin
12 |
--------------------------------------------------------------------------------
/znxin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yswhynot/scrapy_phantomjs/cec3d99e552ae5e11058ad744e4be535fee7b33a/znxin/__init__.py
--------------------------------------------------------------------------------
/znxin/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class ZnxinItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | name = scrapy.Field()
14 | html = scrapy.Field()
15 |
16 |
--------------------------------------------------------------------------------
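
A note on items.py: every page the spider captures becomes one `ZnxinItem`, with the page title in `name` and the rendered source in `html`; `ZnxinPipeline` reads both fields when writing files. Fields behave like dictionary keys (the values below are made up):

```python
from znxin.items import ZnxinItem

item = ZnxinItem(name=u'example title', html=u'<html>...</html>')
print(item['name'])    # -> example title
# item['other'] = 1    # undeclared fields raise KeyError by design
```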
/znxin/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 23, 2015
3 |
4 | @author: v-shayi
5 | '''
6 | import scrapy.cmdline
7 |
8 | def main():
9 | # scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'testspider', '-o', 'output.xml', '-t', 'xml'])
10 | scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'testspider'])
11 |
12 | if __name__ == '__main__':
13 | main()
--------------------------------------------------------------------------------
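
A note on main.py: `scrapy.cmdline.execute` hands control to Scrapy's command line and calls `sys.exit` when the crawl ends, so nothing placed after it would run. An equivalent in-process entry point uses Scrapy's public `CrawlerProcess` API (a minimal sketch, assuming Scrapy >= 1.0):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from znxin.spiders.spider import SpiderSpider


def main():
    # Load settings.py (pipeline, AutoThrottle, user agent) and run the
    # spider in this process; start() blocks until the crawl finishes.
    process = CrawlerProcess(get_project_settings())
    process.crawl(SpiderSpider)
    process.start()


if __name__ == '__main__':
    main()
```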
/znxin/output.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/znxin/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import datetime
8 | import os
9 |
10 | class ZnxinPipeline(object):
11 | 
12 |     def __init__(self):
13 |         # One sub-directory per crawl day, e.g. /downloads/data/zhulianwang/2015-10-23
14 |         self.directory = '/downloads/data/zhulianwang/%s' % datetime.date.today()
15 |         if not os.path.exists(self.directory):
16 |             os.makedirs(self.directory)
17 | 
18 |     def process_item(self, item, spider):
19 |         # Write the captured page to <directory>/<page title>.html
20 |         path = os.path.join(self.directory, '%s.html' % item['name'])
21 |         with open(path, 'wb') as f:
22 |             f.write(item['html'].encode('utf-8'))
23 |         return item
--------------------------------------------------------------------------------
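
A note on pipelines.py: `process_item` uses the page title verbatim as a file name, so a title containing `/`, `:` or other reserved characters would make `open` fail. A defensive variant of the name handling (a sketch; the whitelist rule is my assumption, not part of the original pipeline):

```python
import re


def safe_filename(title, max_len=100):
    # Keep word characters (incl. CJK under re.UNICODE), dashes and spaces;
    # replace everything else and cap the length.
    cleaned = re.sub(r'[^\w\- ]', '_', title, flags=re.UNICODE)
    return cleaned.strip()[:max_len] or 'untitled'

# inside ZnxinPipeline.process_item:
#     path = os.path.join(self.directory, '%s.html' % safe_filename(item['name']))
```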
/znxin/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for znxin project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'znxin'
13 |
14 | SPIDER_MODULES = ['znxin.spiders']
15 | NEWSPIDER_MODULE = 'znxin.spiders'
16 |
17 | USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)"
18 | AUTOTHROTTLE_ENABLED = True
19 |
20 |
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'znxin (+http://www.yourdomain.com)'
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS=32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY=3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
33 | #CONCURRENT_REQUESTS_PER_IP=16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED=False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED=False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'znxin.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'znxin.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'znxin.pipelines.ZnxinPipeline': 100,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
74 | #AUTOTHROTTLE_ENABLED=True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY=5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY=60
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG=False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED=True
85 | #HTTPCACHE_EXPIRATION_SECS=0
86 | #HTTPCACHE_DIR='httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES=[]
88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
--------------------------------------------------------------------------------
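
A note on settings.py: `AUTOTHROTTLE_ENABLED = True` near the top makes Scrapy adapt its download delay to server latency; since this spider does most of its waiting inside Selenium, the throttle mainly paces the initial request. The commented `AUTOTHROTTLE_*` lines above can be enabled to tune it, e.g. (illustrative values):

```python
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5   # initial download delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 60    # ceiling applied under high latency
AUTOTHROTTLE_DEBUG = True      # log the delay chosen for each response
```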
/znxin/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/znxin/spiders/spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Created on Oct 22, 2015
4 |
5 | @author: v-shayi
6 | '''
7 | import scrapy
8 | from time import sleep
9 | 
10 | from selenium import webdriver
11 | from selenium.webdriver.common.action_chains import ActionChains
12 | 
13 | from scrapy.selector import Selector
14 | from znxin.items import ZnxinItem
15 | 
16 | # Landing page that PhantomJS loads once Scrapy has fetched the start URL.
17 | indexURL = 'http://hqb.nxin.com/index.shtml'
18 | 
19 | 
20 | class SpiderSpider(scrapy.Spider):
21 |     """Log in via PhantomJS, then capture each area/district page."""
22 | name = 'testspider'
23 | start_urls = ['http://z.nxin.com/Home/Index']
24 |
25 | def parse(self, response):
26 | self.logger.info('response url: ' + response.url)
27 |
28 | self.logger.info('starting phantomjs')
29 | dr = webdriver.PhantomJS('C:\\Users\\v-shayi\\Software\\phantomjs\\phantomjs-2.0.0-windows\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe')
30 |
31 |         # indexURL is only read here, so no 'global' declaration is needed
32 |         self.logger.info('retrieving index URL')
33 |         dr.get(indexURL)
34 | sleep(10)
35 | self.logger.info('driver url: ' + dr.current_url)
36 |         # Fill in and submit the login form (credentials are hard-coded below)
37 |         element = dr.find_element_by_xpath('//form[@id="loginAlert"]')
38 | dr.find_element_by_xpath('//input[@id="username"]').send_keys('yswhynot')
39 | dr.find_element_by_xpath('//input[@id="loginPassword"]').send_keys('hello123')
40 | element.submit()
41 | sleep(12)
42 | self.logger.info('driver url after login: ' + dr.current_url)
43 |
44 | next_element = dr.find_element_by_xpath('//a[@class="right red_d"]')
45 | next_element.click()
46 | sleep(10)
47 | self.logger.info('driver url after click: ' + dr.current_url)
48 |         # The click opened a second window; point the driver at it
49 |         handles = dr.window_handles
50 | dr.switch_to_window(handles[1])
51 | self.logger.info('driver url after switching tab: ' + dr.current_url)
52 |
53 | item = ZnxinItem()
54 | item['name'] = dr.title
55 | item['html'] = dr.page_source
56 | yield item
57 |
58 |         # Parse the rendered page once with Scrapy's Selector to enumerate
59 |         # every (area index, district index) pair in the region picker
60 |         sel = Selector(text=dr.page_source)
61 |         areas = sel.xpath('//div[@id="so_baojia_list"]/div')
62 |         region_elements = []
63 |         for i, area in enumerate(areas):
64 |             districts = area.xpath('.//dd/a')
65 |             for j in range(len(districts)):
66 |                 region_elements.append((i, j))
67 | 
68 |         # Hover over each area to reveal its district links, then click each
69 |         for region_element in region_elements:
70 | areas = dr.find_elements_by_xpath('//div[@id="area"]/ul/li')
71 | hov = ActionChains(dr).move_to_element(areas[region_element[0]])
72 | hov.perform()
73 |
74 | districts = dr.find_elements_by_xpath('//div[@id="so_baojia_list"]/div[contains(@style, "display")]//dd/a')
75 | sleep(0.1)
76 | districts[region_element[1]].click()
77 | sleep(8)
78 | self.logger.info('current title: ' + dr.title)
79 |
80 | item = ZnxinItem()
81 | item['name'] = dr.title
82 | item['html'] = dr.page_source
83 | yield item
84 |
85 |
86 | dr.quit()
87 | # areas = dr.find_elements_by_xpath('//div[@id="so_baojia_list"]/div[class="layer"]')
88 | # for area in areas:
89 | # js = 'arguments[0].style.height="auto"; arguments[0].style.display="block";'
90 | # dr.execute_script(js, area)
91 | #
92 | # area_lists = dr.find_elements_by_xpath('//div[@id="area"]/ul/li')
93 | # for area_list in area_lists:
94 | #         js = 'arguments[0].className = "active";'
95 | # dr.execute_script(js, area_list)
96 | #
97 | # districts = dr.find_elements_by_xpath('//dd/a')
98 | # for district in districts:
99 | # district.click()
100 | # print 'current title: ' + dr.title
101 | # item = ZnxinItem()
102 |
103 |
--------------------------------------------------------------------------------
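
A note on spider.py: the spider paces itself with fixed `sleep()` calls, which waste time when the site is fast and break when it is slow. Selenium's explicit waits are a more robust pattern (a sketch using `selenium.webdriver.support`; the 15-second timeout and the helper name are my choices):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_xpath(driver, xpath, timeout=15):
    # Poll until the element is present in the DOM instead of sleeping
    # blindly; raises TimeoutException if it never appears.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath)))

# e.g. instead of "dr.get(indexURL); sleep(10)":
#     dr.get(indexURL)
#     wait_for_xpath(dr, '//form[@id="loginAlert"]')
```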