├── .project
├── README.md
├── scrapy.cfg
└── znxin
├── __init__.py
├── items.py
├── main.py
├── output.xml
├── pipelines.py
├── settings.py
└── spiders
├── __init__.py
└── spider.py
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 |     <name>znxin</name>
 4 |     <comment></comment>
 5 |     <projects>
 6 |     </projects>
 7 |     <buildSpec>
 8 |         <buildCommand>
 9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy with PhantomJS+Selenium
2 | Simple spider implemented with Scrapy, Selenium and PhantomJS.
3 | It demonstrates login, dynamic content loading, mouse movement and clicking, and window handling.
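4 | 
5 | ## Running
6 | 
7 | Assuming Scrapy 1.x, Selenium 2.x and PhantomJS are installed (the PhantomJS
8 | binary path is hard-coded in `znxin/spiders/spider.py`), start the crawl from
9 | the project root with:
10 | 
11 | ```
12 | scrapy crawl testspider
13 | ```
14 | 
15 | Running `znxin/main.py` issues the same command through `scrapy.cmdline`.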
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = znxin.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = znxin
12 |
--------------------------------------------------------------------------------
/znxin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yswhynot/scrapy_phantomjs/cec3d99e552ae5e11058ad744e4be535fee7b33a/znxin/__init__.py
--------------------------------------------------------------------------------
/znxin/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class ZnxinItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | name = scrapy.Field()
14 | html = scrapy.Field()
15 |
16 |
--------------------------------------------------------------------------------
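
A note on items.py: every page the spider captures becomes one `ZnxinItem`, with the page title in `name` and the rendered source in `html`; `ZnxinPipeline` reads both fields when writing files. Fields behave like dictionary keys (the values below are made up):

```python
from znxin.items import ZnxinItem

item = ZnxinItem(name=u'example title', html=u'<html>...</html>')
print(item['name'])    # -> example title
# item['other'] = 1    # undeclared fields raise KeyError by design
```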
/znxin/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 23, 2015
3 |
4 | @author: v-shayi
5 | '''
6 | import scrapy.cmdline
7 |
8 | def main():
9 | # scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'testspider', '-o', 'output.xml', '-t', 'xml'])
10 | scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'testspider'])
11 |
12 | if __name__ == '__main__':
13 | main()
--------------------------------------------------------------------------------
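
A note on main.py: `scrapy.cmdline.execute` hands control to Scrapy's command line and calls `sys.exit` when the crawl ends, so nothing placed after it would run. An equivalent in-process entry point uses Scrapy's public `CrawlerProcess` API (a minimal sketch, assuming Scrapy >= 1.0):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from znxin.spiders.spider import SpiderSpider


def main():
    # Load settings.py (pipeline, AutoThrottle, user agent) and run the
    # spider in this process; start() blocks until the crawl finishes.
    process = CrawlerProcess(get_project_settings())
    process.crawl(SpiderSpider)
    process.start()


if __name__ == '__main__':
    main()
```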
/znxin/output.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/znxin/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import datetime
8 | import os
9 |
10 | class ZnxinPipeline(object):
11 | 
12 |     def __init__(self):
13 |         # One sub-directory per crawl day, e.g. /downloads/data/zhulianwang/2015-10-23
14 |         self.directory = '/downloads/data/zhulianwang/%s' % datetime.date.today()
15 |         if not os.path.exists(self.directory):
16 |             os.makedirs(self.directory)
17 | 
18 |     def process_item(self, item, spider):
19 |         # Write the captured page to <directory>/<page title>.html
20 |         path = os.path.join(self.directory, '%s.html' % item['name'])
21 |         with open(path, 'wb') as f:
22 |             f.write(item['html'].encode('utf-8'))
23 |         return item
--------------------------------------------------------------------------------
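
A note on pipelines.py: `process_item` uses the page title verbatim as a file name, so a title containing `/`, `:` or other reserved characters would make `open` fail. A defensive variant of the name handling (a sketch; the whitelist rule is my assumption, not part of the original pipeline):

```python
import re


def safe_filename(title, max_len=100):
    # Keep word characters (incl. CJK under re.UNICODE), dashes and spaces;
    # replace everything else and cap the length.
    cleaned = re.sub(r'[^\w\- ]', '_', title, flags=re.UNICODE)
    return cleaned.strip()[:max_len] or 'untitled'

# inside ZnxinPipeline.process_item:
#     path = os.path.join(self.directory, '%s.html' % safe_filename(item['name']))
```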
/znxin/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for znxin project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'znxin'
13 |
14 | SPIDER_MODULES = ['znxin.spiders']
15 | NEWSPIDER_MODULE = 'znxin.spiders'
16 |
17 | USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)"
18 | AUTOTHROTTLE_ENABLED = True
19 |
20 |
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'znxin (+http://www.yourdomain.com)'
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS=32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY=3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
33 | #CONCURRENT_REQUESTS_PER_IP=16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED=False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED=False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'znxin.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'znxin.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'znxin.pipelines.ZnxinPipeline': 100,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
74 | #AUTOTHROTTLE_ENABLED=True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY=5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY=60
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG=False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED=True
85 | #HTTPCACHE_EXPIRATION_SECS=0
86 | #HTTPCACHE_DIR='httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES=[]
88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
--------------------------------------------------------------------------------
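
A note on settings.py: `AUTOTHROTTLE_ENABLED = True` near the top makes Scrapy adapt its download delay to server latency; since this spider does most of its waiting inside Selenium, the throttle mainly paces the initial request. The commented `AUTOTHROTTLE_*` lines above can be enabled to tune it, e.g. (illustrative values):

```python
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5   # initial download delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 60    # ceiling applied under high latency
AUTOTHROTTLE_DEBUG = True      # log the delay chosen for each response
```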
/znxin/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/znxin/spiders/spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Created on Oct 22, 2015
4 |
5 | @author: v-shayi
6 | '''
7 | import scrapy
8 | from time import sleep
9 | 
10 | from selenium import webdriver
11 | from selenium.webdriver.common.action_chains import ActionChains
12 | 
13 | from scrapy.selector import Selector
14 | from znxin.items import ZnxinItem
15 | 
16 | # Landing page that PhantomJS loads once Scrapy has fetched the start URL.
17 | indexURL = 'http://hqb.nxin.com/index.shtml'
18 | 
19 | 
20 | class SpiderSpider(scrapy.Spider):
21 |     """Log in via PhantomJS, then capture each area/district page."""
22 | name = 'testspider'
23 | start_urls = ['http://z.nxin.com/Home/Index']
24 |
25 | def parse(self, response):
26 | self.logger.info('response url: ' + response.url)
27 |
28 | self.logger.info('starting phantomjs')
29 | dr = webdriver.PhantomJS('C:\\Users\\v-shayi\\Software\\phantomjs\\phantomjs-2.0.0-windows\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe')
30 |
31 |         # indexURL is only read here, so no 'global' declaration is needed
32 |         self.logger.info('retrieving index URL')
33 |         dr.get(indexURL)
34 | sleep(10)
35 | self.logger.info('driver url: ' + dr.current_url)
36 |         # Fill in and submit the login form (credentials are hard-coded below)
37 |         element = dr.find_element_by_xpath('//form[@id="loginAlert"]')
38 | dr.find_element_by_xpath('//input[@id="username"]').send_keys('yswhynot')
39 | dr.find_element_by_xpath('//input[@id="loginPassword"]').send_keys('hello123')
40 | element.submit()
41 | sleep(12)
42 | self.logger.info('driver url after login: ' + dr.current_url)
43 |
44 | next_element = dr.find_element_by_xpath('//a[@class="right red_d"]')
45 | next_element.click()
46 | sleep(10)
47 | self.logger.info('driver url after click: ' + dr.current_url)
48 |         # The click opened a second window; point the driver at it
49 |         handles = dr.window_handles
50 | dr.switch_to_window(handles[1])
51 | self.logger.info('driver url after switching tab: ' + dr.current_url)
52 |
53 | item = ZnxinItem()
54 | item['name'] = dr.title
55 | item['html'] = dr.page_source
56 | yield item
57 |
58 |         # Parse the rendered page once with Scrapy's Selector to enumerate
59 |         # every (area index, district index) pair in the region picker
60 |         sel = Selector(text=dr.page_source)
61 |         areas = sel.xpath('//div[@id="so_baojia_list"]/div')
62 |         region_elements = []
63 |         for i, area in enumerate(areas):
64 |             districts = area.xpath('.//dd/a')
65 |             for j in range(len(districts)):
66 |                 region_elements.append((i, j))
67 | 
68 |         # Hover over each area to reveal its district links, then click each
69 |         for region_element in region_elements:
70 | areas = dr.find_elements_by_xpath('//div[@id="area"]/ul/li')
71 | hov = ActionChains(dr).move_to_element(areas[region_element[0]])
72 | hov.perform()
73 |
74 | districts = dr.find_elements_by_xpath('//div[@id="so_baojia_list"]/div[contains(@style, "display")]//dd/a')
75 | sleep(0.1)
76 | districts[region_element[1]].click()
77 | sleep(8)
78 | self.logger.info('current title: ' + dr.title)
79 |
80 | item = ZnxinItem()
81 | item['name'] = dr.title
82 | item['html'] = dr.page_source
83 | yield item
84 |
85 |
86 | dr.quit()
87 | # areas = dr.find_elements_by_xpath('//div[@id="so_baojia_list"]/div[class="layer"]')
88 | # for area in areas:
89 | # js = 'arguments[0].style.height="auto"; arguments[0].style.display="block";'
90 | # dr.execute_script(js, area)
91 | #
92 | # area_lists = dr.find_elements_by_xpath('//div[@id="area"]/ul/li')
93 | # for area_list in area_lists:
94 | #         js = 'arguments[0].className = "active";'
95 | # dr.execute_script(js, area_list)
96 | #
97 | # districts = dr.find_elements_by_xpath('//dd/a')
98 | # for district in districts:
99 | # district.click()
100 | # print 'current title: ' + dr.title
101 | # item = ZnxinItem()
102 |
103 |
--------------------------------------------------------------------------------
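
A note on spider.py: the spider paces itself with fixed `sleep()` calls, which waste time when the site is fast and break when it is slow. Selenium's explicit waits are a more robust pattern (a sketch using `selenium.webdriver.support`; the 15-second timeout and the helper name are my choices):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_xpath(driver, xpath, timeout=15):
    # Poll until the element is present in the DOM instead of sleeping
    # blindly; raises TimeoutException if it never appears.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath)))

# e.g. instead of "dr.get(indexURL); sleep(10)":
#     dr.get(indexURL)
#     wait_for_xpath(dr, '//form[@id="loginAlert"]')
```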