├── README.md ├── SJTUbbsSpiderDemo ├── bbs │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── items.pyc │ ├── pipelines.py │ ├── pipelines.pyc │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── forumSpider.py │ │ └── forumSpider.pyc ├── dataProcess.py ├── dataProcess.pyc └── scrapy.cfg └── gouwu.sogou.com ├── etao ├── __init__.py ├── __init__.pyc ├── items.py ├── items.pyc ├── lstData.py ├── lstData.pyc ├── pipelines.py ├── pipelines.pyc ├── settings.py ├── settings.pyc └── spiders │ ├── __init__.py │ ├── __init__.pyc │ ├── spider.py │ └── spider.pyc ├── scrapy.cfg └── webData.xml /README.md: -------------------------------------------------------------------------------- 1 | # spiderDemo 2 | Crawlers for several Websites. 3 | -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/bbs/__init__.py -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/bbs/__init__.pyc -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item, Field 9 | 10 | 11 | class BbsItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | url = Field() 15 | forum = Field() 16 | poster = Field() 17 | content = Field() 18 | # postid = Field() 19 | # poster = Field() 20 | # content = Field() 21 | # num = Field() 22 | # date = Field() 23 | 24 | -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/bbs/items.pyc -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy import signals 9 | from scrapy import log 10 | from bbs.items import BbsItem 11 | from twisted.enterprise import adbapi 12 | from scrapy.contrib.exporter import XmlItemExporter 13 | from dataProcess import dataProcess 14 | 15 | 16 | class XmlWritePipeline(object): 17 | 18 | def __init__(self): 19 | pass 20 | 21 | @classmethod 22 | def from_crawler(cls, crawler): 23 | pipeline = cls() 24 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 25 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 26 | return pipeline 27 | 28 | def spider_opened(self, spider): 29 | self.file = open('bbsData.xml', 'wb') 30 | self.expoter = XmlItemExporter(self.file) 31 | self.expoter.start_exporting() 32 | 33 | def spider_closed(self, spider): 34 | self.expoter.finish_exporting() 35 | self.file.close() 36 | 37 | # process the crawled data, define and call dataProcess function 38 | # dataProcess('bbsData.xml', 'text.txt') 39 | 40 | def process_item(self, item, spider): 41 | self.expoter.export_item(item) 42 | return item 43 | -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/bbs/pipelines.pyc -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for bbs project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'bbs' 12 | 13 | CONCURRENT_REQUESTS = 200 14 | LOG_LEVEL = 'INFO' 15 | COOKIES_ENABLED = True 16 | RETRY_ENABLED = True 17 | 18 | 19 | SPIDER_MODULES = ['bbs.spiders'] 20 | NEWSPIDER_MODULE = 'bbs.spiders' 21 | 22 | # JOBDIR = 'jobdir' 23 | ITEM_PIPELINES = { 24 | 'bbs.pipelines.XmlWritePipeline': 1000, 25 | } 26 | 27 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 28 | #USER_AGENT = 'bbs (+http://www.yourdomain.com)' 29 | -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/bbs/settings.pyc -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/bbs/spiders/__init__.pyc -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/spiders/forumSpider.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | ''' 3 | bbsSpider, Created on Oct, 2014 4 | #version: 5 | #author: chenqx @http://chenqx.github.com 6 | See more: http://doc.scrapy.org/en/latest/index.html 7 | ''' 8 | 9 | from scrapy.selector import Selector 10 | from scrapy.http import Request 11 | from scrapy.contrib.spiders import CrawlSpider 12 | from scrapy.contrib.loader import ItemLoader 13 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 14 | from bbs.items import BbsItem 15 | 16 | class forumSpider(CrawlSpider): 17 | 18 | # name of spiders 19 | name = 'bbsSpider' 20 | allow_domain = ['bbs.sjtu.edu.cn'] 21 | start_urls = [ 'https://bbs.sjtu.edu.cn/bbsall' ] 22 | link_extractor = { 23 | 24 | 'page': SgmlLinkExtractor(allow = '/bbsdoc,board,\w+\.html$'), 25 | 26 | 'page_down': SgmlLinkExtractor(allow = '/bbsdoc,board,\w+,page,\d+\.html$'), 27 | 28 | 'content': SgmlLinkExtractor(allow = '/bbscon,board,\w+,file,M\.\d+\.A\.html$'), 29 | 30 | 31 | 32 | } 33 | 34 | _x_query = { 35 | 36 | 'page_content': '//pre/text()[2]', 37 | 38 | 'poster' : '//pre/a/text()', 39 | 40 | 'forum' : '//center/text()[2]', 41 | 42 | } 43 | 44 | def parse(self, response): 45 | for link in self.link_extractor['page'].extract_links(response): 46 | yield Request(url = link.url, callback=self.parse_page) 47 | 48 | def parse_page(self, response): 49 | 50 | for link in self.link_extractor['page_down'].extract_links(response): 51 | yield Request(url = link.url, callback=self.parse_page) 52 | 53 | for link in self.link_extractor['content'].extract_links(response): 54 | yield Request(url = link.url, callback=self.parse_content) 55 | 56 | 57 | def parse_content(self, response): 58 | 59 | bbsItem_loader = ItemLoader(item=BbsItem(), response = response) 60 | url = str(response.url) 61 | bbsItem_loader.add_value('url', url) 62 | bbsItem_loader.add_xpath('forum', self._x_query['forum']) 63 | bbsItem_loader.add_xpath('poster', self._x_query['poster']) 64 | bbsItem_loader.add_xpath('content', self._x_query['page_content']) 65 | 66 | return bbsItem_loader.load_item() -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/bbs/spiders/forumSpider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/bbs/spiders/forumSpider.pyc -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/dataProcess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | data process after crawling, Created on Oct, 2014 4 | #version: 5 | #author: chenqx 6 | See more: http://chenqx.github.com 7 | ''' 8 | 9 | 10 | from lxml import etree 11 | from ConfigParser import ConfigParser 12 | 13 | class dataProcess: 14 | def __init__(self, source_filename, target_filename): 15 | # load stop words into the memory. 16 | fin = open(source_filename, 'r') 17 | 18 | read = fin.read() 19 | 20 | output = open(target_filename, 'w') 21 | output.write(read) 22 | 23 | fin.close() 24 | output.close() 25 | 26 | 27 | 28 | ##if __name__ == '__main__': 29 | ## 30 | ## dataProcess('D:/Python27/src/bbs/bbsData.xml', 'D:/Python27/src/bbs/text.txt') 31 | -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/dataProcess.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/SJTUbbsSpiderDemo/dataProcess.pyc -------------------------------------------------------------------------------- /SJTUbbsSpiderDemo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = bbs.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bbs 12 | -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/__init__.py -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/__init__.pyc -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.item import Item, Field 10 | 11 | 12 | class EtaoItem(scrapy.Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | url = Field() 16 | title = Field() 17 | name = Field() 18 | price = Field() 19 | -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/items.pyc -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/lstData.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | class lstData: 4 | lst = [u' Apple iPad 2 16GB, Wi-Fi, 9.7in - Black (MC769LL/A) (2BR)', u' GARMIN FORERUNNER 310XT RUNNING GPS w/ HEART RATE MONITOR HRM 010-00741-01', u'*NEW* Nikon D4s Digital SLR Camera Body + 3 Year USA Warranty', u'*NEW* Nikon D810 Digital SLR Camera Body + 3 Year USA Warranty', u'1.00CT Five Stone Diamond Ring 14K White Gold', u'1/3 Ct Round Cut 14K White Gold Diamond Stud Earrings', u'10" Cool Memory Foam Mattress', u'3D Robotics IRIS+ Ready To Fly RC Drone Quadcopter 915 MHz w/ 3PV Autopilot 3DR', u'5\xc3\x97 E27 Color Changing 3W RGB LED Light Bulb Lamp 85-265V + IR Remote Control New', u'6 x Mini Cordless 3-LED Touch Light Batteries Powered Stick Tap Lamp Black New', u'7 Colors 7" Capacitive Screen Android 4.4 4GB Tablet PC 1.5GHz Cortex A7 Cameras', u'7" Google Android Tablet PC w/ Dual Core 8GB Cameras WiFi Multi-Touch - Vuru NEW', u'7" Tablet Android 4.2 Dual Cam 1.2Ghz WiFi - Assorted Colors - Bonus Items', u'Acer AZ3-105-UR Quad Core 500GB 23" 1080p LED Touchscreen Windows 8.1 All in One', u'Alpine Swiss Leather Twofold Front Pocket Wallet Money Clip ID Window Flip Fold', u'Alpine Swiss Mens Ankle Boots Dressy Casual Leather Lined Dress Shoes Lace up NW', u'Alpine Swiss Mens Leather Dress Shoes Dressy Slip on Loafers Good For Suit Jeans', u'Alpine Swiss Mens Leather Wallets Money Clips Card Cases Bifolds Trifolds FPWs', u'Alpine Swiss Mens Trifold Wallet Genuine Leather Card Case ID Window Billfold NW', u"Alpine Swiss Men's Wallet Genuine Lambskin Leather New Slimline Flip-Out Bifold", u"Alpine Swiss Money Clip Thin Front Pocket Wallet Genuine Top Grain Leather Men's", u'Alpine Swiss Money Clip Thin Wallet Real Leather Card Case Holds Up to 15 Bills', u'Alpine Swiss Womens Ballet Flats Comfort Buckle Loafers Leather Lined Pointy Toe', u'AlpineSwiss Leather Card Case Wallet Slim Super Thin 5 Card Slots Front Pocket', u'Apple iPhone 4 16GB Verizon Wireless WiFi Black and White Smartphone', u'Apple iPhone 4 16GB Verizon Wireless WiFi Black Smartphone', u'Apple iPhone 4 8GB WiFi Verizon Wireless Black Smartphone', u'Apple MacBook Pro 13.3" Laptop with Retina Display', u'ASUS VivoBook ASQ301LA-BSI5T17 - Intel i5, 6GB, 500GB, 13.3" Touchscreen Laptop', u'Beats Beatbox Portable Speaker (Black)', u'Beats by Dr. Dre Solo HD Over-Ear Headphone with Plush Ear Cushions', u'Beats By Dre Solo HD Compact Folding On Ear Headphones - Drench In Blue', u'Beats By Dre Studio High Definition Noise Canceling Over Ear Monster Headphones', u'Black Rivet Cable Knit Beanie w/ Beads', u'Black Rivet Knit Open Work Beret w/ Sparkle', u'Black Rivet Knit Pointelle Beanie', u'Black Rivet Pompom Multi Yarn Hat', u'Black Rivet Reversible Cuff Beanie', u'Black Rivet Textured Knit Beanie', u'Bose Wave\xc2\xae Music System III', u"Bulova Dress Women's Quartz Watch 98L160", u'Canon EF 50mm F/1.8 II Standard Auto Focus Lens BRAND NEW', u'Canon EF-S 18-135mm 18-135 f/3.5-5.6 IS Lens Brand New USA + UV FILTER INCLUDED', u'Canon EOS 1D X Digital SLR Camera (Body Only). Canon 1Dx Full Frame DSLR. NEW', u'Canon EOS 1DX Digital SLR Camera # 5253B002 1D-X Body Only * NEW *', u'Canon EOS 5D Mark III Digital SLR Camera Body (MK 3 DSLR) *NEW* + *3yr Warranty*', u'Canon EOS 5D Mark III Digital SLR Camera Body Only *NEW*', u'Canon EOS 60D 18MP CMOS Digital SLR Camera Body', u'Canon EOS 6D 20.2 MP DSLR Camera (Body Only) ', u'Canon EOS 6D Camera w/ 24-105mm f/4.0L IS USM Lens + 32GB, 2 Batteries + More!', u'Canon EOS 6D Digital SLR Camera (Body)', u'Canon EOS 6D Digital SLR Camera Body', u'Canon EOS 70D SLR Camera + 4 Lens Kit 18-55 STM +75-300 mm + 24GB TOP VALUE KIT!', u'Canon EOS 7D Digital SLR Camera w/EF-S 18-135mm f/3.5-5.6 IS Lens', u'Canon EOS Rebel D700 T5i 18.0 MP Digital SLR Camera - Black (Body) - NEW', u'Canon EOS Rebel T3 1100D Body + 3 Lens Kit 18-55mm IS + 24GB DSLR Top Value Kit', u'Canon EOS Rebel T3i 600D Body + 3 Lens Kit 18-55mm IS + 24GB DSLR Top Value Kit', u'Canon EOS Rebel T3I 600D Body + 4 Lens Kit 18-55 IS +75-300 +16GB Flash & More', u'Canon Zoom Telephoto EF 75-300mm f/4.0-5.6 III Lens', u'Capacitor G1W-C Car Dash Camera DVR NT96650 Chip AR0330 Lens W/Free Mirro Mount', u'Changhong 42" 1080p LED HDTV - LED42YC2000UA', u'Chrome Bakers Rack with Cutting Board and Storage', u"Coach Tristee Sig C 34A8444 Women's Rubber Rainboots Boots", u'CrossOver 290M LG-IPS LED 2560x1080 QHD DVI-D 21:9 cineview 29" Computer Monitor', u'DELL DUAL CORE 2 FAST GHZ DESKTOP COMPUTER PC 4 GB RAM 1 TB HDD WINDOWS 7 64', u'DELL DUAL CORE 2 FAST GHZ DESKTOP COMPUTER PC 4GB RAM, 80GB HDD, WINDOWS 7 PRO', u'Deluxe Credit Case Case Wallet Secretary Leather Alpine Swiss 23 Card Slots NEW', u'Denon S-302 DVD & Home Theater Entertainment System 1080P HDMI WiFi Bonus Stands', u'Dog Training Collar Vibrating Shock Remote Rechargeable and Waterproof', u'Energy by Klipsch Power Bar Elite Sound Bar w/ Wireless Subwoofer', u'Expandable Business Card Case Leather Wallet Thin Slim Money Clip Alternative NW', u'Ferro Aldo Mens Dress Shoes Leather Slip on Loafers Comfort Fit Padded Insoles', u'Fingerless Work Out Gloves Durable Leather Mens Womens Unisex For Driving, Bike', u'Full HD 1080P G1W 2.7" Car Dash DVR Camera Video CAM Recorder H.264 Night Vision', u'Funlux\xc2\xae 4CH NVR Outdoor Network 720P HD IP PoE Home Security Camera System 500G', u'Funlux\xc2\xae 8CH HDMI 960H P2P DVR Outdoor Surveillance Security Camera System 500GB', u'GARMIN FORERUNNER 610 GPS FITNESS SPORTS WATCH BUNDLE W/ HRM/USB (010-00947-10)', u'GBX Mens Casual Loafers Slip On Double Gore Moc Toe Boat Shoes Comfort Moccasins', u'GBX Mens Suede Boots Ankle High Pull On Chelsea Leather Casual Comfort Shoes New', u'Generac GP7500E GP Series 7,500 Watt Portable Generator 5943 NEW', u"Genuine Apple EarPods w' Remote & Mic For iPhone iPod Headphones Headset NEW OEM", u'Genuine Full Grain Leather NFL Bifold Wallet & Tin Gift Box 2 ID Window 12 Cards', u'GoPro Hero3 White Edition Camcorder Camera + Battery + Head Strap + 32GB Top Kit', u'GoPro Hero3+ Plus Black Edition HD Camcorder Camera + 2 Battery + 64GB Top Kit', u'Hammer Anvil Mens Genuine Leather Front Pocket Wallet Thin Bifold Slim Billfold', u"Hammer Anvil Money Clip Thin Front Pocket Compact Genuine Leather Men's Wallet", u"Hanes Women's Long-Sleeve T-Shirt", u'HP Split x2 Intel Dual Core i5 1.6GHz Laptop Tablet', u'Jawbone JAMBOX Wireless Bluetooth Speaker - Black Diamond', u'JBL Pulse Wireless Bluetooth Speaker with LED lights and NFC Pairing (Black)', u'Kocaso 8" Dual Camera 8GB Android 4.0 Capacitive Tablet PC w/ Carrying Case', u'KOCASO Android 4.2 7" Dual-Core 1.2GHz Dual-Cam Tablet w/ Matching Keyboard', u'KOCASO Android 4.2 9" Dual Core 1.2GHz 8GB Dual Camera Tablet PC +4Bonuses', u'KOCASO K-Mini Android 4.2 Tablet 7.9" 1.2 GHz Quad Core 8 GB Memory, Dual Camera', u'Kocaso Smart Watch Bluetooth 4.0 SIM Media Player KW300 in Gold', u'Kocaso Tablet 7" Android 4.2 1.5 GHz Dual Core Dual Camera 4GB', u'KOCASO Tablet Android 4.1 10.1" Dual Camera HDMI External 3G', u'KOCASO Tablet Android 4.1 10.1" Dual Camera HDMI Keyboard Case Bundle', u'KOCASO Tablet Android 4.1 8" Wifi Camera 4 GB 1.2Ghz PC Keyboard Bundle', u'KOCASO Tablet Android 4.1 8" Wifi Camera Capacitive 4 GB 1.2Ghz PC', u'KOCASO Tablet Android 4.2 7" Wifi 1.2 GHz Dual Camera M770 with Accessories', u'KOCASO Tablet Android 4.2 7" Wifi Dual Core Camera 1.5GHZ 8GB Accesories', u'KOCASO Tablet Android 4.2 7.9" Wifi Dual Core 1.2 GHz Camera 8GB Bluetooth M7850', u'Kocaso Tablet Android 4.2 Dual Core 1.2GHz 10.1" Dual Camera HDMI', u'Kocaso Tablet Android 4.2 Dual Core 1.2GHz 10.1" Dual Camera HDMI Carrying Case', u'Kocaso Tablet Dual Cam Android 4.1 Dual Core 1.4GHz 8GB 10.1" HDMI Carrying Case', u'Kocaso Tablet Dual Camera Android 4.1 Dual Core 1.4GHz 8GB 10.1" Bluetooth HDMI', u'KOCASO Tablet PC Dual Camera Google\xc2\u0161Android 4.2 9" Dual Core 1.5GHz\xc2\u01618GB', u'Large Organizer Wallet Checkbook Clutch Bag Womens Pocketbook Purse Alpine Swiss', u'Leather Card Case Front Pocket Wallet ID Window Slim Thin Mini By Alpine Swiss', u'Leather Fanny Pack Waist Bag Adjustable 6 Pockets Adjustable strap up to 52" NEW', u'Leather Money Clip Magnet Slim Thin Front Pocket Wallet Alpine Swiss ID & Cards', u'Leather Travel Wallet Passport Airline Ticket Case Zippered Checkbook Wallet New', u'LG 8,000 BTU Portable Air Conditioner/24-Hour On-Off Timer/Remote- LP0813WNR', u'LG Optimus G Pro E980 - 32GB - Black or White FACTORY UNLOCKED Smartphone (A)', u'LG VS950 Intuition Verizon Wireless Cell Phone', u'LG VS980 G2 Verizon Wireless 4G LTE Android 32 GB Black Smartphone', u'Logitech Protective Water Repellent Folio for iPad Mini', u'Lot2 New Professional 2nd Classic Remote Controller for Nintendo Wii Red', u'Mens Blucher Oxfords Genuine Leather Lace up Dress Shoes by Giorgio Brutini BLK', u'Mens Chelsea Boots Dressy Ankle Stretch Gore Side Zipper Genuine Leather Cap Toe', u'Mens Dress Shoes Genuine Leather Buckle Loafers Slip On by Giorgio Brutini Black', u'Mens Dress Shoes Lace up Oxfords Wingtip Leather Lined Faux Ostrich Skin Upper', u'Mens Dressy Leather Gloves Velcro Wrist Strap Warm Thermal Lining Insulated 40Gr', u'Mens Dressy Leather Gloves w Touch Screen Texting Smart Phone Tablet Compatible', u'Mens Lace Up Oxfords Dress Shoes Genuine Leather Moc Toe Giorgio Brutini Padded', u'Mens Leather Belt Dressy or Casual Black & Brown Gunmetal Buckle Sizes Available', u'Mens Leather Card Organizer Wallet Hammer Anvil Bifold Hipster 10 card Slots New', u'Mens Leather Oxfords Dress Shoes Dressy Lace up By Alpine Swiss Baseball Stitchd', u'Mens Leather Wallet Bifold Removable ID Card Case By Alpine Swiss Superb Quality', u"Men's Leather Wallet By Alpine Swiss Multi Card High Capacity Compact Bifold New", u"Men's Leather Wallet By Hammer Anvil Multi Card High Capacity Compact Bifold New", u'Mens Leather Wallet Spring Money Clip Z Shape Trifold Bifold 11 Card Slots 2 IDs', u'Mens Leather Wallet Zipper Coin Purse 6 Card Slots 3 More Pockets 2 Bill Section', u'Mens Leather Wallets Ostrich Snakeskin Print Trifold Bifold Zip-Around to Choose', u'Mens Military Field Boot Combat Shoes Rugged Lug Sole Canvas & Faux Leather Trim', u'Mens Slim Wallet Bifold Alpine Swiss Billfold Thin Front Pocket Wallet Leather', u'Mens Trifold Wallet Extra Capacity 10 Inside Slots 2 ID Windows By Alpine Swiss', u"Men's Wallet Alpine Swiss Genuine Soft Lambskin Leather Removable 2 ID Card Case", u'Mens Wallet Money Clip Spring Leather Front Pocket Wallet Spring Clip 9 Cards NW', u"Men's Wallets Alpine Swiss Genuine Leather Money Clips Bifold Trifold Card Case", u'Mens Wing Tip Ankle Boots Dressy Lace up Leather Derby Shoe Brogue Medallion Toe', u'Mens Wing Tip Dress Shoes Lace Up Oxfords TwoTone Brogue Medallion Leather Lined', u"Men's Wing Tip Dress Shoes Slip On Oxford Loafers Brogue Medallion Leather Lined", u'Mens Wingtip Oxfords Real Leather Casual & Dress Fashionable Trendy Look by GBX', u'Michael Kors Jet Set Zip Continental Wallet', u'Microsoft XBox One Kinect Black Console Bundle 500GB Hard Drive Blu-Ray 8GB RAM', u'Mohu Leaf Ultimate Amplified Indoor HDTV Antenna', u'Monster ClarityHD Micro Bluetooth Speaker Black - Refurbished', u'Motorola Droid Razr Maxx XT912M Black 16GB Verizon & Page Plus 4G Smartphone', u'Motorola XT912 Droid Razr 16GB Verizon Wireless WiFi Android 8.0 MP Cell Phone', u'Motorola XT926 Droid Razr HD 16GB Verizon Wireless 4G LTE Android Smartphone', u'Netgear NeoTV Prime Media Player - 1080p, Built-in WiFi, HDMI, Google TV Include', u'NETGEAR Wireless Router - AC1750 Dual Band Gigabit WiFi (R6300)', u"New Balance W770v4 - Women's Running Shoe, Stabilizing", u'New Canon EOS Rebel 700D T5i Digital SLR Camera +18-55 STM 75-300 Lens +16GB Kit', u'NEW Canon EOS Rebel T3i SLR Camera + 4 Lens Kit 18-55 IS 75-300 + 24GB Full Kit', u'New DJI Phantom 2 Vision+ Plus RC Quadcopter Drone w/ FPV HD Cam + Extra Battery', u'New Garmin Alpha 100 GPS Dog Track & Train Bundle w/ TT-10 Collar 010-01041-00', u'New Leather Clutch Bag Travel Case Mens Wallet Organizer Purse by Alpine Swiss', u'New Leather Mens Money Clip Spring Clip Front Pocket Wallet by Alpine Swiss Thin', u'NEW Nikon D3200 Digital SLR Camera + 4 Lens Kit 18-55mm + 70-300 + 24GB TOP KIT', u'NEW Nikon D5300 Digital SLR Camera +8 Lens 18-55mm VR 70-300 + 32GB Complete Kit', u'NEW Nikon D7000 Digital SLR Camera w 4 Lens Complete DSLR Kit 24GB TOP VALUE!', u'NEW Nikon D7100 Digital SLR Camera w 4 Lens Complete DSLR Kit 24GB TOP VALUE!', u'New Nikon D7100 SLR Camera + 5 Lens Kit: 18-55mm, 70-300mm, 500mm + 16GB & More!', u'New Passport Cover Travel Case Durable Soft Lambskin Leather By Alpine Swiss BLK', u'New Samsung Galaxy S II S2 Skyrocket i727 AT&T Unlocked 16GB Android Phone Black', u'NEW Samsung Galaxy S5 (G900F) - 16GB - Factory Unlocked - Smartphone ', u'NFL Slim Money Clip Genuine Full Grain Leather Thin Wallet W Football Team Logos', u'NFL Slim Trifold Wallet Genuine Leather Football Team Logo ID Window 8 Card Slot', u'Nikon D7100 Digital SLR Camera (Body Only)', u'Nikon Coolpix P600 Digital Camera - Red 16GB Package', u'Nikon D3100 Digital SLR Camera Body & 18-55mm VR Lens USA', u'Nikon D3100 Digital SLR Camera Body with 18-55mm G DX II AF-S Lens Black USA', u'Nikon D3200 Digital SLR Camera w/18-55mm Lens 16GB Full Kit', u'Nikon D3200 Digital SLR Camera w/18-55mm Lens 24GB Complete Kit', u'Nikon D3200 Digital SLR DSLR Camera + 3 Lens 18-55mm + 24GB KIT & More Brand New', u'Nikon D3200 Digital SLR DSLR Camera +3 Lens 18-55 VR +24GB KIT & More Brand New', u'Nikon D5100 Digital SLR Camera + 3 Lens: 18-55mm VR NIKKOR Lens +32GB Bundle', u'Nikon D5200 24.1 MP Digital SLR Camera - Black (Kit w/ 18-55 VR Lens)', u'Nikon D5200 Digital SLR Camera + 3 Lens Kit 18-55mm VR Lens +32GB Bundle', u'Nikon D5200 Digital SLR Camera + 4 Lens Kit: 18-55mm VR + 70-300 mm +32GB Kit', u'Nikon D5300 24.2 MP Digital SLR Camera - Black (Body only) New', u'Nikon D5300 DSLR Camera with 18-140mm Lens (Black) New', u'Nikon D610 Digital SLR Camera (Body) 16GB Package', u'Nikon D7100 Body \xe2\xa0\x93 Digital SLR DSLR D 7100 Camera Body Only (Black) *NEW* ', u'Nikon Nikkor AF-S 50 mm F/1.8G AF-D FX Lens + UV Filter & Cleaning Kit', u'Nokia Lumia 1520 - 16GB Quad Core (AT&T) Windows 8 Smartphone - Choice of Colors', u'Nokia Lumia 2520 Verizon Wireless 4G LTE WiFi 10.1" 32GB Windows Tablet', u'Nokia Lumia 521 (T-Mobile) 8GB Windows 8 Smartphone - White', u'Nokia Lumia Icon 929 - 32GB Windows 8 Smartphone - Factory Unlocked (Verizon) ', u'Official Microsoft Xbox One Special Edition Armed Forces Wireless Controller', u'Original Jawbone Jambox Bluetooth Portable Speaker- Choose Color/Mystery Color', u'Panasonic HC-X920 3MOS Ultrafine Full HD Wi-Fi Video Camera Camcorder Black USA', u'Pentax Q7 Compact Mirrorless Camera with 5-15mm f/2.8-4.5 Zoom Lens ', u'Pierina Ballet Flats Womens Loafers Shoes Genuine Leather Lined by Alpine Swiss', u'Plantronics Voyager Legend Bluetooth Headset With Text And Noise Reduction', u"Prada SPR 27N Baroque Swirl Women's Sunglasses", u'Raymarine Dragonfly Sonar Gps With Gold Charts Raymarine E70085-gld', u'Real Leather Compact Business Credit Card Wallet with 2 ID Windows 6 card slots', u'Remote Training Collar System Shock Vibrating LCD Display 100 Different Levels', u'Roku 1 Digital HD Streaming Media Player - Hulu, Netflix, Youtube, Pandora', u'Roku 3 Streaming Media Player', u'Roku HD 2500R Digital WiFi Media Streaming Player', u'Roku Streaming Stick HDMI Digital Media Streamer (3500R)', u'Rolex Explorer II Automatic Stainless Steel Mens Watch ', u'Samsung 24" S24D590PL Full HD 1080p LED PLS Widescreen Monitor', u'Samsung EVO 32GB Class 10 MicroSDHC Memory Card with USB Reader', u'Samsung Galaxy Note 3 (N900) - 32GB - Factory Unlocked - Smartphone (Brand New)', u'Samsung Galaxy S 4 IV SGH-I337-16GB- AT&T Unlocked Smartphone WHITE-BLACK-RED', u'Samsung Galaxy S 4 SGH-I337 - 16GB - Black / White / Red UNLOCKED (B)', u'Samsung Galaxy S II S2 i777 AT&T Unlocked GSM 16GB Android SmartPhone Black NEW', u'Samsung Galaxy S III SCH-I535 16GB Pebble Blue, White, Black Verizon Smartphone', u'Samsung Galaxy S III SCH-I535-16GB -Marble White Verizon Smartphone - EB10001', u'Samsung Galaxy S III SGH- I747- 16GB - Marble Blue AT&T (Unlocked) Smartphone', u'Samsung Galaxy S III SGH-I747 - 16GB - Blue / White / Red UNLOCKED (A)', u'Samsung Galaxy S III SGH-I747 - 16GB - Blue / White /Red UNLOCKED (B)', u'Samsung Galaxy S III SGH-I747 -16GB - White AT&T (Unlocked) Smartphone', u'Samsung Galaxy S III SGH-I747 16GB AT&T S3 Red White Blue (Unlocked) Smartphone', u'Samsung Galaxy Tab 3 Lite - 8GB, Wi-Fi, 7-inch Tablet - SM-T110NDWAXAR - White', u'Samsung Galaxy Tab 4 Tablet 2014 Edition 10.1\xe2\xa0\x9d ', u'Samsung Galaxy Tab Pro - 12.2", 32GB, WiFi, Android 4.4 (SM-T9000ZKAXAR) - Black', u'Samsung i535 Galaxy S3 Verizon Wireless Android WiFi 16GB 8MP Camera Cell Phone', u'Samsung i605 Galaxy Note 2 Verizon Wireless 4G LTE 16GB Android WiFi Smartphone', u'Samsung i997 Infuse 4G AT&T Unlocked GSM Android Wifi Touch 8MP Smart Phone New', u'Samsung S27D360HS 27" LED Monitor White w/ Blue ToC Finish Full 1080p ', u'SanDisk Ultra Class 10 32GB microSD micro SDHC SD UHS-I U1 TF Flash Memory Card', u'Seiki 60" Class 1080p LED HDTV - Full HD, 1920x1080 Resolution, 60Hz, 16:9, 3x H', u'SJ4000 Black 12MP HD 1080P Car Cam Sports DV Action Waterproof Camera 2x Battery', u'Slimfold Alpine Swiss Mens Bifold Wallet Soft Genuine Leather 6 Card & ID Slots', u'Sony 48" KDL-48W590B Smart LED HD TV Full HD 1080p 60 Hz WiFi Netflix Hulu Plus', u'Sony Action Cam HDR-AS100VR Wi-Fi GPS Video Camera Camcorder & Live View Remote', u'Sony NEX-5TL Wi-Fi 16.1MP HD Digital Camera 55-210mm & 16-50mm 2 lens bundle bag', u'Sony Playstation 3 Super Slim 12 GB Charcoal Black Console (NTSC)', u'Super Thin Cash Strap Money Clip Wallet Alpine Swiss Slim Card Case Real Leather', u'Tablet 9" Android 4.0 OS Capacitive Dual Camera 1.2Ghz MID PC w/ Carrying Case', u'Timberland Mens 35MM Casual Belt Boot Cut Leather Rugged Classic Jean Belt 32-42', u'Timberland Mens 35MM Casual Belt Genuine Leather Rugged Classic Jean Belt 32-42', u'Timberland Mens Belt Genuine Leather Dressy Classic Black or Brown Sizes 32 - 42', u'Timberland Mens Genuine Leather Belt Metal Buckle Classic Casual Sizes 32-42 New', u'Timberland Mens Wallet Bifold Passcase Genuine Leather 2 ID Windows Billfold NEW', u'Timberland Mens Wallet Commuter Bifold Real Leather Billfold 2 ID 10 Card Slots', u'Timberland Mens Wallet Italian Leather Passcase Flip Up Bifold ID Card Slots New', u'Timberland Mens Wallet Leather Passcase Bifold 2 ID Windows Semi Glossy Dressy', u'Timberland Slim Trifold Wallet Soft Genuine Leather Tin Gift Box 6+2 Card Slots', u'Timberland Slimfold Wallet for Men Thin Minimalist Bifold ID Cards Bills Section', u'Timberland Thin Mens Money Clip Wallet Genuine Delta Leather Efficient & Rugged', u'Toshiba 14" Touch Satellite Core i5-4200U 6GB 750GB Bklt Ky Win 8.1 |BE45T-A4100', u'Toshiba 29\xe2\xa0\x9d 29L1350U LED TV Black Television ', u'Traditional 8x11 Oriental Rug', u'Unlocked Nokia Lumia 928 Windows 8 Smartphone Verizon - Choice of Black of White', u'urBeats Beats By Dr. Dre Headphones Mic In-Ear Noise Cancellation 5 Colors', u'USA Canon EF-S 18-135mm f/3.5-5.6 IS STM - Brand New USA', u'Vans - Canvas Slip-On Unisex Shoe', u'Verizon Nokia Lumia 928 Windows 8 Smartphone Unlocked ~ Choice of Black of White', u'VIBE Sound USB Turntable/Vinyl to MP3 Audio Record Player w/Built-in Speakers', u'Vibe Tablet or Mobile Phone Universal Speaker and Docking System', u'Vizio 32" M322i-B1 Full Array LED HD TV 1080p Built-In Wi-Fi HDMI 120Hz ', u'Vizio 39" E390-A1 Flat Panel LED HD TV 1080p HDMI Black 200,000:1 Contrast Ratio', u'Vizio 50" E500i-B1 Slim Frame LED Smart HD TV 1080p 120Hz Built-in WiFi', u'Vizio E320i-B2 32" LED HDTV Smart TV VIZIO Internet Apps Plus HDMI x2', u'Vizio M471i-A2 47" Razor LED Full 1080p HD TV Smart TV 120 HZ Built-in Wifi HDMI', u'Wireless 3.5mm In-car Fm Transmitter for iPhone 6 5S Samsung Galaxy Note3 S3 S4', u'Womens Ballet Flats Ankle Strap Suede Lined Ballerina Slippers Slip On Shoes New', u'Womens Ballet Flats Ballerina Slippers Leather Lined Slip On Round Toe Shoes New', u"Women's Ballet Flats Patent Leather Ballerina Slippers Slip On Comfort Shoes New", u'Womens Boots Mid Calf 12" Australian Classic Tall Faux Shearling Sheepskin Fur', u"Women's Bow Ballet Flats Iris Round Toe Classic Shoe Real Suede Interior Loafer", u'Womens Cheetah Ballet Flat Micro Suede Shoe Faux Patent Leather Round Toe Buckle', u'Womens Comfort Boots Button Flats Slouch Australian Faux Sheepskin Shearling Fur', u'Womens Dressy Leather Gloves Rabbit Fur Trim Warm Thin Thermal Lining Insulation', u'Womens Dressy Leather Gloves Touch Screen Texting Smart Phone Tablet Compatible', u'Womens Fringe Moccasins Flat Shoes Zipper Ankle Boots Faux Suede Feather Dangle', u'Womens Leather Backpack Purse Sling Shoulder Bag Handbag 3 in 1 Convertible New', u'Womens Leather Cross Body Bag Organizer Purse Shoulder Bag Handbag Good 4 Travel', u'Womens Leather Handbag Mid Size Hobo Shoulder Bag Purse Tote W Multi Pockets New', u'Womens Leather Organizer Purse Shoulder Bag Handbag Cross Body Bag Large Clutch', u'Womens Mid Calf Boots 14\xe2\xa0\x9d Tall 4 button Faux Suede Shearling Fur Alpine Swiss', u'Womens Moccasin Loafers Boat Shoes Ballet Flats Casual Comfort Leather Insoles', u"Women's Purse Cross Body Shoulder Bag Leather Handbag Organizer Messenger Tote", u'Womens Slouch Boots Flats Heel Shoes Faux Suede Rounded Toe Mid Calf Pleated', u'Worx GT 20v Lithium Cordless Grass Trimmer ', u'xBox One Console & Call of Duty Advanced Warfare & 12 MONTH Card', u'X-STAR DP2710LED PLS Panel WQHD 2560x1440 27inch Computer Monitor'] 5 | -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/lstData.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/lstData.pyc -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | from scrapy import signals 10 | from scrapy import log 11 | from etao.items import EtaoItem 12 | from twisted.enterprise import adbapi 13 | from scrapy.contrib.exporter import XmlItemExporter 14 | #from dataProcess import dataProcess 15 | class XmlWritePipeline(object): 16 | def __init__(self): 17 | pass 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | pipeline = cls() 21 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 22 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 23 | return pipeline 24 | def spider_opened(self, spider): 25 | self.file = open('webData.xml', 'wb') 26 | self.expoter = XmlItemExporter(self.file) 27 | self.expoter.start_exporting() 28 | def spider_closed(self, spider): 29 | self.expoter.finish_exporting() 30 | self.file.close() 31 | # process the crawled data, define and call dataProcess function 32 | # dataProcess('bbsData.xml', 'text.txt') 33 | def process_item(self, item, spider): 34 | self.expoter.export_item(item) 35 | return item -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/pipelines.pyc -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for etao project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'etao' 12 | 13 | 14 | CONCURRENT_REQUESTS = 200 15 | LOG_LEVEL = 'INFO' 16 | COOKIES_ENABLED = True 17 | RETRY_ENABLED = True 18 | 19 | SPIDER_MODULES = ['etao.spiders'] 20 | NEWSPIDER_MODULE = 'etao.spiders' 21 | 22 | # JOBDIR = 'jobdir' 23 | ITEM_PIPELINES = { 24 | 'etao.pipelines.XmlWritePipeline': 1000, 25 | } 26 | 27 | 28 | 29 | 30 | 31 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 32 | #USER_AGENT = 'etao (+http://www.yourdomain.com)' 33 | -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/settings.pyc -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/spiders/__init__.pyc -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/spiders/spider.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | ''' 3 | gouwu.sogou.com / etao.com Spider, Created on Dec, 2014 4 | #version: 1.0 5 | #author: chenqx @http://chenqx.github.com 6 | See more: http://doc.scrapy.org/en/latest/index.html 7 | ''' 8 | import time 9 | from scrapy.selector import Selector 10 | from scrapy.http import Request 11 | from scrapy.contrib.spiders import CrawlSpider 12 | from scrapy.contrib.loader import ItemLoader 13 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 14 | from etao.items import EtaoItem 15 | from etao.lstData import lstData 16 | 17 | from selenium import selenium 18 | from selenium import webdriver 19 | 20 | class etaoSpider(CrawlSpider): 21 | # name of spiders 22 | name = 'Spider' 23 | allow_domain = ['gouwu.sogou.com'] 24 | start_urls = [ ('http://gouwu.sogou.com/shop?query=' + searchWord ) for searchWord in lstData().lst] 25 | link_extractor = { 26 | 'page': SgmlLinkExtractor(allow = '/detail/\d+\.html.+'), 27 | 'page_down': SgmlLinkExtractor(allow = '/shop\?query=.+',),#restrict_xpaths = '//a[@class = "pagination-next"]' 28 | } 29 | _x_query = { 30 | 'title': '//p[@class="title"]/a/@title', 31 | 'name': '//span[@class="floatR hui61 mt1"]/text()',#//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text() 32 | 'price' : '//span[@class="shopprice font17"]/text()', # 'price' : '//span[@class = "price"]/text()', 33 | } 34 | 35 | def __init__(self): 36 | CrawlSpider.__init__(self) 37 | # use any browser you wish 38 | self.browser = webdriver.Firefox() 39 | 40 | def __del__(self): 41 | self.browser.close() 42 | 43 | def parse(self, response): 44 | #crawl all display page 45 | for link in self.link_extractor['page_down'].extract_links(response): 46 | yield Request(url = link.url, callback=self.parse) 47 | 48 | #browser 49 | self.browser.get(response.url) 50 | time.sleep(5) 51 | # get the data and write it to scrapy items 52 | etaoItem_loader = ItemLoader(item=EtaoItem(), response = response) 53 | url = str(response.url) 54 | etaoItem_loader.add_value('url', url) 55 | etaoItem_loader.add_xpath('title', self._x_query['title']) 56 | etaoItem_loader.add_xpath('name', self._x_query['name']) 57 | etaoItem_loader.add_xpath('price', self._x_query['price']) 58 | yield etaoItem_loader.load_item() -------------------------------------------------------------------------------- /gouwu.sogou.com/etao/spiders/spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenqx/spiderDemo/d3b0eefd9ce3cd14cf4b944d75352fd0c5046751/gouwu.sogou.com/etao/spiders/spider.pyc -------------------------------------------------------------------------------- /gouwu.sogou.com/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = etao.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = etao 12 | --------------------------------------------------------------------------------