├── AliExpress
├── AliExpress
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── aliexpress.cpython-36.pyc
│ │ └── aliexpress.py
└── scrapy.cfg
├── Amazon_com
├── amazon
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── items.cpython-36.pyc
│ │ ├── pipelines.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── amazon.cpython-36.pyc
│ │ └── amazon.py
└── scrapy.cfg
├── Amazon_in
├── amazon
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── items.cpython-36.pyc
│ │ ├── pipelines.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── amazon.cpython-36.pyc
│ │ └── amazon.py
└── scrapy.cfg
├── Ebay_com
├── Ebay
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── ebay.cpython-36.pyc
│ │ └── ebay.py
└── scrapy.cfg
├── Ebay_in
├── Ebay
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── ebay.cpython-36.pyc
│ │ └── ebay.py
└── scrapy.cfg
├── README.md
└── flipkart
├── flipkart
├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── settings.cpython-36.pyc
├── items.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── flipkart.cpython-36.pyc
│ └── flipkart.py
└── scrapy.cfg
/AliExpress/AliExpress/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/__pycache__/__init__.cpython-36.pyc --------------------------------------------------------------------------------
/AliExpress/AliExpress/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/__pycache__/settings.cpython-36.pyc --------------------------------------------------------------------------------
/AliExpress/AliExpress/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AliexpressItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | --------------------------------------------------------------------------------
/AliExpress/AliExpress/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*-
coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AliexpressSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /AliExpress/AliExpress/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class AliexpressPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /AliExpress/AliExpress/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for AliExpress project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'AliExpress' 13 | 14 | SPIDER_MODULES = ['AliExpress.spiders'] 15 | NEWSPIDER_MODULE = 'AliExpress.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'AliExpress (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'AliExpress.middlewares.AliexpressSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'AliExpress.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'AliExpress.pipelines.AliexpressPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | #Export as CSV Feed 93 | 94 | #FEED_FORMAT = "csv" 95 | #FEED_URI = "AliExpress.csv" 96 | 
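The two commented lines above are this project's CSV export hook: once FEED_FORMAT and FEED_URI are uncommented, every item the spider yields is appended to AliExpress.csv. The spider also expects a category attribute, which Scrapy fills in from a crawl argument such as scrapy crawl aliexpress -a category=headphones. Below is a minimal driver sketch that does both from Python; the file name run_aliexpress.py, the sample category and the output file name are illustrative assumptions, not files or values taken from this repository.

# run_aliexpress.py (hypothetical helper, assumed to sit next to scrapy.cfg so
# that get_project_settings() resolves AliExpress.settings)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('FEED_FORMAT', 'csv')        # same effect as uncommenting the lines above
settings.set('FEED_URI', 'AliExpress.csv')

process = CrawlerProcess(settings)
# The keyword argument becomes self.category inside AliexpressSpider,
# exactly as -a category=headphones would on the command line.
process.crawl('aliexpress', category='headphones')
process.start()   # blocks until the crawl (and the CSV feed export) finishes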
-------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/__pycache__/aliexpress.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/spiders/__pycache__/aliexpress.cpython-36.pyc -------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/aliexpress.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | import mysql.connector 5 | #database connection 6 | config = { 7 | 'user': 'root', 8 | 'password': 'aitpune411015', 9 | 'host': '127.0.0.1', 10 | 'database': 'Ali_Final', 11 | 'raise_on_warnings': True, 12 | } 13 | 14 | cnx = mysql.connector.connect(**config) 15 | cursor = cnx.cursor() 16 | #cnx.close() 17 | 18 | class AliexpressSpider(scrapy.Spider): 19 | #spider name 20 | name = 'aliexpress' 21 | 22 | def start_requests(self): 23 | #url formed as per user defined category 24 | # g-y for grid view 25 | yield scrapy.Request('https://www.aliexpress.com/wholesale?catId=0&initiative_id=SB_20171114192306&g=y&SearchText=%s' % self.category,callback=self.parse) 26 | 27 | def parse(self,response): 28 | #Extracting the content using css selectors 29 | start_urls=[] 30 | for i in range(0,5): 31 | link=str(response.css("div.info a.history-item.product::attr(href)")[i].extract()) 32 | start_urls.append("https:"+link) 33 | for url in start_urls: 34 | print(url) 35 | #calling parse function as per url to scrap info related to the product link 36 | yield scrapy.Request(url=url, callback=self.parse_product_info) 37 | info={ 38 | 'hello':'varun', 39 | } 40 | yield info 41 | 42 | def parse_product_info(self, response): 43 | 44 | #Extracting the content using css or xpath selectors 45 | url=str(response.xpath('/html/head/meta[7]/@content').extract_first()) 46 | currency=str(response.xpath('.//*[@class="p-symbol"]/text()').extract_first()) 47 | price=str(response.xpath('//*[@class="p-price"]/text()').extract_first()) 48 | if price==' - ': 49 | price=str(response.xpath('.//*[@class="p-price"]/span/text()').extract_first())+"-"+str(response.xpath('.//*[@class="p-price"]/span[2]/text()').extract_first()) 50 | price=currency+price 51 | discount_price=str(response.xpath('//*[@id="j-sku-discount-price"]/text()').extract_first()) 52 | title=str(response.css("title::text").extract_first()) 53 | product_rating=str(response.xpath('//*[@id="j-customer-reviews-trigger"]/span[2]/text()').extract_first()) 54 | 
product_rating_count=str(response.xpath('//*[@id="j-customer-reviews-trigger"]/span[3]/text()').extract_first()) 55 | item_specifics=str(response.css(".ui-box.product-property-main span::text").extract()) 56 | seller_name=str(response.xpath('//*[@id="j-store-info-wrap"]/dl/dd[1]/a/text()').extract_first()) 57 | 58 | print ('URL :',url) 59 | #print('CURRENCY :',currency) 60 | print ('Price :',price) 61 | print ('D_Price :',discount_price) 62 | print ('Title :',title) 63 | print ('P_Rating :',product_rating) 64 | print ('P_R_Count :',product_rating_count) 65 | print ('Item_Specifics :',item_specifics) 66 | print ('Seller_Name :',seller_name) 67 | 68 | 69 | cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""" , (url,price,discount_price,title,product_rating,product_rating_count,item_specifics,seller_name)) 70 | print ("%d rows were inserted" % cursor.rowcount) 71 | cnx.commit() 72 | 73 | #create a dictionary to store the scraped info 74 | scraped_info = { 75 | 76 | 'url' : url, 77 | 'price' : price, 78 | 'discount_price' : discount_price, 79 | 'title' : title, 80 | 'product_rating' : product_rating, 81 | 'product_rating_count' : product_rating_count, 82 | 'item_specifics' : item_specifics, 83 | 'seller_name' : seller_name, 84 | } 85 | 86 | #yield or give the scraped info to scrapy 87 | yield scraped_info 88 | 89 | -------------------------------------------------------------------------------- /AliExpress/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = AliExpress.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = AliExpress 12 | -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class AmazonItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | product_name = scrapy.Field() 13 | product_sale_price = scrapy.Field() 14 | product_category = scrapy.Field() 15 | product_original_price = scrapy.Field() 16 | product_availability = scrapy.Field() 17 | -------------------------------------------------------------------------------- /Amazon_com/amazon/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AmazonSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Amazon_com/amazon/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Define your item pipelines here 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | class AmazonPipeline(object): 7 | def process_item(self, item, spider): 8 | return item 9 | -------------------------------------------------------------------------------- /Amazon_com/amazon/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for amazon project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'amazon' 13 | 14 | SPIDER_MODULES = ['amazon.spiders'] 15 | NEWSPIDER_MODULE = 'amazon.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'amazon (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'amazon.middlewares.AmazonSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'amazon.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'amazon.pipelines.AmazonPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # 
The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/__pycache__/amazon.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/spiders/__pycache__/amazon.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/amazon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | # import mysql.connector 6 | # #database connection 7 | # config = { 8 | # 'user': 'root', 9 | # 'password': 'aitpune411015', 10 | # 'host': '127.0.0.1', 11 | # 'database': 'Ali_Final', 12 | # 'raise_on_warnings': True, 13 | # } 14 | 15 | # cnx = mysql.connector.connect(**config) 16 | # cursor = cnx.cursor() 17 | #cnx.close() 18 | 19 | from amazon.items import AmazonItem 20 | 21 | class AmazonProductSpider(scrapy.Spider): 22 | #spider name 23 | name = "amazon" 24 | #allowed_domains = ["amazon.com"] 25 | def start_requests(self): 26 | 27 | #Use working product URL below 28 | # start_urls = [ 29 | # "http://www.amazon.com/dp/B0046UR4F4", "http://www.amazon.com/dp/B00JGTVU5A", 30 | # "http://www.amazon.com/dp/B00O9A48N2", "http://www.amazon.com/dp/B00UZKG8QU" 31 | # ] 32 | 33 | #AsinList = ['B073XC3Y5J','B07439FYQX','B01EDXQ5QW','B004OWMLZW','B007YX9O9O',] 34 | 35 | #url formed as per user defined category 36 | yield scrapy.Request('http://www.amazon.com/dp/%s' % self.category,callback=self.parse_product_info) 37 | 38 | def parse_product_info(self, response): 39 | 40 | #Extracting the content using css or xpath selectors 41 | items = AmazonItem() 42 | title = response.xpath('//h1[@id="title"]/span/text()').extract() 43 | sale_price = 
response.xpath('//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()').extract() 44 | category = response.xpath('//a[@class="a-link-normal a-color-tertiary"]/text()').extract() 45 | availability = response.xpath('//div[@id="availability"]//text()').extract() 46 | 47 | #create a dictionary to store the scraped info 48 | items['product_name'] = ''.join(title).strip() 49 | items['product_sale_price'] = ''.join(sale_price).strip() 50 | items['product_category'] = ','.join(map(lambda x: x.strip(), category)).strip() 51 | items['product_availability'] = ''.join(availability).strip() 52 | 53 | #yield or give the scraped info [items] 54 | yield items -------------------------------------------------------------------------------- /Amazon_com/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = amazon.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazon 12 | -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class AmazonItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | product_name = scrapy.Field() 13 | product_sale_price = scrapy.Field() 14 | product_category = scrapy.Field() 15 | product_original_price = scrapy.Field() 16 | product_availability = scrapy.Field() 17 | -------------------------------------------------------------------------------- /Amazon_in/amazon/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AmazonSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Amazon_in/amazon/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Define your item pipelines here 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | class AmazonPipeline(object): 7 | def process_item(self, item, spider): 8 | return item 9 | -------------------------------------------------------------------------------- /Amazon_in/amazon/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for amazon project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'amazon' 13 | 14 | SPIDER_MODULES = ['amazon.spiders'] 15 | NEWSPIDER_MODULE = 'amazon.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'amazon (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'amazon.middlewares.AmazonSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'amazon.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'amazon.pipelines.AmazonPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/__init__.py: -------------------------------------------------------------------------------- 1 
| # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/__pycache__/amazon.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/spiders/__pycache__/amazon.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/amazon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | # import mysql.connector 6 | # #database connection 7 | # config = { 8 | # 'user': 'root', 9 | # 'password': 'aitpune411015', 10 | # 'host': '127.0.0.1', 11 | # 'database': 'Ali_Final', 12 | # 'raise_on_warnings': True, 13 | # } 14 | 15 | # cnx = mysql.connector.connect(**config) 16 | # cursor = cnx.cursor() 17 | #cnx.close() 18 | 19 | from amazon.items import AmazonItem 20 | 21 | class AmazonProductSpider(scrapy.Spider): 22 | #spider name 23 | name = "amazon" 24 | #allowed_domains = ["amazon.in"] 25 | def start_requests(self): 26 | 27 | #AsinList = ['B01H5EBBX8','B0751LYPY3','B002U1ZBG0','B01HQ4NZE0','B01DU10H2G',] 28 | 29 | #url formed as per user defined category 30 | yield scrapy.Request('http://www.amazon.in/dp/%s' % self.category,callback=self.parse_product_info) 31 | 32 | def parse_product_info(self, response): 33 | 34 | #Extracting the content using css or xpath selectors 35 | items = AmazonItem() 36 | title = response.xpath('//h1[@id="title"]/span/text()').extract() 37 | sale_price = response.xpath('//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()').extract() 38 | category = response.xpath('//a[@class="a-link-normal a-color-tertiary"]/text()').extract() 39 | availability = response.xpath('//div[@id="availability"]//text()').extract() 40 | 41 | #create a dictionary to store the scraped info 42 | items['product_name'] = ''.join(title).strip() 43 | items['product_sale_price'] = ''.join(sale_price).strip() 44 | items['product_category'] = ','.join(map(lambda x: x.strip(), category)).strip() 45 | items['product_availability'] = ''.join(availability).strip() 46 | 47 | #yield or give the scraped info [items] 48 | yield items 49 | -------------------------------------------------------------------------------- /Amazon_in/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = amazon.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazon 12 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class EbayItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class EbaySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class EbayPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Ebay project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Ebay' 13 | 14 | SPIDER_MODULES = ['Ebay.spiders'] 15 | NEWSPIDER_MODULE = 'Ebay.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Ebay (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Ebay.middlewares.EbaySpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Ebay.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Ebay.pipelines.EbayPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial 
download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | #FEED_FORMAT = "csv" 93 | #FEED_URI = "Ebay.csv" 94 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/__pycache__/ebay.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/spiders/__pycache__/ebay.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/ebay.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 16 13:48:05 2017 4 | 5 | @author: vicky 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | import scrapy 10 | 11 | import mysql.connector 12 | #database connection 13 | config = { 14 | 'user': 'root', 15 | 'password': 'aitpune411015', 16 | 'host': '127.0.0.1', 17 | 'database': 'Ali_Final', 18 | 'raise_on_warnings': True, 19 | } 20 | 21 | cnx = mysql.connector.connect(**config) 22 | cursor = cnx.cursor() 23 | #cnx.close() 24 | 25 | class EbayProductSpider(scrapy.Spider): 26 | #spider name 27 | name = 'ebay' 28 | 29 | def start_requests(self): 30 | #url formed as per user defined category 31 | #_dmd=2 for grid view 32 | yield scrapy.Request('https://www.ebay.com/sch/i.html?rt=nc&_dmd=2&_nkw=%s' % self.category,callback=self.parse) 33 | 34 | def parse(self,response): 35 | #Extracting the content using css selectors 36 | start_urls=[] 37 | for i in range(0,5): 38 | link=str(response.css("div.gvtitle a.vip::attr(href)")[i].extract()) 39 | start_urls.append(link) 40 | for url in start_urls: 41 | print(url) 42 | #calling parse function as per url to scrap info related to the product link 43 | yield scrapy.Request(url=url, callback=self.parse_product_info) 44 | info={ 45 | 'hello':'varun', 46 | } 47 | yield info 48 | 49 | def parse_product_info(self, 
response): 50 | 51 | #Extracting the content using css or xpath selectors 52 | url=str(response.xpath('/html/head/link[13]/@href').extract_first()) 53 | #currency=str(response.xpath('.//*[@class="p-symbol"]/text()').extract_first()) 54 | price=str(response.xpath('//*[@id="prcIsum"]/text()').extract_first()) 55 | if price=='': 56 | price=str(response.xpath('//*[@id="mm-saleOrgPrc"]/text()').extract_first()) 57 | discount_price=str(response.xpath('//*[@id="mm-saleDscPrc"]/text()').extract_first()) 58 | else: 59 | discount_price='none' 60 | #if price==' - ': 61 | # price=str(response.xpath('.//*[@class="p-price"]/span/text()').extract_first())+"-"+str(response.xpath('.//*[@class="p-price"]/span[2]/text()').extract_first()) 62 | #price=currency+price 63 | #discount_price=str(response.xpath('//*[@id="j-sku-discount-price"]/text()').extract_first()) 64 | title=str(response.xpath('//*[@id="itemTitle"]/text()').extract_first()) 65 | product_rating=str(response.xpath('//*[@id="histogramid"]/div/div[1]/span[1]/text()').extract_first()) 66 | if product_rating=='': 67 | product_rating='none' 68 | else: 69 | product_rating=product_rating[3:] 70 | product_rating_count=str(response.xpath('//*[@id="_rvwlnk"]/text()').extract_first()) 71 | 72 | #item_specifics=str(response.css(".ui-box.product-property-main span::text").extract()) 73 | item_specifics='none' 74 | seller_name=str(response.xpath('//*[@id="mbgLink"]/span/text()').extract_first()) 75 | shipping_cost=str(response.xpath('.//*[@id="fshippingCost"]//span/text()').extract_first()) 76 | if shipping_cost=='': 77 | shipping_cost='none' 78 | seller_rating=str(response.xpath('.//*[@id="si-fb"]/text()').extract_first()) 79 | 80 | print ('URL :',url) 81 | #print('CURRENCY :',currency) 82 | print ('Price :',price) 83 | print ('D_Price :',discount_price) 84 | print ('Title :',title) 85 | print ('P_Rating :',product_rating) 86 | print ('P_R_Count :',product_rating_count) 87 | print ('Item_Specifics :',item_specifics) 88 | print ('Seller_Name :',seller_name) 89 | print ('Shipping_Cost :',shipping_cost) 90 | print ('Seller_Rating :',seller_rating) 91 | 92 | 93 | cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""" , (url,price,discount_price,title,product_rating,product_rating_count,item_specifics,seller_name)) 94 | print ("%d rows were inserted" % cursor.rowcount) 95 | cnx.commit() 96 | 97 | #create a dictionary to store the scraped info 98 | scraped_info = { 99 | 100 | 'url' : url, 101 | 'price' : price, 102 | 'discount_price' : discount_price, 103 | 'title' : title, 104 | 'product_rating' : product_rating, 105 | 'product_rating_count' : product_rating_count, 106 | 'item_specifics' : item_specifics, 107 | 'seller_name' : seller_name, 108 | 'shipping_cost' : shipping_cost, 109 | 'seller_rating' : seller_rating, 110 | } 111 | 112 | #yield or give the scraped info to scrapy 113 | yield scraped_info 114 | 115 | -------------------------------------------------------------------------------- /Ebay_com/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Ebay.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Ebay 12 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_in/Ebay/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_in/Ebay/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class EbayItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class EbaySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class EbayPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Ebay project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Ebay' 13 | 14 | SPIDER_MODULES = ['Ebay.spiders'] 15 | NEWSPIDER_MODULE = 'Ebay.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Ebay (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Ebay.middlewares.EbaySpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Ebay.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Ebay.pipelines.EbayPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial 
download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

#FEED_FORMAT = "csv"
#FEED_URI = "Ebay.csv"
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/__pycache__/ebay.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/spiders/__pycache__/ebay.cpython-36.pyc
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/ebay.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 16 13:48:05 2017

@author: vicky
"""

import scrapy

import mysql.connector

# database connection (the same MySQL database is shared by the spiders in this repo)
config = {
    'user': 'root',
    'password': 'aitpune411015',
    'host': '127.0.0.1',
    'database': 'Ali_Final',
    'raise_on_warnings': True,
}

cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()


class EbayProductSpider(scrapy.Spider):
    # spider name
    name = 'ebay'

    def start_requests(self):
        # search URL built from the user-supplied category (passed with -a category=...)
        # _dmd=2 selects the grid view
        yield scrapy.Request('https://www.ebay.in/sch/i.html?rt=nc&_dmd=2&_nkw=%s' % self.category, callback=self.parse)

    def parse(self, response):
        # extract the first five product links from the result grid using css selectors
        start_urls = []
        for i in range(0, 5):
            link = str(response.css("div.gvtitle a.vip::attr(href)")[i].extract())
            start_urls.append(link)
        for url in start_urls:
            print(url)
            # follow each product link and scrape its details
            yield scrapy.Request(url=url, callback=self.parse_product_info)
        # extra placeholder item
        info = {
            'hello': 'varun',
        }
        yield info

    def parse_product_info(self, response):
        # extract the content using css or xpath selectors;
        # extract_first() returns None when a node is missing, so normalise to '' before comparing
        url = response.xpath('/html/head/link[13]/@href').extract_first() or ''
        price = response.xpath('//*[@id="prcIsum"]/text()').extract_first() or ''
        if price == '':
            # fall back to the sale-price elements used on discounted listings
            price = response.xpath('//*[@id="mm-saleOrgPrc"]/text()').extract_first() or ''
            discount_price = response.xpath('//*[@id="mm-saleDscPrc"]/text()').extract_first() or 'none'
        else:
            discount_price = 'none'
        title = response.xpath('//*[@id="itemTitle"]/text()').extract_first() or ''
        product_rating = response.xpath('//*[@id="histogramid"]/div/div[1]/span[1]/text()').extract_first() or ''
        if product_rating == '':
            product_rating = 'none'
        else:
            # drop the first three characters of the raw rating text
            product_rating = product_rating[3:]
        product_rating_count = response.xpath('//*[@id="_rvwlnk"]/text()').extract_first() or 'none'

        item_specifics = 'none'
        seller_name = response.xpath('//*[@id="mbgLink"]/span/text()').extract_first() or 'none'
        shipping_cost = response.xpath('.//*[@id="fshippingCost"]//span/text()').extract_first() or 'none'
        seller_rating = response.xpath('.//*[@id="si-fb"]/text()').extract_first() or 'none'

        print('URL :', url)
        print('Price :', price)
        print('D_Price :', discount_price)
        print('Title :', title)
        print('P_Rating :', product_rating)
        print('P_R_Count :', product_rating_count)
        print('Item_Specifics :', item_specifics)
        print('Seller_Name :', seller_name)
        print('Shipping_Cost :', shipping_cost)
        print('Seller_Rating :', seller_rating)

        # insert the record into the shared AliExpress table
        cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""", (url, price, discount_price, title, product_rating, product_rating_count, item_specifics, seller_name))
        print("%d rows were inserted" % cursor.rowcount)
        cnx.commit()

        # dictionary holding the scraped info
        scraped_info = {
            'url': url,
            'price': price,
            'discount_price': discount_price,
            'title': title,
            'product_rating': product_rating,
            'product_rating_count': product_rating_count,
            'item_specifics': item_specifics,
            'seller_name': seller_name,
            'shipping_cost': shipping_cost,
            'seller_rating': seller_rating,
        }

        # yield the scraped info to scrapy
        yield scraped_info
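Both this spider and the Flipkart one insert rows into an AliExpress table inside the Ali_Final database, but the table definition itself is not part of the repository. A minimal sketch of a schema that would satisfy the 8-value INSERT above; the column names and types here are assumptions chosen to match the values the spiders pass, not existing code:

# create_table.py -- hypothetical helper, not included in the repository
import mysql.connector

config = {
    'user': 'root',
    'password': 'aitpune411015',
    'host': '127.0.0.1',
    'database': 'Ali_Final',
}

cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()
# eight columns, in the order the spiders pass them to INSERT
cursor.execute("""
    CREATE TABLE IF NOT EXISTS AliExpress (
        url                  TEXT,
        price                VARCHAR(64),
        discount_price       VARCHAR(64),
        title                TEXT,
        product_rating       VARCHAR(32),
        product_rating_count VARCHAR(32),
        item_specifics       TEXT,
        seller_name          VARCHAR(255)
    )
""")
cnx.commit()
cnx.close()

With the table in place, the spider reads its search keyword from a spider argument, so it would be started from the Ebay_in project directory with something like: scrapy crawl ebay -a category=shoes (the category value is only an example).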
--------------------------------------------------------------------------------
/Ebay_in/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = Ebay.settings

[deploy]
#url = http://localhost:6800/
project = Ebay
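A note on the Ebay_in project as a whole: pipelines.py is still the pass-through template and the MySQL connection is opened at module import inside the spider. An alternative layout, shown here only as a sketch and not as the repository's code, is to move the insert into the pipeline so the connection is opened and closed together with the spider and is activated through the ITEM_PIPELINES setting that is currently commented out in settings.py:

# Ebay/pipelines.py -- hypothetical variant, not the repository's code
import mysql.connector


class EbayPipeline(object):
    def open_spider(self, spider):
        # open the connection when the spider starts instead of at import time
        self.cnx = mysql.connector.connect(
            user='root', password='aitpune411015',
            host='127.0.0.1', database='Ali_Final')
        self.cursor = self.cnx.cursor()

    def close_spider(self, spider):
        self.cnx.commit()
        self.cnx.close()

    def process_item(self, item, spider):
        # item is the scraped_info dict yielded by the spider
        self.cursor.execute(
            """INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""",
            (item.get('url', ''), item.get('price', ''), item.get('discount_price', ''),
             item.get('title', ''), item.get('product_rating', ''),
             item.get('product_rating_count', ''), item.get('item_specifics', ''),
             item.get('seller_name', '')))
        return item

Enabling it would only require uncommenting the setting shown earlier in settings.py, e.g. ITEM_PIPELINES = {'Ebay.pipelines.EbayPipeline': 300}.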
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# E-Commercial-Sites-Scrappers
Python scripts built with the Scrapy framework for scraping e-commerce sites such as AliExpress, Flipkart, Amazon and eBay, with the scraped data stored in a MySQL database (managed through MySQL Workbench).
--------------------------------------------------------------------------------
/flipkart/flipkart/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FlipkartItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/flipkart/flipkart/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class FlipkartSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /flipkart/flipkart/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class FlipkartPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /flipkart/flipkart/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for flipkart project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'flipkart' 13 | 14 | SPIDER_MODULES = ['flipkart.spiders'] 15 | NEWSPIDER_MODULE = 'flipkart.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'flipkart (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'flipkart.middlewares.FlipkartSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'flipkart.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'flipkart.pipelines.FlipkartPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See 
http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/__pycache__/flipkart.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/spiders/__pycache__/flipkart.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/flipkart.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 21 11:13:30 2017

@author: vicky
"""

import scrapy

import mysql.connector

# database connection (the same MySQL database is shared by the spiders in this repo)
config = {
    'user': 'root',
    'password': 'aitpune411015',
    'host': '127.0.0.1',
    'database': 'Ali_Final',
    'raise_on_warnings': True,
}

cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()


class FlipkartProductSpider(scrapy.Spider):
    # spider name
    name = 'flipkart'

    def start_requests(self):
        # search URL built from the user-supplied category (passed with -a category=...)
        yield scrapy.Request('https://www.flipkart.com/search?otracker=start&as-show=on&as=off&q=%s' % self.category, callback=self.parse)

    def parse(self, response):
        # extract the first five product links from the result page using css selectors
        start_urls = []
        for i in range(0, 5):
            link = str(response.css("div._3liAhj a.Zhf2z-::attr(href)")[i].extract())
            start_urls.append("https://www.flipkart.com" + link)
        for url in start_urls:
            print(url)
            # follow each product link and scrape its details
            yield scrapy.Request(url=url, callback=self.parse_product_info)
        # extra placeholder item
        info = {
            'hello': 'varun',
        }
        yield info

    def parse_product_info(self, response):
        # extract the content using css or xpath selectors;
        # extract_first() returns None when a node is missing, so normalise to '' before comparing
        url = response.xpath('/html/head/link[12]/@href').extract_first() or ''
        currency = response.xpath('.//*[@class="_3auQ3N _16fZeb"]/text()').extract_first() or ''
        if currency == '':
            # fall back to the _1vC4OE _37U4_g elements when the _3auQ3N _16fZeb block is missing
            currency = str(response.xpath('.//*[@class="_1vC4OE _37U4_g"]/text()')[0].extract())
            price = str(response.xpath('.//*[@class="_1vC4OE _37U4_g"]/text()')[1].extract())
            price = currency + " " + price
            discount_price = 'none'
        else:
            price = str(response.xpath('.//*[@class="_3auQ3N _16fZeb"]/text()')[1].extract())
            price = currency + " " + price
            discount_price = str(response.xpath('.//*[@class="_1vC4OE _37U4_g"]/text()')[1].extract())
            discount_price = currency + " " + discount_price
        title = response.xpath('.//*[@class="_3eAQiD"]/text()').extract_first() or ''
        product_rating = response.xpath('.//*[@class="niH0FQ"]/span[1]/div/text()').extract_first() or ''
        if product_rating == '':
            product_rating = 'none'
        product_rating_count = response.xpath('.//*[@class="_38sUEc"]/span/span/text()').extract_first() or ''
        if product_rating_count == '':
            product_rating_count = 'none'
        else:
            # drop the trailing character of the ratings-count text
            product_rating_count = product_rating_count[:-1]

        item_specifics = 'none'

        seller_name = response.xpath('.//*[@id="sellerName"]//span/text()').extract_first() or ''
        if seller_name == '':
            seller_name = 'none'
            seller_rating = 'none'
        else:
            # the seller rating, when present, is appended to the seller name in parentheses
            seller_rating = 'none'
            for i in range(0, len(seller_name)):
                if seller_name[i] == '(':
                    seller_rating = seller_name[i:]

        print('URL :', url)
        print('Price :', price)
        print('D_Price :', discount_price)
        print('Title :', title)
        print('P_Rating :', product_rating)
        print('P_R_Count :', product_rating_count)
        print('Item_Specifics :', item_specifics)
        print('Seller_Name :', seller_name)
        print('Seller_rating :', seller_rating)

        # insert the record into the shared AliExpress table
        cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""", (url, price, discount_price, title, product_rating, product_rating_count, item_specifics, seller_name))
        print("%d rows were inserted" % cursor.rowcount)
        cnx.commit()

        # dictionary holding the scraped info
        scraped_info = {
            'url': url,
            'price': price,
            'discount_price': discount_price,
            'title': title,
            'product_rating': product_rating,
            'product_rating_count': product_rating_count,
            'item_specifics': item_specifics,
            'seller_name': seller_name,
            'seller_rating': seller_rating,
        }

        # yield the scraped info to scrapy
        yield scraped_info
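The _3liAhj, _1vC4OE and similar selectors above are Flipkart's generated CSS class names, so they tend to break whenever the site ships a new build and usually need re-checking in the browser. For running the spider, the category is again a spider argument; besides the usual command line (scrapy crawl flipkart -a category=mobiles), a small launcher script is one way to start it, sketched below with an example category value (the script itself is not part of the repository):

# run_flipkart.py -- hypothetical launcher, not included in the repository
# equivalent to running: scrapy crawl flipkart -a category=mobiles
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from flipkart.spiders.flipkart import FlipkartProductSpider

process = CrawlerProcess(get_project_settings())
# keyword arguments become attributes on the spider, so self.category is available in start_requests
process.crawl(FlipkartProductSpider, category='mobiles')
process.start()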
--------------------------------------------------------------------------------
/flipkart/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = flipkart.settings

[deploy]
#url = http://localhost:6800/
project = flipkart
--------------------------------------------------------------------------------
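A closing note that applies to every project in this repository: the items.py files (AliexpressItem, EbayItem, FlipkartItem) are still the empty startproject templates, and the spiders yield plain dicts instead. If typed items are ever wanted (for stricter field checking or cleaner feed exports), a sketch of what EbayItem could look like is shown below; the field list is only an assumption taken from the keys of the scraped_info dict the spider builds, not existing code:

# Ebay/items.py -- hypothetical version, not the repository's code
import scrapy


class EbayItem(scrapy.Item):
    # one Field per key of the scraped_info dict yielded by the spider
    url = scrapy.Field()
    price = scrapy.Field()
    discount_price = scrapy.Field()
    title = scrapy.Field()
    product_rating = scrapy.Field()
    product_rating_count = scrapy.Field()
    item_specifics = scrapy.Field()
    seller_name = scrapy.Field()
    shipping_cost = scrapy.Field()
    seller_rating = scrapy.Field()

With such a class in place, parse_product_info could yield EbayItem(**scraped_info) instead of the raw dict.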