├── AliExpress
├── AliExpress
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── aliexpress.cpython-36.pyc
│ │ └── aliexpress.py
└── scrapy.cfg
├── Amazon_com
├── amazon
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── items.cpython-36.pyc
│ │ ├── pipelines.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── amazon.cpython-36.pyc
│ │ └── amazon.py
└── scrapy.cfg
├── Amazon_in
├── amazon
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── items.cpython-36.pyc
│ │ ├── pipelines.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── amazon.cpython-36.pyc
│ │ └── amazon.py
└── scrapy.cfg
├── Ebay_com
├── Ebay
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── ebay.cpython-36.pyc
│ │ └── ebay.py
└── scrapy.cfg
├── Ebay_in
├── Ebay
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── ebay.cpython-36.pyc
│ │ └── ebay.py
└── scrapy.cfg
├── README.md
└── flipkart
├── flipkart
├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── settings.cpython-36.pyc
├── items.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── flipkart.cpython-36.pyc
│ └── flipkart.py
└── scrapy.cfg
/AliExpress/AliExpress/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/__pycache__/__init__.cpython-36.pyc --------------------------------------------------------------------------------
/AliExpress/AliExpress/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/__pycache__/settings.cpython-36.pyc --------------------------------------------------------------------------------
/AliExpress/AliExpress/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AliexpressItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | --------------------------------------------------------------------------------
/AliExpress/AliExpress/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*-
coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AliexpressSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /AliExpress/AliExpress/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class AliexpressPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /AliExpress/AliExpress/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for AliExpress project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'AliExpress' 13 | 14 | SPIDER_MODULES = ['AliExpress.spiders'] 15 | NEWSPIDER_MODULE = 'AliExpress.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'AliExpress (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'AliExpress.middlewares.AliexpressSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'AliExpress.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'AliExpress.pipelines.AliexpressPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | #Export as CSV Feed 93 | 94 | #FEED_FORMAT = "csv" 95 | #FEED_URI = "AliExpress.csv" 96 | 
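The two commented lines above are this project's CSV export hook: once FEED_FORMAT and FEED_URI are uncommented, every item the spider yields is appended to AliExpress.csv. The spider also expects a category attribute, which Scrapy fills in from a crawl argument such as scrapy crawl aliexpress -a category=headphones. Below is a minimal driver sketch that does both from Python; the file name run_aliexpress.py, the sample category and the output file name are illustrative assumptions, not files or values taken from this repository.

# run_aliexpress.py (hypothetical helper, assumed to sit next to scrapy.cfg so
# that get_project_settings() resolves AliExpress.settings)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('FEED_FORMAT', 'csv')        # same effect as uncommenting the lines above
settings.set('FEED_URI', 'AliExpress.csv')

process = CrawlerProcess(settings)
# The keyword argument becomes self.category inside AliexpressSpider,
# exactly as -a category=headphones would on the command line.
process.crawl('aliexpress', category='headphones')
process.start()   # blocks until the crawl (and the CSV feed export) finishes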
-------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/__pycache__/aliexpress.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/AliExpress/AliExpress/spiders/__pycache__/aliexpress.cpython-36.pyc -------------------------------------------------------------------------------- /AliExpress/AliExpress/spiders/aliexpress.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | import mysql.connector 5 | #database connection 6 | config = { 7 | 'user': 'root', 8 | 'password': 'aitpune411015', 9 | 'host': '127.0.0.1', 10 | 'database': 'Ali_Final', 11 | 'raise_on_warnings': True, 12 | } 13 | 14 | cnx = mysql.connector.connect(**config) 15 | cursor = cnx.cursor() 16 | #cnx.close() 17 | 18 | class AliexpressSpider(scrapy.Spider): 19 | #spider name 20 | name = 'aliexpress' 21 | 22 | def start_requests(self): 23 | #url formed as per user defined category 24 | # g-y for grid view 25 | yield scrapy.Request('https://www.aliexpress.com/wholesale?catId=0&initiative_id=SB_20171114192306&g=y&SearchText=%s' % self.category,callback=self.parse) 26 | 27 | def parse(self,response): 28 | #Extracting the content using css selectors 29 | start_urls=[] 30 | for i in range(0,5): 31 | link=str(response.css("div.info a.history-item.product::attr(href)")[i].extract()) 32 | start_urls.append("https:"+link) 33 | for url in start_urls: 34 | print(url) 35 | #calling parse function as per url to scrap info related to the product link 36 | yield scrapy.Request(url=url, callback=self.parse_product_info) 37 | info={ 38 | 'hello':'varun', 39 | } 40 | yield info 41 | 42 | def parse_product_info(self, response): 43 | 44 | #Extracting the content using css or xpath selectors 45 | url=str(response.xpath('/html/head/meta[7]/@content').extract_first()) 46 | currency=str(response.xpath('.//*[@class="p-symbol"]/text()').extract_first()) 47 | price=str(response.xpath('//*[@class="p-price"]/text()').extract_first()) 48 | if price==' - ': 49 | price=str(response.xpath('.//*[@class="p-price"]/span/text()').extract_first())+"-"+str(response.xpath('.//*[@class="p-price"]/span[2]/text()').extract_first()) 50 | price=currency+price 51 | discount_price=str(response.xpath('//*[@id="j-sku-discount-price"]/text()').extract_first()) 52 | title=str(response.css("title::text").extract_first()) 53 | product_rating=str(response.xpath('//*[@id="j-customer-reviews-trigger"]/span[2]/text()').extract_first()) 54 | 
product_rating_count=str(response.xpath('//*[@id="j-customer-reviews-trigger"]/span[3]/text()').extract_first()) 55 | item_specifics=str(response.css(".ui-box.product-property-main span::text").extract()) 56 | seller_name=str(response.xpath('//*[@id="j-store-info-wrap"]/dl/dd[1]/a/text()').extract_first()) 57 | 58 | print ('URL :',url) 59 | #print('CURRENCY :',currency) 60 | print ('Price :',price) 61 | print ('D_Price :',discount_price) 62 | print ('Title :',title) 63 | print ('P_Rating :',product_rating) 64 | print ('P_R_Count :',product_rating_count) 65 | print ('Item_Specifics :',item_specifics) 66 | print ('Seller_Name :',seller_name) 67 | 68 | 69 | cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""" , (url,price,discount_price,title,product_rating,product_rating_count,item_specifics,seller_name)) 70 | print ("%d rows were inserted" % cursor.rowcount) 71 | cnx.commit() 72 | 73 | #create a dictionary to store the scraped info 74 | scraped_info = { 75 | 76 | 'url' : url, 77 | 'price' : price, 78 | 'discount_price' : discount_price, 79 | 'title' : title, 80 | 'product_rating' : product_rating, 81 | 'product_rating_count' : product_rating_count, 82 | 'item_specifics' : item_specifics, 83 | 'seller_name' : seller_name, 84 | } 85 | 86 | #yield or give the scraped info to scrapy 87 | yield scraped_info 88 | 89 | -------------------------------------------------------------------------------- /AliExpress/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = AliExpress.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = AliExpress 12 | -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class AmazonItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | product_name = scrapy.Field() 13 | product_sale_price = scrapy.Field() 14 | product_category = scrapy.Field() 15 | product_original_price = scrapy.Field() 16 | product_availability = scrapy.Field() 17 | -------------------------------------------------------------------------------- /Amazon_com/amazon/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AmazonSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Amazon_com/amazon/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Define your item pipelines here 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | class AmazonPipeline(object): 7 | def process_item(self, item, spider): 8 | return item 9 | -------------------------------------------------------------------------------- /Amazon_com/amazon/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for amazon project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'amazon' 13 | 14 | SPIDER_MODULES = ['amazon.spiders'] 15 | NEWSPIDER_MODULE = 'amazon.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'amazon (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'amazon.middlewares.AmazonSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'amazon.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'amazon.pipelines.AmazonPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # 
The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/__pycache__/amazon.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_com/amazon/spiders/__pycache__/amazon.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_com/amazon/spiders/amazon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | # import mysql.connector 6 | # #database connection 7 | # config = { 8 | # 'user': 'root', 9 | # 'password': 'aitpune411015', 10 | # 'host': '127.0.0.1', 11 | # 'database': 'Ali_Final', 12 | # 'raise_on_warnings': True, 13 | # } 14 | 15 | # cnx = mysql.connector.connect(**config) 16 | # cursor = cnx.cursor() 17 | #cnx.close() 18 | 19 | from amazon.items import AmazonItem 20 | 21 | class AmazonProductSpider(scrapy.Spider): 22 | #spider name 23 | name = "amazon" 24 | #allowed_domains = ["amazon.com"] 25 | def start_requests(self): 26 | 27 | #Use working product URL below 28 | # start_urls = [ 29 | # "http://www.amazon.com/dp/B0046UR4F4", "http://www.amazon.com/dp/B00JGTVU5A", 30 | # "http://www.amazon.com/dp/B00O9A48N2", "http://www.amazon.com/dp/B00UZKG8QU" 31 | # ] 32 | 33 | #AsinList = ['B073XC3Y5J','B07439FYQX','B01EDXQ5QW','B004OWMLZW','B007YX9O9O',] 34 | 35 | #url formed as per user defined category 36 | yield scrapy.Request('http://www.amazon.com/dp/%s' % self.category,callback=self.parse_product_info) 37 | 38 | def parse_product_info(self, response): 39 | 40 | #Extracting the content using css or xpath selectors 41 | items = AmazonItem() 42 | title = response.xpath('//h1[@id="title"]/span/text()').extract() 43 | sale_price = 
response.xpath('//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()').extract() 44 | category = response.xpath('//a[@class="a-link-normal a-color-tertiary"]/text()').extract() 45 | availability = response.xpath('//div[@id="availability"]//text()').extract() 46 | 47 | #create a dictionary to store the scraped info 48 | items['product_name'] = ''.join(title).strip() 49 | items['product_sale_price'] = ''.join(sale_price).strip() 50 | items['product_category'] = ','.join(map(lambda x: x.strip(), category)).strip() 51 | items['product_availability'] = ''.join(availability).strip() 52 | 53 | #yield or give the scraped info [items] 54 | yield items -------------------------------------------------------------------------------- /Amazon_com/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = amazon.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazon 12 | -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class AmazonItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | product_name = scrapy.Field() 13 | product_sale_price = scrapy.Field() 14 | product_category = scrapy.Field() 15 | product_original_price = scrapy.Field() 16 | product_availability = scrapy.Field() 17 | -------------------------------------------------------------------------------- /Amazon_in/amazon/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class AmazonSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Amazon_in/amazon/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Define your item pipelines here 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | class AmazonPipeline(object): 7 | def process_item(self, item, spider): 8 | return item 9 | -------------------------------------------------------------------------------- /Amazon_in/amazon/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for amazon project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'amazon' 13 | 14 | SPIDER_MODULES = ['amazon.spiders'] 15 | NEWSPIDER_MODULE = 'amazon.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'amazon (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'amazon.middlewares.AmazonSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'amazon.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'amazon.pipelines.AmazonPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/__init__.py: -------------------------------------------------------------------------------- 1 
| # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/__pycache__/amazon.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Amazon_in/amazon/spiders/__pycache__/amazon.cpython-36.pyc -------------------------------------------------------------------------------- /Amazon_in/amazon/spiders/amazon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | # import mysql.connector 6 | # #database connection 7 | # config = { 8 | # 'user': 'root', 9 | # 'password': 'aitpune411015', 10 | # 'host': '127.0.0.1', 11 | # 'database': 'Ali_Final', 12 | # 'raise_on_warnings': True, 13 | # } 14 | 15 | # cnx = mysql.connector.connect(**config) 16 | # cursor = cnx.cursor() 17 | #cnx.close() 18 | 19 | from amazon.items import AmazonItem 20 | 21 | class AmazonProductSpider(scrapy.Spider): 22 | #spider name 23 | name = "amazon" 24 | #allowed_domains = ["amazon.in"] 25 | def start_requests(self): 26 | 27 | #AsinList = ['B01H5EBBX8','B0751LYPY3','B002U1ZBG0','B01HQ4NZE0','B01DU10H2G',] 28 | 29 | #url formed as per user defined category 30 | yield scrapy.Request('http://www.amazon.in/dp/%s' % self.category,callback=self.parse_product_info) 31 | 32 | def parse_product_info(self, response): 33 | 34 | #Extracting the content using css or xpath selectors 35 | items = AmazonItem() 36 | title = response.xpath('//h1[@id="title"]/span/text()').extract() 37 | sale_price = response.xpath('//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()').extract() 38 | category = response.xpath('//a[@class="a-link-normal a-color-tertiary"]/text()').extract() 39 | availability = response.xpath('//div[@id="availability"]//text()').extract() 40 | 41 | #create a dictionary to store the scraped info 42 | items['product_name'] = ''.join(title).strip() 43 | items['product_sale_price'] = ''.join(sale_price).strip() 44 | items['product_category'] = ','.join(map(lambda x: x.strip(), category)).strip() 45 | items['product_availability'] = ''.join(availability).strip() 46 | 47 | #yield or give the scraped info [items] 48 | yield items 49 | -------------------------------------------------------------------------------- /Amazon_in/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = amazon.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazon 12 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class EbayItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class EbaySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class EbayPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Ebay project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Ebay' 13 | 14 | SPIDER_MODULES = ['Ebay.spiders'] 15 | NEWSPIDER_MODULE = 'Ebay.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Ebay (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Ebay.middlewares.EbaySpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Ebay.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Ebay.pipelines.EbayPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial 
download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | #FEED_FORMAT = "csv" 93 | #FEED_URI = "Ebay.csv" 94 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/__pycache__/ebay.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_com/Ebay/spiders/__pycache__/ebay.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_com/Ebay/spiders/ebay.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 16 13:48:05 2017 4 | 5 | @author: vicky 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | import scrapy 10 | 11 | import mysql.connector 12 | #database connection 13 | config = { 14 | 'user': 'root', 15 | 'password': 'aitpune411015', 16 | 'host': '127.0.0.1', 17 | 'database': 'Ali_Final', 18 | 'raise_on_warnings': True, 19 | } 20 | 21 | cnx = mysql.connector.connect(**config) 22 | cursor = cnx.cursor() 23 | #cnx.close() 24 | 25 | class EbayProductSpider(scrapy.Spider): 26 | #spider name 27 | name = 'ebay' 28 | 29 | def start_requests(self): 30 | #url formed as per user defined category 31 | #_dmd=2 for grid view 32 | yield scrapy.Request('https://www.ebay.com/sch/i.html?rt=nc&_dmd=2&_nkw=%s' % self.category,callback=self.parse) 33 | 34 | def parse(self,response): 35 | #Extracting the content using css selectors 36 | start_urls=[] 37 | for i in range(0,5): 38 | link=str(response.css("div.gvtitle a.vip::attr(href)")[i].extract()) 39 | start_urls.append(link) 40 | for url in start_urls: 41 | print(url) 42 | #calling parse function as per url to scrap info related to the product link 43 | yield scrapy.Request(url=url, callback=self.parse_product_info) 44 | info={ 45 | 'hello':'varun', 46 | } 47 | yield info 48 | 49 | def parse_product_info(self, 
response): 50 | 51 | #Extracting the content using css or xpath selectors 52 | url=str(response.xpath('/html/head/link[13]/@href').extract_first()) 53 | #currency=str(response.xpath('.//*[@class="p-symbol"]/text()').extract_first()) 54 | price=str(response.xpath('//*[@id="prcIsum"]/text()').extract_first()) 55 | if price=='': 56 | price=str(response.xpath('//*[@id="mm-saleOrgPrc"]/text()').extract_first()) 57 | discount_price=str(response.xpath('//*[@id="mm-saleDscPrc"]/text()').extract_first()) 58 | else: 59 | discount_price='none' 60 | #if price==' - ': 61 | # price=str(response.xpath('.//*[@class="p-price"]/span/text()').extract_first())+"-"+str(response.xpath('.//*[@class="p-price"]/span[2]/text()').extract_first()) 62 | #price=currency+price 63 | #discount_price=str(response.xpath('//*[@id="j-sku-discount-price"]/text()').extract_first()) 64 | title=str(response.xpath('//*[@id="itemTitle"]/text()').extract_first()) 65 | product_rating=str(response.xpath('//*[@id="histogramid"]/div/div[1]/span[1]/text()').extract_first()) 66 | if product_rating=='': 67 | product_rating='none' 68 | else: 69 | product_rating=product_rating[3:] 70 | product_rating_count=str(response.xpath('//*[@id="_rvwlnk"]/text()').extract_first()) 71 | 72 | #item_specifics=str(response.css(".ui-box.product-property-main span::text").extract()) 73 | item_specifics='none' 74 | seller_name=str(response.xpath('//*[@id="mbgLink"]/span/text()').extract_first()) 75 | shipping_cost=str(response.xpath('.//*[@id="fshippingCost"]//span/text()').extract_first()) 76 | if shipping_cost=='': 77 | shipping_cost='none' 78 | seller_rating=str(response.xpath('.//*[@id="si-fb"]/text()').extract_first()) 79 | 80 | print ('URL :',url) 81 | #print('CURRENCY :',currency) 82 | print ('Price :',price) 83 | print ('D_Price :',discount_price) 84 | print ('Title :',title) 85 | print ('P_Rating :',product_rating) 86 | print ('P_R_Count :',product_rating_count) 87 | print ('Item_Specifics :',item_specifics) 88 | print ('Seller_Name :',seller_name) 89 | print ('Shipping_Cost :',shipping_cost) 90 | print ('Seller_Rating :',seller_rating) 91 | 92 | 93 | cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""" , (url,price,discount_price,title,product_rating,product_rating_count,item_specifics,seller_name)) 94 | print ("%d rows were inserted" % cursor.rowcount) 95 | cnx.commit() 96 | 97 | #create a dictionary to store the scraped info 98 | scraped_info = { 99 | 100 | 'url' : url, 101 | 'price' : price, 102 | 'discount_price' : discount_price, 103 | 'title' : title, 104 | 'product_rating' : product_rating, 105 | 'product_rating_count' : product_rating_count, 106 | 'item_specifics' : item_specifics, 107 | 'seller_name' : seller_name, 108 | 'shipping_cost' : shipping_cost, 109 | 'seller_rating' : seller_rating, 110 | } 111 | 112 | #yield or give the scraped info to scrapy 113 | yield scraped_info 114 | 115 | -------------------------------------------------------------------------------- /Ebay_com/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Ebay.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Ebay 12 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_in/Ebay/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Ebay_in/Ebay/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class EbayItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class EbaySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class EbayPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Ebay_in/Ebay/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Ebay project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Ebay' 13 | 14 | SPIDER_MODULES = ['Ebay.spiders'] 15 | NEWSPIDER_MODULE = 'Ebay.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Ebay (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Ebay.middlewares.EbaySpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Ebay.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Ebay.pipelines.EbayPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial 
download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

#FEED_FORMAT = "csv"
#FEED_URI = "Ebay.csv"
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/__pycache__/ebay.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/Ebay_in/Ebay/spiders/__pycache__/ebay.cpython-36.pyc
--------------------------------------------------------------------------------
/Ebay_in/Ebay/spiders/ebay.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 16 13:48:05 2017

@author: vicky
"""

import scrapy

import mysql.connector

# database connection (the same MySQL database is shared by the spiders in this repo)
config = {
    'user': 'root',
    'password': 'aitpune411015',
    'host': '127.0.0.1',
    'database': 'Ali_Final',
    'raise_on_warnings': True,
}

cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()


class EbayProductSpider(scrapy.Spider):
    # spider name
    name = 'ebay'

    def start_requests(self):
        # search URL built from the user-supplied category (passed with -a category=...)
        # _dmd=2 selects the grid view
        yield scrapy.Request('https://www.ebay.in/sch/i.html?rt=nc&_dmd=2&_nkw=%s' % self.category, callback=self.parse)

    def parse(self, response):
        # extract the first five product links from the result grid using css selectors
        start_urls = []
        for i in range(0, 5):
            link = str(response.css("div.gvtitle a.vip::attr(href)")[i].extract())
            start_urls.append(link)
        for url in start_urls:
            print(url)
            # follow each product link and scrape its details
            yield scrapy.Request(url=url, callback=self.parse_product_info)
        # extra placeholder item
        info = {
            'hello': 'varun',
        }
        yield info

    def parse_product_info(self, response):
        # extract the content using css or xpath selectors;
        # extract_first() returns None when a node is missing, so normalise to '' before comparing
        url = response.xpath('/html/head/link[13]/@href').extract_first() or ''
        price = response.xpath('//*[@id="prcIsum"]/text()').extract_first() or ''
        if price == '':
            # fall back to the sale-price elements used on discounted listings
            price = response.xpath('//*[@id="mm-saleOrgPrc"]/text()').extract_first() or ''
            discount_price = response.xpath('//*[@id="mm-saleDscPrc"]/text()').extract_first() or 'none'
        else:
            discount_price = 'none'
        title = response.xpath('//*[@id="itemTitle"]/text()').extract_first() or ''
        product_rating = response.xpath('//*[@id="histogramid"]/div/div[1]/span[1]/text()').extract_first() or ''
        if product_rating == '':
            product_rating = 'none'
        else:
            # drop the first three characters of the raw rating text
            product_rating = product_rating[3:]
        product_rating_count = response.xpath('//*[@id="_rvwlnk"]/text()').extract_first() or 'none'

        item_specifics = 'none'
        seller_name = response.xpath('//*[@id="mbgLink"]/span/text()').extract_first() or 'none'
        shipping_cost = response.xpath('.//*[@id="fshippingCost"]//span/text()').extract_first() or 'none'
        seller_rating = response.xpath('.//*[@id="si-fb"]/text()').extract_first() or 'none'

        print('URL :', url)
        print('Price :', price)
        print('D_Price :', discount_price)
        print('Title :', title)
        print('P_Rating :', product_rating)
        print('P_R_Count :', product_rating_count)
        print('Item_Specifics :', item_specifics)
        print('Seller_Name :', seller_name)
        print('Shipping_Cost :', shipping_cost)
        print('Seller_Rating :', seller_rating)

        # insert the record into the shared AliExpress table
        cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""", (url, price, discount_price, title, product_rating, product_rating_count, item_specifics, seller_name))
        print("%d rows were inserted" % cursor.rowcount)
        cnx.commit()

        # dictionary holding the scraped info
        scraped_info = {
            'url': url,
            'price': price,
            'discount_price': discount_price,
            'title': title,
            'product_rating': product_rating,
            'product_rating_count': product_rating_count,
            'item_specifics': item_specifics,
            'seller_name': seller_name,
            'shipping_cost': shipping_cost,
            'seller_rating': seller_rating,
        }

        # yield the scraped info to scrapy
        yield scraped_info
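Both this spider and the Flipkart one insert rows into an AliExpress table inside the Ali_Final database, but the table definition itself is not part of the repository. A minimal sketch of a schema that would satisfy the 8-value INSERT above; the column names and types here are assumptions chosen to match the values the spiders pass, not existing code:

# create_table.py -- hypothetical helper, not included in the repository
import mysql.connector

config = {
    'user': 'root',
    'password': 'aitpune411015',
    'host': '127.0.0.1',
    'database': 'Ali_Final',
}

cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()
# eight columns, in the order the spiders pass them to INSERT
cursor.execute("""
    CREATE TABLE IF NOT EXISTS AliExpress (
        url                  TEXT,
        price                VARCHAR(64),
        discount_price       VARCHAR(64),
        title                TEXT,
        product_rating       VARCHAR(32),
        product_rating_count VARCHAR(32),
        item_specifics       TEXT,
        seller_name          VARCHAR(255)
    )
""")
cnx.commit()
cnx.close()

With the table in place, the spider reads its search keyword from a spider argument, so it would be started from the Ebay_in project directory with something like: scrapy crawl ebay -a category=shoes (the category value is only an example).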
--------------------------------------------------------------------------------
/Ebay_in/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = Ebay.settings

[deploy]
#url = http://localhost:6800/
project = Ebay
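A note on the Ebay_in project as a whole: pipelines.py is still the pass-through template and the MySQL connection is opened at module import inside the spider. An alternative layout, shown here only as a sketch and not as the repository's code, is to move the insert into the pipeline so the connection is opened and closed together with the spider and is activated through the ITEM_PIPELINES setting that is currently commented out in settings.py:

# Ebay/pipelines.py -- hypothetical variant, not the repository's code
import mysql.connector


class EbayPipeline(object):
    def open_spider(self, spider):
        # open the connection when the spider starts instead of at import time
        self.cnx = mysql.connector.connect(
            user='root', password='aitpune411015',
            host='127.0.0.1', database='Ali_Final')
        self.cursor = self.cnx.cursor()

    def close_spider(self, spider):
        self.cnx.commit()
        self.cnx.close()

    def process_item(self, item, spider):
        # item is the scraped_info dict yielded by the spider
        self.cursor.execute(
            """INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""",
            (item.get('url', ''), item.get('price', ''), item.get('discount_price', ''),
             item.get('title', ''), item.get('product_rating', ''),
             item.get('product_rating_count', ''), item.get('item_specifics', ''),
             item.get('seller_name', '')))
        return item

Enabling it would only require uncommenting the setting shown earlier in settings.py, e.g. ITEM_PIPELINES = {'Ebay.pipelines.EbayPipeline': 300}.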
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# E-Commercial-Sites-Scrappers
Python scripts built with the Scrapy framework for scraping e-commerce sites such as AliExpress, Flipkart, Amazon and eBay, with the scraped data stored in a MySQL database (managed through MySQL Workbench).
--------------------------------------------------------------------------------
/flipkart/flipkart/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FlipkartItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/flipkart/flipkart/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class FlipkartSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /flipkart/flipkart/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class FlipkartPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /flipkart/flipkart/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for flipkart project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'flipkart' 13 | 14 | SPIDER_MODULES = ['flipkart.spiders'] 15 | NEWSPIDER_MODULE = 'flipkart.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'flipkart (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'flipkart.middlewares.FlipkartSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'flipkart.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'flipkart.pipelines.FlipkartPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See 
http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/__pycache__/flipkart.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/varunnayal15112/E-Commercial-Sites-Scrappers/e879928329ad3aa13536172f502426a974746a0f/flipkart/flipkart/spiders/__pycache__/flipkart.cpython-36.pyc
--------------------------------------------------------------------------------
/flipkart/flipkart/spiders/flipkart.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 21 11:13:30 2017

@author: vicky
"""

import scrapy

import mysql.connector

# database connection (the same MySQL database is shared by the spiders in this repo)
config = {
    'user': 'root',
    'password': 'aitpune411015',
    'host': '127.0.0.1',
    'database': 'Ali_Final',
    'raise_on_warnings': True,
}

cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()


class FlipkartProductSpider(scrapy.Spider):
    # spider name
    name = 'flipkart'

    def start_requests(self):
        # search URL built from the user-supplied category (passed with -a category=...)
        yield scrapy.Request('https://www.flipkart.com/search?otracker=start&as-show=on&as=off&q=%s' % self.category, callback=self.parse)

    def parse(self, response):
        # extract the first five product links from the result page using css selectors
        start_urls = []
        for i in range(0, 5):
            link = str(response.css("div._3liAhj a.Zhf2z-::attr(href)")[i].extract())
            start_urls.append("https://www.flipkart.com" + link)
        for url in start_urls:
            print(url)
            # follow each product link and scrape its details
            yield scrapy.Request(url=url, callback=self.parse_product_info)
        # extra placeholder item
        info = {
            'hello': 'varun',
        }
        yield info

    def parse_product_info(self, response):
        # extract the content using css or xpath selectors;
        # extract_first() returns None when a node is missing, so normalise to '' before comparing
        url = response.xpath('/html/head/link[12]/@href').extract_first() or ''
        currency = response.xpath('.//*[@class="_3auQ3N _16fZeb"]/text()').extract_first() or ''
        if currency == '':
            # fall back to the _1vC4OE _37U4_g elements when the _3auQ3N _16fZeb block is missing
            currency = str(response.xpath('.//*[@class="_1vC4OE _37U4_g"]/text()')[0].extract())
            price = str(response.xpath('.//*[@class="_1vC4OE _37U4_g"]/text()')[1].extract())
            price = currency + " " + price
            discount_price = 'none'
        else:
            price = str(response.xpath('.//*[@class="_3auQ3N _16fZeb"]/text()')[1].extract())
            price = currency + " " + price
            discount_price = str(response.xpath('.//*[@class="_1vC4OE _37U4_g"]/text()')[1].extract())
            discount_price = currency + " " + discount_price
        title = response.xpath('.//*[@class="_3eAQiD"]/text()').extract_first() or ''
        product_rating = response.xpath('.//*[@class="niH0FQ"]/span[1]/div/text()').extract_first() or ''
        if product_rating == '':
            product_rating = 'none'
        product_rating_count = response.xpath('.//*[@class="_38sUEc"]/span/span/text()').extract_first() or ''
        if product_rating_count == '':
            product_rating_count = 'none'
        else:
            # drop the trailing character of the ratings-count text
            product_rating_count = product_rating_count[:-1]

        item_specifics = 'none'

        seller_name = response.xpath('.//*[@id="sellerName"]//span/text()').extract_first() or ''
        if seller_name == '':
            seller_name = 'none'
            seller_rating = 'none'
        else:
            # the seller rating, when present, is appended to the seller name in parentheses
            seller_rating = 'none'
            for i in range(0, len(seller_name)):
                if seller_name[i] == '(':
                    seller_rating = seller_name[i:]

        print('URL :', url)
        print('Price :', price)
        print('D_Price :', discount_price)
        print('Title :', title)
        print('P_Rating :', product_rating)
        print('P_R_Count :', product_rating_count)
        print('Item_Specifics :', item_specifics)
        print('Seller_Name :', seller_name)
        print('Seller_rating :', seller_rating)

        # insert the record into the shared AliExpress table
        cursor.execute("""INSERT INTO AliExpress VALUES(%s,%s,%s,%s,%s,%s,%s,%s)""", (url, price, discount_price, title, product_rating, product_rating_count, item_specifics, seller_name))
        print("%d rows were inserted" % cursor.rowcount)
        cnx.commit()

        # dictionary holding the scraped info
        scraped_info = {
            'url': url,
            'price': price,
            'discount_price': discount_price,
            'title': title,
            'product_rating': product_rating,
            'product_rating_count': product_rating_count,
            'item_specifics': item_specifics,
            'seller_name': seller_name,
            'seller_rating': seller_rating,
        }

        # yield the scraped info to scrapy
        yield scraped_info
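The _3liAhj, _1vC4OE and similar selectors above are Flipkart's generated CSS class names, so they tend to break whenever the site ships a new build and usually need re-checking in the browser. For running the spider, the category is again a spider argument; besides the usual command line (scrapy crawl flipkart -a category=mobiles), a small launcher script is one way to start it, sketched below with an example category value (the script itself is not part of the repository):

# run_flipkart.py -- hypothetical launcher, not included in the repository
# equivalent to running: scrapy crawl flipkart -a category=mobiles
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from flipkart.spiders.flipkart import FlipkartProductSpider

process = CrawlerProcess(get_project_settings())
# keyword arguments become attributes on the spider, so self.category is available in start_requests
process.crawl(FlipkartProductSpider, category='mobiles')
process.start()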
--------------------------------------------------------------------------------
/flipkart/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = flipkart.settings

[deploy]
#url = http://localhost:6800/
project = flipkart
--------------------------------------------------------------------------------
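A closing note that applies to every project in this repository: the items.py files (AliexpressItem, EbayItem, FlipkartItem) are still the empty startproject templates, and the spiders yield plain dicts instead. If typed items are ever wanted (for stricter field checking or cleaner feed exports), a sketch of what EbayItem could look like is shown below; the field list is only an assumption taken from the keys of the scraped_info dict the spider builds, not existing code:

# Ebay/items.py -- hypothetical version, not the repository's code
import scrapy


class EbayItem(scrapy.Item):
    # one Field per key of the scraped_info dict yielded by the spider
    url = scrapy.Field()
    price = scrapy.Field()
    discount_price = scrapy.Field()
    title = scrapy.Field()
    product_rating = scrapy.Field()
    product_rating_count = scrapy.Field()
    item_specifics = scrapy.Field()
    seller_name = scrapy.Field()
    shipping_cost = scrapy.Field()
    seller_rating = scrapy.Field()

With such a class in place, parse_product_info could yield EbayItem(**scraped_info) instead of the raw dict.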