├── .gitignore ├── README.md ├── requirements.txt └── walmart_spider ├── new_walmart_products_1.csv ├── scrapy.cfg ├── walmart_products_1.csv └── walmart_spider ├── __init__.py ├── aws_signed_request.py ├── items.py ├── match.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders ├── __init__.py ├── captcha_solver.py ├── categories.py ├── homedepot.py ├── kohls.py ├── target.py └── walmart.py ├── test.py └── train_captchas_data ├── train_captchas_data_images.npy └── train_captchas_data_labels.npy /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | *.pyc 12 | venv 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | # IPython Notebook 65 | .ipynb_checkpoints 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # walmart 2 | Scrapy spiders that scrape product data from the Walmart, Target, Home Depot and Kohl's websites and look up matching listings through the Amazon Product Advertising API 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.2.1 2 | beautifulsoup4==4.4.1 3 | bs4==0.0.1 4 | fake-useragent==0.0.8 5 | requests==2.9.1 6 | wsgiref==0.1.2 7 | -------------------------------------------------------------------------------- /walmart_spider/new_walmart_products_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/new_walmart_products_1.csv -------------------------------------------------------------------------------- /walmart_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = walmart_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = walmart_spider 12 | -------------------------------------------------------------------------------- /walmart_spider/walmart_products_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_products_1.csv --------------------------------------------------------------------------------
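This is a standard Scrapy project layout, so the spiders are normally launched from the walmart_spider/ directory that contains scrapy.cfg (for example with `scrapy crawl walmart` on the command line). The snippet below is a minimal sketch of running a spider programmatically, assuming Scrapy is installed; the spider names come from the `name` attributes in spiders/, and CSVExportPipeline then writes its output to files such as walmart_products_1.csv.

# Minimal run sketch (assumption: executed from the directory containing scrapy.cfg).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads walmart_spider.settings
process.crawl('walmart')   # other spider names: 'target', 'homedepot', 'kohls', 'categories'
process.start()            # blocks until the crawl finishes; CSVs come from CSVExportPipeline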
/walmart_spider/walmart_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_spider/__init__.py -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/aws_signed_request.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import time 4 | import urllib 5 | import base64 6 | import hmac 7 | import hashlib 8 | 9 | def aws_signed_request(region, params, public_key, private_key, associate_tag=None, version='2011-08-01'): 10 | 11 | """ 12 | Copyright (c) 2010-2012 Ulrich Mierendorff 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a 15 | copy of this software and associated documentation files (the "Software"), 16 | to deal in the Software without restriction, including without limitation 17 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 18 | and/or sell copies of the Software, and to permit persons to whom the 19 | Software is furnished to do so, subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be included in 22 | all copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 27 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 29 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | DEALINGS IN THE SOFTWARE. 31 | """ 32 | 33 | """ 34 | Parameters: 35 | region - the Amazon(r) region (ca,com,co.uk,de,fr,co.jp) 36 | params - a dictionary of parameters, for example 37 | {'Operation': 'ItemLookup', 38 | 'ItemId': 'B000X9FLKM', 39 | 'ResponseGroup': 'Small'} 40 | public_key - your "Access Key ID" 41 | private_key - your "Secret Access Key" 42 | version [optional] 43 | """ 44 | 45 | # some paramters 46 | method = 'GET' 47 | host = 'webservices.amazon.' + region 48 | uri = '/onca/xml' 49 | 50 | # additional parameters 51 | params['Service'] = 'AWSECommerceService' 52 | params['AWSAccessKeyId'] = public_key 53 | params['Timestamp'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) 54 | params['Version'] = version 55 | if associate_tag: 56 | params['AssociateTag'] = associate_tag 57 | 58 | # create the canonicalized query 59 | canonicalized_query = [urllib.quote(param).replace('%7E', '~') + '=' + urllib.quote(params[param]).replace('%7E', '~') 60 | for param in sorted(params.keys())] 61 | canonicalized_query = '&'.join(canonicalized_query) 62 | 63 | # create the string to sign 64 | string_to_sign = method + '\n' + host + '\n' + uri + '\n' + canonicalized_query; 65 | 66 | # calculate HMAC with SHA256 and base64-encoding 67 | signature = base64.b64encode(hmac.new(key=private_key, msg=string_to_sign, digestmod=hashlib.sha256).digest()) 68 | 69 | # encode the signature for the request 70 | signature = urllib.quote(signature).replace('%7E', '~') 71 | 72 | return 'http://' + host + uri + '?' 
+ canonicalized_query + '&Signature=' + signature 73 | 74 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.loader.processors import TakeFirst 4 | 5 | class WalmartItem(scrapy.Item): 6 | title = scrapy.Field(output_processor=TakeFirst()) 7 | upc = scrapy.Field(output_processor=TakeFirst()) 8 | rank = scrapy.Field(output_processor=TakeFirst()) 9 | category = scrapy.Field(output_processor=TakeFirst()) 10 | walmart_price = scrapy.Field(output_processor=TakeFirst()) 11 | homedepot_price = scrapy.Field(output_processor=TakeFirst()) 12 | target_price = scrapy.Field(output_processor=TakeFirst()) 13 | amazon_price1 = scrapy.Field(output_processor=TakeFirst()) 14 | amazon_price2 = scrapy.Field(output_processor=TakeFirst()) 15 | amazon_price3 = scrapy.Field(output_processor=TakeFirst()) 16 | weight = scrapy.Field(output_processor=TakeFirst()) 17 | wt_cost = scrapy.Field(output_processor=TakeFirst()) 18 | Tax_Cost = scrapy.Field(output_processor=TakeFirst()) 19 | Fees = scrapy.Field(output_processor=TakeFirst()) 20 | Tot_Cost = scrapy.Field(output_processor=TakeFirst()) 21 | Profit = scrapy.Field(output_processor=TakeFirst()) 22 | ROI = scrapy.Field(output_processor=TakeFirst()) 23 | 24 | 25 | class CategoryItem(scrapy.Item): 26 | category = scrapy.Field(output_processor=TakeFirst()) 27 | top5 = scrapy.Field(output_processor=TakeFirst()) 28 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/match.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from operator import itemgetter 3 | 4 | def get_upc_from_file(filename): 5 | with open(filename, mode='r') as infile: 6 | reader = csv.DictReader(infile) 7 | all_upc = [row for row in reader] 8 | return all_upc 9 | 10 | def get_categories_from_file(filename): 11 | with open(filename, mode='r') as infile: 12 | reader = csv.DictReader(infile) 13 | categories = [row for row in reader] 14 | return categories 15 | 16 | def fill_acceptable_rank(all_upc, categories): 17 | for item in all_upc: 18 | if item['group']: 19 | for cat in categories: 20 | if item['group'] in cat['category']: 21 | item['acceptable_rank'] = cat['top5'] 22 | return all_upc 23 | 24 | def remove_filds_with_rank_greater_then_acceptable_rank(all_upc): 25 | items_for_delete = [] 26 | for item in all_upc: 27 | if item['rank'] and item['acceptable_rank']: 28 | if int(item['rank'].replace(',', '')) > int(item['acceptable_rank']): 29 | items_for_delete.append(item) 30 | 31 | for item in items_for_delete: 32 | all_upc.remove(item) 33 | 34 | return all_upc 35 | 36 | def fill_weight_cost(all_upc): 37 | for item in all_upc: 38 | if item['weight']: 39 | item['weight_cost'] = str(float(item['weight'].replace(',', '')) * 0.75) 40 | 41 | return all_upc 42 | 43 | def fill_ROI(all_upc): 44 | for item in all_upc: 45 | if item['net_payout'] and item['weight_cost'] and item['cost']: 46 | item['ROI'] = \ 47 | float(item['net_payout'].replace('$', '').replace(',', '')) - \ 48 | (float(item['cost'].replace('$', '').replace(',', ''))*0.7) - \ 49 | float(item['weight_cost']) 50 | 51 | return all_upc 52 | 53 | def remove_negative_ROI(all_upc): 54 | items_for_delete = [] 55 | for item in all_upc: 56 | if item['ROI'] < 0.0: 57 | items_for_delete.append(item) 58 | 59 | for item in 
items_for_delete: 60 | all_upc.remove(item) 61 | return all_upc 62 | 63 | def sort_by_field(all_upc, field): 64 | return sorted(all_upc, key=itemgetter(field)) 65 | 66 | def save_to_file(filename, all_upc): 67 | keys = all_upc[0].keys() 68 | with open(filename, 'wb') as output_file: 69 | writer = csv.DictWriter(output_file, keys) 70 | writer.writeheader() 71 | writer.writerows(all_upc) 72 | 73 | 74 | if __name__ == '__main__': 75 | for i in range(1, 4): 76 | all_upc = get_upc_from_file('walmart_products_%s_new.csv' % str(i)) 77 | categories = get_categories_from_file('categories.csv') 78 | 79 | all_upc = fill_acceptable_rank(all_upc, categories) 80 | all_upc = remove_filds_with_rank_greater_then_acceptable_rank(all_upc) 81 | all_upc = fill_weight_cost(all_upc) 82 | all_upc = fill_ROI(all_upc) 83 | all_upc = remove_negative_ROI(all_upc) 84 | all_upc = sort_by_field(all_upc, 'ROI') 85 | 86 | save_to_file('walmart_new_%s.csv' % str(i), all_upc) 87 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import random 3 | from scrapy.conf import settings 4 | 5 | class ProxyMiddleware(object): 6 | def process_request(self, request, spider): 7 | request.meta['proxy'] = "http://23.81.251.102:29842" 8 | 9 | proxy_user_pass = "dsudom:43FVYMRy" 10 | encoded_user_pass = base64.encodestring(proxy_user_pass) 11 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 12 | 13 | 14 | class RandomUserAgentMiddleware(object): 15 | def process_request(self, request, spider): 16 | ua = random.choice(settings.get('USER_AGENT_LIST')) 17 | if ua: 18 | request.headers.setdefault('User-Agent', ua) -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exporters import CsvItemExporter 3 | 4 | class CSVExportPipeline(object): 5 | counter = 0 6 | file_count = 1 7 | 8 | def __init__(self): 9 | self.files = {} 10 | 11 | @classmethod 12 | def from_crawler(cls, crawler): 13 | pipeline = cls() 14 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 15 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 16 | return pipeline 17 | 18 | def spider_opened(self, spider): 19 | if spider.name != 'categories': 20 | file = open('%s_products_%s.csv' % (spider.name, self.file_count), 21 | 'w+b') 22 | self.files[spider] = file 23 | tag = spider.name + "_price" 24 | self.export_fields = ['title', 'upc', 'rank', 'category', tag, 'amazon_price1', 'amazon_price2', 'amazon_price3', 'weight', 'wt_cost', 'Tax_Cost', 'Fees', 'Tot_Cost', 'Profit', 'ROI'] 25 | self.exporter = CsvItemExporter(file, fields_to_export=self.export_fields) 26 | self.exporter.start_exporting() 27 | 28 | def spider_closed(self, spider): 29 | if spider.name != 'categories': 30 | self.exporter.finish_exporting() 31 | file = self.files.pop(spider) 32 | file.close() 33 | 34 | def process_item(self, item, spider): 35 | if spider.name != 'categories': 36 | tag = spider.name + "_price" 37 | price = item.get(tag, None) 38 | upc = item.get('upc', None) 39 | if price and upc: 40 | self.counter += 1 41 | if self.counter == 10000: 42 | self.exporter.finish_exporting() 43 | file = self.files.pop(spider) 44 | file.close() 45 | 46 | self.file_count += 1 47 | 
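# Rotate the export target: the previous CSV just reached 10,000 exported items, so open
# <spider.name>_products_<file_count>.csv as the next output file and rebuild the exporter
# with the same field list.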
48 | file = open('%s_products_%s.csv' % (spider.name, 49 | self.file_count), 50 | 'w+b') 51 | self.files[spider] = file 52 | self.export_fields = ['title', 'upc', 'rank', 'category', tag, 'amazon_price1', 'amazon_price2', 'amazon_price3', 'weight', 'wt_cost', 'Tax_Cost', 'Fees', 'Tot_Cost', 'Profit', 'ROI'] 53 | self.exporter = CsvItemExporter(file, fields_to_export=self.export_fields) 54 | self.exporter.start_exporting() 55 | 56 | self.counter = 0 57 | print self.counter, '*'*50 58 | self.exporter.export_item(item) 59 | return item 60 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for walmart_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'walmart_spider' 13 | 14 | SPIDER_MODULES = ['walmart_spider.spiders'] 15 | NEWSPIDER_MODULE = 'walmart_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'walmart_spider (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | 40 | # Enable or disable spider middlewares 41 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 42 | #SPIDER_MIDDLEWARES = { 43 | # 'walmart_spider.middlewares.MyCustomSpiderMiddleware': 543, 44 | #} 45 | 46 | # Enable or disable downloader middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 48 | # USER_AGENT_LIST = [ 49 | # 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7(KHTML, like Gecko) ' 50 | # 'Chrome/16.0.912.36 Safari/535.7', 51 | # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0)Gecko/16.0 Firefox/16.0', 52 | # 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3' 53 | # '(KHTML, like Gecko) Version/5.1.3 Safari/534.53.10' 54 | # ] 55 | 56 | 57 | USER_AGENT_LIST = [ 58 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 59 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 60 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 61 | "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0", 62 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0", 63 | "Mozilla/5.0 (Windows NT 6.4; WOW64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 64 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36", 65 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36", 66 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 67 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36", 68 | "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0", 69 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 70 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 71 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0", 72 | "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 73 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36", 74 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 75 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0", 76 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)", 77 | ] 78 | 79 | 80 | # DOWNLOADER_MIDDLEWARES = { 81 | # 'walmart_spider.middlewares.ProxyMiddleware': 410, 82 | # } 83 | 84 | DOWNLOADER_MIDDLEWARES = { 85 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 86 | 'walmart_spider.middlewares.RandomUserAgentMiddleware': 400, 87 | } 88 | 89 | # Enable or disable extensions 90 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 91 | #EXTENSIONS = { 92 | # 'scrapy.telnet.TelnetConsole': None, 93 | #} 94 | 95 | # Configure item pipelines 96 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 97 | 98 | ITEM_PIPELINES = { 99 | 'walmart_spider.pipelines.CSVExportPipeline': 300, 100 | } 101 | 102 | FEED_EXPORTERS = { 103 | 'csv': 'walmart_spider.pipelines.CSVExportPipeline', 104 | } 105 | 106 | CONCURRENT_REQUESTS=100 107 | 108 | CONCURRENT_REQUESTS_PER_IP=16 109 | 110 | # Enable and configure the AutoThrottle extension (disabled by default) 111 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 112 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 113 | AUTOTHROTTLE_ENABLED=True 114 | # The initial download delay 115 | #AUTOTHROTTLE_START_DELAY=5 116 | # The maximum download delay to be set in case of high latencies 117 | #AUTOTHROTTLE_MAX_DELAY=60 118 | # Enable showing throttling stats for every response received: 119 | AUTOTHROTTLE_DEBUG=True 120 | 121 | # Enable and configure HTTP caching (disabled by default) 122 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 123 | #HTTPCACHE_ENABLED=True 124 | #HTTPCACHE_EXPIRATION_SECS=0 125 | #HTTPCACHE_DIR='httpcache' 126 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 127 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 128 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # 
Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/captcha_solver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | try: 3 | import numpy as np 4 | import cv2 5 | except Exception as e: 6 | print '!!!!!!!!Captcha breaker is not available due to: %s' % e 7 | class CaptchaBreakerWrapper(object): 8 | @staticmethod 9 | def solve_captcha(url): 10 | msg("CaptchaBreaker in not available for url: %s" % url, 11 | level=WARNING) 12 | return None 13 | 14 | import sys 15 | import os 16 | import re 17 | 18 | import urllib 19 | 20 | 21 | class CaptchaBreaker: 22 | 23 | HEIGHT = 50 24 | WIDTH = 50 25 | 26 | ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 27 | 28 | 29 | knn = None 30 | 31 | def __init__(self, train_data, output_train_data_file=None, from_dir=False): 32 | if from_dir: 33 | self.knn = self.train_from_dir(train_data, output_train_data_file) 34 | else: 35 | self.knn = self.train_from_file(train_data) 36 | 37 | def letter_to_number(self, letter): 38 | return self.ALPHABET.index(letter) 39 | 40 | def number_to_letter(self, number): 41 | return self.ALPHABET[number] 42 | 43 | def clean_image(self, image, trim=False): 44 | 45 | gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY) 46 | thresh = cv2.adaptiveThreshold( 47 | gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,1,11,2) 48 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3)) 49 | eroded = cv2.erode(thresh,kernel,iterations = 1) 50 | contours,hierarchy = cv2.findContours(eroded,cv2.RETR_EXTERNAL, 51 | cv2.CHAIN_APPROX_NONE) 52 | [x,y,w,h] = cv2.boundingRect(contours[0]) 53 | roi = thresh[y:y+h,x:x+w] 54 | if trim: 55 | ret = self.add_borders(roi) 56 | else: 57 | ret = self.add_borders(thresh) 58 | return ret 59 | 60 | def add_borders(self, image): 61 | height, width = image.shape 62 | width_pad = (self.WIDTH - width) / 2.0 63 | left_pad = int(width_pad) 64 | if (left_pad != width_pad): 65 | right_pad = left_pad+1 66 | else: 67 | right_pad = left_pad 68 | 69 | height_pad = (self.HEIGHT - height) / 2.0 70 | top_pad = int(height_pad) 71 | if (top_pad!=height_pad): 72 | bottom_pad = top_pad+1 73 | else: 74 | bottom_pad = top_pad 75 | 76 | if height_pad > 0 and width_pad > 0: 77 | dst = cv2.copyMakeBorder(image, top_pad, bottom_pad, left_pad, 78 | right_pad, cv2.BORDER_CONSTANT, value=0) 79 | else: 80 | dst = cv2.resize(image,(self.HEIGHT,self.WIDTH)) 81 | sys.stderr.write("Could not add borders, shape " + str(height) 82 | + "," + str(width) + "\n") 83 | return dst 84 | 85 | def segment(self, im): 86 | 87 | gray = cv2.cvtColor(im,cv2.COLOR_BGR2GRAY) 88 | thresh = cv2.adaptiveThreshold( 89 | gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,1,11,2) 90 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3)) 91 | thresh = cv2.erode(thresh,kernel,iterations = 1) 92 | contours,hierarchy = cv2.findContours(thresh,cv2.RETR_EXTERNAL, 93 | cv2.CHAIN_APPROX_NONE) 94 | rois = [] 95 | im2 = np.copy(im) 96 | for cnt in contours: 97 | [x,y,w,h] = cv2.boundingRect(cnt) 98 | if (h<5) or (w<5): 99 | continue 100 | cv2.rectangle(im2,(x,y),(x+w,y+h),(0,0,255),2) 101 | roi = gray[y:y+h,x:x+w] 102 | rois.append((roi, x)) 103 | ret = map(lambda x: x[0],sorted(rois, key=lambda x: x[1])) 104 | return ret 105 | 106 | def get_images_from_dir(self, directory): 107 | train_images_names = os.listdir(directory) 108 | train_images = [] 109 | train_labels = [] 110 | for 
filename in train_images_names: 111 | train_images.append(cv2.imread(directory+"/"+filename)) 112 | m = re.match("(.*)\..*", filename) 113 | if m: 114 | base = m.group(1) 115 | letter = base[0] 116 | train_labels.append(self.letter_to_number(letter)) 117 | 118 | for i in range(len(train_images)): 119 | train_images[i] = self.clean_image(train_images[i]) 120 | 121 | train_arrays = [] 122 | for image in train_images: 123 | train_arrays.append(np.array(image)) 124 | 125 | train_data = np.array(train_arrays) 126 | 127 | images = train_data.reshape(-1, 128 | self.HEIGHT*self.WIDTH).astype(np.float32) 129 | labels = np.array(train_labels) 130 | 131 | return (images, labels) 132 | 133 | def get_images_from_captcha(self, filename): 134 | images = self.segment(cv2.imread(filename)) 135 | 136 | for i in range(len(images)): 137 | images[i] = cv2.adaptiveThreshold( 138 | images[i],255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,1,11,2) 139 | images[i] = self.add_borders(images[i]) 140 | 141 | image_arrays = [] 142 | for image in images: 143 | image_arrays.append(np.array(image)) 144 | 145 | data = np.array(image_arrays) 146 | 147 | ret_images = data.reshape(-1,self.HEIGHT*self.WIDTH).astype(np.float32) 148 | return ret_images 149 | 150 | def train_from_dir(self, train_dir, datafile=None): 151 | (train, train_labels) = self.get_images_from_dir(train_dir) 152 | knn = cv2.KNearest() 153 | knn.train(train,train_labels) 154 | if datafile: 155 | np.save(datafile + "_images", train) 156 | np.save(datafile + "_labels", train_labels) 157 | 158 | return knn 159 | 160 | def train_from_file(self, train_data_file): 161 | train = np.load(train_data_file + "/train_captchas_data_images.npy") 162 | train_labels = np.load(train_data_file + 163 | "/train_captchas_data_labels.npy") 164 | knn = cv2.KNearest() 165 | knn.train(train,train_labels) 166 | 167 | return knn 168 | 169 | def test_captcha(self, captchafile): 170 | test = self.get_images_from_captcha(captchafile) 171 | ret,result,neighbours,dist = self.knn.find_nearest(test,k=1) 172 | result_labels = [] 173 | for label in result: 174 | result_labels.append(self.number_to_letter(int(label[0]))) 175 | return "".join(result_labels) 176 | 177 | def test_dir(self, test_dir): 178 | (test, test_labels) = self.get_images_from_dir(test_dir) 179 | ret,result,neighbours,dist = self.knn.find_nearest(test,k=2) 180 | test_letter_labels = [] 181 | for label in test_labels: 182 | test_letter_labels.append(number_to_letter(label)) 183 | print test_letter_labels 184 | result_labels = [] 185 | for label in result: 186 | result_labels.append(number_to_letter(int(label[0]))) 187 | print 'result:\n', result_labels 188 | 189 | l1 = np.array(result_labels) 190 | l2 = np.array(test_letter_labels) 191 | matches = l1==l2 192 | correct = np.count_nonzero(matches) 193 | accuracy = correct*100.0/result.size 194 | print accuracy 195 | 196 | 197 | class CaptchaBreakerWrapper(): 198 | 199 | CB = None 200 | # CAPTCHAS_DIR = "captchas" 201 | # SOLVED_CAPTCHAS_DIR = "solved_captchas" 202 | # TRAIN_DATA_PATH = "tra in_captchas_data" 203 | CAPTCHAS_DIR = "/tmp/captchas" 204 | SOLVED_CAPTCHAS_DIR = "/tmp/solved_captchas" 205 | directory = os.path.dirname(os.path.abspath(__file__)) 206 | TRAIN_DATA_PATH = os.path.join(directory, '..', 'train_captchas_data') 207 | 208 | def solve_captcha(self, image_URL, debug_info=True): 209 | 210 | if not os.path.exists(self.CAPTCHAS_DIR): 211 | os.makedirs(self.CAPTCHAS_DIR) 212 | if not os.path.exists(self.SOLVED_CAPTCHAS_DIR): 213 | os.makedirs(self.SOLVED_CAPTCHAS_DIR) 214 | 215 
| m = re.match(".*/(Captcha_.*)",image_URL) 216 | if not m: 217 | if debug_info: 218 | sys.stderr.write("Couldn't extract captcha image name " 219 | "from URL " + image_URL) 220 | return None 221 | 222 | else: 223 | image_name = m.group(1) 224 | urllib.urlretrieve(image_URL, self.CAPTCHAS_DIR + "/" + image_name) 225 | captcha_text = None 226 | 227 | try: 228 | if not self.CB: 229 | self.CB = CaptchaBreaker(self.TRAIN_DATA_PATH) 230 | if debug_info: 231 | sys.stderr.write("Training captcha classifier...\n") 232 | 233 | captcha_text = self.CB.test_captcha(self.CAPTCHAS_DIR + "/" 234 | + image_name) 235 | 236 | urllib.urlretrieve(image_URL, self.SOLVED_CAPTCHAS_DIR + "/" 237 | + captcha_text + ".jpg") 238 | if debug_info: 239 | sys.stderr.write("Solving captcha: " + image_URL + 240 | " with result " + captcha_text + "\n") 241 | 242 | except Exception, e: 243 | sys.stderr.write("Exception on solving captcha, for captcha " 244 | + self.CAPTCHAS_DIR + "/" + image_name + 245 | "\nException message: " + str(e) + "\n") 246 | 247 | return captcha_text 248 | 249 | 250 | if __name__=="__main__": 251 | CW = CaptchaBreakerWrapper() 252 | # CW.solve_captcha("http://ecx.images-amazon.com/captcha/bfhuzdtn/Captcha_distpnvhaw.jpg", False) 253 | # CW.solve_captcha("http://ecx.images-amazon.com/captcha/bfhuzdtn/Captcha_distpnvhaw.jpg") 254 | # CW.solve_captcha("http://ecx.images-amazon.com/captcha/bfhuzdtn/Captcha_distpnvhaw.jpg") 255 | CW.solve_captcha("https://ipv4.google.com/sorry/image?id=7585877133141730835&hl=ru") -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/categories.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | from scrapy.loader import ItemLoader 3 | 4 | from walmart_spider.items import CategoryItem 5 | 6 | is_empty = lambda x, y="": x[0] if x else y 7 | 8 | class WalmartSpider(Spider): 9 | name = 'categories' 10 | 11 | allowed_domains = ['www.arbitragedashboard.com'] 12 | start_urls = ['http://www.arbitragedashboard.com/software/top-rank-chart/'] 13 | 14 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 15 | 'Gecko/20100101 Firefox/35.0' 16 | 17 | def parse(self, response): 18 | trs = response.xpath('//tr') 19 | for tr in trs: 20 | l = ItemLoader(item=CategoryItem(), response=response) 21 | category = is_empty(tr.xpath('./td[1]/text()').extract()) 22 | top5 = is_empty(tr.xpath('./td[5]/text()').extract()) 23 | 24 | l.add_value('category', category) 25 | l.add_value('top5', top5) 26 | 27 | yield l.load_item() 28 | 29 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/homedepot.py: -------------------------------------------------------------------------------- 1 | import re 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | from scrapy.loader import ItemLoader 5 | from walmart_spider.items import WalmartItem 6 | from walmart_spider.aws_signed_request import aws_signed_request 7 | from bs4 import BeautifulSoup 8 | import requests 9 | import urllib2 10 | import datetime 11 | import time 12 | import xml.etree.ElementTree as ET 13 | from fake_useragent import UserAgent 14 | ua = UserAgent() 15 | 16 | is_empty = lambda x, y="": x[0] if x else y 17 | 18 | class HomedepotSpider(Spider): 19 | name = 'homedepot' 20 | 21 | allowed_domains = ['www.homedepot.com'] 22 | 23 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 24 | 
'Gecko/20100101 Firefox/35.0' 25 | 26 | BASE_URL = 'http://www.homedepot.com/' 27 | 28 | start_urls = ['http://www.homedepot.com/c/site_map'] 29 | 30 | def start_requests(self): 31 | headers = {"Accept": "*/*", 32 | "Accept-Encoding": "gzip, deflate", 33 | "User-Agent": "runscope/0.1"} 34 | yield Request(url=self.start_urls[0], headers=headers, 35 | callback=self.parse_category) 36 | 37 | def parse_category(self, response): 38 | links = response.xpath( 39 | '//ul[@class="linkList l"]/li/' 40 | 'a[contains(@href, "www.homedepot.com/b/")]/@href' 41 | ).extract() 42 | 43 | for link in links: 44 | print '-'*25, link, '-'*25 45 | if 'http' in link: 46 | yield Request(url=link, callback=self.parse_product) 47 | else: 48 | yield Request(url='http://'+link, callback=self.parse_product) 49 | 50 | def parse_product(self, response): 51 | 52 | product_links = response.xpath( 53 | '//div[contains(@class, "product pod")]/form/*/*/a/@href' 54 | ).extract() 55 | 56 | for product_link in product_links: 57 | yield Request(url=self.BASE_URL+product_link, callback=self.parse) 58 | 59 | next_link = is_empty( 60 | response.xpath('//a[@title="Next"]/@href').extract() 61 | ) 62 | if next_link: 63 | yield Request(url=self.BASE_URL+next_link, 64 | callback=self.parse_product) 65 | 66 | def parse(self, response): 67 | l = ItemLoader(item=WalmartItem(), response=response) 68 | upc = response.xpath('//upc/text()').extract() 69 | price = response.xpath('//span[@itemprop="price"]/text()').extract() 70 | 71 | upc = is_empty(upc).strip().replace("\n",'').replace('\r', '') 72 | if len(upc) > 12: 73 | upc = upc[1:] 74 | homedepot_price = is_empty(price).strip().replace("\n", '').replace('\r', '') 75 | 76 | l.add_value('upc', upc) 77 | l.add_value('homedepot_price', is_empty(price).strip().replace( 78 | "\n", '').replace('\r', '')) 79 | 80 | print "\n\n upc : ", upc 81 | print "price : ", homedepot_price 82 | 83 | if upc: 84 | region = REGION 85 | public_key = AWS_ACCESS_KEY_ID 86 | private_key = AWS_ACCESS_SECRET_KEY 87 | associate_tag = ASSOCIATE_TAG 88 | params = { 89 | "AWSAccessKeyId": public_key, 90 | "Service": "AWSECommerceService", 91 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 92 | "AssociateTag": associate_tag, 93 | "IdType": "UPC", 94 | "ItemId": upc, 95 | "ResponseGroup": "SalesRank, ItemAttributes, Offers, OfferListings", 96 | "Operation": "ItemLookup", 97 | "SearchIndex": "All" 98 | } 99 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 100 | print url 101 | try: 102 | header = ua.random 103 | headers = {"User-Agent": header} 104 | r = requests.get(url, headers=headers) 105 | content = r.text.encode("UTF-8") 106 | root = ET.fromstring(content) 107 | time.sleep(1) 108 | detail = [] 109 | details = [] 110 | new_price = [] 111 | 112 | rank = '' 113 | weight = None 114 | title = None 115 | category = '' 116 | 117 | for t in root: 118 | for t1 in t: 119 | for t2 in t1: 120 | if "SalesRank" in t2.tag: 121 | rank = t2.text.encode("UTF-8") 122 | 123 | if "ItemAttributes" in t2.tag: 124 | for t3 in t2: 125 | if "ProductTypeName" in t3.tag: 126 | category = t3.text.encode("UTF-8") 127 | 128 | if "ItemDimensions" in t3.tag: 129 | for t4 in t3: 130 | if "Weight" in t4.tag: 131 | weight = t4.text.encode("UTF-8") 132 | 133 | 134 | if "Title" in t3.tag: 135 | title = t3.text.encode("UTF-8") 136 | 137 | 138 | if "Offers" in t2.tag: 139 | for t5 in t2: 140 | if "MoreOffersUrl" in t5.tag: 141 | link = t5.text 142 | length = len(new_price) 143 | if 
link != "0" and length < 4: 144 | time.sleep(1) 145 | header = ua.random 146 | 147 | headers = {"User-Agent": header} 148 | r = requests.get(link, headers=headers) 149 | page = r.text 150 | soup = BeautifulSoup(page, 'html.parser') 151 | 152 | a = soup.find("a", {"class": "a-link-normal a-text-bold"}) 153 | if a: 154 | url1 = "http://www.amazon.com"+a["href"] 155 | header = ua.random 156 | headers = {"User-Agent": header} 157 | r = requests.get(url1, headers=headers) 158 | page1 = r.text 159 | soup1 = BeautifulSoup(page1, 'html.parser') 160 | else: 161 | soup1 = soup 162 | 163 | n = 1 164 | for div in soup1.find_all("div", {"class": "a-row a-spacing-mini olpOffer"}): 165 | if n < 4: 166 | div1 = div.find("div", {"class": "a-column a-span2"}) 167 | 168 | span = div1.find("span", {"class": "a-size-large a-color-price olpOfferPrice a-text-bold"}) 169 | if span: 170 | price1 = ' '.join(span.text.split()) 171 | new_price.append(price1) 172 | n = n+1 173 | if new_price and title is not None and weight is not None: 174 | l.add_value('title', title) 175 | #l.add_value('upc', upc[0]) 176 | l.add_value('rank', rank) 177 | l.add_value('category', category) 178 | l.add_value('weight', weight) 179 | #l.add_value('walmart_price', price) 180 | l.add_value('amazon_price1', new_price[0]) 181 | try: 182 | if new_price[1]: 183 | l.add_value('amazon_price2', new_price[1]) 184 | except: 185 | price1 = '' 186 | l.add_value('amazon_price2', price1) 187 | 188 | try: 189 | if new_price[2]: 190 | l.add_value('amazon_price3', new_price[2]) 191 | except: 192 | price2 = '' 193 | l.add_value('amazon_price3', price2) 194 | l.add_value('weight', weight) 195 | 196 | amazon_price1 = new_price[0].split("$") 197 | amazon_price1 = float(amazon_price1[1]) 198 | 199 | if "-" in homedepot_price: 200 | price = homedepot_price.split("-") 201 | price1 = float(price[0].split("$")[1]) 202 | price2 = float(price[1].split("$")[1]) 203 | if price2 < amazon_price1: 204 | homedepot_price = price2 205 | else: 206 | homedepot_price = price1 207 | else: 208 | homedepot_price = float(homedepot_price.split("$")[1]) 209 | 210 | weight = float(weight) 211 | 212 | wt_cost = weight * 0.55 213 | l.add_value('wt_cost', wt_cost) 214 | 215 | Tax_Cost = homedepot_price * 0.065 216 | l.add_value('Tax_Cost', Tax_Cost) 217 | 218 | Fees = amazon_price1 * 0.27 219 | l.add_value('Fees', Fees) 220 | 221 | Tot_Cost = homedepot_price + wt_cost + Tax_Cost + Fees 222 | l.add_value('Tot_Cost', Tot_Cost) 223 | 224 | Profit = amazon_price1 - Tot_Cost 225 | l.add_value('Profit', Profit) 226 | 227 | ROI = Profit / (homedepot_price + wt_cost + Tax_Cost) 228 | l.add_value('ROI', ROI) 229 | 230 | yield l.load_item() 231 | 232 | except Exception as e: 233 | print "\n Exception : ", e -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/kohls.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | from scrapy import Spider 5 | from scrapy.http import Request 6 | from scrapy.loader import ItemLoader 7 | from scrapy.utils.response import open_in_browser 8 | 9 | from walmart_spider.items import WalmartItem 10 | 11 | is_empty = lambda x, y="": x[0] if x else y 12 | 13 | class KohlsSpider(Spider): 14 | name = 'kohls' 15 | 16 | allowed_domains = ['www.kohls.com'] 17 | 18 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 19 | 'Gecko/20100101 Firefox/35.0' 20 | 21 | BASE_URL = 'http://www.kohls.com' 22 | 23 | start_urls = 
['http://www.kohls.com/feature/sitemapmain.jsp'] 24 | 25 | def parse(self, response): 26 | links = response.xpath( 27 | '//div[@id="sitemap-content"]/div/ul/li/a[contains(@href,"catalog")]/@href' 28 | ).extract() 29 | 30 | for link in links: 31 | yield Request(url=self.BASE_URL+link, callback=self.parse_product) 32 | 33 | def parse_product(self, response): 34 | 35 | script = response.xpath( 36 | '//script[contains(text(), "pmpSearchJsonData")]' 37 | ).extract()[0].replace('\n', '').strip() 38 | 39 | data = json.loads(re.findall('pmpSearchJsonData = ({.*});', script)[0]) 40 | 41 | for product in data['productInfo']['productList']: 42 | l = ItemLoader(item=WalmartItem(), response=response) 43 | 44 | l.add_value('title', product['productTitle']) 45 | l.add_value('price', product['pricing']['regularPrice']) 46 | 47 | yield l.load_item() 48 | 49 | next_link = is_empty( 50 | response.xpath('//link[@rel="next"]/@href').extract() 51 | ) 52 | if next_link: 53 | yield Request(url=self.BASE_URL+next_link, 54 | callback=self.parse_product) 55 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/target.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import datetime 4 | from scrapy import Spider 5 | from scrapy.http import Request 6 | from walmart_spider.aws_signed_request import aws_signed_request 7 | from scrapy.loader import ItemLoader 8 | from bs4 import BeautifulSoup 9 | import requests 10 | import urllib2 11 | 12 | from walmart_spider.items import WalmartItem 13 | import time 14 | import xml.etree.ElementTree as ET 15 | from fake_useragent import UserAgent 16 | ua = UserAgent() 17 | 18 | is_empty = lambda x, y="": x[0] if x else y 19 | 20 | class TargetSpider(Spider): 21 | name = 'target' 22 | 23 | allowed_domains = ['www.target.com', 'tws.target.com'] 24 | 25 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 26 | 'Gecko/20100101 Firefox/35.0' 27 | 28 | BASE_URL = 'http://www.target.com' 29 | 30 | start_urls = ['http://www.target.com/c/more/-/N-5xsxf#?lnk=ct_menu_12_1&' 31 | 'intc=1865103|null'] 32 | 33 | JSON_SEARCH_URL = "http://tws.target.com/searchservice/item/" \ 34 | "search_results/v2/by_keyword?" 
\ 35 | "callback=getPlpResponse" \ 36 | "&response_group=Items%2CVariationSummary" \ 37 | "&category={category}" \ 38 | "&sort_by=bestselling" \ 39 | "&pageCount=60" \ 40 | "&zone=PLP" \ 41 | "&facets=" \ 42 | "&view_type=medium" \ 43 | "&page={page}" \ 44 | "&offset={index}" \ 45 | "&stateData=" 46 | 47 | def parse(self, response): 48 | categories = response.xpath( 49 | '//ul[@class="innerCol"]/li/a/@href').re('N-(.*)#') 50 | 51 | for category in categories: 52 | new_meta = response.meta.copy() 53 | new_meta['category'] = category 54 | new_meta['next_page'] = 2 55 | new_meta['index'] = new_meta['next_page']*60 56 | 57 | yield Request(url=self.JSON_SEARCH_URL.format(category=category, 58 | page=1, 59 | index=0), 60 | meta=new_meta, 61 | callback=self.parse_product) 62 | 63 | def parse_product(self, response): 64 | data = json.loads( 65 | re.findall('getPlpResponse\((.*)\)', response.body)[0] 66 | ) 67 | 68 | if len(data['searchResponse']['items']['Item']) > 0: 69 | 70 | for product in data['searchResponse']['items']['Item']: 71 | l = ItemLoader(item=WalmartItem(), response=response) 72 | if 'priceSummary' in product.keys(): 73 | l.add_value('upc', product['upc']) 74 | l.add_value('target_price', 75 | product['priceSummary']['offerPrice']['amount']) 76 | upc = product['upc'] 77 | target_price = product['priceSummary']['offerPrice']['amount'] 78 | print "upc : ", upc 79 | print "target_price : ", target_price 80 | 81 | if upc: 82 | region = REGION 83 | public_key = AWS_ACCESS_KEY_ID 84 | private_key = AWS_ACCESS_SECRET_KEY 85 | associate_tag = ASSOCIATE_TAG 86 | params = { 87 | "AWSAccessKeyId": public_key, 88 | "Service": "AWSECommerceService", 89 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 90 | "AssociateTag": associate_tag, 91 | "IdType": "UPC", 92 | "ItemId": upc, 93 | "ResponseGroup": "SalesRank, ItemAttributes, Offers, OfferListings", 94 | "Operation": "ItemLookup", 95 | "SearchIndex": "All" 96 | } 97 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 98 | print url 99 | time.sleep(1) 100 | try: 101 | header = ua.random 102 | headers = {"User-Agent": header} 103 | r = requests.get(url, headers=headers) 104 | content = r.text.encode("UTF-8") 105 | root = ET.fromstring(content) 106 | 107 | detail = [] 108 | details = [] 109 | new_price = [] 110 | 111 | rank = '' 112 | weight = None 113 | title = None 114 | category = '' 115 | 116 | for t in root: 117 | for t1 in t: 118 | for t2 in t1: 119 | if "SalesRank" in t2.tag: 120 | rank = t2.text.encode("UTF-8") 121 | 122 | 123 | if "ItemAttributes" in t2.tag: 124 | for t3 in t2: 125 | if "ProductTypeName" in t3.tag: 126 | category = t3.text.encode("UTF-8") 127 | 128 | 129 | if "ItemDimensions" in t3.tag: 130 | for t4 in t3: 131 | if "Weight" in t4.tag: 132 | weight = t4.text.encode("UTF-8") 133 | 134 | 135 | if "Title" in t3.tag: 136 | title = t3.text.encode("UTF-8") 137 | 138 | 139 | if "Offers" in t2.tag: 140 | for t5 in t2: 141 | if "MoreOffersUrl" in t5.tag: 142 | link = t5.text 143 | print "\n link : ", link 144 | length = len(new_price) 145 | if link != "0" and length < 4: 146 | time.sleep(1) 147 | header = ua.random 148 | headers = {"User-Agent": header} 149 | 150 | r = requests.get(link, headers=headers) 151 | page = r.text 152 | soup = BeautifulSoup(page, 'html.parser') 153 | 154 | a = soup.find("a", {"class": "a-link-normal a-text-bold"}) 155 | if a: 156 | url1 = "http://www.amazon.com"+a["href"] 157 | header = ua.random 158 | headers = {"User-Agent": 
header} 159 | r = requests.get(url1, headers=headers) 160 | page1 = r.text 161 | soup1 = BeautifulSoup(page1, 'html.parser') 162 | else: 163 | soup1 = soup 164 | 165 | n = 1 166 | for div in soup1.find_all("div", {"class": "a-row a-spacing-mini olpOffer"}): 167 | if n < 4: 168 | div1 = div.find("div", {"class": "a-column a-span2"}) 169 | 170 | span = div1.find("span", {"class": "a-size-large a-color-price olpOfferPrice a-text-bold"}) 171 | if span: 172 | price = ' '.join(span.text.split()) 173 | new_price.append(price) 174 | n = n+1 175 | 176 | if new_price and title is not None and weight is not None: 177 | l.add_value('title', title) 178 | 179 | #l.add_value('upc', upc[0]) 180 | l.add_value('rank', rank) 181 | 182 | l.add_value('category', category) 183 | 184 | l.add_value('weight', weight) 185 | 186 | #l.add_value('walmart_price', price) 187 | l.add_value('amazon_price1', new_price[0]) 188 | 189 | try: 190 | if new_price[1]: 191 | l.add_value('amazon_price2', new_price[1]) 192 | except: 193 | price1 = '' 194 | l.add_value('amazon_price2', price1) 195 | 196 | try: 197 | if new_price[2]: 198 | l.add_value('amazon_price3', new_price[2]) 199 | except: 200 | price2 = '' 201 | l.add_value('amazon_price3', price2) 202 | l.add_value('weight', weight) 203 | 204 | amazon_price1 = new_price[0].split("$") 205 | amazon_price1 = float(amazon_price1[1]) 206 | 207 | if "-" in target_price: 208 | price = target_price.split("-") 209 | price1 = float(price[0].split("$")[1]) 210 | price2 = float(price[1].split("$")[1]) 211 | if price2 < amazon_price1: 212 | target_price = price2 213 | else: 214 | target_price = price1 215 | else: 216 | target_price = float(target_price.split("$")[1]) 217 | 218 | 219 | weight = float(weight) 220 | 221 | wt_cost = weight * 0.55 222 | l.add_value('wt_cost', wt_cost) 223 | 224 | 225 | Tax_Cost = target_price * 0.065 226 | l.add_value('Tax_Cost', Tax_Cost) 227 | 228 | Fees = amazon_price1 * 0.27 229 | l.add_value('Fees', Fees) 230 | 231 | 232 | Tot_Cost = target_price + wt_cost + Tax_Cost + Fees 233 | l.add_value('Tot_Cost', Tot_Cost) 234 | 235 | 236 | Profit = amazon_price1 - Tot_Cost 237 | l.add_value('Profit', Profit) 238 | 239 | 240 | ROI = Profit / (target_price + wt_cost + Tax_Cost) 241 | l.add_value('ROI', ROI) 242 | 243 | yield l.load_item() 244 | except: 245 | pass 246 | 247 | page = response.meta['next_page'] 248 | category = response.meta['category'] 249 | index = response.meta['index'] 250 | new_meta = response.meta.copy() 251 | new_meta['next_page'] = page + 1 252 | new_meta['index'] = new_meta['next_page']*60 253 | yield Request(url=self.JSON_SEARCH_URL.format(category=category, 254 | page=page, 255 | index=index), 256 | meta=new_meta, 257 | callback=self.parse_product) 258 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/walmart.py: -------------------------------------------------------------------------------- 1 | import re 2 | import datetime 3 | from walmart_spider.aws_signed_request import aws_signed_request 4 | from scrapy import Spider 5 | from scrapy.http import Request, FormRequest 6 | from scrapy.loader import ItemLoader 7 | from scrapy.log import WARNING, DEBUG, INFO, ERROR 8 | 9 | import requests 10 | from walmart_spider.items import WalmartItem 11 | import urllib2 12 | from bs4 import BeautifulSoup 13 | import time 14 | import xml.etree.ElementTree as ET 15 | import sys 16 | from fake_useragent import UserAgent 17 | ua = UserAgent() 18 | 19 | is_empty = lambda x, y="": x[0] if x 
else y 20 | 21 | # try: 22 | # from captcha_solver import CaptchaBreakerWrapper 23 | # except Exception as e: 24 | # print '!!!!!!!!Captcha breaker is not available due to: %s' % e 25 | # 26 | # class CaptchaBreakerWrapper(object): 27 | # @staticmethod 28 | # def solve_captcha(url): 29 | # msg("CaptchaBreaker in not available for url: %s" % url, 30 | # level=WARNING) 31 | # return None 32 | 33 | 34 | class WalmartSpider(Spider): 35 | name = 'walmart' 36 | 37 | allowed_domains = ['www.walmart.com', 'www.amazon.com'] 38 | start_urls = ['http://www.walmart.com/all-departments'] 39 | 40 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 41 | 'Gecko/20100101 Firefox/35.0' 42 | 43 | BASE_URL = 'http://www.walmart.com' 44 | 45 | AMAZON_SEARCH_URL = "http://www.amazon.com/s/ref=nb_sb_noss?" \ 46 | "field-keywords={upc}" 47 | 48 | # def __init__(self, captcha_retries='10', *args, **kwargs): 49 | # self.captcha_retries = int(captcha_retries) 50 | # self._cbw = CaptchaBreakerWrapper() 51 | # super(WalmartSpider, self).__init__(*args, **kwargs) 52 | 53 | def start_requests(self): 54 | headers = {"Accept": "*/*", 55 | "Accept-Encoding": "gzip, deflate", 56 | "User-Agent": "runscope/0.1"} 57 | yield Request(url=self.start_urls[0], headers=headers, 58 | callback=self.parse_category) 59 | 60 | # def parse_captcha(self, response): 61 | # if self._has_captcha(response): 62 | # result = self._handle_captcha(response, self.parse_captcha) 63 | # else: 64 | # result = self.parse_without_captcha(response) 65 | # return result 66 | # 67 | # def parse_without_captcha(self, response): 68 | # links = response.xpath( 69 | # '//a[@class="all-depts-links-category"]/@href' 70 | # ).extract() 71 | # 72 | # for link in links: 73 | # print '-'*25, link, '-'*25 74 | # yield Request(url=self.BASE_URL+link, callback=self.parse_product) 75 | 76 | def parse_category(self, response): 77 | links = response.xpath( 78 | '//a[@class="all-depts-links-category"]/@href' 79 | ).extract() 80 | 81 | for link in links: 82 | print '-'*25, link, '-'*25 83 | yield Request(url=self.BASE_URL+link, callback=self.parse_product) 84 | 85 | def parse_product(self, response): 86 | 87 | product_links = response.xpath( 88 | '//ul[@class="tile-list tile-list-grid"]/li/div/' 89 | 'a[@class="js-product-title"]/@href' 90 | ).extract() 91 | 92 | for product_link in product_links: 93 | yield Request(url=self.BASE_URL+product_link, callback=self.parse) 94 | 95 | next_link = is_empty(response.xpath( 96 | '//a[@class="paginator-btn paginator-btn-next"]/@href' 97 | ).extract()) 98 | if next_link: 99 | next_link = re.sub( 100 | '\?.*', next_link, response.url, flags=re.IGNORECASE 101 | ) 102 | 103 | yield Request(url=next_link, callback=self.parse_product) 104 | 105 | def parse(self, response): 106 | l = ItemLoader(item=WalmartItem(), response=response) 107 | upc = response.xpath('//meta[@property="og:upc"]/@content').extract() 108 | price = ''.join( 109 | response.xpath( 110 | '//div[@itemprop="price"][1]/text() |' 111 | ' //div[@itemprop="price"][1]/*/text()' 112 | ).extract() 113 | ) 114 | 115 | if not price: 116 | script = is_empty(response.xpath( 117 | '//script[contains(text(), "productSellersMap")]/text()' 118 | ).extract()) 119 | price = is_empty( 120 | re.findall('\"currencyAmount\":([\d+,]?\d+.\d+)', script) 121 | ) 122 | if not price: 123 | script = is_empty( 124 | response.xpath( 125 | '//script[contains(text(),"item_price")]/text()' 126 | ).extract() 127 | ) 128 | price = is_empty( 129 | 
re.findall("item_price\',\'(\$[\d+,]?\d+.\d+)\'", script) 130 | ) 131 | else: 132 | price = '$' + price 133 | 134 | l.add_value('upc', is_empty(upc)) 135 | if price: 136 | l.add_value('walmart_price', price.replace(" ", '')) 137 | 138 | if upc: 139 | region = REGION 140 | public_key = AWS_ACCESS_KEY_ID 141 | private_key = AWS_ACCESS_SECRET_KEY 142 | associate_tag = ASSOCIATE_TAG 143 | 144 | params = { 145 | "AWSAccessKeyId": public_key, 146 | "Service": "AWSECommerceService", 147 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 148 | "AssociateTag": associate_tag, 149 | "IdType": "UPC", 150 | "ItemId": upc[0], 151 | "ResponseGroup": "SalesRank, ItemAttributes, Offers, OfferListings", 152 | "Operation": "ItemLookup", 153 | "SearchIndex": "All" 154 | } 155 | 156 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 157 | 158 | print url 159 | 160 | #time.sleep(1) 161 | try: 162 | header = ua.random 163 | headers = {"User-Agent": header} 164 | r = requests.get(url, headers=headers) 165 | content = r.text.encode("UTF-8") 166 | root = ET.fromstring(content) 167 | 168 | detail = [] 169 | details = [] 170 | new_price = [] 171 | 172 | rank = '' 173 | weight = None 174 | title = None 175 | category = '' 176 | 177 | for t in root: 178 | for t1 in t: 179 | for t2 in t1: 180 | if "SalesRank" in t2.tag: 181 | rank = t2.text.encode("UTF-8") 182 | 183 | if "ItemAttributes" in t2.tag: 184 | for t3 in t2: 185 | if "ProductTypeName" in t3.tag: 186 | category = t3.text.encode("UTF-8") 187 | 188 | if "ItemDimensions" in t3.tag: 189 | for t4 in t3: 190 | if "Weight" in t4.tag: 191 | weight = t4.text.encode("UTF-8") 192 | 193 | if "Title" in t3.tag: 194 | title = t3.text.encode("UTF-8") 195 | 196 | if "Offers" in t2.tag: 197 | for t5 in t2: 198 | if "MoreOffersUrl" in t5.tag: 199 | link = t5.text 200 | length = len(new_price) 201 | if link != "0" and length < 4: 202 | #time.sleep(1) 203 | header = ua.random 204 | headers = {"User-Agent": header} 205 | r = requests.get(link, headers=headers) 206 | page = r.text 207 | soup = BeautifulSoup(page, 'html.parser') 208 | 209 | a = soup.find("a", {"class": "a-link-normal a-text-bold"}) 210 | if a: 211 | url1 = "http://www.amazon.com"+a["href"] 212 | header = ua.random 213 | headers = {"User-Agent": header} 214 | r = requests.get(url1, headers=headers) 215 | page1 = r.text 216 | soup1 = BeautifulSoup(page1, 'html.parser') 217 | else: 218 | soup1 = soup 219 | 220 | n = 1 221 | for div in soup1.find_all("div", {"class": "a-row a-spacing-mini olpOffer"}): 222 | if n < 4: 223 | div1 = div.find("div", {"class": "a-column a-span2"}) 224 | 225 | span = div1.find("span", {"class": "a-size-large a-color-price olpOfferPrice a-text-bold"}) 226 | if span: 227 | price = ' '.join(span.text.split()) 228 | new_price.append(price) 229 | n = n+1 230 | 231 | if new_price and title is not None and weight is not None: 232 | l.add_value('title', title) 233 | #l.add_value('upc', upc[0]) 234 | l.add_value('rank', rank) 235 | l.add_value('category', category) 236 | l.add_value('weight', weight) 237 | #l.add_value('walmart_price', price) 238 | l.add_value('amazon_price1', new_price[0]) 239 | try: 240 | if new_price[1]: 241 | l.add_value('amazon_price2', new_price[1]) 242 | except: 243 | price1 = '' 244 | l.add_value('amazon_price2', price1) 245 | 246 | try: 247 | if new_price[2]: 248 | l.add_value('amazon_price3', new_price[2]) 249 | except: 250 | price2 = '' 251 | l.add_value('amazon_price3', price2) 252 | 
l.add_value('weight', weight) 253 | 254 | if "-" in price: 255 | w_price = price.split("-") 256 | price1 = float(w_price[0].split("$")[1]) 257 | price2 = float(w_price[1].split("$")[1]) 258 | if price2 < amazon_price1: 259 | walmart_price = price2 260 | else: 261 | walmart_price = price1 262 | else: 263 | walmart_price = float(price.split("$")[1]) 264 | 265 | weight = float(weight) 266 | amazon_price1 = new_price[0].split("$") 267 | amazon_price1 = float(amazon_price1[1]) 268 | 269 | wt_cost = weight * 0.55 270 | l.add_value('wt_cost', wt_cost) 271 | 272 | Tax_Cost = walmart_price * 0.065 273 | l.add_value('Tax_Cost', Tax_Cost) 274 | 275 | Fees = amazon_price1 * 0.27 276 | l.add_value('Fees', Fees) 277 | 278 | Tot_Cost = walmart_price + wt_cost + Tax_Cost + Fees 279 | l.add_value('Tot_Cost', Tot_Cost) 280 | 281 | Profit = amazon_price1 - Tot_Cost 282 | l.add_value('Profit', Profit) 283 | 284 | ROI = Profit / (walmart_price + wt_cost + Tax_Cost) 285 | l.add_value('ROI', ROI) 286 | yield l.load_item() 287 | except: 288 | pass 289 | # if upc: 290 | # new_meta = response.meta.copy() 291 | # new_meta['item'] = l 292 | # yield Request(url=self.AMAZON_SEARCH_URL.format(upc=upc[0]), 293 | # meta=new_meta, callback=self.parse_amazon_category) 294 | # else: 295 | 296 | #yield l.load_item() 297 | 298 | #TODO: handling amazon 299 | # def parse_amazon_category(self, response): 300 | # if self._has_captcha(response): 301 | # yield self._handle_captcha(response, self.parse_amazon_category) 302 | # else: 303 | # link = response.xpath('//a[@class="a-link-normal s-access-detail-page' 304 | # ' a-text-normal"]/@href').extract() 305 | # if link: 306 | # new_meta = response.meta.copy() 307 | # new_meta['item'] = response.meta['item'] 308 | # yield Request(url=link[0], meta=new_meta, 309 | # callback=self.parse_amazon_product) 310 | # 311 | # def parse_amazon_product(self, response): 312 | # if self._has_captcha(response): 313 | # yield self._handle_captcha(response, self.parse_amazon_product) 314 | # else: 315 | # l = response.meta['item'] 316 | # title = response.xpath( 317 | # '//span[@id="productTitle"]/text()' 318 | # ).extract() 319 | # l.add_value('title', is_empty(title)) 320 | # 321 | # amazon_price = response.xpath( 322 | # '//span[@id="priceblock_ourprice"]/text()' 323 | # ).extract() 324 | # l.add_value('amazon_price', is_empty(amazon_price)) 325 | # 326 | # weight = is_empty(response.xpath( 327 | # '//div[@class="content"]/ul/li/b[contains(text(),' 328 | # ' "Weight")]/following::text() | ' 329 | # '//table[@id="productDetails_detailBullets_sections1"]/' 330 | # 'tr[contains(.,"Weight")]/td/text()' 331 | # ).extract(),'').replace('(', '').strip() 332 | # l.add_value('weight', weight) 333 | # 334 | # rank_category = response.xpath( 335 | # '//li[@id="SalesRank"]/text() |' 336 | # '//table[@id="productDetails_detailBullets_sections1"]' 337 | # '/tr[contains(.,"Best Seller")]/td' 338 | # ).re('#(\d+[,\d+]*) in (.*) \(') 339 | # if rank_category: 340 | # l.add_value('rank', rank_category[0]) 341 | # l.add_value('category', rank_category[1]) 342 | # else: 343 | # category = response.xpath( 344 | # '//div[@id="wayfinding-breadcrumbs_feature_div"]/ul' 345 | # '/li[1]/span/a/text()' 346 | # ).extract() 347 | # if category: 348 | # category = category[0].strip() 349 | # l.add_value('category', category) 350 | # 351 | # yield l.load_item() 352 | # 353 | # # Captcha handling functions. 
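# (Flow of the commented-out helpers below: _has_captcha looks for an Amazon captcha image
# in the response body, _solve_captcha extracts the captcha image URL from the page's form
# and passes it to CaptchaBreakerWrapper from spiders/captcha_solver.py, and _handle_captcha
# re-submits the form with the guessed text, tracking the retry count in response.meta.)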
354 | # def _has_captcha(self, response): 355 | # return '.images-amazon.com/captcha/' in response.body_as_unicode() 356 | # 357 | # def _solve_captcha(self, response): 358 | # forms = response.xpath('//form') 359 | # assert len(forms) == 1, "More than one form found." 360 | # 361 | # captcha_img = forms[0].xpath( 362 | # '//img[contains(@src, "/captcha/")]/@src').extract()[0] 363 | # 364 | # self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG) 365 | # return self._cbw.solve_captcha(captcha_img) 366 | # 367 | # def _handle_captcha(self, response, callback): 368 | # captcha_solve_try = response.meta.get('captcha_solve_try', 0) 369 | # url = response.url 370 | # self.log("Captcha challenge for %s (try %d)." 371 | # % (url, captcha_solve_try), 372 | # level=INFO) 373 | # 374 | # captcha = self._solve_captcha(response) 375 | # 376 | # if captcha is None: 377 | # self.log( 378 | # "Failed to guess captcha for '%s' (try: %d)." % ( 379 | # url, captcha_solve_try), 380 | # level=ERROR 381 | # ) 382 | # result = None 383 | # else: 384 | # self.log( 385 | # "On try %d, submitting captcha '%s' for '%s'." % ( 386 | # captcha_solve_try, captcha, url), 387 | # level=INFO 388 | # ) 389 | # meta = response.meta.copy() 390 | # meta['captcha_solve_try'] = captcha_solve_try + 1 391 | # result = FormRequest.from_response( 392 | # response, 393 | # formname='', 394 | # formdata={'field-keywords': captcha}, 395 | # callback=callback, 396 | # dont_filter=True, 397 | # meta=meta) 398 | # 399 | # return result 400 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/test.py: -------------------------------------------------------------------------------- 1 | from aws_signed_request import aws_signed_request 2 | import datetime 3 | import requests 4 | 5 | # Service=AWSECommerceService& 6 | # AWSAccessKeyId=AKIAJCAIVLPWYX553QKA& 7 | # AssociateTag=esfera01-20& 8 | # Operation=ItemSearch& 9 | # Keywords=horse,bridle& 10 | # SearchIndex=PetSupplies,SportingGoods& 11 | # Timestamp={timestamp}& 12 | # Signature=[Request Signature] 13 | 14 | 15 | # http://webservices.amazon.com/onca/xml? 
16 | # Service=AWSECommerceService& 17 | # AWSAccessKeyId=[AWS Access Key ID]& 18 | # AssociateTag=[Associate ID]& 19 | # Operation=ItemSearch& 20 | # Keywords=Potter& 21 | # SearchIndex=Books& 22 | # ItemPage=4 23 | # &Timestamp=[YYYY-MM-DDThh:mm:ssZ] 24 | # &Signature=[Request Signature] 25 | 26 | region = "com" 27 | public_key = "AKIAJCAIVLPWYX553QKA" 28 | private_key = "VNCDZ5l0IEUqJIrr/0wuh1Cyj+ZxfbA/42d3Cu/a" 29 | associate_tag = "esfera01-20" 30 | 31 | params = { 32 | "AWSAccessKeyId": public_key, 33 | "Service": "AWSECommerceService", 34 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 35 | "AssociateTag": associate_tag, 36 | "Operation": "ItemSearch", 37 | #"ItemPage": "1", 38 | "SearchIndex": "Electronics", 39 | #"Keywords": "Electronics" 40 | } 41 | 42 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 43 | print "url : ", url 44 | 45 | # response = requests.get(url) 46 | # content = response.text 47 | 48 | # print "\n content : ", content -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_images.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_images.npy -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_labels.npy --------------------------------------------------------------------------------