├── .gitignore ├── README.md ├── requirements.txt └── walmart_spider ├── new_walmart_products_1.csv ├── scrapy.cfg ├── walmart_products_1.csv └── walmart_spider ├── __init__.py ├── aws_signed_request.py ├── items.py ├── match.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders ├── __init__.py ├── captcha_solver.py ├── categories.py ├── homedepot.py ├── kohls.py ├── target.py └── walmart.py ├── test.py └── train_captchas_data ├── train_captchas_data_images.npy └── train_captchas_data_labels.npy /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | *.pyc 12 | venv 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | # IPython Notebook 65 | .ipynb_checkpoints 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # walmart 2 | Scrapy spiders that scrape product data from the Walmart, Target, Home Depot and Kohl's websites and look up matching listings through the Amazon Product Advertising API 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.2.1 2 | beautifulsoup4==4.4.1 3 | bs4==0.0.1 4 | fake-useragent==0.0.8 5 | requests==2.9.1 6 | wsgiref==0.1.2 7 | -------------------------------------------------------------------------------- /walmart_spider/new_walmart_products_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/new_walmart_products_1.csv -------------------------------------------------------------------------------- /walmart_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = walmart_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = walmart_spider 12 | -------------------------------------------------------------------------------- /walmart_spider/walmart_products_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_products_1.csv --------------------------------------------------------------------------------
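This is a standard Scrapy project layout, so the spiders are normally launched from the walmart_spider/ directory that contains scrapy.cfg (for example with `scrapy crawl walmart` on the command line). The snippet below is a minimal sketch of running a spider programmatically, assuming Scrapy is installed; the spider names come from the `name` attributes in spiders/, and CSVExportPipeline then writes its output to files such as walmart_products_1.csv.

# Minimal run sketch (assumption: executed from the directory containing scrapy.cfg).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads walmart_spider.settings
process.crawl('walmart')   # other spider names: 'target', 'homedepot', 'kohls', 'categories'
process.start()            # blocks until the crawl finishes; CSVs come from CSVExportPipeline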
/walmart_spider/walmart_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_spider/__init__.py -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/aws_signed_request.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import time 4 | import urllib 5 | import base64 6 | import hmac 7 | import hashlib 8 | 9 | def aws_signed_request(region, params, public_key, private_key, associate_tag=None, version='2011-08-01'): 10 | 11 | """ 12 | Copyright (c) 2010-2012 Ulrich Mierendorff 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a 15 | copy of this software and associated documentation files (the "Software"), 16 | to deal in the Software without restriction, including without limitation 17 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 18 | and/or sell copies of the Software, and to permit persons to whom the 19 | Software is furnished to do so, subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be included in 22 | all copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 27 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 29 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | DEALINGS IN THE SOFTWARE. 31 | """ 32 | 33 | """ 34 | Parameters: 35 | region - the Amazon(r) region (ca,com,co.uk,de,fr,co.jp) 36 | params - a dictionary of parameters, for example 37 | {'Operation': 'ItemLookup', 38 | 'ItemId': 'B000X9FLKM', 39 | 'ResponseGroup': 'Small'} 40 | public_key - your "Access Key ID" 41 | private_key - your "Secret Access Key" 42 | version [optional] 43 | """ 44 | 45 | # some paramters 46 | method = 'GET' 47 | host = 'webservices.amazon.' + region 48 | uri = '/onca/xml' 49 | 50 | # additional parameters 51 | params['Service'] = 'AWSECommerceService' 52 | params['AWSAccessKeyId'] = public_key 53 | params['Timestamp'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) 54 | params['Version'] = version 55 | if associate_tag: 56 | params['AssociateTag'] = associate_tag 57 | 58 | # create the canonicalized query 59 | canonicalized_query = [urllib.quote(param).replace('%7E', '~') + '=' + urllib.quote(params[param]).replace('%7E', '~') 60 | for param in sorted(params.keys())] 61 | canonicalized_query = '&'.join(canonicalized_query) 62 | 63 | # create the string to sign 64 | string_to_sign = method + '\n' + host + '\n' + uri + '\n' + canonicalized_query; 65 | 66 | # calculate HMAC with SHA256 and base64-encoding 67 | signature = base64.b64encode(hmac.new(key=private_key, msg=string_to_sign, digestmod=hashlib.sha256).digest()) 68 | 69 | # encode the signature for the request 70 | signature = urllib.quote(signature).replace('%7E', '~') 71 | 72 | return 'http://' + host + uri + '?' 
+ canonicalized_query + '&Signature=' + signature 73 | 74 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.loader.processors import TakeFirst 4 | 5 | class WalmartItem(scrapy.Item): 6 | title = scrapy.Field(output_processor=TakeFirst()) 7 | upc = scrapy.Field(output_processor=TakeFirst()) 8 | rank = scrapy.Field(output_processor=TakeFirst()) 9 | category = scrapy.Field(output_processor=TakeFirst()) 10 | walmart_price = scrapy.Field(output_processor=TakeFirst()) 11 | homedepot_price = scrapy.Field(output_processor=TakeFirst()) 12 | target_price = scrapy.Field(output_processor=TakeFirst()) 13 | amazon_price1 = scrapy.Field(output_processor=TakeFirst()) 14 | amazon_price2 = scrapy.Field(output_processor=TakeFirst()) 15 | amazon_price3 = scrapy.Field(output_processor=TakeFirst()) 16 | weight = scrapy.Field(output_processor=TakeFirst()) 17 | wt_cost = scrapy.Field(output_processor=TakeFirst()) 18 | Tax_Cost = scrapy.Field(output_processor=TakeFirst()) 19 | Fees = scrapy.Field(output_processor=TakeFirst()) 20 | Tot_Cost = scrapy.Field(output_processor=TakeFirst()) 21 | Profit = scrapy.Field(output_processor=TakeFirst()) 22 | ROI = scrapy.Field(output_processor=TakeFirst()) 23 | 24 | 25 | class CategoryItem(scrapy.Item): 26 | category = scrapy.Field(output_processor=TakeFirst()) 27 | top5 = scrapy.Field(output_processor=TakeFirst()) 28 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/match.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from operator import itemgetter 3 | 4 | def get_upc_from_file(filename): 5 | with open(filename, mode='r') as infile: 6 | reader = csv.DictReader(infile) 7 | all_upc = [row for row in reader] 8 | return all_upc 9 | 10 | def get_categories_from_file(filename): 11 | with open(filename, mode='r') as infile: 12 | reader = csv.DictReader(infile) 13 | categories = [row for row in reader] 14 | return categories 15 | 16 | def fill_acceptable_rank(all_upc, categories): 17 | for item in all_upc: 18 | if item['group']: 19 | for cat in categories: 20 | if item['group'] in cat['category']: 21 | item['acceptable_rank'] = cat['top5'] 22 | return all_upc 23 | 24 | def remove_filds_with_rank_greater_then_acceptable_rank(all_upc): 25 | items_for_delete = [] 26 | for item in all_upc: 27 | if item['rank'] and item['acceptable_rank']: 28 | if int(item['rank'].replace(',', '')) > int(item['acceptable_rank']): 29 | items_for_delete.append(item) 30 | 31 | for item in items_for_delete: 32 | all_upc.remove(item) 33 | 34 | return all_upc 35 | 36 | def fill_weight_cost(all_upc): 37 | for item in all_upc: 38 | if item['weight']: 39 | item['weight_cost'] = str(float(item['weight'].replace(',', '')) * 0.75) 40 | 41 | return all_upc 42 | 43 | def fill_ROI(all_upc): 44 | for item in all_upc: 45 | if item['net_payout'] and item['weight_cost'] and item['cost']: 46 | item['ROI'] = \ 47 | float(item['net_payout'].replace('$', '').replace(',', '')) - \ 48 | (float(item['cost'].replace('$', '').replace(',', ''))*0.7) - \ 49 | float(item['weight_cost']) 50 | 51 | return all_upc 52 | 53 | def remove_negative_ROI(all_upc): 54 | items_for_delete = [] 55 | for item in all_upc: 56 | if item['ROI'] < 0.0: 57 | items_for_delete.append(item) 58 | 59 | for item in 
items_for_delete: 60 | all_upc.remove(item) 61 | return all_upc 62 | 63 | def sort_by_field(all_upc, field): 64 | return sorted(all_upc, key=itemgetter(field)) 65 | 66 | def save_to_file(filename, all_upc): 67 | keys = all_upc[0].keys() 68 | with open(filename, 'wb') as output_file: 69 | writer = csv.DictWriter(output_file, keys) 70 | writer.writeheader() 71 | writer.writerows(all_upc) 72 | 73 | 74 | if __name__ == '__main__': 75 | for i in range(1, 4): 76 | all_upc = get_upc_from_file('walmart_products_%s_new.csv' % str(i)) 77 | categories = get_categories_from_file('categories.csv') 78 | 79 | all_upc = fill_acceptable_rank(all_upc, categories) 80 | all_upc = remove_filds_with_rank_greater_then_acceptable_rank(all_upc) 81 | all_upc = fill_weight_cost(all_upc) 82 | all_upc = fill_ROI(all_upc) 83 | all_upc = remove_negative_ROI(all_upc) 84 | all_upc = sort_by_field(all_upc, 'ROI') 85 | 86 | save_to_file('walmart_new_%s.csv' % str(i), all_upc) 87 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import random 3 | from scrapy.conf import settings 4 | 5 | class ProxyMiddleware(object): 6 | def process_request(self, request, spider): 7 | request.meta['proxy'] = "http://23.81.251.102:29842" 8 | 9 | proxy_user_pass = "dsudom:43FVYMRy" 10 | encoded_user_pass = base64.encodestring(proxy_user_pass) 11 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 12 | 13 | 14 | class RandomUserAgentMiddleware(object): 15 | def process_request(self, request, spider): 16 | ua = random.choice(settings.get('USER_AGENT_LIST')) 17 | if ua: 18 | request.headers.setdefault('User-Agent', ua) -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exporters import CsvItemExporter 3 | 4 | class CSVExportPipeline(object): 5 | counter = 0 6 | file_count = 1 7 | 8 | def __init__(self): 9 | self.files = {} 10 | 11 | @classmethod 12 | def from_crawler(cls, crawler): 13 | pipeline = cls() 14 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 15 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 16 | return pipeline 17 | 18 | def spider_opened(self, spider): 19 | if spider.name != 'categories': 20 | file = open('%s_products_%s.csv' % (spider.name, self.file_count), 21 | 'w+b') 22 | self.files[spider] = file 23 | tag = spider.name + "_price" 24 | self.export_fields = ['title', 'upc', 'rank', 'category', tag, 'amazon_price1', 'amazon_price2', 'amazon_price3', 'weight', 'wt_cost', 'Tax_Cost', 'Fees', 'Tot_Cost', 'Profit', 'ROI'] 25 | self.exporter = CsvItemExporter(file, fields_to_export=self.export_fields) 26 | self.exporter.start_exporting() 27 | 28 | def spider_closed(self, spider): 29 | if spider.name != 'categories': 30 | self.exporter.finish_exporting() 31 | file = self.files.pop(spider) 32 | file.close() 33 | 34 | def process_item(self, item, spider): 35 | if spider.name != 'categories': 36 | tag = spider.name + "_price" 37 | price = item.get(tag, None) 38 | upc = item.get('upc', None) 39 | if price and upc: 40 | self.counter += 1 41 | if self.counter == 10000: 42 | self.exporter.finish_exporting() 43 | file = self.files.pop(spider) 44 | file.close() 45 | 46 | self.file_count += 1 47 | 
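# Rotate the export target: the previous CSV just reached 10,000 exported items, so open
# <spider.name>_products_<file_count>.csv as the next output file and rebuild the exporter
# with the same field list.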
48 | file = open('%s_products_%s.csv' % (spider.name, 49 | self.file_count), 50 | 'w+b') 51 | self.files[spider] = file 52 | self.export_fields = ['title', 'upc', 'rank', 'category', tag, 'amazon_price1', 'amazon_price2', 'amazon_price3', 'weight', 'wt_cost', 'Tax_Cost', 'Fees', 'Tot_Cost', 'Profit', 'ROI'] 53 | self.exporter = CsvItemExporter(file, fields_to_export=self.export_fields) 54 | self.exporter.start_exporting() 55 | 56 | self.counter = 0 57 | print self.counter, '*'*50 58 | self.exporter.export_item(item) 59 | return item 60 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for walmart_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'walmart_spider' 13 | 14 | SPIDER_MODULES = ['walmart_spider.spiders'] 15 | NEWSPIDER_MODULE = 'walmart_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'walmart_spider (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | 40 | # Enable or disable spider middlewares 41 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 42 | #SPIDER_MIDDLEWARES = { 43 | # 'walmart_spider.middlewares.MyCustomSpiderMiddleware': 543, 44 | #} 45 | 46 | # Enable or disable downloader middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 48 | # USER_AGENT_LIST = [ 49 | # 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7(KHTML, like Gecko) ' 50 | # 'Chrome/16.0.912.36 Safari/535.7', 51 | # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0)Gecko/16.0 Firefox/16.0', 52 | # 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3' 53 | # '(KHTML, like Gecko) Version/5.1.3 Safari/534.53.10' 54 | # ] 55 | 56 | 57 | USER_AGENT_LIST = [ 58 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 59 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 60 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 61 | "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0", 62 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0", 63 | "Mozilla/5.0 (Windows NT 6.4; WOW64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 64 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36", 65 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36", 66 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 67 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36", 68 | "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0", 69 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 70 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 71 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0", 72 | "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 73 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36", 74 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 75 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0", 76 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)", 77 | ] 78 | 79 | 80 | # DOWNLOADER_MIDDLEWARES = { 81 | # 'walmart_spider.middlewares.ProxyMiddleware': 410, 82 | # } 83 | 84 | DOWNLOADER_MIDDLEWARES = { 85 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 86 | 'walmart_spider.middlewares.RandomUserAgentMiddleware': 400, 87 | } 88 | 89 | # Enable or disable extensions 90 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 91 | #EXTENSIONS = { 92 | # 'scrapy.telnet.TelnetConsole': None, 93 | #} 94 | 95 | # Configure item pipelines 96 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 97 | 98 | ITEM_PIPELINES = { 99 | 'walmart_spider.pipelines.CSVExportPipeline': 300, 100 | } 101 | 102 | FEED_EXPORTERS = { 103 | 'csv': 'walmart_spider.pipelines.CSVExportPipeline', 104 | } 105 | 106 | CONCURRENT_REQUESTS=100 107 | 108 | CONCURRENT_REQUESTS_PER_IP=16 109 | 110 | # Enable and configure the AutoThrottle extension (disabled by default) 111 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 112 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 113 | AUTOTHROTTLE_ENABLED=True 114 | # The initial download delay 115 | #AUTOTHROTTLE_START_DELAY=5 116 | # The maximum download delay to be set in case of high latencies 117 | #AUTOTHROTTLE_MAX_DELAY=60 118 | # Enable showing throttling stats for every response received: 119 | AUTOTHROTTLE_DEBUG=True 120 | 121 | # Enable and configure HTTP caching (disabled by default) 122 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 123 | #HTTPCACHE_ENABLED=True 124 | #HTTPCACHE_EXPIRATION_SECS=0 125 | #HTTPCACHE_DIR='httpcache' 126 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 127 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 128 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # 
Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/captcha_solver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | try: 3 | import numpy as np 4 | import cv2 5 | except Exception as e: 6 | print '!!!!!!!!Captcha breaker is not available due to: %s' % e 7 | class CaptchaBreakerWrapper(object): 8 | @staticmethod 9 | def solve_captcha(url): 10 | msg("CaptchaBreaker in not available for url: %s" % url, 11 | level=WARNING) 12 | return None 13 | 14 | import sys 15 | import os 16 | import re 17 | 18 | import urllib 19 | 20 | 21 | class CaptchaBreaker: 22 | 23 | HEIGHT = 50 24 | WIDTH = 50 25 | 26 | ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 27 | 28 | 29 | knn = None 30 | 31 | def __init__(self, train_data, output_train_data_file=None, from_dir=False): 32 | if from_dir: 33 | self.knn = self.train_from_dir(train_data, output_train_data_file) 34 | else: 35 | self.knn = self.train_from_file(train_data) 36 | 37 | def letter_to_number(self, letter): 38 | return self.ALPHABET.index(letter) 39 | 40 | def number_to_letter(self, number): 41 | return self.ALPHABET[number] 42 | 43 | def clean_image(self, image, trim=False): 44 | 45 | gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY) 46 | thresh = cv2.adaptiveThreshold( 47 | gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,1,11,2) 48 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3)) 49 | eroded = cv2.erode(thresh,kernel,iterations = 1) 50 | contours,hierarchy = cv2.findContours(eroded,cv2.RETR_EXTERNAL, 51 | cv2.CHAIN_APPROX_NONE) 52 | [x,y,w,h] = cv2.boundingRect(contours[0]) 53 | roi = thresh[y:y+h,x:x+w] 54 | if trim: 55 | ret = self.add_borders(roi) 56 | else: 57 | ret = self.add_borders(thresh) 58 | return ret 59 | 60 | def add_borders(self, image): 61 | height, width = image.shape 62 | width_pad = (self.WIDTH - width) / 2.0 63 | left_pad = int(width_pad) 64 | if (left_pad != width_pad): 65 | right_pad = left_pad+1 66 | else: 67 | right_pad = left_pad 68 | 69 | height_pad = (self.HEIGHT - height) / 2.0 70 | top_pad = int(height_pad) 71 | if (top_pad!=height_pad): 72 | bottom_pad = top_pad+1 73 | else: 74 | bottom_pad = top_pad 75 | 76 | if height_pad > 0 and width_pad > 0: 77 | dst = cv2.copyMakeBorder(image, top_pad, bottom_pad, left_pad, 78 | right_pad, cv2.BORDER_CONSTANT, value=0) 79 | else: 80 | dst = cv2.resize(image,(self.HEIGHT,self.WIDTH)) 81 | sys.stderr.write("Could not add borders, shape " + str(height) 82 | + "," + str(width) + "\n") 83 | return dst 84 | 85 | def segment(self, im): 86 | 87 | gray = cv2.cvtColor(im,cv2.COLOR_BGR2GRAY) 88 | thresh = cv2.adaptiveThreshold( 89 | gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,1,11,2) 90 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3)) 91 | thresh = cv2.erode(thresh,kernel,iterations = 1) 92 | contours,hierarchy = cv2.findContours(thresh,cv2.RETR_EXTERNAL, 93 | cv2.CHAIN_APPROX_NONE) 94 | rois = [] 95 | im2 = np.copy(im) 96 | for cnt in contours: 97 | [x,y,w,h] = cv2.boundingRect(cnt) 98 | if (h<5) or (w<5): 99 | continue 100 | cv2.rectangle(im2,(x,y),(x+w,y+h),(0,0,255),2) 101 | roi = gray[y:y+h,x:x+w] 102 | rois.append((roi, x)) 103 | ret = map(lambda x: x[0],sorted(rois, key=lambda x: x[1])) 104 | return ret 105 | 106 | def get_images_from_dir(self, directory): 107 | train_images_names = os.listdir(directory) 108 | train_images = [] 109 | train_labels = [] 110 | for 
filename in train_images_names: 111 | train_images.append(cv2.imread(directory+"/"+filename)) 112 | m = re.match("(.*)\..*", filename) 113 | if m: 114 | base = m.group(1) 115 | letter = base[0] 116 | train_labels.append(self.letter_to_number(letter)) 117 | 118 | for i in range(len(train_images)): 119 | train_images[i] = self.clean_image(train_images[i]) 120 | 121 | train_arrays = [] 122 | for image in train_images: 123 | train_arrays.append(np.array(image)) 124 | 125 | train_data = np.array(train_arrays) 126 | 127 | images = train_data.reshape(-1, 128 | self.HEIGHT*self.WIDTH).astype(np.float32) 129 | labels = np.array(train_labels) 130 | 131 | return (images, labels) 132 | 133 | def get_images_from_captcha(self, filename): 134 | images = self.segment(cv2.imread(filename)) 135 | 136 | for i in range(len(images)): 137 | images[i] = cv2.adaptiveThreshold( 138 | images[i],255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,1,11,2) 139 | images[i] = self.add_borders(images[i]) 140 | 141 | image_arrays = [] 142 | for image in images: 143 | image_arrays.append(np.array(image)) 144 | 145 | data = np.array(image_arrays) 146 | 147 | ret_images = data.reshape(-1,self.HEIGHT*self.WIDTH).astype(np.float32) 148 | return ret_images 149 | 150 | def train_from_dir(self, train_dir, datafile=None): 151 | (train, train_labels) = self.get_images_from_dir(train_dir) 152 | knn = cv2.KNearest() 153 | knn.train(train,train_labels) 154 | if datafile: 155 | np.save(datafile + "_images", train) 156 | np.save(datafile + "_labels", train_labels) 157 | 158 | return knn 159 | 160 | def train_from_file(self, train_data_file): 161 | train = np.load(train_data_file + "/train_captchas_data_images.npy") 162 | train_labels = np.load(train_data_file + 163 | "/train_captchas_data_labels.npy") 164 | knn = cv2.KNearest() 165 | knn.train(train,train_labels) 166 | 167 | return knn 168 | 169 | def test_captcha(self, captchafile): 170 | test = self.get_images_from_captcha(captchafile) 171 | ret,result,neighbours,dist = self.knn.find_nearest(test,k=1) 172 | result_labels = [] 173 | for label in result: 174 | result_labels.append(self.number_to_letter(int(label[0]))) 175 | return "".join(result_labels) 176 | 177 | def test_dir(self, test_dir): 178 | (test, test_labels) = self.get_images_from_dir(test_dir) 179 | ret,result,neighbours,dist = self.knn.find_nearest(test,k=2) 180 | test_letter_labels = [] 181 | for label in test_labels: 182 | test_letter_labels.append(number_to_letter(label)) 183 | print test_letter_labels 184 | result_labels = [] 185 | for label in result: 186 | result_labels.append(number_to_letter(int(label[0]))) 187 | print 'result:\n', result_labels 188 | 189 | l1 = np.array(result_labels) 190 | l2 = np.array(test_letter_labels) 191 | matches = l1==l2 192 | correct = np.count_nonzero(matches) 193 | accuracy = correct*100.0/result.size 194 | print accuracy 195 | 196 | 197 | class CaptchaBreakerWrapper(): 198 | 199 | CB = None 200 | # CAPTCHAS_DIR = "captchas" 201 | # SOLVED_CAPTCHAS_DIR = "solved_captchas" 202 | # TRAIN_DATA_PATH = "tra in_captchas_data" 203 | CAPTCHAS_DIR = "/tmp/captchas" 204 | SOLVED_CAPTCHAS_DIR = "/tmp/solved_captchas" 205 | directory = os.path.dirname(os.path.abspath(__file__)) 206 | TRAIN_DATA_PATH = os.path.join(directory, '..', 'train_captchas_data') 207 | 208 | def solve_captcha(self, image_URL, debug_info=True): 209 | 210 | if not os.path.exists(self.CAPTCHAS_DIR): 211 | os.makedirs(self.CAPTCHAS_DIR) 212 | if not os.path.exists(self.SOLVED_CAPTCHAS_DIR): 213 | os.makedirs(self.SOLVED_CAPTCHAS_DIR) 214 | 215 
| m = re.match(".*/(Captcha_.*)",image_URL) 216 | if not m: 217 | if debug_info: 218 | sys.stderr.write("Couldn't extract captcha image name " 219 | "from URL " + image_URL) 220 | return None 221 | 222 | else: 223 | image_name = m.group(1) 224 | urllib.urlretrieve(image_URL, self.CAPTCHAS_DIR + "/" + image_name) 225 | captcha_text = None 226 | 227 | try: 228 | if not self.CB: 229 | self.CB = CaptchaBreaker(self.TRAIN_DATA_PATH) 230 | if debug_info: 231 | sys.stderr.write("Training captcha classifier...\n") 232 | 233 | captcha_text = self.CB.test_captcha(self.CAPTCHAS_DIR + "/" 234 | + image_name) 235 | 236 | urllib.urlretrieve(image_URL, self.SOLVED_CAPTCHAS_DIR + "/" 237 | + captcha_text + ".jpg") 238 | if debug_info: 239 | sys.stderr.write("Solving captcha: " + image_URL + 240 | " with result " + captcha_text + "\n") 241 | 242 | except Exception, e: 243 | sys.stderr.write("Exception on solving captcha, for captcha " 244 | + self.CAPTCHAS_DIR + "/" + image_name + 245 | "\nException message: " + str(e) + "\n") 246 | 247 | return captcha_text 248 | 249 | 250 | if __name__=="__main__": 251 | CW = CaptchaBreakerWrapper() 252 | # CW.solve_captcha("http://ecx.images-amazon.com/captcha/bfhuzdtn/Captcha_distpnvhaw.jpg", False) 253 | # CW.solve_captcha("http://ecx.images-amazon.com/captcha/bfhuzdtn/Captcha_distpnvhaw.jpg") 254 | # CW.solve_captcha("http://ecx.images-amazon.com/captcha/bfhuzdtn/Captcha_distpnvhaw.jpg") 255 | CW.solve_captcha("https://ipv4.google.com/sorry/image?id=7585877133141730835&hl=ru") -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/categories.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | from scrapy.loader import ItemLoader 3 | 4 | from walmart_spider.items import CategoryItem 5 | 6 | is_empty = lambda x, y="": x[0] if x else y 7 | 8 | class WalmartSpider(Spider): 9 | name = 'categories' 10 | 11 | allowed_domains = ['www.arbitragedashboard.com'] 12 | start_urls = ['http://www.arbitragedashboard.com/software/top-rank-chart/'] 13 | 14 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 15 | 'Gecko/20100101 Firefox/35.0' 16 | 17 | def parse(self, response): 18 | trs = response.xpath('//tr') 19 | for tr in trs: 20 | l = ItemLoader(item=CategoryItem(), response=response) 21 | category = is_empty(tr.xpath('./td[1]/text()').extract()) 22 | top5 = is_empty(tr.xpath('./td[5]/text()').extract()) 23 | 24 | l.add_value('category', category) 25 | l.add_value('top5', top5) 26 | 27 | yield l.load_item() 28 | 29 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/homedepot.py: -------------------------------------------------------------------------------- 1 | import re 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | from scrapy.loader import ItemLoader 5 | from walmart_spider.items import WalmartItem 6 | from walmart_spider.aws_signed_request import aws_signed_request 7 | from bs4 import BeautifulSoup 8 | import requests 9 | import urllib2 10 | import datetime 11 | import time 12 | import xml.etree.ElementTree as ET 13 | from fake_useragent import UserAgent 14 | ua = UserAgent() 15 | 16 | is_empty = lambda x, y="": x[0] if x else y 17 | 18 | class HomedepotSpider(Spider): 19 | name = 'homedepot' 20 | 21 | allowed_domains = ['www.homedepot.com'] 22 | 23 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 24 | 
'Gecko/20100101 Firefox/35.0' 25 | 26 | BASE_URL = 'http://www.homedepot.com/' 27 | 28 | start_urls = ['http://www.homedepot.com/c/site_map'] 29 | 30 | def start_requests(self): 31 | headers = {"Accept": "*/*", 32 | "Accept-Encoding": "gzip, deflate", 33 | "User-Agent": "runscope/0.1"} 34 | yield Request(url=self.start_urls[0], headers=headers, 35 | callback=self.parse_category) 36 | 37 | def parse_category(self, response): 38 | links = response.xpath( 39 | '//ul[@class="linkList l"]/li/' 40 | 'a[contains(@href, "www.homedepot.com/b/")]/@href' 41 | ).extract() 42 | 43 | for link in links: 44 | print '-'*25, link, '-'*25 45 | if 'http' in link: 46 | yield Request(url=link, callback=self.parse_product) 47 | else: 48 | yield Request(url='http://'+link, callback=self.parse_product) 49 | 50 | def parse_product(self, response): 51 | 52 | product_links = response.xpath( 53 | '//div[contains(@class, "product pod")]/form/*/*/a/@href' 54 | ).extract() 55 | 56 | for product_link in product_links: 57 | yield Request(url=self.BASE_URL+product_link, callback=self.parse) 58 | 59 | next_link = is_empty( 60 | response.xpath('//a[@title="Next"]/@href').extract() 61 | ) 62 | if next_link: 63 | yield Request(url=self.BASE_URL+next_link, 64 | callback=self.parse_product) 65 | 66 | def parse(self, response): 67 | l = ItemLoader(item=WalmartItem(), response=response) 68 | upc = response.xpath('//upc/text()').extract() 69 | price = response.xpath('//span[@itemprop="price"]/text()').extract() 70 | 71 | upc = is_empty(upc).strip().replace("\n",'').replace('\r', '') 72 | if len(upc) > 12: 73 | upc = upc[1:] 74 | homedepot_price = is_empty(price).strip().replace("\n", '').replace('\r', '') 75 | 76 | l.add_value('upc', upc) 77 | l.add_value('homedepot_price', is_empty(price).strip().replace( 78 | "\n", '').replace('\r', '')) 79 | 80 | print "\n\n upc : ", upc 81 | print "price : ", homedepot_price 82 | 83 | if upc: 84 | region = REGION 85 | public_key = AWS_ACCESS_KEY_ID 86 | private_key = AWS_ACCESS_SECRET_KEY 87 | associate_tag = ASSOCIATE_TAG 88 | params = { 89 | "AWSAccessKeyId": public_key, 90 | "Service": "AWSECommerceService", 91 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 92 | "AssociateTag": associate_tag, 93 | "IdType": "UPC", 94 | "ItemId": upc, 95 | "ResponseGroup": "SalesRank, ItemAttributes, Offers, OfferListings", 96 | "Operation": "ItemLookup", 97 | "SearchIndex": "All" 98 | } 99 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 100 | print url 101 | try: 102 | header = ua.random 103 | headers = {"User-Agent": header} 104 | r = requests.get(url, headers=headers) 105 | content = r.text.encode("UTF-8") 106 | root = ET.fromstring(content) 107 | time.sleep(1) 108 | detail = [] 109 | details = [] 110 | new_price = [] 111 | 112 | rank = '' 113 | weight = None 114 | title = None 115 | category = '' 116 | 117 | for t in root: 118 | for t1 in t: 119 | for t2 in t1: 120 | if "SalesRank" in t2.tag: 121 | rank = t2.text.encode("UTF-8") 122 | 123 | if "ItemAttributes" in t2.tag: 124 | for t3 in t2: 125 | if "ProductTypeName" in t3.tag: 126 | category = t3.text.encode("UTF-8") 127 | 128 | if "ItemDimensions" in t3.tag: 129 | for t4 in t3: 130 | if "Weight" in t4.tag: 131 | weight = t4.text.encode("UTF-8") 132 | 133 | 134 | if "Title" in t3.tag: 135 | title = t3.text.encode("UTF-8") 136 | 137 | 138 | if "Offers" in t2.tag: 139 | for t5 in t2: 140 | if "MoreOffersUrl" in t5.tag: 141 | link = t5.text 142 | length = len(new_price) 143 | if 
link != "0" and length < 4: 144 | time.sleep(1) 145 | header = ua.random 146 | 147 | headers = {"User-Agent": header} 148 | r = requests.get(link, headers=headers) 149 | page = r.text 150 | soup = BeautifulSoup(page, 'html.parser') 151 | 152 | a = soup.find("a", {"class": "a-link-normal a-text-bold"}) 153 | if a: 154 | url1 = "http://www.amazon.com"+a["href"] 155 | header = ua.random 156 | headers = {"User-Agent": header} 157 | r = requests.get(url1, headers=headers) 158 | page1 = r.text 159 | soup1 = BeautifulSoup(page1, 'html.parser') 160 | else: 161 | soup1 = soup 162 | 163 | n = 1 164 | for div in soup1.find_all("div", {"class": "a-row a-spacing-mini olpOffer"}): 165 | if n < 4: 166 | div1 = div.find("div", {"class": "a-column a-span2"}) 167 | 168 | span = div1.find("span", {"class": "a-size-large a-color-price olpOfferPrice a-text-bold"}) 169 | if span: 170 | price1 = ' '.join(span.text.split()) 171 | new_price.append(price1) 172 | n = n+1 173 | if new_price and title is not None and weight is not None: 174 | l.add_value('title', title) 175 | #l.add_value('upc', upc[0]) 176 | l.add_value('rank', rank) 177 | l.add_value('category', category) 178 | l.add_value('weight', weight) 179 | #l.add_value('walmart_price', price) 180 | l.add_value('amazon_price1', new_price[0]) 181 | try: 182 | if new_price[1]: 183 | l.add_value('amazon_price2', new_price[1]) 184 | except: 185 | price1 = '' 186 | l.add_value('amazon_price2', price1) 187 | 188 | try: 189 | if new_price[2]: 190 | l.add_value('amazon_price3', new_price[2]) 191 | except: 192 | price2 = '' 193 | l.add_value('amazon_price3', price2) 194 | l.add_value('weight', weight) 195 | 196 | amazon_price1 = new_price[0].split("$") 197 | amazon_price1 = float(amazon_price1[1]) 198 | 199 | if "-" in homedepot_price: 200 | price = homedepot_price.split("-") 201 | price1 = float(price[0].split("$")[1]) 202 | price2 = float(price[1].split("$")[1]) 203 | if price2 < amazon_price1: 204 | homedepot_price = price2 205 | else: 206 | homedepot_price = price1 207 | else: 208 | homedepot_price = float(homedepot_price.split("$")[1]) 209 | 210 | weight = float(weight) 211 | 212 | wt_cost = weight * 0.55 213 | l.add_value('wt_cost', wt_cost) 214 | 215 | Tax_Cost = homedepot_price * 0.065 216 | l.add_value('Tax_Cost', Tax_Cost) 217 | 218 | Fees = amazon_price1 * 0.27 219 | l.add_value('Fees', Fees) 220 | 221 | Tot_Cost = homedepot_price + wt_cost + Tax_Cost + Fees 222 | l.add_value('Tot_Cost', Tot_Cost) 223 | 224 | Profit = amazon_price1 - Tot_Cost 225 | l.add_value('Profit', Profit) 226 | 227 | ROI = Profit / (homedepot_price + wt_cost + Tax_Cost) 228 | l.add_value('ROI', ROI) 229 | 230 | yield l.load_item() 231 | 232 | except Exception as e: 233 | print "\n Exception : ", e -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/kohls.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | from scrapy import Spider 5 | from scrapy.http import Request 6 | from scrapy.loader import ItemLoader 7 | from scrapy.utils.response import open_in_browser 8 | 9 | from walmart_spider.items import WalmartItem 10 | 11 | is_empty = lambda x, y="": x[0] if x else y 12 | 13 | class KohlsSpider(Spider): 14 | name = 'kohls' 15 | 16 | allowed_domains = ['www.kohls.com'] 17 | 18 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 19 | 'Gecko/20100101 Firefox/35.0' 20 | 21 | BASE_URL = 'http://www.kohls.com' 22 | 23 | start_urls = 
['http://www.kohls.com/feature/sitemapmain.jsp'] 24 | 25 | def parse(self, response): 26 | links = response.xpath( 27 | '//div[@id="sitemap-content"]/div/ul/li/a[contains(@href,"catalog")]/@href' 28 | ).extract() 29 | 30 | for link in links: 31 | yield Request(url=self.BASE_URL+link, callback=self.parse_product) 32 | 33 | def parse_product(self, response): 34 | 35 | script = response.xpath( 36 | '//script[contains(text(), "pmpSearchJsonData")]' 37 | ).extract()[0].replace('\n', '').strip() 38 | 39 | data = json.loads(re.findall('pmpSearchJsonData = ({.*});', script)[0]) 40 | 41 | for product in data['productInfo']['productList']: 42 | l = ItemLoader(item=WalmartItem(), response=response) 43 | 44 | l.add_value('title', product['productTitle']) 45 | l.add_value('price', product['pricing']['regularPrice']) 46 | 47 | yield l.load_item() 48 | 49 | next_link = is_empty( 50 | response.xpath('//link[@rel="next"]/@href').extract() 51 | ) 52 | if next_link: 53 | yield Request(url=self.BASE_URL+next_link, 54 | callback=self.parse_product) 55 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/target.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import datetime 4 | from scrapy import Spider 5 | from scrapy.http import Request 6 | from walmart_spider.aws_signed_request import aws_signed_request 7 | from scrapy.loader import ItemLoader 8 | from bs4 import BeautifulSoup 9 | import requests 10 | import urllib2 11 | 12 | from walmart_spider.items import WalmartItem 13 | import time 14 | import xml.etree.ElementTree as ET 15 | from fake_useragent import UserAgent 16 | ua = UserAgent() 17 | 18 | is_empty = lambda x, y="": x[0] if x else y 19 | 20 | class TargetSpider(Spider): 21 | name = 'target' 22 | 23 | allowed_domains = ['www.target.com', 'tws.target.com'] 24 | 25 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 26 | 'Gecko/20100101 Firefox/35.0' 27 | 28 | BASE_URL = 'http://www.target.com' 29 | 30 | start_urls = ['http://www.target.com/c/more/-/N-5xsxf#?lnk=ct_menu_12_1&' 31 | 'intc=1865103|null'] 32 | 33 | JSON_SEARCH_URL = "http://tws.target.com/searchservice/item/" \ 34 | "search_results/v2/by_keyword?" 
\ 35 | "callback=getPlpResponse" \ 36 | "&response_group=Items%2CVariationSummary" \ 37 | "&category={category}" \ 38 | "&sort_by=bestselling" \ 39 | "&pageCount=60" \ 40 | "&zone=PLP" \ 41 | "&facets=" \ 42 | "&view_type=medium" \ 43 | "&page={page}" \ 44 | "&offset={index}" \ 45 | "&stateData=" 46 | 47 | def parse(self, response): 48 | categories = response.xpath( 49 | '//ul[@class="innerCol"]/li/a/@href').re('N-(.*)#') 50 | 51 | for category in categories: 52 | new_meta = response.meta.copy() 53 | new_meta['category'] = category 54 | new_meta['next_page'] = 2 55 | new_meta['index'] = new_meta['next_page']*60 56 | 57 | yield Request(url=self.JSON_SEARCH_URL.format(category=category, 58 | page=1, 59 | index=0), 60 | meta=new_meta, 61 | callback=self.parse_product) 62 | 63 | def parse_product(self, response): 64 | data = json.loads( 65 | re.findall('getPlpResponse\((.*)\)', response.body)[0] 66 | ) 67 | 68 | if len(data['searchResponse']['items']['Item']) > 0: 69 | 70 | for product in data['searchResponse']['items']['Item']: 71 | l = ItemLoader(item=WalmartItem(), response=response) 72 | if 'priceSummary' in product.keys(): 73 | l.add_value('upc', product['upc']) 74 | l.add_value('target_price', 75 | product['priceSummary']['offerPrice']['amount']) 76 | upc = product['upc'] 77 | target_price = product['priceSummary']['offerPrice']['amount'] 78 | print "upc : ", upc 79 | print "target_price : ", target_price 80 | 81 | if upc: 82 | region = REGION 83 | public_key = AWS_ACCESS_KEY_ID 84 | private_key = AWS_ACCESS_SECRET_KEY 85 | associate_tag = ASSOCIATE_TAG 86 | params = { 87 | "AWSAccessKeyId": public_key, 88 | "Service": "AWSECommerceService", 89 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 90 | "AssociateTag": associate_tag, 91 | "IdType": "UPC", 92 | "ItemId": upc, 93 | "ResponseGroup": "SalesRank, ItemAttributes, Offers, OfferListings", 94 | "Operation": "ItemLookup", 95 | "SearchIndex": "All" 96 | } 97 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 98 | print url 99 | time.sleep(1) 100 | try: 101 | header = ua.random 102 | headers = {"User-Agent": header} 103 | r = requests.get(url, headers=headers) 104 | content = r.text.encode("UTF-8") 105 | root = ET.fromstring(content) 106 | 107 | detail = [] 108 | details = [] 109 | new_price = [] 110 | 111 | rank = '' 112 | weight = None 113 | title = None 114 | category = '' 115 | 116 | for t in root: 117 | for t1 in t: 118 | for t2 in t1: 119 | if "SalesRank" in t2.tag: 120 | rank = t2.text.encode("UTF-8") 121 | 122 | 123 | if "ItemAttributes" in t2.tag: 124 | for t3 in t2: 125 | if "ProductTypeName" in t3.tag: 126 | category = t3.text.encode("UTF-8") 127 | 128 | 129 | if "ItemDimensions" in t3.tag: 130 | for t4 in t3: 131 | if "Weight" in t4.tag: 132 | weight = t4.text.encode("UTF-8") 133 | 134 | 135 | if "Title" in t3.tag: 136 | title = t3.text.encode("UTF-8") 137 | 138 | 139 | if "Offers" in t2.tag: 140 | for t5 in t2: 141 | if "MoreOffersUrl" in t5.tag: 142 | link = t5.text 143 | print "\n link : ", link 144 | length = len(new_price) 145 | if link != "0" and length < 4: 146 | time.sleep(1) 147 | header = ua.random 148 | headers = {"User-Agent": header} 149 | 150 | r = requests.get(link, headers=headers) 151 | page = r.text 152 | soup = BeautifulSoup(page, 'html.parser') 153 | 154 | a = soup.find("a", {"class": "a-link-normal a-text-bold"}) 155 | if a: 156 | url1 = "http://www.amazon.com"+a["href"] 157 | header = ua.random 158 | headers = {"User-Agent": 
header} 159 | r = requests.get(url1, headers=headers) 160 | page1 = r.text 161 | soup1 = BeautifulSoup(page1, 'html.parser') 162 | else: 163 | soup1 = soup 164 | 165 | n = 1 166 | for div in soup1.find_all("div", {"class": "a-row a-spacing-mini olpOffer"}): 167 | if n < 4: 168 | div1 = div.find("div", {"class": "a-column a-span2"}) 169 | 170 | span = div1.find("span", {"class": "a-size-large a-color-price olpOfferPrice a-text-bold"}) 171 | if span: 172 | price = ' '.join(span.text.split()) 173 | new_price.append(price) 174 | n = n+1 175 | 176 | if new_price and title is not None and weight is not None: 177 | l.add_value('title', title) 178 | 179 | #l.add_value('upc', upc[0]) 180 | l.add_value('rank', rank) 181 | 182 | l.add_value('category', category) 183 | 184 | l.add_value('weight', weight) 185 | 186 | #l.add_value('walmart_price', price) 187 | l.add_value('amazon_price1', new_price[0]) 188 | 189 | try: 190 | if new_price[1]: 191 | l.add_value('amazon_price2', new_price[1]) 192 | except: 193 | price1 = '' 194 | l.add_value('amazon_price2', price1) 195 | 196 | try: 197 | if new_price[2]: 198 | l.add_value('amazon_price3', new_price[2]) 199 | except: 200 | price2 = '' 201 | l.add_value('amazon_price3', price2) 202 | l.add_value('weight', weight) 203 | 204 | amazon_price1 = new_price[0].split("$") 205 | amazon_price1 = float(amazon_price1[1]) 206 | 207 | if "-" in target_price: 208 | price = target_price.split("-") 209 | price1 = float(price[0].split("$")[1]) 210 | price2 = float(price[1].split("$")[1]) 211 | if price2 < amazon_price1: 212 | target_price = price2 213 | else: 214 | target_price = price1 215 | else: 216 | target_price = float(target_price.split("$")[1]) 217 | 218 | 219 | weight = float(weight) 220 | 221 | wt_cost = weight * 0.55 222 | l.add_value('wt_cost', wt_cost) 223 | 224 | 225 | Tax_Cost = target_price * 0.065 226 | l.add_value('Tax_Cost', Tax_Cost) 227 | 228 | Fees = amazon_price1 * 0.27 229 | l.add_value('Fees', Fees) 230 | 231 | 232 | Tot_Cost = target_price + wt_cost + Tax_Cost + Fees 233 | l.add_value('Tot_Cost', Tot_Cost) 234 | 235 | 236 | Profit = amazon_price1 - Tot_Cost 237 | l.add_value('Profit', Profit) 238 | 239 | 240 | ROI = Profit / (target_price + wt_cost + Tax_Cost) 241 | l.add_value('ROI', ROI) 242 | 243 | yield l.load_item() 244 | except: 245 | pass 246 | 247 | page = response.meta['next_page'] 248 | category = response.meta['category'] 249 | index = response.meta['index'] 250 | new_meta = response.meta.copy() 251 | new_meta['next_page'] = page + 1 252 | new_meta['index'] = new_meta['next_page']*60 253 | yield Request(url=self.JSON_SEARCH_URL.format(category=category, 254 | page=page, 255 | index=index), 256 | meta=new_meta, 257 | callback=self.parse_product) 258 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/spiders/walmart.py: -------------------------------------------------------------------------------- 1 | import re 2 | import datetime 3 | from walmart_spider.aws_signed_request import aws_signed_request 4 | from scrapy import Spider 5 | from scrapy.http import Request, FormRequest 6 | from scrapy.loader import ItemLoader 7 | from scrapy.log import WARNING, DEBUG, INFO, ERROR 8 | 9 | import requests 10 | from walmart_spider.items import WalmartItem 11 | import urllib2 12 | from bs4 import BeautifulSoup 13 | import time 14 | import xml.etree.ElementTree as ET 15 | import sys 16 | from fake_useragent import UserAgent 17 | ua = UserAgent() 18 | 19 | is_empty = lambda x, y="": x[0] if x 
else y 20 | 21 | # try: 22 | # from captcha_solver import CaptchaBreakerWrapper 23 | # except Exception as e: 24 | # print '!!!!!!!!Captcha breaker is not available due to: %s' % e 25 | # 26 | # class CaptchaBreakerWrapper(object): 27 | # @staticmethod 28 | # def solve_captcha(url): 29 | # msg("CaptchaBreaker in not available for url: %s" % url, 30 | # level=WARNING) 31 | # return None 32 | 33 | 34 | class WalmartSpider(Spider): 35 | name = 'walmart' 36 | 37 | allowed_domains = ['www.walmart.com', 'www.amazon.com'] 38 | start_urls = ['http://www.walmart.com/all-departments'] 39 | 40 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\ 41 | 'Gecko/20100101 Firefox/35.0' 42 | 43 | BASE_URL = 'http://www.walmart.com' 44 | 45 | AMAZON_SEARCH_URL = "http://www.amazon.com/s/ref=nb_sb_noss?" \ 46 | "field-keywords={upc}" 47 | 48 | # def __init__(self, captcha_retries='10', *args, **kwargs): 49 | # self.captcha_retries = int(captcha_retries) 50 | # self._cbw = CaptchaBreakerWrapper() 51 | # super(WalmartSpider, self).__init__(*args, **kwargs) 52 | 53 | def start_requests(self): 54 | headers = {"Accept": "*/*", 55 | "Accept-Encoding": "gzip, deflate", 56 | "User-Agent": "runscope/0.1"} 57 | yield Request(url=self.start_urls[0], headers=headers, 58 | callback=self.parse_category) 59 | 60 | # def parse_captcha(self, response): 61 | # if self._has_captcha(response): 62 | # result = self._handle_captcha(response, self.parse_captcha) 63 | # else: 64 | # result = self.parse_without_captcha(response) 65 | # return result 66 | # 67 | # def parse_without_captcha(self, response): 68 | # links = response.xpath( 69 | # '//a[@class="all-depts-links-category"]/@href' 70 | # ).extract() 71 | # 72 | # for link in links: 73 | # print '-'*25, link, '-'*25 74 | # yield Request(url=self.BASE_URL+link, callback=self.parse_product) 75 | 76 | def parse_category(self, response): 77 | links = response.xpath( 78 | '//a[@class="all-depts-links-category"]/@href' 79 | ).extract() 80 | 81 | for link in links: 82 | print '-'*25, link, '-'*25 83 | yield Request(url=self.BASE_URL+link, callback=self.parse_product) 84 | 85 | def parse_product(self, response): 86 | 87 | product_links = response.xpath( 88 | '//ul[@class="tile-list tile-list-grid"]/li/div/' 89 | 'a[@class="js-product-title"]/@href' 90 | ).extract() 91 | 92 | for product_link in product_links: 93 | yield Request(url=self.BASE_URL+product_link, callback=self.parse) 94 | 95 | next_link = is_empty(response.xpath( 96 | '//a[@class="paginator-btn paginator-btn-next"]/@href' 97 | ).extract()) 98 | if next_link: 99 | next_link = re.sub( 100 | '\?.*', next_link, response.url, flags=re.IGNORECASE 101 | ) 102 | 103 | yield Request(url=next_link, callback=self.parse_product) 104 | 105 | def parse(self, response): 106 | l = ItemLoader(item=WalmartItem(), response=response) 107 | upc = response.xpath('//meta[@property="og:upc"]/@content').extract() 108 | price = ''.join( 109 | response.xpath( 110 | '//div[@itemprop="price"][1]/text() |' 111 | ' //div[@itemprop="price"][1]/*/text()' 112 | ).extract() 113 | ) 114 | 115 | if not price: 116 | script = is_empty(response.xpath( 117 | '//script[contains(text(), "productSellersMap")]/text()' 118 | ).extract()) 119 | price = is_empty( 120 | re.findall('\"currencyAmount\":([\d+,]?\d+.\d+)', script) 121 | ) 122 | if not price: 123 | script = is_empty( 124 | response.xpath( 125 | '//script[contains(text(),"item_price")]/text()' 126 | ).extract() 127 | ) 128 | price = is_empty( 129 | 
re.findall("item_price\',\'(\$[\d+,]?\d+.\d+)\'", script) 130 | ) 131 | else: 132 | price = '$' + price 133 | 134 | l.add_value('upc', is_empty(upc)) 135 | if price: 136 | l.add_value('walmart_price', price.replace(" ", '')) 137 | 138 | if upc: 139 | region = REGION 140 | public_key = AWS_ACCESS_KEY_ID 141 | private_key = AWS_ACCESS_SECRET_KEY 142 | associate_tag = ASSOCIATE_TAG 143 | 144 | params = { 145 | "AWSAccessKeyId": public_key, 146 | "Service": "AWSECommerceService", 147 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 148 | "AssociateTag": associate_tag, 149 | "IdType": "UPC", 150 | "ItemId": upc[0], 151 | "ResponseGroup": "SalesRank, ItemAttributes, Offers, OfferListings", 152 | "Operation": "ItemLookup", 153 | "SearchIndex": "All" 154 | } 155 | 156 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 157 | 158 | print url 159 | 160 | #time.sleep(1) 161 | try: 162 | header = ua.random 163 | headers = {"User-Agent": header} 164 | r = requests.get(url, headers=headers) 165 | content = r.text.encode("UTF-8") 166 | root = ET.fromstring(content) 167 | 168 | detail = [] 169 | details = [] 170 | new_price = [] 171 | 172 | rank = '' 173 | weight = None 174 | title = None 175 | category = '' 176 | 177 | for t in root: 178 | for t1 in t: 179 | for t2 in t1: 180 | if "SalesRank" in t2.tag: 181 | rank = t2.text.encode("UTF-8") 182 | 183 | if "ItemAttributes" in t2.tag: 184 | for t3 in t2: 185 | if "ProductTypeName" in t3.tag: 186 | category = t3.text.encode("UTF-8") 187 | 188 | if "ItemDimensions" in t3.tag: 189 | for t4 in t3: 190 | if "Weight" in t4.tag: 191 | weight = t4.text.encode("UTF-8") 192 | 193 | if "Title" in t3.tag: 194 | title = t3.text.encode("UTF-8") 195 | 196 | if "Offers" in t2.tag: 197 | for t5 in t2: 198 | if "MoreOffersUrl" in t5.tag: 199 | link = t5.text 200 | length = len(new_price) 201 | if link != "0" and length < 4: 202 | #time.sleep(1) 203 | header = ua.random 204 | headers = {"User-Agent": header} 205 | r = requests.get(link, headers=headers) 206 | page = r.text 207 | soup = BeautifulSoup(page, 'html.parser') 208 | 209 | a = soup.find("a", {"class": "a-link-normal a-text-bold"}) 210 | if a: 211 | url1 = "http://www.amazon.com"+a["href"] 212 | header = ua.random 213 | headers = {"User-Agent": header} 214 | r = requests.get(url1, headers=headers) 215 | page1 = r.text 216 | soup1 = BeautifulSoup(page1, 'html.parser') 217 | else: 218 | soup1 = soup 219 | 220 | n = 1 221 | for div in soup1.find_all("div", {"class": "a-row a-spacing-mini olpOffer"}): 222 | if n < 4: 223 | div1 = div.find("div", {"class": "a-column a-span2"}) 224 | 225 | span = div1.find("span", {"class": "a-size-large a-color-price olpOfferPrice a-text-bold"}) 226 | if span: 227 | price = ' '.join(span.text.split()) 228 | new_price.append(price) 229 | n = n+1 230 | 231 | if new_price and title is not None and weight is not None: 232 | l.add_value('title', title) 233 | #l.add_value('upc', upc[0]) 234 | l.add_value('rank', rank) 235 | l.add_value('category', category) 236 | l.add_value('weight', weight) 237 | #l.add_value('walmart_price', price) 238 | l.add_value('amazon_price1', new_price[0]) 239 | try: 240 | if new_price[1]: 241 | l.add_value('amazon_price2', new_price[1]) 242 | except: 243 | price1 = '' 244 | l.add_value('amazon_price2', price1) 245 | 246 | try: 247 | if new_price[2]: 248 | l.add_value('amazon_price3', new_price[2]) 249 | except: 250 | price2 = '' 251 | l.add_value('amazon_price3', price2) 252 | 
l.add_value('weight', weight) 253 | 254 | if "-" in price: 255 | w_price = price.split("-") 256 | price1 = float(w_price[0].split("$")[1]) 257 | price2 = float(w_price[1].split("$")[1]) 258 | if price2 < amazon_price1: 259 | walmart_price = price2 260 | else: 261 | walmart_price = price1 262 | else: 263 | walmart_price = float(price.split("$")[1]) 264 | 265 | weight = float(weight) 266 | amazon_price1 = new_price[0].split("$") 267 | amazon_price1 = float(amazon_price1[1]) 268 | 269 | wt_cost = weight * 0.55 270 | l.add_value('wt_cost', wt_cost) 271 | 272 | Tax_Cost = walmart_price * 0.065 273 | l.add_value('Tax_Cost', Tax_Cost) 274 | 275 | Fees = amazon_price1 * 0.27 276 | l.add_value('Fees', Fees) 277 | 278 | Tot_Cost = walmart_price + wt_cost + Tax_Cost + Fees 279 | l.add_value('Tot_Cost', Tot_Cost) 280 | 281 | Profit = amazon_price1 - Tot_Cost 282 | l.add_value('Profit', Profit) 283 | 284 | ROI = Profit / (walmart_price + wt_cost + Tax_Cost) 285 | l.add_value('ROI', ROI) 286 | yield l.load_item() 287 | except: 288 | pass 289 | # if upc: 290 | # new_meta = response.meta.copy() 291 | # new_meta['item'] = l 292 | # yield Request(url=self.AMAZON_SEARCH_URL.format(upc=upc[0]), 293 | # meta=new_meta, callback=self.parse_amazon_category) 294 | # else: 295 | 296 | #yield l.load_item() 297 | 298 | #TODO: handling amazon 299 | # def parse_amazon_category(self, response): 300 | # if self._has_captcha(response): 301 | # yield self._handle_captcha(response, self.parse_amazon_category) 302 | # else: 303 | # link = response.xpath('//a[@class="a-link-normal s-access-detail-page' 304 | # ' a-text-normal"]/@href').extract() 305 | # if link: 306 | # new_meta = response.meta.copy() 307 | # new_meta['item'] = response.meta['item'] 308 | # yield Request(url=link[0], meta=new_meta, 309 | # callback=self.parse_amazon_product) 310 | # 311 | # def parse_amazon_product(self, response): 312 | # if self._has_captcha(response): 313 | # yield self._handle_captcha(response, self.parse_amazon_product) 314 | # else: 315 | # l = response.meta['item'] 316 | # title = response.xpath( 317 | # '//span[@id="productTitle"]/text()' 318 | # ).extract() 319 | # l.add_value('title', is_empty(title)) 320 | # 321 | # amazon_price = response.xpath( 322 | # '//span[@id="priceblock_ourprice"]/text()' 323 | # ).extract() 324 | # l.add_value('amazon_price', is_empty(amazon_price)) 325 | # 326 | # weight = is_empty(response.xpath( 327 | # '//div[@class="content"]/ul/li/b[contains(text(),' 328 | # ' "Weight")]/following::text() | ' 329 | # '//table[@id="productDetails_detailBullets_sections1"]/' 330 | # 'tr[contains(.,"Weight")]/td/text()' 331 | # ).extract(),'').replace('(', '').strip() 332 | # l.add_value('weight', weight) 333 | # 334 | # rank_category = response.xpath( 335 | # '//li[@id="SalesRank"]/text() |' 336 | # '//table[@id="productDetails_detailBullets_sections1"]' 337 | # '/tr[contains(.,"Best Seller")]/td' 338 | # ).re('#(\d+[,\d+]*) in (.*) \(') 339 | # if rank_category: 340 | # l.add_value('rank', rank_category[0]) 341 | # l.add_value('category', rank_category[1]) 342 | # else: 343 | # category = response.xpath( 344 | # '//div[@id="wayfinding-breadcrumbs_feature_div"]/ul' 345 | # '/li[1]/span/a/text()' 346 | # ).extract() 347 | # if category: 348 | # category = category[0].strip() 349 | # l.add_value('category', category) 350 | # 351 | # yield l.load_item() 352 | # 353 | # # Captcha handling functions. 
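# (Flow of the commented-out helpers below: _has_captcha looks for an Amazon captcha image
# in the response body, _solve_captcha extracts the captcha image URL from the page's form
# and passes it to CaptchaBreakerWrapper from spiders/captcha_solver.py, and _handle_captcha
# re-submits the form with the guessed text, tracking the retry count in response.meta.)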
354 | # def _has_captcha(self, response): 355 | # return '.images-amazon.com/captcha/' in response.body_as_unicode() 356 | # 357 | # def _solve_captcha(self, response): 358 | # forms = response.xpath('//form') 359 | # assert len(forms) == 1, "More than one form found." 360 | # 361 | # captcha_img = forms[0].xpath( 362 | # '//img[contains(@src, "/captcha/")]/@src').extract()[0] 363 | # 364 | # self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG) 365 | # return self._cbw.solve_captcha(captcha_img) 366 | # 367 | # def _handle_captcha(self, response, callback): 368 | # captcha_solve_try = response.meta.get('captcha_solve_try', 0) 369 | # url = response.url 370 | # self.log("Captcha challenge for %s (try %d)." 371 | # % (url, captcha_solve_try), 372 | # level=INFO) 373 | # 374 | # captcha = self._solve_captcha(response) 375 | # 376 | # if captcha is None: 377 | # self.log( 378 | # "Failed to guess captcha for '%s' (try: %d)." % ( 379 | # url, captcha_solve_try), 380 | # level=ERROR 381 | # ) 382 | # result = None 383 | # else: 384 | # self.log( 385 | # "On try %d, submitting captcha '%s' for '%s'." % ( 386 | # captcha_solve_try, captcha, url), 387 | # level=INFO 388 | # ) 389 | # meta = response.meta.copy() 390 | # meta['captcha_solve_try'] = captcha_solve_try + 1 391 | # result = FormRequest.from_response( 392 | # response, 393 | # formname='', 394 | # formdata={'field-keywords': captcha}, 395 | # callback=callback, 396 | # dont_filter=True, 397 | # meta=meta) 398 | # 399 | # return result 400 | -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/test.py: -------------------------------------------------------------------------------- 1 | from aws_signed_request import aws_signed_request 2 | import datetime 3 | import requests 4 | 5 | # Service=AWSECommerceService& 6 | # AWSAccessKeyId=AKIAJCAIVLPWYX553QKA& 7 | # AssociateTag=esfera01-20& 8 | # Operation=ItemSearch& 9 | # Keywords=horse,bridle& 10 | # SearchIndex=PetSupplies,SportingGoods& 11 | # Timestamp={timestamp}& 12 | # Signature=[Request Signature] 13 | 14 | 15 | # http://webservices.amazon.com/onca/xml? 
16 | # Service=AWSECommerceService& 17 | # AWSAccessKeyId=[AWS Access Key ID]& 18 | # AssociateTag=[Associate ID]& 19 | # Operation=ItemSearch& 20 | # Keywords=Potter& 21 | # SearchIndex=Books& 22 | # ItemPage=4 23 | # &Timestamp=[YYYY-MM-DDThh:mm:ssZ] 24 | # &Signature=[Request Signature] 25 | 26 | region = "com" 27 | public_key = "AKIAJCAIVLPWYX553QKA" 28 | private_key = "VNCDZ5l0IEUqJIrr/0wuh1Cyj+ZxfbA/42d3Cu/a" 29 | associate_tag = "esfera01-20" 30 | 31 | params = { 32 | "AWSAccessKeyId": public_key, 33 | "Service": "AWSECommerceService", 34 | "Timestamp": datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'), 35 | "AssociateTag": associate_tag, 36 | "Operation": "ItemSearch", 37 | #"ItemPage": "1", 38 | "SearchIndex": "Electronics", 39 | #"Keywords": "Electronics" 40 | } 41 | 42 | url = aws_signed_request(region, params, public_key, private_key, associate_tag, version='2011-08-01') 43 | print "url : ", url 44 | 45 | # response = requests.get(url) 46 | # content = response.text 47 | 48 | # print "\n content : ", content -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_images.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_images.npy -------------------------------------------------------------------------------- /walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parul1931/walmart/72c0dd990913ba9cf0eff6b7944f2be175d34672/walmart_spider/walmart_spider/train_captchas_data/train_captchas_data_labels.npy --------------------------------------------------------------------------------