├── screenshot └── result.png ├── .gitignore ├── Pipfile ├── README.md ├── export.py ├── request.py ├── test.py ├── utils.py ├── Pipfile.lock ├── dispose.py ├── main.py └── proxies.py /screenshot/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DisasterMeng/Gui-Amazon-Review/HEAD/screenshot/result.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .idea/Gui-Amazon-Review.iml 3 | .idea/encodings.xml 4 | .idea/misc.xml 5 | .idea/modules.xml 6 | .idea/workspace.xml 7 | .idea/ 8 | __pycache__/ 9 | build/ 10 | dist/ 11 | .vscode/ 12 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | requests = "*" 10 | lxml = "*" 11 | pyinstaller = "*" 12 | fake-useragent = "*" 13 | 14 | [requires] 15 | python_version = "3.6" 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 下载亚马逊评论工具 2 | 一个小改改时不时找我做一个Google扩展来下载Amazon的评论以提高工作效率,奈何时间一直不在这一块,刚好同学也接到了这个需求,就打算一起做一个。 3 | 4 | 最后商量还是做一个Gui,Google扩展有时候感觉有些不方便,顺带了解一下Python中的标准库tkinter。 5 | 6 | 最后分工我来做界面,同学来写下载解析那一块,说说一下思路,其实还是很简单的,就跟普通的爬虫差不多。 7 | 8 | - Gui提供2个参数,一个是站点,一个是产品Asin,Gui的具体界面就不仔细说了 9 | - 通过2个参数,可以构建一个种子Url,先会检测产品是否存在(404) 10 | - 检测通过后,然后开始开启线程递归下载 11 | - 下载完成后,解析数据保存到本地 12 | - 判断是否存在下一页,存在则修改header继续下载解析,直到最后一页 13 | 14 | 没有使用代理,因为数据量小,防止被ban,加了一些延时请求,如果有需要的朋友可以自行添加。 15 | 16 | 最后建议一下,简单的页面可以是用tkinter,复杂可以使用其他Gui库,比如pyqt,tkinter实现复杂的页面有点费劲。 17 | 18 | 下载地址[https://github.com/DisasterMeng/Gui-Amazon-Review/releases/download/1.6/Amazon-Review_1.6.exe](https://github.com/DisasterMeng/Gui-Amazon-Review/releases/download/1.6/Amazon-Review_1.6.exe) 19 | 20 | 源码地址[https://github.com/DisasterMeng/Gui-Amazon-Review](https://github.com/DisasterMeng/Gui-Amazon-Review),求星星😊 21 | 22 | ![结果](./screenshot/result.png) 23 | -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | import os, time, csv 2 | 3 | from utils import getDesktopPath 4 | 5 | NEED = ('asin', 'name', 'format', 'vp', 'review_country', 'date', 'stars', 'title', 'content', 'href', 'buyer', 6 | 'helpful') 7 | NEEDDOC = ('ASIN', '评价人', '格式条', '是否vp', '评论国家', '日期', '星级', '标题', '内容', '评论链接', '买家链接', '点赞数') 8 | FILENAME = "Amazon_{asin}_Review_%Y_%m_%d_%H_%M.csv" 9 | 10 | 11 | class JsonCsv: 12 | def __init__(self, asin): 13 | self.ASIN = asin 14 | self.csvFile = open(self.getPath(), 'w', newline='', encoding='utf-8-sig') 15 | self.writer = csv.writer(self.csvFile) 16 | self.writer.writerow(NEEDDOC) 17 | 18 | def getPath(self): 19 | name = time.strftime(FILENAME.format(asin=self.disposeASIN()), time.localtime()) 20 | return os.path.join(getDesktopPath(), name) 21 | 22 | def writerCsv(self, dicData): 23 | for dic in dicData: 24 | row = [] 25 | for item in NEED: 26 | row.append(dic[item]) 27 | self.writer.writerow(row) 28 | 29 | def closeCsv(self): 30 | self.csvFile.close() 31 | 32 | def disposeASIN(self): 33 | return self.ASIN.replace(' ', '').strip('\n') -------------------------------------------------------------------------------- /request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from urllib.parse import urlencode 3 | from fake_useragent import UserAgent 4 | 5 | from utils import getAmazonDomain, amazon_headers 6 | 7 | REVIEWSURL = '{domain}/product-reviews/{asin}/ref=cm_cr_arp_d_viewopt_srt' 8 | 9 | reviewParam = { 10 | 'ie': 'UTF8', 11 | 'reviewerType': 'all_reviews', 12 | 'sortBy': 'recent', 13 | 'pageNumber': '' 14 | } 15 | 16 | 17 | class AmazonRequests: 18 | def __init__(self, country, asin, page, session, proxies): 19 | self.session = session if session else requests.session() 20 | self.proxies = proxies 21 | self.ASIN = asin 22 | self.Country = country 23 | self.page = page 24 | self.retryNum = 0 25 | self.referer = getAmazonDomain(self.Country) 26 | self.headers = amazon_headers.copy() 27 | self.headers['user-agent'] = UserAgent().random 28 | 29 | def getURL(self): 30 | return REVIEWSURL.format(domain=getAmazonDomain(self.Country), asin=self.ASIN) 31 | 32 | def nextPage(self): 33 | self.page += 1 34 | 35 | def getPage(self): 36 | return self.page 37 | 38 | def getAmaoznData(self, is_lang=False): 39 | try: 40 | reviewParam['pageNumber'] = str(self.getPage()) 41 | if is_lang and self.Country == 'US': 42 | reviewParam['filterByLanguage'] = 'en_US' 43 | self.headers['referer'] = self.referer 44 | response = self.session.get(self.getURL(), params=reviewParam, headers=self.headers, proxies=self.proxies, timeout=(5, 10)) 45 | response.encoding = 'utf-8' 46 | self.referer = '%s?%s' % (self.getURL(), urlencode(reviewParam)) 47 | if response.status_code == 200: 48 | return response.text 49 | else: 50 | return response.status_code 51 | except requests.exceptions.RequestException as e: 52 | if self.retryNum == 2: 53 | print(e) 54 | return self.retryNum 55 | self.retryNum += 1 56 | return self.getAmaoznData() 57 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import requests 3 | from lxml import etree 4 | from dispose import AmazonDispose 5 | from request import AmazonRequests 6 | from fake_useragent import UserAgent 7 | from utils import request_message, getAmazonDomain, is_robot, amazon_headers 8 | 9 | 10 | def test(): 11 | with open('amazon.txt', 'rb') as f: 12 | data = f.read() 13 | selector = etree.HTML(data) 14 | s = selector.xpath('//div[@data-asin]') 15 | print(s) 16 | # dispose = AmazonDispose(data, 'US', 'B076MP43X5') 17 | # print(dispose.dispose()) 18 | # request = AmazonRequests('US', 'B01N2K4U7') 19 | # print(request.getAmaoznData()) 20 | pass 21 | 22 | 23 | def test2(): 24 | url = "https://www.amazon.com/product-reviews/B01GW2GH4M/ref=cm_cr_dp_d_show_all_btm" 25 | querystring = {"ie": "UTF8", "reviewerType": "all_reviews"} 26 | headers = { 27 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 28 | 'accept-encoding': "gzip, deflate, br", 29 | 'accept-language': "zh-CN,zh;q=0.9", 30 | 'upgrade-insecure-requests': "1", 31 | 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36", 32 | 'Host': "www.amazon.com", 33 | 'Cookie': 'session-id=139-9990947-0054464; session-id-time=2082787201l; ubid-main=130-4498879-2446230;' 34 | ' x-wl-uid=1DK2hCjGRanOF9SqtLU+cG2jpVaZc8RqqWI0RsdSziAnlhPKPXh9+G6McB8O39ouF0g+rZAAZaHQ=;' 35 | ' session-token=4OknvgrkZrGafAYRbufkHpNtHcsN2vzxvk3WsD+LDtMoDH2RvLqahHkcm8H+' 36 | 'zrZGHqG9J3CX8pWjr5zsQqK0Mymb6tIKy6/JzD4PJCk17uqw5EUQPVnRoB5yyeoLDJbDUNJfx/c4XZVUNqKEqdd4JlvdOOCjTeEfxKqFBjKT4iJlGA+' 37 | 'TNi7ySJj8oD5Y3lqD; skin=noskin; i18n-prefs=USD; lc-main=zh_CN; sp-cdn="L5Z9: CN"', 38 | 'Connection': "keep-alive" 39 | } 40 | response = requests.request("GET", url, headers=headers, params=querystring) 41 | print(response.text) 42 | 43 | 44 | def request(session, url, headers=None, proxies=None, types='txt'): 45 | response = session.get(url, headers=headers, proxies=proxies, timeout=20) 46 | response.encoding = 'utf-8' 47 | return session, request_message(response, types) 48 | 49 | 50 | def amazon_robot_check(country): 51 | print('正在进行amazon机器人验证') 52 | cur_header = amazon_headers.copy() 53 | cur_header['user-agent'] = UserAgent().random 54 | session, response = request(requests.session(), getAmazonDomain(country), headers=cur_header) 55 | if not is_robot(etree.HTML(response)): 56 | print('没有机器人验证') 57 | return True 58 | else: 59 | print('机器人验证') 60 | return False 61 | 62 | 63 | async def test3(): 64 | loop = asyncio.get_event_loop() 65 | future = loop.run_in_executor(None, amazon_robot_check, 'US') 66 | response = await future 67 | print(response) 68 | 69 | 70 | if __name__ == '__main__': 71 | # test2() 72 | loop = asyncio.get_event_loop() 73 | task = asyncio.ensure_future(test3()) 74 | loop.run_until_complete(task) 75 | print(task.result()) 76 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import winreg 4 | 5 | RESOURCE = { 6 | 'US': 'https://www.amazon.com', 7 | 'AE': 'https://www.amazon.ae', 8 | 'CN': 'https://www.amazon.cn', 9 | 'JP': 'https://www.amazon.co.jp', 10 | 'UK': 'https://www.amazon.co.uk', 11 | 'FR': 'https://www.amazon.fr', 12 | 'DE': 'https://www.amazon.de', 13 | 'ES': 'https://www.amazon.es', 14 | 'IT': 'https://www.amazon.it', 15 | 'CA': 'https://www.amazon.ca', 16 | 'IN': 'https://www.amazon.in', 17 | 'AU': 'https://www.amazon.com.au', 18 | 'GB': 'https://www.amazon.co.uk', 19 | 'MX': 'https://www.amazon.com.mx' 20 | # 'SG': 'https://www.amazon.com.sg' 21 | } 22 | 23 | REVIEW_COUNTRY = { 24 | 'US': r'Reviewed in the (.*) on', 25 | 'AE': None, 26 | 'CN': r'已在(.*)亚马逊上发表', 27 | 'JP': r'に(.*)でレビュー済み', 28 | 'UK': r'Reviewed in the (.*) on', 29 | 'FR': r'Commenté en (.*) le', 30 | 'DE': r'Rezension aus (.*) vom', 31 | 'ES': r'Revisado en (.*) e', 32 | 'IT': r'Recensito in (.*) il', 33 | 'CA': r'Reviewed in (.*) on', 34 | 'IN': r'Reviewed in (.*) on', 35 | 'AU': r'Reviewed in (.*) on', 36 | 'GB': r'Reviewed in the (.*) on', 37 | 'MX': r'Reseñado en los (.*) el' 38 | } 39 | 40 | REVIEW_HELPFUL = { 41 | 'US': 'One', 42 | 'AE': 'One', 43 | 'CN': None, 44 | 'JP': None, 45 | 'UK': 'One', 46 | 'FR': 'Une', 47 | 'DE': 'Eine', 48 | 'ES': 'Una', 49 | 'IT': 'Una', 50 | 'CA': 'One', 51 | 'IN': 'One', 52 | 'AU': 'One', 53 | 'GB': 'One', 54 | 'MX': 'Una' 55 | } 56 | 57 | LANG_CODE = { 58 | 'CN': 'zh_CN', 59 | 'US': 'en_US' 60 | } 61 | 62 | FR_MONTH = { 63 | "janvier": "January", 64 | "février": "February", 65 | "mars": "March", 66 | "avril": "April", 67 | "mai": "May", 68 | "juin": "June", 69 | "juillet": "July", 70 | "août": "August", 71 | "septembre": "September", 72 | "octobre": "October", 73 | "novembre": "November", 74 | "décembre": "December" 75 | } 76 | 77 | MX_MONTH = ES_MONTH = { 78 | "enero": "January", 79 | "febrero": "February", 80 | "marzo": "March", 81 | "abril": "April", 82 | "mayo": "May", 83 | "junio": "June", 84 | "julio": "July", 85 | "agosto": "August", 86 | "septiembre": "September", 87 | "octubre": "October", 88 | "noviembre": "November", 89 | "diciembre": "December" 90 | } 91 | 92 | IT_MONTH = { 93 | "gennaio": "January", 94 | "febbraio": "February", 95 | "marzo": "March", 96 | "aprile": "April", 97 | "maggio": "May", 98 | "giugno": "June", 99 | "luglio": "July", 100 | "agosto": "August", 101 | "settembre": "September", 102 | "ottobre": "October", 103 | "novembre": "November", 104 | "dicembre": "December" 105 | } 106 | 107 | DE_MONTH = { 108 | "Januar": "January", 109 | "Februar": "February", 110 | "März": "March", 111 | "April": "April", 112 | "Mai": "May", 113 | "Juni": "June", 114 | "Juli": "July", 115 | "August": "August", 116 | "September": "September", 117 | "Oktober": "October", 118 | "November": "November", 119 | "Dezember": "December" 120 | } 121 | 122 | TIME_CODE = { 123 | 'US': {'format': '%B%d,%Y', 'replace': r'Reviewed in the (.*) on'}, 124 | 'AE': '%B%d,%Y', 125 | 'CN': '%Y年%m月%d日', 126 | 'JP': {'format': '%Y年%m月%d日', 'replace': r'に(.*)でレビュー済み'}, 127 | 'UK': {'format': '%d%B%Y', 'replace': r'Reviewed in the (.*) on'}, 128 | 'FR': {'MapMonth': FR_MONTH, 'format': '%d%B%Y', 'replace': r'Commenté en (.*) le'}, 129 | 'DE': {'MapMonth': DE_MONTH, 'format': '%d.%B%Y', 'replace': r'Rezension aus (.*) vom'}, 130 | 'ES': {'MapMonth': ES_MONTH, 'format': '%d%B%Y', 'replace': [r'Revisado en (.*) el', 'de']}, 131 | 'IT': {'MapMonth': IT_MONTH, 'format': '%d%B%Y', 'replace': r'Recensito in (.*) il'}, 132 | 'CA': {'format': '%B%d,%Y', 'replace': r'Reviewed in (.*) on'}, 133 | 'IN': {'format': '%d%B%Y', 'replace': r'Reviewed in (.*) on'}, 134 | 'AU': {'format': '%d%B%Y', 'replace': r'Reviewed in (.*) on'}, 135 | 'GB': {'format': '%d%B%Y', 'replace': r'Reviewed in the (.*) on'}, 136 | 'MX': {'MapMonth': MX_MONTH, 'format': '%d%B%Y', 'replace': [r'Revisado en (.*) el', 'de']} 137 | # 'SG': 'https://www.amazon.com.sg' 138 | } 139 | 140 | STANDARD_TIME = '%d-%b-%y' 141 | 142 | 143 | def getAmazonDomain(country): 144 | return RESOURCE[country.upper()] 145 | 146 | 147 | def getDesktopPath(): 148 | key = winreg.OpenKey(winreg.HKEY_CURRENT_USER, r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders') 149 | return winreg.QueryValueEx(key, "Desktop")[0] 150 | 151 | 152 | def is_number(s): 153 | try: 154 | float(s) 155 | return True 156 | except ValueError: 157 | pass 158 | try: 159 | import unicodedata 160 | unicodedata.numeric(s) 161 | return True 162 | except (TypeError, ValueError): 163 | pass 164 | return False 165 | 166 | 167 | amazon_headers = { 168 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;' 169 | 'q=0.8,application/signed-exchange;v=b3', 170 | # 'Host': getAmazonDomain(self.Country), 171 | 'referer': '', 172 | 'accept-encoding': 'gzip, deflate, br', 173 | 'accept-language': 'zh-CN,zh;q=0.9', 174 | 'upgrade-insecure-requests': '1', 175 | 'sec-fetch-mode': 'navigate', 176 | 'sec-fetch-site': 'none', 177 | 'sec-fetch-user': '?1', 178 | 'Connection': 'keep-alive' 179 | } 180 | 181 | 182 | def is_robot(selector): 183 | robot = selector.xpath('//form[@action="/errors/validateCaptcha"]') 184 | return True if robot else False 185 | 186 | 187 | def wait(): 188 | random_time = random.randint(1, 3) 189 | print('等待时间 %s' % random_time) 190 | time.sleep(random_time) 191 | 192 | 193 | def request_message(response, mode): 194 | print(response.status_code) 195 | if response.status_code != 200: 196 | return None 197 | if mode == 'json': 198 | return response.json() 199 | elif mode == 'txt': 200 | return response.text 201 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "32e60b72e043096c547808d11ee227393a957f0ff90c36a0232ec9169dea2972" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "altgraph": { 20 | "hashes": [ 21 | "sha256:d6814989f242b2b43025cba7161fc1b8fb487a62cd49c49245d6fd01c18ac997", 22 | "sha256:ddf5320017147ba7b810198e0b6619bd7b5563aa034da388cea8546b877f9b0c" 23 | ], 24 | "version": "==0.16.1" 25 | }, 26 | "certifi": { 27 | "hashes": [ 28 | "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", 29 | "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" 30 | ], 31 | "version": "==2019.9.11" 32 | }, 33 | "chardet": { 34 | "hashes": [ 35 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 36 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 37 | ], 38 | "version": "==3.0.4" 39 | }, 40 | "fake-useragent": { 41 | "hashes": [ 42 | "sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35" 43 | ], 44 | "index": "pypi", 45 | "version": "==0.1.11" 46 | }, 47 | "future": { 48 | "hashes": [ 49 | "sha256:858e38522e8fd0d3ce8f0c1feaf0603358e366d5403209674c7b617fa0c24093" 50 | ], 51 | "version": "==0.18.1" 52 | }, 53 | "idna": { 54 | "hashes": [ 55 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 56 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 57 | ], 58 | "version": "==2.8" 59 | }, 60 | "lxml": { 61 | "hashes": [ 62 | "sha256:0358b9e9642bc7d39aac5cffe9884a99a5ca68e5e2c1b89e570ed60da9139908", 63 | "sha256:091a359c4dafebbecd3959d9013f1b896b5371859165e4e50b01607a98d9e3e2", 64 | "sha256:1998e4e60603c64bcc35af61b4331ab3af087457900d3980e18d190e17c3a697", 65 | "sha256:2000b4088dee9a41f459fddaf6609bba48a435ce6374bb254c5ccdaa8928c5ba", 66 | "sha256:2afb0064780d8aaf165875be5898c1866766e56175714fa5f9d055433e92d41d", 67 | "sha256:2d8f1d9334a4e3ff176d096c14ded3100547d73440683567d85b8842a53180bb", 68 | "sha256:2e38db22f6a3199fd63675e1b4bd795d676d906869047398f29f38ca55cb453a", 69 | "sha256:3181f84649c1a1ca62b19ddf28436b1b2cb05ae6c7d2628f33872e713994c364", 70 | "sha256:37462170dfd88af8431d04de6b236e6e9c06cda71e2ca26d88ef2332fd2a5237", 71 | "sha256:3a9d8521c89bf6f2a929c3d12ad3ad7392c774c327ea809fd08a13be6b3bc05f", 72 | "sha256:3d0bbd2e1a28b4429f24fd63a122a450ce9edb7a8063d070790092d7343a1aa4", 73 | "sha256:483d60585ce3ee71929cea70949059f83850fa5e12deb9c094ed1c8c2ec73cbd", 74 | "sha256:4888be27d5cba55ce94209baef5bcd7bbd7314a3d17021a5fc10000b3a5f737d", 75 | "sha256:64b0d62e4209170a2a0c404c446ab83b941a0003e96604d2e4f4cb735f8a2254", 76 | "sha256:68010900898fdf139ac08549c4dba8206c584070a960ffc530aebf0c6f2794ef", 77 | "sha256:872ecb066de602a0099db98bd9e57f4cfc1d62f6093d94460c787737aa08f39e", 78 | "sha256:88a32b03f2e4cd0e63f154cac76724709f40b3fc2f30139eb5d6f900521b44ed", 79 | "sha256:b1dc7683da4e67ab2bebf266afa68098d681ae02ce570f0d1117312273d2b2ac", 80 | "sha256:b29e27ce9371810250cb1528a771d047a9c7b0f79630dc7dc5815ff828f4273b", 81 | "sha256:ce197559596370d985f1ce6b7051b52126849d8159040293bf8b98cb2b3e1f78", 82 | "sha256:d45cf6daaf22584eff2175f48f82c4aa24d8e72a44913c5aff801819bb73d11f", 83 | "sha256:e2ff9496322b2ce947ba4a7a5eb048158de9d6f3fe9efce29f1e8dd6878561e6", 84 | "sha256:f7b979518ec1f294a41a707c007d54d0f3b3e1fd15d5b26b7e99b62b10d9a72e", 85 | "sha256:f9c7268e9d16e34e50f8246c4f24cf7353764affd2bc971f0379514c246e3f6b", 86 | "sha256:f9c839806089d79de588ee1dde2dae05dc1156d3355dfeb2b51fde84d9c960ad", 87 | "sha256:ff962953e2389226adc4d355e34a98b0b800984399153c6678f2367b11b4d4b8" 88 | ], 89 | "index": "pypi", 90 | "version": "==4.3.2" 91 | }, 92 | "macholib": { 93 | "hashes": [ 94 | "sha256:ac02d29898cf66f27510d8f39e9112ae00590adb4a48ec57b25028d6962b1ae1", 95 | "sha256:c4180ffc6f909bf8db6cd81cff4b6f601d575568f4d5dee148c830e9851eb9db" 96 | ], 97 | "version": "==1.11" 98 | }, 99 | "pefile": { 100 | "hashes": [ 101 | "sha256:a5d6e8305c6b210849b47a6174ddf9c452b2888340b8177874b862ba6c207645" 102 | ], 103 | "version": "==2019.4.18" 104 | }, 105 | "pyinstaller": { 106 | "hashes": [ 107 | "sha256:a5a6e04a66abfcf8761e89a2ebad937919c6be33a7b8963e1a961b55cb35986b" 108 | ], 109 | "index": "pypi", 110 | "version": "==3.4" 111 | }, 112 | "requests": { 113 | "hashes": [ 114 | "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", 115 | "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" 116 | ], 117 | "index": "pypi", 118 | "version": "==2.21.0" 119 | }, 120 | "urllib3": { 121 | "hashes": [ 122 | "sha256:4c291ca23bbb55c76518905869ef34bdd5f0e46af7afe6861e8375643ffee1a0", 123 | "sha256:9a247273df709c4fedb38c711e44292304f73f39ab01beda9f6b9fc375669ac3" 124 | ], 125 | "index": "pypi", 126 | "version": "==1.24.2" 127 | } 128 | }, 129 | "develop": {} 130 | } 131 | -------------------------------------------------------------------------------- /dispose.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | from lxml import etree 4 | 5 | from utils import getAmazonDomain, LANG_CODE, TIME_CODE, STANDARD_TIME, REVIEW_HELPFUL, REVIEW_COUNTRY 6 | 7 | STARS = r'(\d+)' 8 | HELPFUL = r'(\d+)' 9 | 10 | 11 | class AmazonDispose: 12 | def __init__(self, AmazonData, Country, ASIN): 13 | self.Country = Country 14 | self.ASIN = ASIN 15 | self.selector = etree.HTML(AmazonData) 16 | 17 | def dispose(self): 18 | reviewAll = [] 19 | reviewData = self.selector.xpath('//div[@data-hook="review"]') 20 | if len(reviewData) <= 0: 21 | return None 22 | for review in reviewData: 23 | reviewRow = {} 24 | reviewDate = review.xpath('div/div/span[@data-hook="review-date"]//text()') 25 | reviewCountry = review.xpath('div/div/span[@data-hook="published-on-amzn-text"]//text()') \ 26 | if self.Country == 'CN' else reviewDate 27 | reviewHref = review.xpath('div/div/div[2]/a[@data-hook="review-title"]/@href') 28 | reviewTitle = review.xpath('div/div/div[2]/a[@data-hook="review-title"]/span//text()') 29 | reviewStrip = review.xpath('div/div/div[3]/a[@data-hook="format-strip"]//text()') 30 | reviewVP = review.xpath('div/div/div[3]/span/a/span[@data-hook="avp-badge"]') 31 | 32 | reviewVP = 'vp' if reviewVP else '非vp' 33 | 34 | reviewBuyer = review.xpath('div/div/div[@data-hook="genome-widget"]/a/@href') 35 | reviewBuyerName = \ 36 | review.xpath('div/div/div[@data-hook="genome-widget"]/a/div[@class="a-profile-content"]/span//text()') 37 | reviewStars = \ 38 | review.xpath('div/div/div[2]/a[@class="a-link-normal"]/i[@data-hook="review-star-rating"]/@class') 39 | reviewStars = re.search(STARS, self.getData(reviewStars)) 40 | 41 | reviewStars = reviewStars.group(1) if reviewStars else '' 42 | 43 | reviewHelpful = review.xpath('div/div/div[contains(@class, "review-comments")]/div' 44 | '/span[@data-hook="review-voting-widget"]/div[1]' 45 | '/span[@data-hook="helpful-vote-statement"]//text()') 46 | re_review_helpful = re.search(HELPFUL, self.getData(reviewHelpful)) 47 | 48 | reviewHelpful = re_review_helpful.group(1) if re_review_helpful else self.get_helpful(reviewHelpful) 49 | 50 | reviewContent = review.xpath('div/div/div[4]/span[@data-hook="review-body"]//text()') 51 | # print(self.get_date(reviewDate)) 52 | reviewRow['asin'] = self.ASIN 53 | reviewRow['date'] = self.get_date(reviewDate) 54 | reviewRow['href'] = self.getURLData(reviewHref) 55 | reviewRow['title'] = self.getData(reviewTitle) 56 | reviewRow['format'] = self.getData(reviewStrip) 57 | reviewRow['vp'] = reviewVP 58 | reviewRow['buyer'] = self.getURLData(reviewBuyer) 59 | reviewRow['name'] = self.getData(reviewBuyerName) 60 | reviewRow['stars'] = reviewStars 61 | reviewRow['content'] = self.getData(reviewContent) 62 | reviewRow['helpful'] = reviewHelpful 63 | reviewRow['review_country'] = self.get_country(reviewCountry) 64 | reviewAll.append(reviewRow) 65 | return reviewAll 66 | 67 | def isNextPage(self): 68 | next_page = self.selector.xpath('//li[contains(@class, "a-last")]/@class') 69 | if next_page: 70 | return False if 'a-disabled' in next_page else True 71 | else: 72 | return False 73 | 74 | def getData(self, data): 75 | return ''.join(data).strip().replace('\n', '') if data else '' 76 | 77 | def getURLData(self, data): 78 | return '%s%s' % (getAmazonDomain(self.Country), self.getData(data)) if data else '' 79 | 80 | def get_date(self, data): 81 | date = self.getData(data) 82 | try: 83 | date = date.replace(' ', '') 84 | time_format = TIME_CODE[self.Country] 85 | if type(time_format) == dict: 86 | if 'replace' in time_format: 87 | if type(time_format['replace']) == list: 88 | for replace_item in time_format['replace']: 89 | date = re.sub(self.re_remove_spaces(replace_item), '', date) 90 | # date = date.replace(replace_item.replace(' ', ''), '') 91 | else: 92 | date = re.sub(self.re_remove_spaces(time_format['replace']), '', date) 93 | # date = date.replace(time_format['replace'].replace(' ', ''), '') 94 | if 'MapMonth' in time_format: 95 | for item in time_format['MapMonth']: 96 | date = date.replace(item, time_format['MapMonth'][item]) 97 | time_format = time_format['format'] 98 | time_struct = time.strptime(date, time_format) 99 | return time.strftime(STANDARD_TIME, time_struct) 100 | except (TypeError, ValueError, SyntaxError) as e: 101 | print(e) 102 | return date 103 | 104 | def get_helpful(self, data): 105 | helpful_data = self.getData(data).lower() 106 | helpful = REVIEW_HELPFUL[self.Country] 107 | return 1 if helpful and helpful_data.find(helpful.lower()) > -1 else 0 108 | 109 | def get_country(self, data): 110 | country_data = self.getData(data) 111 | re_country = REVIEW_COUNTRY[self.Country] 112 | if re_country: 113 | country_data = re.search(re_country, country_data) 114 | return country_data.group(1) if country_data else '' 115 | return '' 116 | 117 | def get_selector(self): 118 | return self.selector 119 | 120 | def is_lang(self): 121 | lang = self.selector.xpath('//select[@id="language-type-dropdown"]') 122 | if not lang: 123 | return False 124 | for item in lang: 125 | param = item.xpath('option[@selected]/@value') 126 | for (key, value) in LANG_CODE.items(): 127 | if value == self.getData(param) and key == 'CN': 128 | return True 129 | return False 130 | 131 | @staticmethod 132 | def re_remove_spaces(data): 133 | return re.sub(r' ', '', data) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | from tkinter import ttk 3 | 4 | import random 5 | import time 6 | import threading 7 | 8 | from proxies import Proxy 9 | from export import JsonCsv 10 | from request import AmazonRequests 11 | from dispose import AmazonDispose 12 | from utils import is_number, RESOURCE, is_robot 13 | 14 | 15 | class Application(Frame): 16 | def __init__(self, master=None): 17 | Frame.__init__(self, master) 18 | self.pack() 19 | self.window_init() 20 | self.createWidgets() 21 | self.data = [] 22 | self.requests = '' 23 | self.csv = '' 24 | self.is_lang = False 25 | 26 | def window_init(self): 27 | self.master.title('Amazon评论获取工具 by 素笺 and 凌寒初见') 28 | self.master.resizable(width=FALSE, height=FALSE) 29 | 30 | def createWidgets(self): 31 | # fm2 32 | self.fm2 = Frame(self) 33 | self.fm2_left = Frame(self.fm2) 34 | self.fm2_right = Frame(self.fm2) 35 | self.fm2_left_top = Frame(self.fm2_left) 36 | self.fm2_left_center = Frame(self.fm2_left) 37 | self.fm2_left_bottom = Frame(self.fm2_left) 38 | self.fm2_right_top = Frame(self.fm2_right) 39 | self.fm2_right_bottom = Frame(self.fm2_right) 40 | 41 | self.is_proxies = BooleanVar() 42 | self.checkbutton = Checkbutton(self.fm2_right, text='是否使用代理', variable=self.is_proxies, onvalue=True, 43 | offvalue=False) 44 | self.checkbutton.pack(side=LEFT, padx=5) 45 | 46 | self.siteLabel = Label(self.fm2_left_top, text='站点') 47 | self.siteLabel.pack(side=LEFT, padx=10) 48 | 49 | self.siteBox = ttk.Combobox(self.fm2_left_top, state='readonly', width=17) 50 | self.siteBox.pack(side=LEFT) 51 | self.siteBox['value'] = list(RESOURCE.keys()) 52 | self.siteBox.current(0) 53 | self.fm2_left_top.pack(side=TOP, pady=5) 54 | 55 | self.asinLabel = Label(self.fm2_left_center, text='asin') 56 | self.asinLabel.pack(side=LEFT, padx=10) 57 | 58 | self.asinEntry = Entry(self.fm2_left_center) 59 | self.asinEntry.pack(side=LEFT) 60 | self.fm2_left_center.pack(side=TOP, pady=5) 61 | 62 | self.pageLabel = Label(self.fm2_left_bottom, text='页码') 63 | self.pageLabel.pack(side=LEFT, padx=10) 64 | 65 | pageValue = StringVar() 66 | pageValue.set('1') 67 | 68 | self.pageEntry = Entry(self.fm2_left_bottom, textvariable=pageValue) 69 | self.pageEntry.pack(side=LEFT) 70 | self.fm2_left_bottom.pack(side=TOP, pady=5) 71 | self.fm2_left.pack(side=LEFT) 72 | 73 | self.startButton = Button(self.fm2_right, text='开始获取', command=self.start) 74 | self.startButton.pack() 75 | self.fm2_right.pack(side=LEFT, padx=5) 76 | 77 | self.fm2.pack(side=TOP, pady=10) 78 | 79 | # fm3 80 | self.fm3 = Frame(self) 81 | self.msg = Text(self.fm3) 82 | self.msg.pack() 83 | self.msg.config(state=DISABLED) 84 | self.fm3.pack(side=TOP, fill=X) 85 | 86 | def write_msg(self, msg): 87 | self.msg.config(state=NORMAL) 88 | self.msg.insert(END, '\n' + msg) 89 | self.msg.config(state=DISABLED) 90 | self.msg.see(END) 91 | 92 | def delete_msg(self): 93 | self.msg.config(state=NORMAL) 94 | self.msg.delete(0.0, END) 95 | self.msg.config(state=DISABLED) 96 | 97 | def start(self): 98 | self.is_lang = False 99 | self.delete_msg() 100 | self.startButton.config(state=DISABLED) 101 | site = self.siteBox.get() 102 | asin = self.asinEntry.get() 103 | page = self.pageEntry.get() 104 | if not asin: 105 | self.write_msg('asin 为空,请输入asin') 106 | self.startButton.config(state=NORMAL) 107 | return 108 | if not page: 109 | self.write_msg('页码 为空, 请输入页码') 110 | self.startButton.config(state=NORMAL) 111 | return 112 | try: 113 | page = int(page) 114 | except Exception as e: 115 | print(e) 116 | self.write_msg('出现错误, 原因: 页码不是数字') 117 | self.startButton.config(state=NORMAL) 118 | return 119 | self.write_msg('开始任务...,站点--{},Asin--{}'.format(site, asin)) 120 | if not self.is_proxies.get(): 121 | self.write_msg('不使用代理') 122 | proxies = None 123 | session = None 124 | else: 125 | self.write_msg('使用代理, 正在准备代理') 126 | try: 127 | session, proxies = Proxy(self).get_proxies(site) 128 | if not proxies or type(proxies) != dict: 129 | self.write_msg('代理获取失败, 原因: {}'.format(proxies['msg'] if proxies and 'msg' in proxies else '无')) 130 | self.startButton.config(state=NORMAL) 131 | return 132 | except Exception as e: 133 | print(e) 134 | self.write_msg('出现错误, 原因: {}'.format(e)) 135 | self.startButton.config(state=NORMAL) 136 | return 137 | 138 | #初始化请求类 139 | self.requests = AmazonRequests(site, asin, page, session, proxies) 140 | self.csv = JsonCsv(asin) 141 | t = threading.Thread(target=self.start_download) 142 | self.daemon = t.setDaemon(True) 143 | t.start() 144 | 145 | def start_download(self): 146 | # 解析数据 并存储数据 147 | # 判断asin是否存在 148 | amazon_data = self.requests.getAmaoznData(self.is_lang) 149 | self.write_msg('正在获取第{}页'.format(self.requests.getPage())) 150 | if amazon_data and is_number(amazon_data): 151 | if amazon_data == 404: 152 | self.write_msg('asin不存在,请查看是否输入有误') 153 | if amazon_data == 2: 154 | self.write_msg('请求失败') 155 | if amazon_data == 400: 156 | self.write_msg('服务器无法理解此请求') 157 | self.startButton.config(state=NORMAL) 158 | return 159 | self.write_msg('正在解析数据') 160 | dispose = AmazonDispose(amazon_data, self.siteBox.get(), self.asinEntry.get()) 161 | if is_robot(dispose.get_selector()): 162 | self.write_msg('机器人验证') 163 | self.startButton.config(state=NORMAL) 164 | return 165 | if dispose.is_lang(): 166 | self.is_lang = True 167 | self.write_msg('语言不符合, 重新请求') 168 | self.wait('重新请求') 169 | self.start_download() 170 | return 171 | dic_data = dispose.dispose() 172 | print(dic_data) 173 | # self.write_msg(str(dic_data)) 174 | if dic_data: 175 | self.write_msg('写入数据') 176 | self.csv.writerCsv(dic_data) 177 | else: 178 | self.write_msg('没有数据可以写入') 179 | if dispose.isNextPage(): 180 | self.wait('请求下一页') 181 | self.requests.nextPage() 182 | self.start_download() 183 | else: 184 | self.csv.closeCsv() 185 | self.write_msg('评论获取完毕') 186 | self.startButton.config(state=NORMAL) 187 | 188 | def wait(self, msg): 189 | random_time = random.randint(5, 10) 190 | self.write_msg('等待%s秒,%s' % (random_time, msg)) 191 | time.sleep(random_time) 192 | 193 | 194 | if __name__ == '__main__': 195 | app = Application() 196 | app.mainloop() 197 | -------------------------------------------------------------------------------- /proxies.py: -------------------------------------------------------------------------------- 1 | import random 2 | import asyncio 3 | import requests 4 | import threading 5 | 6 | from lxml import etree 7 | from threading import Timer 8 | from fake_useragent import UserAgent 9 | from multiprocessing import cpu_count 10 | from datetime import datetime, timedelta 11 | 12 | from utils import request_message, getAmazonDomain, is_robot, wait, amazon_headers 13 | 14 | MAX_PROXY_REQUESTS_NUM = 2 15 | 16 | SCANNING_TIME = 0 17 | 18 | MAX_PROXY_POOL_NUM = { 19 | 'AE': 0, 20 | 'CN': 0, 21 | 'JP': 0, 22 | 'US': 0, 23 | 'UK': 0, 24 | 'FR': 0, 25 | 'DE': 0, 26 | 'ES': 0, 27 | 'IT': 0, 28 | 'CA': 0, 29 | 'IN': 0, 30 | 'AU': 0, 31 | 'GB': 0, 32 | 'MX': 0, 33 | 'BR': 0 34 | # 'SG': 'https://www.amazon.com.sg' 35 | } 36 | 37 | proxy_url = 'http://http.tiqu.alicdns.com/getip3?num={num}&type=2&pro=&city=0&yys=0&port=11&time=1&ts=1&ys=0&cs=0' \ 38 | '&lb=1&sb=0&pb=4&mr=2®ions=&gm=4' 39 | 40 | proxies_mate = 'https://{host}:{port}' 41 | 42 | 43 | class Proxy: 44 | _instance_lock = threading.Lock() 45 | 46 | def __new__(cls, application): 47 | if not hasattr(cls, '_instance'): 48 | with Proxy._instance_lock: 49 | if not hasattr(cls, '_instance'): 50 | Proxy._instance = super().__new__(cls) 51 | return Proxy._instance 52 | 53 | def __init__(self, application): 54 | self.application = application 55 | self.agents = {} 56 | self.proxies_num = 0 57 | self.loop = asyncio.get_event_loop() 58 | # self.add_agent() 59 | # timer(self) 60 | 61 | async def agent_pool(self, country=None, proxy_num=1): 62 | proxies_array = [] 63 | self.application.write_msg('开始请求代理') 64 | session, response = self.request(requests.session(), proxy_url 65 | .format(num=min(proxy_num, cpu_count()) if country else 1), types='json') 66 | if 'success' in response and response['success']: 67 | self.application.write_msg('请求代理成功') 68 | self.proxies_num = 0 69 | for item in response['data']: 70 | proxies = { 71 | 'http': proxies_mate.format(host=item['ip'], port=item['port']), 72 | 'https': proxies_mate.format(host=item['ip'], port=item['port']), 73 | } 74 | proxies_array.append({'session': session, 'proxies': proxies, 'expire_time': item['expire_time']}) 75 | if country: 76 | self.application.write_msg('有国家参数, 进行amazon访问处理,判断ip是否有效') 77 | print('有国家参数, 进行amazon访问处理,判断ip是否有效') 78 | proxies_data = {} 79 | for proxies_item in proxies_array: 80 | future = self.loop.run_in_executor(None, self.amazon_robot_check, proxies_item, country) 81 | results = await future 82 | if results: 83 | if country in proxies_data: 84 | proxies_data[country].append(results) 85 | else: 86 | proxies_data[country] = [results] 87 | self.application.write_msg('代理: {}'.format(proxies_data if proxies_data else '代理无效')) 88 | print('获取有效代理对象: ', proxies_data if proxies_data else None) 89 | return proxies_data if proxies_data else None 90 | else: 91 | self.application.write_msg('无国家参数, 不需要处理, 直接返回代理') 92 | print('无国家参数, 不需要处理, 直接返回代理') 93 | return proxies_array.pop() 94 | else: 95 | self.proxies_num += 1 96 | self.application.write_msg('请求代理失败, 正在重试...重试次数为: {}'.format(self.proxies_num)) 97 | if self.proxies_num < MAX_PROXY_REQUESTS_NUM: 98 | wait() 99 | task = asyncio.ensure_future(self.agent_pool(country, proxy_num)) 100 | self.loop.run_until_complete(task) 101 | return task.result() 102 | else: 103 | self.application.write_msg('请求代理失败, 重试已达最大次数') 104 | print('请求代理失败, 重试已达最大次数') 105 | return response 106 | 107 | def amazon_robot_check(self, data, country): 108 | print('正在进行amazon机器人验证') 109 | cur_header = amazon_headers.copy() 110 | cur_header['user-agent'] = UserAgent().random 111 | session, response = self.request(data['session'], getAmazonDomain(country), headers=cur_header, 112 | proxies=data['proxies']) 113 | data['session'] = session 114 | return data if not is_robot(etree.HTML(response)) else None 115 | 116 | def add_agent(self): 117 | # add_agent_arr 需要添加代理的国家 118 | add_agent_arr = [] 119 | print('检测代理是否需要添加') 120 | for item in MAX_PROXY_POOL_NUM: 121 | if MAX_PROXY_POOL_NUM[item] == 0: 122 | continue 123 | cur_proxy_num = len(self.agents[item]) if item in self.agents else 0 124 | if cur_proxy_num >= MAX_PROXY_POOL_NUM[item]: 125 | continue 126 | add_agent_arr.append({'country': item, 'proxy_num': (MAX_PROXY_POOL_NUM[item] - cur_proxy_num)}) 127 | if add_agent_arr: 128 | print('代理需要添加的国家有: ', ','.join([item['country'] for item in add_agent_arr])) 129 | for item in add_agent_arr: 130 | task = asyncio.ensure_future(self.agent_pool(item['country'], item['proxy_num'])) 131 | self.loop.run_until_complete(task) 132 | results = task.result() 133 | if results: 134 | if item['country'] in self.agents: 135 | self.agents[item['country']].extend(results[item['country']]) 136 | else: 137 | self.agents[item['country']] = results[item['country']] 138 | else: 139 | continue 140 | wait() 141 | self.add_agent() 142 | else: 143 | print('代理以达到设定数量, 不在进行添加') 144 | return None 145 | 146 | def remove_expired(self): 147 | print('正在移除过期代理') 148 | for country in self.agents: 149 | for index, item in enumerate(self.agents[country]): 150 | if self.compare_time(item['expire_time']): 151 | print('存在过期代理进行移除') 152 | self.agents[country].pop(index) 153 | self.add_agent() 154 | 155 | def get_proxies(self, country=None): 156 | if country: 157 | if country in self.agents: 158 | index = random.randint(0, len(self.agents[country]) - 1) 159 | agent = self.agents[country][index] 160 | if self.compare_time(agent['expire_time']): 161 | print('取出当前代理发现过期') 162 | self.agents[country].pop(index) 163 | self.add_agent() 164 | return self.get_proxies(country) 165 | return agent['session'], agent['proxies'] 166 | else: 167 | task = asyncio.ensure_future(self.agent_pool(country, proxy_num=1)) 168 | self.loop.run_until_complete(task) 169 | results = task.result() 170 | if results: 171 | agent = random.choice(results[country]) 172 | return agent['session'], agent['proxies'] 173 | wait() 174 | return self.get_proxies(country) 175 | else: 176 | task = asyncio.ensure_future(self.agent_pool()) 177 | self.loop.run_until_complete(task) 178 | return task.result() 179 | 180 | @staticmethod 181 | def request(session, url, headers=None, proxies=None, types='txt'): 182 | response = session.get(url, headers=headers, proxies=proxies, timeout=20) 183 | response.encoding = 'utf-8' 184 | return session, request_message(response, types) 185 | 186 | @staticmethod 187 | def compare_time(date): 188 | # 2020-04-14 11:44:46 189 | now = (datetime.now() + timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') 190 | return datetime.strptime(now, '%Y-%m-%d %H:%M:%S') > datetime.strptime(date, '%Y-%m-%d %H:%M:%S') 191 | 192 | 193 | # 定时器 194 | def timer(proxy): 195 | # 移除过期代理 196 | if SCANNING_TIME == 0: 197 | return None 198 | proxy.remove_expired() 199 | t = Timer(SCANNING_TIME, timer, (proxy,)) 200 | t.start() 201 | --------------------------------------------------------------------------------