├── .gitignore ├── README.md ├── action.py ├── config.py └── script.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MitmAppiumJD 2 | MitmProxy and Appium to Crawl Comments in JD APP 3 | -------------------------------------------------------------------------------- /action.py: -------------------------------------------------------------------------------- 1 | from appium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from time import sleep 6 | from config import * 7 | 8 | 9 | class Action(): 10 | def __init__(self): 11 | """ 12 | 初始化 13 | """ 14 | # 驱动配置 15 | self.desired_caps = { 16 | 'platformName': PLATFORM, 17 | 'deviceName': DEVICE_NAME, 18 | 'appPackage': 'com.jingdong.app.mall', 19 | 'appActivity': 'main.MainActivity' 20 | } 21 | self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps) 22 | self.wait = WebDriverWait(self.driver, TIMEOUT) 23 | 24 | def comments(self): 25 | # 点击进入搜索页面 26 | search = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jingdong.app.mall:id/mp'))) 27 | search.click() 28 | # 输入搜索文本 29 | box = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jd.lib.search:id/search_box_layout'))) 30 | box.set_text(KEYWORD) 31 | # 点击搜索按钮 32 | button = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jd.lib.search:id/search_btn'))) 33 | button.click() 34 | # 点击进入商品详情 35 | view = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jd.lib.search:id/product_list_item'))) 36 | view.click() 37 | # 进入评论详情 38 | tab = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jd.lib.productdetail:id/pd_tab3'))) 39 | tab.click() 40 | 41 | def scroll(self): 42 | while True: 43 | # 模拟拖动 44 | self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y) 45 | sleep(SCROLL_SLEEP_TIME) 46 | 47 | def main(self): 48 | self.comments() 49 | self.scroll() 50 | 51 | 52 | if __name__ == '__main__': 53 | action = Action() 54 | action.main() 55 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # 平台 4 | PLATFORM = 'Android' 5 | 6 | # 设备名称 通过 adb devices -l 获取 7 | DEVICE_NAME = 'MI_NOTE_Pro' 8 | 9 | # APP路径 10 | APP = os.path.abspath('.') + 'jd/.apk' 11 | 12 | # Appium地址 13 | DRIVER_SERVER = 'http://localhost:4723/wd/hub' 14 | # 等待元素加载时间 15 | TIMEOUT = 300 16 | 17 | # 滑动点 18 | FLICK_START_X = 300 19 | FLICK_START_Y = 300 20 | FLICK_DISTANCE = 700 21 | 22 | 23 | # 滑动间隔 24 | SCROLL_SLEEP_TIME = 1 25 | 26 | KEYWORD = '手机' -------------------------------------------------------------------------------- /script.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pymongo 3 | from urllib.parse import unquote 4 | import re 5 | 6 | client = pymongo.MongoClient('localhost') 7 | db = client['jd'] 8 | comments_collection = db['comments'] 9 | products_collection = db['products'] 10 | 11 | def response(flow): 12 | global comments_collection, products_collection 13 | # 提取评论数据 14 | url = 'api.m.jd.com/client.action' 15 | if url in flow.request.url: 16 | pattern = re.compile('sku\".*?\"(\d+)\"') 17 | # Request请求参数中包含商品ID 18 | body = unquote(flow.request.text) 19 | # 提取商品ID 20 | id = re.search(pattern, body).group(1) if re.search(pattern, body) else None 21 | # 提取Response Body 22 | text = flow.response.text 23 | data = json.loads(text) 24 | comments = data.get('commentInfoList') or [] 25 | # 提取评论数据 26 | for comment in comments: 27 | if comment.get('commentInfo') and comment.get('commentInfo').get('commentData'): 28 | info = comment.get('commentInfo') 29 | text = info.get('commentData') 30 | date = info.get('commentDate') 31 | nickname = info.get('userNickName') 32 | pictures = info.get('pictureInfoList') 33 | print(id, nickname, text, date) 34 | comments_collection.insert({ 35 | 'id': id, 36 | 'text': text, 37 | 'date': date, 38 | 'nickname': nickname, 39 | 'pictures': pictures 40 | }) 41 | 42 | url = 'cdnware.m.jd.com' 43 | if url in flow.request.url: 44 | text = flow.response.text 45 | data = json.loads(text) 46 | if data.get('wareInfo') and data.get('wareInfo').get('basicInfo'): 47 | info = data.get('wareInfo').get('basicInfo') 48 | id = info.get('wareId') 49 | name = info.get('name') 50 | images = info.get('wareImage') 51 | print(id, name, images) 52 | products_collection.insert({ 53 | 'id': id, 54 | 'name': name, 55 | 'images': images 56 | }) --------------------------------------------------------------------------------