├── .idea ├── dataSources.local.xml ├── dataSources.xml ├── encodings.xml ├── inspectionProfiles │ └── Project_Default.xml ├── misc.xml ├── modules.xml ├── search.iml ├── vcs.xml └── workspace.xml ├── LICENSE ├── README.md ├── __pycache__ ├── config.cpython-36.pyc ├── config.cpython-37.pyc ├── processor.cpython-36.pyc ├── xhs_wechat_item_script.cpython-36.pyc └── xhs_wechat_noteid_script.cpython-36.pyc ├── config.py ├── idata_xhs.py ├── log.txt ├── processor.py ├── xhs_app.py ├── xhs_web_request.py ├── xhs_wechat.py ├── xhs_wechat_item_script.py └── xhs_wechat_noteid_script.py /.idea/dataSources.local.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | false 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/dataSources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | sqlite.xerial 6 | true 7 | org.sqlite.JDBC 8 | jdbc:sqlite:C:\Users\Chars\PycharmProjects\search\data\project.db 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 10 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/search.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 39 | 40 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 279 | 280 | 281 | 282 | 283 | 303 | 304 | 305 | 325 | 326 | 327 | 347 | 348 | 349 | 369 | 370 | 371 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 1553526660002 408 | 426 | 427 | 428 | 429 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 HhhuYu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/README.md -------------------------------------------------------------------------------- /__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/processor.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/xhs_wechat_item_script.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/xhs_wechat_item_script.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/xhs_wechat_noteid_script.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/xhs_wechat_noteid_script.cpython-36.pyc -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # 平台 4 | PLATFORM = 'Android' 5 | 6 | # 设备名称 通过 adb devices -l 获取 7 | DEVICE_NAME = 'MI_8' 8 | 9 | # APP包名 10 | XHS_PACKAGE = 'com.xingin.xhs' 11 | WECHAT_PACKAGE = 'com.tencent.mm' 12 | APP_PACKAGE = 'com.xingin.xhs' 13 | # 'com.xingin.xhs' 小红书 14 | # 'com.jingdong.app.mall' jd 15 | # 'com.tencent.mm' wechat 16 | 17 | # 入口类名 18 | XHS_ACTIVITY = '.activity.SplashActivity' 19 | WECHAT_ACTIVITY = '.ui.LauncherUI' 20 | 21 | APP_ACTIVITY = '.activity.SplashActivity' 22 | # '.activity.SplashActivity' 小红书 23 | # '.MainFrameActivity' jd 24 | # '.ui.LauncherUI' wechat 25 | 26 | # Appium地址 27 | DRIVER_SERVER = 'http://localhost:4723/wd/hub' 28 | # 等待元素加载时间 29 | TIMEOUT = 300 30 | 31 | # 微信手机号密码 32 | USERNAME = '' 33 | PASSWORD = '' 34 | 35 | # 滑动点 36 | FLICK_START_X = 300 37 | FLICK_START_Y = 300 38 | FLICK_DISTANCE = 700 39 | 40 | # MongoDB配置 41 | MONGO_URL = 'localhost' 42 | 43 | WECHAT_XHS_MONGO_DB = 'wechat' 44 | WECHAT_XHS_MONGO_COLLECTION = 'xhs' 45 | WECHAT_XHS_NOTE_MONGO_COLLECTION = 'noteID' 46 | 47 | XHS_MONGO_DB = 'xhs' 48 | XHS_MONGO_COLLECTION = 'testContent' 49 | XHS_MONGO_ITEM_COLLECTION = 'noteItem' 50 | 51 | # 滑动间隔 52 | SCROLL_SLEEP_TIME = 3 53 | 54 | KEYWORD = '杭州' 55 | 56 | 57 | # webspider 58 | 59 | MONGO_DB = 'taobao' 60 | MONGO_COLLECTION = 'products' 61 | TAOBAO_KEYWORD = 'ipad' 62 | MAX_PAGE = 100 63 | SERVICE_ARGS = ['--load-images=false', '--disk-cache=true'] 64 | -------------------------------------------------------------------------------- /idata_xhs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import time 4 | 5 | import pymongo 6 | 7 | import requests 8 | 9 | from config import * 10 | 11 | API_KEY = "" 12 | API_AREA = "" 13 | url = "http://" + API_AREA + "/post/xiaohongshu_ids" 14 | 15 | 16 | client = pymongo.MongoClient(MONGO_URL) 17 | xhs_db = client[WECHAT_XHS_MONGO_DB] 18 | 19 | 20 | 21 | headers = { 22 | "Accept-Encoding": "gzip", 23 | "Connection": "close" 24 | } 25 | 26 | param = { 27 | 'id' : '5a5f1ea9c8e55d32cbe96617', 28 | 'apikey': API_KEY 29 | } 30 | 31 | 32 | def test(): 33 | r = requests.get(url, headers=headers, params=param) 34 | json_obj = r.json() 35 | print(json_obj) 36 | 37 | 38 | def note_id_read(): 39 | note_id_list = [] 40 | list = xhs_db[WECHAT_XHS_NOTE_MONGO_COLLECTION].find() 41 | for item in list: 42 | note_id_list.append(item['note_id']) 43 | 44 | # print(note_id_list) 45 | return note_id_list 46 | 47 | 48 | def send_quest(node_id): 49 | param["id"] = node_id 50 | count = 0 51 | while True: 52 | try: 53 | r = requests.get(url, headers=headers, params=param) 54 | json_obj = r.json() 55 | # print(json_obj) 56 | if json_obj["retcode"] == "000000": 57 | save_to_mongo(json_obj) 58 | return 59 | except requests.exceptions.RequestException: 60 | continue 61 | except json.decoder.JSONDecodeError: 62 | if count == 3: 63 | print("note_id:", note_id) 64 | return 65 | continue 66 | 67 | 68 | item_count = 0 69 | 70 | 71 | def save_to_mongo(result): 72 | """ 73 | 保存至MongoDB 74 | :param result: 结果 75 | """ 76 | time.sleep(1) 77 | f1 = open('log.txt', 'a', encoding="utf-8") 78 | try: 79 | if xhs_db[XHS_MONGO_ITEM_COLLECTION].insert(result): 80 | string = time.ctime() + ' ' + "#" + str(item_count) + ": " + '存储到MongoDB成功:' + result["data"][0]["id"] 81 | f1.writelines(string + '\n') 82 | print(string) 83 | 84 | except Exception: 85 | string = time.ctime() + ' ' + "#" + str(item_count) + ": " + '存储到MongoDB失败:' + result["data"][0]["id"] 86 | f1.writelines(string + '\n') 87 | print(string) 88 | 89 | f1.close() 90 | 91 | def had_stored_note_id(): 92 | with open('log.txt', 'r', encoding="utf-8") as f: 93 | text = f.readlines() 94 | list = [] 95 | pattern = re.compile(r'成功:(\w+)') 96 | for line in text: 97 | key = pattern.findall(line) 98 | if len(key) != 0: 99 | list.append(key[0]) 100 | return list 101 | 102 | if __name__ == "__main__": 103 | # test() 104 | 105 | # 中断 避免重复项 106 | note_id_list = note_id_read() 107 | had_stored_note_id_list = had_stored_note_id() 108 | note_id_list = [item for item in note_id_list if item not in had_stored_note_id_list] 109 | 110 | # print(note_id_list) 111 | 112 | for note_id in note_id_list: 113 | send_quest(note_id) 114 | item_count = item_count + 1 -------------------------------------------------------------------------------- /processor.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | 4 | 5 | class Processor(): 6 | def date(self, datetime): 7 | """ 8 | 处理时间 9 | :param datetime: 原始时间 10 | :return: 处理后时间 11 | """ 12 | if re.match('\d+分钟前', datetime): 13 | minute = re.match('(\d+)', datetime).group(1) 14 | datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - float(minute) * 60)) 15 | if re.match('\d+小时前', datetime): 16 | hour = re.match('(\d+)', datetime).group(1) 17 | datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - float(hour) * 60 * 60)) 18 | if re.match('昨天', datetime): 19 | datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) 20 | if re.match('\d+天前', datetime): 21 | day = re.match('(\d+)', datetime).group(1) 22 | datetime = time.strftime('%Y-%m-%d', time.localtime(time.time()) - float(day) * 24 * 60 * 60) 23 | return datetime 24 | -------------------------------------------------------------------------------- /xhs_app.py: -------------------------------------------------------------------------------- 1 | from appium import webdriver 2 | from pymongo import MongoClient 3 | from selenium.common.exceptions import NoSuchElementException 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from time import sleep 8 | 9 | from processor import Processor 10 | from config import * 11 | 12 | 13 | class Action(): 14 | 15 | def __init__(self): 16 | """ 17 | 初始化 18 | """ 19 | # 驱动配置 20 | self.desired_caps = { 21 | 'platformName': PLATFORM, 22 | 'deviceName': DEVICE_NAME, 23 | 'appPackage': XHS_PACKAGE, 24 | 'appActivity': XHS_ACTIVITY, 25 | 'noReset': True, 26 | "automationName": "Uiautomator2" 27 | } 28 | global collection 29 | self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps) 30 | self.wait = WebDriverWait(self.driver, TIMEOUT) 31 | self.client = MongoClient(MONGO_URL) 32 | self.db = self.client[XHS_MONGO_DB] 33 | self.collection = self.db[XHS_MONGO_COLLECTION] 34 | self.processor = Processor() 35 | 36 | def enterApp(self): 37 | el1 = self.wait.until(EC.presence_of_element_located((By.ID, 'com.xingin.xhs:id/zs'))) 38 | el1.click() 39 | el2 = self.wait.until(EC.presence_of_element_located((By.ID, "com.xingin.xhs:id/ak5"))) 40 | el2.click() 41 | el3 = self.wait.until(EC.presence_of_element_located((By.ID, "com.xingin.xhs:id/ak4"))) 42 | el3.send_keys(KEYWORD) 43 | el4 = self.wait.until(EC.presence_of_element_located((By.ID, "com.xingin.xhs:id/ak7"))) 44 | el4.click() 45 | el5 = self.wait.until(EC.presence_of_element_located((By.XPATH, 46 | '/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.view.ViewGroup/android.view.ViewGroup/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[1]'))) 47 | el5.click() 48 | 49 | def scroll(self): 50 | while True: 51 | # 当前页面显示的所有状态 52 | items = self.wait.until(EC.presence_of_all_elements_located((By.ID, 'com.xingin.xhs:id/a1z'))) 53 | # 遍历每条状态 54 | for item in items: 55 | try: 56 | # 昵称 57 | nickname = item.find_element_by_id('com.xingin.xhs:id/bhs').get_attribute('text') 58 | # 正文 59 | content = item.find_element_by_id('com.xingin.xhs:id/anl').get_attribute('text') 60 | # 日期 61 | date = item.find_element_by_id('com.xingin.xhs:id/ask').get_attribute('text') 62 | # 处理日期 63 | date = self.processor.date(date) 64 | print(nickname, content, date) 65 | data = { 66 | 'nickname': nickname, 67 | 'content': content, 68 | 'date': date, 69 | } 70 | # 插入MongoDB 71 | self.collection.update({'nickname': nickname, 'content': content}, {'$set': data}, True) 72 | sleep(SCROLL_SLEEP_TIME) 73 | except NoSuchElementException: 74 | pass 75 | # 上滑 76 | self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y) 77 | 78 | def main(self): 79 | self.enterApp() 80 | self.scroll() 81 | 82 | 83 | if __name__ == '__main__': 84 | action = Action() 85 | action.main() -------------------------------------------------------------------------------- /xhs_web_request.py: -------------------------------------------------------------------------------- 1 | from urllib. request import ProxyHandler, build_opener 2 | import requests 3 | 4 | proxy ='127.0.0.1:9743' 5 | 6 | proxies = { 7 | 'http':'http://' + proxy, 8 | 'https':'https://' + proxy 9 | } 10 | 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' 13 | } 14 | 15 | try: 16 | response = requests.get('http://httpbin.org/get', proxies=proxies, headers=headers) 17 | print(response.text) 18 | except requests.exceptions.ConnectionError as e: 19 | print('Error', e.args) -------------------------------------------------------------------------------- /xhs_wechat.py: -------------------------------------------------------------------------------- 1 | 2 | from appium import webdriver 3 | 4 | from selenium.common.exceptions import NoSuchElementException, WebDriverException 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from pymongo import MongoClient 9 | from time import sleep 10 | from processor import Processor 11 | from config import * 12 | 13 | 14 | class XHS(): 15 | 16 | index=1 17 | def __init__(self): 18 | """ 19 | 初始化 20 | """ 21 | # 驱动配置 22 | self.desired_caps = { 23 | 'platformName': PLATFORM, 24 | 'deviceName': DEVICE_NAME, 25 | 'appPackage': WECHAT_PACKAGE, 26 | 'appActivity': WECHAT_ACTIVITY, 27 | 'noReset': True, 28 | "automationName": "Uiautomator2" 29 | } 30 | self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps) 31 | self.wait = WebDriverWait(self.driver, TIMEOUT) 32 | self.client = MongoClient(MONGO_URL) 33 | self.db = self.client[WECHAT_XHS_MONGO_DB] 34 | self.collection = self.db[WECHAT_XHS_MONGO_COLLECTION] 35 | # 处理器 36 | self.processor = Processor() 37 | 38 | def login(self): 39 | """ 40 | 登录微信 41 | :return: 42 | """ 43 | # 登录按钮 44 | login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.tencent.mm:id/cjk'))) 45 | login.click() 46 | # 手机输入 47 | phone = self.wait.until(EC.presence_of_element_located((By.ID, 'com.tencent.mm:id/h2'))) 48 | phone.set_text(USERNAME) 49 | # 下一步 50 | next = self.wait.until(EC.element_to_be_clickable((By.ID, 'com.tencent.mm:id/adj'))) 51 | next.click() 52 | # 密码 53 | password = self.wait.until( 54 | EC.presence_of_element_located((By.XPATH, '//*[@resource-id="com.tencent.mm:id/h2"][1]'))) 55 | password.set_text(PASSWORD) 56 | # 提交 57 | submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'com.tencent.mm:id/adj'))) 58 | submit.click() 59 | 60 | 61 | def enter(self, index=1): 62 | """ 63 | 进入小红书 64 | :return: 65 | """ 66 | # 选项卡 67 | tab = self.wait.until( 68 | EC.presence_of_element_located((By.XPATH, '//android.widget.FrameLayout[@content-desc="当前所在页面,与的聊天"]/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.LinearLayout/android.widget.RelativeLayout[3]'))) 69 | tab.click() 70 | # 小程序 71 | app = self.wait.until(EC.presence_of_element_located((By.XPATH, "//android.widget.FrameLayout[@content-desc=\"当前所在页面,与的聊天\"]/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout/com.tencent.mm.ui.mogic.WxViewPager/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.ListView/android.widget.LinearLayout[9]"))) 72 | app.click() 73 | 74 | xhs = self.wait.until(EC.presence_of_element_located((By.XPATH, "//android.widget.FrameLayout[@content-desc=\"当前所在页面,小程序\"]/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[2]/android.widget.FrameLayout[2]/android.support.v7.widget.RecyclerView/android.widget.RelativeLayout[1]"))) 75 | xhs.click() 76 | 77 | search = self.wait.until(EC.presence_of_element_located((By.XPATH, 78 | "/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.webkit.WebView/android.view.View/android.view.View[2]/android.view.View/android.view.View[1]/android.view.View/android.view.View[2]/android.view.View"))) 79 | search.click() 80 | 81 | # 爬取内容or爬取note_id 82 | 83 | # 这句注释掉就是爬取note_id 84 | self.card_selete() 85 | 86 | 87 | 88 | def card_selete(self): 89 | card = self.wait.until(EC.presence_of_element_located((By.XPATH, 90 | "/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.webkit.WebView/android.view.View[2]/android.view.View[2]/android.view.View/android.view.View/android.view.View[2]/android.view.View[1]/android.view.View[3]/android.view.View/android.view.View/android.view.View[" + str(self.index) + "]/android.view.View[1]/android.widget.Button/android.view.View[1]/android.view.View/android.widget.Image"))) 91 | 92 | card.click() 93 | 94 | self.index = self.index + 1 95 | sleep(SCROLL_SLEEP_TIME) 96 | 97 | def back_button(self): 98 | back = self.wait.until(EC.presence_of_element_located((By.XPATH, 99 | "/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.webkit.WebView/android.view.View/android.view.View[1]/android.view.View/android.view.View[1]/android.view.View/android.view.View[1]/android.view.View/android.widget.Image"))) 100 | back.click() 101 | sleep(SCROLL_SLEEP_TIME) 102 | 103 | def crawl(self): 104 | """ 105 | 爬取 106 | :return: 107 | """ 108 | while True: 109 | 110 | # 上滑 111 | try: 112 | self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y) 113 | except WebDriverException: 114 | self.back_button() 115 | self.card_selete() 116 | 117 | sleep(SCROLL_SLEEP_TIME) 118 | 119 | 120 | def main(self): 121 | """ 122 | 入口 123 | :return: 124 | """ 125 | # 登录 126 | # self.login() 127 | # 进入小红书 128 | self.enter() 129 | # 爬取 130 | self.crawl() 131 | 132 | 133 | if __name__ == '__main__': 134 | xhs = XHS() 135 | xhs.main() 136 | -------------------------------------------------------------------------------- /xhs_wechat_item_script.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pymongo 3 | from time import sleep 4 | from config import * 5 | 6 | 7 | def response(flow): 8 | global collection 9 | client = pymongo.MongoClient(MONGO_URL) 10 | db = client[WECHAT_XHS_MONGO_DB] 11 | collection = db[WECHAT_XHS_MONGO_COLLECTION] 12 | 13 | 14 | url = 'https://www.xiaohongshu.com/sapi/wx_mp_api/sns/v1/note/' 15 | if flow.request.url.startswith(url): 16 | # 数据的解析 17 | # print(flow.request.url) 18 | for item in json.loads(flow.response.text)["data"]: 19 | comment_list = [] 20 | for comment in item["comment_list"]: 21 | content = comment["content"] 22 | user = comment["user"]["name"] 23 | comment_item = [user, content] 24 | comment_list.append(comment_item) 25 | 26 | for note in item["note_list"]: 27 | note_id = note["id"] 28 | comment_list.append(note_id) 29 | user = note["user"]["name"] 30 | collect_count = note["collected_count"] 31 | comment_count = note["comments_count"] 32 | like_count = note["liked_count"] 33 | share_count = note["shared_count"] 34 | description = note["desc"] 35 | img_ist = note["images_list"] 36 | date = note["time"] 37 | 38 | content = { 39 | 'note_id': note_id, 40 | 'user': user, 41 | 'description': description, 42 | 'collect_count': collect_count, 43 | 'comment_count': comment_count, 44 | 'like_count': like_count, 45 | 'share_count': share_count, 46 | 'img_list': img_ist, 47 | 'date': date, 48 | 'comment': comment_list 49 | } 50 | 51 | collection.insert(content) 52 | # print(content) 53 | # sleep(SCROLL_SLEEP_TIME) 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /xhs_wechat_noteid_script.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pymongo 3 | 4 | from config import * 5 | 6 | 7 | def response(flow): 8 | global collection 9 | client = pymongo.MongoClient(MONGO_URL) 10 | db = client[WECHAT_XHS_MONGO_DB] 11 | collection = db[WECHAT_XHS_NOTE_MONGO_COLLECTION] 12 | 13 | url1 = 'https://www.xiaohongshu.com/sapi/wx_mp_api/sns/v1/search/notes?' 14 | url2 = 'https://www.xiaohongshu.com/fe_api/burdock/v1/page/' 15 | if flow.request.url.startswith(url1): 16 | # 数据的解析 17 | print(flow.request.url) 18 | 19 | notes = json.loads(flow.response.text)["data"]["notes"] 20 | for note in notes: 21 | note_id = note["id"] 22 | img_list = note["images_list"] 23 | title = note["title"] 24 | user = note["user"] 25 | 26 | content = { 27 | "note_id": note_id, 28 | "img_list": img_list, 29 | "title": title, 30 | "user":user 31 | } 32 | 33 | collection.insert(content) 34 | 35 | elif flow.request.url.startswith(url2): 36 | print(flow.request.url) 37 | 38 | notes = json.loads(flow.response.text)["data"] 39 | for note in notes: 40 | note_id = note["id"] 41 | img_list = note["cover"] 42 | title = note["title"] 43 | user = note["user"] 44 | 45 | content = { 46 | "note_id": note_id, 47 | "img_list": img_list, 48 | "title": title, 49 | "user": user 50 | } 51 | 52 | collection.insert(content) 53 | --------------------------------------------------------------------------------