├── .idea
├── dataSources.local.xml
├── dataSources.xml
├── encodings.xml
├── inspectionProfiles
│ └── Project_Default.xml
├── misc.xml
├── modules.xml
├── search.iml
├── vcs.xml
└── workspace.xml
├── LICENSE
├── README.md
├── __pycache__
├── config.cpython-36.pyc
├── config.cpython-37.pyc
├── processor.cpython-36.pyc
├── xhs_wechat_item_script.cpython-36.pyc
└── xhs_wechat_noteid_script.cpython-36.pyc
├── config.py
├── idata_xhs.py
├── log.txt
├── processor.py
├── xhs_app.py
├── xhs_web_request.py
├── xhs_wechat.py
├── xhs_wechat_item_script.py
└── xhs_wechat_noteid_script.py
/.idea/dataSources.local.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | false
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.idea/dataSources.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | sqlite.xerial
6 | true
7 | org.sqlite.JDBC
8 | jdbc:sqlite:C:\Users\Chars\PycharmProjects\search\data\project.db
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/search.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
164 |
165 |
166 |
167 | comment
168 | processor
169 | android.view.View
170 | KEY
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 | true
217 |
218 | true
219 | true
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 | 1553526660002
408 |
409 |
410 | 1553526660002
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 |
594 |
595 |
596 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 |
663 |
664 |
665 |
666 |
667 |
668 |
669 |
670 |
671 |
672 |
673 |
674 |
675 |
676 |
677 |
678 |
679 |
680 |
681 |
682 |
683 |
684 |
685 |
686 |
687 |
688 |
689 |
690 |
691 |
692 |
693 |
694 |
695 |
696 |
697 |
698 |
699 |
700 |
701 |
702 |
703 |
704 |
705 |
706 |
707 |
708 |
709 |
710 |
711 |
712 |
713 |
714 |
715 |
716 |
717 |
718 |
719 |
720 |
721 |
722 |
723 |
724 |
725 |
726 |
727 |
728 |
729 |
730 |
731 |
732 |
733 |
734 |
735 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 HhhuYu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/README.md
--------------------------------------------------------------------------------
/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/config.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/config.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/processor.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/processor.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/xhs_wechat_item_script.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/xhs_wechat_item_script.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/xhs_wechat_noteid_script.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charstal/xhs_simple_crawler/7612d3a7eff7d4b2ded8bbb54bf05f201c607e22/__pycache__/xhs_wechat_noteid_script.cpython-36.pyc
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import os

# Target platform
PLATFORM = 'Android'

# Device name; obtain it with `adb devices -l`
DEVICE_NAME = 'MI_8'

# App package names
XHS_PACKAGE = 'com.xingin.xhs'
WECHAT_PACKAGE = 'com.tencent.mm'
APP_PACKAGE = 'com.xingin.xhs'
# 'com.xingin.xhs' Xiaohongshu
# 'com.jingdong.app.mall' jd
# 'com.tencent.mm' wechat

# Entry activity class names
XHS_ACTIVITY = '.activity.SplashActivity'
WECHAT_ACTIVITY = '.ui.LauncherUI'

APP_ACTIVITY = '.activity.SplashActivity'
# '.activity.SplashActivity' Xiaohongshu
# '.MainFrameActivity' jd
# '.ui.LauncherUI' wechat

# Appium server address
DRIVER_SERVER = 'http://localhost:4723/wd/hub'
# How long to wait for an element to load
TIMEOUT = 300

# WeChat phone number and password (left blank here)
USERNAME = ''
PASSWORD = ''

# Swipe start point and swipe distance
FLICK_START_X = 300
FLICK_START_Y = 300
FLICK_DISTANCE = 700

# MongoDB settings
MONGO_URL = 'localhost'

WECHAT_XHS_MONGO_DB = 'wechat'
WECHAT_XHS_MONGO_COLLECTION = 'xhs'
WECHAT_XHS_NOTE_MONGO_COLLECTION = 'noteID'

XHS_MONGO_DB = 'xhs'
XHS_MONGO_COLLECTION = 'testContent'
XHS_MONGO_ITEM_COLLECTION = 'noteItem'

# Pause between swipes
SCROLL_SLEEP_TIME = 3

# Search keyword typed into the app (the string means "Hangzhou")
KEYWORD = '杭州'


# webspider

MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'
TAOBAO_KEYWORD = 'ipad'
MAX_PAGE = 100
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
64 |
--------------------------------------------------------------------------------
/idata_xhs.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import time
4 |
5 | import pymongo
6 |
7 | import requests
8 |
9 | from config import *
10 |
# API key and host of the remote note-detail service. Both are empty here, so
# `url` evaluates to "http:///post/xiaohongshu_ids" until they are filled in.
API_KEY = ""
API_AREA = ""
url = "http://" + API_AREA + "/post/xiaohongshu_ids"


# Shared MongoDB handle; the functions below index it by collection name.
client = pymongo.MongoClient(MONGO_URL)
xhs_db = client[WECHAT_XHS_MONGO_DB]



# HTTP headers sent with every request.
headers = {
    "Accept-Encoding": "gzip",
    "Connection": "close"
}

# Default query parameters; 'id' holds a sample note id used by test().
param = {
    'id' : '5a5f1ea9c8e55d32cbe96617',
    'apikey': API_KEY
}
30 |
31 |
def test():
    """Send one request with the default parameters and print the decoded JSON."""
    print(requests.get(url, params=param, headers=headers).json())
36 |
37 |
def note_id_read():
    """Return the ``note_id`` field of every document in the note-ID collection.

    :return: list of note ids, in the order the cursor yields them
    """
    # The original bound the cursor to a local named `list`, shadowing the built-in.
    cursor = xhs_db[WECHAT_XHS_NOTE_MONGO_COLLECTION].find()
    return [doc['note_id'] for doc in cursor]
46 |
47 |
def send_quest(node_id, max_retries=3):
    """Fetch one note from the remote API and store it on success.

    The request is attempted at most ``max_retries`` times. An attempt fails
    when the HTTP request raises, the body is not valid JSON, or the payload's
    ``retcode`` is not ``"000000"``. When every attempt fails the note id is
    printed so the caller can see which note was skipped.

    :param node_id: id of the note to fetch
    :param max_retries: maximum number of attempts before giving up
    """
    # Build a per-call copy instead of mutating the shared module-level dict.
    params = dict(param, id=node_id)

    for _ in range(max_retries):
        try:
            r = requests.get(url, headers=headers, params=params)
            json_obj = r.json()
        except requests.exceptions.RequestException:
            continue
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            continue

        if isinstance(json_obj, dict) and json_obj.get("retcode") == "000000":
            save_to_mongo(json_obj)
            return

    # The original never incremented its retry counter (so it could loop
    # forever) and printed the global `note_id` rather than this parameter.
    print("note_id:", node_id)
66 |
67 |
item_count = 0


def save_to_mongo(result):
    """Insert an API result into MongoDB and append the outcome to ``log.txt``.

    The log line has the form ``<ctime> #<item_count>: <message><note id>``;
    ``had_stored_note_id`` later parses the success lines to resume a run.

    :param result: decoded API response; the note id is read from
        ``result["data"][0]["id"]``
    """
    time.sleep(1)

    # Resolve the id up front so a malformed payload cannot raise while the
    # log line is being built.
    try:
        note_id = str(result["data"][0]["id"])
    except (KeyError, IndexError, TypeError):
        note_id = "<unknown>"

    try:
        # insert_one replaces the deprecated Collection.insert.
        xhs_db[XHS_MONGO_ITEM_COLLECTION].insert_one(result)
        outcome = '存储到MongoDB成功:'
    except Exception:
        # Best-effort: a failed insert is logged, not raised, so the crawl goes on.
        outcome = '存储到MongoDB失败:'

    string = time.ctime() + ' ' + "#" + str(item_count) + ": " + outcome + note_id
    # The context manager closes the log file even if writing fails.
    with open('log.txt', 'a', encoding="utf-8") as log_file:
        log_file.write(string + '\n')
    print(string)
90 |
def had_stored_note_id():
    """Return the note ids that ``log.txt`` records as stored successfully.

    A line counts when it contains the success marker followed by an id; only
    the first id on a line is taken. A missing log file (first run) yields an
    empty list instead of raising.

    :return: list of note ids in log order
    """
    pattern = re.compile(r'成功:(\w+)')
    try:
        with open('log.txt', 'r', encoding="utf-8") as f:
            matches = (pattern.search(line) for line in f)
            return [m.group(1) for m in matches if m]
    except FileNotFoundError:
        return []
101 |
if __name__ == "__main__":
    # test()

    # Resume after an interruption: skip ids the log already marks as stored.
    # A set makes each membership test O(1) instead of scanning a list.
    had_stored_note_ids = set(had_stored_note_id())
    note_id_list = [item for item in note_id_read() if item not in had_stored_note_ids]

    for note_id in note_id_list:
        send_quest(note_id)
        # item_count is read by save_to_mongo when it formats the log line.
        item_count = item_count + 1
--------------------------------------------------------------------------------
/processor.py:
--------------------------------------------------------------------------------
1 | import time
2 | import re
3 |
4 |
class Processor():
    """Converts relative Chinese timestamps into ``YYYY-MM-DD`` dates."""

    _SECONDS_PER_DAY = 24 * 60 * 60

    # (pattern, seconds per unit) for "<n> minutes/hours/days ago".
    _RELATIVE_UNITS = (
        (re.compile(r'(\d+)分钟前'), 60),
        (re.compile(r'(\d+)小时前'), 60 * 60),
        (re.compile(r'(\d+)天前'), _SECONDS_PER_DAY),
    )
    _YESTERDAY = re.compile(r'昨天')

    def date(self, datetime, now=None):
        """Normalize a scraped timestamp.

        Strings starting with "<n>分钟前", "<n>小时前", "昨天" or "<n>天前"
        are turned into the local calendar date that lies that far before
        ``now``. Any other string is returned unchanged.

        :param datetime: raw timestamp text
        :param now: reference time in seconds since the epoch; defaults to
            the current time
        :return: ``YYYY-MM-DD`` date string, or the input if it is not relative
        """
        if now is None:
            now = time.time()

        offset = None
        if self._YESTERDAY.match(datetime):
            offset = self._SECONDS_PER_DAY
        else:
            for pattern, unit_seconds in self._RELATIVE_UNITS:
                found = pattern.match(datetime)
                if found:
                    offset = float(found.group(1)) * unit_seconds
                    break

        if offset is None:
            return datetime
        # Subtract before calling localtime: the original "N天前" branch
        # subtracted from the struct_time itself and raised TypeError.
        return time.strftime('%Y-%m-%d', time.localtime(now - offset))
24 |
--------------------------------------------------------------------------------
/xhs_app.py:
--------------------------------------------------------------------------------
1 | from appium import webdriver
2 | from pymongo import MongoClient
3 | from selenium.common.exceptions import NoSuchElementException
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.ui import WebDriverWait
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from time import sleep
8 |
9 | from processor import Processor
10 | from config import *
11 |
12 |
class Action():
    """Drives the Xiaohongshu Android app through Appium and saves posts to MongoDB."""

    def __init__(self):
        """Open the Appium session and the MongoDB collection that receives results."""
        # Driver capabilities
        self.desired_caps = {
            'platformName': PLATFORM,
            'deviceName': DEVICE_NAME,
            'appPackage': XHS_PACKAGE,
            'appActivity': XHS_ACTIVITY,
            'noReset': True,
            "automationName": "Uiautomator2"
        }
        # The original declared an unused `global collection` here; only the
        # instance attribute below is ever read.
        self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps)
        self.wait = WebDriverWait(self.driver, TIMEOUT)
        self.client = MongoClient(MONGO_URL)
        self.db = self.client[XHS_MONGO_DB]
        self.collection = self.db[XHS_MONGO_COLLECTION]
        self.processor = Processor()

    def enterApp(self):
        """Click through to the search box, submit KEYWORD and open the first located result tile."""
        el1 = self.wait.until(EC.presence_of_element_located((By.ID, 'com.xingin.xhs:id/zs')))
        el1.click()
        el2 = self.wait.until(EC.presence_of_element_located((By.ID, "com.xingin.xhs:id/ak5")))
        el2.click()
        el3 = self.wait.until(EC.presence_of_element_located((By.ID, "com.xingin.xhs:id/ak4")))
        el3.send_keys(KEYWORD)
        el4 = self.wait.until(EC.presence_of_element_located((By.ID, "com.xingin.xhs:id/ak7")))
        el4.click()
        el5 = self.wait.until(EC.presence_of_element_located((By.XPATH,
            '/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.view.ViewGroup/android.view.ViewGroup/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[1]')))
        el5.click()

    def scroll(self):
        """Loop forever: scrape the visible items, upsert them, then swipe up."""
        while True:
            # All items currently shown on the page
            items = self.wait.until(EC.presence_of_all_elements_located((By.ID, 'com.xingin.xhs:id/a1z')))
            for item in items:
                try:
                    # Nickname
                    nickname = item.find_element_by_id('com.xingin.xhs:id/bhs').get_attribute('text')
                    # Body text
                    content = item.find_element_by_id('com.xingin.xhs:id/anl').get_attribute('text')
                    # Date, normalized by the processor
                    date = item.find_element_by_id('com.xingin.xhs:id/ask').get_attribute('text')
                    date = self.processor.date(date)
                    print(nickname, content, date)
                    data = {
                        'nickname': nickname,
                        'content': content,
                        'date': date,
                    }
                    # Upsert keyed on nickname + content so re-scraped items
                    # are not duplicated. update_one(upsert=True) replaces the
                    # deprecated Collection.update(filter, doc, True).
                    self.collection.update_one(
                        {'nickname': nickname, 'content': content},
                        {'$set': data},
                        upsert=True,
                    )
                    sleep(SCROLL_SLEEP_TIME)
                except NoSuchElementException:
                    # An item lacking one of the three fields is skipped on purpose.
                    continue
            # Swipe up
            self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y)

    def main(self):
        """Entry point: navigate to the search results, then scrape indefinitely."""
        self.enterApp()
        self.scroll()
81 |
82 |
if __name__ == '__main__':
    # Build the driver-backed crawler and run it.
    Action().main()
--------------------------------------------------------------------------------
/xhs_web_request.py:
--------------------------------------------------------------------------------
1 | from urllib. request import ProxyHandler, build_opener
2 | import requests
3 |
# Local proxy that the test request is routed through.
proxy = '127.0.0.1:9743'

# NOTE(review): the 'https' entry uses an https:// proxy URL, which requires
# the proxy itself to speak TLS - confirm this is intended for a local proxy.
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

try:
    # requests applies no timeout by default; without one an unresponsive
    # proxy would block this script forever.
    response = requests.get('http://httpbin.org/get', proxies=proxies, headers=headers, timeout=10)
    print(response.text)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
    print('Error', e.args)
--------------------------------------------------------------------------------
/xhs_wechat.py:
--------------------------------------------------------------------------------
1 |
2 | from appium import webdriver
3 |
4 | from selenium.common.exceptions import NoSuchElementException, WebDriverException
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.support.ui import WebDriverWait
7 | from selenium.webdriver.support import expected_conditions as EC
8 | from pymongo import MongoClient
9 | from time import sleep
10 | from processor import Processor
11 | from config import *
12 |
13 |
class XHS():
    """Drives WeChat through Appium to open the Xiaohongshu mini program and browse it."""

    # 1-based position of the next result card to open; card_selete() splices
    # it into its XPath and then increments it.
    index=1
    def __init__(self):
        """
        Initialize the Appium session and the MongoDB collection handle.
        """
        # Driver capabilities
        self.desired_caps = {
            'platformName': PLATFORM,
            'deviceName': DEVICE_NAME,
            'appPackage': WECHAT_PACKAGE,
            'appActivity': WECHAT_ACTIVITY,
            'noReset': True,
            "automationName": "Uiautomator2"
        }
        self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps)
        self.wait = WebDriverWait(self.driver, TIMEOUT)
        self.client = MongoClient(MONGO_URL)
        self.db = self.client[WECHAT_XHS_MONGO_DB]
        self.collection = self.db[WECHAT_XHS_MONGO_COLLECTION]
        # Timestamp processor
        self.processor = Processor()

    def login(self):
        """
        Log in to WeChat with USERNAME and PASSWORD.
        :return:
        """
        # Login button
        login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.tencent.mm:id/cjk')))
        login.click()
        # Phone number input
        phone = self.wait.until(EC.presence_of_element_located((By.ID, 'com.tencent.mm:id/h2')))
        phone.set_text(USERNAME)
        # Next step
        next = self.wait.until(EC.element_to_be_clickable((By.ID, 'com.tencent.mm:id/adj')))
        next.click()
        # Password
        password = self.wait.until(
            EC.presence_of_element_located((By.XPATH, '//*[@resource-id="com.tencent.mm:id/h2"][1]')))
        password.set_text(PASSWORD)
        # Submit
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'com.tencent.mm:id/adj')))
        submit.click()


    def enter(self, index=1):
        """
        Navigate from WeChat into the Xiaohongshu mini program and open its search.

        The ``index`` parameter is not read by this method.
        :return:
        """
        # Tab
        tab = self.wait.until(
            EC.presence_of_element_located((By.XPATH, '//android.widget.FrameLayout[@content-desc="当前所在页面,与的聊天"]/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.LinearLayout/android.widget.RelativeLayout[3]')))
        tab.click()
        # Mini programs entry
        app = self.wait.until(EC.presence_of_element_located((By.XPATH, "//android.widget.FrameLayout[@content-desc=\"当前所在页面,与的聊天\"]/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout/com.tencent.mm.ui.mogic.WxViewPager/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.ListView/android.widget.LinearLayout[9]")))
        app.click()

        # First entry in the mini-program list (presumably Xiaohongshu - verify on device)
        xhs = self.wait.until(EC.presence_of_element_located((By.XPATH, "//android.widget.FrameLayout[@content-desc=\"当前所在页面,小程序\"]/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout[2]/android.widget.FrameLayout[2]/android.support.v7.widget.RecyclerView/android.widget.RelativeLayout[1]")))
        xhs.click()

        search = self.wait.until(EC.presence_of_element_located((By.XPATH,
            "/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.webkit.WebView/android.view.View/android.view.View[2]/android.view.View/android.view.View[1]/android.view.View/android.view.View[2]/android.view.View")))
        search.click()

        # Crawl note content, or crawl note ids

        # Comment out the next line to crawl note ids instead
        self.card_selete()



    def card_selete(self):
        """Click the result card at position ``self.index``, advance the index, then pause."""
        card = self.wait.until(EC.presence_of_element_located((By.XPATH,
            "/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.webkit.WebView/android.view.View[2]/android.view.View[2]/android.view.View/android.view.View/android.view.View[2]/android.view.View[1]/android.view.View[3]/android.view.View/android.view.View/android.view.View[" + str(self.index) + "]/android.view.View[1]/android.widget.Button/android.view.View[1]/android.view.View/android.widget.Image")))

        card.click()

        # Assigning through self creates/updates an instance attribute; the
        # class-level default of 1 is left untouched.
        self.index = self.index + 1
        sleep(SCROLL_SLEEP_TIME)

    def back_button(self):
        """Click the element located by the back-button XPath, then pause."""
        back = self.wait.until(EC.presence_of_element_located((By.XPATH,
            "/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.RelativeLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout[1]/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.webkit.WebView/android.view.View/android.view.View[1]/android.view.View/android.view.View[1]/android.view.View/android.view.View[1]/android.view.View/android.widget.Image")))
        back.click()
        sleep(SCROLL_SLEEP_TIME)

    def crawl(self):
        """
        Swipe up forever; when a swipe raises WebDriverException, go back and open the next card.
        :return:
        """
        while True:

            # Swipe up
            try:
                self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y)
            except WebDriverException:
                self.back_button()
                self.card_selete()

            sleep(SCROLL_SLEEP_TIME)


    def main(self):
        """
        Entry point: enter the mini program, then crawl. Login is currently disabled.
        :return:
        """
        # Log in
        # self.login()
        # Enter Xiaohongshu
        self.enter()
        # Crawl
        self.crawl()
132 |
if __name__ == '__main__':
    # Start the WeChat-driven crawler.
    XHS().main()
136 |
--------------------------------------------------------------------------------
/xhs_wechat_item_script.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pymongo
3 | from time import sleep
4 | from config import *
5 |
6 |
# MongoDB collection handle, created on the first matching response and reused.
collection = None


def response(flow):
    """Store note details found in an intercepted note-API response.

    Presumably loaded as a mitmproxy script (the ``response(flow)`` hook shape
    suggests it) - confirm against how the script is launched. Responses whose
    request URL does not start with the note API prefix are ignored before any
    database work is done.

    :param flow: intercepted exchange exposing ``request.url`` and ``response.text``
    """
    global collection

    url = 'https://www.xiaohongshu.com/sapi/wx_mp_api/sns/v1/note/'
    if not flow.request.url.startswith(url):
        return

    # The original built a new MongoClient for every response, matching or
    # not; the handle is now created once and cached. Compare with `is None`
    # because pymongo collections do not support truth-value testing.
    if collection is None:
        client = pymongo.MongoClient(MONGO_URL)
        collection = client[WECHAT_XHS_MONGO_DB][WECHAT_XHS_MONGO_COLLECTION]

    # Parse the payload
    for item in json.loads(flow.response.text)["data"]:
        comment_list = [
            [comment["user"]["name"], comment["content"]]
            for comment in item["comment_list"]
        ]

        for note in item["note_list"]:
            note_id = note["id"]
            # NOTE(review): the note id is appended to the comment list, as in
            # the original - confirm this is intended.
            comment_list.append(note_id)

            content = {
                'note_id': note_id,
                'user': note["user"]["name"],
                'description': note["desc"],
                'collect_count': note["collected_count"],
                'comment_count': note["comments_count"],
                'like_count': note["liked_count"],
                'share_count': note["shared_count"],
                'img_list': note["images_list"],
                'date': note["time"],
                'comment': comment_list
            }

            # insert_one replaces the deprecated Collection.insert.
            collection.insert_one(content)
52 | # print(content)
53 | # sleep(SCROLL_SLEEP_TIME)
54 |
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/xhs_wechat_noteid_script.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pymongo
3 |
4 | from config import *
5 |
6 |
# MongoDB collection handle, created on the first matching response and reused.
collection = None


def response(flow):
    """Store note ids found in intercepted search/page API responses.

    Presumably loaded as a mitmproxy script (the ``response(flow)`` hook shape
    suggests it) - confirm against how the script is launched. Two endpoints
    are recognized by URL prefix: the search endpoint, whose notes live under
    ``data.notes`` with images in ``images_list``, and the page endpoint, whose
    notes are ``data`` itself with the image in ``cover``. Any other URL is
    ignored before any database work is done.

    :param flow: intercepted exchange exposing ``request.url`` and ``response.text``
    """
    global collection

    url1 = 'https://www.xiaohongshu.com/sapi/wx_mp_api/sns/v1/search/notes?'
    url2 = 'https://www.xiaohongshu.com/fe_api/burdock/v1/page/'

    request_url = flow.request.url
    from_search = request_url.startswith(url1)
    if not from_search and not request_url.startswith(url2):
        return

    # The original built a new MongoClient for every response, matching or
    # not; the handle is now created once and cached. Compare with `is None`
    # because pymongo collections do not support truth-value testing.
    if collection is None:
        client = pymongo.MongoClient(MONGO_URL)
        collection = client[WECHAT_XHS_MONGO_DB][WECHAT_XHS_NOTE_MONGO_COLLECTION]

    print(request_url)

    # Parse the payload; the two endpoints differ only in where the notes
    # live and in the name of the image field.
    data = json.loads(flow.response.text)["data"]
    if from_search:
        notes, image_key = data["notes"], "images_list"
    else:
        notes, image_key = data, "cover"

    for note in notes:
        content = {
            "note_id": note["id"],
            "img_list": note[image_key],
            "title": note["title"],
            "user": note["user"]
        }
        # insert_one replaces the deprecated Collection.insert.
        collection.insert_one(content)
53 |
--------------------------------------------------------------------------------