├── test ├── __init__.py ├── add_url_to_redis.py ├── test_proxy_pool.py ├── try.py └── try_mongoDB.py ├── .gitignore ├── requirements.txt ├── TreadCrawler ├── __init__.py ├── RedisClient.py └── TreadUrlCrawler.py ├── setting.ini ├── Utils ├── EuclidDataTools_test.py ├── MongoClient.py └── EuclidDataTools.py ├── README.md ├── full_text_Crawler.py ├── simple_main.py └── main_class.py /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .history 2 | test.log 3 | __pycache__ 4 | guba 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Euclid-Jie/Euclidguba-search/HEAD/requirements.txt -------------------------------------------------------------------------------- /TreadCrawler/__init__.py: -------------------------------------------------------------------------------- 1 | from .TreadUrlCrawler import ThreadUrlCrawler 2 | from .RedisClient import RedisClient 3 | -------------------------------------------------------------------------------- /setting.ini: -------------------------------------------------------------------------------- 1 | [Redis] 2 | redis_host = localhost 3 | redis_port = 6379 4 | redis_password = 123456 5 | redis_db = 0 6 | redis_key = urls 7 | 8 | [proxies] 9 | tunnel = d476.kdltps.com:15818 10 | 11 | [ThreadCrawler] 12 | num_threads = 32 13 | 14 | [mainClass] 15 | pages_start = 0 16 | pages_end = 100 -------------------------------------------------------------------------------- /Utils/EuclidDataTools_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/3/11 11:16 3 | # @Author : Euclid-Jie 4 | # @File : EuclidDataTools_test.py 5 | import pandas as pd 6 | 7 | from EuclidDataTools import * 8 | 9 | if __name__ == "__main__": 10 | data = {"id": 123, "sex": "male", "age": "12", "job": "student"} 11 | # 1、CsvClient.insert_one 12 | myCol = CsvClient(subFolder="demoOut", FileName="demo1") 13 | myCol.insert_one(data) 14 | 15 | # 2、EuclidCsvTools.saveCsvFile 16 | df = pd.DataFrame([data]) 17 | EuclidCsvTools(subFolder="demoOut", FileName="demo2.csv").saveCsvFile( 18 | df, append=False 19 | ) 20 | -------------------------------------------------------------------------------- /test/add_url_to_redis.py: -------------------------------------------------------------------------------- 1 | from TreadCrawler import RedisClient 2 | import configparser 3 | 4 | 5 | if __name__ == "__main__": 6 | config = configparser.ConfigParser() 7 | config.read("setting.ini") 8 | redis_client = RedisClient(config=config) 9 | lines = [ 10 | "/news,002611,1407434999.html", 11 | "/news,002611,1407434732.html", 12 | "/news,002611,1407434570.html", 13 | "/news,002611,1407432104.html", 14 | "/news,002611,1407428529.html", 15 | "/news,002611,1407428130.html", 16 | "/news,002611,1407427781.html", 17 | "/news,002611,1407425977.html", 18 | "/news,002611,1407424968.html", 19 | "/news,002611,1407424842.html", 20 | "/news,002611,1407421621.html", 21 | "/news,002611,1407420792.html", 22 | "/news,002611,1407417853.html", 23 | "/news,002611,1407416059.html", 24 | "/news,002611,1407415463.html", 25 | "/news,002229,1407912249.html", 26 | ] 27 | 
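    # each entry is an href tail exactly as it appears on a guba list page; lpush-ing it
    # onto the redis list ("urls" in setting.ini) lets the background full_text_Crawler pick it up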
for i in lines: 28 | redis_client.add_url(i) 29 | -------------------------------------------------------------------------------- /TreadCrawler/RedisClient.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import threading 3 | import configparser 4 | 5 | __all__ = ["RedisClient"] 6 | 7 | 8 | class RedisClient: 9 | def __init__(self, config: configparser.ConfigParser): 10 | self.redis_client = redis.StrictRedis( 11 | host=config.get("Redis", "redis_host"), 12 | port=config.getint("Redis", "redis_port"), 13 | db=config.getint("Redis", "redis_db"), 14 | password=config.get("Redis", "redis_password"), 15 | ) 16 | self.lock = threading.Lock() 17 | self.redis_key = config.get("Redis", "redis_key") 18 | 19 | def add_url(self, url) -> None: 20 | with self.lock: 21 | self.redis_client.lpush(self.redis_key, url) 22 | 23 | def get_url(self) -> str: 24 | with self.lock: 25 | url = self.redis_client.rpop(self.redis_key) 26 | if url: 27 | return url.decode("utf-8") 28 | else: 29 | return None 30 | 31 | def __len__(self) -> int: 32 | return self.redis_client.llen(self.redis_key) 33 | -------------------------------------------------------------------------------- /test/test_proxy_pool.py: -------------------------------------------------------------------------------- 1 | # 从数据库调用IP 2 | import requests 3 | import random 4 | import time 5 | 6 | def get_proxy(): 7 | all = requests.get("http://127.0.0.1:5010/all/").json() 8 | # 随机选一个 9 | if len(all) == 0: 10 | time.sleep(10) 11 | print("No proxy available, waiting for 10 seconds") 12 | return get_proxy() 13 | return random.choice(all) 14 | 15 | def get_proxies_count() -> int: 16 | return requests.get("http://127.0.0.1:5010/count/").json()["count"] 17 | 18 | 19 | # 删除数据库中IP 20 | def delete_proxy(proxy): 21 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 22 | 23 | 24 | # 使用代理IP发起请求 25 | def getResponse(URL, header): 26 | retry_count = 5 27 | proxy = get_proxy().get("proxy") 28 | while retry_count > 0: 29 | try: 30 | response = requests.get(URL, headers=header, timeout=60, proxies={"http": "http://{}".format(proxy)}) 31 | # 使用代理访问 32 | return response 33 | except Exception: 34 | retry_count -= 1 35 | # 删除代理池中代理 36 | delete_proxy(proxy) 37 | return None 38 | 39 | if __name__ == "__main__": 40 | # 测试代理IP 41 | print(get_proxies_count()) 42 | print(get_proxy().get("proxy")) -------------------------------------------------------------------------------- /Utils/MongoClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/9 23:26 3 | # @Author : Euclid-Jie 4 | # @File : MongoClient.py 5 | import pymongo 6 | import pandas as pd 7 | 8 | __all__ = ["MongoClient", "read_mongo"] 9 | 10 | 11 | def MongoClient(DBName, collectionName): 12 | # 连接数据库 13 | myclient = pymongo.MongoClient("mongodb://localhost:27017/") 14 | mydb = myclient[DBName] # 数据库名称 15 | mycol = mydb[collectionName] # 集合(表) 16 | return mycol 17 | 18 | 19 | def read_mongo(DBName, collectionName, query=None, no_id=True): 20 | """ 21 | Read from Mongo and Store into DataFrame 22 | :param DBName: mongoDB dataBase's name 23 | :param collectionName: mongoDB dataBase's collection's name 24 | :param query: a selection for data, demo: query = {"time": {"$gt": "2021-01-01"}} 25 | :param no_id: do not write _id column to data 26 | :return: pd.DataFrame 27 | """ 28 | # Connect to MongoDB 29 | if query is None: 30 | query = {} 31 | col = MongoClient(DBName, 
collectionName)
    # Make a query to the specific DB and Collection
    cursor = col.find(query)
    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))
    # Delete the _id
    if no_id and "_id" in df:
        del df["_id"]
    return df.drop_duplicates()
--------------------------------------------------------------------------------
/TreadCrawler/TreadUrlCrawler.py:
--------------------------------------------------------------------------------
import threading
import requests
import configparser
from TreadCrawler.RedisClient import RedisClient


class ThreadUrlCrawler:
    def __init__(
        self,
    ):
        config = configparser.ConfigParser()
        config.read("setting.ini")
        self.redis_client: RedisClient = RedisClient(config=config)
        self.lock = threading.Lock()
        self.redis_key = config.get("Redis", "redis_key")
        self.num_threads = config.getint("ThreadCrawler", "num_threads")
        self.threads = []
        self.stop_crawling = threading.Event()

    def crawl(self, url) -> bool:
        raise NotImplementedError("Subclasses must implement crawl method")

    def _worker(self):
        while not self.stop_crawling.is_set():
            with self.lock:
                url = self.redis_client.get_url()
            if url:
                if not self.crawl(url):
                    # crawling failed, so put the url back for another attempt
                    with self.lock:
                        self.redis_client.add_url(url)

    def start(self):
        for _ in range(self.num_threads):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
            self.threads.append(t)

        for t in self.threads:
            t.join()

        # print the number of URLs still waiting to be crawled
        print(f"Remaining URLs: {len(self.redis_client)}")

    def stop(self):
        self.stop_crawling.set()


# Example subclass of ThreadUrlCrawler
class MyThreadCrawler(ThreadUrlCrawler):
    def crawl(self, url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                print(f"Successfully crawled: {url}")
                return True
            else:
                print(f"Failed to crawl: {url}")
                return False
        except Exception as e:
            print(f"Exception while crawling {url}: {e}")
            return False
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Eastmoney Guba Data Crawler (东方财富股吧数据采集)

[![wakatime](https://wakatime.com/badge/user/b638b33f-0c9e-4408-b427-258fe0b24ad0/project/018e0f79-4bee-4fd1-8d86-55a20bee6528.svg)](https://wakatime.com/badge/user/b638b33f-0c9e-4408-b427-258fe0b24ad0/project/018e0f79-4bee-4fd1-8d86-55a20bee6528)

Maintained long-term; issues that help improve the code are welcome.

Since no free proxies have been usable recently, the project has gone back to a paid proxy pool; [*Kuaidaili* (快代理)](https://www.kuaidaili.com/?ref=mes9ujq5wnrn) is recommended.

Before starting the project, you should already have `MongoDB` and `redis` installed; these two databases make the project much easier to use. In detail (a quick connectivity check follows this list):

1. If you have not installed `redis`, you can still use the project, but you cannot fetch post details, only the titles.
2. If you have not installed `MongoDB`, you can still use the project, because writing to csv files is supported.
3. Unfortunately, even with `redis` installed, without `MongoDB` you still cannot fetch post details, only the titles.
4. Likewise, even with `MongoDB` installed, without `redis` you still cannot fetch post details, only the titles.
5. Fortunately, if you have neither, you can still use the project: you can still get a titles-only csv file, and `simple_main.py` has been prepared specifically for that case.
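If you want to confirm that both services are reachable before running anything, a minimal check along these lines can help (a sketch that assumes the defaults in `setting.ini` and the unauthenticated local MongoDB that `Utils/MongoClient.py` connects to):

```python
import configparser

import pymongo
import redis

config = configparser.ConfigParser()
config.read("setting.ini")

# redis: the same credentials the crawler itself reads from setting.ini
r = redis.StrictRedis(
    host=config.get("Redis", "redis_host"),
    port=config.getint("Redis", "redis_port"),
    db=config.getint("Redis", "redis_db"),
    password=config.get("Redis", "redis_password"),
)
print("redis ok:", r.ping())

# MongoDB: the project connects to a local instance without authentication
client = pymongo.MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=3000)
print("mongo ok:", client.server_info()["version"])
```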
## Features

- [X] Can crawl both the hot list and the full list; set the `url` in `main_class.get_data()`
  - hot: https://guba.eastmoney.com/list,600519,99_1.html
  - all: https://guba.eastmoney.com/list,600519_1.html
- ~~Uses free proxies, personally verified to complete the crawl~~ (the free proxy pool stopped being maintained in 2024, so this project had to switch to paid proxies, sorry)
- [X] Very fast when only crawling the post `title`s
- [x] Asynchronous, multi-threaded fetching of full post content via `redis`
- [ ] Crawling a specified time range
- [ ] Docker deployment (not there yet; once it is done you will no longer need to learn `redis` and `mongoDB`)
- [ ] UI (probably not happening; help from experienced contributors is welcome)

## Getting started

### 1. Get the code

1. If you know how to use `git`, simply `clone` the repository.
2. Otherwise, download the source code as shown in the figure below: click `Download ZIP`, then unzip the archive.

![image-20240315122017995](https://euclid-picgo.oss-cn-shenzhen.aliyuncs.com/image/image-20240315122017995.png)

### 2. Set up the environment

The prerequisite is that `redis` and `mongo` are installed and running, with the `redis` password set to 123456; instructions for this part will be added later.

A virtual environment is recommended; install the dependencies with

```cmd
pip install -r requirements.txt
```

### 3. Edit the configuration

The project keeps all of its settings in a single configuration file, `setting.ini`:

- the `Redis` section is used by `FullTextCrawler`, the background program that crawls post details
- `proxies` is the proxy tunnel obtained from [*Kuaidaili*](https://www.kuaidaili.com/?ref=mes9ujq5wnrn)
- the `ThreadCrawler` section sets the thread-pool size of the background program `FullTextCrawler`
- `mainClass` holds the parameters of the main program

```ini
[Redis]
redis_host = localhost
redis_port = 6379
redis_password =
redis_db = 0
redis_key = urls

[proxies]
tunnel = d476.kdltps.com:15818

[ThreadCrawler]
num_threads = 32

[mainClass]
pages_start = 0
pages_end = 100
```

### 4. Start the programs

1. Start `FullTextCrawler`; skip this step if you do not have `redis`.

   Open a new terminal and run

   ```cmd
   python -m full_text_Crawler
   ```
2. Start the main program.

   Set the parameters in `main_class.py`, open a new terminal and run

   ```
   python -m main_class
   ```

Successfully crawled data ends up in `MongoDB.guba`; if you run into problems, please open an [issue](https://github.com/Euclid-Jie/Euclidguba-search/issues/new).
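To pull the crawled collection back into pandas, you can use the project's own helper in `Utils/MongoClient.py`; a sketch, assuming the default database `guba` and the collection name configured in `main_class.py` (e.g. `东方精工`):

```python
from Utils.MongoClient import read_mongo

# reads MongoDB.guba.东方精工 into a DataFrame, dropping the _id column and duplicate rows
df = read_mongo("guba", "东方精工")
print(df.head())
```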
## Appendix

1. Screenshot of successfully crawled data

   ![image-20240315123641440](https://euclid-picgo.oss-cn-shenzhen.aliyuncs.com/image/image-20240315123641440.png)
2. Screenshot of a guba list page

   ![](https://euclid-picgo.oss-cn-shenzhen.aliyuncs.com/image/202302161115850.png)
--------------------------------------------------------------------------------
/Utils/EuclidDataTools.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time : 2023/3/11 10:45
# @Author : Euclid-Jie
# @File : EuclidDataTools.py
import pandas as pd
from pathlib import Path
from typing import Optional, Union

__all__ = ["EuclidCsvTools", "CsvClient"]


class EuclidCsvTools:
    """
    This class includes tools used to process csv files.
    """

    def __init__(
        self,
        subFolder: Optional[Union[str, Path]] = None,
        FileName: str = "DemoOut.csv",
    ):
        # para init; keep subFolder as None when it is not given so path_clear() can check it
        self.subFolder = Path(subFolder) if subFolder else None

        assert FileName.endswith(".csv"), "file name must end with .csv"
        self.FileName = FileName
        self.path_clear()

    def path_clear(self):
        """
        get the full folder path and full file path
        :return:
        """
        if self.subFolder:
            self.FullFolderPath = Path.cwd().joinpath(self.subFolder)
            self.FullFolderPath.mkdir(parents=True, exist_ok=True)
            self.FullFilePath = (
                Path.cwd().joinpath(self.subFolder).joinpath(self.FileName)
            )
        else:
            self.FullFolderPath = Path.cwd()
            self.FullFolderPath.mkdir(parents=True, exist_ok=True)
            self.FullFilePath = Path.cwd().joinpath(self.FileName)
        print("文件将存储在: {}".format(self.FullFilePath))

    def saveCsvFile(self, df, append=False):
        """
        save data to csv
        :param df: pd.DataFrame
        :param append: True(append save) or False(overwrite)
        :return:
        """
        if append and self.FullFilePath.exists():
            self.writeDf2Csv(df, self.FullFilePath)
        else:
            df.to_csv(self.FullFilePath, encoding="utf_8_sig", index=False)

    @classmethod
    def writeDf2Csv(cls, df: pd.DataFrame, FullFilePath):
        if FullFilePath.exists():
            # write after an existing file without header
            df.to_csv(
                FullFilePath, mode="a", encoding="utf_8_sig", header=False, index=False
            )
        else:
            # write out a new file with header
            df.to_csv(
                FullFilePath, mode="w", encoding="utf_8_sig", header=True, index=False
            )
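
# Usage sketch (mirrors Utils/EuclidDataTools_test.py); the subFolder/FileName values
# below are only examples:
#
#   import pandas as pd
#   tool = EuclidCsvTools(subFolder="demoOut", FileName="demo2.csv")
#   tool.saveCsvFile(pd.DataFrame([{"id": 1}]), append=False)  # create / overwrite
#   tool.saveCsvFile(pd.DataFrame([{"id": 2}]), append=True)   # append without a header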

class CsvClient(EuclidCsvTools):
    def __init__(
        self,
        subFolder: Optional[Union[str, Path]] = None,
        FileName: str = "DemoOut.csv",
    ):
        """
        :param subFolder:
        :param FileName:
        """
        if not FileName.endswith(".csv"):
            if "." in FileName:
                raise ValueError("FileName must end with .csv or not contain '.'")
            FileName = FileName + ".csv"
        super().__init__(subFolder=subFolder, FileName=FileName)

    def insert_one(self, data: Union[dict, pd.DataFrame]):
        if isinstance(data, dict):
            data = pd.DataFrame([data])
        elif isinstance(data, pd.DataFrame):
            pass
        else:
            raise TypeError("传入参数仅支持dict和pd.DataFrame")
        self.saveCsvFile(df=data, append=True)
--------------------------------------------------------------------------------
/full_text_Crawler.py:
--------------------------------------------------------------------------------
from TreadCrawler import ThreadUrlCrawler
import requests
from typing import Union
from bs4 import BeautifulSoup
from Utils.MongoClient import MongoClient
import configparser


class FullTextCrawler(ThreadUrlCrawler):
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
    }
    mongo_client = MongoClient("guba", "东方精工")
    failed_proxies = {}
    proxy_fail_times_treshold = 16

    def crawl(self, url):
        """
        The href of each item has a different parent path:
        1. https://caifuhao
        2. http://guba.eastmoney.com

        :param url: the href tail popped from redis
        :return: True if the full text was fetched and written to MongoDB, else False
        """
        url_map = {
            "caifuhao": "https:",
            "/new": "http://guba.eastmoney.com",
        }
        match_times = 0
        url_map_len = len(url_map)
        for k, v in url_map.items():
            match_times += 1
            if k in url:
                soup = self.get_soup_form_url(v + url)
                if soup:
                    try:
                        time = soup.find("div", {"class": "time"}).text
                    except (ValueError, AttributeError):
                        time = ""
                    try:
                        if soup.find("div", {"id": "post_content"}):
                            full_text = soup.find("div", {"id": "post_content"}).text
                        else:
                            full_text = soup.find("div", {"class": "newstext"}).text
                    except (ValueError, AttributeError):
                        full_text = ""
                else:
                    full_text = None
                    return False
                # stop at the first matching prefix: a caifuhao href also contains "/new",
                # so continuing the loop would overwrite the result with a failed request
                break
            elif match_times == url_map_len:
                full_text = None
        if full_text:
            print(f"Successfully crawled: {url}, full text: {full_text}")
            self.mongo_client.update_one(
                {"href": url}, {"$set": {"full_text": full_text, "time": time}}
            )
            return True
        else:
            print(f"Failed to crawl: {url}")
            return False

    def get_soup_form_url(self, url) -> Union[BeautifulSoup, None]:
        try:
            response = requests.get(
                url, headers=self.header, timeout=10, proxies=self.proxies
            )  # fetch the page through the proxy tunnel
            if response.status_code != 200:
                return None
            else:
                html = response.content.decode(
                    "utf-8", "ignore"
                )  # decode the raw response into an html string
                soup = BeautifulSoup(
                    html, features="lxml"
                )  # build the soup object with the "lxml" parser
                return soup
        except Exception:
            return None


if __name__ == "__main__":
    # read the configuration file
    config = configparser.ConfigParser()
    config.read("setting.ini", encoding="utf-8")
    tunnel = config.get("proxies", "tunnel")
    # start full_text_crawler in the background: as soon as new urls are pushed to redis,
    # they are crawled automatically and the full text is written to mongodb
    full_text_crawler = FullTextCrawler()
    full_text_crawler.proxies = {
        "http": "http://%(proxy)s/" % {"proxy": tunnel},
        "https": "http://%(proxy)s/" % {"proxy": tunnel},
    }
    full_text_crawler.start()
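    # Note: start() joins the worker threads, so it blocks until stop() is called from
    # somewhere else. A hedged sketch for a clean Ctrl-C shutdown would be to register a
    # signal handler *before* the start() call above:
    #
    #   import signal
    #   signal.signal(signal.SIGINT, lambda signum, frame: full_text_crawler.stop())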
--------------------------------------------------------------------------------
/test/try.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time : 2023/2/10 19:53
# @Author : Euclid-Jie
# @File : try.py
import os
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def MongoClient(DBName, collectionName):
    # connect to the database
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient[DBName]  # database name
    mycol = mydb[collectionName]  # collection (table)
    return mycol


def get_data(page, fileFullName):
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0"
    }
    Url = "http://guba.eastmoney.com/list,600519,99_{}.html".format(page)
    response = requests.get(Url, headers=header, timeout=60)  # fetch the page
    html = response.content.decode("utf-8", "ignore")  # decode the raw response into an html string
    soup = BeautifulSoup(html, features="lxml")  # build the soup object with the "lxml" parser
    data_list = soup.find_all("div", "articleh")
    num = len(data_list)
    out_df = pd.DataFrame()
    for item in data_list:
        try:
            data_json = get_data_json(item)
            # col.insert_one(data_json)
            out_df = pd.concat([out_df, pd.DataFrame([data_json])], ignore_index=True)
        except Exception:
            num -= 1
    save_data(out_df, os.getcwd(), fileFullName)
    return num


def clear_str(str_raw):
    for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]:
        str_raw = str_raw.replace(pat, "")
    return str_raw


def get_full_text(data_json):
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0"
    }
    if "caifuhao" in data_json["href"]:
        url = "https:" + data_json["href"]
        response = requests.get(url, headers=header, timeout=60)
        html = response.content.decode("utf-8", "ignore")  # decode the raw response into an html string
        soup = BeautifulSoup(html, features="lxml")  # build the soup object with the "lxml" parser
        try:
            data_json["full_text"] = soup.find("div", "article-body").get_text()
            return data_json
        except Exception:
            return data_json

    elif "/new" in data_json["href"]:
        url = "http://guba.eastmoney.com" + data_json["href"]
        response = requests.get(url, headers=header, timeout=60)
        html = response.content.decode("utf-8", "ignore")  # decode the raw response into an html string
        soup = BeautifulSoup(html, features="lxml")  # build the soup object with the "lxml" parser
        data_json["full_text"] = clear_str(
            soup.find("div", {"id": "post_content"}).text
        )
        return data_json
    else:
        return data_json


def get_data_json(item):
    spans = item.find_all("span")
    data_json = {
        "阅读": spans[0].text,
        "评论": spans[1].text,
        "标题": spans[2].a["title"],
        "href": spans[2].a["href"],
        "作者": spans[3].a.text,
        "最后更新": spans[4].text,
    }

    return get_full_text(data_json)


def save_data(data_df, FileFullPath, FilePath):
    """
    Helper for saving data; appends to an existing file instead of overwriting it.
    :param data_df: the data to save
    :param FileFullPath: the folder to save into, usually os.getcwd()
    :param FilePath: the file name, including the suffix
    :return:
    """
    FileFullPath = os.path.join(FileFullPath, FilePath)
    if os.path.isfile(FileFullPath):
        data_df.to_csv(
            FileFullPath, mode="a", header=False, index=False, encoding="utf_8_sig"
        )
    else:
        data_df.to_csv(
            FileFullPath, mode="w", header=True, index=False, encoding="utf_8_sig"
        )


if __name__ == 
"__main__": 111 | 112 | fileFullName = "茅台.csv" 113 | with tqdm(range(1, 60)) as t: 114 | for page in t: 115 | t.set_description("page:{}".format(page)) # 进度条左边显示信息 116 | writed_len = get_data(page, fileFullName) 117 | t.set_postfix({"状态": "已成功写入{}条".format(writed_len)}) # 进度条右边显示信息 118 | -------------------------------------------------------------------------------- /test/try_mongoDB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/10 19:53 3 | # @Author : Euclid-Jie 4 | # @File : try.py 5 | import os 6 | import pandas as pd 7 | import pymongo 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from tqdm import tqdm 11 | 12 | 13 | def MongoClient(DBName, collectionName): 14 | # 连接数据库 15 | myclient = pymongo.MongoClient("mongodb://localhost:27017/") 16 | mydb = myclient[DBName] # 数据库名称 17 | mycol = mydb[collectionName] # 集合(表) 18 | return mycol 19 | 20 | 21 | def get_data(page, col): 22 | header = { 23 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0" 24 | } 25 | Url = "http://guba.eastmoney.com/list,600519,99_{}.html".format(page) 26 | proxies = {"http": "http://127.0.0.1:12345", "https": "http://127.0.0.1:12345"} 27 | response = requests.get( 28 | Url, headers=header, timeout=60, proxies=proxies 29 | ) # 使用request获取网页 30 | html = response.content.decode("utf-8", "ignore") # 将网页源码转换格式为html 31 | soup = BeautifulSoup(html, features="lxml") # 构建soup对象,"lxml"为设置的解析器 32 | data_list = soup.find_all("div", "articleh") 33 | num = len(data_list) 34 | for item in data_list: 35 | try: 36 | data_json = get_data_json(item) 37 | col.insert_one(data_json) 38 | except: 39 | num -= 1 40 | pass 41 | return num 42 | 43 | 44 | def clear_str(str_raw): 45 | for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]: 46 | str_raw = str_raw.replace(pat, "") 47 | return str_raw 48 | 49 | 50 | def get_full_text(data_json): 51 | header = { 52 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0" 53 | } 54 | if "caifuhao" in data_json["href"]: 55 | url = "https:" + data_json["href"] 56 | proxies = {"http": "http://127.0.0.1:12345", "https": "http://127.0.0.1:12345"} 57 | response = requests.get( 58 | url, headers=header, timeout=60, proxies=proxies 59 | ) # 使用request获取网页 60 | html = response.content.decode("utf-8", "ignore") # 将网页源码转换格式为html 61 | soup = BeautifulSoup(html, features="lxml") # 构建soup对象,"lxml"为设置的解析器 62 | try: 63 | data_json["full_text"] = soup.find("div", "article-body").get_text() 64 | return data_json 65 | except: 66 | return data_json 67 | 68 | elif "/new" in data_json["href"]: 69 | url = "http://guba.eastmoney.com" + data_json["href"] 70 | proxies = {"http": "http://127.0.0.1:12345", "https": "http://127.0.0.1:12345"} 71 | response = requests.get( 72 | url, headers=header, timeout=60, proxies=proxies 73 | ) # 使用request获取网页 74 | html = response.content.decode("utf-8", "ignore") # 将网页源码转换格式为html 75 | soup = BeautifulSoup(html, features="lxml") # 构建soup对象,"lxml"为设置的解析器 76 | data_json["full_text"] = clear_str( 77 | soup.find("div", {"id": "post_content"}).text 78 | ) 79 | return data_json 80 | else: 81 | return data_json 82 | 83 | 84 | def get_data_json(item): 85 | spans = item.find_all("span") 86 | data_json = { 87 | "阅读": spans[0].text, 88 | "评论": spans[1].text, 89 | "标题": spans[2].a["title"], 90 | "href": spans[2].a["href"], 91 | "作者": spans[3].a.text, 92 | "最后更新": spans[4].text, 93 | } 94 | 95 | return 
get_full_text(data_json) 96 | 97 | 98 | def save_data(data_df, SaveFolderPath, FilePath): 99 | """ 100 | 轮子函数,用于存储数据,可实现对已存在文件的追加写入 101 | :param SaveFolderPath: the FordPath save data files, usually is the projectPath: os.getcwd() 102 | :param data_df: 目标数据 103 | :param FilePath: 文件名,包括后缀 104 | :return: 105 | """ 106 | # concat the folderPath and dataPath 107 | FileFullPath = os.path.join(SaveFolderPath, FilePath) 108 | if os.path.isfile(FileFullPath): 109 | data_df.to_csv( 110 | FilePath, mode="a", header=False, index=False, encoding="utf_8_sig" 111 | ) 112 | else: 113 | data_df.to_csv( 114 | FilePath, mode="w", header=True, index=False, encoding="utf_8_sig" 115 | ) 116 | 117 | 118 | if __name__ == "__main__": 119 | col = MongoClient("guba", "中国医药_base") 120 | with tqdm(range(1, 59)) as t: 121 | for page in t: 122 | t.set_description("page:{}".format(page)) # 进度条左边显示信息 123 | writed_len = get_data(page, col) 124 | t.set_postfix({"状态": "已成功写入{}条".format(writed_len)}) # 进度条右边显示信息 125 | -------------------------------------------------------------------------------- /simple_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/11 21:27 3 | # @Author : Euclid-Jie 4 | # @File : main_class.py 5 | import pandas as pd 6 | import requests 7 | from bs4 import BeautifulSoup 8 | from tqdm import tqdm 9 | import logging 10 | from retrying import retry 11 | from typing import Optional, Union 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | from Utils.EuclidDataTools import CsvClient 14 | import configparser 15 | 16 | 17 | class guba_comments: 18 | """ 19 | this class is designed for get hot comments for guba, have two method which can be set at def get_data() 20 | 1、all: https://guba.eastmoney.com/list,600519_1.html, secCode: 600519, page: 1 21 | 2、hot: https://guba.eastmoney.com/list,600519,99_1.html secCode: 600519, page: 1 22 | 23 | because to the ip control, this need to set proxies pools 24 | by using proxies https://www.kuaidaili.com/usercenter/overview/, can solve this problem 25 | 26 | Program characteristics: 27 | 1、default write data to mongoDB, by init "MogoDB=False", can switch to write data to csv file 28 | 2、Use retry mechanism, once rise error, the program will restart at the least page and num (each page has 80 num) 29 | 30 | """ 31 | 32 | failed_proxies = {} 33 | proxy_fail_times_treshold = 3 34 | 35 | def __init__( 36 | self, 37 | config: configparser.ConfigParser, 38 | secCode: Union[str, int], 39 | pages_start: int = 0, 40 | pages_end: int = 100, 41 | num_start: int = 0, 42 | collectionName: Optional[str] = None, 43 | ): 44 | # param init 45 | if isinstance(secCode, int): 46 | # 补齐6位数 47 | self.secCode = str(secCode).zfill(6) 48 | elif isinstance(secCode, str): 49 | self.secCode = secCode 50 | self.pages_start = pages_start 51 | self.pages_end = pages_end 52 | self.num_start = num_start 53 | self._year = pd.Timestamp.now().year 54 | 55 | # rewrite the secCode setting 56 | if config.has_option("mainClass", "secCode"): 57 | self.secCode = config.get("mainClass", "secCode") 58 | print( 59 | f"secCode has been overridden by {self.secCode} in the configuration file." 60 | ) 61 | if config.has_option("mainClass", "pages_start"): 62 | self.pages_start = int(config.get("mainClass", "pages_start")) 63 | print( 64 | f"pages_start has been overridden by {self.pages_start} in the configuration file." 
65 | ) 66 | if config.has_option("mainClass", "pages_end"): 67 | self.pages_end = int(config.get("mainClass", "pages_end")) 68 | print( 69 | f"pages_end has been overridden by {self.pages_end} in the configuration file." 70 | ) 71 | if config.has_option("mainClass", "collectionName"): 72 | collectionName = config.get("mainClass", "collectionName") 73 | print( 74 | f"collectionName has been overridden by {collectionName} in the configuration file." 75 | ) 76 | 77 | collectionName = collectionName if collectionName else self.secCode 78 | self.col = CsvClient("guba", collectionName) 79 | 80 | # log setting 81 | log_format = "%(levelname)s %(asctime)s %(filename)s %(lineno)d %(message)s" 82 | logging.basicConfig(filename="test.log", format=log_format, level=logging.INFO) 83 | 84 | @staticmethod 85 | def clear_str(str_raw): 86 | for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]: 87 | str_raw.strip(pat).replace(pat, "") 88 | return str_raw 89 | 90 | @staticmethod 91 | def run_thread_pool_sub(target, args, max_work_count): 92 | with ThreadPoolExecutor(max_workers=max_work_count) as t: 93 | res = [t.submit(target, i) for i in args] 94 | return res 95 | 96 | @retry(stop_max_attempt_number=5) # 最多尝试5次 97 | def get_soup_form_url(self, url: str) -> BeautifulSoup: 98 | """ 99 | get the html content used by requests.get 100 | :param url: 101 | :return: BeautifulSoup 102 | """ 103 | response = requests.get( 104 | url, headers=self.header, timeout=10, proxies=self.proxies 105 | ) # 使用request获取网页 106 | html = response.content.decode("utf-8", "ignore") 107 | soup = BeautifulSoup(html, features="lxml") 108 | return soup 109 | 110 | def get_data_json(self, item): 111 | """ 112 | get the special keys from item, in this the project, 113 | the keys con be "阅读"、"评论"、…… 114 | 115 | by use the get_full_text, the return json data will contain full_text 116 | :param item: 117 | :return: json data contains full_text 118 | """ 119 | 120 | tds = item.find_all("td") 121 | data_json = { 122 | "阅读": tds[0].text, 123 | "评论": tds[1].text, 124 | "标题": tds[2].a.text, 125 | "href": tds[2].a["href"], 126 | "作者": tds[3].a.text, 127 | "最后更新": tds[4].text, 128 | } 129 | if "caifuhao" in data_json["href"]: 130 | self._year = int(data_json["href"].split("/")[-1][0:4]) 131 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 132 | if dt > pd.Timestamp.now(): 133 | self._year -= 1 134 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 135 | data_json["最后更新"] = dt 136 | return data_json 137 | 138 | def get_data(self, page): 139 | """ 140 | process to deal the single page's data 141 | :param page: the page needed to be processed 142 | :return: 143 | """ 144 | # Url = "http://guba.eastmoney.com/list,{},99_{}.html".format(self.secCode, page) 145 | Url = "http://guba.eastmoney.com/list,{},f_{}.html".format(self.secCode, page) 146 | soup = self.get_soup_form_url(Url) 147 | data_list = soup.find_all("tr", "listitem") 148 | 149 | # 开启并行获取data_json 150 | res = self.run_thread_pool_sub(self.get_data_json, data_list, max_work_count=12) 151 | for future in as_completed(res): 152 | data_json = future.result() 153 | self.col.insert_one(data_json) 154 | self.t.set_postfix( 155 | { 156 | "状态": "已写num:{}".format(self.num_start), 157 | } 158 | ) # 进度条右边显示信息 159 | self.num_start += 1 160 | 161 | def main(self): 162 | with tqdm(range(self.pages_start, self.pages_end)) as self.t: 163 | for page in self.t: 164 | self.t.set_description("page:{}".format(page)) # 进度条左边显示信息 165 | self.get_data(page) 166 | self.num_start = 0 167 | 
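                # per-page bookkeeping: reset the per-page record counter and advance pages_start past the finished page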
self.pages_start += 1 168 | 169 | 170 | if __name__ == "__main__": 171 | # config 172 | config = configparser.ConfigParser() 173 | config.read("setting.ini", encoding="utf-8") 174 | # init 175 | demo = guba_comments( 176 | config=config, 177 | secCode="002611", 178 | collectionName="东方精工", 179 | ) 180 | 181 | # setting 182 | header = { 183 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0", 184 | } 185 | demo.header = header 186 | tunnel = config.get("proxies", "tunnel") 187 | demo.proxies = { 188 | "http": "http://%(proxy)s/" % {"proxy": tunnel}, 189 | "https": "http://%(proxy)s/" % {"proxy": tunnel}, 190 | } 191 | # run and get data 192 | demo.main() 193 | -------------------------------------------------------------------------------- /main_class.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/11 21:27 3 | # @Author : Euclid-Jie 4 | # @File : main_class.py 5 | import pandas as pd 6 | import requests 7 | from bs4 import BeautifulSoup 8 | from tqdm import tqdm 9 | import logging 10 | from retrying import retry 11 | from typing import Optional, Union 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | from Utils.MongoClient import MongoClient 14 | from Utils.EuclidDataTools import CsvClient 15 | from TreadCrawler import RedisClient 16 | import configparser 17 | 18 | 19 | class guba_comments: 20 | """ 21 | this class is designed for get hot comments for guba, have two method which can be set at def get_data() 22 | 1、all: https://guba.eastmoney.com/list,600519_1.html, secCode: 600519, page: 1 23 | 2、hot: https://guba.eastmoney.com/list,600519,99_1.html secCode: 600519, page: 1 24 | 25 | because to the ip control, this need to set proxies pools 26 | by using proxies https://www.kuaidaili.com/usercenter/overview/, can solve this problem 27 | 28 | Program characteristics: 29 | 1、default write data to mongoDB, by init "MogoDB=False", can switch to write data to csv file 30 | 2、Use retry mechanism, once rise error, the program will restart at the least page and num (each page has 80 num) 31 | 32 | """ 33 | 34 | failed_proxies = {} 35 | proxy_fail_times_treshold = 3 36 | 37 | def __init__( 38 | self, 39 | config: configparser.ConfigParser, 40 | secCode: Union[str, int], 41 | pages_start: int = 0, 42 | pages_end: int = 100, 43 | num_start: int = 0, 44 | MongoDB: bool = True, 45 | collectionName: Optional[str] = None, 46 | full_text: bool = False, 47 | ): 48 | # param init 49 | if isinstance(secCode, int): 50 | # 补齐6位数 51 | self.secCode = str(secCode).zfill(6) 52 | elif isinstance(secCode, str): 53 | self.secCode = secCode 54 | self.pages_start = pages_start 55 | self.pages_end = pages_end 56 | self.num_start = num_start 57 | self.full_text = full_text 58 | self._year = pd.Timestamp.now().year 59 | 60 | # redis client for full_text_Crawler 61 | self.redis_client: RedisClient = RedisClient(config=config) 62 | 63 | # rewrite the secCode setting 64 | if config.has_option("mainClass", "secCode"): 65 | self.secCode = config.get("mainClass", "secCode") 66 | print( 67 | f"secCode has been overridden by {self.secCode} in the configuration file." 68 | ) 69 | if config.has_option("mainClass", "pages_start"): 70 | self.pages_start = int(config.get("mainClass", "pages_start")) 71 | print( 72 | f"pages_start has been overridden by {self.pages_start} in the configuration file." 
73 | ) 74 | if config.has_option("mainClass", "pages_end"): 75 | self.pages_end = int(config.get("mainClass", "pages_end")) 76 | print( 77 | f"pages_end has been overridden by {self.pages_end} in the configuration file." 78 | ) 79 | if config.has_option("mainClass", "collectionName"): 80 | collectionName = config.get("mainClass", "collectionName") 81 | print( 82 | f"collectionName has been overridden by {collectionName} in the configuration file." 83 | ) 84 | 85 | # choose one save method, default MongoDB 86 | # 1、csv 87 | # 2、MongoDB 88 | collectionName = collectionName if collectionName else self.secCode 89 | self.col = ( 90 | MongoClient("guba", collectionName) 91 | if MongoDB 92 | else CsvClient("guba", collectionName) 93 | ) 94 | 95 | # log setting 96 | log_format = "%(levelname)s %(asctime)s %(filename)s %(lineno)d %(message)s" 97 | logging.basicConfig(filename="test.log", format=log_format, level=logging.INFO) 98 | 99 | @staticmethod 100 | def clear_str(str_raw): 101 | for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]: 102 | str_raw.strip(pat).replace(pat, "") 103 | return str_raw 104 | 105 | @staticmethod 106 | def run_thread_pool_sub(target, args, max_work_count): 107 | with ThreadPoolExecutor(max_workers=max_work_count) as t: 108 | res = [t.submit(target, i) for i in args] 109 | return res 110 | 111 | @retry(stop_max_attempt_number=5) # 最多尝试5次 112 | def get_soup_form_url(self, url: str) -> BeautifulSoup: 113 | """ 114 | get the html content used by requests.get 115 | :param url: 116 | :return: BeautifulSoup 117 | """ 118 | response = requests.get( 119 | url, headers=self.header, timeout=10, proxies=self.proxies 120 | ) # 使用request获取网页 121 | html = response.content.decode("utf-8", "ignore") 122 | soup = BeautifulSoup(html, features="lxml") 123 | return soup 124 | 125 | def get_data_json(self, item): 126 | """ 127 | get the special keys from item, in this the project, 128 | the keys con be "阅读"、"评论"、…… 129 | 130 | by use the get_full_text, the return json data will contain full_text 131 | :param item: 132 | :return: json data contains full_text 133 | """ 134 | 135 | tds = item.find_all("td") 136 | data_json = { 137 | "阅读": tds[0].text, 138 | "评论": tds[1].text, 139 | "标题": tds[2].a.text, 140 | "href": tds[2].a["href"], 141 | "作者": tds[3].a.text, 142 | "最后更新": tds[4].text, 143 | } 144 | if "caifuhao" in data_json["href"]: 145 | self._year = int(data_json["href"].split("/")[-1][0:4]) 146 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 147 | if dt > pd.Timestamp.now(): 148 | self._year -= 1 149 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 150 | data_json["最后更新"] = dt 151 | if self.full_text: 152 | self.redis_client.add_url(data_json["href"]) 153 | return data_json 154 | 155 | def get_data(self, page): 156 | """ 157 | process to deal the single page's data 158 | :param page: the page needed to be processed 159 | :return: 160 | """ 161 | # Url = "http://guba.eastmoney.com/list,{},99_{}.html".format(self.secCode, page) 162 | Url = "http://guba.eastmoney.com/list,{},f_{}.html".format(self.secCode, page) 163 | soup = self.get_soup_form_url(Url) 164 | data_list = soup.find_all("tr", "listitem") 165 | 166 | # 开启并行获取data_json 167 | res = self.run_thread_pool_sub(self.get_data_json, data_list, max_work_count=12) 168 | for future in as_completed(res): 169 | data_json = future.result() 170 | self.col.insert_one(data_json) 171 | self.t.set_postfix( 172 | { 173 | "状态": "已写num:{}".format(self.num_start), 174 | } 175 | ) # 进度条右边显示信息 176 | self.num_start += 1 177 
| 178 | def main(self): 179 | with tqdm(range(self.pages_start, self.pages_end)) as self.t: 180 | for page in self.t: 181 | self.t.set_description("page:{}".format(page)) # 进度条左边显示信息 182 | self.get_data(page) 183 | self.num_start = 0 184 | self.pages_start += 1 185 | 186 | 187 | if __name__ == "__main__": 188 | # config 189 | config = configparser.ConfigParser() 190 | config.read("setting.ini", encoding="utf-8") 191 | # init 192 | demo = guba_comments( 193 | config=config, 194 | secCode="002611", 195 | MongoDB=True, 196 | collectionName="东方精工", 197 | full_text=True, 198 | ) 199 | 200 | # setting 201 | header = { 202 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0", 203 | } 204 | demo.header = header 205 | tunnel = config.get("proxies", "tunnel") 206 | demo.proxies = { 207 | "http": "http://%(proxy)s/" % {"proxy": tunnel}, 208 | "https": "http://%(proxy)s/" % {"proxy": tunnel}, 209 | } 210 | # run and get data 211 | demo.main() 212 | --------------------------------------------------------------------------------