├── test ├── __init__.py ├── add_url_to_redis.py ├── test_proxy_pool.py ├── try.py └── try_mongoDB.py ├── .gitignore ├── requirements.txt ├── TreadCrawler ├── __init__.py ├── RedisClient.py └── TreadUrlCrawler.py ├── setting.ini ├── Utils ├── EuclidDataTools_test.py ├── MongoClient.py └── EuclidDataTools.py ├── README.md ├── full_text_Crawler.py ├── simple_main.py └── main_class.py /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .history 2 | test.log 3 | __pycache__ 4 | guba 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Euclid-Jie/Euclidguba-search/HEAD/requirements.txt -------------------------------------------------------------------------------- /TreadCrawler/__init__.py: -------------------------------------------------------------------------------- 1 | from .TreadUrlCrawler import ThreadUrlCrawler 2 | from .RedisClient import RedisClient 3 | -------------------------------------------------------------------------------- /setting.ini: -------------------------------------------------------------------------------- 1 | [Redis] 2 | redis_host = localhost 3 | redis_port = 6379 4 | redis_password = 123456 5 | redis_db = 0 6 | redis_key = urls 7 | 8 | [proxies] 9 | tunnel = d476.kdltps.com:15818 10 | 11 | [ThreadCrawler] 12 | num_threads = 32 13 | 14 | [mainClass] 15 | pages_start = 0 16 | pages_end = 100 -------------------------------------------------------------------------------- /Utils/EuclidDataTools_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/3/11 11:16 3 | # @Author : Euclid-Jie 4 | # @File : EuclidDataTools_test.py 5 | import pandas as pd 6 | 7 | from EuclidDataTools import * 8 | 9 | if __name__ == "__main__": 10 | data = {"id": 123, "sex": "male", "age": "12", "job": "student"} 11 | # 1、CsvClient.insert_one 12 | myCol = CsvClient(subFolder="demoOut", FileName="demo1") 13 | myCol.insert_one(data) 14 | 15 | # 2、EuclidCsvTools.saveCsvFile 16 | df = pd.DataFrame([data]) 17 | EuclidCsvTools(subFolder="demoOut", FileName="demo2.csv").saveCsvFile( 18 | df, append=False 19 | ) 20 | -------------------------------------------------------------------------------- /test/add_url_to_redis.py: -------------------------------------------------------------------------------- 1 | from TreadCrawler import RedisClient 2 | import configparser 3 | 4 | 5 | if __name__ == "__main__": 6 | config = configparser.ConfigParser() 7 | config.read("setting.ini") 8 | redis_client = RedisClient(config=config) 9 | lines = [ 10 | "/news,002611,1407434999.html", 11 | "/news,002611,1407434732.html", 12 | "/news,002611,1407434570.html", 13 | "/news,002611,1407432104.html", 14 | "/news,002611,1407428529.html", 15 | "/news,002611,1407428130.html", 16 | "/news,002611,1407427781.html", 17 | "/news,002611,1407425977.html", 18 | "/news,002611,1407424968.html", 19 | "/news,002611,1407424842.html", 20 | "/news,002611,1407421621.html", 21 | "/news,002611,1407420792.html", 22 | "/news,002611,1407417853.html", 23 | "/news,002611,1407416059.html", 24 | "/news,002611,1407415463.html", 25 | "/news,002229,1407912249.html", 26 | ] 27 | 
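    # each entry is an href tail exactly as it appears on a guba list page; lpush-ing it
    # onto the redis list ("urls" in setting.ini) lets the background full_text_Crawler pick it up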
for i in lines: 28 | redis_client.add_url(i) 29 | -------------------------------------------------------------------------------- /TreadCrawler/RedisClient.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import threading 3 | import configparser 4 | 5 | __all__ = ["RedisClient"] 6 | 7 | 8 | class RedisClient: 9 | def __init__(self, config: configparser.ConfigParser): 10 | self.redis_client = redis.StrictRedis( 11 | host=config.get("Redis", "redis_host"), 12 | port=config.getint("Redis", "redis_port"), 13 | db=config.getint("Redis", "redis_db"), 14 | password=config.get("Redis", "redis_password"), 15 | ) 16 | self.lock = threading.Lock() 17 | self.redis_key = config.get("Redis", "redis_key") 18 | 19 | def add_url(self, url) -> None: 20 | with self.lock: 21 | self.redis_client.lpush(self.redis_key, url) 22 | 23 | def get_url(self) -> str: 24 | with self.lock: 25 | url = self.redis_client.rpop(self.redis_key) 26 | if url: 27 | return url.decode("utf-8") 28 | else: 29 | return None 30 | 31 | def __len__(self) -> int: 32 | return self.redis_client.llen(self.redis_key) 33 | -------------------------------------------------------------------------------- /test/test_proxy_pool.py: -------------------------------------------------------------------------------- 1 | # 从数据库调用IP 2 | import requests 3 | import random 4 | import time 5 | 6 | def get_proxy(): 7 | all = requests.get("http://127.0.0.1:5010/all/").json() 8 | # 随机选一个 9 | if len(all) == 0: 10 | time.sleep(10) 11 | print("No proxy available, waiting for 10 seconds") 12 | return get_proxy() 13 | return random.choice(all) 14 | 15 | def get_proxies_count() -> int: 16 | return requests.get("http://127.0.0.1:5010/count/").json()["count"] 17 | 18 | 19 | # 删除数据库中IP 20 | def delete_proxy(proxy): 21 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 22 | 23 | 24 | # 使用代理IP发起请求 25 | def getResponse(URL, header): 26 | retry_count = 5 27 | proxy = get_proxy().get("proxy") 28 | while retry_count > 0: 29 | try: 30 | response = requests.get(URL, headers=header, timeout=60, proxies={"http": "http://{}".format(proxy)}) 31 | # 使用代理访问 32 | return response 33 | except Exception: 34 | retry_count -= 1 35 | # 删除代理池中代理 36 | delete_proxy(proxy) 37 | return None 38 | 39 | if __name__ == "__main__": 40 | # 测试代理IP 41 | print(get_proxies_count()) 42 | print(get_proxy().get("proxy")) -------------------------------------------------------------------------------- /Utils/MongoClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/9 23:26 3 | # @Author : Euclid-Jie 4 | # @File : MongoClient.py 5 | import pymongo 6 | import pandas as pd 7 | 8 | __all__ = ["MongoClient", "read_mongo"] 9 | 10 | 11 | def MongoClient(DBName, collectionName): 12 | # 连接数据库 13 | myclient = pymongo.MongoClient("mongodb://localhost:27017/") 14 | mydb = myclient[DBName] # 数据库名称 15 | mycol = mydb[collectionName] # 集合(表) 16 | return mycol 17 | 18 | 19 | def read_mongo(DBName, collectionName, query=None, no_id=True): 20 | """ 21 | Read from Mongo and Store into DataFrame 22 | :param DBName: mongoDB dataBase's name 23 | :param collectionName: mongoDB dataBase's collection's name 24 | :param query: a selection for data, demo: query = {"time": {"$gt": "2021-01-01"}} 25 | :param no_id: do not write _id column to data 26 | :return: pd.DataFrame 27 | """ 28 | # Connect to MongoDB 29 | if query is None: 30 | query = {} 31 | col = MongoClient(DBName, 
collectionName)
    # Make a query to the specific DB and Collection
    cursor = col.find(query)
    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))
    # Delete the _id
    if no_id and "_id" in df:
        del df["_id"]
    return df.drop_duplicates()
--------------------------------------------------------------------------------
/TreadCrawler/TreadUrlCrawler.py:
--------------------------------------------------------------------------------
import threading
import requests
import configparser
from TreadCrawler.RedisClient import RedisClient


class ThreadUrlCrawler:
    def __init__(
        self,
    ):
        config = configparser.ConfigParser()
        config.read("setting.ini")
        self.redis_client: RedisClient = RedisClient(config=config)
        self.lock = threading.Lock()
        self.redis_key = config.get("Redis", "redis_key")
        self.num_threads = config.getint("ThreadCrawler", "num_threads")
        self.threads = []
        self.stop_crawling = threading.Event()

    def crawl(self, url) -> bool:
        raise NotImplementedError("Subclasses must implement crawl method")

    def _worker(self):
        while not self.stop_crawling.is_set():
            with self.lock:
                url = self.redis_client.get_url()
            if url:
                if not self.crawl(url):
                    # crawling failed, so put the url back for another attempt
                    with self.lock:
                        self.redis_client.add_url(url)

    def start(self):
        for _ in range(self.num_threads):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
            self.threads.append(t)

        for t in self.threads:
            t.join()

        # print the number of URLs still waiting to be crawled
        print(f"Remaining URLs: {len(self.redis_client)}")

    def stop(self):
        self.stop_crawling.set()


# Example subclass of ThreadUrlCrawler
class MyThreadCrawler(ThreadUrlCrawler):
    def crawl(self, url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                print(f"Successfully crawled: {url}")
                return True
            else:
                print(f"Failed to crawl: {url}")
                return False
        except Exception as e:
            print(f"Exception while crawling {url}: {e}")
            return False
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Eastmoney Guba Data Crawler (东方财富股吧数据采集)

[![wakatime](https://wakatime.com/badge/user/b638b33f-0c9e-4408-b427-258fe0b24ad0/project/018e0f79-4bee-4fd1-8d86-55a20bee6528.svg)](https://wakatime.com/badge/user/b638b33f-0c9e-4408-b427-258fe0b24ad0/project/018e0f79-4bee-4fd1-8d86-55a20bee6528)

Maintained long-term; issues that help improve the code are welcome.

Since no free proxies have been usable recently, the project has gone back to a paid proxy pool; [*Kuaidaili* (快代理)](https://www.kuaidaili.com/?ref=mes9ujq5wnrn) is recommended.

Before starting the project, you should already have `MongoDB` and `redis` installed; these two databases make the project much easier to use. In detail (a quick connectivity check follows this list):

1. If you have not installed `redis`, you can still use the project, but you cannot fetch post details, only the titles.
2. If you have not installed `MongoDB`, you can still use the project, because writing to csv files is supported.
3. Unfortunately, even with `redis` installed, without `MongoDB` you still cannot fetch post details, only the titles.
4. Likewise, even with `MongoDB` installed, without `redis` you still cannot fetch post details, only the titles.
5. Fortunately, if you have neither, you can still use the project: you can still get a titles-only csv file, and `simple_main.py` has been prepared specifically for that case.
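If you want to confirm that both services are reachable before running anything, a minimal check along these lines can help (a sketch that assumes the defaults in `setting.ini` and the unauthenticated local MongoDB that `Utils/MongoClient.py` connects to):

```python
import configparser

import pymongo
import redis

config = configparser.ConfigParser()
config.read("setting.ini")

# redis: the same credentials the crawler itself reads from setting.ini
r = redis.StrictRedis(
    host=config.get("Redis", "redis_host"),
    port=config.getint("Redis", "redis_port"),
    db=config.getint("Redis", "redis_db"),
    password=config.get("Redis", "redis_password"),
)
print("redis ok:", r.ping())

# MongoDB: the project connects to a local instance without authentication
client = pymongo.MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=3000)
print("mongo ok:", client.server_info()["version"])
```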
## Features

- [X] Can crawl both the hot list and the full list; set the `url` in `main_class.get_data()`
  - hot: https://guba.eastmoney.com/list,600519,99_1.html
  - all: https://guba.eastmoney.com/list,600519_1.html
- ~~Uses free proxies, personally verified to complete the crawl~~ (the free proxy pool stopped being maintained in 2024, so this project had to switch to paid proxies, sorry)
- [X] Very fast when only crawling the post `title`s
- [x] Asynchronous, multi-threaded fetching of full post content via `redis`
- [ ] Crawling a specified time range
- [ ] Docker deployment (not there yet; once it is done you will no longer need to learn `redis` and `mongoDB`)
- [ ] UI (probably not happening; help from experienced contributors is welcome)

## Getting started

### 1. Get the code

1. If you know how to use `git`, simply `clone` the repository.
2. Otherwise, download the source code as shown in the figure below: click `Download ZIP`, then unzip the archive.

![image-20240315122017995](https://euclid-picgo.oss-cn-shenzhen.aliyuncs.com/image/image-20240315122017995.png)

### 2. Set up the environment

The prerequisite is that `redis` and `mongo` are installed and running, with the `redis` password set to 123456; instructions for this part will be added later.

A virtual environment is recommended; install the dependencies with

```cmd
pip install -r requirements.txt
```

### 3. Edit the configuration

The project keeps all of its settings in a single configuration file, `setting.ini`:

- the `Redis` section is used by `FullTextCrawler`, the background program that crawls post details
- `proxies` is the proxy tunnel obtained from [*Kuaidaili*](https://www.kuaidaili.com/?ref=mes9ujq5wnrn)
- the `ThreadCrawler` section sets the thread-pool size of the background program `FullTextCrawler`
- `mainClass` holds the parameters of the main program

```ini
[Redis]
redis_host = localhost
redis_port = 6379
redis_password =
redis_db = 0
redis_key = urls

[proxies]
tunnel = d476.kdltps.com:15818

[ThreadCrawler]
num_threads = 32

[mainClass]
pages_start = 0
pages_end = 100
```

### 4. Start the programs

1. Start `FullTextCrawler`; skip this step if you do not have `redis`.

   Open a new terminal and run

   ```cmd
   python -m full_text_Crawler
   ```
2. Start the main program.

   Set the parameters in `main_class.py`, open a new terminal and run

   ```
   python -m main_class
   ```

Successfully crawled data ends up in `MongoDB.guba`; if you run into problems, please open an [issue](https://github.com/Euclid-Jie/Euclidguba-search/issues/new).
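To pull the crawled collection back into pandas, you can use the project's own helper in `Utils/MongoClient.py`; a sketch, assuming the default database `guba` and the collection name configured in `main_class.py` (e.g. `东方精工`):

```python
from Utils.MongoClient import read_mongo

# reads MongoDB.guba.东方精工 into a DataFrame, dropping the _id column and duplicate rows
df = read_mongo("guba", "东方精工")
print(df.head())
```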
## Appendix

1. Screenshot of successfully crawled data

   ![image-20240315123641440](https://euclid-picgo.oss-cn-shenzhen.aliyuncs.com/image/image-20240315123641440.png)
2. Screenshot of a guba list page

   ![](https://euclid-picgo.oss-cn-shenzhen.aliyuncs.com/image/202302161115850.png)
--------------------------------------------------------------------------------
/Utils/EuclidDataTools.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time : 2023/3/11 10:45
# @Author : Euclid-Jie
# @File : EuclidDataTools.py
import pandas as pd
from pathlib import Path
from typing import Optional, Union

__all__ = ["EuclidCsvTools", "CsvClient"]


class EuclidCsvTools:
    """
    This class includes tools used to process csv files.
    """

    def __init__(
        self,
        subFolder: Optional[Union[str, Path]] = None,
        FileName: str = "DemoOut.csv",
    ):
        # para init; keep subFolder as None when it is not given so path_clear() can check it
        self.subFolder = Path(subFolder) if subFolder else None

        assert FileName.endswith(".csv"), "file name must end with .csv"
        self.FileName = FileName
        self.path_clear()

    def path_clear(self):
        """
        get the full folder path and full file path
        :return:
        """
        if self.subFolder:
            self.FullFolderPath = Path.cwd().joinpath(self.subFolder)
            self.FullFolderPath.mkdir(parents=True, exist_ok=True)
            self.FullFilePath = (
                Path.cwd().joinpath(self.subFolder).joinpath(self.FileName)
            )
        else:
            self.FullFolderPath = Path.cwd()
            self.FullFolderPath.mkdir(parents=True, exist_ok=True)
            self.FullFilePath = Path.cwd().joinpath(self.FileName)
        print("文件将存储在: {}".format(self.FullFilePath))

    def saveCsvFile(self, df, append=False):
        """
        save data to csv
        :param df: pd.DataFrame
        :param append: True(append save) or False(overwrite)
        :return:
        """
        if append and self.FullFilePath.exists():
            self.writeDf2Csv(df, self.FullFilePath)
        else:
            df.to_csv(self.FullFilePath, encoding="utf_8_sig", index=False)

    @classmethod
    def writeDf2Csv(cls, df: pd.DataFrame, FullFilePath):
        if FullFilePath.exists():
            # write after an existing file without header
            df.to_csv(
                FullFilePath, mode="a", encoding="utf_8_sig", header=False, index=False
            )
        else:
            # write out a new file with header
            df.to_csv(
                FullFilePath, mode="w", encoding="utf_8_sig", header=True, index=False
            )
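
# Usage sketch (mirrors Utils/EuclidDataTools_test.py); the subFolder/FileName values
# below are only examples:
#
#   import pandas as pd
#   tool = EuclidCsvTools(subFolder="demoOut", FileName="demo2.csv")
#   tool.saveCsvFile(pd.DataFrame([{"id": 1}]), append=False)  # create / overwrite
#   tool.saveCsvFile(pd.DataFrame([{"id": 2}]), append=True)   # append without a header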

class CsvClient(EuclidCsvTools):
    def __init__(
        self,
        subFolder: Optional[Union[str, Path]] = None,
        FileName: str = "DemoOut.csv",
    ):
        """
        :param subFolder:
        :param FileName:
        """
        if not FileName.endswith(".csv"):
            if "." in FileName:
                raise ValueError("FileName must end with .csv or not contain '.'")
            FileName = FileName + ".csv"
        super().__init__(subFolder=subFolder, FileName=FileName)

    def insert_one(self, data: Union[dict, pd.DataFrame]):
        if isinstance(data, dict):
            data = pd.DataFrame([data])
        elif isinstance(data, pd.DataFrame):
            pass
        else:
            raise TypeError("传入参数仅支持dict和pd.DataFrame")
        self.saveCsvFile(df=data, append=True)
--------------------------------------------------------------------------------
/full_text_Crawler.py:
--------------------------------------------------------------------------------
from TreadCrawler import ThreadUrlCrawler
import requests
from typing import Union
from bs4 import BeautifulSoup
from Utils.MongoClient import MongoClient
import configparser


class FullTextCrawler(ThreadUrlCrawler):
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
    }
    mongo_client = MongoClient("guba", "东方精工")
    failed_proxies = {}
    proxy_fail_times_treshold = 16

    def crawl(self, url):
        """
        The href of each item has a different parent path:
        1. https://caifuhao
        2. http://guba.eastmoney.com

        :param url: the href tail popped from redis
        :return: True if the full text was fetched and written to MongoDB, else False
        """
        url_map = {
            "caifuhao": "https:",
            "/new": "http://guba.eastmoney.com",
        }
        match_times = 0
        url_map_len = len(url_map)
        for k, v in url_map.items():
            match_times += 1
            if k in url:
                soup = self.get_soup_form_url(v + url)
                if soup:
                    try:
                        time = soup.find("div", {"class": "time"}).text
                    except (ValueError, AttributeError):
                        time = ""
                    try:
                        if soup.find("div", {"id": "post_content"}):
                            full_text = soup.find("div", {"id": "post_content"}).text
                        else:
                            full_text = soup.find("div", {"class": "newstext"}).text
                    except (ValueError, AttributeError):
                        full_text = ""
                else:
                    full_text = None
                    return False
                # stop at the first matching prefix: a caifuhao href also contains "/new",
                # so continuing the loop would overwrite the result with a failed request
                break
            elif match_times == url_map_len:
                full_text = None
        if full_text:
            print(f"Successfully crawled: {url}, full text: {full_text}")
            self.mongo_client.update_one(
                {"href": url}, {"$set": {"full_text": full_text, "time": time}}
            )
            return True
        else:
            print(f"Failed to crawl: {url}")
            return False

    def get_soup_form_url(self, url) -> Union[BeautifulSoup, None]:
        try:
            response = requests.get(
                url, headers=self.header, timeout=10, proxies=self.proxies
            )  # fetch the page through the proxy tunnel
            if response.status_code != 200:
                return None
            else:
                html = response.content.decode(
                    "utf-8", "ignore"
                )  # decode the raw response into an html string
                soup = BeautifulSoup(
                    html, features="lxml"
                )  # build the soup object with the "lxml" parser
                return soup
        except Exception:
            return None


if __name__ == "__main__":
    # read the configuration file
    config = configparser.ConfigParser()
    config.read("setting.ini", encoding="utf-8")
    tunnel = config.get("proxies", "tunnel")
    # start full_text_crawler in the background: as soon as new urls are pushed to redis,
    # they are crawled automatically and the full text is written to mongodb
    full_text_crawler = FullTextCrawler()
    full_text_crawler.proxies = {
        "http": "http://%(proxy)s/" % {"proxy": tunnel},
        "https": "http://%(proxy)s/" % {"proxy": tunnel},
    }
    full_text_crawler.start()
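    # Note: start() joins the worker threads, so it blocks until stop() is called from
    # somewhere else. A hedged sketch for a clean Ctrl-C shutdown would be to register a
    # signal handler *before* the start() call above:
    #
    #   import signal
    #   signal.signal(signal.SIGINT, lambda signum, frame: full_text_crawler.stop())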
--------------------------------------------------------------------------------
/test/try.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time : 2023/2/10 19:53
# @Author : Euclid-Jie
# @File : try.py
import os
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def MongoClient(DBName, collectionName):
    # connect to the database
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient[DBName]  # database name
    mycol = mydb[collectionName]  # collection (table)
    return mycol


def get_data(page, fileFullName):
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0"
    }
    Url = "http://guba.eastmoney.com/list,600519,99_{}.html".format(page)
    response = requests.get(Url, headers=header, timeout=60)  # fetch the page
    html = response.content.decode("utf-8", "ignore")  # decode the raw response into an html string
    soup = BeautifulSoup(html, features="lxml")  # build the soup object with the "lxml" parser
    data_list = soup.find_all("div", "articleh")
    num = len(data_list)
    out_df = pd.DataFrame()
    for item in data_list:
        try:
            data_json = get_data_json(item)
            # col.insert_one(data_json)
            out_df = pd.concat([out_df, pd.DataFrame([data_json])], ignore_index=True)
        except Exception:
            num -= 1
    save_data(out_df, os.getcwd(), fileFullName)
    return num


def clear_str(str_raw):
    for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]:
        str_raw = str_raw.replace(pat, "")
    return str_raw


def get_full_text(data_json):
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0"
    }
    if "caifuhao" in data_json["href"]:
        url = "https:" + data_json["href"]
        response = requests.get(url, headers=header, timeout=60)
        html = response.content.decode("utf-8", "ignore")  # decode the raw response into an html string
        soup = BeautifulSoup(html, features="lxml")  # build the soup object with the "lxml" parser
        try:
            data_json["full_text"] = soup.find("div", "article-body").get_text()
            return data_json
        except Exception:
            return data_json

    elif "/new" in data_json["href"]:
        url = "http://guba.eastmoney.com" + data_json["href"]
        response = requests.get(url, headers=header, timeout=60)
        html = response.content.decode("utf-8", "ignore")  # decode the raw response into an html string
        soup = BeautifulSoup(html, features="lxml")  # build the soup object with the "lxml" parser
        data_json["full_text"] = clear_str(
            soup.find("div", {"id": "post_content"}).text
        )
        return data_json
    else:
        return data_json


def get_data_json(item):
    spans = item.find_all("span")
    data_json = {
        "阅读": spans[0].text,
        "评论": spans[1].text,
        "标题": spans[2].a["title"],
        "href": spans[2].a["href"],
        "作者": spans[3].a.text,
        "最后更新": spans[4].text,
    }

    return get_full_text(data_json)


def save_data(data_df, FileFullPath, FilePath):
    """
    Helper for saving data; appends to an existing file instead of overwriting it.
    :param data_df: the data to save
    :param FileFullPath: the folder to save into, usually os.getcwd()
    :param FilePath: the file name, including the suffix
    :return:
    """
    FileFullPath = os.path.join(FileFullPath, FilePath)
    if os.path.isfile(FileFullPath):
        data_df.to_csv(
            FileFullPath, mode="a", header=False, index=False, encoding="utf_8_sig"
        )
    else:
        data_df.to_csv(
            FileFullPath, mode="w", header=True, index=False, encoding="utf_8_sig"
        )


if __name__ == 
"__main__": 111 | 112 | fileFullName = "茅台.csv" 113 | with tqdm(range(1, 60)) as t: 114 | for page in t: 115 | t.set_description("page:{}".format(page)) # 进度条左边显示信息 116 | writed_len = get_data(page, fileFullName) 117 | t.set_postfix({"状态": "已成功写入{}条".format(writed_len)}) # 进度条右边显示信息 118 | -------------------------------------------------------------------------------- /test/try_mongoDB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/10 19:53 3 | # @Author : Euclid-Jie 4 | # @File : try.py 5 | import os 6 | import pandas as pd 7 | import pymongo 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from tqdm import tqdm 11 | 12 | 13 | def MongoClient(DBName, collectionName): 14 | # 连接数据库 15 | myclient = pymongo.MongoClient("mongodb://localhost:27017/") 16 | mydb = myclient[DBName] # 数据库名称 17 | mycol = mydb[collectionName] # 集合(表) 18 | return mycol 19 | 20 | 21 | def get_data(page, col): 22 | header = { 23 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0" 24 | } 25 | Url = "http://guba.eastmoney.com/list,600519,99_{}.html".format(page) 26 | proxies = {"http": "http://127.0.0.1:12345", "https": "http://127.0.0.1:12345"} 27 | response = requests.get( 28 | Url, headers=header, timeout=60, proxies=proxies 29 | ) # 使用request获取网页 30 | html = response.content.decode("utf-8", "ignore") # 将网页源码转换格式为html 31 | soup = BeautifulSoup(html, features="lxml") # 构建soup对象,"lxml"为设置的解析器 32 | data_list = soup.find_all("div", "articleh") 33 | num = len(data_list) 34 | for item in data_list: 35 | try: 36 | data_json = get_data_json(item) 37 | col.insert_one(data_json) 38 | except: 39 | num -= 1 40 | pass 41 | return num 42 | 43 | 44 | def clear_str(str_raw): 45 | for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]: 46 | str_raw = str_raw.replace(pat, "") 47 | return str_raw 48 | 49 | 50 | def get_full_text(data_json): 51 | header = { 52 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0" 53 | } 54 | if "caifuhao" in data_json["href"]: 55 | url = "https:" + data_json["href"] 56 | proxies = {"http": "http://127.0.0.1:12345", "https": "http://127.0.0.1:12345"} 57 | response = requests.get( 58 | url, headers=header, timeout=60, proxies=proxies 59 | ) # 使用request获取网页 60 | html = response.content.decode("utf-8", "ignore") # 将网页源码转换格式为html 61 | soup = BeautifulSoup(html, features="lxml") # 构建soup对象,"lxml"为设置的解析器 62 | try: 63 | data_json["full_text"] = soup.find("div", "article-body").get_text() 64 | return data_json 65 | except: 66 | return data_json 67 | 68 | elif "/new" in data_json["href"]: 69 | url = "http://guba.eastmoney.com" + data_json["href"] 70 | proxies = {"http": "http://127.0.0.1:12345", "https": "http://127.0.0.1:12345"} 71 | response = requests.get( 72 | url, headers=header, timeout=60, proxies=proxies 73 | ) # 使用request获取网页 74 | html = response.content.decode("utf-8", "ignore") # 将网页源码转换格式为html 75 | soup = BeautifulSoup(html, features="lxml") # 构建soup对象,"lxml"为设置的解析器 76 | data_json["full_text"] = clear_str( 77 | soup.find("div", {"id": "post_content"}).text 78 | ) 79 | return data_json 80 | else: 81 | return data_json 82 | 83 | 84 | def get_data_json(item): 85 | spans = item.find_all("span") 86 | data_json = { 87 | "阅读": spans[0].text, 88 | "评论": spans[1].text, 89 | "标题": spans[2].a["title"], 90 | "href": spans[2].a["href"], 91 | "作者": spans[3].a.text, 92 | "最后更新": spans[4].text, 93 | } 94 | 95 | return 
get_full_text(data_json) 96 | 97 | 98 | def save_data(data_df, SaveFolderPath, FilePath): 99 | """ 100 | 轮子函数,用于存储数据,可实现对已存在文件的追加写入 101 | :param SaveFolderPath: the FordPath save data files, usually is the projectPath: os.getcwd() 102 | :param data_df: 目标数据 103 | :param FilePath: 文件名,包括后缀 104 | :return: 105 | """ 106 | # concat the folderPath and dataPath 107 | FileFullPath = os.path.join(SaveFolderPath, FilePath) 108 | if os.path.isfile(FileFullPath): 109 | data_df.to_csv( 110 | FilePath, mode="a", header=False, index=False, encoding="utf_8_sig" 111 | ) 112 | else: 113 | data_df.to_csv( 114 | FilePath, mode="w", header=True, index=False, encoding="utf_8_sig" 115 | ) 116 | 117 | 118 | if __name__ == "__main__": 119 | col = MongoClient("guba", "中国医药_base") 120 | with tqdm(range(1, 59)) as t: 121 | for page in t: 122 | t.set_description("page:{}".format(page)) # 进度条左边显示信息 123 | writed_len = get_data(page, col) 124 | t.set_postfix({"状态": "已成功写入{}条".format(writed_len)}) # 进度条右边显示信息 125 | -------------------------------------------------------------------------------- /simple_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/11 21:27 3 | # @Author : Euclid-Jie 4 | # @File : main_class.py 5 | import pandas as pd 6 | import requests 7 | from bs4 import BeautifulSoup 8 | from tqdm import tqdm 9 | import logging 10 | from retrying import retry 11 | from typing import Optional, Union 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | from Utils.EuclidDataTools import CsvClient 14 | import configparser 15 | 16 | 17 | class guba_comments: 18 | """ 19 | this class is designed for get hot comments for guba, have two method which can be set at def get_data() 20 | 1、all: https://guba.eastmoney.com/list,600519_1.html, secCode: 600519, page: 1 21 | 2、hot: https://guba.eastmoney.com/list,600519,99_1.html secCode: 600519, page: 1 22 | 23 | because to the ip control, this need to set proxies pools 24 | by using proxies https://www.kuaidaili.com/usercenter/overview/, can solve this problem 25 | 26 | Program characteristics: 27 | 1、default write data to mongoDB, by init "MogoDB=False", can switch to write data to csv file 28 | 2、Use retry mechanism, once rise error, the program will restart at the least page and num (each page has 80 num) 29 | 30 | """ 31 | 32 | failed_proxies = {} 33 | proxy_fail_times_treshold = 3 34 | 35 | def __init__( 36 | self, 37 | config: configparser.ConfigParser, 38 | secCode: Union[str, int], 39 | pages_start: int = 0, 40 | pages_end: int = 100, 41 | num_start: int = 0, 42 | collectionName: Optional[str] = None, 43 | ): 44 | # param init 45 | if isinstance(secCode, int): 46 | # 补齐6位数 47 | self.secCode = str(secCode).zfill(6) 48 | elif isinstance(secCode, str): 49 | self.secCode = secCode 50 | self.pages_start = pages_start 51 | self.pages_end = pages_end 52 | self.num_start = num_start 53 | self._year = pd.Timestamp.now().year 54 | 55 | # rewrite the secCode setting 56 | if config.has_option("mainClass", "secCode"): 57 | self.secCode = config.get("mainClass", "secCode") 58 | print( 59 | f"secCode has been overridden by {self.secCode} in the configuration file." 60 | ) 61 | if config.has_option("mainClass", "pages_start"): 62 | self.pages_start = int(config.get("mainClass", "pages_start")) 63 | print( 64 | f"pages_start has been overridden by {self.pages_start} in the configuration file." 
65 | ) 66 | if config.has_option("mainClass", "pages_end"): 67 | self.pages_end = int(config.get("mainClass", "pages_end")) 68 | print( 69 | f"pages_end has been overridden by {self.pages_end} in the configuration file." 70 | ) 71 | if config.has_option("mainClass", "collectionName"): 72 | collectionName = config.get("mainClass", "collectionName") 73 | print( 74 | f"collectionName has been overridden by {collectionName} in the configuration file." 75 | ) 76 | 77 | collectionName = collectionName if collectionName else self.secCode 78 | self.col = CsvClient("guba", collectionName) 79 | 80 | # log setting 81 | log_format = "%(levelname)s %(asctime)s %(filename)s %(lineno)d %(message)s" 82 | logging.basicConfig(filename="test.log", format=log_format, level=logging.INFO) 83 | 84 | @staticmethod 85 | def clear_str(str_raw): 86 | for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]: 87 | str_raw.strip(pat).replace(pat, "") 88 | return str_raw 89 | 90 | @staticmethod 91 | def run_thread_pool_sub(target, args, max_work_count): 92 | with ThreadPoolExecutor(max_workers=max_work_count) as t: 93 | res = [t.submit(target, i) for i in args] 94 | return res 95 | 96 | @retry(stop_max_attempt_number=5) # 最多尝试5次 97 | def get_soup_form_url(self, url: str) -> BeautifulSoup: 98 | """ 99 | get the html content used by requests.get 100 | :param url: 101 | :return: BeautifulSoup 102 | """ 103 | response = requests.get( 104 | url, headers=self.header, timeout=10, proxies=self.proxies 105 | ) # 使用request获取网页 106 | html = response.content.decode("utf-8", "ignore") 107 | soup = BeautifulSoup(html, features="lxml") 108 | return soup 109 | 110 | def get_data_json(self, item): 111 | """ 112 | get the special keys from item, in this the project, 113 | the keys con be "阅读"、"评论"、…… 114 | 115 | by use the get_full_text, the return json data will contain full_text 116 | :param item: 117 | :return: json data contains full_text 118 | """ 119 | 120 | tds = item.find_all("td") 121 | data_json = { 122 | "阅读": tds[0].text, 123 | "评论": tds[1].text, 124 | "标题": tds[2].a.text, 125 | "href": tds[2].a["href"], 126 | "作者": tds[3].a.text, 127 | "最后更新": tds[4].text, 128 | } 129 | if "caifuhao" in data_json["href"]: 130 | self._year = int(data_json["href"].split("/")[-1][0:4]) 131 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 132 | if dt > pd.Timestamp.now(): 133 | self._year -= 1 134 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 135 | data_json["最后更新"] = dt 136 | return data_json 137 | 138 | def get_data(self, page): 139 | """ 140 | process to deal the single page's data 141 | :param page: the page needed to be processed 142 | :return: 143 | """ 144 | # Url = "http://guba.eastmoney.com/list,{},99_{}.html".format(self.secCode, page) 145 | Url = "http://guba.eastmoney.com/list,{},f_{}.html".format(self.secCode, page) 146 | soup = self.get_soup_form_url(Url) 147 | data_list = soup.find_all("tr", "listitem") 148 | 149 | # 开启并行获取data_json 150 | res = self.run_thread_pool_sub(self.get_data_json, data_list, max_work_count=12) 151 | for future in as_completed(res): 152 | data_json = future.result() 153 | self.col.insert_one(data_json) 154 | self.t.set_postfix( 155 | { 156 | "状态": "已写num:{}".format(self.num_start), 157 | } 158 | ) # 进度条右边显示信息 159 | self.num_start += 1 160 | 161 | def main(self): 162 | with tqdm(range(self.pages_start, self.pages_end)) as self.t: 163 | for page in self.t: 164 | self.t.set_description("page:{}".format(page)) # 进度条左边显示信息 165 | self.get_data(page) 166 | self.num_start = 0 167 | 
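                # per-page bookkeeping: reset the per-page record counter and advance pages_start past the finished page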
self.pages_start += 1 168 | 169 | 170 | if __name__ == "__main__": 171 | # config 172 | config = configparser.ConfigParser() 173 | config.read("setting.ini", encoding="utf-8") 174 | # init 175 | demo = guba_comments( 176 | config=config, 177 | secCode="002611", 178 | collectionName="东方精工", 179 | ) 180 | 181 | # setting 182 | header = { 183 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0", 184 | } 185 | demo.header = header 186 | tunnel = config.get("proxies", "tunnel") 187 | demo.proxies = { 188 | "http": "http://%(proxy)s/" % {"proxy": tunnel}, 189 | "https": "http://%(proxy)s/" % {"proxy": tunnel}, 190 | } 191 | # run and get data 192 | demo.main() 193 | -------------------------------------------------------------------------------- /main_class.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2023/2/11 21:27 3 | # @Author : Euclid-Jie 4 | # @File : main_class.py 5 | import pandas as pd 6 | import requests 7 | from bs4 import BeautifulSoup 8 | from tqdm import tqdm 9 | import logging 10 | from retrying import retry 11 | from typing import Optional, Union 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | from Utils.MongoClient import MongoClient 14 | from Utils.EuclidDataTools import CsvClient 15 | from TreadCrawler import RedisClient 16 | import configparser 17 | 18 | 19 | class guba_comments: 20 | """ 21 | this class is designed for get hot comments for guba, have two method which can be set at def get_data() 22 | 1、all: https://guba.eastmoney.com/list,600519_1.html, secCode: 600519, page: 1 23 | 2、hot: https://guba.eastmoney.com/list,600519,99_1.html secCode: 600519, page: 1 24 | 25 | because to the ip control, this need to set proxies pools 26 | by using proxies https://www.kuaidaili.com/usercenter/overview/, can solve this problem 27 | 28 | Program characteristics: 29 | 1、default write data to mongoDB, by init "MogoDB=False", can switch to write data to csv file 30 | 2、Use retry mechanism, once rise error, the program will restart at the least page and num (each page has 80 num) 31 | 32 | """ 33 | 34 | failed_proxies = {} 35 | proxy_fail_times_treshold = 3 36 | 37 | def __init__( 38 | self, 39 | config: configparser.ConfigParser, 40 | secCode: Union[str, int], 41 | pages_start: int = 0, 42 | pages_end: int = 100, 43 | num_start: int = 0, 44 | MongoDB: bool = True, 45 | collectionName: Optional[str] = None, 46 | full_text: bool = False, 47 | ): 48 | # param init 49 | if isinstance(secCode, int): 50 | # 补齐6位数 51 | self.secCode = str(secCode).zfill(6) 52 | elif isinstance(secCode, str): 53 | self.secCode = secCode 54 | self.pages_start = pages_start 55 | self.pages_end = pages_end 56 | self.num_start = num_start 57 | self.full_text = full_text 58 | self._year = pd.Timestamp.now().year 59 | 60 | # redis client for full_text_Crawler 61 | self.redis_client: RedisClient = RedisClient(config=config) 62 | 63 | # rewrite the secCode setting 64 | if config.has_option("mainClass", "secCode"): 65 | self.secCode = config.get("mainClass", "secCode") 66 | print( 67 | f"secCode has been overridden by {self.secCode} in the configuration file." 68 | ) 69 | if config.has_option("mainClass", "pages_start"): 70 | self.pages_start = int(config.get("mainClass", "pages_start")) 71 | print( 72 | f"pages_start has been overridden by {self.pages_start} in the configuration file." 
73 | ) 74 | if config.has_option("mainClass", "pages_end"): 75 | self.pages_end = int(config.get("mainClass", "pages_end")) 76 | print( 77 | f"pages_end has been overridden by {self.pages_end} in the configuration file." 78 | ) 79 | if config.has_option("mainClass", "collectionName"): 80 | collectionName = config.get("mainClass", "collectionName") 81 | print( 82 | f"collectionName has been overridden by {collectionName} in the configuration file." 83 | ) 84 | 85 | # choose one save method, default MongoDB 86 | # 1、csv 87 | # 2、MongoDB 88 | collectionName = collectionName if collectionName else self.secCode 89 | self.col = ( 90 | MongoClient("guba", collectionName) 91 | if MongoDB 92 | else CsvClient("guba", collectionName) 93 | ) 94 | 95 | # log setting 96 | log_format = "%(levelname)s %(asctime)s %(filename)s %(lineno)d %(message)s" 97 | logging.basicConfig(filename="test.log", format=log_format, level=logging.INFO) 98 | 99 | @staticmethod 100 | def clear_str(str_raw): 101 | for pat in ["\n", " ", " ", "\r", "\xa0", "\n\r\n"]: 102 | str_raw.strip(pat).replace(pat, "") 103 | return str_raw 104 | 105 | @staticmethod 106 | def run_thread_pool_sub(target, args, max_work_count): 107 | with ThreadPoolExecutor(max_workers=max_work_count) as t: 108 | res = [t.submit(target, i) for i in args] 109 | return res 110 | 111 | @retry(stop_max_attempt_number=5) # 最多尝试5次 112 | def get_soup_form_url(self, url: str) -> BeautifulSoup: 113 | """ 114 | get the html content used by requests.get 115 | :param url: 116 | :return: BeautifulSoup 117 | """ 118 | response = requests.get( 119 | url, headers=self.header, timeout=10, proxies=self.proxies 120 | ) # 使用request获取网页 121 | html = response.content.decode("utf-8", "ignore") 122 | soup = BeautifulSoup(html, features="lxml") 123 | return soup 124 | 125 | def get_data_json(self, item): 126 | """ 127 | get the special keys from item, in this the project, 128 | the keys con be "阅读"、"评论"、…… 129 | 130 | by use the get_full_text, the return json data will contain full_text 131 | :param item: 132 | :return: json data contains full_text 133 | """ 134 | 135 | tds = item.find_all("td") 136 | data_json = { 137 | "阅读": tds[0].text, 138 | "评论": tds[1].text, 139 | "标题": tds[2].a.text, 140 | "href": tds[2].a["href"], 141 | "作者": tds[3].a.text, 142 | "最后更新": tds[4].text, 143 | } 144 | if "caifuhao" in data_json["href"]: 145 | self._year = int(data_json["href"].split("/")[-1][0:4]) 146 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 147 | if dt > pd.Timestamp.now(): 148 | self._year -= 1 149 | dt = pd.to_datetime(str(self._year) + "-" + data_json["最后更新"]) 150 | data_json["最后更新"] = dt 151 | if self.full_text: 152 | self.redis_client.add_url(data_json["href"]) 153 | return data_json 154 | 155 | def get_data(self, page): 156 | """ 157 | process to deal the single page's data 158 | :param page: the page needed to be processed 159 | :return: 160 | """ 161 | # Url = "http://guba.eastmoney.com/list,{},99_{}.html".format(self.secCode, page) 162 | Url = "http://guba.eastmoney.com/list,{},f_{}.html".format(self.secCode, page) 163 | soup = self.get_soup_form_url(Url) 164 | data_list = soup.find_all("tr", "listitem") 165 | 166 | # 开启并行获取data_json 167 | res = self.run_thread_pool_sub(self.get_data_json, data_list, max_work_count=12) 168 | for future in as_completed(res): 169 | data_json = future.result() 170 | self.col.insert_one(data_json) 171 | self.t.set_postfix( 172 | { 173 | "状态": "已写num:{}".format(self.num_start), 174 | } 175 | ) # 进度条右边显示信息 176 | self.num_start += 1 177 
| 178 | def main(self): 179 | with tqdm(range(self.pages_start, self.pages_end)) as self.t: 180 | for page in self.t: 181 | self.t.set_description("page:{}".format(page)) # 进度条左边显示信息 182 | self.get_data(page) 183 | self.num_start = 0 184 | self.pages_start += 1 185 | 186 | 187 | if __name__ == "__main__": 188 | # config 189 | config = configparser.ConfigParser() 190 | config.read("setting.ini", encoding="utf-8") 191 | # init 192 | demo = guba_comments( 193 | config=config, 194 | secCode="002611", 195 | MongoDB=True, 196 | collectionName="东方精工", 197 | full_text=True, 198 | ) 199 | 200 | # setting 201 | header = { 202 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0", 203 | } 204 | demo.header = header 205 | tunnel = config.get("proxies", "tunnel") 206 | demo.proxies = { 207 | "http": "http://%(proxy)s/" % {"proxy": tunnel}, 208 | "https": "http://%(proxy)s/" % {"proxy": tunnel}, 209 | } 210 | # run and get data 211 | demo.main() 212 | --------------------------------------------------------------------------------