├── .gitignore ├── README.MD ├── image.jpg ├── sd-generate-1.webp ├── checkpoints ├── model │ └── eval │ │ └── similarity_evaluation_train-eval_results.csv └── model_1 │ └── eval │ └── similarity_evaluation_train-eval_results.csv ├── read_html_example.py ├── similar_article_detect.py ├── huggingface_download.py ├── text_similarity_bert.py ├── gne_demo.py ├── newspaper_demo.py ├── object.json ├── sentence_tf_demo.py ├── xpath_example.py ├── td-tf.py ├── sentence-transformer-chinese.py ├── data.json ├── sentence-transformer-basic-demo.py ├── influxdb_demo.py ├── sentence-transformer-fine-tuning.py ├── demojson_demo.py ├── pandas_demo.ipynb ├── llm_data_processing.py ├── llm_data_processing_en.py ├── llm_data_process_advanced.py ├── es-demo.py └── image_process.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | local-models/ -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # python爬虫不可不知的N个高效数据清洗方法 -------------------------------------------------------------------------------- /image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rockyzsu/crawler_data_processing/master/image.jpg -------------------------------------------------------------------------------- /sd-generate-1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rockyzsu/crawler_data_processing/master/sd-generate-1.webp -------------------------------------------------------------------------------- /checkpoints/model/eval/similarity_evaluation_train-eval_results.csv: -------------------------------------------------------------------------------- 1 | epoch,steps,cosine_pearson,cosine_spearman 2 | 1.6666666666666665,5,0.982106629364837,0.7 3 | 3.3333333333333335,10,0.9912963642725102,0.8999999999999998 4 | 5.0,15,0.9968199182590103,0.8999999999999998 5 | 6.666666666666667,20,0.9962659612106473,0.8999999999999998 6 | 8.333333333333334,25,0.9962659612106473,0.8999999999999998 7 | 10.0,30,0.9962659612106473,0.8999999999999998 8 | -------------------------------------------------------------------------------- /checkpoints/model_1/eval/similarity_evaluation_train-eval_results.csv: -------------------------------------------------------------------------------- 1 | epoch,steps,cosine_pearson,cosine_spearman 2 | 1.6666666666666665,5,0.979984670271406,0.6 3 | 3.3333333333333335,10,0.9922434965813424,0.8999999999999998 4 | 5.0,15,0.9966130255031076,0.8999999999999998 5 | 6.666666666666667,20,0.9937984320422485,0.8999999999999998 6 | 8.333333333333334,25,0.9937984320422485,0.8999999999999998 7 | 10.0,30,0.9937984320422485,0.8999999999999998 8 | -------------------------------------------------------------------------------- /read_html_example.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | from io import StringIO 4 | 5 | url = "https://www.fortunechina.com/fortune500/c/2024-08/05/content_456697.htm" 6 | 7 | payload = {} 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 10 | 11 | } 12 | 13 | response = requests.request("GET", url, headers=headers, data=payload) 14 | response.encoding = 'utf8' 15 | df = 
pd.read_html(StringIO(response.text))[0] 16 | print(df.head(10)) 17 | -------------------------------------------------------------------------------- /similar_article_detect.py: -------------------------------------------------------------------------------- 1 | from Levenshtein import distance # Need to install python-Levenshtein 2 | 3 | def text_similarity_simple(text1, text2): 4 | # Calculate edit distance 5 | edit_dist = distance(text1, text2) 6 | # Normalize similarity (ranges from 0 to 1, where 1 means identical) 7 | max_len = max(len(text1), len(text2)) 8 | return 1 - edit_dist / max_len if max_len > 0 else 1.0 9 | 10 | # Examples 11 | 12 | text_a = "The quick brown fox jumps over the lazy dog" 13 | text_b = "The quick brown fox jumps over the sleepy dog" 14 | 15 | print(f"Similarity between A and B: {text_similarity_simple(text_a, text_b):.2f}") # Approximately 0.91 -------------------------------------------------------------------------------- /huggingface_download.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import hf_hub_download, snapshot_download 2 | 3 | # 设置镜像 4 | # export 5 | 6 | 7 | # 示例1:下载单个文件(如模型权重文件) 8 | # 下载 "bert-base-uncased" 模型的 config.json 到当前目录 9 | # hf_hub_download( 10 | # repo_id="bert-base-uncased", # 模型/数据集仓库ID(格式:用户名/仓库名) 11 | # filename="config.json", # 要下载的文件名 12 | # local_dir="./bert-model" # 本地保存目录(可选) 13 | # ) 14 | 15 | # 示例2:下载整个模型仓库(推荐) 16 | # 下载 "sentence-transformers/all-MiniLM-L6-v2" 完整模型到本地 17 | model_name = 'uer/sbert-base-chinese-nli' 18 | model_dir = snapshot_download( 19 | repo_id=model_name, 20 | local_dir="./local-models/uer/sbert-base-chinese-nli" # 本地保存路径 21 | ) 22 | 23 | print(f"模型已保存到:{model_dir}") -------------------------------------------------------------------------------- /text_similarity_bert.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, util 2 | 3 | def text_similarity_bert(text1, text2): 4 | # 加载预训练模型(支持多语言) 5 | model = SentenceTransformer('all-MiniLM-L6-v2') # 轻量级模型,适合快速使用 6 | 7 | # 生成句子嵌入向量 8 | embeddings1 = model.encode(text1, convert_to_tensor=True) 9 | embeddings2 = model.encode(text2, convert_to_tensor=True) 10 | 11 | # 计算余弦相似度 12 | cosine_score = util.cos_sim(embeddings1, embeddings2).item() 13 | return cosine_score 14 | 15 | # 示例 16 | text_a = "机器学习是人工智能的核心" 17 | text_b = "人工智能的核心是机器学习" 18 | text_c = "深度学习是机器学习的一个分支" 19 | 20 | print(f"A与B的相似度: {text_similarity_bert(text_a, text_b):.2f}") # 约0.89(语义相似) 21 | print(f"A与C的相似度: {text_similarity_bert(text_a, text_c):.2f}") # 约0.65(相关但不同) 22 | -------------------------------------------------------------------------------- /gne_demo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from gne import GeneralNewsExtractor 3 | 4 | url = "https://www.chinadaily.com.cn/a/202505/24/WS68317c10a310a04af22c1529.html" 5 | 6 | 7 | def crawl(url): 8 | 9 | headers = { 10 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 11 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7,zh-TW;q=0.6', 12 | 'Referer': 'https://www.chinadaily.com.cn/', 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 14 | } 15 | 16 | response = requests.request("GET", url, headers=headers) 17 | return response.text 18 
| 19 | 20 | extractor = GeneralNewsExtractor() 21 | 22 | html = crawl(url) 23 | result = extractor.extract(html) 24 | print('Title --> ', result['title'], '\n') 25 | print('Content -->', result['content'], '\n') 26 | print('PublishTime -->', result['publish_time']) 27 | -------------------------------------------------------------------------------- /newspaper_demo.py: -------------------------------------------------------------------------------- 1 | # from newspaper import Article 2 | 3 | # url = 'http://www.cnn.com/2013/11/27/justice/tucson-arizona-captive-girls/' 4 | # article = Article(url) 5 | # article.download() 6 | 7 | # article.parse() 8 | 9 | # print('Author -->',article.authors) 10 | # print('Publish date -->',article.publish_date) 11 | # print('Article --> ',article.text) 12 | import pandas as pd 13 | import newspaper 14 | cnn_paper = newspaper.build('https://cnn.com') 15 | article_list = [] 16 | for article in cnn_paper.articles: 17 | url = article.url 18 | article = newspaper.Article(url) 19 | article.download() 20 | article.parse() 21 | 22 | print('Author -->',article.authors) 23 | print('Publish date -->',article.publish_date) 24 | print('Article --> ',article.text) 25 | article_obj = { 26 | 'url': url, 27 | 'author': article.authors, 28 | 'publish_date': article.publish_date, 29 | 'text': article.text 30 | } 31 | 32 | article_list.append(article_obj) 33 | 34 | df = pd.DataFrame(article_list) 35 | df.to_excel('cnn_articles.xlsx', index=False) 36 | 37 | -------------------------------------------------------------------------------- /object.json: -------------------------------------------------------------------------------- 1 | var rankData = {datas: [ 2 | "017001,格林港股通臻选混合C,GLGGTZXHHC,2025-05-16,1.4466,1.4466,0.41,5,13.56,11.41,35.75,44.65,,,22.61,44.66,2024-04-22,1,44.6455,,0.00%,,,,", 3 | "020469,长城半导体混合发起式A,CCBDTHHFQSA,2025-05-16,1.3616,1.3616,0.31,-1.84,-2.6,-2.54,-2.16,44.51,,,-1.91,36.16,2024-02-27,1,42.4864,1.50%,0.15%,1,0.15%,1,", 4 | "360011,光大保德信动态优选灵活配置混合A,GDBDXDTYXLHPZHHA,2025-05-16,0.949,2.747,0.32,-1.86,0.53,-3.06,15.31,44.44,-0.32,7.47,13.38,302.23,2009-10-28,1,43.3535,1.50%,0.15%,1,0.15%,1,32.61", 5 | "501208,中欧创新未来混合(LOF),ZOCXWLHHLOF,2025-05-16,0.9825,0.9825,-0.5,0.46,6.4,-0.57,15.64,40.22,12.12,12.79,15.56,-1.75,2020-10-09,1,39.8178,,0.00%,,,,", 6 | "008998,同泰竞争优势混合C,TTJZYSHHC,2025-05-16,1.0334,1.0334,2.91,2.46,13.69,6.56,20.33,40.1,16.24,9.6,30.02,3.34,2020-04-27,1,37.1466,,0.00%,,,,3.43", 7 | "017461,长城久鑫混合C,CCJXHHC,2025-05-16,1.8615,1.8615,2.06,1.73,16.19,10.01,41.84,40,12.91,,40.48,16.38,2022-11-30,1,40.3000,,0.00%,,,," 8 | ],allRecords: 7801,pageIndex: 3,pageNum: 50,allPages: 157,allNum: 17432,zs_count: 3393,gp_count: 1003,hh_count: 7801,zq_count: 4153,qdii_count: 206,fof_count: 874 9 | }; -------------------------------------------------------------------------------- /sentence_tf_demo.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, util 2 | 3 | 4 | # 加载轻量级模型(适合CPU) 5 | model = SentenceTransformer('./local-models/all-MiniLM-L6-v2') 6 | def check(): 7 | # 测试文本嵌入生成 8 | embedding = model.encode("Hello, world!") 9 | print(f"嵌入向量维度: {embedding.shape}") # 应输出 (384,) 10 | 11 | 12 | def sentance_bert(): 13 | 14 | def text_similarity_bert(text1, text2): 15 | # 加载预训练模型(支持多语言) 16 | # model = SentenceTransformer('all-MiniLM-L6-v2') # 轻量级模型,适合快速使用 17 | 18 | # 生成句子嵌入向量 19 | embeddings1 = model.encode(text1, convert_to_tensor=True) 20 | embeddings2 = 
model.encode(text2, convert_to_tensor=True) 21 | 22 | # 计算余弦相似度 23 | cosine_score = util.cos_sim(embeddings1, embeddings2).item() 24 | return cosine_score 25 | 26 | # 示例 27 | text_a = "机器学习是人工智能的核心" 28 | text_b = "人工智能的核心是机器学习" 29 | text_c = "深度学习是机器学习的一个分支" 30 | 31 | print(f"A与B的相似度: {text_similarity_bert(text_a, text_b):.2f}") # 约0.89(语义相似) 32 | print(f"A与C的相似度: {text_similarity_bert(text_a, text_c):.2f}") # 约0.65(相关但不同) 33 | 34 | if __name__ == "__main__": 35 | sentance_bert() -------------------------------------------------------------------------------- /xpath_example.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from parsel import Selector 3 | 4 | 5 | url = "https://www.fortunechina.com/fortune500/c/2024-08/05/content_456697.htm" 6 | 7 | payload = {} 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 10 | 11 | } 12 | 13 | response = requests.request("GET", url, headers=headers, data=payload) 14 | response.encoding = 'utf8' 15 | resp = Selector(text=response.text) 16 | nodes = resp.xpath( 17 | '//div[@class="hf-right word-img2"]/div[@class="word-table"]/div[@class="wt-table-wrap"]/table/tbody/tr') 18 | 19 | cn_count = 0 20 | us_count = 0 21 | for node in nodes: 22 | num = node.xpath('./td[1]/text()').extract_first() 23 | name = node.xpath('./td[2]/a/text()').extract_first() 24 | income = node.xpath('./td[3]/text()').extract_first() 25 | profit = node.xpath('./td[4]/text()').extract_first() 26 | country = node.xpath('./td[5]/text()').extract_first() 27 | if country == '中国': 28 | cn_count += 1 29 | print(num, name, income, profit, country) 30 | 31 | if country == '美国': 32 | us_count += 1 33 | 34 | print('500强中国企业数量:', cn_count) 35 | print('500强美国企业数量:', us_count) 36 | -------------------------------------------------------------------------------- /td-tf.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | 4 | def calculate_similarity(sentences): 5 | # Initialize TF-IDF Vectorizer 6 | vectorizer = TfidfVectorizer(stop_words='english') # Remove common English stop words 7 | 8 | # Transform sentences into TF-IDF vectors 9 | tfidf_matrix = vectorizer.fit_transform(sentences) 10 | 11 | # Get feature names (words) for reference 12 | feature_names = vectorizer.get_feature_names_out() 13 | 14 | # Calculate cosine similarity between the first sentence and others 15 | similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten() 16 | 17 | return similarities, feature_names,tfidf_matrix 18 | 19 | # Example sentences 20 | sentences = [ 21 | "I love reading books", 22 | "I enjoy reading novels", 23 | ] 24 | 25 | # Calculate similarities 26 | similarities, features,tfidf_matrix = calculate_similarity(sentences) 27 | 28 | # Display results 29 | print(f"Base sentence: {sentences[0]}\n") 30 | for i, similarity in enumerate(similarities, 1): 31 | print(f"Sentence {i}: {sentences[i]}") 32 | print(f"Similarity score: {similarity:.4f}\n") 33 | 34 | # Show some important words (high TF-IDF in base sentence) 35 | print("Key words from base sentence (with significant TF-IDF weights):") 36 | base_vector = tfidf_matrix[0].toarray()[0] 37 | top_indices = base_vector.argsort()[-5:][::-1] # Top 5 words 38 | for idx in top_indices: 39 | print(f"- {features[idx]}: 
{base_vector[idx]:.4f}") -------------------------------------------------------------------------------- /sentence-transformer-chinese.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, util 2 | 3 | MODEL_PATH = './local-models/uer/sbert-base-chinese-nli' # Path to a lightweight model suitable for CPU 4 | 5 | def calculate_similarity(text1, text2, model): 6 | """ 7 | Calculate cosine similarity between two texts using a pre-trained model 8 | 9 | Parameters: 10 | text1 (str): First text to compare 11 | text2 (str): Second text to compare 12 | model: Pre-loaded SentenceTransformer model 13 | 14 | Returns: 15 | float: Cosine similarity score between 0 and 1 16 | """ 17 | # Generate embeddings for both texts 18 | embedding1 = model.encode(text1, convert_to_tensor=True) 19 | embedding2 = model.encode(text2, convert_to_tensor=True) 20 | 21 | # Calculate and return cosine similarity 22 | return util.cos_sim(embedding1, embedding2).item() 23 | 24 | 25 | def main(): 26 | # Define the three texts for comparison 27 | text_a = "过量摄入高糖食物会导致血糖快速升高,长期可能增加 2 型糖尿病的发病风险。" 28 | # text_b = "昨天我去饭馆吃饭,很开心" 29 | text_b = "如果经常吃很多含糖量高的东西,血糖会迅速上升,时间久了可能更容易得 2 型糖尿病。" 30 | text_c = "每天坚持 30 分钟有氧运动,能增强心肺功能,改善身体代谢,降低心血管疾病风险。" 31 | 32 | # Load a pre-trained SentenceTransformer model 33 | # Using a lightweight model suitable for general purpose similarity tasks 34 | model = SentenceTransformer(MODEL_PATH) 35 | 36 | # Calculate similarity scores 37 | similarity_ab = calculate_similarity(text_a, text_b, model) 38 | similarity_ac = calculate_similarity(text_a, text_c, model) 39 | 40 | # Display results with formatted output 41 | print(f"Text A: {text_a}\n") 42 | print(f"Text B: {text_b}") 43 | print(f"Similarity between A and B: {similarity_ab:.4f}") 44 | print("\n" + "-"*50 + "\n") 45 | print(f"Text C: {text_c}") 46 | print(f"Similarity between A and C: {similarity_ac:.4f}") 47 | 48 | # Provide a simple interpretation of the results 49 | print("\nInterpretation:") 50 | print(f"- A and B are {'highly similar' if similarity_ab > 0.7 else 'moderately similar' if similarity_ab > 0.4 else 'not very similar'}") 51 | print(f"- A and C are {'highly similar' if similarity_ac > 0.7 else 'moderately similar' if similarity_ac > 0.4 else 'not very similar'}") 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | { 2 | "count": 82, 3 | "next": "https://swapi.dev/api/people/?page=2", 4 | "previous": null, 5 | "results": [ 6 | { 7 | "name": "Luke Skywalker", 8 | "height": "172", 9 | "mass": "77", 10 | "hair_color": "blond", 11 | "skin_color": "fair", 12 | "eye_color": "blue", 13 | "birth_year": "19BBY", 14 | "gender": "male", 15 | "homeworld": "https://swapi.dev/api/planets/1/", 16 | "films": [ 17 | "https://swapi.dev/api/films/1/", 18 | "https://swapi.dev/api/films/2/", 19 | "https://swapi.dev/api/films/3/", 20 | "https://swapi.dev/api/films/6/" 21 | ], 22 | "species": [], 23 | "vehicles": [ 24 | "https://swapi.dev/api/vehicles/14/", 25 | "https://swapi.dev/api/vehicles/30/" 26 | ], 27 | "starships": [ 28 | "https://swapi.dev/api/starships/12/", 29 | "https://swapi.dev/api/starships/22/" 30 | ], 31 | "created": "2014-12-09T13:50:51.644000Z", 32 | "edited": "2014-12-20T21:17:56.891000Z", 33 | "url": "https://swapi.dev/api/people/1/" 34 | }, 35 | { 36 | "name": "C-3PO", 
37 | "height": "167", 38 | "mass": "75", 39 | "hair_color": "n/a", 40 | "skin_color": "gold", 41 | "eye_color": "yellow", 42 | "birth_year": "112BBY", 43 | "gender": "n/a", 44 | "homeworld": "https://swapi.dev/api/planets/1/", 45 | "films": [ 46 | "https://swapi.dev/api/films/1/", 47 | "https://swapi.dev/api/films/2/", 48 | "https://swapi.dev/api/films/3/", 49 | "https://swapi.dev/api/films/4/", 50 | "https://swapi.dev/api/films/5/", 51 | "https://swapi.dev/api/films/6/" 52 | ], 53 | "species": [ 54 | "https://swapi.dev/api/species/2/" 55 | ], 56 | "vehicles": [], 57 | "starships": [], 58 | "created": "2014-12-10T15:10:51.357000Z", 59 | "edited": "2014-12-20T21:17:50.309000Z", 60 | "url": "https://swapi.dev/api/people/2/" 61 | } 62 | ] 63 | } -------------------------------------------------------------------------------- /sentence-transformer-basic-demo.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, util 2 | 3 | MODEL_PATH = './local-models/all-MiniLM-L6-v2' # Path to a lightweight model suitable for CPU 4 | 5 | def calculate_similarity(text1, text2, model): 6 | """ 7 | Calculate cosine similarity between two texts using a pre-trained model 8 | 9 | Parameters: 10 | text1 (str): First text to compare 11 | text2 (str): Second text to compare 12 | model: Pre-loaded SentenceTransformer model 13 | 14 | Returns: 15 | float: Cosine similarity score between 0 and 1 16 | """ 17 | # Generate embeddings for both texts 18 | embedding1 = model.encode(text1, convert_to_tensor=True) 19 | embedding2 = model.encode(text2, convert_to_tensor=True) 20 | 21 | # Calculate and return cosine similarity 22 | return util.cos_sim(embedding1, embedding2).item() 23 | 24 | 25 | def main(): 26 | # Define the three texts for comparison 27 | text_a = "Artificial intelligence is transforming modern society through automation and data analysis." 28 | text_b = "Machine learning algorithms are changing contemporary culture by automating processes and analyzing information." 29 | text_c = "Climate change affects global weather patterns and requires immediate environmental action." 
30 | 31 | # Load a pre-trained SentenceTransformer model 32 | # Using a lightweight model suitable for general purpose similarity tasks 33 | model = SentenceTransformer(MODEL_PATH) 34 | 35 | # Calculate similarity scores 36 | similarity_ab = calculate_similarity(text_a, text_b, model) 37 | similarity_ac = calculate_similarity(text_a, text_c, model) 38 | 39 | # Display results with formatted output 40 | print(f"Text A: {text_a}\n") 41 | print(f"Text B: {text_b}") 42 | print(f"Similarity between A and B: {similarity_ab:.4f}") 43 | print("\n" + "-"*50 + "\n") 44 | print(f"Text C: {text_c}") 45 | print(f"Similarity between A and C: {similarity_ac:.4f}") 46 | 47 | # Provide a simple interpretation of the results 48 | print("\nInterpretation:") 49 | print(f"- A and B are {'highly similar' if similarity_ab > 0.7 else 'moderately similar' if similarity_ab > 0.4 else 'not very similar'}") 50 | print(f"- A and C are {'highly similar' if similarity_ac > 0.7 else 'moderately similar' if similarity_ac > 0.4 else 'not very similar'}") 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /influxdb_demo.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from influxdb import InfluxDBClient 3 | import yfinance as yf 4 | # from influxdb_client import InfluxDBClient, Point, WritePrecision 5 | # from influxdb_client.client.write_api import SYNCHRONOUS 6 | from datetime import datetime, timedelta 7 | import pytz 8 | import akshare as ak 9 | 10 | # ======================== 11 | # 配置区域(按需修改) 12 | # ======================== 13 | 14 | INFLUX_CONFIG = { 15 | "url": "http://localhost:8086", # InfluxDB地址 16 | "token": "your_admin_token", # InfluxDB访问令牌 17 | "org": "your_org", # 组织名称 18 | "bucket": "stock_data" # 存储桶名称 19 | } 20 | 21 | STOCKS = ["AAPL", "MSFT", "GOOGL", "TSLA"] # 要监控的股票代码 22 | TIMEZONE = pytz.timezone("America/New_York") # 股票市场时区 23 | 24 | 25 | # ======================== 26 | # 获取股票数据 27 | # ======================== 28 | def fetch_stock_data(ticker): 29 | 30 | df = ak.stock_us_daily(symbol=ticker) # 苹果历史数据 31 | print(df.tail()) 32 | return df.tail(200) 33 | 34 | 35 | # connect to InfluxDB 36 | client = InfluxDBClient(host='localhost', port=8086) 37 | client.switch_database('db_stock') 38 | 39 | 40 | def write_data_to_influx_db(df): 41 | 42 | date_column = 'date' 43 | # make sure date column is datetime type 44 | if not pd.api.types.is_datetime64_any_dtype(df[date_column]): 45 | df[date_column] = pd.to_datetime(df[date_column]) 46 | df['symbol'] = 'NVDA' 47 | # set date column as index 48 | points = [] 49 | for _, row in df.iterrows(): 50 | point = { 51 | "measurement": "stock_data", # measurement is like a table in SQL 52 | "time": row["date"], 53 | "tags": { 54 | "ticker": row["symbol"] # ticker is like a column in SQL 55 | }, 56 | "fields": { 57 | "open": row["open"], 58 | "close": row["close"], 59 | "high": row["high"], 60 | "low": row["low"], 61 | "volume": row["volume"] 62 | 63 | } 64 | } 65 | points.append(point) 66 | 67 | try: 68 | client.write_points( 69 | points=points, 70 | time_precision='s' 71 | ) 72 | except Exception as e: 73 | print(f"Error writing data: {e}") 74 | finally: 75 | client.close() 76 | 77 | def query_data_from_influx_db(): 78 | query = 'SELECT * FROM stock_data WHERE ticker = \'NVDA\' ORDER BY time DESC' 79 | result = client.query(query) 80 | df = pd.DataFrame(list(result.get_points())) 81 | print(df.head()) 82 | 83 | if __name__ == 
'__main__': 84 | # df = fetch_stock_data("NVDA") 85 | # write_data_to_influx_db(df) 86 | query_data_from_influx_db() -------------------------------------------------------------------------------- /sentence-transformer-fine-tuning.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util 2 | from torch.utils.data import Dataset, DataLoader 3 | import pandas as pd 4 | import random 5 | 6 | # ---------------------- 7 | # 1. 自定义数据集(映射式数据集) 8 | # ---------------------- 9 | class CustomDataset(Dataset): 10 | def __init__(self, examples): 11 | self.examples = examples 12 | 13 | def __len__(self): 14 | return len(self.examples) 15 | 16 | def __getitem__(self, idx): 17 | return self.examples[idx] 18 | 19 | # 准备数据 20 | data = [ 21 | {"sentence1": "人工智能正在改变世界", "sentence2": "AI技术对全球产生深远影响", "score": 0.92}, 22 | {"sentence1": "我喜欢用Python编程", "sentence2": "我热爱使用Python编写代码", "score": 0.88}, 23 | {"sentence1": "北京是中国的首都", "sentence2": "上海是中国的经济中心", "score": 0.23}, 24 | {"sentence1": "今天天气真好", "sentence2": "今天天气很不错", "score": 0.85}, 25 | {"sentence1": "机器学习是人工智能的分支", "sentence2": "深度学习属于机器学习领域", "score": 0.76}, 26 | ] 27 | 28 | df = pd.DataFrame(data) 29 | 30 | # 构建训练样本并打乱 31 | train_examples = [ 32 | InputExample( 33 | texts=[row["sentence1"], row["sentence2"]], 34 | label=row["score"] 35 | ) for _, row in df.iterrows() 36 | ] 37 | random.shuffle(train_examples) 38 | 39 | # 使用自定义数据集 40 | train_dataset = CustomDataset(train_examples) 41 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2) 42 | 43 | # ---------------------- 44 | # 2. 加载基础模型 45 | # ---------------------- 46 | model = SentenceTransformer("./local-models/uer/sbert-base-chinese-nli") 47 | 48 | # ---------------------- 49 | # 3. 配置训练参数 50 | # ---------------------- 51 | train_loss = losses.CosineSimilarityLoss(model) 52 | 53 | evaluator = evaluation.EmbeddingSimilarityEvaluator( 54 | sentences1=df["sentence1"].tolist(), 55 | sentences2=df["sentence2"].tolist(), 56 | scores=df["score"].tolist(), 57 | name="train-eval" 58 | ) 59 | 60 | num_epochs = 10 61 | warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) 62 | output_path = "./local-models/fine-tuned-model" 63 | 64 | # ---------------------- 65 | # 4. 执行训练 66 | # ---------------------- 67 | model.fit( 68 | train_objectives=[(train_dataloader, train_loss)], 69 | evaluator=evaluator, 70 | epochs=num_epochs, 71 | warmup_steps=warmup_steps, 72 | output_path=output_path, 73 | show_progress_bar=True, 74 | evaluation_steps=5 75 | ) 76 | 77 | # ---------------------- 78 | # 5. 
测试微调后的模型(修复余弦相似度计算) 79 | # ---------------------- 80 | fine_tuned_model = SentenceTransformer(output_path) 81 | 82 | sentence1 = "自然语言处理是人工智能的重要领域" 83 | sentence2 = "NLP是AI的关键分支" 84 | embedding1 = fine_tuned_model.encode(sentence1, convert_to_tensor=True) 85 | embedding2 = fine_tuned_model.encode(sentence2, convert_to_tensor=True) 86 | 87 | # 关键修复:使用util.cos_sim而非losses.CosineSimilarityLoss.cos_sim 88 | similarity = util.cos_sim(embedding1, embedding2).item() 89 | 90 | print(f"微调后模型计算的相似度: {similarity:.4f}") 91 | -------------------------------------------------------------------------------- /demojson_demo.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import requests 3 | headers = { 4 | "Accept": "*/*", 5 | "Accept-Encoding": "gzip, deflate", 6 | "Accept-Language": "zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7", 7 | "Cache-Control": "no-cache", 8 | "Cookie": "AUTH_FUND.EASTMONEY.COM_GSJZ=AUTH*TTJJ*TOKEN; em_hq_fls=js; HAList=a-sh-603707-%u5065%u53CB%u80A1%u4EFD%2Ca-sz-300999-%u91D1%u9F99%u9C7C%2Ca-sh-605338-%u5DF4%u6BD4%u98DF%u54C1%2Ca-sh-600837-%u6D77%u901A%u8BC1%u5238%2Ca-sh-600030-%u4E2D%u4FE1%u8BC1%u5238%2Ca-sz-300059-%u4E1C%u65B9%u8D22%u5BCC%2Cd-hk-06185; EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; qgqp_b_id=956b72f8de13e912a4fc731a7845a6f8; searchbar_code=163407_588080_501077_163406_001665_001664_007049_004433_005827_110011; EMFUND0=null; EMFUND5=02-24%2019%3A30%3A19@%23%24%u5357%u65B9%u6709%u8272%u91D1%u5C5EETF%u8054%u63A5C@%23%24004433; EMFUND6=02-24%2021%3A46%3A42@%23%24%u5357%u65B9%u4E2D%u8BC1%u7533%u4E07%u6709%u8272%u91D1%u5C5EETF@%23%24512400; EMFUND7=02-24%2021%3A58%3A27@%23%24%u6613%u65B9%u8FBE%u84DD%u7B79%u7CBE%u9009%u6DF7%u5408@%23%24005827; EMFUND8=03-05%2015%3A33%3A29@%23%24%u6613%u65B9%u8FBE%u4E2D%u5C0F%u76D8%u6DF7%u5408@%23%24110011; EMFUND9=03-05 23:47:41@#$%u5929%u5F18%u4F59%u989D%u5B9D%u8D27%u5E01@%23%24000198; ASP.NET_SessionId=ntwtbzdkb0vpkzvil2a3h1ip; st_si=44251094035925; st_asi=delete; st_pvi=77351447730109; st_sp=2020-08-16%2015%3A54%3A02; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=3; st_psi=20210309200219784-0-8081344721", 9 | "Host": "fund.eastmoney.com", 10 | "Pragma": "no-cache", 11 | "Proxy-Connection": "keep-alive", 12 | "Referer": "http://fund.eastmoney.com/data/fundranking.html", 13 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36", 14 | } 15 | 16 | 17 | def demo1(): 18 | url = "https://fund.eastmoney.com/Data/Fund_JJJZ_Data.aspx" 19 | params = { 20 | "t": "8", 21 | "page": "1,50000", 22 | "js": "reData", 23 | "sort": "fcode,asc", 24 | } 25 | 26 | r = requests.get(url, params=params, headers=headers) 27 | data_text = r.text 28 | print(data_text) 29 | 30 | 31 | def demo2(): 32 | time_interval = 'jnzf' 33 | ft = 'hh' 34 | td_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d') 35 | td_dt = datetime.datetime.strptime(td_str, '%Y-%m-%d') 36 | # 去年今日 37 | last_dt = td_dt - datetime.timedelta(days=365) 38 | last_str = datetime.datetime.strftime(last_dt, '%Y-%m-%d') 39 | # rank_url = 'http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft={0}&rs=&gs=0&sc={1}zf&st=desc&sd={2}&ed={3}&qdii=&tabSubtype=,,,,,&pi=1&pn=10000&dx=1'.format( 40 | # ft, time_interval, last_str, td_str) 41 | rank_url = 'http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft={0}&rs=&gs=0&sc={1}&st=desc&sd={2}&ed={3}&qdii=&tabSubtype=,,,,,&pi=1&pn=10000&dx=1'.format( 42 | ft, time_interval, last_str, 
td_str) 43 | 44 | print(rank_url) 45 | # r = requests.get(rank_url, headers=headers) 46 | # print(r.text) 47 | 48 | def demo3(): 49 | url="https://swapi.dev/api/people/" 50 | r = requests.get(url,verify=False) 51 | data_text = r.json() 52 | print(data_text) 53 | 54 | 55 | def dump_mongodb(): 56 | import pymongo 57 | 58 | url="https://swapi.dev/api/people/" 59 | response = requests.get(url,verify=False) 60 | json_data = response.json() 61 | 62 | user='root' 63 | password='byb202007leave' 64 | host='134.175.130.90' 65 | port='17018' 66 | 67 | connect_uri = f'mongodb://{user}:{password}@{host}:{port}' 68 | client = pymongo.MongoClient(connect_uri) 69 | db = client['db_spider'] 70 | collection = db['wars_star'] 71 | collection.insert_many(json_data['results']) 72 | 73 | # demo2() 74 | # demo3() 75 | dump_mongodb() 76 | -------------------------------------------------------------------------------- /pandas_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "from unicodedata import normalize\n", 13 | "\n", 14 | "table_MN = pd.read_html('https://en.wikipedia.org/wiki/Minnesota')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "7f80830e", 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "
\n", 27 | "\n", 40 | "\n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | "
LocationJuly (°F)July (°C)January (°F)January (°C)
0Minneapolis83/6428/1823/7−4/−13
1Saint Paul83/6328/1723/6−5/−14
2Rochester82/6328/1723/3−5/−16
3Duluth76/5524/1319/1−7/−17
4St. Cloud81/5827/1418/−1−7/−18
5Mankato86/6230/1623/3−5/−16
6International Falls77/5225/1115/−6−9/−21
\n", 110 | "
" 111 | ], 112 | "text/plain": [ 113 | " Location July (°F) July (°C) January (°F) January (°C)\n", 114 | "0 Minneapolis 83/64 28/18 23/7 −4/−13\n", 115 | "1 Saint Paul 83/63 28/17 23/6 −5/−14\n", 116 | "2 Rochester 82/63 28/17 23/3 −5/−16\n", 117 | "3 Duluth 76/55 24/13 19/1 −7/−17\n", 118 | "4 St. Cloud 81/58 27/14 18/−1 −7/−18\n", 119 | "5 Mankato 86/62 30/16 23/3 −5/−16\n", 120 | "6 International Falls 77/52 25/11 15/−6 −9/−21" 121 | ] 122 | }, 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | }, 127 | { 128 | "ename": "", 129 | "evalue": "", 130 | "output_type": "error", 131 | "traceback": [ 132 | "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", 133 | "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", 134 | "\u001b[1;31mClick here for more info. \n", 135 | "\u001b[1;31mView Jupyter log for further details." 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "table_MN[2]" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "86a15f8a", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "py11", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.11.5" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 5 173 | } 174 | -------------------------------------------------------------------------------- /llm_data_processing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Tuple 6 | import nltk 7 | from nltk.corpus import stopwords 8 | from nltk.tokenize import word_tokenize 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | import hashlib 11 | 12 | # 下载NLTK资源(首次次运行需要) 13 | nltk.download('stopwords') 14 | nltk.download('punkt') 15 | 16 | class LLMDataCleaner: 17 | def __init__(self): 18 | """初始化数据清洗工具,加载停用词等资源""" 19 | self.stop_words = set(stopwords.words('english')) # 英文停用词 20 | # 扩展停用词表(可根据需求添加中文停用词) 21 | self.custom_stop_words = {"http", "https", "www", "com", "html", "jpg", "png"} 22 | self.stop_words.update(self.custom_stop_words) 23 | 24 | def remove_duplicates(self, texts: List[str]) -> Tuple[List[str], int]: 25 | """ 26 | 去除重复文本 27 | :param texts: 文本列表 28 | :return: 去重后的文本列表和去除的重复数量 29 | """ 30 | # 使用哈希值快速检测重复 31 | seen = set() 32 | unique_texts = [] 33 | duplicates = 0 34 | 35 | for text in texts: 36 | # 对文本进行哈希处理(忽略大小写和前后空格) 37 | text_normalized = text.strip().lower() 38 | text_hash = hashlib.md5(text_normalized.encode()).hexdigest() 39 | 40 | if text_hash not in seen: 41 | seen.add(text_hash) 42 | unique_texts.append(text) 43 | else: 44 | duplicates += 1 45 | 46 | return unique_texts, duplicates 47 | 48 | def filter_short_texts(self, texts: List[str], min_length: int = 10) -> Tuple[List[str], int]: 49 | """ 50 | 过滤过短文本(通常包含噪声) 51 | :param texts: 文本列表 52 | :param min_length: 最小字符长度 53 | :return: 过滤后的文本列表和去除的短文本数量 54 | """ 55 | filtered = [] 56 | removed = 0 57 | 58 | for text in texts: 59 | if len(text.strip()) >= min_length: 60 | filtered.append(text) 
61 | else: 62 | removed += 1 63 | 64 | return filtered, removed 65 | 66 | def clean_special_characters(self, text: str) -> str: 67 | """ 68 | 清理特殊字符、乱码和多余空格 69 | :param text: 原始文本 70 | :return: 清理后的文本 71 | """ 72 | # 去除URL 73 | text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 74 | 75 | # 去除HTML标签 76 | text = re.sub(r'<.*?>', '', text) 77 | 78 | # 去除特殊字符和乱码(保留基本标点和字母数字) 79 | text = re.sub(r'[^\w\s.,!?\'\"-]', '', text) 80 | 81 | # 合并多个空格为一个 82 | text = re.sub(r'\s+', ' ', text).strip() 83 | 84 | return text 85 | 86 | def remove_stopwords(self, text: str, language: str = 'english') -> str: 87 | """ 88 | 去除停用词(可选步骤,根据模型需求决定) 89 | :param text: 原始文本 90 | :param language: 语言(目前支持英文) 91 | :return: 去除停用词后的文本 92 | """ 93 | if language != 'english': 94 | return text # 可扩展支持其他语言 95 | 96 | words = word_tokenize(text) 97 | filtered_words = [word for word in words if word.lower() not in self.stop_words] 98 | return ' '.join(filtered_words) 99 | 100 | def normalize_case(self, text: str, case: str = 'lower') -> str: 101 | """ 102 | 规范化大小写(通常转为小写,减少词汇表大小) 103 | :param text: 原始文本 104 | :param case: 目标大小写('lower'或'upper') 105 | :return: 规范化后的文本 106 | """ 107 | if case == 'lower': 108 | return text.lower() 109 | elif case == 'upper': 110 | return text.upper() 111 | return text 112 | 113 | def filter_low_quality_texts(self, texts: List[str], quality_threshold: float = 0.3) -> Tuple[List[str], int]: 114 | """ 115 | 过滤低质量文本(基于非标点字符比例) 116 | :param texts: 文本列表 117 | :param quality_threshold: 非标点字符占比阈值 118 | :return: 过滤后的文本列表和去除的低质量文本数量 119 | """ 120 | filtered = [] 121 | removed = 0 122 | 123 | for text in texts: 124 | if not text: 125 | removed += 1 126 | continue 127 | 128 | # 计算非标点字符比例 129 | total_chars = len(text) 130 | punctuation_chars = sum(1 for c in text if c in string.punctuation) 131 | non_punct_ratio = (total_chars - punctuation_chars) / total_chars 132 | 133 | if non_punct_ratio >= quality_threshold: 134 | filtered.append(text) 135 | else: 136 | removed += 1 137 | 138 | return filtered, removed 139 | 140 | def detect_language(self, text: str) -> str: 141 | """ 142 | 简单语言检测(基于字符集) 143 | :param text: 文本 144 | :return: 语言标识('en'/'zh'/'other') 145 | """ 146 | # 检测中文字符 147 | if re.search(r'[\u4e00-\u9fff]', text): 148 | return 'zh' 149 | # 检测英文字符 150 | elif re.search(r'[a-zA-Z]', text): 151 | return 'en' 152 | else: 153 | return 'other' 154 | 155 | def process_batch(self, texts: List[str], min_length: int = 10, quality_threshold: float = 0.3) -> Tuple[List[str], dict]: 156 | """ 157 | 批量处理文本的完整流程 158 | :param texts: 原始文本列表 159 | :param min_length: 最小长度阈值 160 | :param quality_threshold: 质量阈值 161 | :return: 清洗后的文本列表和处理统计信息 162 | """ 163 | stats = { 164 | 'original_count': len(texts), 165 | 'duplicates_removed': 0, 166 | 'short_texts_removed': 0, 167 | 'low_quality_removed': 0, 168 | 'other_removed': 0, 169 | 'final_count': 0 170 | } 171 | 172 | # 1. 去除重复文本 173 | unique_texts, duplicates = self.remove_duplicates(texts) 174 | stats['duplicates_removed'] = duplicates 175 | 176 | # 2. 过滤过短文本 177 | filtered_length, short_removed = self.filter_short_texts(unique_texts, min_length) 178 | stats['short_texts_removed'] = short_removed 179 | 180 | # 3. 清理特殊字符和规范化 181 | cleaned = [] 182 | for text in filtered_length: 183 | # 清理特殊字符 184 | text_clean = self.clean_special_characters(text) 185 | # 规范化大小写(英文) 186 | lang = self.detect_language(text_clean) 187 | if lang == 'en': 188 | text_clean = self.normalize_case(text_clean, 'lower') 189 | cleaned.append(text_clean) 190 | 191 | # 4. 
过滤低质量文本 192 | high_quality, low_quality_removed = self.filter_low_quality_texts(cleaned, quality_threshold) 193 | stats['low_quality_removed'] = low_quality_removed 194 | 195 | # 5. 最终统计 196 | stats['final_count'] = len(high_quality) 197 | stats['other_removed'] = stats['original_count'] - stats['final_count'] - sum([ 198 | stats['duplicates_removed'], 199 | stats['short_texts_removed'], 200 | stats['low_quality_removed'] 201 | ]) 202 | 203 | return high_quality, stats 204 | 205 | 206 | # 使用示例 207 | if __name__ == "__main__": 208 | # 示例数据(模拟从文件读取的原始文本) 209 | raw_texts = [ 210 | "Hello world! This is a sample text for LLM training. ", 211 | "Hello world! This is a sample text for LLM training. ", # 重复文本 212 | "Bad text!!!???", # 低质量文本(标点过多) 213 | "Short.", # 过短文本 214 | "https://example.com - Check this website!", # 包含URL 215 | "

HTML tagged text

", # 包含HTML标签 216 | "中文文本示例,测试多语言处理。", 217 | "Another example with multiple spaces and special chars: @#$%" 218 | ] 219 | 220 | # 初始化清洗器 221 | cleaner = LLMDataCleaner() 222 | 223 | # 批量处理文本 224 | cleaned_texts, stats = cleaner.process_batch( 225 | raw_texts, 226 | min_length=8, 227 | quality_threshold=0.5 228 | ) 229 | 230 | # 输出结果 231 | print("清洗统计:") 232 | for key, value in stats.items(): 233 | print(f"{key}: {value}") 234 | 235 | print("\n清洗后的文本:") 236 | for i, text in enumerate(cleaned_texts, 1): 237 | print(f"{i}. {text}") 238 | -------------------------------------------------------------------------------- /llm_data_processing_en.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import hashlib 4 | from typing import List, Tuple 5 | import nltk 6 | from nltk.corpus import stopwords 7 | from nltk.tokenize import word_tokenize 8 | 9 | # Download NLTK resources (required for first run) 10 | nltk.download('stopwords') 11 | nltk.download('punkt') 12 | 13 | class LLMDataCleaner: 14 | def __init__(self): 15 | """Initialize data cleaning utility and load resources like stop words""" 16 | self.stop_words = set(stopwords.words('english')) # English stop words 17 | # Extended custom stop words (can be extended as needed) 18 | self.custom_stop_words = {"http", "https", "www", "com", "html", "jpg", "png"} 19 | self.stop_words.update(self.custom_stop_words) 20 | 21 | def remove_duplicates(self, texts: List[str]) -> Tuple[List[str], int]: 22 | """ 23 | Remove duplicate texts 24 | :param texts: List of texts 25 | :return: Tuple of (deduplicated text list, number of duplicates removed) 26 | """ 27 | # Use hash values for fast duplicate detection 28 | seen = set() 29 | unique_texts = [] 30 | duplicates = 0 31 | 32 | for text in texts: 33 | # Normalize text before hashing (ignore case and leading/trailing spaces) 34 | text_normalized = text.strip().lower() 35 | text_hash = hashlib.md5(text_normalized.encode()).hexdigest() 36 | 37 | if text_hash not in seen: 38 | seen.add(text_hash) 39 | unique_texts.append(text) 40 | else: 41 | duplicates += 1 42 | 43 | return unique_texts, duplicates 44 | 45 | def filter_short_texts(self, texts: List[str], min_length: int = 10) -> Tuple[List[str], int]: 46 | """ 47 | Filter out excessively short texts (usually containing noise) 48 | :param texts: List of texts 49 | :param min_length: Minimum character length threshold 50 | :return: Tuple of (filtered text list, number of short texts removed) 51 | """ 52 | filtered = [] 53 | removed = 0 54 | 55 | for text in texts: 56 | if len(text.strip()) >= min_length: 57 | filtered.append(text) 58 | else: 59 | removed += 1 60 | 61 | return filtered, removed 62 | 63 | def clean_special_characters(self, text: str) -> str: 64 | """ 65 | Clean special characters, garbled code, and extra spaces 66 | :param text: Original text 67 | :return: Cleaned text 68 | """ 69 | # Remove URLs 70 | text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 71 | 72 | # Remove HTML tags 73 | text = re.sub(r'<.*?>', '', text) 74 | 75 | # Remove special characters and garbled code (retain basic punctuation and alphanumerics) 76 | text = re.sub(r'[^\w\s.,!?\'\"-]', '', text) 77 | 78 | # Merge multiple spaces into one 79 | text = re.sub(r'\s+', ' ', text).strip() 80 | 81 | return text 82 | 83 | def remove_stopwords(self, text: str, language: str = 'english') -> str: 84 | """ 85 | Remove stop words (optional step, depends on model requirements) 86 | :param text: Original 
text 87 | :param language: Language (currently supports English) 88 | :return: Text with stop words removed 89 | """ 90 | if language != 'english': 91 | return text # Can be extended to support other languages 92 | 93 | words = word_tokenize(text) 94 | filtered_words = [word for word in words if word.lower() not in self.stop_words] 95 | return ' '.join(filtered_words) 96 | 97 | def normalize_case(self, text: str, case: str = 'lower') -> str: 98 | """ 99 | Normalize text case (usually convert to lowercase to reduce vocabulary size) 100 | :param text: Original text 101 | :param case: Target case ('lower' or 'upper') 102 | :return: Case-normalized text 103 | """ 104 | if case == 'lower': 105 | return text.lower() 106 | elif case == 'upper': 107 | return text.upper() 108 | return text 109 | 110 | def filter_low_quality_texts(self, texts: List[str], quality_threshold: float = 0.3) -> Tuple[List[str], int]: 111 | """ 112 | Filter low-quality texts (based on proportion of non-punctuation characters) 113 | :param texts: List of texts 114 | :param quality_threshold: Threshold for proportion of non-punctuation characters 115 | :return: Tuple of (filtered text list, number of low-quality texts removed) 116 | """ 117 | filtered = [] 118 | removed = 0 119 | 120 | for text in texts: 121 | if not text: 122 | removed += 1 123 | continue 124 | 125 | # Calculate ratio of non-punctuation characters 126 | total_chars = len(text) 127 | punctuation_chars = sum(1 for c in text if c in string.punctuation) 128 | non_punct_ratio = (total_chars - punctuation_chars) / total_chars 129 | 130 | if non_punct_ratio >= quality_threshold: 131 | filtered.append(text) 132 | else: 133 | removed += 1 134 | 135 | return filtered, removed 136 | 137 | def detect_language(self, text: str) -> str: 138 | """ 139 | Simple language detection (based on character set) 140 | :param text: Input text 141 | :return: Language code ('en'/'zh'/'other') 142 | """ 143 | # Detect Chinese characters 144 | if re.search(r'[\u4e00-\u9fff]', text): 145 | return 'zh' 146 | # Detect English characters 147 | elif re.search(r'[a-zA-Z]', text): 148 | return 'en' 149 | else: 150 | return 'other' 151 | 152 | def process_batch(self, texts: List[str], min_length: int = 10, quality_threshold: float = 0.3) -> Tuple[List[str], dict]: 153 | """ 154 | Complete processing pipeline for batch text cleaning 155 | :param texts: Original list of texts 156 | :param min_length: Minimum length threshold 157 | :param quality_threshold: Quality threshold for filtering 158 | :return: Tuple of (cleaned text list, processing statistics) 159 | """ 160 | stats = { 161 | 'original_count': len(texts), 162 | 'duplicates_removed': 0, 163 | 'short_texts_removed': 0, 164 | 'low_quality_removed': 0, 165 | 'other_removed': 0, 166 | 'final_count': 0 167 | } 168 | 169 | # 1. Remove duplicate texts 170 | unique_texts, duplicates = self.remove_duplicates(texts) 171 | stats['duplicates_removed'] = duplicates 172 | 173 | # 2. Filter out short texts 174 | filtered_length, short_removed = self.filter_short_texts(unique_texts, min_length) 175 | stats['short_texts_removed'] = short_removed 176 | 177 | # 3. Clean special characters and normalize 178 | cleaned = [] 179 | for text in filtered_length: 180 | # Clean special characters 181 | text_clean = self.clean_special_characters(text) 182 | # Normalize case (for English) 183 | lang = self.detect_language(text_clean) 184 | if lang == 'en': 185 | text_clean = self.normalize_case(text_clean, 'lower') 186 | cleaned.append(text_clean) 187 | 188 | # 4. 
Filter low quality texts 189 | high_quality, low_quality_removed = self.filter_low_quality_texts(cleaned, quality_threshold) 190 | stats['low_quality_removed'] = low_quality_removed 191 | 192 | # 5. Final statistics 193 | stats['final_count'] = len(high_quality) 194 | stats['other_removed'] = stats['original_count'] - stats['final_count'] - sum([ 195 | stats['duplicates_removed'], 196 | stats['short_texts_removed'], 197 | stats['low_quality_removed'] 198 | ]) 199 | 200 | return high_quality, stats 201 | 202 | 203 | # Usage example 204 | if __name__ == "__main__": 205 | # Sample data (simulating raw text read from files) 206 | raw_texts = [ 207 | "Hello world! This is a sample text for LLM training. ", 208 | "Hello world! This is a sample text for LLM training. ", # Duplicate text 209 | "Bad text!!!???", # Low quality text (too many punctuation) 210 | "Short.", # Excessively short text 211 | "https://example.com - Check this website!", # Contains URL 212 | "

HTML tagged text

", # Contains HTML tags 213 | "中文文本示例,测试多语言处理。", # Chinese text example 214 | "Another example with multiple spaces and special chars: @#$%" 215 | ] 216 | 217 | # Initialize cleaner 218 | cleaner = LLMDataCleaner() 219 | 220 | # Process text batch 221 | cleaned_texts, stats = cleaner.process_batch( 222 | raw_texts, 223 | min_length=8, 224 | quality_threshold=0.5 225 | ) 226 | 227 | # Output results 228 | print("Cleaning statistics:") 229 | for key, value in stats.items(): 230 | print(f"{key}: {value}") 231 | 232 | print("\nCleaned texts:") 233 | for i, text in enumerate(cleaned_texts, 1): 234 | print(f"{i}. {text}") 235 | -------------------------------------------------------------------------------- /llm_data_process_advanced.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import hashlib 4 | import spacy 5 | import fasttext 6 | import dask.bag as db 7 | from dask.diagnostics import ProgressBar 8 | from typing import List, Tuple, Dict 9 | import torch 10 | from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline 11 | import numpy as np 12 | from langdetect import detect, LangDetectException 13 | 14 | # ---------------------- 15 | # Resource Initialization 16 | # ---------------------- 17 | # Load NLP models (download first if needed) 18 | # spacy download en_core_web_lg 19 | # spacy download zh_core_web_lg 20 | try: 21 | nlp_en = spacy.load("en_core_web_lg") # For English NER and parsing 22 | nlp_zh = spacy.load("zh_core_web_lg") # For Chinese NER 23 | except: 24 | print("Warning: SpaCy models not found. Sensitive info detection may be limited.") 25 | nlp_en = None 26 | nlp_zh = None 27 | 28 | # FastText for language detection (more accurate than regex) 29 | # Download model: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin 30 | try: 31 | ft_model = fasttext.load_model('lid.176.bin') 32 | except: 33 | print("Warning: FastText model not found. 
Using fallback language detection.") 34 | ft_model = None 35 | 36 | # Load quality assessment model (assesses text coherence/information density) 37 | quality_model_name = "microsoft/xtremedistil-l6-h384-uncased" 38 | quality_tokenizer = AutoTokenizer.from_pretrained(quality_model_name) 39 | quality_model = AutoModelForSequenceClassification.from_pretrained( 40 | quality_model_name, 41 | num_labels=2 # 0: low quality, 1: high quality (fine-tuned on custom data) 42 | ) 43 | quality_pipeline = pipeline( 44 | "text-classification", 45 | model=quality_model, 46 | tokenizer=quality_tokenizer, 47 | device=0 if torch.cuda.is_available() else -1 48 | ) 49 | 50 | # ---------------------- 51 | # Advanced Cleaner Class 52 | # ---------------------- 53 | class AdvancedLLMCleaner: 54 | def __init__(self): 55 | # Sensitive pattern database (extended) 56 | self.sensitive_patterns = { 57 | "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 58 | "phone": r'\b(?:\+?86)?1[3-9]\d{9}\b', # Chinese phone 59 | "id_card": r'\b\d{17}[\dXx]\b', # Chinese ID 60 | "credit_card": r'\b(?:\d{4}[-\s]?){3}\d{4}\b' 61 | } 62 | # Harmful keywords (example categories) 63 | self.harmful_keywords = {"violence", "discrimination", "hate", "terrorism"} 64 | 65 | def detect_language_advanced(self, text: str) -> Tuple[str, float]: 66 | """ 67 | Advanced language detection with confidence score 68 | Returns (language code, confidence) 69 | """ 70 | if not text.strip(): 71 | return ("unknown", 0.0) 72 | 73 | try: 74 | if ft_model: 75 | predictions = ft_model.predict(text, k=1) 76 | lang = predictions[0][0].replace("__label__", "") 77 | confidence = predictions[1][0] 78 | return (lang, confidence) 79 | else: 80 | # Fallback to langdetect 81 | lang = detect(text) 82 | return (lang, 0.8) # Assume lower confidence 83 | except (LangDetectException, IndexError): 84 | return ("unknown", 0.0) 85 | 86 | def split_long_text(self, text: str, lang: str = "en", max_tokens: int = 512) -> List[str]: 87 | """ 88 | Split long text into semantic chunks (avoid splitting sentences) 89 | """ 90 | nlp = nlp_en if lang == "en" else nlp_zh if lang == "zh" else None 91 | if not nlp or not text.strip(): 92 | return [text] 93 | 94 | doc = nlp(text) 95 | sentences = [sent.text for sent in doc.sents] 96 | chunks = [] 97 | current_chunk = [] 98 | current_length = 0 99 | 100 | for sent in sentences: 101 | sent_tokens = len(nlp(sent)) 102 | if current_length + sent_tokens <= max_tokens: 103 | current_chunk.append(sent) 104 | current_length += sent_tokens 105 | else: 106 | if current_chunk: 107 | chunks.append(" ".join(current_chunk)) 108 | current_chunk = [sent] 109 | current_length = sent_tokens 110 | 111 | if current_chunk: 112 | chunks.append(" ".join(current_chunk)) 113 | 114 | return chunks 115 | 116 | def filter_semantic_quality(self, texts: List[str], threshold: float = 0.7) -> List[str]: 117 | """ 118 | Filter texts based on semantic quality (using pre-trained classifier) 119 | """ 120 | high_quality = [] 121 | # Batch processing to improve efficiency 122 | for i in range(0, len(texts), 32): 123 | batch = texts[i:i+32] 124 | results = quality_pipeline(batch) 125 | for text, res in zip(batch, results): 126 | if res["label"] == "LABEL_1" and res["score"] >= threshold: 127 | high_quality.append(text) 128 | return high_quality 129 | 130 | def desensitize_text(self, text: str, lang: str = "en") -> str: 131 | """ 132 | Deep desensitization: replace sensitive info with placeholders 133 | """ 134 | # 1. 
Pattern-based replacement 135 | for name, pattern in self.sensitive_patterns.items(): 136 | text = re.sub(pattern, f"[{name}_REDACTED]", text) 137 | 138 | # 2. NER-based replacement (names, addresses, organizations) 139 | nlp = nlp_en if lang == "en" else nlp_zh if lang == "zh" else None 140 | if nlp: 141 | doc = nlp(text) 142 | for ent in doc.ents: 143 | # Redact personal entities (customize based on your needs) 144 | if ent.label_ in ["PERSON", "GPE", "ORG", "DATE"]: # GPE: countries/cities 145 | text = text.replace(ent.text, f"[{ent.label_}_REDACTED]") 146 | 147 | # 3. Harmful content filtering 148 | for keyword in self.harmful_keywords: 149 | if keyword in text.lower(): 150 | return "" # Remove entirely if harmful content is found 151 | return text 152 | 153 | def remove_cross_lang_noise(self, text: str, primary_lang: str = None) -> str: 154 | """ 155 | Remove mixed-language noise (e.g., English words in Chinese text with low info value) 156 | """ 157 | if not primary_lang: 158 | primary_lang, _ = self.detect_language_advanced(text) 159 | if primary_lang == "unknown": 160 | return text 161 | 162 | # For Chinese text: remove English words with low semantic value 163 | if primary_lang == "zh": 164 | # Keep meaningful English terms (e.g., "AI", "GDP") but remove noise 165 | english_words = re.findall(r'[A-Za-z]+', text) 166 | for word in english_words: 167 | if len(word) < 3 and word.lower() not in {"ai", "it", "gdp"}: 168 | text = text.replace(word, "") 169 | return text 170 | 171 | def distributed_clean(self, file_paths: List[str], batch_size: int = 1000) -> None: 172 | """ 173 | Distributed cleaning for large-scale data using Dask 174 | """ 175 | # Create Dask bag from file paths 176 | bag = db.from_sequence(file_paths, npartitions=8) 177 | 178 | # Define processing pipeline 179 | def process_file(file_path): 180 | with open(file_path, "r", encoding="utf-8", errors="ignore") as f: 181 | texts = [line.strip() for line in f if line.strip()] 182 | 183 | cleaned = [] 184 | for text in texts: 185 | # Basic cleaning 186 | text = re.sub(r'\s+', ' ', text).strip() 187 | if len(text) < 20: 188 | continue 189 | 190 | # Language detection 191 | lang, conf = self.detect_language_advanced(text) 192 | if conf < 0.6: 193 | continue 194 | 195 | # Desensitize 196 | text = self.desensitize_text(text, lang) 197 | if not text: 198 | continue 199 | 200 | # Cross-language noise removal 201 | text = self.remove_cross_lang_noise(text, lang) 202 | 203 | # Split long text 204 | chunks = self.split_long_text(text, lang) 205 | cleaned.extend(chunks) 206 | 207 | return cleaned 208 | 209 | # Execute in parallel 210 | cleaned_bag = bag.map(process_file).flatten() 211 | 212 | # Save results (example: write to output directory) 213 | with ProgressBar(): 214 | cleaned_bag.to_textfiles("cleaned_data/output_*.txt") 215 | 216 | print("Distributed cleaning completed.") 217 | 218 | 219 | # ---------------------- 220 | # Usage Example 221 | # ---------------------- 222 | if __name__ == "__main__": 223 | cleaner = AdvancedLLMCleaner() 224 | 225 | # Example 1: Process a single long text 226 | long_text = """ 227 | Dr. John Smith (john.smith@example.com) delivered a speech on AI in Beijing. 228 | He mentioned that 80% of data scientists use Python. His phone number is 13800138000. 
229 | 这是一段包含英文单词的中文文本,其中夹杂着一些 short 英文单词。 230 | """ 231 | lang, _ = cleaner.detect_language_advanced(long_text) 232 | desensitized = cleaner.desensitize_text(long_text, lang) 233 | filtered = cleaner.remove_cross_lang_noise(desensitized, lang) 234 | chunks = cleaner.split_long_text(filtered, lang) 235 | print("Processed chunks:") 236 | for i, chunk in enumerate(chunks): 237 | print(f"Chunk {i+1}: {chunk}") 238 | 239 | # Example 2: Semantic quality filtering 240 | sample_texts = [ 241 | "Good morning! How are you?", # High quality 242 | "Asdflkj qwerpoi 12345...", # Low quality 243 | "The quick brown fox jumps over the lazy dog." # High quality 244 | ] 245 | high_quality = cleaner.filter_semantic_quality(sample_texts) 246 | print("\nHigh quality texts after filtering:", high_quality) 247 | 248 | # Example 3: Distributed cleaning (uncomment to test with your files) 249 | # file_paths = [f"data/raw_{i}.txt" for i in range(10)] # Replace with your file paths 250 | # cleaner.distributed_clean(file_paths) 251 | -------------------------------------------------------------------------------- /es-demo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from elasticsearch import Elasticsearch 4 | from elasticsearch.exceptions import RequestError 5 | import time 6 | import random 7 | import json 8 | from urllib.parse import urlparse, parse_qs 9 | 10 | # Initialize Elasticsearch connection 11 | def init_elasticsearch(host='localhost', port=9200): 12 | """Initialize Elasticsearch connection""" 13 | es = Elasticsearch([f'http://{host}:{port}']) 14 | if es.ping(): 15 | print("Successfully connected to Elasticsearch") 16 | return es 17 | else: 18 | print("Failed to connect to Elasticsearch, please check configuration") 19 | return None 20 | 21 | # Create index and mapping 22 | def create_ecommerce_index(es, index_name='amazon_products'): 23 | """Create Amazon product index and mapping""" 24 | if es.indices.exists(index=index_name): 25 | print(f"Index {index_name} already exists") 26 | return True 27 | 28 | # Define index mapping 29 | mapping = { 30 | "settings": { 31 | "number_of_shards": 1, 32 | "number_of_replicas": 0, 33 | "analysis": { 34 | "analyzer": { 35 | "english_analyzer": { 36 | "type": "standard", 37 | "stopwords": "_english_" 38 | } 39 | } 40 | } 41 | }, 42 | "mappings": { 43 | "properties": { 44 | "product_id": {"type": "keyword"}, 45 | "name": { 46 | "type": "text", 47 | "analyzer": "english_analyzer", 48 | "fields": {"keyword": {"type": "keyword"}} 49 | }, 50 | "price": {"type": "float"}, 51 | "original_price": {"type": "float"}, 52 | "currency": {"type": "keyword"}, 53 | "category": {"type": "keyword"}, 54 | "sub_category": {"type": "keyword"}, 55 | "brand": {"type": "keyword"}, 56 | "rating": {"type": "float"}, 57 | "review_count": {"type": "integer"}, 58 | "description": {"type": "text", "analyzer": "english_analyzer"}, 59 | "features": {"type": "text", "analyzer": "english_analyzer"}, 60 | "specifications": {"type": "text", "analyzer": "english_analyzer"}, 61 | "url": {"type": "keyword"}, 62 | "image_url": {"type": "keyword"}, 63 | "availability": {"type": "keyword"}, 64 | "scraped_at": {"type": "date"} 65 | } 66 | } 67 | } 68 | 69 | try: 70 | es.indices.create(index=index_name, body=mapping) 71 | print(f"Successfully created index {index_name}") 72 | return True 73 | except RequestError as e: 74 | print(f"Failed to create index: {e}") 75 | return False 76 | 77 | # Extract data from Amazon 
product page 78 | def scrape_amazon_product(product_url): 79 | """Scrape product data from Amazon product page""" 80 | # Set request headers to simulate browser behavior 81 | headers = { 82 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 83 | "Accept-Language": "en-US,en;q=0.9", 84 | "Accept-Encoding": "gzip, deflate, br", 85 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 86 | "Connection": "keep-alive" 87 | } 88 | 89 | try: 90 | # Add random delay to avoid anti-scraping measures 91 | time.sleep(random.uniform(2, 5)) 92 | 93 | # Send request 94 | response = requests.get(product_url, headers=headers, timeout=10) 95 | response.raise_for_status() # Raise HTTP errors 96 | 97 | # Parse HTML 98 | soup = BeautifulSoup(response.text, 'html.parser') 99 | 100 | # Extract ASIN (Amazon product ID) 101 | parsed_url = urlparse(product_url) 102 | query_params = parse_qs(parsed_url.query) 103 | asin = query_params.get('asin', [None])[0] 104 | 105 | # If no ASIN in URL, try extracting from page 106 | if not asin: 107 | asin_meta = soup.find('meta', {'name': 'twitter:data1'}) 108 | if asin_meta: 109 | asin = asin_meta.get('content', '').split(':')[-1].strip() 110 | 111 | # Extract product name 112 | product_name = soup.find('span', {'id': 'productTitle'}) 113 | product_name = product_name.get_text(strip=True) if product_name else None 114 | 115 | # Extract price 116 | price = None 117 | original_price = None 118 | currency = '$' 119 | 120 | price_elem = soup.find('span', {'class': 'a-price-whole'}) 121 | if price_elem: 122 | price_str = price_elem.get_text(strip=True).replace(',', '').replace('.', '') 123 | decimal_elem = soup.find('span', {'class': 'a-price-fraction'}) 124 | if decimal_elem: 125 | price_str += '.' 
+ decimal_elem.get_text(strip=True) 126 | price = float(price_str) if price_str else None 127 | 128 | # Extract original price (if discounted) 129 | original_price_elem = soup.find('span', {'class': 'a-price a-text-price'}) 130 | if original_price_elem: 131 | original_price_str = original_price_elem.get_text(strip=True).replace(currency, '').replace(',', '') 132 | original_price = float(original_price_str) if original_price_str else None 133 | 134 | # Extract rating and review count 135 | rating = None 136 | review_count = None 137 | 138 | rating_elem = soup.find('span', {'class': 'a-icon-alt'}) 139 | if rating_elem: 140 | rating_str = rating_elem.get_text(strip=True).split()[0] 141 | rating = float(rating_str) if rating_str else None 142 | 143 | review_count_elem = soup.find('span', {'id': 'acrCustomerReviewText'}) 144 | if review_count_elem: 145 | review_count_str = review_count_elem.get_text(strip=True).split()[0].replace(',', '') 146 | review_count = int(review_count_str) if review_count_str else None 147 | 148 | # Extract brand 149 | brand = None 150 | brand_elem = soup.find('a', {'id': 'bylineInfo'}) 151 | if brand_elem: 152 | brand = brand_elem.get_text(strip=True).replace('Visit the ', '').replace(' Store', '') 153 | 154 | # Extract product description 155 | description = None 156 | description_elem = soup.find('div', {'id': 'productDescription'}) 157 | if description_elem: 158 | description = description_elem.get_text(strip=True) 159 | 160 | # Extract product features 161 | features = [] 162 | features_elems = soup.find_all('li', {'class': 'a-spacing-mini'}) 163 | if features_elems: 164 | features = [f.get_text(strip=True) for f in features_elems[:5]] # Get first 5 features 165 | features_text = ', '.join(features) 166 | 167 | # Extract category information 168 | category = None 169 | sub_category = None 170 | breadcrumbs = soup.find_all('li', {'class': 'a-spacing-none a-list-item'}) 171 | if len(breadcrumbs) >= 2: 172 | category = breadcrumbs[-2].get_text(strip=True) if len(breadcrumbs) > 1 else None 173 | sub_category = breadcrumbs[-1].get_text(strip=True) if breadcrumbs else None 174 | 175 | # Extract image URL 176 | image_url = None 177 | image_elem = soup.find('img', {'id': 'landingImage'}) 178 | if image_elem: 179 | image_url = image_elem.get('src') 180 | 181 | # Extract availability status 182 | availability = None 183 | availability_elem = soup.find('div', {'id': 'availability'}) 184 | if availability_elem: 185 | availability = availability_elem.get_text(strip=True) 186 | 187 | # Build product data dictionary 188 | product_data = { 189 | "product_id": asin, 190 | "name": product_name, 191 | "price": price, 192 | "original_price": original_price, 193 | "currency": currency, 194 | "category": category, 195 | "sub_category": sub_category, 196 | "brand": brand, 197 | "rating": rating, 198 | "review_count": review_count, 199 | "description": description, 200 | "features": features_text, 201 | "url": product_url, 202 | "image_url": image_url, 203 | "availability": availability, 204 | "scraped_at": time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) 205 | } 206 | 207 | return product_data 208 | 209 | except Exception as e: 210 | print(f"Error scraping product: {e}") 211 | return None 212 | 213 | # Import data to Elasticsearch 214 | def import_to_es(es, product_data, index_name='amazon_products'): 215 | """Import single product data to Elasticsearch""" 216 | if not product_data or not product_data.get('product_id'): 217 | print("Invalid product data, skipping import") 218 | return 
False 219 | 220 | try: 221 | # Use product ID as document ID 222 | response = es.index( 223 | index=index_name, 224 | id=product_data['product_id'], 225 | body=product_data 226 | ) 227 | 228 | if response['result'] in ['created', 'updated']: 229 | print(f"Successfully imported product: {product_data.get('name')}") 230 | return True 231 | else: 232 | print(f"Failed to import product: {product_data.get('name')}") 233 | return False 234 | 235 | except Exception as e: 236 | print(f"Error importing to Elasticsearch: {e}") 237 | return False 238 | 239 | # Main function 240 | def main(): 241 | # Initialize Elasticsearch connection 242 | es = init_elasticsearch() 243 | if not es: 244 | return 245 | 246 | # Create index 247 | index_name = 'amazon_products' 248 | create_ecommerce_index(es, index_name) 249 | 250 | # List of Amazon product URLs to scrape 251 | product_urls = [ 252 | # Example product URLs - replace with any Amazon product pages 253 | "https://www.amazon.com/dp/B07VGRJDFY", 254 | "https://www.amazon.com/dp/B08N5WRWNW", 255 | "https://www.amazon.com/dp/B09V3KXJPB" 256 | ] 257 | 258 | # Scrape and import each product 259 | for url in product_urls: 260 | print(f"\nScraping product from: {url}") 261 | product_data = scrape_amazon_product(url) 262 | 263 | if product_data: 264 | print(f"Successfully scraped product: {product_data.get('name')}") 265 | import_to_es(es, product_data, index_name) 266 | else: 267 | print(f"Failed to scrape product from: {url}") 268 | 269 | # Scrape interval 270 | if url != product_urls[-1]: 271 | sleep_time = random.uniform(3, 7) 272 | print(f"Waiting {sleep_time:.2f} seconds before next scrape...") 273 | time.sleep(sleep_time) 274 | 275 | print("\nAll operations completed") 276 | 277 | if __name__ == "__main__": 278 | main() 279 | -------------------------------------------------------------------------------- /image_process.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "69e85293", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Image downloaded successfully.\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import requests\n", 19 | "url = 'https://raw.githubusercontent.com/boringcdn/sd/master/sd-generate-4.webp'\n", 20 | "response = requests.get(url)\n", 21 | "with open('sd-generate-1.webp', 'wb') as file:\n", 22 | " file.write(response.content)\n", 23 | " print(\"Image downloaded successfully.\")\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "f11d0005", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "\n", 37 | "\n", 38 | "AI Image Gallery - Stable Diffusion AI\n", 39 | "\n", 40 | "\n", 41 | "\n", 42 | "\n", 43 | "\n", 44 | "\n", 45 | "\n", 46 | "\n", 47 | "\n", 48 | "\n", 49 | "\n", 50 | "\n", 51 | "\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | "

[Cell output: stripped HTML of the stabledifffusion.com gallery page, showing six AI-generated images with their prompt captions, plus short blurbs for the AI Image Generator, Stable Diffusion Prompts, the Stable Diffusion 3.5 Large / Large Turbo / 3 Medium models, and the Stable Diffusion Web UI]

\n", 65 | "\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "import requests\n", 71 | "\n", 72 | "url = \"https://stabledifffusion.com/gallery\"\n", 73 | "\n", 74 | "payload = {}\n", 75 | "headers = {\n", 76 | " 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n", 77 | " 'cookie': '_ga=GA1.1.258999226.1754806446; _ga_C4QP4FPRFF=GS2.1.s1754806445$o1$g1$t1754807302$j44$l0$h0',\n", 78 | " 'pragma': 'no-cache',\n", 79 | " 'referer': 'https://stabledifffusion.com/',\n", 80 | " 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'\n", 81 | "}\n", 82 | "\n", 83 | "response = requests.request(\"GET\", url, headers=headers, data=payload)\n", 84 | "\n", 85 | "print(response.text)\n", 86 | "\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "0dc57976", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "['https://cdn.jsdelivr.net/gh/boringcdn/sd/sd-generate-1.webp',\n", 99 | " 'https://cdn.jsdelivr.net/gh/boringcdn/sd/sd-generate-2.webp',\n", 100 | " 'https://cdn.jsdelivr.net/gh/boringcdn/sd/sd-generate-3.webp',\n", 101 | " 'https://cdn.jsdelivr.net/gh/boringcdn/sd/sd-generate-4.webp',\n", 102 | " 'https://cdn.jsdelivr.net/gh/boringcdn/sd/sd-generate-5.webp',\n", 103 | " 'https://cdn.jsdelivr.net/gh/boringcdn/sd/sd-generate-6.webp']" 104 | ] 105 | }, 106 | "execution_count": 7, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "from parsel import Selector\n", 113 | "import requests\n", 114 | "\n", 115 | "url = \"https://stabledifffusion.com/gallery\"\n", 116 | "\n", 117 | "payload = {}\n", 118 | "headers = {\n", 119 | " 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n", 120 | " 'cookie': '_ga=GA1.1.258999226.1754806446; _ga_C4QP4FPRFF=GS2.1.s1754806445$o1$g1$t1754807302$j44$l0$h0',\n", 121 | " 'pragma': 'no-cache',\n", 122 | " 'referer': 'https://stabledifffusion.com/',\n", 123 | " 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'\n", 124 | "}\n", 125 | "\n", 126 | "def get_stable_diffusion_images():\n", 127 | " response = requests.request(\"GET\", url, headers=headers, data=payload)\n", 128 | " text = response.text\n", 129 | " resp = Selector(text=text)\n", 130 | " image_urls = resp.xpath('//div[@class=\"grid grid-cols-1 md:grid-cols-3 gap-4\"]/div[@class=\"max-w-sm\"]/img/@src').getall()\n", 131 | " return image_urls\n", 132 | "\n", 133 | "\n", 134 | "\n" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "id": "92b2ee9a", 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "59124" 147 | ] 148 | }, 149 | "execution_count": 5, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "len(text)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "id": "f6b99fec", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "True" 168 | ] 169 | }, 170 | "execution_count": 6, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "'A cat father' in text" 177 | ] 178 | }, 179 | { 180 | "cell_type": 
"code", 181 | "execution_count": null, 182 | "id": "2d23bd36", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "def download_image(image_url, filename):\n", 187 | " response = requests.get(image_url)\n", 188 | " if response.status_code == 200:\n", 189 | " with open(filename, 'wb') as file:\n", 190 | " file.write(response.content)\n", 191 | " print(f\"Image {filename} downloaded successfully.\")\n", 192 | " else:\n", 193 | " print(f\"Failed to download image {filename}. Status code: {response.status_code}\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 8, 199 | "id": "93324ca8", 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "格式: WEBP\n", 207 | "尺寸: (1664, 2304)\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "from PIL import Image\n", 213 | "\n", 214 | "# 打开WebP图片\n", 215 | "with Image.open(\"sd-generate-1.webp\") as img:\n", 216 | " # 显示图片信息\n", 217 | " print(f\"格式: {img.format}\")\n", 218 | " print(f\"尺寸: {img.size}\")\n", 219 | " # 可以进行其他操作,如转换格式\n", 220 | " img.save(\"image.jpg\", \"JPEG\")" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "py11", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.11.5" 241 | } 242 | }, 243 | "nbformat": 4, 244 | "nbformat_minor": 5 245 | } 246 | --------------------------------------------------------------------------------