├── README.md ├── Spider ├── PR_calculation.py ├── downloadlink.py ├── htmonly.py ├── htmonly_pagerank.py ├── mutispider.py └── mutispider_pagerank.py ├── app.py ├── data_clean ├── clean_document.py ├── load_data.py ├── merge_data.py ├── news1_clean_ distinct.py ├── news_clean_distinct.py └── news_clean_frame.py ├── datasets_and_logs └── 2024_11_29_23_45_00_log.txt ├── db_init ├── init_db.py ├── init_db_new.py └── init_user_profiles.py ├── img-folder ├── 19255F29.png ├── image-20241217170449292.png ├── image-20241217172152142.png ├── image-20241217173749975.png ├── image-20241217173936568.png ├── image-20241217174140950.png ├── image-20241217174236515.png ├── image-20241217174713138.png ├── image-20241217175037417.png ├── image-20241217175233504.png ├── image-20241217184806123.png ├── image-20241217184922146.png ├── image-20241217185008968.png ├── image-20241217185208423.png ├── image-20241217185558358.png ├── image-20241217192145331.png ├── image-20241217192259748.png ├── image-20241217192419633.png ├── image-20241217192631920.png ├── image-20241217193833186.png ├── image-20241217193922327.png ├── image-20241217194706713.png ├── image-20241217195003153.png ├── image-20241217195200393.png ├── image-20241217201947701.png ├── image-20241217202328199.png ├── image-20241217204512415.png ├── image-20241217204732261.png ├── image-20241217205153730.png ├── image-20241217205341225.png ├── image-20241217205927444.png ├── image-20241217210224984.png ├── image-20241217210435856.png ├── image-20241217210524271.png ├── image-20241217210643902.png ├── image-20241217210942244.png ├── image-20241217211441465.png ├── image-20241217211645716.png ├── image-20241217211724777.png ├── image-20241217212505606.png ├── image-20241217212805264.png ├── image-20241217220109997.png ├── image-20241217220232016.png ├── image-20241217220410027.png ├── image-20241217220710450.png ├── image-20241217221306764.png ├── image-20241217221434868.png ├── image-20241217221619006.png ├── image-20241217221818883.png ├── image-20241217222258357.png ├── image-20241217231353742.png ├── image-20241217231856471.png ├── image-20241217234338468.png ├── image-20241217234427613.png ├── image-20241217234452315.png ├── image-20241217234622692.png └── readme.md ├── index ├── ES_Index.py ├── creat_index.py ├── creat_index00.py ├── creat_index01.py └── creat_index_document.py ├── search ├── __pycache__ │ ├── manager.cpython-39.pyc │ ├── personalization.cpython-39.pyc │ └── processor.cpython-39.pyc ├── manager.py ├── personalization.py └── processor.py ├── static └── css │ ├── document.css │ ├── main.css │ ├── pagination.css │ ├── results.css │ ├── search.css │ ├── search_history.css │ ├── search_suggestions.css │ └── user.css ├── templates ├── history.html ├── preferences.html ├── profile.html ├── search.html ├── search0.html ├── search00.html └── snapshot.html ├── test_document.py ├── test_html.py ├── test_wildcard.py └── 说明文档.pdf /Spider/PR_calculation.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import networkx as nx 3 | import numpy as np 4 | from scipy import sparse 5 | from numba import jit 6 | from tqdm import tqdm 7 | import pandas as pd 8 | from urllib.parse import urlparse 9 | from concurrent.futures import ThreadPoolExecutor 10 | import multiprocessing 11 | import time 12 | from pymongo.operations import UpdateOne 13 | from pymongo.errors import BulkWriteError 14 | 15 | class OptimizedPageRankCalculator: 16 | def __init__(self, damping_factor=0.85, 
tolerance=1e-6, max_iter=100): 17 | self.damping_factor = damping_factor 18 | self.tolerance = tolerance 19 | self.max_iter = max_iter 20 | 21 | print("初始化数据库连接...") 22 | self.client = MongoClient('mongodb://localhost:27017/') 23 | self.db = self.client['nankai_news_datasets'] 24 | self.collection = self.db['NEWS'] 25 | self.n_jobs = multiprocessing.cpu_count() 26 | print(f"将使用 {self.n_jobs} 个CPU核心进行计算") 27 | 28 | def build_sparse_matrix(self, urls): 29 | """构建优化的稀疏矩阵""" 30 | start_time = time.time() 31 | n = len(urls) 32 | print(f"\n第1步/3: 构建稀疏矩阵 (总计 {n} 个URL)") 33 | 34 | # URL映射 35 | print("创建URL索引映射...") 36 | url_to_idx = {url: idx for idx, url in enumerate(urls)} 37 | 38 | # 并行处理URL 39 | chunk_size = max(1000, n // self.n_jobs) 40 | edges = [] 41 | 42 | print("并行构建边关系...") 43 | with tqdm(total=n) as pbar: 44 | for i in range(0, n, chunk_size): 45 | chunk = urls[i:i + chunk_size] 46 | for url in chunk: 47 | parsed = urlparse(url) 48 | path = parsed.path.split('/') 49 | if len(path) > 2: 50 | base = '/'.join(path[:-1]) 51 | edges.extend([ 52 | (url_to_idx[url], url_to_idx[other]) 53 | for other in urls[i:i + chunk_size] 54 | if other != url and other.startswith(f"{parsed.scheme}://{parsed.netloc}{base}") 55 | ]) 56 | pbar.update(len(chunk)) 57 | 58 | # 构建矩阵 59 | print("构建最终矩阵...") 60 | # 构建稀疏矩阵 61 | if edges: 62 | rows, cols = zip(*edges) 63 | data = np.ones(len(rows)) 64 | matrix = sparse.csr_matrix((data, (rows, cols)), shape=(n, n)) 65 | else: 66 | matrix = sparse.csr_matrix((n, n)) 67 | 68 | # 标准化矩阵,避免孤立节点 69 | out_degrees = np.array(matrix.sum(axis=1)).flatten() 70 | out_degrees[out_degrees == 0] = 1 # 避免除以零 71 | matrix = sparse.diags(1 / out_degrees) @ matrix 72 | 73 | elapsed = time.time() - start_time 74 | print(f"矩阵构建完成! 用时: {elapsed:.2f}秒") 75 | return matrix, url_to_idx 76 | 77 | @staticmethod 78 | @jit(nopython=True) 79 | def _power_iteration(matrix_data, matrix_indices, matrix_indptr, damping, n, max_iter, tolerance): 80 | """使用numba加速的幂迭代""" 81 | scores = np.full(n, 1.0 / n) # 初始化为均匀分布 82 | teleport = (1 - damping) / n 83 | 84 | for iter_num in range(max_iter): 85 | prev_scores = scores.copy() 86 | new_scores = np.zeros(n) 87 | 88 | for i in range(n): 89 | for j in range(matrix_indptr[i], matrix_indptr[i + 1]): 90 | col = matrix_indices[j] 91 | val = matrix_data[j] 92 | new_scores[col] += val * prev_scores[i] 93 | 94 | scores = teleport + damping * new_scores 95 | diff = np.abs(scores - prev_scores).sum() 96 | 97 | if diff < tolerance: 98 | break 99 | 100 | return scores, iter_num + 1 101 | 102 | def calculate_pagerank(self): 103 | """计算PageRank""" 104 | # 获取所有URL 105 | print("\n开始PageRank计算...") 106 | start_time = time.time() 107 | 108 | print("第2步/3: 从数据库加载URL...") 109 | urls = [doc['url'] for doc in self.collection.find({}, {'url': 1, '_id': 0})] 110 | n = len(urls) 111 | print(f"加载完成,共 {n} 个URL") 112 | 113 | # 构建矩阵 114 | matrix, url_to_idx = self.build_sparse_matrix(urls) 115 | 116 | # 计算PageRank 117 | print("\n第3步/3: 迭代计算PageRank...") 118 | scores, iterations = self._power_iteration( 119 | matrix.data, matrix.indices, matrix.indptr, 120 | self.damping_factor, len(urls), self.max_iter, self.tolerance 121 | ) 122 | 123 | # 构建结果 124 | print("整理计算结果...") 125 | idx_to_url = {v: k for k, v in url_to_idx.items()} 126 | df = pd.DataFrame({ 127 | 'url': [idx_to_url[i] for i in range(len(scores))], 128 | 'pagerank': scores 129 | }) 130 | df = df.sort_values('pagerank', ascending=False) 131 | 132 | total_time = time.time() - start_time 133 | print(f"\n计算完成! 
总用时: {total_time:.2f}秒") 134 | print(f"迭代次数: {iterations}") 135 | return df 136 | 137 | def preview_results(self, df): 138 | """预览结果""" 139 | print("\n结果预览:") 140 | print("\n最高PageRank值的5个页面:") 141 | for _, row in df.head().iterrows(): 142 | print(f"PageRank: {row['pagerank']:.6e} | URL: {row['url']}") 143 | 144 | print("\n最低PageRank值的5个页面:") 145 | for _, row in df.tail().iterrows(): 146 | print(f"PageRank: {row['pagerank']:.6e} | URL: {row['url']}") 147 | 148 | stats = df['pagerank'].describe() 149 | print(f"\n统计信息:") 150 | print(f"平均值: {stats['mean']:.6e}") 151 | print(f"标准差: {stats['std']:.6e}") 152 | print(f"最小值: {stats['min']:.6e}") 153 | print(f"最大值: {stats['max']:.6e}") 154 | 155 | return input("\n要更新数据库吗?(yes/no): ").lower().strip() == 'yes' 156 | 157 | def update_mongodb(self, df): 158 | """更新数据库""" 159 | print("\n开始更新数据库...") 160 | batch_size = 1000 161 | total = len(df) 162 | updated = 0 163 | 164 | with tqdm(total=total, desc="更新进度") as pbar: 165 | for i in range(0, total, batch_size): 166 | batch = df.iloc[i:i + batch_size] 167 | operations = [] 168 | 169 | for _, row in batch.iterrows(): 170 | operations.append( 171 | UpdateOne( 172 | {'url': row['url']}, 173 | {'$set': {'pagerank': float(row['pagerank'])}}, 174 | upsert=False 175 | ) 176 | ) 177 | 178 | if operations: 179 | try: 180 | result = self.collection.bulk_write(operations) 181 | updated += result.modified_count 182 | except BulkWriteError as bwe: 183 | print(f"批量写入错误: {bwe.details}") 184 | raise 185 | pbar.update(len(batch)) 186 | 187 | print(f"更新完成! 更新了 {updated} 条记录") 188 | 189 | def run(self): 190 | """运行主流程""" 191 | try: 192 | df = self.calculate_pagerank() 193 | if self.preview_results(df): 194 | self.update_mongodb(df) 195 | print("\n所有操作已完成!") 196 | else: 197 | print("\n操作已取消,数据库未更新") 198 | except Exception as e: 199 | print(f"发生错误: {str(e)}") 200 | 201 | 202 | if __name__ == "__main__": 203 | calculator = OptimizedPageRankCalculator() 204 | calculator.run() -------------------------------------------------------------------------------- /Spider/downloadlink.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import time 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from datetime import datetime 8 | import logging 9 | 10 | # 配置日志 11 | log_filename = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_log.txt" # 日志文件名 12 | logging.basicConfig( 13 | level=logging.INFO, # 设置日志级别为INFO,输出INFO及以上级别的日志 14 | format="%(asctime)s - %(levelname)s - %(message)s", # 日志格式 15 | handlers=[ 16 | logging.StreamHandler(), # 输出到控制台 17 | logging.FileHandler(log_filename, mode="w", encoding="utf-8") # 输出到日志文件 18 | ] 19 | ) 20 | 21 | # 设置头信息,防止反爬虫 22 | headers_parameters = { 23 | 'Connection': 'Keep-Alive', 24 | 'Accept': 'text/html', 25 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 26 | 'Accept-Encoding': 'gzip, deflate', 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 28 | } 29 | 30 | # 下载文档后缀列表 31 | download_suffix_list = [ 32 | "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", # 常见文档格式 33 | "mp3", "mp4", "avi", "mkv", "mov", "wmv", "flv", # 音频和视频格式 34 | "zip", "rar", "tar", "gz", "bz2", "7z", # 压缩文件格式 35 | "jpg", "jpeg", "png", "gif", "bmp", "tiff", # 图片格式 36 | "exe", "apk", "dmg", # 可执行文件和应用程序 37 | "csv", "txt", "rtf", # 文本文件 38 | "xls", "xlsx", # 表格文件 39 | ] 40 | 41 | # 获取网页内容 42 | def get_html(url): 43 | print(url) 44 | 
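    # Any request failure below returns an empty string, so the caller ends up
    # parsing an empty page; allow_redirects=False also leaves 3xx responses
    # unfollowed, and their bodies are returned as-is.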
try: 45 | response = requests.get(url, timeout=crawl_timeout, headers=headers_parameters, allow_redirects=False) 46 | response.encoding = response.apparent_encoding 47 | except Exception as e: 48 | print(e) 49 | return "" 50 | return response.text 51 | 52 | # 获取网页中的所有链接 53 | def get_expand_urls(bs, url,download_id_counter): 54 | urls_expand = [] 55 | for item in bs.find_all("a"): # 当前网页html的所有a标签 56 | href = item.get("href") 57 | if href is None: 58 | continue 59 | href = str(href) 60 | index = href.find("#") # 去除#跳转 61 | if index != -1: 62 | href = href[:index] 63 | if href.find("javascript") != -1 or href.find("download") != -1: 64 | continue 65 | if len(href) < 1 or href == '/': 66 | continue 67 | if href.find("http") == -1: 68 | if href[0] != '/': 69 | href = '/' + href 70 | else: 71 | if href[0] == '.' and href[1] == '/': 72 | href = href[1:] 73 | if url[-1] == '/': # 去除url尾部的'/'(如果有) 74 | url = url[:-1] 75 | href = url + href 76 | else: # 对于绝对地址,直接添加 77 | index_of_end_of_domain = href.find('/', href.find("//") + 2) 78 | index_of_nankai_str = href.find("nankai") 79 | if index_of_nankai_str == -1 or index_of_nankai_str > index_of_end_of_domain: 80 | continue 81 | if href.find("less.nankai.edu.cn/public") != -1 or href.find("weekly.nankai.edu.cn/oldrelease.php") != -1: 82 | continue 83 | # 如果是下载链接 84 | index_suffix = href.rfind(".") 85 | if href[index_suffix + 1:] in download_suffix_list: # 如果是下载地址 86 | # 可能从标签获取标题或者描述 87 | file_title = item.get_text().strip() # 链接文本作为标题 88 | if not file_title: 89 | file_title = "Unknown Title" # 如果没有链接文本,设为默认标题 90 | # # 打印下载链接信息,包括序号 91 | # download_id = len(urls_taken) + 1 # 为每个链接分配一个唯一的序号 92 | # 打印下载链接信息,包括序号 93 | download_id = download_id_counter[0] # 获取当前的下载 ID 94 | download_id_counter[0] += 1 # 更新 ID 计数器 95 | logging.info(f"[{download_id}]Download link found: {href}, Title: {file_title}") 96 | # 获取文件类型 97 | file_type = href.split('.')[-1] if '.' 
in href else 'unknown' 98 | 99 | # 保存下载链接信息 100 | download_info = { 101 | "url": href, 102 | "title": file_title, 103 | "file_type": file_type, 104 | "file_name": href.split("/")[-1], 105 | "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") 106 | } 107 | # with open(os.path.join(dirname, f"download_{download_id}.json"), 'w', encoding="utf-8") as file: 108 | # json.dump(download_info, file, ensure_ascii=False) 109 | # continue 110 | 111 | # 保存每个下载链接为单独的JSON文件 112 | json_file_name = f"download_{download_id}.json" 113 | with open(os.path.join(dirname, json_file_name), 'w', encoding="utf-8") as file: 114 | json.dump(download_info, file, ensure_ascii=False) 115 | 116 | # 这里不用继续执行,也没有必要保存其他链接的信息 117 | continue # 一旦保存该链接的json,继续检查下一个链接 118 | 119 | urls_expand.append(href) 120 | # 如果没有扩展链接,返回空列表而不是 None 121 | return urls_expand if urls_expand else [] 122 | 123 | # 保存下载链接到文件 124 | def save_download_links(download_links): 125 | filename = "download_links.json" 126 | if os.path.exists(filename): 127 | with open(filename, 'r', encoding="utf-8") as file: 128 | all_links = json.load(file) 129 | else: 130 | all_links = [] 131 | 132 | all_links.extend(download_links) 133 | 134 | with open(filename, 'w', encoding="utf-8") as file: 135 | json.dump(all_links, file, ensure_ascii=False, indent=4) 136 | logging.info(f"Saved {len(download_links)} download links.") 137 | 138 | # 迭代爬虫 139 | def crawl_loop(i, url_count, download_link_count, urls_target, urls_taken,download_id_counter, max_crawl_count): 140 | # 如果已经达到最大深度、迭代次数,或者达到了最大爬取数量,停止爬虫 141 | if i == 0: 142 | logging.info("Crawl finished!") 143 | logging.info(f"Total URLs crawled: {url_count}") 144 | logging.info(f"Total download links found: {download_link_count}") 145 | return 146 | 147 | urls_expand = [] 148 | download_links = [] 149 | 150 | for url in urls_target: 151 | html = get_html(url) 152 | bs = BeautifulSoup(html, "html.parser") 153 | for url_expand in get_expand_urls(bs, url,download_id_counter): 154 | if url_expand not in urls_taken: 155 | html_expand = get_html(url_expand) 156 | bs_expand = BeautifulSoup(html_expand, "html.parser") 157 | url_count += 1 158 | new_links = get_expand_urls(bs_expand, url_expand,download_id_counter) 159 | if new_links is None: 160 | continue # 如果返回 None,则跳过当前循环 161 | 162 | download_links.extend(new_links) 163 | download_link_count += len(new_links) 164 | urls_expand.append(url_expand) 165 | # 添加到已爬取集合中 166 | for new_url in new_links: 167 | urls_taken.add(new_url) 168 | #logging.info(f"Total crawled pages: {url_count} - Total download links: {download_link_count}") 169 | if url_count >= max_crawl_count: # 如果达到最大爬取数量,跳出外层循环 170 | break 171 | 172 | # 保存下载链接 173 | save_download_links(download_links) 174 | # 递归调用 crawl_loop,继续爬取 175 | return crawl_loop(i - 1, url_count, download_link_count, urls_expand, urls_taken,download_id_counter, max_crawl_count) 176 | 177 | # 爬虫设置和初始化 178 | download_id_counter = [1] # 初始化下载 ID 计数器 179 | dirname = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") # 目录名称 180 | os.mkdir(dirname) 181 | crawl_timeout = 1 # 爬虫连接超时时间 182 | crawl_iteration_times = 8 # 爬虫迭代次数 183 | html_index = 0 # 网页索引 184 | url_count = 0 # 总爬取网页数量 185 | urls_target = [] # 爬虫目标网址 186 | #urls_taken = [] # 已访问的网址 187 | urls_taken = set() # 使用集合来避免重复 188 | urls_invalid = [] # 无效的网址 189 | max_crawl_count = 30000 # 设定最大爬取数量 190 | # 从目标网址文件加载目标网址 191 | with open("default_urls_download.json") as file: 192 | urls_target = json.load(file) 193 | 194 | # 执行爬虫 195 | crawl_loop(crawl_iteration_times, url_count, 0, urls_target, 
urls_taken,download_id_counter, max_crawl_count) -------------------------------------------------------------------------------- /Spider/htmonly.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import time 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from datetime import datetime 8 | import logging 9 | 10 | # 配置日志 11 | log_filename = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_log.txt" # 日志文件名 12 | logging.basicConfig( 13 | level=logging.INFO, # 设置日志级别为INFO,输出INFO及以上级别的日志 14 | format="%(asctime)s - %(levelname)s - %(message)s", # 日志格式 15 | handlers=[ 16 | logging.StreamHandler(), # 输出到控制台 17 | logging.FileHandler(log_filename, mode="w", encoding="utf-8") # 输出到日志文件 18 | ] 19 | ) 20 | 21 | # 设置头信息,防止反爬虫 22 | headers_parameters = { 23 | 'Connection': 'Keep-Alive', 24 | 'Accept': 'text/html', 25 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 26 | 'Accept-Encoding': 'gzip, deflate', 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 28 | } 29 | 30 | # 下载文档后缀列表 31 | download_suffix_list = [ 32 | "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", # 常见文档格式 33 | "mp3", "mp4", "avi", "mkv", "mov", "wmv", "flv", # 音频和视频格式 34 | "zip", "rar", "tar", "gz", "bz2", "7z", # 压缩文件格式 35 | "jpg", "jpeg", "png", "gif", "bmp", "tiff", # 图片格式 36 | "exe", "apk", "dmg", # 可执行文件和应用程序 37 | "csv", "txt", "rtf", # 文本文件 38 | "xls", "xlsx", # 表格文件 39 | ] 40 | 41 | # 获取网页内容 42 | def get_html(url): 43 | print(url) 44 | try: 45 | response = requests.get(url, timeout=crawl_timeout, headers=headers_parameters, allow_redirects=False) 46 | response.encoding = response.apparent_encoding 47 | except Exception as e: 48 | print(e) 49 | return "" 50 | return response.text 51 | 52 | # 获取网页中的所有链接 53 | def get_expand_urls(bs, url): 54 | urls_expand = [] 55 | for item in bs.find_all("a"): # 当前网页html的所有a标签 56 | href = item.get("href") 57 | if href is None: 58 | continue 59 | href = str(href) 60 | index = href.find("#") # 去除#跳转 61 | if index != -1: 62 | href = href[:index] 63 | if href.find("javascript") != -1 or href.find("download") != -1: 64 | continue 65 | if len(href) < 1 or href == '/': 66 | continue 67 | if href.find("http") == -1: 68 | if href[0] != '/': 69 | href = '/' + href 70 | else: 71 | if href[0] == '.' 
and href[1] == '/': 72 | href = href[1:] 73 | if url[-1] == '/': # 去除url尾部的'/'(如果有) 74 | url = url[:-1] 75 | href = url + href 76 | else: # 对于绝对地址,直接添加 77 | index_of_end_of_domain = href.find('/', href.find("//") + 2) 78 | index_of_nankai_str = href.find("nankai") 79 | if index_of_nankai_str == -1 or index_of_nankai_str > index_of_end_of_domain: 80 | continue 81 | if href.find("less.nankai.edu.cn/public") != -1 or href.find("weekly.nankai.edu.cn/oldrelease.php") != -1: 82 | continue 83 | 84 | index_suffix = href.rfind(".") 85 | if href[index_suffix + 1:] in download_suffix_list: # 如果是下载地址 86 | logging.info("Download link found: " + href) 87 | continue 88 | 89 | urls_expand.append(href) 90 | return urls_expand 91 | 92 | # 打印和保存网页数据 93 | def print_json_data(json_data,html_index): 94 | logging.info(f"Page {html_index}:") 95 | logging.info("url: " + json_data["url"]) 96 | logging.info("title: " + json_data["title"]) 97 | content = json_data["content"] 98 | content = str(content).replace('\n', '') 99 | content = str(content).replace('\t', '') 100 | if len(content) > 100: 101 | logging.info("content: " + content[0:99] + "...") 102 | else: 103 | logging.info("content: " + content) 104 | 105 | # 保存网页内容到文件 106 | def content_handler(bs, url, index): 107 | title = "" 108 | content = "" 109 | for item in bs.findAll(): 110 | if item.name == "script" or item.name == "style": 111 | continue 112 | content += item.get_text() 113 | content = re.sub("\n\n", "", content) 114 | content = content.replace('\n', '') 115 | content = content.replace('\t', '') 116 | if bs.title is not None: 117 | title = bs.title.get_text() 118 | if title == "" or title is None or title.find("301") != -1 or title.find("302") != -1 or title.find("404") != -1: 119 | logging.info(f"Skipping page {index} (title: {title})") # 打印跳过的页面信息 120 | return False 121 | 122 | else: 123 | json_data = {"url": url, 124 | "title": title, 125 | "content": content, 126 | "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")} 127 | print_json_data(json_data,index) 128 | with open(os.path.join(dirname, str(index) + ".json"), 'w', encoding="utf-8") as file: 129 | json.dump(json_data, file, ensure_ascii=False) 130 | file.close() 131 | return True 132 | 133 | # 迭代爬虫 134 | def crawl_loop(i, url_count, html_index, urls_target, urls_taken): 135 | if i == 0: 136 | logging.info("Crawl finished!") 137 | logging.info(f"Total URLs crawled: {url_count}") 138 | logging.info(f"Total valid URLs: {html_index}") 139 | return 140 | urls_expand = [] 141 | for url in urls_target: 142 | html = get_html(url) 143 | bs = BeautifulSoup(html, "html.parser") 144 | for url_expand in get_expand_urls(bs, url): 145 | if url_expand not in urls_taken: 146 | html_expand = get_html(url_expand) 147 | bs_expand = BeautifulSoup(html_expand, "html.parser") 148 | url_count += 1 149 | if not content_handler(bs_expand, url_expand, html_index): 150 | continue 151 | html_index += 1 152 | urls_expand.append(url_expand) 153 | # urls_taken.append(url_expand)#对应列表方法 154 | urls_taken.add(url_expand) # 修改为 set 的 add 方法 155 | logging.info(f"Total crawled pages: {url_count} - Current page index: {html_index}") # 输出当前的爬取数量和页面索引 156 | return crawl_loop(i - 1, url_count, html_index, urls_expand, urls_taken) 157 | 158 | # 爬虫设置和初始化 159 | dirname = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") # 目录名称 160 | os.mkdir(dirname) 161 | crawl_timeout = 1 # 爬虫连接超时时间 162 | crawl_iteration_times = 6 # 爬虫迭代次数 163 | html_index = 0 # 网页索引 164 | url_count = 0 # 总爬取网页数量 165 | urls_target = [] # 爬虫目标网址 166 | #urls_taken = 
[] # 已访问的网址 167 | urls_taken = set() # 使用集合来避免重复 168 | urls_invalid = [] # 无效的网址 169 | 170 | # 从目标网址文件加载目标网址 171 | with open("../datasets_and_logs/default_urls.json") as file: 172 | urls_target = json.load(file) 173 | 174 | # 执行爬虫 175 | crawl_loop(crawl_iteration_times, url_count, html_index, urls_target, urls_taken) -------------------------------------------------------------------------------- /Spider/htmonly_pagerank.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import time 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from datetime import datetime 8 | import logging 9 | import networkx as nx 10 | 11 | 12 | class PageRankHandler: 13 | def __init__(self): 14 | self.link_graph = nx.DiGraph() # 使用有向图存储链接关系 15 | 16 | def add_links(self, from_url, to_urls): 17 | """添加链接关系到图中""" 18 | for to_url in to_urls: 19 | self.link_graph.add_edge(from_url, to_url) 20 | 21 | def calculate_pagerank(self, alpha=0.85): 22 | """计算PageRank值""" 23 | return nx.pagerank(self.link_graph, alpha=alpha) 24 | 25 | def save_pagerank(self, pagerank_scores, dirname): 26 | """保存PageRank结果""" 27 | with open(os.path.join(dirname, "pagerank.json"), 'w', encoding="utf-8") as f: 28 | json.dump(pagerank_scores, f, ensure_ascii=False) 29 | 30 | def get_top_pages(self, pagerank_scores, n=10): 31 | """获取PageRank值最高的n个页面""" 32 | sorted_pages = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True) 33 | return sorted_pages[:n] 34 | 35 | 36 | # 配置日志 37 | log_filename = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_log.txt" 38 | logging.basicConfig( 39 | level=logging.INFO, 40 | format="%(asctime)s - %(levelname)s - %(message)s", 41 | handlers=[ 42 | logging.StreamHandler(), 43 | logging.FileHandler(log_filename, mode="w", encoding="utf-8") 44 | ] 45 | ) 46 | 47 | # 初始化PageRank处理器 48 | pagerank_handler = PageRankHandler() 49 | 50 | # 设置头信息 51 | headers_parameters = { 52 | 'Connection': 'Keep-Alive', 53 | 'Accept': 'text/html', 54 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 55 | 'Accept-Encoding': 'gzip, deflate', 56 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 57 | } 58 | 59 | # 下载文档后缀列表 60 | download_suffix_list = [ 61 | "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", 62 | "mp3", "mp4", "avi", "mkv", "mov", "wmv", "flv", 63 | "zip", "rar", "tar", "gz", "bz2", "7z", 64 | "jpg", "jpeg", "png", "gif", "bmp", "tiff", 65 | "exe", "apk", "dmg", 66 | "csv", "txt", "rtf", 67 | "xls", "xlsx", 68 | ] 69 | 70 | 71 | def get_html(url): 72 | try: 73 | response = requests.get(url, timeout=crawl_timeout, headers=headers_parameters, allow_redirects=False) 74 | response.encoding = response.apparent_encoding 75 | except Exception as e: 76 | logging.error(f"Error fetching {url}: {e}") 77 | return "" 78 | return response.text 79 | 80 | 81 | def get_expand_urls(bs, url): 82 | urls_expand = [] 83 | for item in bs.find_all("a"): 84 | href = item.get("href") 85 | if href is None: 86 | continue 87 | href = str(href) 88 | 89 | # 链接清理和过滤逻辑 90 | index = href.find("#") 91 | if index != -1: 92 | href = href[:index] 93 | if href.find("javascript") != -1 or href.find("download") != -1: 94 | continue 95 | if len(href) < 1 or href == '/': 96 | continue 97 | 98 | # 处理相对链接 99 | if href.find("http") == -1: 100 | if href[0] != '/': 101 | href = '/' + href 102 | elif href[0] == '.' 
and href[1] == '/': 103 | href = href[1:] 104 | if url[-1] == '/': 105 | url = url[:-1] 106 | href = url + href 107 | else: 108 | # 过滤非南开域名链接 109 | index_of_end_of_domain = href.find('/', href.find("//") + 2) 110 | index_of_nankai_str = href.find("nankai") 111 | if index_of_nankai_str == -1 or index_of_nankai_str > index_of_end_of_domain: 112 | continue 113 | 114 | # 过滤特定URL 115 | if href.find("less.nankai.edu.cn/public") != -1 or href.find("weekly.nankai.edu.cn/oldrelease.php") != -1: 116 | continue 117 | 118 | # 过滤下载链接 119 | index_suffix = href.rfind(".") 120 | if href[index_suffix + 1:] in download_suffix_list: 121 | logging.info(f"Download link found: {href}") 122 | continue 123 | 124 | urls_expand.append(href) 125 | 126 | # 添加链接关系到PageRank处理器 127 | pagerank_handler.add_links(url, urls_expand) 128 | return urls_expand 129 | 130 | 131 | def print_json_data(json_data, html_index): 132 | logging.info(f"Page {html_index}:") 133 | logging.info(f"url: {json_data['url']}") 134 | logging.info(f"title: {json_data['title']}") 135 | content = json_data["content"] 136 | content = str(content).replace('\n', '').replace('\t', '') 137 | logging.info(f"content: {content[:100]}..." if len(content) > 100 else f"content: {content}") 138 | 139 | 140 | def content_handler(bs, url, index): 141 | title = "" 142 | content = "" 143 | 144 | for item in bs.findAll(): 145 | if item.name in ["script", "style"]: 146 | continue 147 | content += item.get_text() 148 | 149 | content = re.sub("\n\n", "", content) 150 | content = content.replace('\n', '').replace('\t', '') 151 | 152 | if bs.title: 153 | title = bs.title.get_text() 154 | 155 | if not title or any(str(code) in title for code in ["301", "302", "404"]): 156 | logging.info(f"Skipping page {index} (title: {title})") 157 | return False 158 | 159 | json_data = { 160 | "url": url, 161 | "title": title, 162 | "content": content, 163 | "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") 164 | } 165 | 166 | print_json_data(json_data, index) 167 | with open(os.path.join(dirname, f"{index}.json"), 'w', encoding="utf-8") as file: 168 | json.dump(json_data, file, ensure_ascii=False) 169 | return True 170 | 171 | 172 | def crawl_loop(i, url_count, html_index, urls_target, urls_taken): 173 | if i == 0: 174 | logging.info("Crawl finished!") 175 | logging.info(f"Total URLs crawled: {url_count}") 176 | logging.info(f"Total valid URLs: {html_index}") 177 | 178 | # 计算并保存PageRank值 179 | logging.info("Calculating PageRank...") 180 | pagerank_scores = pagerank_handler.calculate_pagerank() 181 | pagerank_handler.save_pagerank(pagerank_scores, dirname) 182 | 183 | # 输出排名靠前的页面 184 | top_pages = pagerank_handler.get_top_pages(pagerank_scores) 185 | logging.info("\nTop 10 pages by PageRank:") 186 | for url, score in top_pages: 187 | logging.info(f"URL: {url}, PageRank: {score:.6f}") 188 | 189 | logging.info("PageRank calculation completed") 190 | return 191 | 192 | urls_expand = [] 193 | for url in urls_target: 194 | html = get_html(url) 195 | bs = BeautifulSoup(html, "html.parser") 196 | for url_expand in get_expand_urls(bs, url): 197 | if url_expand not in urls_taken: 198 | html_expand = get_html(url_expand) 199 | bs_expand = BeautifulSoup(html_expand, "html.parser") 200 | url_count += 1 201 | if not content_handler(bs_expand, url_expand, html_index): 202 | continue 203 | html_index += 1 204 | urls_expand.append(url_expand) 205 | urls_taken.add(url_expand) 206 | logging.info(f"Total crawled pages: {url_count} - Current page index: {html_index}") 207 | 208 | return 
crawl_loop(i - 1, url_count, html_index, urls_expand, urls_taken) 209 | 210 | 211 | # 爬虫设置和初始化 212 | dirname = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") 213 | os.mkdir(dirname) 214 | crawl_timeout = 1 215 | crawl_iteration_times = 6 216 | html_index = 0 217 | url_count = 0 218 | urls_target = [] 219 | urls_taken = set() 220 | 221 | # 从文件加载目标网址 222 | with open("../datasets_and_logs/default_urls.json") as file: 223 | urls_target = json.load(file) 224 | 225 | # 执行爬虫 226 | crawl_loop(crawl_iteration_times, url_count, html_index, urls_target, urls_taken) -------------------------------------------------------------------------------- /Spider/mutispider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | from datetime import datetime 5 | import re 6 | import time 7 | import random 8 | from concurrent.futures import ThreadPoolExecutor 9 | import logging 10 | from pymongo import MongoClient 11 | from pymongo.errors import DuplicateKeyError 12 | import hashlib 13 | import os 14 | import mimetypes 15 | import gridfs 16 | 17 | 18 | class NewsScraperNankai: 19 | def __init__(self): 20 | self.base_url = "http://news.nankai.edu.cn" 21 | self.first_page = "http://news.nankai.edu.cn/dcxy/index.shtml" 22 | self.page_template = "https://news.nankai.edu.cn/dcxy/system/count//0005000/000000000000/000/000/c0005000000000000000_000000{:03d}.shtml" 23 | self.max_pages = 524 24 | 25 | # MongoDB连接设置 26 | self.mongo_client = MongoClient('mongodb://localhost:27017/') 27 | self.db = self.mongo_client['nankai_news_datasets'] 28 | self.news_collection = self.db['NEWS'] 29 | self.snapshot_collection = self.db['WEB_snapshot'] 30 | self.fs = gridfs.GridFS(self.db) # 用于存储附件 31 | 32 | # 创建索引 33 | self.news_collection.create_index([('url', 1)], unique=True) 34 | self.snapshot_collection.create_index([('url', 1), ('captured_at', -1)]) 35 | 36 | # 支持的附件类型 37 | self.supported_attachments = [ 38 | ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", # 常见文档格式 39 | ".mp3", ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", # 音频和视频格式 40 | ".zip", ".rar", ".tar", ".gz", ".bz2", ".7z", # 压缩文件格式 41 | ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", # 图片格式 42 | ".exe", ".apk", ".dmg", # 可执行文件和应用程序 43 | ".csv", ".txt", ".rtf", # 文本文件 44 | ".xls", ".xlsx", # 表格文件 45 | ] 46 | 47 | logging.basicConfig( 48 | level=logging.INFO, 49 | format='%(asctime)s - %(levelname)s - %(message)s', 50 | handlers=[ 51 | logging.FileHandler('../scraper.log', encoding='utf-8'), 52 | logging.StreamHandler() 53 | ] 54 | ) 55 | 56 | self.headers = { 57 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 58 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 59 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 60 | 'Connection': 'keep-alive' 61 | } 62 | def get_page_urls(self): 63 | """生成所有页面的URL""" 64 | urls = [self.first_page] # 第一页 65 | # 添加后续页面 66 | urls.extend(self.page_template.format(i) for i in range(1, self.max_pages + 1)) 67 | return urls 68 | def get_soup(self, url, retries=3): 69 | """获取页面的BeautifulSoup对象和原始HTML内容""" 70 | for i in range(retries): 71 | try: 72 | time.sleep(random.uniform(1, 3)) 73 | response = requests.get(url, headers=self.headers, timeout=10) 74 | response.encoding = 'utf-8' 75 | 76 | if response.status_code == 200: 77 | html_content = response.text 78 | return BeautifulSoup(html_content, 
'html.parser'), html_content 79 | else: 80 | logging.warning(f"Failed to fetch {url}, status code: {response.status_code}") 81 | 82 | except Exception as e: 83 | logging.error(f"Attempt {i + 1} failed for {url}: {str(e)}") 84 | if i == retries - 1: 85 | logging.error(f"All attempts failed for {url}") 86 | return None, None 87 | time.sleep(random.uniform(2, 5)) 88 | return None, None 89 | 90 | def save_snapshot(self, url, html_content): 91 | """保存网页快照""" 92 | try: 93 | snapshot_data = { 94 | 'url': url, 95 | 'html_content': html_content, 96 | 'captured_at': datetime.now(), 97 | 'content_hash': hashlib.md5(html_content.encode('utf-8')).hexdigest() 98 | } 99 | self.snapshot_collection.insert_one(snapshot_data) 100 | return snapshot_data['content_hash'] 101 | except Exception as e: 102 | logging.error(f"Error saving snapshot for {url}: {str(e)}") 103 | return None 104 | 105 | def find_attachments(self, soup, base_url): 106 | """查找页面中的附件链接""" 107 | attachments = [] 108 | for link in soup.find_all('a', href=True): 109 | href = link['href'].lower() 110 | if any(ext in href for ext in self.supported_attachments): 111 | full_url = self.base_url + href if href.startswith('/') else href 112 | attachments.append({ 113 | 'url': full_url, 114 | 'filename': os.path.basename(href), 115 | 'title': link.text.strip() 116 | }) 117 | return attachments 118 | 119 | def save_attachment(self, attachment_info): 120 | """保存附件到GridFS""" 121 | try: 122 | response = requests.get(attachment_info['url'], headers=self.headers, timeout=30) 123 | if response.status_code == 200: 124 | file_id = self.fs.put( 125 | response.content, 126 | filename=attachment_info['filename'], 127 | url=attachment_info['url'], 128 | title=attachment_info['title'], 129 | upload_date=datetime.now() 130 | ) 131 | return file_id 132 | except Exception as e: 133 | logging.error(f"Error saving attachment {attachment_info['url']}: {str(e)}") 134 | return None 135 | 136 | def parse_news_list_page(self, url): 137 | """解析新闻列表页面""" 138 | soup, html_content = self.get_soup(url) 139 | if not soup: 140 | return [] 141 | 142 | # 保存列表页快照 143 | snapshot_hash = self.save_snapshot(url, html_content) 144 | 145 | news_items = [] 146 | tables = soup.find_all('table', attrs={'width': "98%", 'border': "0", 'cellpadding': "0", 'cellspacing': "0"}) 147 | 148 | for table in tables: 149 | try: 150 | title_link = table.find('a') 151 | if not title_link: 152 | continue 153 | 154 | title = title_link.text.strip() 155 | news_url = self.base_url + title_link['href'] if title_link['href'].startswith('/') else title_link[ 156 | 'href'] 157 | date_td = table.find('td', align="right") 158 | date = date_td.text.strip() if date_td else None 159 | 160 | logging.info(f"Processing: {title}") 161 | 162 | # 获取新闻详细内容和快照 163 | article_content, article_snapshot_hash, article_attachments = self.parse_news_detail(news_url) 164 | 165 | news_item = { 166 | 'title': title, 167 | 'url': news_url, 168 | 'date': date, 169 | 'source': article_content.get('source', ''), 170 | 'content': article_content.get('content', ''), 171 | 'snapshot_hash': article_snapshot_hash, 172 | 'attachments': article_attachments 173 | } 174 | 175 | news_items.append(news_item) 176 | 177 | except Exception as e: 178 | logging.error(f"Error parsing news item: {str(e)}") 179 | continue 180 | 181 | return news_items 182 | 183 | def parse_news_detail(self, url): 184 | """解析新闻详细页面,包括快照和附件""" 185 | soup, html_content = self.get_soup(url) 186 | if not soup: 187 | return {'source': '', 'content': ''}, None, [] 188 | 189 | try: 
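            # The detail page is archived in three steps: an HTML snapshot keyed by
            # an MD5 content hash, attachments saved into GridFS, and finally the
            # parsed source/content fields returned to the caller.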
190 | # 保存快照 191 | snapshot_hash = self.save_snapshot(url, html_content) 192 | 193 | # 查找附件 194 | attachments = self.find_attachments(soup, url) 195 | saved_attachments = [] 196 | 197 | # 保存附件 198 | for attachment in attachments: 199 | file_id = self.save_attachment(attachment) 200 | if file_id: 201 | saved_attachments.append({ 202 | 'file_id': file_id, 203 | 'url': attachment['url'], 204 | 'filename': attachment['filename'], 205 | 'title': attachment['title'] 206 | }) 207 | 208 | # 解析内容 209 | source_span = soup.find('span', string=re.compile('来源:')) 210 | source = source_span.text.strip() if source_span else '' 211 | 212 | content_div = soup.find('td', id='txt') 213 | if content_div: 214 | paragraphs = content_div.find_all('p') 215 | content = '\n'.join([p.text.strip() for p in paragraphs if p.text.strip()]) 216 | else: 217 | content = '' 218 | 219 | return { 220 | 'source': source, 221 | 'content': content 222 | }, snapshot_hash, saved_attachments 223 | 224 | except Exception as e: 225 | logging.error(f"Error parsing detail page {url}: {str(e)}") 226 | return {'source': '', 'content': ''}, None, [] 227 | 228 | def scrape_batch(self, urls, batch_size=10): 229 | """批量抓取新闻并保存到MongoDB""" 230 | for i in range(0, len(urls), batch_size): 231 | batch_urls = urls[i:i + batch_size] 232 | batch_number = i // batch_size + 1 233 | 234 | logging.info(f"Processing batch {batch_number}, pages {i + 1} to {min(i + batch_size, len(urls))}") 235 | 236 | # 使用线程池并行处理每批URL 237 | with ThreadPoolExecutor(max_workers=5) as executor: 238 | batch_results = list(executor.map(self.parse_news_list_page, batch_urls)) 239 | 240 | # 合并结果 241 | batch_news = [item for sublist in batch_results if sublist for item in sublist] 242 | 243 | # 保存这一批次的数据到MongoDB 244 | inserted, updated = self.save_to_mongodb(batch_news, batch_number) 245 | logging.info(f"Batch {batch_number} completed: {inserted} new items, {updated} updates") 246 | 247 | # 批次间休息 248 | time.sleep(random.uniform(3, 5)) 249 | 250 | def scrape(self): 251 | """主抓取函数""" 252 | logging.info("Starting to scrape news...") 253 | urls = self.get_page_urls() 254 | self.scrape_batch(urls) 255 | 256 | # 打印最终统计信息 257 | total_news = self.get_news_count() 258 | logging.info(f"Scraping completed. 
Total news in database: {total_news}") 259 | 260 | def save_to_mongodb(self, news_items, batch_number=None): 261 | """保存数据到MongoDB""" 262 | if not news_items: 263 | logging.warning("No data to save to MongoDB") 264 | return 0, 0 265 | 266 | inserted_count = 0 267 | updated_count = 0 268 | 269 | for item in news_items: 270 | try: 271 | # 添加时间戳和批次信息 272 | item['created_at'] = datetime.now() 273 | item['batch_number'] = batch_number 274 | 275 | # 使用update_one with upsert=True来避免重复插入 276 | result = self.news_collection.update_one( 277 | {'url': item['url']}, # 查询条件 278 | {'$set': item}, # 更新的数据 279 | upsert=True # 如果不存在则插入 280 | ) 281 | 282 | if result.upserted_id: 283 | inserted_count += 1 284 | elif result.modified_count: 285 | updated_count += 1 286 | 287 | except Exception as e: 288 | logging.error(f"Error saving to MongoDB: {str(e)}") 289 | continue 290 | 291 | logging.info( 292 | f"Batch {batch_number}: Inserted {inserted_count} new documents, Updated {updated_count} documents") 293 | return inserted_count, updated_count 294 | 295 | def get_news_count(self): 296 | """获取数据库中的新闻总数""" 297 | return self.news_collection.count_documents({}) 298 | def cleanup(self): 299 | """清理资源""" 300 | self.mongo_client.close() 301 | 302 | 303 | def main(): 304 | scraper = None 305 | try: 306 | scraper = NewsScraperNankai() 307 | scraper.scrape() 308 | except Exception as e: 309 | logging.error(f"An error occurred during scraping: {str(e)}") 310 | finally: 311 | if scraper: 312 | scraper.cleanup() 313 | 314 | 315 | if __name__ == "__main__": 316 | main() -------------------------------------------------------------------------------- /Spider/mutispider_pagerank.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | from datetime import datetime 5 | import re 6 | import time 7 | import random 8 | from concurrent.futures import ThreadPoolExecutor 9 | import logging 10 | from pymongo import MongoClient 11 | from pymongo.errors import DuplicateKeyError 12 | from pymongo.operations import UpdateOne 13 | import hashlib 14 | import os 15 | import mimetypes 16 | import gridfs 17 | import numpy as np 18 | 19 | 20 | class PageRankCalculator: 21 | def __init__(self, mongo_client): 22 | self.db = mongo_client['nankai_news_datasets'] 23 | self.news_collection = self.db['NEWS'] 24 | self.links_collection = self.db['LINKS'] 25 | self.pagerank_collection = self.db['PAGERANK'] 26 | 27 | # 创建索引 28 | self.links_collection.create_index([('from_url', 1), ('to_url', 1)], unique=True) 29 | self.pagerank_collection.create_index([('url', 1)], unique=True) 30 | 31 | def extract_links(self, soup, current_url): 32 | """提取页面中的所有链接""" 33 | links = [] 34 | if not soup: 35 | return links 36 | 37 | for a_tag in soup.find_all('a', href=True): 38 | href = a_tag['href'] 39 | if href.startswith('/'): 40 | href = f"http://news.nankai.edu.cn{href}" 41 | elif not href.startswith('http'): 42 | continue 43 | 44 | if 'nankai.edu.cn' in href: # 只保留南开域名下的链接 45 | links.append({ 46 | 'from_url': current_url, 47 | 'to_url': href, 48 | 'anchor_text': a_tag.get_text(strip=True), 49 | 'created_at': datetime.now() 50 | }) 51 | return links 52 | 53 | def save_links(self, links): 54 | """保存链接关系到数据库""" 55 | if not links: 56 | return 57 | 58 | for link in links: 59 | try: 60 | self.links_collection.update_one( 61 | { 62 | 'from_url': link['from_url'], 63 | 'to_url': link['to_url'] 64 | }, 65 | {'$set': link}, 66 | upsert=True 67 | ) 68 | except 
Exception as e: 69 | logging.error(f"Error saving link: {str(e)}") 70 | 71 | def build_graph(self): 72 | """构建网页链接图""" 73 | graph = {} 74 | # 获取所有链接关系 75 | links = self.links_collection.find({}) 76 | 77 | for link in links: 78 | from_url = link['from_url'] 79 | to_url = link['to_url'] 80 | 81 | if from_url not in graph: 82 | graph[from_url] = [] 83 | if to_url not in graph: 84 | graph[to_url] = [] 85 | 86 | if to_url not in graph[from_url]: 87 | graph[from_url].append(to_url) 88 | 89 | return graph 90 | 91 | def calculate_pagerank(self, damping_factor=0.85, max_iterations=100, min_delta=1e-5): 92 | """计算PageRank值""" 93 | graph = self.build_graph() 94 | if not graph: 95 | logging.warning("No graph data available for PageRank calculation") 96 | return {} 97 | 98 | # 初始化PageRank值 99 | num_pages = len(graph) 100 | initial_value = 1.0 / num_pages 101 | pagerank = {url: initial_value for url in graph} 102 | 103 | for iteration in range(max_iterations): 104 | new_pagerank = {} 105 | total_diff = 0 106 | 107 | # 计算新的PageRank值 108 | for url in graph: 109 | incoming_pr = 0 110 | for incoming_url in graph: 111 | if url in graph[incoming_url]: 112 | outgoing_count = len(graph[incoming_url]) 113 | if outgoing_count > 0: 114 | incoming_pr += pagerank[incoming_url] / outgoing_count 115 | 116 | new_value = (1 - damping_factor) / num_pages + damping_factor * incoming_pr 117 | new_pagerank[url] = new_value 118 | total_diff += abs(new_value - pagerank[url]) 119 | 120 | # 更新PageRank值 121 | pagerank = new_pagerank 122 | 123 | # 检查是否收敛 124 | if total_diff < min_delta: 125 | logging.info(f"PageRank converged after {iteration + 1} iterations") 126 | break 127 | 128 | return pagerank 129 | 130 | def update_pagerank_scores(self): 131 | """更新数据库中的PageRank分数""" 132 | pagerank_scores = self.calculate_pagerank() 133 | 134 | # 批量更新PageRank值 135 | operations = [] 136 | timestamp = datetime.now() 137 | 138 | for url, score in pagerank_scores.items(): 139 | operations.append(UpdateOne( 140 | {'url': url}, 141 | { 142 | '$set': { 143 | 'pagerank': score, 144 | 'updated_at': timestamp 145 | } 146 | }, 147 | upsert=True 148 | )) 149 | 150 | if operations: 151 | try: 152 | result = self.pagerank_collection.bulk_write(operations) 153 | logging.info(f"Updated {result.modified_count} PageRank scores, " 154 | f"Inserted {result.upserted_count} new scores") 155 | except Exception as e: 156 | logging.error(f"Error updating PageRank scores: {str(e)}") 157 | 158 | def should_update_pagerank(self, threshold=1000): 159 | """判断是否需要更新PageRank""" 160 | last_update = self.pagerank_collection.find_one( 161 | sort=[('updated_at', -1)] 162 | ) 163 | 164 | if not last_update: 165 | return True 166 | 167 | # 检查新增链接数量 168 | new_links_count = self.links_collection.count_documents({ 169 | 'created_at': {'$gt': last_update['updated_at']} 170 | }) 171 | 172 | return new_links_count >= threshold 173 | 174 | 175 | class NewsScraperNankai: 176 | def __init__(self): 177 | self.base_url = "http://news.nankai.edu.cn" 178 | self.first_page = "http://news.nankai.edu.cn/dcxy/index.shtml" 179 | self.page_template = "https://news.nankai.edu.cn/dcxy/system/count//0005000/000000000000/000/000/c0005000000000000000_000000{:03d}.shtml" 180 | self.max_pages = 524 181 | 182 | # MongoDB连接设置 183 | self.mongo_client = MongoClient('mongodb://localhost:27017/') 184 | self.db = self.mongo_client['nankai_news_datasets'] 185 | self.news_collection = self.db['NEWS'] 186 | self.snapshot_collection = self.db['WEB_snapshot'] 187 | self.fs = gridfs.GridFS(self.db) # 用于存储附件 188 | 
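        # GridFS stores the attachment binaries (PDFs, video, archives), which can
        # exceed MongoDB's 16 MB document limit; the unique index on 'url' created
        # below lets save_to_mongodb upsert pages without inserting duplicates.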
189 | # 创建索引 190 | self.news_collection.create_index([('url', 1)], unique=True) 191 | self.snapshot_collection.create_index([('url', 1), ('captured_at', -1)]) 192 | 193 | # 初始化PageRank计算器 194 | self.pagerank_calculator = PageRankCalculator(self.mongo_client) 195 | 196 | # 支持的附件类型 197 | self.supported_attachments = [ 198 | ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", # 常见文档格式 199 | ".mp3", ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", # 音频和视频格式 200 | ".zip", ".rar", ".tar", ".gz", ".bz2", ".7z", # 压缩文件格式 201 | ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", # 图片格式 202 | ".exe", ".apk", ".dmg", # 可执行文件和应用程序 203 | ".csv", ".txt", ".rtf", # 文本文件 204 | ".xls", ".xlsx", # 表格文件 205 | ] 206 | 207 | # 设置日志 208 | logging.basicConfig( 209 | level=logging.INFO, 210 | format='%(asctime)s - %(levelname)s - %(message)s', 211 | handlers=[ 212 | logging.FileHandler('scraper.log', encoding='utf-8'), 213 | logging.StreamHandler() 214 | ] 215 | ) 216 | 217 | # 设置请求头 218 | self.headers = { 219 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 220 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 221 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 222 | 'Connection': 'keep-alive' 223 | } 224 | 225 | def get_page_urls(self): 226 | """生成所有页面的URL""" 227 | urls = [self.first_page] # 第一页 228 | urls.extend(self.page_template.format(i) for i in range(1, self.max_pages + 1)) 229 | return urls 230 | 231 | def get_soup(self, url, retries=3): 232 | """获取页面的BeautifulSoup对象和原始HTML内容""" 233 | for i in range(retries): 234 | try: 235 | time.sleep(random.uniform(1, 3)) 236 | response = requests.get(url, headers=self.headers, timeout=10) 237 | response.encoding = 'utf-8' 238 | 239 | if response.status_code == 200: 240 | html_content = response.text 241 | return BeautifulSoup(html_content, 'html.parser'), html_content 242 | else: 243 | logging.warning(f"Failed to fetch {url}, status code: {response.status_code}") 244 | 245 | except Exception as e: 246 | logging.error(f"Attempt {i + 1} failed for {url}: {str(e)}") 247 | if i == retries - 1: 248 | logging.error(f"All attempts failed for {url}") 249 | return None, None 250 | time.sleep(random.uniform(2, 5)) 251 | return None, None 252 | 253 | def save_snapshot(self, url, html_content): 254 | """保存网页快照""" 255 | try: 256 | snapshot_data = { 257 | 'url': url, 258 | 'html_content': html_content, 259 | 'captured_at': datetime.now(), 260 | 'content_hash': hashlib.md5(html_content.encode('utf-8')).hexdigest() 261 | } 262 | self.snapshot_collection.insert_one(snapshot_data) 263 | return snapshot_data['content_hash'] 264 | except Exception as e: 265 | logging.error(f"Error saving snapshot for {url}: {str(e)}") 266 | return None 267 | 268 | def find_attachments(self, soup, base_url): 269 | """查找页面中的附件链接""" 270 | attachments = [] 271 | for link in soup.find_all('a', href=True): 272 | href = link['href'].lower() 273 | if any(ext in href for ext in self.supported_attachments): 274 | full_url = self.base_url + href if href.startswith('/') else href 275 | attachments.append({ 276 | 'url': full_url, 277 | 'filename': os.path.basename(href), 278 | 'title': link.text.strip() 279 | }) 280 | return attachments 281 | 282 | def save_attachment(self, attachment_info): 283 | """保存附件到GridFS""" 284 | try: 285 | response = requests.get(attachment_info['url'], headers=self.headers, timeout=30) 286 | if response.status_code == 200: 287 | file_id = self.fs.put( 288 | 
response.content, 289 | filename=attachment_info['filename'], 290 | url=attachment_info['url'], 291 | title=attachment_info['title'], 292 | upload_date=datetime.now() 293 | ) 294 | return file_id 295 | except Exception as e: 296 | logging.error(f"Error saving attachment {attachment_info['url']}: {str(e)}") 297 | return None 298 | 299 | def parse_news_list_page(self, url): 300 | """解析新闻列表页面""" 301 | soup, html_content = self.get_soup(url) 302 | if not soup: 303 | return [] 304 | 305 | # 保存列表页快照 306 | snapshot_hash = self.save_snapshot(url, html_content) 307 | 308 | # 提取并保存页面链接关系 309 | links = self.pagerank_calculator.extract_links(soup, url) 310 | self.pagerank_calculator.save_links(links) 311 | 312 | news_items = [] 313 | tables = soup.find_all('table', attrs={'width': "98%", 'border': "0", 'cellpadding': "0", 'cellspacing': "0"}) 314 | 315 | for table in tables: 316 | try: 317 | title_link = table.find('a') 318 | if not title_link: 319 | continue 320 | 321 | title = title_link.text.strip() 322 | news_url = self.base_url + title_link['href'] if title_link['href'].startswith('/') else title_link[ 323 | 'href'] 324 | date_td = table.find('td', align="right") 325 | date = date_td.text.strip() if date_td else None 326 | 327 | logging.info(f"Processing: {title}") 328 | 329 | # 获取新闻详细内容和快照 330 | article_content, article_snapshot_hash, article_attachments = self.parse_news_detail(news_url) 331 | 332 | news_item = { 333 | 'title': title, 334 | 'url': news_url, 335 | 'date': date, 336 | 'source': article_content.get('source', ''), 337 | 'content': article_content.get('content', ''), 338 | 'snapshot_hash': article_snapshot_hash, 339 | 'attachments': article_attachments 340 | } 341 | 342 | news_items.append(news_item) 343 | 344 | except Exception as e: 345 | logging.error(f"Error parsing news item: {str(e)}") 346 | continue 347 | 348 | return news_items 349 | 350 | def parse_news_detail(self, url): 351 | """解析新闻详细页面,包括快照和附件""" 352 | soup, html_content = self.get_soup(url) 353 | if not soup: 354 | return {'source': '', 'content': ''}, None, [] 355 | 356 | try: 357 | # 保存快照 358 | snapshot_hash = self.save_snapshot(url, html_content) 359 | 360 | # 提取并保存页面链接关系 361 | links = self.pagerank_calculator.extract_links(soup, url) 362 | self.pagerank_calculator.save_links(links) 363 | 364 | # 查找附件 365 | attachments = self.find_attachments(soup, url) 366 | saved_attachments = [] 367 | 368 | # 保存附件 369 | for attachment in attachments: 370 | file_id = self.save_attachment(attachment) 371 | if file_id: 372 | saved_attachments.append({ 373 | 'file_id': file_id, 374 | 'url': attachment['url'], 375 | 'filename': attachment['filename'], 376 | 'title': attachment['title'] 377 | }) 378 | 379 | # 解析内容 380 | source_span = soup.find('span', string=re.compile('来源:')) 381 | source = source_span.text.strip() if source_span else '' 382 | 383 | content_div = soup.find('td', id='txt') 384 | if content_div: 385 | paragraphs = content_div.find_all('p') 386 | content = '\n'.join([p.text.strip() for p in paragraphs if p.text.strip()]) 387 | else: 388 | content = '' 389 | 390 | return { 391 | 'source': source, 392 | 'content': content 393 | }, snapshot_hash, saved_attachments 394 | 395 | except Exception as e: 396 | logging.error(f"Error parsing detail page {url}: {str(e)}") 397 | return {'source': '', 'content': ''}, None, [] 398 | 399 | def scrape_batch(self, urls, batch_size=10): 400 | """批量抓取新闻并保存到MongoDB""" 401 | for i in range(0, len(urls), batch_size): 402 | batch_urls = urls[i:i + batch_size] 403 | batch_number = i // 
batch_size + 1 404 | 405 | logging.info(f"Processing batch {batch_number}, pages {i + 1} to {min(i + batch_size, len(urls))}") 406 | 407 | # 使用线程池并行处理每批URL 408 | with ThreadPoolExecutor(max_workers=5) as executor: 409 | batch_results = list(executor.map(self.parse_news_list_page, batch_urls)) 410 | 411 | # 合并结果 412 | batch_news = [item for sublist in batch_results if sublist for item in sublist] 413 | 414 | # 保存这一批次的数据到MongoDB 415 | inserted, updated = self.save_to_mongodb(batch_news, batch_number) 416 | logging.info(f"Batch {batch_number} completed: {inserted} new items, {updated} updates") 417 | 418 | # 检查是否需要更新PageRank 419 | if self.pagerank_calculator.should_update_pagerank(): 420 | logging.info("Starting PageRank update...") 421 | self.pagerank_calculator.update_pagerank_scores() 422 | logging.info("PageRank update completed") 423 | 424 | # 批次间休息 425 | time.sleep(random.uniform(3, 5)) 426 | 427 | def save_to_mongodb(self, news_items, batch_number=None): 428 | """保存数据到MongoDB""" 429 | if not news_items: 430 | logging.warning("No data to save to MongoDB") 431 | return 0, 0 432 | 433 | inserted_count = 0 434 | updated_count = 0 435 | 436 | for item in news_items: 437 | try: 438 | # 添加时间戳和批次信息 439 | item['created_at'] = datetime.now() 440 | item['batch_number'] = batch_number 441 | 442 | # 使用update_one with upsert=True来避免重复插入 443 | result = self.news_collection.update_one( 444 | {'url': item['url']}, # 查询条件 445 | {'$set': item}, # 更新的数据 446 | upsert=True # 如果不存在则插入 447 | ) 448 | 449 | if result.upserted_id: 450 | inserted_count += 1 451 | elif result.modified_count: 452 | updated_count += 1 453 | 454 | except Exception as e: 455 | logging.error(f"Error saving to MongoDB: {str(e)}") 456 | continue 457 | 458 | logging.info( 459 | f"Batch {batch_number}: Inserted {inserted_count} new documents, Updated {updated_count} documents") 460 | return inserted_count, updated_count 461 | 462 | def get_news_count(self): 463 | """获取数据库中的新闻总数""" 464 | return self.news_collection.count_documents({}) 465 | 466 | def update_pagerank_if_needed(self): 467 | """检查并在需要时更新PageRank""" 468 | if self.pagerank_calculator.should_update_pagerank(): 469 | logging.info("Starting PageRank update...") 470 | self.pagerank_calculator.update_pagerank_scores() 471 | logging.info("PageRank update completed") 472 | 473 | def scrape(self): 474 | """主抓取函数""" 475 | logging.info("Starting to scrape news...") 476 | urls = self.get_page_urls() 477 | self.scrape_batch(urls) 478 | 479 | # 完成后更新一次PageRank 480 | self.update_pagerank_if_needed() 481 | 482 | # 打印最终统计信息 483 | total_news = self.get_news_count() 484 | logging.info(f"Scraping completed. 
Total news in database: {total_news}") 485 | 486 | def cleanup(self): 487 | """清理资源""" 488 | self.mongo_client.close() 489 | 490 | def main(): 491 | scraper = None 492 | try: 493 | scraper = NewsScraperNankai() 494 | scraper.scrape() 495 | except Exception as e: 496 | logging.error(f"An error occurred during scraping: {str(e)}") 497 | finally: 498 | if scraper: 499 | scraper.cleanup() 500 | 501 | if __name__ == "__main__": 502 | main() -------------------------------------------------------------------------------- /data_clean/clean_document.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | 4 | class MongoDBCleaner: 5 | def __init__(self, db_name, collection_name): 6 | # 连接到 MongoDB 7 | self.client = MongoClient('mongodb://localhost:27017/') 8 | self.db = self.client[db_name] 9 | self.collection = self.db[collection_name] 10 | 11 | def clean_data(self): 12 | """清洗数据:删除 chunkSize 字段并添加 filetype 字段""" 13 | print("开始清洗数据...") 14 | 15 | # 查询所有文档总数 16 | total = self.collection.count_documents({}) # 获取集合中文档总数 17 | print(f"总文档数: {total}") 18 | 19 | # 初始化更新计数 20 | updated = 0 21 | 22 | # 遍历所有文档 23 | with self.collection.find({}, {'filename': 1}) as cursor: # 只取需要字段 24 | for doc in cursor: 25 | # 提取文件类型 26 | filetype = None 27 | if 'filename' in doc: 28 | filetype = doc['filename'].split('.')[-1] if '.' in doc['filename'] else 'unknown' 29 | 30 | # 构造更新操作 31 | update_query = { 32 | '$unset': {'chunkSize': ""}, # 删除 chunkSize 字段 33 | '$set': {'filetype': filetype} # 添加 filetype 字段 34 | } 35 | 36 | # 执行更新 37 | self.collection.update_one({'_id': doc['_id']}, update_query) 38 | updated += 1 39 | 40 | # 打印进度(每 100 条打印一次) 41 | if updated % 100 == 0: 42 | print(f"已更新 {updated}/{total} 条记录...") 43 | 44 | print(f"清洗完成!共更新 {updated}/{total} 条记录。") 45 | 46 | 47 | if __name__ == "__main__": 48 | # 替换为你的数据库名称和集合名称 49 | db_name = "nankai_news_datasets" 50 | collection_name = "Document" 51 | 52 | cleaner = MongoDBCleaner(db_name, collection_name) 53 | cleaner.clean_data() 54 | -------------------------------------------------------------------------------- /data_clean/load_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from pymongo import MongoClient 4 | from datetime import datetime 5 | 6 | client = MongoClient('mongodb://localhost:27017/') 7 | db = client['nankai_news_datasets'] 8 | collection = db['2024_12_01_02_57_18'] 9 | 10 | for filename in os.listdir(r'C:\Xing\IR\lab4\code\2024_12_01_02_57_18'): 11 | if filename.endswith('.json'): 12 | with open(os.path.join(r'C:\Xing\IR\lab4\code\2024_12_01_02_57_18', filename), 'r', encoding='utf-8') as f: 13 | json_data = json.load(f) 14 | collection.insert_one(json_data) -------------------------------------------------------------------------------- /data_clean/merge_data.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from datetime import datetime 3 | 4 | 5 | def merge_collections(): 6 | # 连接MongoDB 7 | client = MongoClient('mongodb://localhost:27017/') 8 | db = client['nankai_news_datasets'] # 替换成你的数据库名 9 | 10 | # 源集合名称 11 | collection1_name = '2024_11_30_00_52_59' 12 | collection2_name = '2024_11_30_02_32_56' 13 | # 目标集合名称(合并后的集合) 14 | merged_collection_name = 'NEWS1' 15 | 16 | try: 17 | # 创建一个新的集合来存储合并结果 18 | if merged_collection_name in db.list_collection_names(): 19 | print(f"集合 {merged_collection_name} 已存在,先删除它") 20 | 
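            # $merge below matches on _id by default, so 'whenMatched': 'keepExisting'
            # only skips documents sharing an _id; URL- and content-level dedup is
            # handled afterwards by news1_clean_ distinct.py.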
db[merged_collection_name].drop() 21 | 22 | # 记录合并前的文档数量 23 | count1 = db[collection1_name].count_documents({}) 24 | count2 = db[collection2_name].count_documents({}) 25 | print(f"合并前统计:") 26 | print(f"集合 {collection1_name}: {count1} 条文档") 27 | print(f"集合 {collection2_name}: {count2} 条文档") 28 | 29 | # 使用聚合管道合并集合 30 | pipeline = [ 31 | {'$out': merged_collection_name} 32 | ] 33 | 34 | # 将第一个集合的数据写入新集合 35 | db[collection1_name].aggregate(pipeline) 36 | 37 | # 将第二个集合的数据添加到新集合 38 | db[collection2_name].aggregate([ 39 | {'$merge': { 40 | 'into': merged_collection_name, 41 | 'whenMatched': 'keepExisting', # 如果遇到重复文档,保留已存在的 42 | 'whenNotMatched': 'insert' # 如果是新文档,则插入 43 | }} 44 | ]) 45 | 46 | # 统计合并后的文档数量 47 | merged_count = db[merged_collection_name].count_documents({}) 48 | print(f"\n合并完成!") 49 | print(f"合并后的集合 {merged_collection_name}: {merged_count} 条文档") 50 | 51 | # 检查是否有重复文档 52 | if merged_count < count1 + count2: 53 | print(f"注意:检测到 {count1 + count2 - merged_count} 条重复文档被跳过") 54 | 55 | # 显示合并后的示例文档 56 | print("\n合并后的文档示例:") 57 | sample_doc = db[merged_collection_name].find_one() 58 | print(sample_doc) 59 | 60 | except Exception as e: 61 | print(f"合并过程中出错: {str(e)}") 62 | 63 | finally: 64 | client.close() 65 | 66 | 67 | if __name__ == "__main__": 68 | merge_collections() -------------------------------------------------------------------------------- /data_clean/news1_clean_ distinct.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from bson.objectid import ObjectId 3 | 4 | 5 | def remove_duplicates(): 6 | # 连接MongoDB 7 | client = MongoClient('mongodb://localhost:27017/') 8 | db = client['nankai_news_datasets'] 9 | collection = db['NEWS1'] 10 | 11 | try: 12 | # 记录清理前的文档数量 13 | initial_count = collection.count_documents({}) 14 | print(f"清理前文档数量: {initial_count}") 15 | 16 | # 1. 基于URL去重 17 | print("\n正在基于URL去重...") 18 | duplicate_urls = collection.aggregate([ 19 | {"$group": { 20 | "_id": "$url", 21 | "count": {"$sum": 1}, 22 | "ids": {"$push": "$_id"}, 23 | "first_id": {"$first": "$_id"} 24 | }}, 25 | {"$match": { 26 | "count": {"$gt": 1} 27 | }} 28 | ], allowDiskUse=True) # 允许使用磁盘处理大数据集 29 | 30 | url_dups_removed = 0 31 | for dup in duplicate_urls: 32 | # 获取要删除的文档ID(除了第一个之外的所有ID) 33 | ids_to_remove = [id for id in dup["ids"] if id != dup["first_id"]] 34 | if ids_to_remove: 35 | result = collection.delete_many({"_id": {"$in": ids_to_remove}}) 36 | url_dups_removed += result.deleted_count 37 | print(f"删除了 {result.deleted_count} 条URL重复的文档") 38 | 39 | # 2. 
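# Illustrative sketch (hypothetical, not from the original sources): the URL pass
# above and the content pass below share the same aggregation pattern, so a single
# helper could serve both. Collection and field names are taken from this script.
def remove_duplicates_by_field(collection, field):
    """Keep the first document per distinct value of `field` and delete the rest."""
    removed = 0
    groups = collection.aggregate([
        {"$group": {"_id": f"${field}",
                    "ids": {"$push": "$_id"},
                    "first_id": {"$first": "$_id"},
                    "count": {"$sum": 1}}},
        {"$match": {"count": {"$gt": 1}}}
    ], allowDiskUse=True)
    for group in groups:
        extra_ids = [doc_id for doc_id in group["ids"] if doc_id != group["first_id"]]
        if extra_ids:
            removed += collection.delete_many({"_id": {"$in": extra_ids}}).deleted_count
    return removed
# Hypothetical usage mirroring the two passes in this script:
# remove_duplicates_by_field(collection, 'url'); remove_duplicates_by_field(collection, 'content')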
基于内容去重 40 | print("\n正在基于内容去重...") 41 | duplicate_contents = collection.aggregate([ 42 | {"$group": { 43 | "_id": "$content", 44 | "count": {"$sum": 1}, 45 | "ids": {"$push": "$_id"}, 46 | "first_id": {"$first": "$_id"} 47 | }}, 48 | {"$match": { 49 | "count": {"$gt": 1} 50 | }} 51 | ], allowDiskUse=True) 52 | 53 | content_dups_removed = 0 54 | for dup in duplicate_contents: 55 | ids_to_remove = [id for id in dup["ids"] if id != dup["first_id"]] 56 | if ids_to_remove: 57 | result = collection.delete_many({"_id": {"$in": ids_to_remove}}) 58 | content_dups_removed += result.deleted_count 59 | print(f"删除了 {result.deleted_count} 条内容重复的文档") 60 | 61 | # 验证删除结果 62 | final_count = collection.count_documents({}) 63 | total_removed = initial_count - final_count 64 | 65 | print("\n清理结果统计:") 66 | print(f"初始文档数量: {initial_count}") 67 | print(f"最终文档数量: {final_count}") 68 | print(f"基于URL删除的重复文档: {url_dups_removed}") 69 | print(f"基于内容删除的重复文档: {content_dups_removed}") 70 | print(f"实际减少的文档数量: {total_removed}") 71 | 72 | # 显示一个示例文档以确认集合仍然可访问 73 | print("\n清理后的文档示例:") 74 | sample = collection.find_one() 75 | if sample: 76 | print(f"文档ID: {sample['_id']}") 77 | print(f"URL: {sample.get('url', 'N/A')}") 78 | print(f"标题: {sample.get('title', 'N/A')}") 79 | else: 80 | print("警告:无法获取示例文档") 81 | 82 | except Exception as e: 83 | print(f"清理过程中出错: {str(e)}") 84 | print("错误堆栈:") 85 | import traceback 86 | print(traceback.format_exc()) 87 | 88 | finally: 89 | client.close() 90 | 91 | 92 | if __name__ == "__main__": 93 | # 确认操作 94 | print("注意:此操作将直接删除重复文档。建议先备份数据。") 95 | confirm = input("是否继续?(y/n): ") 96 | if confirm.lower() == 'y': 97 | remove_duplicates() 98 | else: 99 | print("操作已取消") -------------------------------------------------------------------------------- /data_clean/news_clean_distinct.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from pprint import pprint 3 | 4 | 5 | def check_duplicates(): 6 | # 连接MongoDB 7 | client = MongoClient('mongodb://localhost:27017/') 8 | db = client['nankai_news_datasets'] # 替换成你的数据库名 9 | collection = db['NEWS'] # 替换成你的集合名 10 | 11 | try: 12 | # 获取总文档数 13 | total_docs = collection.count_documents({}) 14 | print(f"\n数据库总文档数量: {total_docs}") 15 | 16 | # 查找重复的URL 17 | print("\n==== URL重复情况统计 ====") 18 | duplicate_urls = list(collection.aggregate([ 19 | { 20 | "$group": { 21 | "_id": "$url", 22 | "count": {"$sum": 1}, 23 | "documents": { 24 | "$push": { 25 | "_id": "$_id", 26 | "title": "$title", 27 | "source": "$source", 28 | "date": "$date" 29 | } 30 | } 31 | } 32 | }, 33 | { 34 | "$match": { 35 | "count": {"$gt": 1} 36 | } 37 | }, 38 | { 39 | "$sort": {"count": -1} # 按重复次数降序排序 40 | } 41 | ])) 42 | 43 | # 打印重复URL的统计信息 44 | if duplicate_urls: 45 | print(f"\n发现 {len(duplicate_urls)} 组重复URL") 46 | total_duplicates = sum(doc['count'] - 1 for doc in duplicate_urls) 47 | print(f"总共有 {total_duplicates} 条重复文档需要清理") 48 | 49 | # 显示重复文档的详细示例 50 | print("\n==== 重复文档示例(显示前3组) ====") 51 | for i, dup in enumerate(duplicate_urls[:3], 1): 52 | print(f"\n第 {i} 组重复 (重复 {dup['count']} 次):") 53 | print(f"URL: {dup['_id']}") 54 | print("包含的文档:") 55 | for doc in dup['documents']: 56 | print("-" * 50) 57 | print(f"文档ID: {doc['_id']}") 58 | print(f"标题: {doc.get('title', 'N/A')}") 59 | print(f"来源: {doc.get('source', 'N/A')}") 60 | print(f"日期: {doc.get('date', 'N/A')}") 61 | 62 | # 显示重复次数分布 63 | print("\n==== 重复次数分布 ====") 64 | duplicate_counts = {} 65 | for dup in duplicate_urls: 66 | count = dup['count'] 67 | 
duplicate_counts[count] = duplicate_counts.get(count, 0) + 1 68 | 69 | for count, freq in sorted(duplicate_counts.items()): 70 | print(f"重复 {count} 次的URL有 {freq} 个") 71 | 72 | else: 73 | print("没有发现重复的URL") 74 | 75 | except Exception as e: 76 | print(f"检查过程中出错: {str(e)}") 77 | import traceback 78 | print(traceback.format_exc()) 79 | 80 | finally: 81 | client.close() 82 | 83 | 84 | if __name__ == "__main__": 85 | print("开始检查重复数据...") 86 | check_duplicates() 87 | 88 | user_input = input("\n是否需要进行数据清理?(y/n): ") 89 | if user_input.lower() == 'y': 90 | print("\n请运行清理脚本进行数据清理。") 91 | else: 92 | print("操作已取消") -------------------------------------------------------------------------------- /data_clean/news_clean_frame.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from datetime import datetime 3 | 4 | 5 | def test_cleaning_on_sample(): 6 | # 连接MongoDB 7 | client = MongoClient('mongodb://localhost:27017/') # 替换成你的MongoDB连接字符串 8 | db = client['nankai_news_datasets'] # 替换成你的数据库名 9 | collection = db['NEWS'] # 替换成你的集合名 10 | 11 | # 获取前10条数据的ID 12 | sample_ids = [doc['_id'] for doc in collection.find().limit(10)] 13 | 14 | print("测试前的数据样本:") 15 | for doc in collection.find({'_id': {'$in': sample_ids}}): 16 | print(f"ID: {doc['_id']}") 17 | print(f"Source: {doc.get('source', '未找到')}") 18 | print(f"Batch Number: {doc.get('batch_number', '未找到')}") 19 | print("-" * 50) 20 | 21 | try: 22 | # 仅对这10条数据进行清洗 23 | # 清理source字段中的"来源:" 24 | collection.update_many( 25 | { 26 | '_id': {'$in': sample_ids}, 27 | 'source': {'$regex': '来源:'} 28 | }, 29 | [{ 30 | '$set': { 31 | 'source': { 32 | '$replaceAll': { 33 | 'input': '$source', 34 | 'find': '来源:', 35 | 'replacement': '' 36 | } 37 | } 38 | } 39 | }] 40 | ) 41 | 42 | # 删除batch_number字段 43 | collection.update_many( 44 | {'_id': {'$in': sample_ids}}, 45 | {'$unset': {'batch_number': ''}} 46 | ) 47 | 48 | print("\n清洗后的数据样本:") 49 | for doc in collection.find({'_id': {'$in': sample_ids}}): 50 | print(f"ID: {doc['_id']}") 51 | print(f"Source: {doc.get('source', '未找到')}") 52 | print(f"Batch Number: {doc.get('batch_number', '未找到')}") 53 | print("-" * 50) 54 | 55 | user_input = input("\n测试结果是否符合预期?(y/n): ") 56 | 57 | if user_input.lower() == 'y': 58 | print("\n是否要对所有数据进行清洗?(y/n): ") 59 | clean_all = input() 60 | if clean_all.lower() == 'y': 61 | # 清洗所有数据 62 | collection.update_many( 63 | {'source': {'$regex': '来源:'}}, 64 | [{ 65 | '$set': { 66 | 'source': { 67 | '$replaceAll': { 68 | 'input': '$source', 69 | 'find': '来源:', 70 | 'replacement': '' 71 | } 72 | } 73 | } 74 | }] 75 | ) 76 | 77 | collection.update_many( 78 | {}, 79 | {'$unset': {'batch_number': ''}} 80 | ) 81 | print("所有数据清洗完成!") 82 | else: 83 | print("操作已取消") 84 | else: 85 | print("请调整清洗规则后重试") 86 | 87 | except Exception as e: 88 | print(f"发生错误: {str(e)}") 89 | 90 | finally: 91 | client.close() 92 | 93 | 94 | if __name__ == "__main__": 95 | test_cleaning_on_sample() -------------------------------------------------------------------------------- /db_init/init_db.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient, ASCENDING 2 | from datetime import datetime 3 | 4 | 5 | def init_user_database(): 6 | """初始化用户相关的所有数据库集合""" 7 | try: 8 | # 连接数据库 9 | client = MongoClient('localhost', 27017) 10 | db = client['nankai_news_datasets'] # 使用现有的数据库 11 | 12 | # 1. 
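# Illustrative sketch (hypothetical, not from the original sources): sections 1-4
# below repeat the same "create the collection if missing, then add its indexes"
# steps. A small helper capturing that pattern could look like this; the names in
# the example call come from this script.
def ensure_collection(db, name, indexes):
    """Create `name` if it does not exist and make sure the given indexes exist.

    `indexes` is a list of (keys, kwargs) tuples,
    e.g. ([('username', ASCENDING)], {'unique': True}).
    """
    if name not in db.list_collection_names():
        db.create_collection(name)
        print(f"{name} 集合创建成功")
    for keys, kwargs in indexes:
        db[name].create_index(keys, **kwargs)
# Hypothetical usage for the first collection below:
# ensure_collection(db, 'users', [([('username', ASCENDING)], {'unique': True}),
#                                 ([('email', ASCENDING)], {'unique': True})])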
用户集合 (users) 13 | if 'users' not in db.list_collection_names(): 14 | users = db.create_collection('users') 15 | users.create_index([('username', ASCENDING)], unique=True) 16 | users.create_index([('email', ASCENDING)], unique=True) 17 | print("用户集合创建成功") 18 | 19 | # 2. 搜索历史集合 (search_history) 20 | if 'search_history' not in db.list_collection_names(): 21 | search_history = db.create_collection('search_history') 22 | search_history.create_index([('user_id', ASCENDING)]) 23 | search_history.create_index([('timestamp', ASCENDING)]) 24 | print("搜索历史集合创建成功") 25 | 26 | # 3. 用户偏好设置集合 (user_preferences) 27 | if 'user_preferences' not in db.list_collection_names(): 28 | preferences = db.create_collection('user_preferences') 29 | preferences.create_index([('user_id', ASCENDING)], unique=True) 30 | print("用户偏好集合创建成功") 31 | 32 | # 4. 登录历史集合 (login_history) 33 | if 'login_history' not in db.list_collection_names(): 34 | login_history = db.create_collection('login_history') 35 | login_history.create_index([('user_id', ASCENDING)]) 36 | login_history.create_index([('login_time', ASCENDING)]) 37 | print("登录历史集合创建成功") 38 | 39 | print("\n数据库初始化完成!创建了以下集合:") 40 | print("- users: 用户基本信息") 41 | print("- search_history: 搜索历史记录") 42 | print("- user_preferences: 用户偏好设置") 43 | print("- login_history: 登录历史记录") 44 | 45 | # 展示所有集合的结构 46 | print("\n各集合的数据结构:") 47 | print("\nusers 集合结构:") 48 | print({ 49 | "username": "用户名 (唯一)", 50 | "email": "邮箱 (唯一)", 51 | "password": "密码哈希", 52 | "created_at": "创建时间", 53 | "last_login": "最后登录时间" 54 | }) 55 | 56 | print("\nsearch_history 集合结构:") 57 | print({ 58 | "user_id": "用户ID", 59 | "query": "搜索关键词", 60 | "search_in": "搜索范围", 61 | "sort_by": "排序方式", 62 | "timestamp": "搜索时间" 63 | }) 64 | 65 | print("\nuser_preferences 集合结构:") 66 | print({ 67 | "user_id": "用户ID", 68 | "default_search_in": "默认搜索范围", 69 | "default_sort_by": "默认排序方式", 70 | "results_per_page": "每页结果数" 71 | }) 72 | 73 | print("\nlogin_history 集合结构:") 74 | print({ 75 | "user_id": "用户ID", 76 | "login_time": "登录时间", 77 | "ip_address": "IP地址" 78 | }) 79 | 80 | except Exception as e: 81 | print(f"初始化数据库时出错: {str(e)}") 82 | raise e 83 | 84 | 85 | if __name__ == "__main__": 86 | init_user_database() -------------------------------------------------------------------------------- /db_init/init_db_new.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient, ASCENDING 2 | from datetime import datetime 3 | 4 | 5 | def init_user_database(): 6 | """初始化用户相关的所有数据库集合""" 7 | try: 8 | # 连接数据库 9 | client = MongoClient('localhost', 27017) 10 | db = client['nankai_news_datasets'] # 使用现有的数据库 11 | 12 | # 1. 用户集合 (users) 13 | if 'users' not in db.list_collection_names(): 14 | users = db.create_collection('users') 15 | users.create_index([('username', ASCENDING)], unique=True) 16 | users.create_index([('email', ASCENDING)], unique=True) 17 | print("用户集合创建成功") 18 | 19 | # 2. 搜索历史集合 (search_history) 20 | if 'search_history' not in db.list_collection_names(): 21 | search_history = db.create_collection('search_history') 22 | search_history.create_index([('user_id', ASCENDING)]) 23 | search_history.create_index([('timestamp', ASCENDING)]) 24 | print("搜索历史集合创建成功") 25 | 26 | # 3. 用户偏好设置集合 (user_preferences) 27 | if 'user_preferences' not in db.list_collection_names(): 28 | preferences = db.create_collection('user_preferences') 29 | preferences.create_index([('user_id', ASCENDING)], unique=True) 30 | print("用户偏好集合创建成功") 31 | 32 | # 4. 
登录历史集合 (login_history) 33 | if 'login_history' not in db.list_collection_names(): 34 | login_history = db.create_collection('login_history') 35 | login_history.create_index([('user_id', ASCENDING)]) 36 | login_history.create_index([('login_time', ASCENDING)]) 37 | print("登录历史集合创建成功") 38 | 39 | # 5. 新增:用户身份信息集合 (user_profiles) 40 | if 'user_profiles' not in db.list_collection_names(): 41 | user_profiles = db.create_collection('user_profiles') 42 | # 创建user_id索引确保一个用户只有一个profile 43 | user_profiles.create_index([('user_id', ASCENDING)], unique=True) 44 | # 为了支持按身份类型和学院查询,创建这些字段的索引 45 | user_profiles.create_index([('role', ASCENDING)]) 46 | user_profiles.create_index([('college', ASCENDING)]) 47 | print("用户身份信息集合创建成功") 48 | 49 | print("\n数据库初始化完成!创建了以下集合:") 50 | print("- users: 用户基本信息") 51 | print("- search_history: 搜索历史记录") 52 | print("- user_preferences: 用户偏好设置") 53 | print("- login_history: 登录历史记录") 54 | print("- user_profiles: 用户身份信息") 55 | 56 | # 展示所有集合的结构 57 | print("\n各集合的数据结构:") 58 | print("\nusers 集合结构:") 59 | print({ 60 | "username": "用户名 (唯一)", 61 | "email": "邮箱 (唯一)", 62 | "password": "密码哈希", 63 | "created_at": "创建时间", 64 | "last_login": "最后登录时间" 65 | }) 66 | 67 | print("\nsearch_history 集合结构:") 68 | print({ 69 | "user_id": "用户ID", 70 | "query": "搜索关键词", 71 | "search_in": "搜索范围", 72 | "sort_by": "排序方式", 73 | "timestamp": "搜索时间" 74 | }) 75 | 76 | print("\nuser_preferences 集合结构:") 77 | print({ 78 | "user_id": "用户ID", 79 | "default_search_in": "默认搜索范围", 80 | "default_sort_by": "默认排序方式", 81 | "results_per_page": "每页结果数" 82 | }) 83 | 84 | print("\nlogin_history 集合结构:") 85 | print({ 86 | "user_id": "用户ID", 87 | "login_time": "登录时间", 88 | "ip_address": "IP地址" 89 | }) 90 | 91 | print("\nuser_profiles 集合结构:") 92 | print({ 93 | "user_id": "用户ID (唯一)", 94 | "age": "年龄 (可选)", 95 | "role": "身份 (本科生/研究生/博士生/教师)", 96 | "college": "学院 (可选)", 97 | "major": "专业 (可选)", 98 | "grade": "年级 (可选)", 99 | "research_interests": "研究方向 (可选,数组)", 100 | "last_updated": "最后更新时间" 101 | }) 102 | 103 | except Exception as e: 104 | print(f"初始化数据库时出错: {str(e)}") 105 | raise e 106 | 107 | 108 | if __name__ == "__main__": 109 | init_user_database() -------------------------------------------------------------------------------- /db_init/init_user_profiles.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient, ASCENDING 2 | from datetime import datetime 3 | 4 | 5 | def init_user_profiles(): 6 | """创建用户身份信息表并为现有用户初始化数据""" 7 | try: 8 | # 连接数据库 9 | client = MongoClient('localhost', 27017) 10 | db = client['nankai_news_datasets'] 11 | 12 | # 1. 创建user_profiles集合 13 | if 'user_profiles' not in db.list_collection_names(): 14 | user_profiles = db.create_collection('user_profiles') 15 | # 创建user_id索引确保一个用户只有一个profile 16 | user_profiles.create_index([('user_id', ASCENDING)], unique=True) 17 | print("用户身份信息集合创建成功") 18 | else: 19 | user_profiles = db['user_profiles'] 20 | print("用户身份信息集合已存在") 21 | 22 | # 2. 获取现有用户列表 23 | existing_users = db.users.find({}, {'_id': 1}) 24 | 25 | # 3. 
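# Illustrative sketch (hypothetical, not from the original sources): the per-user
# loop below issues one insert per user. An equivalent single round-trip is a bulk
# upsert with $setOnInsert, which only writes a profile where none exists yet.
# Collection and field names come from this script.
from datetime import datetime
from pymongo import UpdateOne

def init_profiles_bulk(db):
    defaults = {"role": "未设置", "college": "未设置", "age": None,
                "created_at": datetime.now(), "last_updated": datetime.now()}
    operations = [UpdateOne({"user_id": user["_id"]},
                            {"$setOnInsert": {"user_id": user["_id"], **defaults}},
                            upsert=True)
                  for user in db.users.find({}, {"_id": 1})]
    if operations:
        result = db["user_profiles"].bulk_write(operations, ordered=False)
        print(f"新建 {result.upserted_count} 条用户身份信息")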
为现有用户初始化身份信息 26 | default_profile = { 27 | "role": "未设置", # 默认身份 28 | "college": "未设置", # 默认学院 29 | "age": None, # 默认年龄为空 30 | "created_at": datetime.now(), 31 | "last_updated": datetime.now() 32 | } 33 | 34 | for user in existing_users: 35 | # 检查用户是否已有profile 36 | if not user_profiles.find_one({"user_id": user['_id']}): 37 | profile_data = { 38 | "user_id": user['_id'], 39 | **default_profile 40 | } 41 | user_profiles.insert_one(profile_data) 42 | print(f"为用户 {user['_id']} 创建默认身份信息") 43 | 44 | print("\n初始化完成!user_profiles集合结构如下:") 45 | print({ 46 | "user_id": "用户ID (唯一)", 47 | "role": "身份 (默认'未设置')", 48 | "college": "学院 (默认'未设置')", 49 | "age": "年龄 (默认None)", 50 | "created_at": "创建时间", 51 | "last_updated": "最后更新时间" 52 | }) 53 | 54 | # 打印初始化统计信息 55 | total_profiles = user_profiles.count_documents({}) 56 | print(f"\n总计初始化了 {total_profiles} 条用户身份信息") 57 | 58 | except Exception as e: 59 | print(f"初始化用户身份信息时出错: {str(e)}") 60 | raise e 61 | 62 | 63 | def create_profile_for_new_user(user_id): 64 | """为新注册用户创建身份信息记录""" 65 | try: 66 | client = MongoClient('localhost', 27017) 67 | db = client['nankai_news_datasets'] 68 | user_profiles = db['user_profiles'] 69 | 70 | # 检查是否已存在 71 | if not user_profiles.find_one({"user_id": user_id}): 72 | profile_data = { 73 | "user_id": user_id, 74 | "role": "未设置", 75 | "college": "未设置", 76 | "age": None, 77 | "created_at": datetime.now(), 78 | "last_updated": datetime.now() 79 | } 80 | user_profiles.insert_one(profile_data) 81 | print(f"为新用户 {user_id} 创建身份信息成功") 82 | else: 83 | print(f"用户 {user_id} 的身份信息已存在") 84 | 85 | except Exception as e: 86 | print(f"创建用户身份信息时出错: {str(e)}") 87 | raise e 88 | 89 | 90 | if __name__ == "__main__": 91 | # 初始化user_profiles集合并为现有用户创建记录 92 | init_user_profiles() -------------------------------------------------------------------------------- /img-folder/19255F29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/19255F29.png -------------------------------------------------------------------------------- /img-folder/image-20241217170449292.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217170449292.png -------------------------------------------------------------------------------- /img-folder/image-20241217172152142.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217172152142.png -------------------------------------------------------------------------------- /img-folder/image-20241217173749975.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217173749975.png -------------------------------------------------------------------------------- /img-folder/image-20241217173936568.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217173936568.png -------------------------------------------------------------------------------- 
/img-folder/image-20241217174140950.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217174140950.png -------------------------------------------------------------------------------- /img-folder/image-20241217174236515.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217174236515.png -------------------------------------------------------------------------------- /img-folder/image-20241217174713138.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217174713138.png -------------------------------------------------------------------------------- /img-folder/image-20241217175037417.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217175037417.png -------------------------------------------------------------------------------- /img-folder/image-20241217175233504.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217175233504.png -------------------------------------------------------------------------------- /img-folder/image-20241217184806123.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217184806123.png -------------------------------------------------------------------------------- /img-folder/image-20241217184922146.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217184922146.png -------------------------------------------------------------------------------- /img-folder/image-20241217185008968.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217185008968.png -------------------------------------------------------------------------------- /img-folder/image-20241217185208423.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217185208423.png -------------------------------------------------------------------------------- /img-folder/image-20241217185558358.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217185558358.png -------------------------------------------------------------------------------- /img-folder/image-20241217192145331.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192145331.png -------------------------------------------------------------------------------- /img-folder/image-20241217192259748.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192259748.png -------------------------------------------------------------------------------- /img-folder/image-20241217192419633.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192419633.png -------------------------------------------------------------------------------- /img-folder/image-20241217192631920.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192631920.png -------------------------------------------------------------------------------- /img-folder/image-20241217193833186.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217193833186.png -------------------------------------------------------------------------------- /img-folder/image-20241217193922327.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217193922327.png -------------------------------------------------------------------------------- /img-folder/image-20241217194706713.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217194706713.png -------------------------------------------------------------------------------- /img-folder/image-20241217195003153.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217195003153.png -------------------------------------------------------------------------------- /img-folder/image-20241217195200393.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217195200393.png -------------------------------------------------------------------------------- /img-folder/image-20241217201947701.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217201947701.png -------------------------------------------------------------------------------- /img-folder/image-20241217202328199.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217202328199.png -------------------------------------------------------------------------------- /img-folder/image-20241217204512415.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217204512415.png -------------------------------------------------------------------------------- /img-folder/image-20241217204732261.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217204732261.png -------------------------------------------------------------------------------- /img-folder/image-20241217205153730.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217205153730.png -------------------------------------------------------------------------------- /img-folder/image-20241217205341225.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217205341225.png -------------------------------------------------------------------------------- /img-folder/image-20241217205927444.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217205927444.png -------------------------------------------------------------------------------- /img-folder/image-20241217210224984.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210224984.png -------------------------------------------------------------------------------- /img-folder/image-20241217210435856.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210435856.png -------------------------------------------------------------------------------- /img-folder/image-20241217210524271.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210524271.png -------------------------------------------------------------------------------- /img-folder/image-20241217210643902.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210643902.png -------------------------------------------------------------------------------- /img-folder/image-20241217210942244.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210942244.png -------------------------------------------------------------------------------- /img-folder/image-20241217211441465.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217211441465.png -------------------------------------------------------------------------------- /img-folder/image-20241217211645716.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217211645716.png -------------------------------------------------------------------------------- /img-folder/image-20241217211724777.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217211724777.png -------------------------------------------------------------------------------- /img-folder/image-20241217212505606.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217212505606.png -------------------------------------------------------------------------------- /img-folder/image-20241217212805264.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217212805264.png -------------------------------------------------------------------------------- /img-folder/image-20241217220109997.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220109997.png -------------------------------------------------------------------------------- /img-folder/image-20241217220232016.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220232016.png -------------------------------------------------------------------------------- /img-folder/image-20241217220410027.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220410027.png -------------------------------------------------------------------------------- /img-folder/image-20241217220710450.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220710450.png -------------------------------------------------------------------------------- /img-folder/image-20241217221306764.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221306764.png -------------------------------------------------------------------------------- /img-folder/image-20241217221434868.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221434868.png -------------------------------------------------------------------------------- /img-folder/image-20241217221619006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221619006.png -------------------------------------------------------------------------------- /img-folder/image-20241217221818883.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221818883.png -------------------------------------------------------------------------------- /img-folder/image-20241217222258357.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217222258357.png -------------------------------------------------------------------------------- /img-folder/image-20241217231353742.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217231353742.png -------------------------------------------------------------------------------- /img-folder/image-20241217231856471.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217231856471.png -------------------------------------------------------------------------------- /img-folder/image-20241217234338468.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234338468.png -------------------------------------------------------------------------------- /img-folder/image-20241217234427613.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234427613.png -------------------------------------------------------------------------------- /img-folder/image-20241217234452315.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234452315.png -------------------------------------------------------------------------------- /img-folder/image-20241217234622692.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234622692.png -------------------------------------------------------------------------------- /img-folder/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /index/ES_Index.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from elasticsearch import Elasticsearch 3 | from elasticsearch.helpers import bulk 4 | from bson import ObjectId 5 | 6 | 7 | class NewsIndexer: 8 | def __init__(self, 9 | mongo_host='localhost', 10 | mongo_port=27017, 11 | mongo_db='nankai_news', 12 | es_host='localhost', 13 | es_port=9200, 14 | index_name='nankai_news_index'): 15 | # MongoDB连接 16 | self.mongo_client = pymongo.MongoClient(mongo_host, mongo_port) 17 | self.mongo_db = self.mongo_client[mongo_db] 18 | self.news_collection = self.mongo_db['news'] 19 | 20 | # Elasticsearch连接 21 | self.es = Elasticsearch( 22 | [f'http://{es_host}:{es_port}'], 23 | basic_auth=('elastic', '123456'), # 添加身份验证 24 | timeout = 300, # 增加超时时间为30秒 25 | max_retries = 3, 26 | retry_on_timeout=True# 添加重试机制 27 | ) 28 | self.index_name = index_name 29 | 30 | def create_index(self): 31 | """创建Elasticsearch索引""" 32 | settings = { 33 | "index": { 34 | "number_of_replicas": 2, 35 | "number_of_shards": 1 36 | }, 37 | "analysis": { 38 | "analyzer": { 39 | "ik_smart_pinyin": { 40 | "type": "custom", 41 | "tokenizer": "ik_smart", 42 | "filter": ["lowercase", "pinyin_filter"] 43 | } 44 | }, 45 | "filter": { 46 | "pinyin_filter": { 47 | "type": "pinyin", 48 | "keep_full_pinyin": False, # 仅保留必要的拼音格式 49 | "keep_joined_full_pinyin": False, 50 | "keep_original": True, 51 | "limit_first_letter_length": 16, 52 | "remove_duplicated_term": True, 53 | "none_chinese_pinyin_tokenize": False # 减少非中文字符的处理 54 | } 55 | } 56 | } 57 | } 58 | 59 | mappings = { 60 | "properties": { 61 | "title": { 62 | "type": "text", 63 | "analyzer": "ik_max_word", 64 | "fields": { 65 | "pinyin": { 66 | "type": "text", 67 | "analyzer": "ik_smart_pinyin" 68 | } 69 | } 70 | }, 71 | "url": {"type": "keyword"}, 72 | "content": { 73 | "type": "text", 74 | "analyzer": "ik_max_word", 75 | "search_analyzer": "ik_smart" 76 | }, 77 | "source": {"type": "keyword"}, 78 | "date": {"type": "date", "format": "yyyy-MM-dd"} 79 | } 80 | } 81 | 82 | 83 | if self.es.indices.exists(index=self.index_name): 84 | self.es.indices.delete(index=self.index_name) 85 | 86 | self.es.indices.create( 87 | index=self.index_name, 88 | body={ 89 | "settings": settings, 90 | "mappings": mappings 91 | } 92 | ) 93 | 94 | def prepare_documents(self): 95 | """准备索引文档""" 96 | documents = [] 97 | for news_doc in self.news_collection.find(): 98 | title = news_doc.get('title', '') 99 | doc = { 100 | "_id": str(news_doc['_id']), 101 | "title": title, 102 | "url": news_doc.get('url', ''), 103 | "content": news_doc.get('content', ''), 104 | "source": news_doc.get('source', ''), 105 | "date": news_doc.get('date', ''), 106 | "suggest": { 107 | "input": [title], 108 | "weight": 10 109 | } 110 | } 111 | documents.append(doc) 112 | 113 | return documents 114 | 115 | def close(self): 116 | """关闭数据库连接""" 117 | self.mongo_client.close() 118 | 119 | 120 | def main(): 121 | indexer = NewsIndexer( 122 | mongo_host='localhost', 123 | mongo_port=27017, 124 | mongo_db='nankai_news', 125 | es_host='localhost', 126 | es_port=9200, 127 | 
index_name='nankai_news_index' 128 | ) 129 | 130 | try: 131 | print("开始创建索引...") 132 | indexer.create_index() 133 | print("索引结构创建完成") 134 | 135 | print("开始准备文档...") 136 | documents = indexer.prepare_documents() 137 | print(f"文档准备完成,共 {len(documents)} 条记录") 138 | 139 | print("开始批量索引文档...") 140 | try: 141 | success, failed = bulk( 142 | indexer.es, 143 | [ 144 | { 145 | '_index': indexer.index_name, 146 | '_id': doc['_id'], 147 | **doc 148 | } 149 | for doc in documents 150 | ], 151 | chunk_size=500, # 设置每批处理的文档数量 152 | request_timeout=300, # 设置批量请求的超时时间 153 | refresh=True 154 | ) 155 | print(f"文档索引完成,成功:{success} 条,失败:{failed} 条") 156 | except Exception as e: 157 | print(f"批量索引过程中发生错误: {str(e)}") 158 | # 记录详细错误信息 159 | import traceback 160 | print(traceback.format_exc()) 161 | finally: 162 | indexer.close() 163 | 164 | 165 | if __name__ == "__main__": 166 | main() -------------------------------------------------------------------------------- /index/creat_index.py: -------------------------------------------------------------------------------- 1 | # 基础搜索索引 2 | from whoosh.index import create_in 3 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED 4 | from jieba.analyse import ChineseAnalyzer 5 | import os 6 | from datetime import datetime 7 | from pymongo import MongoClient 8 | 9 | 10 | # 1. 连接MongoDB和获取数据 11 | def get_mongodb_data(): 12 | client = MongoClient('localhost', 27017) 13 | db = client['nankai_news_datasets'] # 替换为您的数据库名 14 | 15 | # 获取网页数据集合1 16 | collection1 = db['NEWS1'] # 第一种格式的网页数据,无快照 17 | # 获取网页数据集合2 18 | collection2 = db['NEWS'] # 第二种格式的网页数据,有快照 19 | # 获取快照数据 20 | snapshots = db['WEB_snapshot'] # 快照集合 21 | # 添加文档集合 22 | documents = db['DOCUMENTS'] # 假设文档存储在DOCUMENTS集合中 23 | # 用snapshot_hash创建快照字典,用于NEWS集合 24 | snapshot_dict = {doc['content_hash']: doc for doc in snapshots.find()} 25 | 26 | # 返回所有数据 27 | return collection1.find(), collection2.find(), snapshot_dict, documents.find() 28 | 29 | 30 | # 2. 创建索引结构 31 | def create_schema(): 32 | analyzer = ChineseAnalyzer() 33 | schema = Schema( 34 | id=ID(stored=True, unique=True), 35 | url=ID(stored=True), 36 | title=TEXT(stored=True, analyzer=analyzer), 37 | content=TEXT(stored=True, analyzer=analyzer), 38 | publish_date=DATETIME(stored=True), 39 | source=TEXT(stored=True), 40 | snapshot_hash=ID(stored=True), # 用于匹配对应的快照 41 | captured_at=DATETIME(stored=True), # 快照捕获时间 42 | 43 | # 添加文档相关字段 44 | filetype = ID(stored=True), # 文档类型(doc/docx/pdf等) 45 | filename = ID(stored=True), # 文件名 46 | upload_date = DATETIME(stored=True) # 上传时间 47 | ) 48 | return schema 49 | 50 | 51 | # 3. 
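# Illustrative sketch (hypothetical, not from the original sources): how the index
# written by initialize_index() further below could be queried. The field names
# (title, content, url) come from create_schema() above; "index_dir" is the path
# this script writes to.
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

def search_index(keyword, limit=10):
    ix = open_dir("index_dir")
    with ix.searcher() as searcher:
        query = MultifieldParser(["title", "content"], schema=ix.schema).parse(keyword)
        for hit in searcher.search(query, limit=limit):
            fields = hit.fields()
            print(fields.get("title"), fields.get("url"))
# search_index("南开大学")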
添加文档函数 52 | def add_document(writer, doc, doc_type, snapshot_dict=None): 53 | document = { 54 | 'id': str(doc['_id']), 55 | 'url': doc['url'] if 'url' in doc else None 56 | } 57 | 58 | # 处理文档类型 59 | if 'filetype' in doc: # 如果是文档 60 | document.update({ 61 | 'filetype': doc['filetype'], 62 | 'filename': doc['filename'] if 'filename' in doc else None, 63 | 'title': doc['title'] if 'title' in doc else None, 64 | 'upload_date': datetime.fromisoformat(doc['upload_date'].replace('Z', '+00:00')) if 'upload_date' in doc else None 65 | }) 66 | 67 | # 处理不同格式的数据 68 | if doc_type == 'format1': # NEWS1格式,无快照 69 | if 'title' in doc and doc['title']: 70 | document['title'] = doc['title'] 71 | if 'content' in doc: 72 | document['content'] = doc['content'] 73 | 74 | elif doc_type == 'format2': # NEWS格式,有快照 75 | if 'title' in doc and doc['title']: 76 | document['title'] = doc['title'] 77 | if 'content' in doc: 78 | document['content'] = doc['content'] 79 | if 'date' in doc: 80 | try: 81 | document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d") 82 | except: 83 | pass 84 | if 'source' in doc: 85 | document['source'] = doc['source'] 86 | 87 | # 只为NEWS格式添加快照信息 88 | if 'snapshot_hash' in doc: 89 | document['snapshot_hash'] = doc['snapshot_hash'] 90 | # 从快照集合获取捕获时间 91 | if snapshot_dict and doc['snapshot_hash'] in snapshot_dict: 92 | snapshot = snapshot_dict[doc['snapshot_hash']] 93 | if 'captured_at' in snapshot: 94 | document['captured_at'] = snapshot['captured_at'] 95 | 96 | try: 97 | writer.add_document(**document) 98 | return True 99 | except Exception as e: 100 | print(f"Error adding document {document['id']}: {str(e)}") 101 | return False 102 | 103 | 104 | # 4. 初始化索引 105 | def initialize_index(): 106 | if not os.path.exists("../index_dir"): 107 | os.mkdir("../index_dir") 108 | ix = create_in("index_dir", create_schema()) 109 | 110 | # 获取所有数据 111 | collection1_docs, collection2_docs, snapshot_dict, documents = get_mongodb_data() 112 | 113 | # 添加文档到索引 114 | with ix.writer() as writer: 115 | count = 0 116 | 117 | # 处理文档 118 | for doc in documents: 119 | if add_document(writer, doc, 'document'): 120 | count += 1 121 | if count % 1000 == 0: 122 | print(f"已处理 {count} 条数据") 123 | 124 | # 处理NEWS1的文档 125 | for doc in collection1_docs: 126 | if add_document(writer, doc, 'format1', snapshot_dict): 127 | count += 1 128 | if count % 1000 == 0: 129 | print(f"已处理 {count} 条数据") 130 | 131 | # 处理NEWS的文档 132 | for doc in collection2_docs: 133 | if add_document(writer, doc, 'format2', snapshot_dict): 134 | count += 1 135 | if count % 1000 == 0: 136 | print(f"已处理 {count} 条数据") 137 | 138 | print("索引创建完成!共处理 {} 条数据".format(count)) 139 | return ix 140 | 141 | 142 | if __name__ == "__main__": 143 | ix = initialize_index() -------------------------------------------------------------------------------- /index/creat_index00.py: -------------------------------------------------------------------------------- 1 | # 基础搜索索引 2 | from whoosh.index import create_in 3 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED 4 | from jieba.analyse import ChineseAnalyzer 5 | import os 6 | from datetime import datetime 7 | from pymongo import MongoClient 8 | 9 | 10 | # 1. 
连接MongoDB和获取数据 11 | def get_mongodb_data(): 12 | client = MongoClient('localhost', 27017) 13 | db = client['nankai_news_datasets'] # 替换为您的数据库名 14 | 15 | # 获取网页数据集合1 16 | collection1 = db['NEWS1'] # 第一种格式的网页数据 17 | # 获取网页数据集合2 18 | collection2 = db['NEWS'] # 第二种格式的网页数据 19 | # 获取快照数据 20 | snapshots = db['WEB_snapshot'] # 快照集合 21 | 22 | # 创建快照字典用于查找 23 | snapshot_dict = {doc['_id']: doc for doc in snapshots.find()} 24 | 25 | # 返回所有数据 26 | return collection1.find(), collection2.find(), snapshot_dict 27 | 28 | 29 | # 2. 创建索引结构 30 | def create_schema(): 31 | analyzer = ChineseAnalyzer() 32 | schema = Schema( 33 | id=ID(stored=True, unique=True), 34 | url=ID(stored=True), 35 | title=TEXT(stored=True, analyzer=analyzer), 36 | content=TEXT(stored=True, analyzer=analyzer), 37 | publish_date=DATETIME(stored=True), 38 | source=TEXT(stored=True), 39 | snapshot_hash=ID(stored=True), 40 | snapshot_content=STORED # 存储快照内容但不索引 41 | ) 42 | return schema 43 | 44 | 45 | # 3. 添加文档函数 46 | def add_document(writer, doc, doc_type, snapshot_dict=None): 47 | if 'filename' in doc: # 跳过文档类型 48 | return 49 | 50 | document = { 51 | 'id': str(doc['_id']), 52 | 'url': doc['url'] 53 | } 54 | 55 | # 处理不同格式的数据 56 | if doc_type == 'format1': # 第一种格式news1 57 | if 'title' in doc and doc['title']: 58 | document['title'] = doc['title'] 59 | if 'content' in doc: 60 | document['content'] = doc['content'] 61 | # 删除对crawl_time的处理 62 | 63 | elif doc_type == 'format2': # 第二种格式NEWS 64 | if 'title' in doc and doc['title']: 65 | document['title'] = doc['title'] 66 | if 'content' in doc: 67 | document['content'] = doc['content'] 68 | if 'date' in doc: 69 | try: 70 | document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d") 71 | except: 72 | pass 73 | if 'source' in doc: 74 | document['source'] = doc['source'] 75 | 76 | # 添加快照信息 77 | if snapshot_dict and str(doc['_id']) in snapshot_dict: 78 | snapshot = snapshot_dict[str(doc['_id'])] 79 | if 'snapshot_hash' in snapshot: 80 | document['snapshot_hash'] = snapshot['snapshot_hash'] 81 | if 'html_content' in snapshot: 82 | document['snapshot_content'] = snapshot['html_content'] 83 | 84 | try: 85 | writer.add_document(**document) 86 | return True 87 | except Exception as e: 88 | print(f"Error adding document {document['id']}: {str(e)}") 89 | return False 90 | 91 | 92 | # 4. 
初始化索引 93 | def initialize_index(): 94 | if not os.path.exists("../index_dir"): 95 | os.mkdir("../index_dir") 96 | ix = create_in("index_dir", create_schema()) 97 | 98 | # 获取所有数据 99 | collection1_docs, collection2_docs, snapshot_dict = get_mongodb_data() 100 | 101 | # 添加文档到索引 102 | with ix.writer() as writer: 103 | count = 0 104 | 105 | # 处理第一种格式的文档 106 | for doc in collection1_docs: 107 | if add_document(writer, doc, 'format1', snapshot_dict): 108 | count += 1 109 | if count % 1000 == 0: 110 | print(f"已处理 {count} 条数据") 111 | 112 | # 处理第二种格式的文档 113 | for doc in collection2_docs: 114 | if add_document(writer, doc, 'format2', snapshot_dict): 115 | count += 1 116 | if count % 1000 == 0: 117 | print(f"已处理 {count} 条数据") 118 | 119 | print("索引创建完成!共处理 {} 条数据".format(count)) 120 | return ix 121 | 122 | 123 | if __name__ == "__main__": 124 | ix = initialize_index() -------------------------------------------------------------------------------- /index/creat_index01.py: -------------------------------------------------------------------------------- 1 | # 包含锚文本 2 | # 基础搜索索引 3 | from whoosh.index import create_in 4 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED 5 | from jieba.analyse import ChineseAnalyzer 6 | import os 7 | from datetime import datetime 8 | from pymongo import MongoClient 9 | from bs4 import BeautifulSoup 10 | 11 | # 1. 连接MongoDB和获取数据 12 | def get_mongodb_data(): 13 | client = MongoClient('localhost', 27017) 14 | db = client['nankai_news_datasets'] # 替换为您的数据库名 15 | 16 | # 获取网页数据集合1 17 | collection1 = db['NEWS1'] # 第一种格式的网页数据,无快照 18 | # 获取网页数据集合2 19 | collection2 = db['NEWS'] # 第二种格式的网页数据,有快照 20 | # 获取快照数据 21 | snapshots = db['WEB_snapshot'] # 快照集合 22 | # 添加文档集合 23 | documents = db['DOCUMENTS'] # 假设文档存储在DOCUMENTS集合中 24 | # 用snapshot_hash创建快照字典,用于NEWS集合 25 | snapshot_dict = {doc['content_hash']: doc for doc in snapshots.find()} 26 | 27 | # 返回所有数据 28 | return collection1.find(), collection2.find(), snapshot_dict, documents.find() 29 | 30 | 31 | # 2. 创建索引结构 32 | def create_schema(): 33 | analyzer = ChineseAnalyzer() 34 | schema = Schema( 35 | id=ID(stored=True, unique=True), 36 | url=ID(stored=True), 37 | title=TEXT(stored=True, analyzer=analyzer), 38 | content=TEXT(stored=True, analyzer=analyzer), 39 | anchor_text=TEXT(stored=True, analyzer=analyzer), # 添加锚文本字段 40 | publish_date=DATETIME(stored=True), 41 | source=TEXT(stored=True), 42 | snapshot_hash=ID(stored=True), # 用于匹配对应的快照 43 | captured_at=DATETIME(stored=True), # 快照捕获时间 44 | 45 | # 添加文档相关字段 46 | filetype = ID(stored=True), # 文档类型(doc/docx/pdf等) 47 | filename = ID(stored=True), # 文件名 48 | upload_date = DATETIME(stored=True) # 上传时间 49 | ) 50 | return schema 51 | 52 | def extract_anchor_text(html_content): 53 | """从HTML内容中提取锚文本""" 54 | try: 55 | soup = BeautifulSoup(html_content, 'html.parser') 56 | anchors = soup.find_all('a') 57 | # 获取所有非空的锚文本 58 | anchor_texts = [a.get_text().strip() for a in anchors if a.get_text().strip()] 59 | return " ".join(anchor_texts) 60 | except Exception as e: 61 | print(f"Error extracting anchor text: {str(e)}") 62 | return "" 63 | # 3. 
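# Illustrative check (hypothetical, not from the original sources) of what
# extract_anchor_text() above returns; it relies only on the function defined above.
def _demo_extract_anchor_text():
    sample_html = '<p><a href="/a">南开新闻</a>正文<a href="/b">更多</a><a href="/c"> </a></p>'
    # Whitespace-only anchors are skipped, so this prints: 南开新闻 更多
    print(extract_anchor_text(sample_html))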
添加文档函数 64 | def add_document(writer, doc, doc_type, snapshot_dict=None): 65 | document = { 66 | 'id': str(doc['_id']), 67 | 'url': doc['url'] if 'url' in doc else None 68 | } 69 | 70 | # 处理文档类型 71 | if 'filetype' in doc: # 如果是文档 72 | document.update({ 73 | 'filetype': doc['filetype'], 74 | 'filename': doc['filename'] if 'filename' in doc else None, 75 | 'title': doc['title'] if 'title' in doc else None, 76 | 'upload_date': datetime.fromisoformat(doc['upload_date'].replace('Z', '+00:00')) if 'upload_date' in doc else None 77 | }) 78 | # 可能需要提取文档内容并添加到content字段 79 | # document['content'] = extract_doc_content(doc) # 需要实现文档内容提取函数 80 | 81 | # 处理不同格式的数据 82 | if doc_type == 'format1': # NEWS1格式,无快照 83 | if 'title' in doc and doc['title']: 84 | document['title'] = doc['title'] 85 | if 'content' in doc: 86 | document['content'] = doc['content'] 87 | 88 | elif doc_type == 'format2': # NEWS格式,有快照 89 | if 'title' in doc and doc['title']: 90 | document['title'] = doc['title'] 91 | if 'content' in doc: 92 | document['content'] = doc['content'] 93 | if 'date' in doc: 94 | try: 95 | document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d") 96 | except: 97 | pass 98 | if 'source' in doc: 99 | document['source'] = doc['source'] 100 | 101 | # 只为NEWS格式添加快照信息 102 | if 'snapshot_hash' in doc: 103 | document['snapshot_hash'] = doc['snapshot_hash'] 104 | # 从快照集合获取捕获时间 105 | if snapshot_dict and doc['snapshot_hash'] in snapshot_dict: 106 | snapshot = snapshot_dict[doc['snapshot_hash']] 107 | if 'captured_at' in snapshot: 108 | document['captured_at'] = snapshot['captured_at'] 109 | 110 | # 从快照的HTML内容中提取锚文本 111 | if 'html_content' in snapshot: 112 | anchor_text = extract_anchor_text(snapshot['html_content']) 113 | if anchor_text: # 如果成功提取到锚文本 114 | document['anchor_text'] = anchor_text 115 | try: 116 | writer.add_document(**document) 117 | return True 118 | except Exception as e: 119 | print(f"Error adding document {document['id']}: {str(e)}") 120 | return False 121 | 122 | 123 | # 4. 初始化索引 124 | def initialize_index(): 125 | if not os.path.exists("../index_dir"): 126 | os.mkdir("../index_dir") 127 | ix = create_in("index_dir", create_schema()) 128 | 129 | # 获取所有数据 130 | collection1_docs, collection2_docs, snapshot_dict, documents = get_mongodb_data() 131 | 132 | # 添加文档到索引 133 | with ix.writer() as writer: 134 | count = 0 135 | 136 | # 处理文档 137 | for doc in documents: 138 | if add_document(writer, doc, 'document'): 139 | count += 1 140 | if count % 1000 == 0: 141 | print(f"已处理 {count} 条数据") 142 | 143 | # 处理NEWS1的文档 144 | for doc in collection1_docs: 145 | if add_document(writer, doc, 'format1', snapshot_dict): 146 | count += 1 147 | if count % 1000 == 0: 148 | print(f"已处理 {count} 条数据") 149 | 150 | # 处理NEWS的文档 151 | for doc in collection2_docs: 152 | if add_document(writer, doc, 'format2', snapshot_dict): 153 | count += 1 154 | if count % 1000 == 0: 155 | print(f"已处理 {count} 条数据") 156 | 157 | print("索引创建完成!共处理 {} 条数据".format(count)) 158 | return ix 159 | 160 | 161 | if __name__ == "__main__": 162 | ix = initialize_index() -------------------------------------------------------------------------------- /index/creat_index_document.py: -------------------------------------------------------------------------------- 1 | # 基础搜索索引 2 | from whoosh.index import create_in 3 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED 4 | from jieba.analyse import ChineseAnalyzer 5 | import os 6 | from datetime import datetime 7 | from pymongo import MongoClient 8 | 9 | 10 | # 1. 
连接MongoDB和获取数据 11 | def get_mongodb_data(): 12 | client = MongoClient('localhost', 27017) 13 | db = client['nankai_news_datasets'] # 替换为您的数据库名 14 | 15 | # 获取网页数据集合1 16 | collection1 = db['NEWS1'] # 第一种格式的网页数据,无快照 17 | # 获取网页数据集合2 18 | collection2 = db['NEWS'] # 第二种格式的网页数据,有快照 19 | # 获取快照数据 20 | snapshots = db['WEB_snapshot'] # 快照集合 21 | # 添加文档集合 22 | documents = db['DOCUMENTS'] # 假设文档存储在DOCUMENTS集合中 23 | # 用snapshot_hash创建快照字典,用于NEWS集合 24 | snapshot_dict = {doc['content_hash']: doc for doc in snapshots.find()} 25 | 26 | # 返回所有数据 27 | return collection1.find(), collection2.find(), snapshot_dict, documents.find() 28 | 29 | 30 | # 2. 创建索引结构 31 | def create_schema(): 32 | analyzer = ChineseAnalyzer() 33 | schema = Schema( 34 | id=ID(stored=True, unique=True), 35 | url=ID(stored=True), 36 | title=TEXT(stored=True, analyzer=analyzer, phrase=True), 37 | content=TEXT(stored=True, analyzer=analyzer, phrase=True), 38 | publish_date=DATETIME(stored=True), 39 | source=TEXT(stored=True), 40 | snapshot_hash=ID(stored=True), # 用于匹配对应的快照 41 | captured_at=DATETIME(stored=True), # 快照捕获时间 42 | 43 | # 添加文档相关字段 44 | filetype = ID(stored=True), # 文档类型(doc/docx/pdf等) 45 | filename = ID(stored=True), # 文件名 46 | upload_date = DATETIME(stored=True) # 上传时间 47 | ) 48 | return schema 49 | 50 | # 3. 添加文档函数 51 | def add_document(writer, doc, doc_type, snapshot_dict=None): 52 | document = { 53 | 'id': str(doc['_id']), 54 | 'url': doc['url'] if 'url' in doc else None 55 | } 56 | 57 | # 处理文档类型 58 | if 'filetype' in doc: # 如果是文档 59 | document.update({ 60 | 'filetype': doc['filetype'], 61 | 'filename': doc['filename'] if 'filename' in doc else None, 62 | 'title': doc['title'] if 'title' in doc else None, 63 | 'upload_date': doc.get('upload_date') if 'upload_date' in doc else None 64 | }) 65 | # 打印处理后的 upload_date 66 | if 'upload_date' in doc: 67 | print(f"处理后的 upload_date: {doc.get('upload_date')}") 68 | 69 | # 处理不同格式的数据 70 | if doc_type == 'format1': # NEWS1格式,无快照 71 | if 'title' in doc and doc['title']: 72 | document['title'] = doc['title'] 73 | if 'content' in doc: 74 | document['content'] = doc['content'] 75 | 76 | elif doc_type == 'format2': # NEWS格式,有快照 77 | if 'title' in doc and doc['title']: 78 | document['title'] = doc['title'] 79 | if 'content' in doc: 80 | document['content'] = doc['content'] 81 | if 'date' in doc: 82 | try: 83 | document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d") 84 | except: 85 | pass 86 | if 'source' in doc: 87 | document['source'] = doc['source'] 88 | 89 | # 只为NEWS格式添加快照信息 90 | if 'snapshot_hash' in doc: 91 | document['snapshot_hash'] = doc['snapshot_hash'] 92 | # 从快照集合获取捕获时间 93 | if snapshot_dict and doc['snapshot_hash'] in snapshot_dict: 94 | snapshot = snapshot_dict[doc['snapshot_hash']] 95 | if 'captured_at' in snapshot: 96 | document['captured_at'] = snapshot['captured_at'] 97 | 98 | try: 99 | writer.add_document(**document) 100 | return True 101 | except Exception as e: 102 | print(f"Error adding document {document['id']}: {str(e)}") 103 | return False 104 | 105 | 106 | # 4. 
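# Illustrative sketch (hypothetical, not from the original sources): querying the
# document-aware index that initialize_index() below creates, filtered by file type.
# Field names (content, filetype, filename, url) come from create_schema() above;
# the filetype values (e.g. 'pdf') are assumed to match what clean_document.py writes.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import And, Term

def search_documents(keyword, filetype="pdf", limit=10):
    ix = open_dir("index_dir")
    with ix.searcher() as searcher:
        text_query = QueryParser("content", ix.schema).parse(keyword)
        query = And([text_query, Term("filetype", filetype)])
        for hit in searcher.search(query, limit=limit):
            fields = hit.fields()
            print(fields.get("filename"), fields.get("url"))
# search_documents("招生", filetype="pdf")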
初始化索引 107 | def initialize_index(): 108 | if not os.path.exists("index_dir"): 109 | os.mkdir("index_dir") 110 | ix = create_in("index_dir", create_schema()) 111 | 112 | # 获取所有数据 113 | collection1_docs, collection2_docs, snapshot_dict, documents = get_mongodb_data() 114 | 115 | # 添加文档到索引 116 | with ix.writer() as writer: 117 | doc_count = 0 118 | news1_count = 0 119 | news2_count = 0 120 | print("\n=== 开始处理文档集合(DOCUMENTS) ===") 121 | # 处理文档 122 | for doc in documents: 123 | if add_document(writer, doc, 'document'): 124 | doc_count += 1 125 | if doc_count % 100 == 0: 126 | print(f"已处理 {doc_count } 条数据") 127 | 128 | print("\n=== 开始处理NEWS1集合 ===") 129 | # 处理NEWS1的文档 130 | for doc in collection1_docs: 131 | if add_document(writer, doc, 'format1', snapshot_dict): 132 | news1_count += 1 133 | if news1_count % 1000 == 0: 134 | print(f"已处理 {news1_count } 条数据") 135 | 136 | print("\n=== 开始处理NEWS集合 ===") 137 | # 处理NEWS的文档 138 | for doc in collection2_docs: 139 | if add_document(writer, doc, 'format2', snapshot_dict): 140 | news2_count += 1 141 | if news2_count % 1000 == 0: 142 | print(f"已处理 {news2_count} 条数据") 143 | total_count = doc_count + news1_count + news2_count 144 | print("索引创建完成!共处理 {} 条数据".format(total_count)) 145 | return ix 146 | 147 | 148 | if __name__ == "__main__": 149 | ix = initialize_index() -------------------------------------------------------------------------------- /search/__pycache__/manager.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/search/__pycache__/manager.cpython-39.pyc -------------------------------------------------------------------------------- /search/__pycache__/personalization.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/search/__pycache__/personalization.cpython-39.pyc -------------------------------------------------------------------------------- /search/__pycache__/processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/search/__pycache__/processor.cpython-39.pyc -------------------------------------------------------------------------------- /search/manager.py: -------------------------------------------------------------------------------- 1 | # search/manager.py 2 | from whoosh.qparser import MultifieldParser, QueryParser 3 | from whoosh.query import Term, Or, Phrase, Wildcard, Regex 4 | from whoosh.highlight import ContextFragmenter, HtmlFormatter 5 | from datetime import datetime 6 | import math 7 | from bson.objectid import ObjectId # 添加这个导入 8 | from pymongo import MongoClient 9 | from whoosh.highlight import ContextFragmenter, HtmlFormatter 10 | class SearchManager: 11 | def __init__(self, searcher, results_per_page=10): 12 | self.searcher = searcher 13 | self.RESULTS_PER_PAGE = results_per_page 14 | # 定义所有支持的文档类型 15 | self.SUPPORTED_FILETYPES = ['pdf', 'doc', 'docx', 'xls', 'xlsx'] 16 | # MongoDB 连接 17 | self.client = MongoClient('localhost', 27017) 18 | self.db = self.client['nankai_news_datasets'] 19 | 20 | def _get_document_info(self, doc_str_id): 21 | """从MongoDB获取文档详细信息""" 22 | try: 23 | # 使用doc_id查询MongoDB获取文件信息 24 | doc_info = self.db.documents.find_one({'_id': ObjectId(doc_str_id)}) 25 
| if doc_info: 26 | return { 27 | 'filename': doc_info.get('filename', '未知文件名'), 28 | 'length': doc_info.get('length', 0), 29 | 'upload_date': doc_info.get('upload_date') 30 | } 31 | return None 32 | except Exception as e: 33 | print(f"获取文档信息错误: {str(e)}") 34 | return None 35 | def _get_field_config(self, search_in='all'): 36 | """获取搜索字段和权重配置""" 37 | if search_in == 'title': 38 | return {"fields": ["title"], "weights": {"title": 1.0}} 39 | elif search_in == 'content': 40 | return {"fields": ["content"], "weights": {"content": 1.0}} 41 | else: # 'all' 42 | return {"fields": ["title", "content"], "weights": {"title": 2.0, "content": 1.0}} 43 | 44 | def execute_search(self, search_type, query_text, search_in='all', sort_by='relevance', filetypes=None): 45 | """统一的搜索执行接口""" 46 | field_config = self._get_field_config(search_in) 47 | 48 | # 根据搜索类型选择查询构建方式 49 | if search_type == 'document': 50 | query = self._build_document_query(query_text, field_config, filetypes) 51 | # 执行搜索 52 | results = self.searcher.search(query, limit=None, terms=True) 53 | 54 | # 只对文档搜索结果添加文件信息 55 | for hit in results: 56 | doc_str_id = hit.get('id') 57 | if doc_str_id: 58 | doc_info = self._get_document_info(doc_str_id) 59 | if doc_info: 60 | hit['filename'] = doc_info['filename'] 61 | hit['filesize'] = doc_info['length'] 62 | hit['upload_date'] = doc_info['upload_date'] 63 | elif search_type == 'phrase': 64 | query = self._build_phrase_query(query_text, field_config) 65 | elif search_type == 'wildcard': 66 | query = self._build_wildcard_query(query_text, field_config) 67 | if query is None: 68 | # 如果查询无效,返回空结果 69 | # 如果查询无效,返回空的查询结果,但设置limit为1 70 | return self.searcher.search(Term("content", "IMPOSSIBLE_MATCH_STRING"), limit=1) 71 | else: # basic search 72 | query = self._build_basic_query(query_text, field_config) 73 | 74 | # 执行搜索 75 | if sort_by == 'date': 76 | results = self.searcher.search( 77 | query, 78 | limit=None, 79 | sortedby='publish_date', 80 | reverse=True, 81 | terms=True, 82 | ) 83 | else: 84 | results = self.searcher.search(query, 85 | limit=None, 86 | terms=True) 87 | 88 | # 设置高亮 89 | results.fragmenter = ContextFragmenter(maxchars=200, surround=50) 90 | results.formatter = HtmlFormatter(tagname="strong", classname="highlight") 91 | results.formatter.between = "..." 
92 |         return results
93 | 
94 |     def _build_basic_query(self, query_text, field_config):
95 |         parser = MultifieldParser(
96 |             field_config["fields"],
97 |             schema=self.searcher.schema,
98 |             fieldboosts=field_config["weights"]
99 |         )
100 |         return parser.parse(query_text)
101 | 
102 |     def _build_document_query(self, query_text, field_config, filetypes):
103 |         weights = field_config["weights"].copy()
104 |         weights.update({
105 |             "filename": 1.5,
106 |             "filetype": 1.0
107 |         })
108 | 
109 |         parser = MultifieldParser(
110 |             field_config["fields"] + ["filename", "filetype"],
111 |             schema=self.searcher.schema,
112 |             fieldboosts=weights
113 |         )
114 | 
115 |         base_query = parser.parse(query_text)
116 | 
117 |         # 如果用户没有选择文件类型,就使用所有支持的类型
118 |         if not filetypes:
119 |             filetypes = self.SUPPORTED_FILETYPES
120 | 
121 |         # 构建文件类型过滤器
122 |         filetype_filter = Or([Term("filetype", ft.lower()) for ft in filetypes])
123 |         return base_query & filetype_filter
124 | 
125 |     def _build_phrase_query(self, query_text, field_config):
126 |         """
127 |         构建短语查询 - 要求精确匹配完整短语
128 |         """
129 |         from whoosh.query import And, Term, Phrase
130 |         from jieba.analyse import ChineseAnalyzer
131 |         # 使用中文分析器进行分词
132 |         analyzer = ChineseAnalyzer()
133 |         terms = [token.text for token in analyzer(query_text)]
134 | 
135 |         # 如果短语只有一个词,使用 Term 查询
136 |         if len(terms) == 1:
137 |             return Or([Term(field, query_text) for field in field_config["fields"]])
138 | 
139 |         # 对每个搜索字段构建短语查询
140 |         phrase_queries = []
141 |         for field in field_config["fields"]:
142 |             # 使用 Phrase 查询,slop=0 表示词必须严格相邻
143 |             phrase_queries.append(
144 |                 Phrase(field, terms, slop=0)
145 |             )
146 | 
147 |         # 使用 Or 组合所有字段的查询
148 |         final_query = Or(phrase_queries)
149 | 
150 |         print(f"构建的短语查询: {final_query}")  # 调试输出
151 |         return final_query
152 | 
153 |     def _build_wildcard_query(self, query_text, field_config):
154 |         """
155 |         构建通配符查询:
156 |         ? - 匹配单个字符
157 |         * - 匹配零个或多个字符
158 |         """
159 |         from whoosh.query import Or, Wildcard
160 | 
161 |         def process_query(query):
162 |             # 处理中文通配符(全角转半角)
163 |             query = query.replace('？', '?')
164 |             query = query.replace('＊', '*')
165 | 
166 |             # 关键修改:确保通配符能正确匹配中文
167 |             # 如果查询以*结尾,保持原样;如果不以*结尾且包含*,在*后添加*以匹配任意字符
168 |             if '*' in query and not query.endswith('*'):
169 |                 parts = query.split('*')
170 |                 query = '*'.join(parts[:-1]) + '*' + parts[-1] + '*'
171 |             elif not '*' in query and not '?'
in query: 172 | query = query + '*' 173 | 174 | return query 175 | 176 | def validate_query(query): 177 | # 验证通配符使用是否合法 178 | if not any(char in query for char in ['?', '*']): 179 | return False 180 | # 不允许只有通配符的查询 181 | if query.strip('*?') == '': 182 | return False 183 | return True 184 | 185 | queries = [] 186 | fields = field_config["fields"] 187 | 188 | # 处理查询文本 189 | processed_query = process_query(query_text) 190 | print(f"处理后的通配符查询: {processed_query}") # 调试输出 191 | 192 | # 验证查询的合法性 193 | if not validate_query(processed_query): 194 | print(f"无效的通配符查询: {query_text}") 195 | return None 196 | 197 | # 为每个搜索字段创建通配符查询 198 | for field in fields: 199 | wildcard = Wildcard(field, processed_query) 200 | queries.append(wildcard) 201 | 202 | # 组合所有字段的查询 203 | final_query = Or(queries) if len(queries) > 1 else queries[0] 204 | print(f"最终通配符查询: {final_query}") # 调试输出 205 | return final_query -------------------------------------------------------------------------------- /search/personalization.py: -------------------------------------------------------------------------------- 1 | # search/personalization.py 2 | import math 3 | class SearchPersonalization: 4 | """搜索结果个性化处理类""" 5 | 6 | # 学院相关性映射表 7 | COLLEGE_RELATIONS = { 8 | '文学院': ['新闻与传播学院', '汉语言文化学院', '外国语学院'], 9 | '历史学院': ['文学院', '哲学院', '周恩来政府管理学院'], 10 | '物理科学学院': ['电子信息与光学工程学院', '材料科学与工程学院'], 11 | '化学学院': ['材料科学与工程学院', '生命科学学院', '医学院', '药学院'], 12 | '生命科学学院': ['化学学院', '医学院', '药学院', '环境科学与工程学院'], 13 | '计算机与网络空间安全学院': ['软件学院', '人工智能学院', '数学科学学院'], 14 | '计算机学院': ['软件学院', '人工智能学院', '数学科学学院'], # 兼容简称 15 | '网络空间安全学院': ['计算机学院', '软件学院', '数学科学学院'], # 兼容分拆名称 16 | '数学科学学院': ['统计与数据科学学院', '计算机学院', '人工智能学院'], 17 | '经济学院': ['商学院', '金融学院', '统计与数据科学学院'], 18 | '商学院': ['经济学院', '金融学院', '旅游与服务学院'], 19 | '医学院': ['生命科学学院', '药学院'], 20 | '周恩来政府管理学院': ['法学院', '马克思主义学院', '历史学院'] 21 | } 22 | 23 | def __init__(self, user_profile=None): 24 | self.user_profile = user_profile 25 | 26 | def personalize_results(self, results, sort_by='relevance'): 27 | """ 28 | 根据用户身份和排序偏好个性化搜索结果 29 | Args: 30 | results: 原始搜索结果列表 31 | sort_by: 排序方式 ('relevance' 或 'time') 32 | Returns: 33 | 调整后的搜索结果列表 34 | """ 35 | if not self.user_profile: 36 | return results # 未登录用户返回原始结果 37 | 38 | # 获取用户角色和学院信息 39 | role = self.user_profile.get('role', '未设置') 40 | college = self.user_profile.get('college', '未设置') 41 | # 获取相关学院列表 42 | related_colleges = self._get_related_colleges(college) 43 | 44 | # 将所有结果转换为(得分,hit)元组列表 45 | result_list = [] 46 | for hit in results: 47 | try: 48 | # 获取或设置基础得分 49 | base_score = hit.score if hasattr(hit, 'score') else 1.0 50 | 51 | # 安全地获取文档内容 52 | content = '' 53 | # 尝试从不同可能的字段获取内容 54 | content_fields = ['title', 'content', 'text'] 55 | for field in content_fields: 56 | if hasattr(hit, field): 57 | content += str(getattr(hit, field, '')) + ' ' 58 | elif hasattr(hit, 'get'): 59 | content += str(hit.get(field, '')) + ' ' 60 | content = content.lower() 61 | 62 | # 计算boost因子 63 | boost = self._calculate_boost(content, role, college, related_colleges) 64 | 65 | # 计算最终得分 66 | #下面这个算出来有问题,我直接用权重代表final,实验报告用这个 67 | final_score = boost*(1+0.019*base_score) 68 | print(base_score) 69 | print(boost) 70 | print("final_score的值为:", final_score) 71 | # 存储元组: (最终得分, 时间戳或默认值, 原始对象) 72 | timestamp = None 73 | if hasattr(hit, 'publish_date'): 74 | timestamp = getattr(hit, 'publish_date') 75 | elif hasattr(hit, 'get'): 76 | timestamp = hit.get('publish_date') 77 | 78 | result_list.append((final_score, timestamp, hit)) 79 | 80 | except Exception as e: 81 | print(f"处理结果时出错: {str(e)}") 82 | 
result_list.append((base_score, None, hit)) 83 | 84 | # 根据排序方式排序 85 | if sort_by == 'time': 86 | # 先按时间排序,时间相同的按分数排序 87 | sorted_results = sorted( 88 | result_list, 89 | key=lambda x: (x[1] or '', -x[0]), # 使用空字符串作为默认时间戳 90 | reverse=True 91 | ) 92 | else: 93 | # 按分数排序 94 | sorted_results = sorted( 95 | result_list, 96 | key=lambda x: x[0], # 使用最终得分排序 97 | reverse=True 98 | ) 99 | 100 | # 只返回原始对象列表 101 | return [item[2] for item in sorted_results] 102 | 103 | def _calculate_boost(self, content, role, college, related_colleges): 104 | """计算搜索结果的权重提升""" 105 | boost = 1.0 106 | boost_reasons = [] # 用于记录加分原因 107 | 108 | print(f"\n分析文档: {content[:200]}...") 109 | print(f"用户角色: {role}, 学院: {college}") 110 | 111 | # 1. 基于角色的内容提升 112 | if role == '教师': 113 | if any(tag in content.lower() for tag in ['学术', '科研', '教学', '实验室', '课题']): 114 | boost *= 1.3 115 | boost_reasons.append("教师-学术内容匹配: +30%") 116 | if any(tag in content.lower() for tag in ['教务', '师资', '课程']): 117 | boost *= 1.2 118 | boost_reasons.append("教师-教务内容匹配: +20%") 119 | elif role in ['本科生', '研究生', '博士生']: 120 | if any(tag in content.lower() for tag in ['学生', '教务', '活动', '奖学金']): 121 | boost *= 1.2 122 | boost_reasons.append("学生相关内容匹配: +20%") 123 | if any(tag in content.lower() for tag in ['就业', '实习', '竞赛', '夜跑', '社团', '活动']): 124 | boost *= 1.15 125 | boost_reasons.append("学生活动内容匹配: +15%") 126 | 127 | # 2. 学院相关性判断 128 | if college != '未设置': 129 | # 规范化处理内容和学院名称 130 | normalized_content = content.lower() 131 | normalized_college = college.lower() 132 | 133 | # 检查文档中是否包含学院名称(包括变体形式) 134 | college_variations = { 135 | '计算机与网络空间安全学院': ['计算机学院', '网安学院', '计算机与网安学院', '网络空间安全学院'], 136 | '文学院': ['文学院', '中文系', '汉语言'], 137 | '商学院': ['商学院', 'MBA', '工商管理'], 138 | '医学院': ['医学院', '附属医院', '临床医学'], 139 | '生命科学学院': ['生科院', '生命学院', '生物学院'], 140 | '物理科学学院': ['物理学院', '物理系'], 141 | '化学学院': ['化学院', '化学系'], 142 | '数学科学学院': ['数学院', '数学系'], 143 | '经济学院': ['经济系', '经济管理'] 144 | } 145 | 146 | college_matched = False 147 | # 检查完整学院名称 148 | if college.lower() in normalized_content: 149 | boost *= 1.4 150 | college_matched = True 151 | boost_reasons.append(f"完整学院名称匹配({college}): +40%") 152 | 153 | # 检查学院变体 154 | if not college_matched: 155 | variations = college_variations.get(college, []) 156 | for variation in variations: 157 | if variation.lower() in normalized_content: 158 | boost *= 1.3 159 | college_matched = True 160 | boost_reasons.append(f"学院变体名称匹配({variation}): +30%") 161 | break 162 | 163 | # 检查学院关键词 164 | if not college_matched: 165 | keywords = self._get_college_context_keywords(college) 166 | matched_keywords = [kw for kw in keywords if kw.lower() in normalized_content] 167 | if matched_keywords: 168 | keyword_boost = 1.1 + min(len(matched_keywords) * 0.05, 0.3) 169 | boost *= keyword_boost 170 | boost_reasons.append( 171 | f"学院关键词匹配({', '.join(matched_keywords)}): +{(keyword_boost - 1) * 100:.0f}%") 172 | 173 | # 检查相关学院 174 | for related_college in related_colleges: 175 | if related_college.lower() in normalized_content: 176 | boost *= 1.15 177 | boost_reasons.append(f"相关学院匹配({related_college}): +15%") 178 | break 179 | 180 | # 检查活动类型和学院组合 181 | activity_keywords = ['活动', '比赛', '夜跑', '讲座', '社团'] 182 | if any(kw in normalized_content for kw in activity_keywords): 183 | if college_matched: 184 | boost *= 1.25 185 | boost_reasons.append("本院活动加分: +25%") 186 | elif any(related in normalized_content for related in related_colleges): 187 | boost *= 1.1 188 | boost_reasons.append("相关学院活动加分: +10%") 189 | # # 添加PageRank影响 190 | # try: 191 | # # 直接获取pagerank属性 
192 | # pagerank = getattr(content, 'pagerank', 0) 193 | # if pagerank > 0: 194 | # # 使用很小的系数确保PageRank不会过度影响排序 195 | # pr_boost = 1 + 0.05 * math.log1p(pagerank) 196 | # boost *= pr_boost 197 | # boost_reasons.append(f"PageRank boost: +{((pr_boost - 1) * 100):.2f}%") 198 | # except (AttributeError, ValueError) as e: 199 | # # 如果无法获取或转换PageRank值,直接忽略 200 | # pass 201 | # 打印加分详情 202 | print("\n加分详情:") 203 | for reason in boost_reasons: 204 | print(f"- {reason}") 205 | print(f"最终权重系数: {boost:.2f}\n") 206 | 207 | return boost 208 | 209 | def _get_related_colleges(self, college): 210 | """获取与用户学院相关的其他学院列表""" 211 | if college == '未设置': 212 | return [] 213 | 214 | # 处理学院名称的不同形式 215 | college_variants = { 216 | '计算机与网络空间安全学院': ['计算机学院', '网络空间安全学院', '信息科学学院'], 217 | '计算机学院': ['计算机与网络空间安全学院', '软件学院', '信息科学学院'], 218 | '文学院': ['新闻学院', '外国语学院', '汉语言文化学院'], 219 | '物理科学学院': ['物理学院', '光学工程学院'], 220 | '化学学院': ['化学系', '材料学院'], 221 | '医学院': ['生命科学院', '药学院'], 222 | '商学院': ['经济学院', '管理学院'] 223 | } 224 | 225 | # 获取基础相关学院 226 | related = self.COLLEGE_RELATIONS.get(college, []) 227 | 228 | # 添加变体形式 229 | variants = college_variants.get(college, []) 230 | 231 | # 合并所有相关学院,去重 232 | all_related = list(set(related + variants)) 233 | 234 | return all_related 235 | 236 | def _get_college_context_keywords(self, college): 237 | """获取学院相关的上下文关键词""" 238 | COLLEGE_KEYWORDS = { 239 | '计算机与网络空间安全学院': [ 240 | # 专业术语 241 | '编程', '算法', '软件', '人工智能', '网络', 242 | '网络安全', '信息安全', '密码学', '渗透测试', 243 | # 场地 244 | '实验室', '机房', '创新实践基地', 245 | # 活动 246 | '程序设计大赛', '编程竞赛', 'ACM', '网络安全竞赛', 247 | # 学科 248 | '计算机科学', '软件工程', '网络工程', '信息安全', 249 | ], 250 | '文学院': [ 251 | # 专业术语 252 | '文学', '写作', '语言', '文化', '古籍', 253 | # 场地 254 | '图书馆', '文学社', '创作室', 255 | # 活动 256 | '诗歌朗诵', '读书会', '文学讲座', '创作比赛', 257 | # 学科 258 | '中国语言文学', '汉语言', '文艺学', '比较文学' 259 | ], 260 | '物理科学学院': [ 261 | '物理', '光学', '量子', '实验室', '力学', 262 | '电磁学', '热学', '光电', '激光' 263 | ], 264 | '化学学院': [ 265 | '化学', '分子', '实验', '材料', '有机化学', 266 | '无机化学', '分析化学', '物理化学' 267 | ], 268 | '经济学院': [ 269 | '经济', '金融', '贸易', '市场', '投资', 270 | '统计', '财务', '商业', '管理' 271 | ], 272 | '医学院': [ 273 | '医学', '临床', '病理', '解剖', '生理', 274 | '药理', '诊断', '治疗', '护理' 275 | ] 276 | } 277 | 278 | # 通用关键词 279 | base_keywords = ['科研', '实验室', '研究', '项目', '讲座', '活动'] 280 | 281 | # 获取特定学院的关键词,如果没有则使用空列表 282 | college_specific = COLLEGE_KEYWORDS.get(college, []) 283 | 284 | # 合并特定关键词和通用关键词 285 | return college_specific + base_keywords -------------------------------------------------------------------------------- /search/processor.py: -------------------------------------------------------------------------------- 1 | # search/processor.py 2 | import math 3 | 4 | class ResultProcessor: 5 | def __init__(self, results_per_page=10): 6 | self.RESULTS_PER_PAGE = results_per_page 7 | # #定义要排除的URL列表,短语查询时开启 8 | # self.EXCLUDED_URLS = [ 9 | # # 在这里添加更多需要排除的URL 10 | # ] 11 | def process_results(self, results, page=1): 12 | """处理搜索结果并应用分页""" 13 | # 过滤掉不想显示的URL 14 | # filtered_results = [hit for hit in results if hit.get('url') not in self.EXCLUDED_URLS] 15 | # total_results = len(filtered_results) 16 | #正常查询注释掉上面两句,恢复下面这一句 17 | 18 | total_results = len(results) 19 | total_pages = math.ceil(total_results / self.RESULTS_PER_PAGE) 20 | 21 | # 计算分页 22 | start_page = max(1, page - 5) 23 | end_page = min(total_pages, start_page + 9) 24 | if end_page - start_page < 9: 25 | start_page = max(1, end_page - 9) 26 | 27 | # 获取当前页的结果 28 | start_idx = (page - 1) * self.RESULTS_PER_PAGE 29 | end_idx = start_idx + self.RESULTS_PER_PAGE 
30 | page_results = results[start_idx:end_idx] 31 | # 正常查询恢复上面这句,注释下面这一句 32 | #page_results = filtered_results[start_idx:end_idx] # 这里使用filtered_results 33 | 34 | # 处理结果 35 | processed_results = [self._process_single_result(hit) for hit in page_results] 36 | 37 | return { 38 | 'results': processed_results, 39 | 'total': total_results, 40 | 'total_pages': total_pages, 41 | 'page_range': range(start_page, end_page + 1) 42 | } 43 | 44 | def _process_single_result(self, hit): 45 | """处理单个搜索结果""" 46 | # 如果是文档类型,使用特殊的处理方式,不需要处理 content 47 | if hit.get('filetype'): 48 | return { 49 | 'title': hit.get('title', '无标题'), 50 | 'filename': hit.get('filename', '未知文件名'), 51 | 'filetype': hit.get('filetype', '未知类型'), 52 | 'upload_date': hit.get('upload_date', None), 53 | 'url': hit.get('url', '#'), # 如果有文档链接的话 54 | 'snippet': None, # 文档不显示内容片段 55 | 'source': '', 56 | 'date': '', 57 | 'sort_date': '', 58 | 'snapshot_hash': None, 59 | 'snapshot_date': None 60 | } 61 | 62 | source = hit.get('source', '') 63 | date_str = source.split(' - ')[-1] if source else '' 64 | sort_date = self._process_date(date_str) 65 | 66 | # 特别处理通配符查询的结果 67 | content = hit.get('content', '') 68 | highlighted_content = hit.highlights("content") 69 | 70 | if hit.matched_terms(): # 获取匹配的词条 71 | # 将匹配的词条以及周围的文本包含在snippet中 72 | snippet = highlighted_content if highlighted_content else content[:200] 73 | else: 74 | snippet = content[:200] 75 | 76 | # 从索引中获取快照哈希值和捕获时间 77 | snapshot_hash = hit.get('snapshot_hash') # 这个字段在索引中已存储 78 | captured_at = hit.get('captured_at') # 这个字段在索引中已存储 79 | 80 | # 格式化快照捕获时间 81 | snapshot_date = None 82 | if captured_at: 83 | try: 84 | snapshot_date = captured_at.strftime('%Y/%m/%d') 85 | except: 86 | snapshot_date = None 87 | 88 | return { 89 | 'title': hit.highlights("title") or hit.get('title', '无标题'), 90 | 'url': hit.get('url', '#'), 91 | 'snippet': snippet, 92 | 'source': hit.get('source', None), 93 | 'date': hit.get('publish_date', None), 94 | 'sort_date': sort_date, 95 | 'filetype': hit.get('filetype', None), 96 | 'filename': hit.get('filename', None), 97 | 'snapshot_hash': snapshot_hash, # 这个hash用于在数据库中查找对应的快照 98 | 'snapshot_date': snapshot_date # 显示的快照日期 99 | } 100 | 101 | def _process_date(self, date_str): 102 | """处理日期格式""" 103 | if not date_str: 104 | return '' 105 | try: 106 | parts = date_str.split('-') 107 | return f"{parts[0]}-{parts[1].zfill(2)}-{parts[2].zfill(2)}" 108 | except: 109 | return '' -------------------------------------------------------------------------------- /static/css/document.css: -------------------------------------------------------------------------------- 1 | .document-result { 2 | padding: 10px; 3 | border-radius: 4px; 4 | background-color: #f8f9fa; 5 | margin-bottom: 15px; 6 | } 7 | 8 | .document-result .result-title { 9 | font-size: 16px; 10 | color: #1a0dab; 11 | text-decoration: none; 12 | display: block; 13 | margin-bottom: 8px; 14 | } 15 | 16 | .document-result .result-title:hover { 17 | text-decoration: underline; 18 | } 19 | 20 | .document-result .result-meta { 21 | font-size: 13px; 22 | color: #666; 23 | margin-bottom: 8px; 24 | } 25 | 26 | .document-result .result-url { 27 | color: #006621; 28 | text-decoration: none; 29 | } 30 | 31 | .document-info { 32 | font-size: 13px; 33 | color: #666; 34 | margin: 8px 0; 35 | line-height: 1.4; 36 | } 37 | 38 | .document-info span { 39 | margin-right: 15px; 40 | display: inline-block; 41 | } 42 | 43 | .document-info .file-type { 44 | color: #28a745; 45 | } 46 | 47 | .document-info .file-size { 48 | color: #dc3545; 49 | 
} 50 | 51 | .document-info .upload-date { 52 | color: #6c757d; 53 | } 54 | 55 | .document-info .filename { 56 | color: #0056b3; 57 | } -------------------------------------------------------------------------------- /static/css/main.css: -------------------------------------------------------------------------------- 1 | /* 其他样式保持不变 */ 2 | .logo { 3 | font-size: 72px; 4 | font-weight: bold; 5 | margin-bottom: 30px; 6 | cursor: default; 7 | } 8 | /* 修改logo样式,每个字母不同颜色 */ 9 | .logo span:nth-child(1) { color: #4285f4; } /* A */ 10 | .logo span:nth-child(2) { color: #ea4335; } /* L */ 11 | .logo span:nth-child(3) { color: #fbbc05; } /* L */ 12 | .logo span:nth-child(4) { color: #4285f4; } /* I */ 13 | .logo span:nth-child(5) { color: #34a853; } /* N */ 14 | .logo span:nth-child(6) { color: #ea4335; } /* K */ 15 | .logo span:nth-child(7) { color: #fbbc05; } /* U */ 16 | 17 | -------------------------------------------------------------------------------- /static/css/pagination.css: -------------------------------------------------------------------------------- 1 | 2 | /* 分页样式 */ 3 | .pagination { 4 | margin-top: 20px; 5 | text-align: center; 6 | font-size: 14px; 7 | } 8 | 9 | .pagination a, .pagination span { 10 | display: inline-block; 11 | padding: 8px 12px; 12 | margin: 0 4px; 13 | color: #1a0dab; 14 | text-decoration: none; 15 | border-radius: 3px; 16 | } 17 | 18 | .pagination .current-page { 19 | background-color: #f8f9fa; 20 | color: #000; 21 | font-weight: bold; 22 | } 23 | 24 | .pagination a:hover { 25 | background-color: #f8f9fa; 26 | } 27 | 28 | .page-nav { 29 | color: #1a0dab; 30 | } 31 | 32 | .page-number { 33 | color: #1a0dab; 34 | } -------------------------------------------------------------------------------- /static/css/results.css: -------------------------------------------------------------------------------- 1 | .results { 2 | max-width: 720px; 3 | margin: 0 auto; 4 | } 5 | 6 | .search-stats { 7 | color: #70757a; 8 | font-size: 14px; 9 | margin-bottom: 20px; 10 | padding: 0 20px; 11 | } 12 | 13 | /* 单个结果项 */ 14 | .result-item { 15 | max-width: 670px; 16 | margin-bottom: 25px; 17 | padding: 15px 20px; 18 | border-radius: 4px; 19 | background-color: #fff; 20 | box-shadow: 0 1px 3px rgba(0,0,0,0.1); 21 | } 22 | 23 | /* 标题链接 */ 24 | .result-title { 25 | color: #1a0dab; 26 | font-size: 18px; 27 | text-decoration: none; 28 | display: block; 29 | margin-bottom: 4px; 30 | } 31 | 32 | .result-title:hover { 33 | text-decoration: underline; 34 | } 35 | 36 | /* 标题高亮样式 */ 37 | .result-title em { 38 | font-weight: bold; 39 | font-style: normal; 40 | color: #1a0dab; 41 | background-color: transparent; 42 | text-decoration: none; 43 | } 44 | 45 | .result-title:hover em { 46 | text-decoration: underline; 47 | } 48 | 49 | /* 结果元信息区域 */ 50 | .result-meta { 51 | margin: 4px 0; 52 | font-size: 14px; 53 | color: #006621; 54 | display: flex; 55 | align-items: center; 56 | gap: 10px; 57 | } 58 | 59 | /* URL显示 */ 60 | .result-url { 61 | color: #006621; 62 | text-decoration: none; 63 | } 64 | 65 | /* 快照链接 */ 66 | .snapshot-link { 67 | color: #1a73e8; 68 | text-decoration: none; 69 | font-size: 13px; 70 | } 71 | 72 | .snapshot-link:hover { 73 | text-decoration: underline; 74 | } 75 | 76 | /* 内容摘要 */ 77 | .result-snippet { 78 | color: #3c4043; 79 | font-size: 14px; 80 | line-height: 1.58; 81 | margin: 4px 0; 82 | } 83 | 84 | /* 高亮匹配词 */ 85 | .result-snippet em { 86 | font-weight: bold; 87 | font-style: normal; 88 | background-color: #ffffd0; 89 | } 90 | 91 | /* 结果底部信息 */ 92 | .result-footer { 93 | 
margin-top: 4px; 94 | font-size: 13px; 95 | color: #70757a; 96 | } 97 | 98 | .result-source, 99 | .result-date { 100 | margin-right: 10px; 101 | } 102 | 103 | /* 无结果提示 */ 104 | .no-results { 105 | text-align: center; 106 | color: #70757a; 107 | margin-top: 40px; 108 | padding: 20px; 109 | } 110 | 111 | /* 响应式调整 */ 112 | @media (max-width: 768px) { 113 | .results { 114 | padding: 0 15px; 115 | } 116 | 117 | .result-item { 118 | padding: 12px 15px; 119 | } 120 | } -------------------------------------------------------------------------------- /static/css/search.css: -------------------------------------------------------------------------------- 1 | /* 搜索容器样式 */ 2 | .search-container { 3 | display: flex; 4 | flex-direction: column; 5 | align-items: center; 6 | margin-top: 170px; /* 调整上边距,为顶部选项栏和用户状态栏留出空间 */ 7 | padding: 20px; 8 | position: relative; /* 添加这行 */ 9 | } 10 | 11 | /* 搜索选项栏样式 */ 12 | /* 修改搜索选项栏样式 */ 13 | .search-options-bar { 14 | width: 100%; 15 | background-color: #f8f9fa; 16 | border-bottom: 1px solid #dfe1e5; 17 | padding: 10px 0; 18 | position: fixed; 19 | top: 0px; /* 调整顶部位置 */ 20 | left: 0; 21 | z-index: 100; 22 | } 23 | 24 | .options-container { 25 | max-width: 750px; /* 与搜索框宽度保持一致 */ 26 | margin: 0 auto; 27 | padding: 0 20px; 28 | display: flex; 29 | align-items: center; 30 | justify-content: center; /* 添加居中对齐 */ 31 | gap: 15px; 32 | } 33 | 34 | /* 选项区域的下拉框样式 */ 35 | .options-container select { 36 | height: 36px; 37 | padding: 0 10px; 38 | border: 1px solid #dfe1e5; 39 | border-radius: 4px; 40 | background-color: white; 41 | color: #3c4043; 42 | font-size: 14px; 43 | cursor: pointer; 44 | } 45 | 46 | /* 文档类型选项样式 */ 47 | #fileTypeOptions { 48 | display: flex; 49 | align-items: center; 50 | gap: 10px; 51 | padding: 10px; 52 | border: 1px solid #eee; 53 | border-radius: 4px; 54 | font-size: 14px; 55 | } 56 | 57 | /* 主搜索表单样式 */ 58 | .main-search-form { 59 | width: 100%; 60 | max-width: 750px; 61 | margin: 0 auto; 62 | } 63 | 64 | .main-search-input { 65 | display: flex; 66 | gap: 10px; 67 | width: 100%; 68 | align-items: center; 69 | position: relative; /* 添加这行 */ 70 | } 71 | 72 | /* 主搜索框样式 */ 73 | .main-search-input input { 74 | flex: 1; 75 | height: 44px; 76 | padding: 0 20px; 77 | font-size: 16px; 78 | border: 1px solid #dfe1e5; 79 | border-radius: 24px; 80 | outline: none; 81 | box-shadow: 0 1px 6px rgba(32,33,36,.28); 82 | } 83 | 84 | /* 搜索框悬停和焦点效果 */ 85 | .main-search-input input:hover, 86 | .main-search-input input:focus { 87 | box-shadow: 0 1px 6px rgba(32,33,36,.28); 88 | border-color: rgba(223,225,229,0); 89 | } 90 | 91 | /* 搜索按钮样式 */ 92 | .main-search-input button { 93 | height: 44px; 94 | padding: 0 30px; 95 | background-color: #1a73e8; 96 | color: white; 97 | border: none; 98 | border-radius: 24px; 99 | font-size: 16px; 100 | cursor: pointer; 101 | transition: all 0.2s; 102 | } 103 | 104 | /* 搜索按钮悬停效果 */ 105 | .main-search-input button:hover { 106 | background-color: #1557b0; 107 | box-shadow: 0 1px 2px 0 rgba(66,133,244,0.3), 108 | 0 1px 3px 1px rgba(66,133,244,0.15); 109 | } 110 | 111 | /* 搜索提示样式 */ 112 | .search-tips { 113 | max-width: 600px; 114 | margin: 15px auto 0; 115 | text-align: center; 116 | padding: 10px; 117 | background: #f5f5f5; 118 | border-radius: 4px; 119 | } 120 | 121 | .search-tip { 122 | margin: 5px 0; 123 | color: #70757a; 124 | font-size: 14px; 125 | } 126 | 127 | /* 结果统计样式 */ 128 | .search-stats { 129 | color: #70757a; 130 | font-size: 14px; 131 | margin-bottom: 20px; 132 | padding-left: 20px; 133 | } 134 | 135 | /* 响应式布局调整 */ 136 | 
@media (max-width: 768px) { 137 | .options-container { 138 | flex-direction: column; 139 | align-items: stretch; 140 | padding: 10px; 141 | } 142 | 143 | .main-search-input { 144 | flex-direction: column; 145 | gap: 10px; 146 | } 147 | 148 | .main-search-input button { 149 | width: 100%; 150 | } 151 | 152 | .search-container { 153 | margin-top: 200px; 154 | } 155 | 156 | #fileTypeOptions { 157 | flex-wrap: wrap; 158 | justify-content: center; 159 | } 160 | } -------------------------------------------------------------------------------- /static/css/search_history.css: -------------------------------------------------------------------------------- 1 | .search-history { 2 | position: absolute; 3 | top: 100%; /* 紧贴搜索框底部 */ 4 | left: 0; 5 | right: 0; 6 | background: white; 7 | border: 1px solid #e0e0e0; 8 | border-radius: 0 0 8px 8px; 9 | box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1); 10 | z-index: 9999; /* 确保在最上层 */ 11 | margin-top: 0; /* 移除间距 */ 12 | width: calc(100% - 110px); /* 减去搜索按钮的宽度和间距 */ 13 | } 14 | 15 | .search-history-header { 16 | padding: 10px 15px; 17 | border-bottom: 1px solid #e0e0e0; 18 | display: flex; 19 | justify-content: space-between; 20 | align-items: center; 21 | color: #666; 22 | } 23 | 24 | .search-history-list { 25 | list-style: none; 26 | margin: 0; 27 | padding: 0; 28 | max-height: 300px; 29 | overflow-y: auto; 30 | background: white; /* 确保背景是白色的 */ 31 | } 32 | 33 | .search-history-item { 34 | padding: 8px 15px; 35 | display: flex; 36 | justify-content: space-between; 37 | align-items: center; 38 | cursor: pointer; 39 | } 40 | 41 | .search-history-item:hover { 42 | background-color: #f5f5f5; 43 | } 44 | 45 | .search-history-query { 46 | flex-grow: 1; 47 | color: #333; 48 | } 49 | 50 | .delete-history { 51 | color: #999; 52 | padding: 4px 8px; 53 | visibility: hidden; 54 | } 55 | 56 | .search-history-item:hover .delete-history { 57 | visibility: visible; 58 | } 59 | 60 | .search-history-footer { 61 | padding: 10px 15px; 62 | border-top: 1px solid #e0e0e0; 63 | text-align: center; 64 | } 65 | 66 | .view-more { 67 | color: #1a73e8; 68 | text-decoration: none; 69 | } 70 | 71 | .clear-all { 72 | color: #666; 73 | text-decoration: none; 74 | font-size: 0.9em; 75 | } 76 | 77 | .clear-all:hover, .view-more:hover { 78 | text-decoration: underline; 79 | } -------------------------------------------------------------------------------- /static/css/search_suggestions.css: -------------------------------------------------------------------------------- 1 | .search-suggestions { 2 | position: absolute; 3 | top: 100%; 4 | left: 0; 5 | right: 0; 6 | background: white; 7 | border: 1px solid #e0e0e0; 8 | border-radius: 0 0 4px 4px; 9 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 10 | z-index: 1000; 11 | max-height: 300px; 12 | overflow-y: auto; 13 | } 14 | 15 | .suggestion-item { 16 | padding: 8px 16px; 17 | cursor: pointer; 18 | border-bottom: 1px solid #f0f0f0; 19 | white-space: nowrap; 20 | overflow: hidden; 21 | text-overflow: ellipsis; 22 | display: flex; 23 | align-items: center; 24 | gap: 8px; 25 | } 26 | 27 | .suggestion-item:last-child { 28 | border-bottom: none; 29 | } 30 | 31 | .suggestion-item:hover { 32 | background-color: #f5f5f5; 33 | } 34 | 35 | .suggestion-item .highlight { 36 | font-weight: bold; 37 | } 38 | 39 | .suggestion-item .icon { 40 | color: #666; 41 | font-size: 14px; 42 | min-width: 20px; 43 | } 44 | 45 | .suggestion-item.history { 46 | background-color: #f8f9fa; 47 | } 48 | 49 | .suggestion-item.history .icon::before { 50 | content: "⏱"; 51 | } 52 | 53 | 
.suggestion-item.title .icon::before { 54 | content: "🔍"; 55 | } 56 | 57 | /* 确保搜索联想框在历史记录之上 */ 58 | .search-suggestions { 59 | z-index: 1001; 60 | } 61 | 62 | .search-history { 63 | z-index: 1000; 64 | } 65 | 66 | /* 优化滚动条样式 */ 67 | .search-suggestions::-webkit-scrollbar { 68 | width: 6px; 69 | } 70 | 71 | .search-suggestions::-webkit-scrollbar-thumb { 72 | background-color: #ddd; 73 | border-radius: 3px; 74 | } 75 | 76 | .search-suggestions::-webkit-scrollbar-track { 77 | background-color: #f5f5f5; 78 | } -------------------------------------------------------------------------------- /static/css/user.css: -------------------------------------------------------------------------------- 1 | /* 用户相关样式 */ 2 | /* 用户部分调整到搜索选项栏下方 */ 3 | .user-section { 4 | position: fixed; 5 | top: 57px; /* 调整位置到搜索选项栏下方 */ 6 | right: 20px; 7 | display: flex; 8 | align-items: center; 9 | gap: 10px; 10 | z-index: 1001; 11 | background-color: white; 12 | padding: 5px 10px; 13 | border-radius: 4px; 14 | } 15 | 16 | .auth-buttons { 17 | display: flex; 18 | gap: 10px; 19 | } 20 | 21 | .auth-buttons button { 22 | padding: 8px 16px; 23 | border: none; 24 | border-radius: 4px; 25 | cursor: pointer; 26 | } 27 | 28 | .login-btn { 29 | background-color: #4285f4; 30 | color: white; 31 | } 32 | 33 | .register-btn { 34 | background-color: #34a853; 35 | color: white; 36 | } 37 | 38 | .user-panel { 39 | display: flex; 40 | align-items: center; 41 | gap: 10px; 42 | } 43 | 44 | .user-avatar { 45 | width: 32px; 46 | height: 32px; 47 | border-radius: 50%; 48 | background-color: #ddd; 49 | display: flex; 50 | align-items: center; 51 | justify-content: center; 52 | } 53 | 54 | .dropdown-menu { 55 | position: absolute; 56 | top: 100%; 57 | right: 0; 58 | background: white; 59 | border: 1px solid #ddd; 60 | border-radius: 4px; 61 | padding: 8px 0; 62 | display: none; 63 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 64 | } 65 | 66 | .dropdown-menu.show { 67 | display: block; 68 | } 69 | 70 | .dropdown-menu a { 71 | display: block; 72 | padding: 8px 16px; 73 | color: #333; 74 | text-decoration: none; 75 | } 76 | 77 | .dropdown-menu a:hover { 78 | background-color: #f5f5f5; 79 | } 80 | /* 登录/注册模态框样式 */ 81 | /* 修改模态框样式 */ 82 | .modal { 83 | display: none; 84 | position: fixed; 85 | top: 0; 86 | left: 0; 87 | width: 100%; 88 | height: 100%; 89 | background-color: rgba(0, 0, 0, 0.5); /* 半透明黑色背景 */ 90 | z-index: 10000; /* 确保在最上层 */ 91 | justify-content: center; 92 | align-items: center; 93 | } 94 | 95 | .modal.show { 96 | display: flex; 97 | align-items: center; 98 | justify-content: center; 99 | } 100 | 101 | .modal-content { 102 | background: white; 103 | padding: 20px; 104 | border-radius: 8px; 105 | width: 100%; 106 | max-width: 400px; 107 | } 108 | 109 | .modal-header { 110 | display: flex; 111 | justify-content: space-between; 112 | align-items: center; 113 | margin-bottom: 20px; 114 | } 115 | 116 | .close-btn { 117 | background: none; 118 | border: none; 119 | font-size: 20px; 120 | cursor: pointer; 121 | } 122 | 123 | .form-group { 124 | margin-bottom: 16px; 125 | } 126 | 127 | .form-group label { 128 | display: block; 129 | margin-bottom: 8px; 130 | } 131 | 132 | .form-group input { 133 | width: 100%; 134 | padding: 8px; 135 | border: 1px solid #ddd; 136 | border-radius: 4px; 137 | } 138 | 139 | .submit-btn { 140 | width: 100%; 141 | padding: 10px; 142 | background-color: #4285f4; 143 | color: white; 144 | border: none; 145 | border-radius: 4px; 146 | cursor: pointer; 147 | } 148 | /* Flash消息样式 */ 149 | .flash-messages { 150 | position: 
fixed; 151 | top: 60px; /* 原来是20px,调整到60px,让它在搜索选项栏下方 */ 152 | left: 50%; 153 | transform: translateX(-50%); 154 | z-index: 1002; 155 | } 156 | 157 | .flash-message { 158 | padding: 10px 20px; 159 | margin-bottom: 10px; 160 | background-color: #f8d7da; 161 | border: 1px solid #f5c6cb; 162 | border-radius: 4px; 163 | color: #721c24; 164 | text-align: center; 165 | } -------------------------------------------------------------------------------- /templates/history.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 搜索历史 - ALLINKU 6 | 7 | 86 | 87 | 88 |
89 |
90 |

搜索历史

91 |
92 | 返回搜索 93 | {% if history %} 94 | 95 | {% endif %} 96 |
97 |
98 | 99 | {% if history %} 100 |
101 | {% for item in history %} 102 |
103 | 108 |
109 | 搜索范围: {{ {'all': '全部', 'title': '标题', 'content': '内容'}[item.search_in] }} 110 | | 排序方式: {{ {'relevance': '相关度', 'date': '时间'}[item.sort_by] }} 111 | | {{ item.timestamp.strftime('%Y-%m-%d %H:%M:%S') }} 112 |
113 |
114 | {% endfor %} 115 |
116 | {% else %} 117 |
118 | 暂无搜索历史 119 |
120 | {% endif %} 121 |
122 | 123 | 137 | 138 | -------------------------------------------------------------------------------- /templates/preferences.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 个性化设置 - ALLINKU 6 | 7 | 92 | 93 | 94 |
95 |
96 |

个性化设置

97 | 返回搜索 98 |
99 | 100 | {% with messages = get_flashed_messages() %} 101 | {% if messages %} 102 | {% for message in messages %} 103 |
104 | {{ message }} 105 |
106 | {% endfor %} 107 | {% endif %} 108 | {% endwith %} 109 | 110 |
111 |
112 | 113 | 118 |
119 | 120 |
121 | 122 | 126 |
127 | 128 |
129 | 130 | 135 |
136 | 137 |
138 | 139 |
140 |
141 |
142 | 143 | -------------------------------------------------------------------------------- /templates/profile.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 个人信息 - ALLINKU 6 | 7 | 92 | 93 | 94 |
95 |
96 |

个人信息

97 | 返回搜索 98 |
99 | 100 | {% with messages = get_flashed_messages() %} 101 | {% if messages %} 102 | {% for message in messages %} 103 |
104 | {{ message }} 105 |
106 | {% endfor %} 107 | {% endif %} 108 | {% endwith %} 109 | 110 |
111 |
112 |
113 | 114 | 121 |
122 | 123 |
124 | 125 | 127 |
128 | 129 |
130 | 131 | 133 |
134 | 135 |
136 | 137 |
138 |
139 |
140 |
141 | 142 | 175 | 176 | -------------------------------------------------------------------------------- /templates/search0.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ALLINKU - 南开搜索 6 | 7 | 8 | 240 | 241 | 242 |
243 | 246 |
247 |
248 | 252 | 253 | 258 | 259 | 263 | 264 |
265 |
266 |
267 | 268 | {% if results %} 269 |
270 | {% if results %} 271 |
272 | 找到约 {{total}} 条结果 273 |
274 | 275 | {% for result in results %} 276 |
277 | {{result.title|safe}} 278 |
{{result.url}}
279 |
{{result.snippet|safe}}
280 |
281 | {% if result.source %} 282 | {{result.source}} 283 | {% endif %} 284 | {% if result.date %} 285 | {{result.date.strftime('%Y-%m-%d')}} 286 | {% endif %} 287 |
288 |
289 | {% endfor %} 290 | {# Add pagination #} 291 | {% if total_pages > 1 %} 292 | {% if current_page > 1 %} 293 | 上一页 294 | {% endif %} 295 | 296 | {% for p in page_range %} 297 | {% if p == current_page %} 298 | {{ p }} 299 | {% else %} 300 | {{ p }} 301 | {% endif %} 302 | {% endfor %} 303 | 304 | {% if current_page < total_pages %} 305 | 下一页 306 | {% endif %} 307 | {% endif %} 308 | {% else %} 309 | {% if query %} 310 |
311 | 未找到与 "{{query}}" 相关的结果 312 |
313 | {% endif %} 314 | {% endif %} 315 |
316 | {% endif %} 317 | 318 | -------------------------------------------------------------------------------- /templates/search00.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ALLINKU - 南开搜索 6 | 7 | 8 | 9 | 10 | 11 | 170 | 171 | 172 | 173 | {% with messages = get_flashed_messages() %} 174 | {% if messages %} 175 |
176 | {% for message in messages %} 177 |
{{ message }}
178 | {% endfor %} 179 |
180 | {% endif %} 181 | {% endwith %} 182 | 183 |
184 | {% if current_user.is_authenticated %} 185 |
186 |
187 | {{ current_user.username[0] }} 188 |
189 | {{ current_user.username }} 190 | 191 | 196 |
197 | {% else %} 198 |
199 | 200 | 201 |
202 | {% endif %} 203 |
204 | 205 | 206 | 225 | 226 | 227 | 254 |
255 | 258 |
259 |
260 | 264 | 265 | 270 | 271 | 275 | 276 | 277 |
278 |
279 |
280 | 281 | {% if results %} 282 |
283 |
284 | 找到约 {{total}} 条结果 285 |
286 | 287 | {% for result in results %} 288 |
289 | {{result.title|safe}} 290 |
{{result.url}}
291 |
{{result.snippet|safe}}
292 |
293 | {% if result.source %} 294 | {{result.source}} 295 | {% endif %} 296 | {% if result.date %} 297 | {{result.date.strftime('%Y-%m-%d')}} 298 | {% endif %} 299 |
300 |
301 | {% endfor %} 302 | {# Add pagination #} 303 | {% if total_pages > 1 %} 304 | {% if current_page > 1 %} 305 | 上一页 306 | {% endif %} 307 | 308 | {% for p in page_range %} 309 | {% if p == current_page %} 310 | {{ p }} 311 | {% else %} 312 | {{ p }} 313 | {% endif %} 314 | {% endfor %} 315 | 316 | {% if current_page < total_pages %} 317 | 下一页 318 | {% endif %} 319 | {% endif %} 320 | 321 |
322 | {% else %} 323 | {% if query %} 324 |
325 | 未找到与 "{{query}}" 相关的结果 326 |
327 | {% endif %} 328 | {% endif %} 329 | 330 | 362 | 363 | 364 | 392 | 393 | -------------------------------------------------------------------------------- /templates/snapshot.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ALLINKU - 网页快照 7 | 52 | 53 | 54 |
55 | 返回搜索 56 |

{{ title }}

57 |
58 |
原始网址:{{ original_url }}
59 |
快照时间:{{ captured_time }}
60 | {% if source %}
来源:{{ source }}
{% endif %} 61 |
62 |
63 |
64 | {% if content %} 65 |
66 | {{ content | safe }} 67 |
68 | {% else %} 69 |
70 | 未找到快照内容 71 |
72 | {% endif %} 73 |
74 | 75 | 99 | 100 | -------------------------------------------------------------------------------- /test_document.py: -------------------------------------------------------------------------------- 1 | from whoosh.index import open_dir 2 | from whoosh.query import Term 3 | 4 | 5 | def get_url_by_id(index_dir, doc_id): 6 | """ 7 | 通过文档ID查询对应的URL 8 | 9 | Args: 10 | index_dir (str): 索引目录的路径 11 | doc_id (str): 要查询的文档ID 12 | 13 | Returns: 14 | str: 文档的URL,如果未找到则返回None 15 | """ 16 | try: 17 | # 打开索引目录 18 | ix = open_dir(index_dir) 19 | 20 | # 创建搜索器 21 | with ix.searcher() as searcher: 22 | # 使用Term查询 23 | query = Term("id", str(doc_id)) 24 | results = searcher.search(query) 25 | 26 | if len(results) > 0: 27 | # 获取URL 28 | url = results[0].get('url') 29 | if url: 30 | print(f"文档 ID {doc_id} 的URL是: {url}") 31 | else: 32 | print(f"文档 ID {doc_id} 没有URL信息") 33 | return url 34 | else: 35 | print(f"未找到ID为 {doc_id} 的文档") 36 | return None 37 | 38 | except Exception as e: 39 | print(f"查询过程中发生错误: {str(e)}") 40 | return None 41 | 42 | 43 | if __name__ == "__main__": 44 | # 使用示例 45 | index_dir = "index_dir" # 索引目录路径 46 | doc_id = "675bfc1fed10fa8630043272" # 替换为要查询的文档ID 47 | 48 | # 查询URL 49 | url = get_url_by_id(index_dir, doc_id) -------------------------------------------------------------------------------- /test_html.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | def test_specific_hash(snapshot_hash): 3 | try: 4 | client = MongoClient('localhost', 27017) 5 | db = client['nankai_news_datasets'] 6 | 7 | print(f"\n测试特定hash: {snapshot_hash}") 8 | 9 | # 在数据库中查找快照 10 | snapshot = db.WEB_snapshot.find_one({'content_hash': snapshot_hash}) 11 | if snapshot: 12 | print("\n1. 找到快照:") 13 | print(f"- html_content 长度: {len(snapshot.get('html_content', ''))}") 14 | print(f"- captured_at: {snapshot.get('captured_at')}") 15 | else: 16 | print("\n1. 未找到快照") 17 | 18 | except Exception as e: 19 | print(f"\n错误: {str(e)}") 20 | finally: 21 | client.close() 22 | 23 | 24 | # 测试特定hash 25 | test_specific_hash("ee985e251e6d522d52f10c17d2d283b5") -------------------------------------------------------------------------------- /test_wildcard.py: -------------------------------------------------------------------------------- 1 | from whoosh.index import open_dir 2 | from whoosh.qparser import QueryParser, WildcardPlugin 3 | from whoosh.query import Wildcard 4 | import jieba 5 | 6 | # 打开索引 7 | ix = open_dir("index_dir") 8 | 9 | # 测试函数 10 | def test_wildcard_patterns(): 11 | with ix.searcher() as searcher: 12 | # 测试 ? 
和 * 的不同情况 13 | test_cases = [ 14 | "计?", # 应该匹配:"计算"、"计划"等 15 | "计算*", # 应该匹配:"计算机"、"计算方法"等 16 | "计*", # 应该匹配所有以"计"开头的词 17 | "南开*" # 应该匹配所有以"南开"开头的词 18 | ] 19 | 20 | for test_query in test_cases: 21 | print(f"\n测试查询: {test_query}") 22 | 23 | # 先检查索引中包含的terms 24 | prefix = test_query.replace('?', '').replace('*', '') 25 | print(f"索引中包含'{prefix}'开头的terms:") 26 | matching_terms = [] 27 | for term in searcher.reader().lexicon("content"): 28 | try: 29 | decoded_term = term.decode('utf-8') 30 | if decoded_term.startswith(prefix): 31 | matching_terms.append(decoded_term) 32 | except UnicodeDecodeError: 33 | continue 34 | print(f"匹配的terms: {matching_terms[:10]}") # 只显示前10个 35 | 36 | # 执行查询 37 | from whoosh.query import Wildcard 38 | query = Wildcard("content", test_query) 39 | results = searcher.search(query, limit=5) 40 | 41 | print(f"查询结果数量: {len(results)}") 42 | for hit in results: 43 | print(f"- 标题: {hit['title']}") 44 | print(f" 匹配内容: {hit.highlights('content', top=1)}") 45 | 46 | if __name__ == "__main__": 47 | test_wildcard_patterns() -------------------------------------------------------------------------------- /说明文档.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/说明文档.pdf --------------------------------------------------------------------------------