├── README.md
├── Spider
    ├── PR_calculation.py
    ├── downloadlink.py
    ├── htmonly.py
    ├── htmonly_pagerank.py
    ├── mutispider.py
    └── mutispider_pagerank.py
├── app.py
├── data_clean
    ├── clean_document.py
    ├── load_data.py
    ├── merge_data.py
    ├── news1_clean_ distinct.py
    ├── news_clean_distinct.py
    └── news_clean_frame.py
├── datasets_and_logs
    └── 2024_11_29_23_45_00_log.txt
├── db_init
    ├── init_db.py
    ├── init_db_new.py
    └── init_user_profiles.py
├── img-folder
    ├── 19255F29.png
    ├── image-20241217170449292.png
    ├── image-20241217172152142.png
    ├── image-20241217173749975.png
    ├── image-20241217173936568.png
    ├── image-20241217174140950.png
    ├── image-20241217174236515.png
    ├── image-20241217174713138.png
    ├── image-20241217175037417.png
    ├── image-20241217175233504.png
    ├── image-20241217184806123.png
    ├── image-20241217184922146.png
    ├── image-20241217185008968.png
    ├── image-20241217185208423.png
    ├── image-20241217185558358.png
    ├── image-20241217192145331.png
    ├── image-20241217192259748.png
    ├── image-20241217192419633.png
    ├── image-20241217192631920.png
    ├── image-20241217193833186.png
    ├── image-20241217193922327.png
    ├── image-20241217194706713.png
    ├── image-20241217195003153.png
    ├── image-20241217195200393.png
    ├── image-20241217201947701.png
    ├── image-20241217202328199.png
    ├── image-20241217204512415.png
    ├── image-20241217204732261.png
    ├── image-20241217205153730.png
    ├── image-20241217205341225.png
    ├── image-20241217205927444.png
    ├── image-20241217210224984.png
    ├── image-20241217210435856.png
    ├── image-20241217210524271.png
    ├── image-20241217210643902.png
    ├── image-20241217210942244.png
    ├── image-20241217211441465.png
    ├── image-20241217211645716.png
    ├── image-20241217211724777.png
    ├── image-20241217212505606.png
    ├── image-20241217212805264.png
    ├── image-20241217220109997.png
    ├── image-20241217220232016.png
    ├── image-20241217220410027.png
    ├── image-20241217220710450.png
    ├── image-20241217221306764.png
    ├── image-20241217221434868.png
    ├── image-20241217221619006.png
    ├── image-20241217221818883.png
    ├── image-20241217222258357.png
    ├── image-20241217231353742.png
    ├── image-20241217231856471.png
    ├── image-20241217234338468.png
    ├── image-20241217234427613.png
    ├── image-20241217234452315.png
    ├── image-20241217234622692.png
    └── readme.md
├── index
    ├── ES_Index.py
    ├── creat_index.py
    ├── creat_index00.py
    ├── creat_index01.py
    └── creat_index_document.py
├── search
    ├── __pycache__
    │   ├── manager.cpython-39.pyc
    │   ├── personalization.cpython-39.pyc
    │   └── processor.cpython-39.pyc
    ├── manager.py
    ├── personalization.py
    └── processor.py
├── static
    └── css
    │   ├── document.css
    │   ├── main.css
    │   ├── pagination.css
    │   ├── results.css
    │   ├── search.css
    │   ├── search_history.css
    │   ├── search_suggestions.css
    │   └── user.css
├── templates
    ├── history.html
    ├── preferences.html
    ├── profile.html
    ├── search.html
    ├── search0.html
    ├── search00.html
    └── snapshot.html
├── test_document.py
├── test_html.py
├── test_wildcard.py
└── 说明文档.pdf


/Spider/PR_calculation.py:
--------------------------------------------------------------------------------
  1 | from pymongo import MongoClient
  2 | import networkx as nx
  3 | import numpy as np
  4 | from scipy import sparse
  5 | from numba import jit
  6 | from tqdm import tqdm
  7 | import pandas as pd
  8 | from urllib.parse import urlparse
  9 | from concurrent.futures import ThreadPoolExecutor
 10 | import multiprocessing
 11 | import time
 12 | from pymongo.operations import UpdateOne
 13 | from pymongo.errors import BulkWriteError
 14 | 
 15 | class OptimizedPageRankCalculator:
 16 |     def __init__(self, damping_factor=0.85, tolerance=1e-6, max_iter=100):
 17 |         self.damping_factor = damping_factor
 18 |         self.tolerance = tolerance
 19 |         self.max_iter = max_iter
 20 | 
 21 |         print("初始化数据库连接...")
 22 |         self.client = MongoClient('mongodb://localhost:27017/')
 23 |         self.db = self.client['nankai_news_datasets']
 24 |         self.collection = self.db['NEWS']
 25 |         self.n_jobs = multiprocessing.cpu_count()
 26 |         print(f"将使用 {self.n_jobs} 个CPU核心进行计算")
 27 | 
 28 |     def build_sparse_matrix(self, urls):
 29 |         """构建优化的稀疏矩阵"""
 30 |         start_time = time.time()
 31 |         n = len(urls)
 32 |         print(f"\n第1步/3: 构建稀疏矩阵 (总计 {n} 个URL)")
 33 | 
 34 |         # URL映射
 35 |         print("创建URL索引映射...")
 36 |         url_to_idx = {url: idx for idx, url in enumerate(urls)}
 37 | 
 38 |         # 并行处理URL
 39 |         chunk_size = max(1000, n // self.n_jobs)
 40 |         edges = []
 41 | 
 42 |         print("并行构建边关系...")
 43 |         with tqdm(total=n) as pbar:
 44 |             for i in range(0, n, chunk_size):
 45 |                 chunk = urls[i:i + chunk_size]
 46 |                 for url in chunk:
 47 |                     parsed = urlparse(url)
 48 |                     path = parsed.path.split('/')
 49 |                     if len(path) > 2:
 50 |                         base = '/'.join(path[:-1])
 51 |                         edges.extend([
 52 |                             (url_to_idx[url], url_to_idx[other])
 53 |                             for other in urls[i:i + chunk_size]
 54 |                             if other != url and other.startswith(f"{parsed.scheme}://{parsed.netloc}{base}")
 55 |                         ])
 56 |                 pbar.update(len(chunk))
 57 | 
 58 |         # 构建矩阵
 59 |         print("构建最终矩阵...")
 60 |         # 构建稀疏矩阵
 61 |         if edges:
 62 |             rows, cols = zip(*edges)
 63 |             data = np.ones(len(rows))
 64 |             matrix = sparse.csr_matrix((data, (rows, cols)), shape=(n, n))
 65 |         else:
 66 |             matrix = sparse.csr_matrix((n, n))
 67 | 
 68 |         # 标准化矩阵，避免孤立节点
 69 |         out_degrees = np.array(matrix.sum(axis=1)).flatten()
 70 |         out_degrees[out_degrees == 0] = 1  # 避免除以零
 71 |         matrix = sparse.diags(1 / out_degrees) @ matrix
 72 | 
 73 |         elapsed = time.time() - start_time
 74 |         print(f"矩阵构建完成! 用时: {elapsed:.2f}秒")
 75 |         return matrix, url_to_idx
 76 | 
 77 |     @staticmethod
 78 |     @jit(nopython=True)
 79 |     def _power_iteration(matrix_data, matrix_indices, matrix_indptr, damping, n, max_iter, tolerance):
 80 |         """使用numba加速的幂迭代"""
 81 |         scores = np.full(n, 1.0 / n)  # 初始化为均匀分布
 82 |         teleport = (1 - damping) / n
 83 | 
 84 |         for iter_num in range(max_iter):
 85 |             prev_scores = scores.copy()
 86 |             new_scores = np.zeros(n)
 87 | 
 88 |             for i in range(n):
 89 |                 for j in range(matrix_indptr[i], matrix_indptr[i + 1]):
 90 |                     col = matrix_indices[j]
 91 |                     val = matrix_data[j]
 92 |                     new_scores[col] += val * prev_scores[i]
 93 | 
 94 |             scores = teleport + damping * new_scores
 95 |             diff = np.abs(scores - prev_scores).sum()
 96 | 
 97 |             if diff < tolerance:
 98 |                 break
 99 | 
100 |         return scores, iter_num + 1
101 | 
102 |     def calculate_pagerank(self):
103 |         """计算PageRank"""
104 |         # 获取所有URL
105 |         print("\n开始PageRank计算...")
106 |         start_time = time.time()
107 | 
108 |         print("第2步/3: 从数据库加载URL...")
109 |         urls = [doc['url'] for doc in self.collection.find({}, {'url': 1, '_id': 0})]
110 |         n = len(urls)
111 |         print(f"加载完成，共 {n} 个URL")
112 | 
113 |         # 构建矩阵
114 |         matrix, url_to_idx = self.build_sparse_matrix(urls)
115 | 
116 |         # 计算PageRank
117 |         print("\n第3步/3: 迭代计算PageRank...")
118 |         scores, iterations = self._power_iteration(
119 |             matrix.data, matrix.indices, matrix.indptr,
120 |             self.damping_factor, len(urls), self.max_iter, self.tolerance
121 |         )
122 | 
123 |         # 构建结果
124 |         print("整理计算结果...")
125 |         idx_to_url = {v: k for k, v in url_to_idx.items()}
126 |         df = pd.DataFrame({
127 |             'url': [idx_to_url[i] for i in range(len(scores))],
128 |             'pagerank': scores
129 |         })
130 |         df = df.sort_values('pagerank', ascending=False)
131 | 
132 |         total_time = time.time() - start_time
133 |         print(f"\n计算完成! 总用时: {total_time:.2f}秒")
134 |         print(f"迭代次数: {iterations}")
135 |         return df
136 | 
137 |     def preview_results(self, df):
138 |         """预览结果"""
139 |         print("\n结果预览:")
140 |         print("\n最高PageRank值的5个页面:")
141 |         for _, row in df.head().iterrows():
142 |             print(f"PageRank: {row['pagerank']:.6e} | URL: {row['url']}")
143 | 
144 |         print("\n最低PageRank值的5个页面:")
145 |         for _, row in df.tail().iterrows():
146 |             print(f"PageRank: {row['pagerank']:.6e} | URL: {row['url']}")
147 | 
148 |         stats = df['pagerank'].describe()
149 |         print(f"\n统计信息:")
150 |         print(f"平均值: {stats['mean']:.6e}")
151 |         print(f"标准差: {stats['std']:.6e}")
152 |         print(f"最小值: {stats['min']:.6e}")
153 |         print(f"最大值: {stats['max']:.6e}")
154 | 
155 |         return input("\n要更新数据库吗？(yes/no): ").lower().strip() == 'yes'
156 | 
157 |     def update_mongodb(self, df):
158 |         """更新数据库"""
159 |         print("\n开始更新数据库...")
160 |         batch_size = 1000
161 |         total = len(df)
162 |         updated = 0
163 | 
164 |         with tqdm(total=total, desc="更新进度") as pbar:
165 |             for i in range(0, total, batch_size):
166 |                 batch = df.iloc[i:i + batch_size]
167 |                 operations = []
168 | 
169 |                 for _, row in batch.iterrows():
170 |                     operations.append(
171 |                         UpdateOne(
172 |                             {'url': row['url']},
173 |                             {'$set': {'pagerank': float(row['pagerank'])}},
174 |                             upsert=False
175 |                         )
176 |                     )
177 | 
178 |                 if operations:
179 |                     try:
180 |                         result = self.collection.bulk_write(operations)
181 |                         updated += result.modified_count
182 |                     except BulkWriteError as bwe:
183 |                         print(f"批量写入错误: {bwe.details}")
184 |                         raise
185 |                 pbar.update(len(batch))
186 | 
187 |         print(f"更新完成! 更新了 {updated} 条记录")
188 | 
189 |     def run(self):
190 |         """运行主流程"""
191 |         try:
192 |             df = self.calculate_pagerank()
193 |             if self.preview_results(df):
194 |                 self.update_mongodb(df)
195 |                 print("\n所有操作已完成!")
196 |             else:
197 |                 print("\n操作已取消，数据库未更新")
198 |         except Exception as e:
199 |             print(f"发生错误: {str(e)}")
200 | 
201 | 
202 | if __name__ == "__main__":
203 |     calculator = OptimizedPageRankCalculator()
204 |     calculator.run()


--------------------------------------------------------------------------------
/Spider/downloadlink.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import re
  4 | import time
  5 | import requests
  6 | from bs4 import BeautifulSoup
  7 | from datetime import datetime
  8 | import logging
  9 | 
 10 | # 配置日志
 11 | log_filename = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_log.txt"  # 日志文件名
 12 | logging.basicConfig(
 13 |     level=logging.INFO,  # 设置日志级别为INFO，输出INFO及以上级别的日志
 14 |     format="%(asctime)s - %(levelname)s - %(message)s",  # 日志格式
 15 |     handlers=[
 16 |         logging.StreamHandler(),  # 输出到控制台
 17 |         logging.FileHandler(log_filename, mode="w", encoding="utf-8")  # 输出到日志文件
 18 |     ]
 19 | )
 20 | 
 21 | # 设置头信息，防止反爬虫
 22 | headers_parameters = {
 23 |     'Connection': 'Keep-Alive',
 24 |     'Accept': 'text/html',
 25 |     'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
 26 |     'Accept-Encoding': 'gzip, deflate',
 27 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 28 | }
 29 | 
 30 | # 下载文档后缀列表
 31 | download_suffix_list = [
 32 |     "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",  # 常见文档格式
 33 |     "mp3", "mp4", "avi", "mkv", "mov", "wmv", "flv",  # 音频和视频格式
 34 |     "zip", "rar", "tar", "gz", "bz2", "7z",  # 压缩文件格式
 35 |     "jpg", "jpeg", "png", "gif", "bmp", "tiff",  # 图片格式
 36 |     "exe", "apk", "dmg",  # 可执行文件和应用程序
 37 |     "csv", "txt", "rtf",  # 文本文件
 38 |     "xls", "xlsx",  # 表格文件
 39 | ]
 40 | 
 41 | # 获取网页内容
 42 | def get_html(url):
 43 |     print(url)
 44 |     try:
 45 |         response = requests.get(url, timeout=crawl_timeout, headers=headers_parameters, allow_redirects=False)
 46 |         response.encoding = response.apparent_encoding
 47 |     except Exception as e:
 48 |         print(e)
 49 |         return ""
 50 |     return response.text
 51 | 
 52 | # 获取网页中的所有链接
 53 | def get_expand_urls(bs, url,download_id_counter):
 54 |     urls_expand = []
 55 |     for item in bs.find_all("a"):  # 当前网页html的所有a标签
 56 |         href = item.get("href")
 57 |         if href is None:
 58 |             continue
 59 |         href = str(href)
 60 |         index = href.find("#")  # 去除#跳转
 61 |         if index != -1:
 62 |             href = href[:index]
 63 |         if href.find("javascript") != -1 or href.find("download") != -1:
 64 |             continue
 65 |         if len(href) < 1 or href == '/':
 66 |             continue
 67 |         if href.find("http") == -1:
 68 |             if href[0] != '/':
 69 |                 href = '/' + href
 70 |             else:
 71 |                 if href[0] == '.' and href[1] == '/':
 72 |                     href = href[1:]
 73 |             if url[-1] == '/':  # 去除url尾部的'/'（如果有）
 74 |                 url = url[:-1]
 75 |             href = url + href
 76 |         else:  # 对于绝对地址，直接添加
 77 |             index_of_end_of_domain = href.find('/', href.find("//") + 2)
 78 |             index_of_nankai_str = href.find("nankai")
 79 |             if index_of_nankai_str == -1 or index_of_nankai_str > index_of_end_of_domain:
 80 |                 continue
 81 |         if href.find("less.nankai.edu.cn/public") != -1 or href.find("weekly.nankai.edu.cn/oldrelease.php") != -1:
 82 |             continue
 83 |         # 如果是下载链接
 84 |         index_suffix = href.rfind(".")
 85 |         if href[index_suffix + 1:] in download_suffix_list:  # 如果是下载地址
 86 |             # 可能从<a>标签获取标题或者描述
 87 |             file_title = item.get_text().strip()  # 链接文本作为标题
 88 |             if not file_title:
 89 |                 file_title = "Unknown Title"  # 如果没有链接文本，设为默认标题
 90 |             # # 打印下载链接信息，包括序号
 91 |             # download_id = len(urls_taken) + 1  # 为每个链接分配一个唯一的序号
 92 |             # 打印下载链接信息，包括序号
 93 |             download_id = download_id_counter[0]  # 获取当前的下载 ID
 94 |             download_id_counter[0] += 1  # 更新 ID 计数器
 95 |             logging.info(f"[{download_id}]Download link found: {href}, Title: {file_title}")
 96 |             # 获取文件类型
 97 |             file_type = href.split('.')[-1] if '.' in href else 'unknown'
 98 | 
 99 |             # 保存下载链接信息
100 |             download_info = {
101 |                 "url": href,
102 |                 "title": file_title,
103 |                 "file_type": file_type,
104 |                 "file_name": href.split("/")[-1],
105 |                 "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
106 |             }
107 |             # with open(os.path.join(dirname, f"download_{download_id}.json"), 'w', encoding="utf-8") as file:
108 |             #     json.dump(download_info, file, ensure_ascii=False)
109 |             # continue
110 | 
111 |             # 保存每个下载链接为单独的JSON文件
112 |             json_file_name = f"download_{download_id}.json"
113 |             with open(os.path.join(dirname, json_file_name), 'w', encoding="utf-8") as file:
114 |                 json.dump(download_info, file, ensure_ascii=False)
115 | 
116 |             # 这里不用继续执行，也没有必要保存其他链接的信息
117 |             continue  # 一旦保存该链接的json，继续检查下一个链接
118 | 
119 |         urls_expand.append(href)
120 |     # 如果没有扩展链接，返回空列表而不是 None
121 |     return urls_expand if urls_expand else []
122 | 
123 | # 保存下载链接到文件
124 | def save_download_links(download_links):
125 |     filename = "download_links.json"
126 |     if os.path.exists(filename):
127 |         with open(filename, 'r', encoding="utf-8") as file:
128 |             all_links = json.load(file)
129 |     else:
130 |         all_links = []
131 | 
132 |     all_links.extend(download_links)
133 | 
134 |     with open(filename, 'w', encoding="utf-8") as file:
135 |         json.dump(all_links, file, ensure_ascii=False, indent=4)
136 |     logging.info(f"Saved {len(download_links)} download links.")
137 | 
138 | # 迭代爬虫
139 | def crawl_loop(i, url_count,  download_link_count, urls_target, urls_taken,download_id_counter, max_crawl_count):
140 |     # 如果已经达到最大深度、迭代次数，或者达到了最大爬取数量，停止爬虫
141 |     if i == 0:
142 |         logging.info("Crawl finished!")
143 |         logging.info(f"Total URLs crawled: {url_count}")
144 |         logging.info(f"Total download links found: {download_link_count}")
145 |         return
146 | 
147 |     urls_expand = []
148 |     download_links = []
149 | 
150 |     for url in urls_target:
151 |         html = get_html(url)
152 |         bs = BeautifulSoup(html, "html.parser")
153 |         for url_expand in get_expand_urls(bs, url,download_id_counter):
154 |             if url_expand not in urls_taken:
155 |                 html_expand = get_html(url_expand)
156 |                 bs_expand = BeautifulSoup(html_expand, "html.parser")
157 |                 url_count += 1
158 |                 new_links = get_expand_urls(bs_expand, url_expand,download_id_counter)
159 |                 if new_links is None:
160 |                     continue  # 如果返回 None，则跳过当前循环
161 | 
162 |                 download_links.extend(new_links)
163 |                 download_link_count += len(new_links)
164 |                 urls_expand.append(url_expand)
165 |                 # 添加到已爬取集合中
166 |                 for new_url in new_links:
167 |                     urls_taken.add(new_url)
168 |                 #logging.info(f"Total crawled pages: {url_count} - Total download links: {download_link_count}")
169 |         if url_count >= max_crawl_count:  # 如果达到最大爬取数量，跳出外层循环
170 |             break
171 | 
172 |         # 保存下载链接
173 |     save_download_links(download_links)
174 |     # 递归调用 crawl_loop，继续爬取
175 |     return crawl_loop(i - 1, url_count, download_link_count, urls_expand, urls_taken,download_id_counter, max_crawl_count)
176 | 
177 | # 爬虫设置和初始化
178 | download_id_counter = [1]  # 初始化下载 ID 计数器
179 | dirname = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")  # 目录名称
180 | os.mkdir(dirname)
181 | crawl_timeout = 1  # 爬虫连接超时时间
182 | crawl_iteration_times = 8  # 爬虫迭代次数
183 | html_index = 0  # 网页索引
184 | url_count = 0  # 总爬取网页数量
185 | urls_target = []  # 爬虫目标网址
186 | #urls_taken = []  # 已访问的网址
187 | urls_taken = set()  # 使用集合来避免重复
188 | urls_invalid = []  # 无效的网址
189 | max_crawl_count = 30000  # 设定最大爬取数量
190 | # 从目标网址文件加载目标网址
191 | with open("default_urls_download.json") as file:
192 |     urls_target = json.load(file)
193 | 
194 | # 执行爬虫
195 | crawl_loop(crawl_iteration_times, url_count, 0, urls_target, urls_taken,download_id_counter, max_crawl_count)


--------------------------------------------------------------------------------
/Spider/htmonly.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import re
  4 | import time
  5 | import requests
  6 | from bs4 import BeautifulSoup
  7 | from datetime import datetime
  8 | import logging
  9 | 
 10 | # 配置日志
 11 | log_filename = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_log.txt"  # 日志文件名
 12 | logging.basicConfig(
 13 |     level=logging.INFO,  # 设置日志级别为INFO，输出INFO及以上级别的日志
 14 |     format="%(asctime)s - %(levelname)s - %(message)s",  # 日志格式
 15 |     handlers=[
 16 |         logging.StreamHandler(),  # 输出到控制台
 17 |         logging.FileHandler(log_filename, mode="w", encoding="utf-8")  # 输出到日志文件
 18 |     ]
 19 | )
 20 | 
 21 | # 设置头信息，防止反爬虫
 22 | headers_parameters = {
 23 |     'Connection': 'Keep-Alive',
 24 |     'Accept': 'text/html',
 25 |     'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
 26 |     'Accept-Encoding': 'gzip, deflate',
 27 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 28 | }
 29 | 
 30 | # 下载文档后缀列表
 31 | download_suffix_list = [
 32 |     "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",  # 常见文档格式
 33 |     "mp3", "mp4", "avi", "mkv", "mov", "wmv", "flv",  # 音频和视频格式
 34 |     "zip", "rar", "tar", "gz", "bz2", "7z",  # 压缩文件格式
 35 |     "jpg", "jpeg", "png", "gif", "bmp", "tiff",  # 图片格式
 36 |     "exe", "apk", "dmg",  # 可执行文件和应用程序
 37 |     "csv", "txt", "rtf",  # 文本文件
 38 |     "xls", "xlsx",  # 表格文件
 39 | ]
 40 | 
 41 | # 获取网页内容
 42 | def get_html(url):
 43 |     print(url)
 44 |     try:
 45 |         response = requests.get(url, timeout=crawl_timeout, headers=headers_parameters, allow_redirects=False)
 46 |         response.encoding = response.apparent_encoding
 47 |     except Exception as e:
 48 |         print(e)
 49 |         return ""
 50 |     return response.text
 51 | 
 52 | # 获取网页中的所有链接
 53 | def get_expand_urls(bs, url):
 54 |     urls_expand = []
 55 |     for item in bs.find_all("a"):  # 当前网页html的所有a标签
 56 |         href = item.get("href")
 57 |         if href is None:
 58 |             continue
 59 |         href = str(href)
 60 |         index = href.find("#")  # 去除#跳转
 61 |         if index != -1:
 62 |             href = href[:index]
 63 |         if href.find("javascript") != -1 or href.find("download") != -1:
 64 |             continue
 65 |         if len(href) < 1 or href == '/':
 66 |             continue
 67 |         if href.find("http") == -1:
 68 |             if href[0] != '/':
 69 |                 href = '/' + href
 70 |             else:
 71 |                 if href[0] == '.' and href[1] == '/':
 72 |                     href = href[1:]
 73 |             if url[-1] == '/':  # 去除url尾部的'/'（如果有）
 74 |                 url = url[:-1]
 75 |             href = url + href
 76 |         else:  # 对于绝对地址，直接添加
 77 |             index_of_end_of_domain = href.find('/', href.find("//") + 2)
 78 |             index_of_nankai_str = href.find("nankai")
 79 |             if index_of_nankai_str == -1 or index_of_nankai_str > index_of_end_of_domain:
 80 |                 continue
 81 |         if href.find("less.nankai.edu.cn/public") != -1 or href.find("weekly.nankai.edu.cn/oldrelease.php") != -1:
 82 |             continue
 83 | 
 84 |         index_suffix = href.rfind(".")
 85 |         if href[index_suffix + 1:] in download_suffix_list:  # 如果是下载地址
 86 |             logging.info("Download link found: " + href)
 87 |             continue
 88 | 
 89 |         urls_expand.append(href)
 90 |     return urls_expand
 91 | 
 92 | # 打印和保存网页数据
 93 | def print_json_data(json_data,html_index):
 94 |     logging.info(f"Page {html_index}:")
 95 |     logging.info("url: " + json_data["url"])
 96 |     logging.info("title: " + json_data["title"])
 97 |     content = json_data["content"]
 98 |     content = str(content).replace('\n', '')
 99 |     content = str(content).replace('\t', '')
100 |     if len(content) > 100:
101 |         logging.info("content: " + content[0:99] + "...")
102 |     else:
103 |         logging.info("content: " + content)
104 | 
105 | # 保存网页内容到文件
106 | def content_handler(bs, url, index):
107 |     title = ""
108 |     content = ""
109 |     for item in bs.findAll():
110 |         if item.name == "script" or item.name == "style":
111 |             continue
112 |         content += item.get_text()
113 |     content = re.sub("\n\n", "", content)
114 |     content = content.replace('\n', '')
115 |     content = content.replace('\t', '')
116 |     if bs.title is not None:
117 |         title = bs.title.get_text()
118 |     if title == "" or title is None or title.find("301") != -1 or title.find("302") != -1 or title.find("404") != -1:
119 |         logging.info(f"Skipping page {index} (title: {title})")  # 打印跳过的页面信息
120 |         return False
121 | 
122 |     else:
123 |         json_data = {"url": url,
124 |                      "title": title,
125 |                      "content": content,
126 |                      "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
127 |         print_json_data(json_data,index)
128 |         with open(os.path.join(dirname, str(index) + ".json"), 'w', encoding="utf-8") as file:
129 |             json.dump(json_data, file, ensure_ascii=False)
130 |         file.close()
131 |         return True
132 | 
133 | # 迭代爬虫
134 | def crawl_loop(i, url_count, html_index, urls_target, urls_taken):
135 |     if i == 0:
136 |         logging.info("Crawl finished!")
137 |         logging.info(f"Total URLs crawled: {url_count}")
138 |         logging.info(f"Total valid URLs: {html_index}")
139 |         return
140 |     urls_expand = []
141 |     for url in urls_target:
142 |         html = get_html(url)
143 |         bs = BeautifulSoup(html, "html.parser")
144 |         for url_expand in get_expand_urls(bs, url):
145 |             if url_expand not in urls_taken:
146 |                 html_expand = get_html(url_expand)
147 |                 bs_expand = BeautifulSoup(html_expand, "html.parser")
148 |                 url_count += 1
149 |                 if not content_handler(bs_expand, url_expand, html_index):
150 |                     continue
151 |                 html_index += 1
152 |                 urls_expand.append(url_expand)
153 |                 # urls_taken.append(url_expand)#对应列表方法
154 |                 urls_taken.add(url_expand)  # 修改为 set 的 add 方法
155 |                 logging.info(f"Total crawled pages: {url_count} - Current page index: {html_index}")  # 输出当前的爬取数量和页面索引
156 |     return crawl_loop(i - 1, url_count, html_index, urls_expand, urls_taken)
157 | 
158 | # 爬虫设置和初始化
159 | dirname = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")  # 目录名称
160 | os.mkdir(dirname)
161 | crawl_timeout = 1  # 爬虫连接超时时间
162 | crawl_iteration_times = 6  # 爬虫迭代次数
163 | html_index = 0  # 网页索引
164 | url_count = 0  # 总爬取网页数量
165 | urls_target = []  # 爬虫目标网址
166 | #urls_taken = []  # 已访问的网址
167 | urls_taken = set()  # 使用集合来避免重复
168 | urls_invalid = []  # 无效的网址
169 | 
170 | # 从目标网址文件加载目标网址
171 | with open("../datasets_and_logs/default_urls.json") as file:
172 |     urls_target = json.load(file)
173 | 
174 | # 执行爬虫
175 | crawl_loop(crawl_iteration_times, url_count, html_index, urls_target, urls_taken)


--------------------------------------------------------------------------------
/Spider/htmonly_pagerank.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import re
  4 | import time
  5 | import requests
  6 | from bs4 import BeautifulSoup
  7 | from datetime import datetime
  8 | import logging
  9 | import networkx as nx
 10 | 
 11 | 
 12 | class PageRankHandler:
 13 |     def __init__(self):
 14 |         self.link_graph = nx.DiGraph()  # 使用有向图存储链接关系
 15 | 
 16 |     def add_links(self, from_url, to_urls):
 17 |         """添加链接关系到图中"""
 18 |         for to_url in to_urls:
 19 |             self.link_graph.add_edge(from_url, to_url)
 20 | 
 21 |     def calculate_pagerank(self, alpha=0.85):
 22 |         """计算PageRank值"""
 23 |         return nx.pagerank(self.link_graph, alpha=alpha)
 24 | 
 25 |     def save_pagerank(self, pagerank_scores, dirname):
 26 |         """保存PageRank结果"""
 27 |         with open(os.path.join(dirname, "pagerank.json"), 'w', encoding="utf-8") as f:
 28 |             json.dump(pagerank_scores, f, ensure_ascii=False)
 29 | 
 30 |     def get_top_pages(self, pagerank_scores, n=10):
 31 |         """获取PageRank值最高的n个页面"""
 32 |         sorted_pages = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)
 33 |         return sorted_pages[:n]
 34 | 
 35 | 
 36 | # 配置日志
 37 | log_filename = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_log.txt"
 38 | logging.basicConfig(
 39 |     level=logging.INFO,
 40 |     format="%(asctime)s - %(levelname)s - %(message)s",
 41 |     handlers=[
 42 |         logging.StreamHandler(),
 43 |         logging.FileHandler(log_filename, mode="w", encoding="utf-8")
 44 |     ]
 45 | )
 46 | 
 47 | # 初始化PageRank处理器
 48 | pagerank_handler = PageRankHandler()
 49 | 
 50 | # 设置头信息
 51 | headers_parameters = {
 52 |     'Connection': 'Keep-Alive',
 53 |     'Accept': 'text/html',
 54 |     'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
 55 |     'Accept-Encoding': 'gzip, deflate',
 56 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 57 | }
 58 | 
 59 | # 下载文档后缀列表
 60 | download_suffix_list = [
 61 |     "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
 62 |     "mp3", "mp4", "avi", "mkv", "mov", "wmv", "flv",
 63 |     "zip", "rar", "tar", "gz", "bz2", "7z",
 64 |     "jpg", "jpeg", "png", "gif", "bmp", "tiff",
 65 |     "exe", "apk", "dmg",
 66 |     "csv", "txt", "rtf",
 67 |     "xls", "xlsx",
 68 | ]
 69 | 
 70 | 
 71 | def get_html(url):
 72 |     try:
 73 |         response = requests.get(url, timeout=crawl_timeout, headers=headers_parameters, allow_redirects=False)
 74 |         response.encoding = response.apparent_encoding
 75 |     except Exception as e:
 76 |         logging.error(f"Error fetching {url}: {e}")
 77 |         return ""
 78 |     return response.text
 79 | 
 80 | 
 81 | def get_expand_urls(bs, url):
 82 |     urls_expand = []
 83 |     for item in bs.find_all("a"):
 84 |         href = item.get("href")
 85 |         if href is None:
 86 |             continue
 87 |         href = str(href)
 88 | 
 89 |         # 链接清理和过滤逻辑
 90 |         index = href.find("#")
 91 |         if index != -1:
 92 |             href = href[:index]
 93 |         if href.find("javascript") != -1 or href.find("download") != -1:
 94 |             continue
 95 |         if len(href) < 1 or href == '/':
 96 |             continue
 97 | 
 98 |         # 处理相对链接
 99 |         if href.find("http") == -1:
100 |             if href[0] != '/':
101 |                 href = '/' + href
102 |             elif href[0] == '.' and href[1] == '/':
103 |                 href = href[1:]
104 |             if url[-1] == '/':
105 |                 url = url[:-1]
106 |             href = url + href
107 |         else:
108 |             # 过滤非南开域名链接
109 |             index_of_end_of_domain = href.find('/', href.find("//") + 2)
110 |             index_of_nankai_str = href.find("nankai")
111 |             if index_of_nankai_str == -1 or index_of_nankai_str > index_of_end_of_domain:
112 |                 continue
113 | 
114 |         # 过滤特定URL
115 |         if href.find("less.nankai.edu.cn/public") != -1 or href.find("weekly.nankai.edu.cn/oldrelease.php") != -1:
116 |             continue
117 | 
118 |         # 过滤下载链接
119 |         index_suffix = href.rfind(".")
120 |         if href[index_suffix + 1:] in download_suffix_list:
121 |             logging.info(f"Download link found: {href}")
122 |             continue
123 | 
124 |         urls_expand.append(href)
125 | 
126 |     # 添加链接关系到PageRank处理器
127 |     pagerank_handler.add_links(url, urls_expand)
128 |     return urls_expand
129 | 
130 | 
131 | def print_json_data(json_data, html_index):
132 |     logging.info(f"Page {html_index}:")
133 |     logging.info(f"url: {json_data['url']}")
134 |     logging.info(f"title: {json_data['title']}")
135 |     content = json_data["content"]
136 |     content = str(content).replace('\n', '').replace('\t', '')
137 |     logging.info(f"content: {content[:100]}..." if len(content) > 100 else f"content: {content}")
138 | 
139 | 
140 | def content_handler(bs, url, index):
141 |     title = ""
142 |     content = ""
143 | 
144 |     for item in bs.findAll():
145 |         if item.name in ["script", "style"]:
146 |             continue
147 |         content += item.get_text()
148 | 
149 |     content = re.sub("\n\n", "", content)
150 |     content = content.replace('\n', '').replace('\t', '')
151 | 
152 |     if bs.title:
153 |         title = bs.title.get_text()
154 | 
155 |     if not title or any(str(code) in title for code in ["301", "302", "404"]):
156 |         logging.info(f"Skipping page {index} (title: {title})")
157 |         return False
158 | 
159 |     json_data = {
160 |         "url": url,
161 |         "title": title,
162 |         "content": content,
163 |         "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
164 |     }
165 | 
166 |     print_json_data(json_data, index)
167 |     with open(os.path.join(dirname, f"{index}.json"), 'w', encoding="utf-8") as file:
168 |         json.dump(json_data, file, ensure_ascii=False)
169 |     return True
170 | 
171 | 
172 | def crawl_loop(i, url_count, html_index, urls_target, urls_taken):
173 |     if i == 0:
174 |         logging.info("Crawl finished!")
175 |         logging.info(f"Total URLs crawled: {url_count}")
176 |         logging.info(f"Total valid URLs: {html_index}")
177 | 
178 |         # 计算并保存PageRank值
179 |         logging.info("Calculating PageRank...")
180 |         pagerank_scores = pagerank_handler.calculate_pagerank()
181 |         pagerank_handler.save_pagerank(pagerank_scores, dirname)
182 | 
183 |         # 输出排名靠前的页面
184 |         top_pages = pagerank_handler.get_top_pages(pagerank_scores)
185 |         logging.info("\nTop 10 pages by PageRank:")
186 |         for url, score in top_pages:
187 |             logging.info(f"URL: {url}, PageRank: {score:.6f}")
188 | 
189 |         logging.info("PageRank calculation completed")
190 |         return
191 | 
192 |     urls_expand = []
193 |     for url in urls_target:
194 |         html = get_html(url)
195 |         bs = BeautifulSoup(html, "html.parser")
196 |         for url_expand in get_expand_urls(bs, url):
197 |             if url_expand not in urls_taken:
198 |                 html_expand = get_html(url_expand)
199 |                 bs_expand = BeautifulSoup(html_expand, "html.parser")
200 |                 url_count += 1
201 |                 if not content_handler(bs_expand, url_expand, html_index):
202 |                     continue
203 |                 html_index += 1
204 |                 urls_expand.append(url_expand)
205 |                 urls_taken.add(url_expand)
206 |                 logging.info(f"Total crawled pages: {url_count} - Current page index: {html_index}")
207 | 
208 |     return crawl_loop(i - 1, url_count, html_index, urls_expand, urls_taken)
209 | 
210 | 
211 | # 爬虫设置和初始化
212 | dirname = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
213 | os.mkdir(dirname)
214 | crawl_timeout = 1
215 | crawl_iteration_times = 6
216 | html_index = 0
217 | url_count = 0
218 | urls_target = []
219 | urls_taken = set()
220 | 
221 | # 从文件加载目标网址
222 | with open("../datasets_and_logs/default_urls.json") as file:
223 |     urls_target = json.load(file)
224 | 
225 | # 执行爬虫
226 | crawl_loop(crawl_iteration_times, url_count, html_index, urls_target, urls_taken)


--------------------------------------------------------------------------------
/Spider/mutispider.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from bs4 import BeautifulSoup
  3 | import pandas as pd
  4 | from datetime import datetime
  5 | import re
  6 | import time
  7 | import random
  8 | from concurrent.futures import ThreadPoolExecutor
  9 | import logging
 10 | from pymongo import MongoClient
 11 | from pymongo.errors import DuplicateKeyError
 12 | import hashlib
 13 | import os
 14 | import mimetypes
 15 | import gridfs
 16 | 
 17 | 
 18 | class NewsScraperNankai:
 19 |     def __init__(self):
 20 |         self.base_url =  "http://news.nankai.edu.cn"
 21 |         self.first_page = "http://news.nankai.edu.cn/dcxy/index.shtml"
 22 |         self.page_template = "https://news.nankai.edu.cn/dcxy/system/count//0005000/000000000000/000/000/c0005000000000000000_000000{:03d}.shtml"
 23 |         self.max_pages = 524
 24 | 
 25 |         # MongoDB连接设置
 26 |         self.mongo_client = MongoClient('mongodb://localhost:27017/')
 27 |         self.db = self.mongo_client['nankai_news_datasets']
 28 |         self.news_collection = self.db['NEWS']
 29 |         self.snapshot_collection = self.db['WEB_snapshot']
 30 |         self.fs = gridfs.GridFS(self.db)  # 用于存储附件
 31 | 
 32 |         # 创建索引
 33 |         self.news_collection.create_index([('url', 1)], unique=True)
 34 |         self.snapshot_collection.create_index([('url', 1), ('captured_at', -1)])
 35 | 
 36 |         # 支持的附件类型
 37 |         self.supported_attachments = [
 38 |             ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",  # 常见文档格式
 39 |             ".mp3", ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv",  # 音频和视频格式
 40 |             ".zip", ".rar", ".tar", ".gz", ".bz2", ".7z",  # 压缩文件格式
 41 |             ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff",  # 图片格式
 42 |             ".exe", ".apk", ".dmg",  # 可执行文件和应用程序
 43 |             ".csv", ".txt", ".rtf",  # 文本文件
 44 |             ".xls", ".xlsx",  # 表格文件
 45 |         ]
 46 | 
 47 |         logging.basicConfig(
 48 |             level=logging.INFO,
 49 |             format='%(asctime)s - %(levelname)s - %(message)s',
 50 |             handlers=[
 51 |                 logging.FileHandler('../scraper.log', encoding='utf-8'),
 52 |                 logging.StreamHandler()
 53 |             ]
 54 |         )
 55 | 
 56 |         self.headers = {
 57 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
 58 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 59 |             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
 60 |             'Connection': 'keep-alive'
 61 |         }
 62 |     def get_page_urls(self):
 63 |         """生成所有页面的URL"""
 64 |         urls = [self.first_page]  # 第一页
 65 |         # 添加后续页面
 66 |         urls.extend(self.page_template.format(i) for i in range(1, self.max_pages + 1))
 67 |         return urls
 68 |     def get_soup(self, url, retries=3):
 69 |         """获取页面的BeautifulSoup对象和原始HTML内容"""
 70 |         for i in range(retries):
 71 |             try:
 72 |                 time.sleep(random.uniform(1, 3))
 73 |                 response = requests.get(url, headers=self.headers, timeout=10)
 74 |                 response.encoding = 'utf-8'
 75 | 
 76 |                 if response.status_code == 200:
 77 |                     html_content = response.text
 78 |                     return BeautifulSoup(html_content, 'html.parser'), html_content
 79 |                 else:
 80 |                     logging.warning(f"Failed to fetch {url}, status code: {response.status_code}")
 81 | 
 82 |             except Exception as e:
 83 |                 logging.error(f"Attempt {i + 1} failed for {url}: {str(e)}")
 84 |                 if i == retries - 1:
 85 |                     logging.error(f"All attempts failed for {url}")
 86 |                     return None, None
 87 |                 time.sleep(random.uniform(2, 5))
 88 |         return None, None
 89 | 
 90 |     def save_snapshot(self, url, html_content):
 91 |         """保存网页快照"""
 92 |         try:
 93 |             snapshot_data = {
 94 |                 'url': url,
 95 |                 'html_content': html_content,
 96 |                 'captured_at': datetime.now(),
 97 |                 'content_hash': hashlib.md5(html_content.encode('utf-8')).hexdigest()
 98 |             }
 99 |             self.snapshot_collection.insert_one(snapshot_data)
100 |             return snapshot_data['content_hash']
101 |         except Exception as e:
102 |             logging.error(f"Error saving snapshot for {url}: {str(e)}")
103 |             return None
104 | 
105 |     def find_attachments(self, soup, base_url):
106 |         """查找页面中的附件链接"""
107 |         attachments = []
108 |         for link in soup.find_all('a', href=True):
109 |             href = link['href'].lower()
110 |             if any(ext in href for ext in self.supported_attachments):
111 |                 full_url = self.base_url + href if href.startswith('/') else href
112 |                 attachments.append({
113 |                     'url': full_url,
114 |                     'filename': os.path.basename(href),
115 |                     'title': link.text.strip()
116 |                 })
117 |         return attachments
118 | 
119 |     def save_attachment(self, attachment_info):
120 |         """保存附件到GridFS"""
121 |         try:
122 |             response = requests.get(attachment_info['url'], headers=self.headers, timeout=30)
123 |             if response.status_code == 200:
124 |                 file_id = self.fs.put(
125 |                     response.content,
126 |                     filename=attachment_info['filename'],
127 |                     url=attachment_info['url'],
128 |                     title=attachment_info['title'],
129 |                     upload_date=datetime.now()
130 |                 )
131 |                 return file_id
132 |         except Exception as e:
133 |             logging.error(f"Error saving attachment {attachment_info['url']}: {str(e)}")
134 |         return None
135 | 
136 |     def parse_news_list_page(self, url):
137 |         """解析新闻列表页面"""
138 |         soup, html_content = self.get_soup(url)
139 |         if not soup:
140 |             return []
141 | 
142 |         # 保存列表页快照
143 |         snapshot_hash = self.save_snapshot(url, html_content)
144 | 
145 |         news_items = []
146 |         tables = soup.find_all('table', attrs={'width': "98%", 'border': "0", 'cellpadding': "0", 'cellspacing': "0"})
147 | 
148 |         for table in tables:
149 |             try:
150 |                 title_link = table.find('a')
151 |                 if not title_link:
152 |                     continue
153 | 
154 |                 title = title_link.text.strip()
155 |                 news_url = self.base_url + title_link['href'] if title_link['href'].startswith('/') else title_link[
156 |                     'href']
157 |                 date_td = table.find('td', align="right")
158 |                 date = date_td.text.strip() if date_td else None
159 | 
160 |                 logging.info(f"Processing: {title}")
161 | 
162 |                 # 获取新闻详细内容和快照
163 |                 article_content, article_snapshot_hash, article_attachments = self.parse_news_detail(news_url)
164 | 
165 |                 news_item = {
166 |                     'title': title,
167 |                     'url': news_url,
168 |                     'date': date,
169 |                     'source': article_content.get('source', ''),
170 |                     'content': article_content.get('content', ''),
171 |                     'snapshot_hash': article_snapshot_hash,
172 |                     'attachments': article_attachments
173 |                 }
174 | 
175 |                 news_items.append(news_item)
176 | 
177 |             except Exception as e:
178 |                 logging.error(f"Error parsing news item: {str(e)}")
179 |                 continue
180 | 
181 |         return news_items
182 | 
183 |     def parse_news_detail(self, url):
184 |         """解析新闻详细页面，包括快照和附件"""
185 |         soup, html_content = self.get_soup(url)
186 |         if not soup:
187 |             return {'source': '', 'content': ''}, None, []
188 | 
189 |         try:
190 |             # 保存快照
191 |             snapshot_hash = self.save_snapshot(url, html_content)
192 | 
193 |             # 查找附件
194 |             attachments = self.find_attachments(soup, url)
195 |             saved_attachments = []
196 | 
197 |             # 保存附件
198 |             for attachment in attachments:
199 |                 file_id = self.save_attachment(attachment)
200 |                 if file_id:
201 |                     saved_attachments.append({
202 |                         'file_id': file_id,
203 |                         'url': attachment['url'],
204 |                         'filename': attachment['filename'],
205 |                         'title': attachment['title']
206 |                     })
207 | 
208 |             # 解析内容
209 |             source_span = soup.find('span', string=re.compile('来源：'))
210 |             source = source_span.text.strip() if source_span else ''
211 | 
212 |             content_div = soup.find('td', id='txt')
213 |             if content_div:
214 |                 paragraphs = content_div.find_all('p')
215 |                 content = '\n'.join([p.text.strip() for p in paragraphs if p.text.strip()])
216 |             else:
217 |                 content = ''
218 | 
219 |             return {
220 |                 'source': source,
221 |                 'content': content
222 |             }, snapshot_hash, saved_attachments
223 | 
224 |         except Exception as e:
225 |             logging.error(f"Error parsing detail page {url}: {str(e)}")
226 |             return {'source': '', 'content': ''}, None, []
227 | 
228 |     def scrape_batch(self, urls, batch_size=10):
229 |         """批量抓取新闻并保存到MongoDB"""
230 |         for i in range(0, len(urls), batch_size):
231 |             batch_urls = urls[i:i + batch_size]
232 |             batch_number = i // batch_size + 1
233 | 
234 |             logging.info(f"Processing batch {batch_number}, pages {i + 1} to {min(i + batch_size, len(urls))}")
235 | 
236 |             # 使用线程池并行处理每批URL
237 |             with ThreadPoolExecutor(max_workers=5) as executor:
238 |                 batch_results = list(executor.map(self.parse_news_list_page, batch_urls))
239 | 
240 |             # 合并结果
241 |             batch_news = [item for sublist in batch_results if sublist for item in sublist]
242 | 
243 |             # 保存这一批次的数据到MongoDB
244 |             inserted, updated = self.save_to_mongodb(batch_news, batch_number)
245 |             logging.info(f"Batch {batch_number} completed: {inserted} new items, {updated} updates")
246 | 
247 |             # 批次间休息
248 |             time.sleep(random.uniform(3, 5))
249 | 
250 |     def scrape(self):
251 |         """主抓取函数"""
252 |         logging.info("Starting to scrape news...")
253 |         urls = self.get_page_urls()
254 |         self.scrape_batch(urls)
255 | 
256 |         # 打印最终统计信息
257 |         total_news = self.get_news_count()
258 |         logging.info(f"Scraping completed. Total news in database: {total_news}")
259 | 
260 |     def save_to_mongodb(self, news_items, batch_number=None):
261 |         """保存数据到MongoDB"""
262 |         if not news_items:
263 |             logging.warning("No data to save to MongoDB")
264 |             return 0, 0
265 | 
266 |         inserted_count = 0
267 |         updated_count = 0
268 | 
269 |         for item in news_items:
270 |             try:
271 |                 # 添加时间戳和批次信息
272 |                 item['created_at'] = datetime.now()
273 |                 item['batch_number'] = batch_number
274 | 
275 |                 # 使用update_one with upsert=True来避免重复插入
276 |                 result = self.news_collection.update_one(
277 |                     {'url': item['url']},  # 查询条件
278 |                     {'$set': item},  # 更新的数据
279 |                     upsert=True  # 如果不存在则插入
280 |                 )
281 | 
282 |                 if result.upserted_id:
283 |                     inserted_count += 1
284 |                 elif result.modified_count:
285 |                     updated_count += 1
286 | 
287 |             except Exception as e:
288 |                 logging.error(f"Error saving to MongoDB: {str(e)}")
289 |                 continue
290 | 
291 |         logging.info(
292 |             f"Batch {batch_number}: Inserted {inserted_count} new documents, Updated {updated_count} documents")
293 |         return inserted_count, updated_count
294 | 
295 |     def get_news_count(self):
296 |         """获取数据库中的新闻总数"""
297 |         return self.news_collection.count_documents({})
298 |     def cleanup(self):
299 |         """清理资源"""
300 |         self.mongo_client.close()
301 | 
302 | 
303 | def main():
304 |     scraper = None
305 |     try:
306 |         scraper = NewsScraperNankai()
307 |         scraper.scrape()
308 |     except Exception as e:
309 |         logging.error(f"An error occurred during scraping: {str(e)}")
310 |     finally:
311 |         if scraper:
312 |             scraper.cleanup()
313 | 
314 | 
315 | if __name__ == "__main__":
316 |     main()


--------------------------------------------------------------------------------
/Spider/mutispider_pagerank.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from bs4 import BeautifulSoup
  3 | import pandas as pd
  4 | from datetime import datetime
  5 | import re
  6 | import time
  7 | import random
  8 | from concurrent.futures import ThreadPoolExecutor
  9 | import logging
 10 | from pymongo import MongoClient
 11 | from pymongo.errors import DuplicateKeyError
 12 | from pymongo.operations import UpdateOne
 13 | import hashlib
 14 | import os
 15 | import mimetypes
 16 | import gridfs
 17 | import numpy as np
 18 | 
 19 | 
 20 | class PageRankCalculator:
 21 |     def __init__(self, mongo_client):
 22 |         self.db = mongo_client['nankai_news_datasets']
 23 |         self.news_collection = self.db['NEWS']
 24 |         self.links_collection = self.db['LINKS']
 25 |         self.pagerank_collection = self.db['PAGERANK']
 26 | 
 27 |         # 创建索引
 28 |         self.links_collection.create_index([('from_url', 1), ('to_url', 1)], unique=True)
 29 |         self.pagerank_collection.create_index([('url', 1)], unique=True)
 30 | 
 31 |     def extract_links(self, soup, current_url):
 32 |         """提取页面中的所有链接"""
 33 |         links = []
 34 |         if not soup:
 35 |             return links
 36 | 
 37 |         for a_tag in soup.find_all('a', href=True):
 38 |             href = a_tag['href']
 39 |             if href.startswith('/'):
 40 |                 href = f"http://news.nankai.edu.cn{href}"
 41 |             elif not href.startswith('http'):
 42 |                 continue
 43 | 
 44 |             if 'nankai.edu.cn' in href:  # 只保留南开域名下的链接
 45 |                 links.append({
 46 |                     'from_url': current_url,
 47 |                     'to_url': href,
 48 |                     'anchor_text': a_tag.get_text(strip=True),
 49 |                     'created_at': datetime.now()
 50 |                 })
 51 |         return links
 52 | 
 53 |     def save_links(self, links):
 54 |         """保存链接关系到数据库"""
 55 |         if not links:
 56 |             return
 57 | 
 58 |         for link in links:
 59 |             try:
 60 |                 self.links_collection.update_one(
 61 |                     {
 62 |                         'from_url': link['from_url'],
 63 |                         'to_url': link['to_url']
 64 |                     },
 65 |                     {'$set': link},
 66 |                     upsert=True
 67 |                 )
 68 |             except Exception as e:
 69 |                 logging.error(f"Error saving link: {str(e)}")
 70 | 
 71 |     def build_graph(self):
 72 |         """构建网页链接图"""
 73 |         graph = {}
 74 |         # 获取所有链接关系
 75 |         links = self.links_collection.find({})
 76 | 
 77 |         for link in links:
 78 |             from_url = link['from_url']
 79 |             to_url = link['to_url']
 80 | 
 81 |             if from_url not in graph:
 82 |                 graph[from_url] = []
 83 |             if to_url not in graph:
 84 |                 graph[to_url] = []
 85 | 
 86 |             if to_url not in graph[from_url]:
 87 |                 graph[from_url].append(to_url)
 88 | 
 89 |         return graph
 90 | 
 91 |     def calculate_pagerank(self, damping_factor=0.85, max_iterations=100, min_delta=1e-5):
 92 |         """计算PageRank值"""
 93 |         graph = self.build_graph()
 94 |         if not graph:
 95 |             logging.warning("No graph data available for PageRank calculation")
 96 |             return {}
 97 | 
 98 |         # 初始化PageRank值
 99 |         num_pages = len(graph)
100 |         initial_value = 1.0 / num_pages
101 |         pagerank = {url: initial_value for url in graph}
102 | 
103 |         for iteration in range(max_iterations):
104 |             new_pagerank = {}
105 |             total_diff = 0
106 | 
107 |             # 计算新的PageRank值
108 |             for url in graph:
109 |                 incoming_pr = 0
110 |                 for incoming_url in graph:
111 |                     if url in graph[incoming_url]:
112 |                         outgoing_count = len(graph[incoming_url])
113 |                         if outgoing_count > 0:
114 |                             incoming_pr += pagerank[incoming_url] / outgoing_count
115 | 
116 |                 new_value = (1 - damping_factor) / num_pages + damping_factor * incoming_pr
117 |                 new_pagerank[url] = new_value
118 |                 total_diff += abs(new_value - pagerank[url])
119 | 
120 |             # 更新PageRank值
121 |             pagerank = new_pagerank
122 | 
123 |             # 检查是否收敛
124 |             if total_diff < min_delta:
125 |                 logging.info(f"PageRank converged after {iteration + 1} iterations")
126 |                 break
127 | 
128 |         return pagerank
129 | 
130 |     def update_pagerank_scores(self):
131 |         """更新数据库中的PageRank分数"""
132 |         pagerank_scores = self.calculate_pagerank()
133 | 
134 |         # 批量更新PageRank值
135 |         operations = []
136 |         timestamp = datetime.now()
137 | 
138 |         for url, score in pagerank_scores.items():
139 |             operations.append(UpdateOne(
140 |                 {'url': url},
141 |                 {
142 |                     '$set': {
143 |                         'pagerank': score,
144 |                         'updated_at': timestamp
145 |                     }
146 |                 },
147 |                 upsert=True
148 |             ))
149 | 
150 |         if operations:
151 |             try:
152 |                 result = self.pagerank_collection.bulk_write(operations)
153 |                 logging.info(f"Updated {result.modified_count} PageRank scores, "
154 |                              f"Inserted {result.upserted_count} new scores")
155 |             except Exception as e:
156 |                 logging.error(f"Error updating PageRank scores: {str(e)}")
157 | 
158 |     def should_update_pagerank(self, threshold=1000):
159 |         """判断是否需要更新PageRank"""
160 |         last_update = self.pagerank_collection.find_one(
161 |             sort=[('updated_at', -1)]
162 |         )
163 | 
164 |         if not last_update:
165 |             return True
166 | 
167 |         # 检查新增链接数量
168 |         new_links_count = self.links_collection.count_documents({
169 |             'created_at': {'$gt': last_update['updated_at']}
170 |         })
171 | 
172 |         return new_links_count >= threshold
173 | 
174 | 
175 | class NewsScraperNankai:
176 |     def __init__(self):
177 |         self.base_url = "http://news.nankai.edu.cn"
178 |         self.first_page = "http://news.nankai.edu.cn/dcxy/index.shtml"
179 |         self.page_template = "https://news.nankai.edu.cn/dcxy/system/count//0005000/000000000000/000/000/c0005000000000000000_000000{:03d}.shtml"
180 |         self.max_pages = 524
181 | 
182 |         # MongoDB连接设置
183 |         self.mongo_client = MongoClient('mongodb://localhost:27017/')
184 |         self.db = self.mongo_client['nankai_news_datasets']
185 |         self.news_collection = self.db['NEWS']
186 |         self.snapshot_collection = self.db['WEB_snapshot']
187 |         self.fs = gridfs.GridFS(self.db)  # 用于存储附件
188 | 
189 |         # 创建索引
190 |         self.news_collection.create_index([('url', 1)], unique=True)
191 |         self.snapshot_collection.create_index([('url', 1), ('captured_at', -1)])
192 | 
193 |         # 初始化PageRank计算器
194 |         self.pagerank_calculator = PageRankCalculator(self.mongo_client)
195 | 
196 |         # 支持的附件类型
197 |         self.supported_attachments = [
198 |             ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",  # 常见文档格式
199 |             ".mp3", ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv",  # 音频和视频格式
200 |             ".zip", ".rar", ".tar", ".gz", ".bz2", ".7z",  # 压缩文件格式
201 |             ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff",  # 图片格式
202 |             ".exe", ".apk", ".dmg",  # 可执行文件和应用程序
203 |             ".csv", ".txt", ".rtf",  # 文本文件
204 |             ".xls", ".xlsx",  # 表格文件
205 |         ]
206 | 
207 |         # 设置日志
208 |         logging.basicConfig(
209 |             level=logging.INFO,
210 |             format='%(asctime)s - %(levelname)s - %(message)s',
211 |             handlers=[
212 |                 logging.FileHandler('scraper.log', encoding='utf-8'),
213 |                 logging.StreamHandler()
214 |             ]
215 |         )
216 | 
217 |         # 设置请求头
218 |         self.headers = {
219 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
220 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
221 |             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
222 |             'Connection': 'keep-alive'
223 |         }
224 | 
225 |     def get_page_urls(self):
226 |         """生成所有页面的URL"""
227 |         urls = [self.first_page]  # 第一页
228 |         urls.extend(self.page_template.format(i) for i in range(1, self.max_pages + 1))
229 |         return urls
230 | 
231 |     def get_soup(self, url, retries=3):
232 |         """获取页面的BeautifulSoup对象和原始HTML内容"""
233 |         for i in range(retries):
234 |             try:
235 |                 time.sleep(random.uniform(1, 3))
236 |                 response = requests.get(url, headers=self.headers, timeout=10)
237 |                 response.encoding = 'utf-8'
238 | 
239 |                 if response.status_code == 200:
240 |                     html_content = response.text
241 |                     return BeautifulSoup(html_content, 'html.parser'), html_content
242 |                 else:
243 |                     logging.warning(f"Failed to fetch {url}, status code: {response.status_code}")
244 | 
245 |             except Exception as e:
246 |                 logging.error(f"Attempt {i + 1} failed for {url}: {str(e)}")
247 |                 if i == retries - 1:
248 |                     logging.error(f"All attempts failed for {url}")
249 |                     return None, None
250 |                 time.sleep(random.uniform(2, 5))
251 |         return None, None
252 | 
253 |     def save_snapshot(self, url, html_content):
254 |         """保存网页快照"""
255 |         try:
256 |             snapshot_data = {
257 |                 'url': url,
258 |                 'html_content': html_content,
259 |                 'captured_at': datetime.now(),
260 |                 'content_hash': hashlib.md5(html_content.encode('utf-8')).hexdigest()
261 |             }
262 |             self.snapshot_collection.insert_one(snapshot_data)
263 |             return snapshot_data['content_hash']
264 |         except Exception as e:
265 |             logging.error(f"Error saving snapshot for {url}: {str(e)}")
266 |             return None
267 | 
268 |     def find_attachments(self, soup, base_url):
269 |         """查找页面中的附件链接"""
270 |         attachments = []
271 |         for link in soup.find_all('a', href=True):
272 |             href = link['href'].lower()
273 |             if any(ext in href for ext in self.supported_attachments):
274 |                 full_url = self.base_url + href if href.startswith('/') else href
275 |                 attachments.append({
276 |                     'url': full_url,
277 |                     'filename': os.path.basename(href),
278 |                     'title': link.text.strip()
279 |                 })
280 |         return attachments
281 | 
282 |     def save_attachment(self, attachment_info):
283 |         """保存附件到GridFS"""
284 |         try:
285 |             response = requests.get(attachment_info['url'], headers=self.headers, timeout=30)
286 |             if response.status_code == 200:
287 |                 file_id = self.fs.put(
288 |                     response.content,
289 |                     filename=attachment_info['filename'],
290 |                     url=attachment_info['url'],
291 |                     title=attachment_info['title'],
292 |                     upload_date=datetime.now()
293 |                 )
294 |                 return file_id
295 |         except Exception as e:
296 |             logging.error(f"Error saving attachment {attachment_info['url']}: {str(e)}")
297 |         return None
298 | 
299 |     def parse_news_list_page(self, url):
300 |         """解析新闻列表页面"""
301 |         soup, html_content = self.get_soup(url)
302 |         if not soup:
303 |             return []
304 | 
305 |         # 保存列表页快照
306 |         snapshot_hash = self.save_snapshot(url, html_content)
307 | 
308 |         # 提取并保存页面链接关系
309 |         links = self.pagerank_calculator.extract_links(soup, url)
310 |         self.pagerank_calculator.save_links(links)
311 | 
312 |         news_items = []
313 |         tables = soup.find_all('table', attrs={'width': "98%", 'border': "0", 'cellpadding': "0", 'cellspacing': "0"})
314 | 
315 |         for table in tables:
316 |             try:
317 |                 title_link = table.find('a')
318 |                 if not title_link:
319 |                     continue
320 | 
321 |                 title = title_link.text.strip()
322 |                 news_url = self.base_url + title_link['href'] if title_link['href'].startswith('/') else title_link[
323 |                     'href']
324 |                 date_td = table.find('td', align="right")
325 |                 date = date_td.text.strip() if date_td else None
326 | 
327 |                 logging.info(f"Processing: {title}")
328 | 
329 |                 # 获取新闻详细内容和快照
330 |                 article_content, article_snapshot_hash, article_attachments = self.parse_news_detail(news_url)
331 | 
332 |                 news_item = {
333 |                     'title': title,
334 |                     'url': news_url,
335 |                     'date': date,
336 |                     'source': article_content.get('source', ''),
337 |                     'content': article_content.get('content', ''),
338 |                     'snapshot_hash': article_snapshot_hash,
339 |                     'attachments': article_attachments
340 |                 }
341 | 
342 |                 news_items.append(news_item)
343 | 
344 |             except Exception as e:
345 |                 logging.error(f"Error parsing news item: {str(e)}")
346 |                 continue
347 | 
348 |         return news_items
349 | 
350 |     def parse_news_detail(self, url):
351 |         """解析新闻详细页面，包括快照和附件"""
352 |         soup, html_content = self.get_soup(url)
353 |         if not soup:
354 |             return {'source': '', 'content': ''}, None, []
355 | 
356 |         try:
357 |             # 保存快照
358 |             snapshot_hash = self.save_snapshot(url, html_content)
359 | 
360 |             # 提取并保存页面链接关系
361 |             links = self.pagerank_calculator.extract_links(soup, url)
362 |             self.pagerank_calculator.save_links(links)
363 | 
364 |             # 查找附件
365 |             attachments = self.find_attachments(soup, url)
366 |             saved_attachments = []
367 | 
368 |             # 保存附件
369 |             for attachment in attachments:
370 |                 file_id = self.save_attachment(attachment)
371 |                 if file_id:
372 |                     saved_attachments.append({
373 |                         'file_id': file_id,
374 |                         'url': attachment['url'],
375 |                         'filename': attachment['filename'],
376 |                         'title': attachment['title']
377 |                     })
378 | 
379 |             # 解析内容
380 |             source_span = soup.find('span', string=re.compile('来源：'))
381 |             source = source_span.text.strip() if source_span else ''
382 | 
383 |             content_div = soup.find('td', id='txt')
384 |             if content_div:
385 |                 paragraphs = content_div.find_all('p')
386 |                 content = '\n'.join([p.text.strip() for p in paragraphs if p.text.strip()])
387 |             else:
388 |                 content = ''
389 | 
390 |             return {
391 |                 'source': source,
392 |                 'content': content
393 |             }, snapshot_hash, saved_attachments
394 | 
395 |         except Exception as e:
396 |             logging.error(f"Error parsing detail page {url}: {str(e)}")
397 |             return {'source': '', 'content': ''}, None, []
398 | 
399 |     def scrape_batch(self, urls, batch_size=10):
400 |         """批量抓取新闻并保存到MongoDB"""
401 |         for i in range(0, len(urls), batch_size):
402 |             batch_urls = urls[i:i + batch_size]
403 |             batch_number = i // batch_size + 1
404 | 
405 |             logging.info(f"Processing batch {batch_number}, pages {i + 1} to {min(i + batch_size, len(urls))}")
406 | 
407 |             # 使用线程池并行处理每批URL
408 |             with ThreadPoolExecutor(max_workers=5) as executor:
409 |                 batch_results = list(executor.map(self.parse_news_list_page, batch_urls))
410 | 
411 |             # 合并结果
412 |             batch_news = [item for sublist in batch_results if sublist for item in sublist]
413 | 
414 |             # 保存这一批次的数据到MongoDB
415 |             inserted, updated = self.save_to_mongodb(batch_news, batch_number)
416 |             logging.info(f"Batch {batch_number} completed: {inserted} new items, {updated} updates")
417 | 
418 |             # 检查是否需要更新PageRank
419 |             if self.pagerank_calculator.should_update_pagerank():
420 |                 logging.info("Starting PageRank update...")
421 |                 self.pagerank_calculator.update_pagerank_scores()
422 |                 logging.info("PageRank update completed")
423 | 
424 |             # 批次间休息
425 |             time.sleep(random.uniform(3, 5))
426 | 
427 |     def save_to_mongodb(self, news_items, batch_number=None):
428 |         """保存数据到MongoDB"""
429 |         if not news_items:
430 |             logging.warning("No data to save to MongoDB")
431 |             return 0, 0
432 | 
433 |         inserted_count = 0
434 |         updated_count = 0
435 | 
436 |         for item in news_items:
437 |             try:
438 |                 # 添加时间戳和批次信息
439 |                 item['created_at'] = datetime.now()
440 |                 item['batch_number'] = batch_number
441 | 
442 |                 # 使用update_one with upsert=True来避免重复插入
443 |                 result = self.news_collection.update_one(
444 |                     {'url': item['url']},  # 查询条件
445 |                     {'$set': item},  # 更新的数据
446 |                     upsert=True  # 如果不存在则插入
447 |                 )
448 | 
449 |                 if result.upserted_id:
450 |                     inserted_count += 1
451 |                 elif result.modified_count:
452 |                     updated_count += 1
453 | 
454 |             except Exception as e:
455 |                 logging.error(f"Error saving to MongoDB: {str(e)}")
456 |                 continue
457 | 
458 |         logging.info(
459 |             f"Batch {batch_number}: Inserted {inserted_count} new documents, Updated {updated_count} documents")
460 |         return inserted_count, updated_count
461 | 
462 |     def get_news_count(self):
463 |         """获取数据库中的新闻总数"""
464 |         return self.news_collection.count_documents({})
465 | 
466 |     def update_pagerank_if_needed(self):
467 |         """检查并在需要时更新PageRank"""
468 |         if self.pagerank_calculator.should_update_pagerank():
469 |             logging.info("Starting PageRank update...")
470 |             self.pagerank_calculator.update_pagerank_scores()
471 |             logging.info("PageRank update completed")
472 | 
473 |     def scrape(self):
474 |         """主抓取函数"""
475 |         logging.info("Starting to scrape news...")
476 |         urls = self.get_page_urls()
477 |         self.scrape_batch(urls)
478 | 
479 |         # 完成后更新一次PageRank
480 |         self.update_pagerank_if_needed()
481 | 
482 |         # 打印最终统计信息
483 |         total_news = self.get_news_count()
484 |         logging.info(f"Scraping completed. Total news in database: {total_news}")
485 | 
486 | def cleanup(self):
487 |     """清理资源"""
488 |     self.mongo_client.close()
489 | 
490 | def main():
491 |     scraper = None
492 |     try:
493 |         scraper = NewsScraperNankai()
494 |         scraper.scrape()
495 |     except Exception as e:
496 |         logging.error(f"An error occurred during scraping: {str(e)}")
497 |     finally:
498 |         if scraper:
499 |             scraper.cleanup()
500 | 
501 | if __name__ == "__main__":
502 |     main()


--------------------------------------------------------------------------------
/data_clean/clean_document.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient
 2 | 
 3 | 
 4 | class MongoDBCleaner:
 5 |     def __init__(self, db_name, collection_name):
 6 |         # 连接到 MongoDB
 7 |         self.client = MongoClient('mongodb://localhost:27017/')
 8 |         self.db = self.client[db_name]
 9 |         self.collection = self.db[collection_name]
10 | 
11 |     def clean_data(self):
12 |         """清洗数据：删除 chunkSize 字段并添加 filetype 字段"""
13 |         print("开始清洗数据...")
14 | 
15 |         # 查询所有文档总数
16 |         total = self.collection.count_documents({})  # 获取集合中文档总数
17 |         print(f"总文档数: {total}")
18 | 
19 |         # 初始化更新计数
20 |         updated = 0
21 | 
22 |         # 遍历所有文档
23 |         with self.collection.find({}, {'filename': 1}) as cursor:  # 只取需要字段
24 |             for doc in cursor:
25 |                 # 提取文件类型
26 |                 filetype = None
27 |                 if 'filename' in doc:
28 |                     filetype = doc['filename'].split('.')[-1] if '.' in doc['filename'] else 'unknown'
29 | 
30 |                 # 构造更新操作
31 |                 update_query = {
32 |                     '$unset': {'chunkSize': ""},  # 删除 chunkSize 字段
33 |                     '$set': {'filetype': filetype}  # 添加 filetype 字段
34 |                 }
35 | 
36 |                 # 执行更新
37 |                 self.collection.update_one({'_id': doc['_id']}, update_query)
38 |                 updated += 1
39 | 
40 |                 # 打印进度（每 100 条打印一次）
41 |                 if updated % 100 == 0:
42 |                     print(f"已更新 {updated}/{total} 条记录...")
43 | 
44 |         print(f"清洗完成！共更新 {updated}/{total} 条记录。")
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     # 替换为你的数据库名称和集合名称
49 |     db_name = "nankai_news_datasets"
50 |     collection_name = "Document"
51 | 
52 |     cleaner = MongoDBCleaner(db_name, collection_name)
53 |     cleaner.clean_data()
54 | 


--------------------------------------------------------------------------------
/data_clean/load_data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | from pymongo import MongoClient
 4 | from datetime import datetime
 5 | 
 6 | client = MongoClient('mongodb://localhost:27017/')
 7 | db = client['nankai_news_datasets']
 8 | collection = db['2024_12_01_02_57_18']
 9 | 
10 | for filename in os.listdir(r'C:\Xing\IR\lab4\code\2024_12_01_02_57_18'):
11 |     if filename.endswith('.json'):
12 |         with open(os.path.join(r'C:\Xing\IR\lab4\code\2024_12_01_02_57_18', filename), 'r', encoding='utf-8') as f:
13 |             json_data = json.load(f)
14 |             collection.insert_one(json_data)


--------------------------------------------------------------------------------
/data_clean/merge_data.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient
 2 | from datetime import datetime
 3 | 
 4 | 
 5 | def merge_collections():
 6 |     # 连接MongoDB
 7 |     client = MongoClient('mongodb://localhost:27017/')
 8 |     db = client['nankai_news_datasets']  # 替换成你的数据库名
 9 | 
10 |     # 源集合名称
11 |     collection1_name = '2024_11_30_00_52_59'
12 |     collection2_name = '2024_11_30_02_32_56'
13 |     # 目标集合名称（合并后的集合）
14 |     merged_collection_name = 'NEWS1'
15 | 
16 |     try:
17 |         # 创建一个新的集合来存储合并结果
18 |         if merged_collection_name in db.list_collection_names():
19 |             print(f"集合 {merged_collection_name} 已存在，先删除它")
20 |             db[merged_collection_name].drop()
21 | 
22 |         # 记录合并前的文档数量
23 |         count1 = db[collection1_name].count_documents({})
24 |         count2 = db[collection2_name].count_documents({})
25 |         print(f"合并前统计:")
26 |         print(f"集合 {collection1_name}: {count1} 条文档")
27 |         print(f"集合 {collection2_name}: {count2} 条文档")
28 | 
29 |         # 使用聚合管道合并集合
30 |         pipeline = [
31 |             {'$out': merged_collection_name}
32 |         ]
33 | 
34 |         # 将第一个集合的数据写入新集合
35 |         db[collection1_name].aggregate(pipeline)
36 | 
37 |         # 将第二个集合的数据添加到新集合
38 |         db[collection2_name].aggregate([
39 |             {'$merge': {
40 |                 'into': merged_collection_name,
41 |                 'whenMatched': 'keepExisting',  # 如果遇到重复文档，保留已存在的
42 |                 'whenNotMatched': 'insert'  # 如果是新文档，则插入
43 |             }}
44 |         ])
45 | 
46 |         # 统计合并后的文档数量
47 |         merged_count = db[merged_collection_name].count_documents({})
48 |         print(f"\n合并完成！")
49 |         print(f"合并后的集合 {merged_collection_name}: {merged_count} 条文档")
50 | 
51 |         # 检查是否有重复文档
52 |         if merged_count < count1 + count2:
53 |             print(f"注意：检测到 {count1 + count2 - merged_count} 条重复文档被跳过")
54 | 
55 |         # 显示合并后的示例文档
56 |         print("\n合并后的文档示例：")
57 |         sample_doc = db[merged_collection_name].find_one()
58 |         print(sample_doc)
59 | 
60 |     except Exception as e:
61 |         print(f"合并过程中出错: {str(e)}")
62 | 
63 |     finally:
64 |         client.close()
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     merge_collections()


--------------------------------------------------------------------------------
/data_clean/news1_clean_ distinct.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient
 2 | from bson.objectid import ObjectId
 3 | 
 4 | 
 5 | def remove_duplicates():
 6 |     # 连接MongoDB
 7 |     client = MongoClient('mongodb://localhost:27017/')
 8 |     db = client['nankai_news_datasets']
 9 |     collection = db['NEWS1']
10 | 
11 |     try:
12 |         # 记录清理前的文档数量
13 |         initial_count = collection.count_documents({})
14 |         print(f"清理前文档数量: {initial_count}")
15 | 
16 |         # 1. 基于URL去重
17 |         print("\n正在基于URL去重...")
18 |         duplicate_urls = collection.aggregate([
19 |             {"$group": {
20 |                 "_id": "$url",
21 |                 "count": {"$sum": 1},
22 |                 "ids": {"$push": "$_id"},
23 |                 "first_id": {"$first": "$_id"}
24 |             }},
25 |             {"$match": {
26 |                 "count": {"$gt": 1}
27 |             }}
28 |         ], allowDiskUse=True)  # 允许使用磁盘处理大数据集
29 | 
30 |         url_dups_removed = 0
31 |         for dup in duplicate_urls:
32 |             # 获取要删除的文档ID（除了第一个之外的所有ID）
33 |             ids_to_remove = [id for id in dup["ids"] if id != dup["first_id"]]
34 |             if ids_to_remove:
35 |                 result = collection.delete_many({"_id": {"$in": ids_to_remove}})
36 |                 url_dups_removed += result.deleted_count
37 |                 print(f"删除了 {result.deleted_count} 条URL重复的文档")
38 | 
39 |         # 2. 基于内容去重
40 |         print("\n正在基于内容去重...")
41 |         duplicate_contents = collection.aggregate([
42 |             {"$group": {
43 |                 "_id": "$content",
44 |                 "count": {"$sum": 1},
45 |                 "ids": {"$push": "$_id"},
46 |                 "first_id": {"$first": "$_id"}
47 |             }},
48 |             {"$match": {
49 |                 "count": {"$gt": 1}
50 |             }}
51 |         ], allowDiskUse=True)
52 | 
53 |         content_dups_removed = 0
54 |         for dup in duplicate_contents:
55 |             ids_to_remove = [id for id in dup["ids"] if id != dup["first_id"]]
56 |             if ids_to_remove:
57 |                 result = collection.delete_many({"_id": {"$in": ids_to_remove}})
58 |                 content_dups_removed += result.deleted_count
59 |                 print(f"删除了 {result.deleted_count} 条内容重复的文档")
60 | 
61 |         # 验证删除结果
62 |         final_count = collection.count_documents({})
63 |         total_removed = initial_count - final_count
64 | 
65 |         print("\n清理结果统计：")
66 |         print(f"初始文档数量: {initial_count}")
67 |         print(f"最终文档数量: {final_count}")
68 |         print(f"基于URL删除的重复文档: {url_dups_removed}")
69 |         print(f"基于内容删除的重复文档: {content_dups_removed}")
70 |         print(f"实际减少的文档数量: {total_removed}")
71 | 
72 |         # 显示一个示例文档以确认集合仍然可访问
73 |         print("\n清理后的文档示例：")
74 |         sample = collection.find_one()
75 |         if sample:
76 |             print(f"文档ID: {sample['_id']}")
77 |             print(f"URL: {sample.get('url', 'N/A')}")
78 |             print(f"标题: {sample.get('title', 'N/A')}")
79 |         else:
80 |             print("警告：无法获取示例文档")
81 | 
82 |     except Exception as e:
83 |         print(f"清理过程中出错: {str(e)}")
84 |         print("错误堆栈:")
85 |         import traceback
86 |         print(traceback.format_exc())
87 | 
88 |     finally:
89 |         client.close()
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     # 确认操作
94 |     print("注意：此操作将直接删除重复文档。建议先备份数据。")
95 |     confirm = input("是否继续？(y/n): ")
96 |     if confirm.lower() == 'y':
97 |         remove_duplicates()
98 |     else:
99 |         print("操作已取消")


--------------------------------------------------------------------------------
/data_clean/news_clean_distinct.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient
 2 | from pprint import pprint
 3 | 
 4 | 
 5 | def check_duplicates():
 6 |     # 连接MongoDB
 7 |     client = MongoClient('mongodb://localhost:27017/')
 8 |     db = client['nankai_news_datasets']  # 替换成你的数据库名
 9 |     collection = db['NEWS']  # 替换成你的集合名
10 | 
11 |     try:
12 |         # 获取总文档数
13 |         total_docs = collection.count_documents({})
14 |         print(f"\n数据库总文档数量: {total_docs}")
15 | 
16 |         # 查找重复的URL
17 |         print("\n==== URL重复情况统计 ====")
18 |         duplicate_urls = list(collection.aggregate([
19 |             {
20 |                 "$group": {
21 |                     "_id": "$url",
22 |                     "count": {"$sum": 1},
23 |                     "documents": {
24 |                         "$push": {
25 |                             "_id": "$_id",
26 |                             "title": "$title",
27 |                             "source": "$source",
28 |                             "date": "$date"
29 |                         }
30 |                     }
31 |                 }
32 |             },
33 |             {
34 |                 "$match": {
35 |                     "count": {"$gt": 1}
36 |                 }
37 |             },
38 |             {
39 |                 "$sort": {"count": -1}  # 按重复次数降序排序
40 |             }
41 |         ]))
42 | 
43 |         # 打印重复URL的统计信息
44 |         if duplicate_urls:
45 |             print(f"\n发现 {len(duplicate_urls)} 组重复URL")
46 |             total_duplicates = sum(doc['count'] - 1 for doc in duplicate_urls)
47 |             print(f"总共有 {total_duplicates} 条重复文档需要清理")
48 | 
49 |             # 显示重复文档的详细示例
50 |             print("\n==== 重复文档示例（显示前3组） ====")
51 |             for i, dup in enumerate(duplicate_urls[:3], 1):
52 |                 print(f"\n第 {i} 组重复 (重复 {dup['count']} 次):")
53 |                 print(f"URL: {dup['_id']}")
54 |                 print("包含的文档:")
55 |                 for doc in dup['documents']:
56 |                     print("-" * 50)
57 |                     print(f"文档ID: {doc['_id']}")
58 |                     print(f"标题: {doc.get('title', 'N/A')}")
59 |                     print(f"来源: {doc.get('source', 'N/A')}")
60 |                     print(f"日期: {doc.get('date', 'N/A')}")
61 | 
62 |             # 显示重复次数分布
63 |             print("\n==== 重复次数分布 ====")
64 |             duplicate_counts = {}
65 |             for dup in duplicate_urls:
66 |                 count = dup['count']
67 |                 duplicate_counts[count] = duplicate_counts.get(count, 0) + 1
68 | 
69 |             for count, freq in sorted(duplicate_counts.items()):
70 |                 print(f"重复 {count} 次的URL有 {freq} 个")
71 | 
72 |         else:
73 |             print("没有发现重复的URL")
74 | 
75 |     except Exception as e:
76 |         print(f"检查过程中出错: {str(e)}")
77 |         import traceback
78 |         print(traceback.format_exc())
79 | 
80 |     finally:
81 |         client.close()
82 | 
83 | 
84 | if __name__ == "__main__":
85 |     print("开始检查重复数据...")
86 |     check_duplicates()
87 | 
88 |     user_input = input("\n是否需要进行数据清理？(y/n): ")
89 |     if user_input.lower() == 'y':
90 |         print("\n请运行清理脚本进行数据清理。")
91 |     else:
92 |         print("操作已取消")


--------------------------------------------------------------------------------
/data_clean/news_clean_frame.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient
 2 | from datetime import datetime
 3 | 
 4 | 
 5 | def test_cleaning_on_sample():
 6 |     # 连接MongoDB
 7 |     client = MongoClient('mongodb://localhost:27017/')  # 替换成你的MongoDB连接字符串
 8 |     db = client['nankai_news_datasets']  # 替换成你的数据库名
 9 |     collection = db['NEWS']  # 替换成你的集合名
10 | 
11 |     # 获取前10条数据的ID
12 |     sample_ids = [doc['_id'] for doc in collection.find().limit(10)]
13 | 
14 |     print("测试前的数据样本：")
15 |     for doc in collection.find({'_id': {'$in': sample_ids}}):
16 |         print(f"ID: {doc['_id']}")
17 |         print(f"Source: {doc.get('source', '未找到')}")
18 |         print(f"Batch Number: {doc.get('batch_number', '未找到')}")
19 |         print("-" * 50)
20 | 
21 |     try:
22 |         # 仅对这10条数据进行清洗
23 |         # 清理source字段中的"来源："
24 |         collection.update_many(
25 |             {
26 |                 '_id': {'$in': sample_ids},
27 |                 'source': {'$regex': '来源：'}
28 |             },
29 |             [{
30 |                 '$set': {
31 |                     'source': {
32 |                         '$replaceAll': {
33 |                             'input': '$source',
34 |                             'find': '来源：',
35 |                             'replacement': ''
36 |                         }
37 |                     }
38 |                 }
39 |             }]
40 |         )
41 | 
42 |         # 删除batch_number字段
43 |         collection.update_many(
44 |             {'_id': {'$in': sample_ids}},
45 |             {'$unset': {'batch_number': ''}}
46 |         )
47 | 
48 |         print("\n清洗后的数据样本：")
49 |         for doc in collection.find({'_id': {'$in': sample_ids}}):
50 |             print(f"ID: {doc['_id']}")
51 |             print(f"Source: {doc.get('source', '未找到')}")
52 |             print(f"Batch Number: {doc.get('batch_number', '未找到')}")
53 |             print("-" * 50)
54 | 
55 |         user_input = input("\n测试结果是否符合预期？(y/n): ")
56 | 
57 |         if user_input.lower() == 'y':
58 |             print("\n是否要对所有数据进行清洗？(y/n): ")
59 |             clean_all = input()
60 |             if clean_all.lower() == 'y':
61 |                 # 清洗所有数据
62 |                 collection.update_many(
63 |                     {'source': {'$regex': '来源：'}},
64 |                     [{
65 |                         '$set': {
66 |                             'source': {
67 |                                 '$replaceAll': {
68 |                                     'input': '$source',
69 |                                     'find': '来源：',
70 |                                     'replacement': ''
71 |                                 }
72 |                             }
73 |                         }
74 |                     }]
75 |                 )
76 | 
77 |                 collection.update_many(
78 |                     {},
79 |                     {'$unset': {'batch_number': ''}}
80 |                 )
81 |                 print("所有数据清洗完成！")
82 |             else:
83 |                 print("操作已取消")
84 |         else:
85 |             print("请调整清洗规则后重试")
86 | 
87 |     except Exception as e:
88 |         print(f"发生错误: {str(e)}")
89 | 
90 |     finally:
91 |         client.close()
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     test_cleaning_on_sample()


--------------------------------------------------------------------------------
/db_init/init_db.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient, ASCENDING
 2 | from datetime import datetime
 3 | 
 4 | 
 5 | def init_user_database():
 6 |     """初始化用户相关的所有数据库集合"""
 7 |     try:
 8 |         # 连接数据库
 9 |         client = MongoClient('localhost', 27017)
10 |         db = client['nankai_news_datasets']  # 使用现有的数据库
11 | 
12 |         # 1. 用户集合 (users)
13 |         if 'users' not in db.list_collection_names():
14 |             users = db.create_collection('users')
15 |             users.create_index([('username', ASCENDING)], unique=True)
16 |             users.create_index([('email', ASCENDING)], unique=True)
17 |             print("用户集合创建成功")
18 | 
19 |         # 2. 搜索历史集合 (search_history)
20 |         if 'search_history' not in db.list_collection_names():
21 |             search_history = db.create_collection('search_history')
22 |             search_history.create_index([('user_id', ASCENDING)])
23 |             search_history.create_index([('timestamp', ASCENDING)])
24 |             print("搜索历史集合创建成功")
25 | 
26 |         # 3. 用户偏好设置集合 (user_preferences)
27 |         if 'user_preferences' not in db.list_collection_names():
28 |             preferences = db.create_collection('user_preferences')
29 |             preferences.create_index([('user_id', ASCENDING)], unique=True)
30 |             print("用户偏好集合创建成功")
31 | 
32 |         # 4. 登录历史集合 (login_history)
33 |         if 'login_history' not in db.list_collection_names():
34 |             login_history = db.create_collection('login_history')
35 |             login_history.create_index([('user_id', ASCENDING)])
36 |             login_history.create_index([('login_time', ASCENDING)])
37 |             print("登录历史集合创建成功")
38 | 
39 |         print("\n数据库初始化完成！创建了以下集合：")
40 |         print("- users: 用户基本信息")
41 |         print("- search_history: 搜索历史记录")
42 |         print("- user_preferences: 用户偏好设置")
43 |         print("- login_history: 登录历史记录")
44 | 
45 |         # 展示所有集合的结构
46 |         print("\n各集合的数据结构：")
47 |         print("\nusers 集合结构：")
48 |         print({
49 |             "username": "用户名 (唯一)",
50 |             "email": "邮箱 (唯一)",
51 |             "password": "密码哈希",
52 |             "created_at": "创建时间",
53 |             "last_login": "最后登录时间"
54 |         })
55 | 
56 |         print("\nsearch_history 集合结构：")
57 |         print({
58 |             "user_id": "用户ID",
59 |             "query": "搜索关键词",
60 |             "search_in": "搜索范围",
61 |             "sort_by": "排序方式",
62 |             "timestamp": "搜索时间"
63 |         })
64 | 
65 |         print("\nuser_preferences 集合结构：")
66 |         print({
67 |             "user_id": "用户ID",
68 |             "default_search_in": "默认搜索范围",
69 |             "default_sort_by": "默认排序方式",
70 |             "results_per_page": "每页结果数"
71 |         })
72 | 
73 |         print("\nlogin_history 集合结构：")
74 |         print({
75 |             "user_id": "用户ID",
76 |             "login_time": "登录时间",
77 |             "ip_address": "IP地址"
78 |         })
79 | 
80 |     except Exception as e:
81 |         print(f"初始化数据库时出错: {str(e)}")
82 |         raise e
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     init_user_database()


--------------------------------------------------------------------------------
/db_init/init_db_new.py:
--------------------------------------------------------------------------------
  1 | from pymongo import MongoClient, ASCENDING
  2 | from datetime import datetime
  3 | 
  4 | 
  5 | def init_user_database():
  6 |     """初始化用户相关的所有数据库集合"""
  7 |     try:
  8 |         # 连接数据库
  9 |         client = MongoClient('localhost', 27017)
 10 |         db = client['nankai_news_datasets']  # 使用现有的数据库
 11 | 
 12 |         # 1. 用户集合 (users)
 13 |         if 'users' not in db.list_collection_names():
 14 |             users = db.create_collection('users')
 15 |             users.create_index([('username', ASCENDING)], unique=True)
 16 |             users.create_index([('email', ASCENDING)], unique=True)
 17 |             print("用户集合创建成功")
 18 | 
 19 |         # 2. 搜索历史集合 (search_history)
 20 |         if 'search_history' not in db.list_collection_names():
 21 |             search_history = db.create_collection('search_history')
 22 |             search_history.create_index([('user_id', ASCENDING)])
 23 |             search_history.create_index([('timestamp', ASCENDING)])
 24 |             print("搜索历史集合创建成功")
 25 | 
 26 |         # 3. 用户偏好设置集合 (user_preferences)
 27 |         if 'user_preferences' not in db.list_collection_names():
 28 |             preferences = db.create_collection('user_preferences')
 29 |             preferences.create_index([('user_id', ASCENDING)], unique=True)
 30 |             print("用户偏好集合创建成功")
 31 | 
 32 |         # 4. 登录历史集合 (login_history)
 33 |         if 'login_history' not in db.list_collection_names():
 34 |             login_history = db.create_collection('login_history')
 35 |             login_history.create_index([('user_id', ASCENDING)])
 36 |             login_history.create_index([('login_time', ASCENDING)])
 37 |             print("登录历史集合创建成功")
 38 | 
 39 |         # 5. 新增：用户身份信息集合 (user_profiles)
 40 |         if 'user_profiles' not in db.list_collection_names():
 41 |             user_profiles = db.create_collection('user_profiles')
 42 |             # 创建user_id索引确保一个用户只有一个profile
 43 |             user_profiles.create_index([('user_id', ASCENDING)], unique=True)
 44 |             # 为了支持按身份类型和学院查询，创建这些字段的索引
 45 |             user_profiles.create_index([('role', ASCENDING)])
 46 |             user_profiles.create_index([('college', ASCENDING)])
 47 |             print("用户身份信息集合创建成功")
 48 | 
 49 |         print("\n数据库初始化完成！创建了以下集合：")
 50 |         print("- users: 用户基本信息")
 51 |         print("- search_history: 搜索历史记录")
 52 |         print("- user_preferences: 用户偏好设置")
 53 |         print("- login_history: 登录历史记录")
 54 |         print("- user_profiles: 用户身份信息")
 55 | 
 56 |         # 展示所有集合的结构
 57 |         print("\n各集合的数据结构：")
 58 |         print("\nusers 集合结构：")
 59 |         print({
 60 |             "username": "用户名 (唯一)",
 61 |             "email": "邮箱 (唯一)",
 62 |             "password": "密码哈希",
 63 |             "created_at": "创建时间",
 64 |             "last_login": "最后登录时间"
 65 |         })
 66 | 
 67 |         print("\nsearch_history 集合结构：")
 68 |         print({
 69 |             "user_id": "用户ID",
 70 |             "query": "搜索关键词",
 71 |             "search_in": "搜索范围",
 72 |             "sort_by": "排序方式",
 73 |             "timestamp": "搜索时间"
 74 |         })
 75 | 
 76 |         print("\nuser_preferences 集合结构：")
 77 |         print({
 78 |             "user_id": "用户ID",
 79 |             "default_search_in": "默认搜索范围",
 80 |             "default_sort_by": "默认排序方式",
 81 |             "results_per_page": "每页结果数"
 82 |         })
 83 | 
 84 |         print("\nlogin_history 集合结构：")
 85 |         print({
 86 |             "user_id": "用户ID",
 87 |             "login_time": "登录时间",
 88 |             "ip_address": "IP地址"
 89 |         })
 90 | 
 91 |         print("\nuser_profiles 集合结构：")
 92 |         print({
 93 |             "user_id": "用户ID (唯一)",
 94 |             "age": "年龄 (可选)",
 95 |             "role": "身份 (本科生/研究生/博士生/教师)",
 96 |             "college": "学院 (可选)",
 97 |             "major": "专业 (可选)",
 98 |             "grade": "年级 (可选)",
 99 |             "research_interests": "研究方向 (可选，数组)",
100 |             "last_updated": "最后更新时间"
101 |         })
102 | 
103 |     except Exception as e:
104 |         print(f"初始化数据库时出错: {str(e)}")
105 |         raise e
106 | 
107 | 
108 | if __name__ == "__main__":
109 |     init_user_database()


--------------------------------------------------------------------------------
/db_init/init_user_profiles.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient, ASCENDING
 2 | from datetime import datetime
 3 | 
 4 | 
 5 | def init_user_profiles():
 6 |     """创建用户身份信息表并为现有用户初始化数据"""
 7 |     try:
 8 |         # 连接数据库
 9 |         client = MongoClient('localhost', 27017)
10 |         db = client['nankai_news_datasets']
11 | 
12 |         # 1. 创建user_profiles集合
13 |         if 'user_profiles' not in db.list_collection_names():
14 |             user_profiles = db.create_collection('user_profiles')
15 |             # 创建user_id索引确保一个用户只有一个profile
16 |             user_profiles.create_index([('user_id', ASCENDING)], unique=True)
17 |             print("用户身份信息集合创建成功")
18 |         else:
19 |             user_profiles = db['user_profiles']
20 |             print("用户身份信息集合已存在")
21 | 
22 |         # 2. 获取现有用户列表
23 |         existing_users = db.users.find({}, {'_id': 1})
24 | 
25 |         # 3. 为现有用户初始化身份信息
26 |         default_profile = {
27 |             "role": "未设置",  # 默认身份
28 |             "college": "未设置",  # 默认学院
29 |             "age": None,  # 默认年龄为空
30 |             "created_at": datetime.now(),
31 |             "last_updated": datetime.now()
32 |         }
33 | 
34 |         for user in existing_users:
35 |             # 检查用户是否已有profile
36 |             if not user_profiles.find_one({"user_id": user['_id']}):
37 |                 profile_data = {
38 |                     "user_id": user['_id'],
39 |                     **default_profile
40 |                 }
41 |                 user_profiles.insert_one(profile_data)
42 |                 print(f"为用户 {user['_id']} 创建默认身份信息")
43 | 
44 |         print("\n初始化完成！user_profiles集合结构如下：")
45 |         print({
46 |             "user_id": "用户ID (唯一)",
47 |             "role": "身份 (默认'未设置')",
48 |             "college": "学院 (默认'未设置')",
49 |             "age": "年龄 (默认None)",
50 |             "created_at": "创建时间",
51 |             "last_updated": "最后更新时间"
52 |         })
53 | 
54 |         # 打印初始化统计信息
55 |         total_profiles = user_profiles.count_documents({})
56 |         print(f"\n总计初始化了 {total_profiles} 条用户身份信息")
57 | 
58 |     except Exception as e:
59 |         print(f"初始化用户身份信息时出错: {str(e)}")
60 |         raise e
61 | 
62 | 
63 | def create_profile_for_new_user(user_id):
64 |     """为新注册用户创建身份信息记录"""
65 |     try:
66 |         client = MongoClient('localhost', 27017)
67 |         db = client['nankai_news_datasets']
68 |         user_profiles = db['user_profiles']
69 | 
70 |         # 检查是否已存在
71 |         if not user_profiles.find_one({"user_id": user_id}):
72 |             profile_data = {
73 |                 "user_id": user_id,
74 |                 "role": "未设置",
75 |                 "college": "未设置",
76 |                 "age": None,
77 |                 "created_at": datetime.now(),
78 |                 "last_updated": datetime.now()
79 |             }
80 |             user_profiles.insert_one(profile_data)
81 |             print(f"为新用户 {user_id} 创建身份信息成功")
82 |         else:
83 |             print(f"用户 {user_id} 的身份信息已存在")
84 | 
85 |     except Exception as e:
86 |         print(f"创建用户身份信息时出错: {str(e)}")
87 |         raise e
88 | 
89 | 
90 | if __name__ == "__main__":
91 |     # 初始化user_profiles集合并为现有用户创建记录
92 |     init_user_profiles()


--------------------------------------------------------------------------------
/img-folder/19255F29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/19255F29.png


--------------------------------------------------------------------------------
/img-folder/image-20241217170449292.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217170449292.png


--------------------------------------------------------------------------------
/img-folder/image-20241217172152142.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217172152142.png


--------------------------------------------------------------------------------
/img-folder/image-20241217173749975.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217173749975.png


--------------------------------------------------------------------------------
/img-folder/image-20241217173936568.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217173936568.png


--------------------------------------------------------------------------------
/img-folder/image-20241217174140950.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217174140950.png


--------------------------------------------------------------------------------
/img-folder/image-20241217174236515.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217174236515.png


--------------------------------------------------------------------------------
/img-folder/image-20241217174713138.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217174713138.png


--------------------------------------------------------------------------------
/img-folder/image-20241217175037417.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217175037417.png


--------------------------------------------------------------------------------
/img-folder/image-20241217175233504.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217175233504.png


--------------------------------------------------------------------------------
/img-folder/image-20241217184806123.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217184806123.png


--------------------------------------------------------------------------------
/img-folder/image-20241217184922146.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217184922146.png


--------------------------------------------------------------------------------
/img-folder/image-20241217185008968.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217185008968.png


--------------------------------------------------------------------------------
/img-folder/image-20241217185208423.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217185208423.png


--------------------------------------------------------------------------------
/img-folder/image-20241217185558358.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217185558358.png


--------------------------------------------------------------------------------
/img-folder/image-20241217192145331.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192145331.png


--------------------------------------------------------------------------------
/img-folder/image-20241217192259748.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192259748.png


--------------------------------------------------------------------------------
/img-folder/image-20241217192419633.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192419633.png


--------------------------------------------------------------------------------
/img-folder/image-20241217192631920.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217192631920.png


--------------------------------------------------------------------------------
/img-folder/image-20241217193833186.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217193833186.png


--------------------------------------------------------------------------------
/img-folder/image-20241217193922327.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217193922327.png


--------------------------------------------------------------------------------
/img-folder/image-20241217194706713.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217194706713.png


--------------------------------------------------------------------------------
/img-folder/image-20241217195003153.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217195003153.png


--------------------------------------------------------------------------------
/img-folder/image-20241217195200393.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217195200393.png


--------------------------------------------------------------------------------
/img-folder/image-20241217201947701.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217201947701.png


--------------------------------------------------------------------------------
/img-folder/image-20241217202328199.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217202328199.png


--------------------------------------------------------------------------------
/img-folder/image-20241217204512415.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217204512415.png


--------------------------------------------------------------------------------
/img-folder/image-20241217204732261.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217204732261.png


--------------------------------------------------------------------------------
/img-folder/image-20241217205153730.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217205153730.png


--------------------------------------------------------------------------------
/img-folder/image-20241217205341225.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217205341225.png


--------------------------------------------------------------------------------
/img-folder/image-20241217205927444.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217205927444.png


--------------------------------------------------------------------------------
/img-folder/image-20241217210224984.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210224984.png


--------------------------------------------------------------------------------
/img-folder/image-20241217210435856.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210435856.png


--------------------------------------------------------------------------------
/img-folder/image-20241217210524271.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210524271.png


--------------------------------------------------------------------------------
/img-folder/image-20241217210643902.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210643902.png


--------------------------------------------------------------------------------
/img-folder/image-20241217210942244.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217210942244.png


--------------------------------------------------------------------------------
/img-folder/image-20241217211441465.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217211441465.png


--------------------------------------------------------------------------------
/img-folder/image-20241217211645716.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217211645716.png


--------------------------------------------------------------------------------
/img-folder/image-20241217211724777.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217211724777.png


--------------------------------------------------------------------------------
/img-folder/image-20241217212505606.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217212505606.png


--------------------------------------------------------------------------------
/img-folder/image-20241217212805264.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217212805264.png


--------------------------------------------------------------------------------
/img-folder/image-20241217220109997.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220109997.png


--------------------------------------------------------------------------------
/img-folder/image-20241217220232016.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220232016.png


--------------------------------------------------------------------------------
/img-folder/image-20241217220410027.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220410027.png


--------------------------------------------------------------------------------
/img-folder/image-20241217220710450.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217220710450.png


--------------------------------------------------------------------------------
/img-folder/image-20241217221306764.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221306764.png


--------------------------------------------------------------------------------
/img-folder/image-20241217221434868.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221434868.png


--------------------------------------------------------------------------------
/img-folder/image-20241217221619006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221619006.png


--------------------------------------------------------------------------------
/img-folder/image-20241217221818883.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217221818883.png


--------------------------------------------------------------------------------
/img-folder/image-20241217222258357.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217222258357.png


--------------------------------------------------------------------------------
/img-folder/image-20241217231353742.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217231353742.png


--------------------------------------------------------------------------------
/img-folder/image-20241217231856471.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217231856471.png


--------------------------------------------------------------------------------
/img-folder/image-20241217234338468.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234338468.png


--------------------------------------------------------------------------------
/img-folder/image-20241217234427613.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234427613.png


--------------------------------------------------------------------------------
/img-folder/image-20241217234452315.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234452315.png


--------------------------------------------------------------------------------
/img-folder/image-20241217234622692.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/img-folder/image-20241217234622692.png


--------------------------------------------------------------------------------
/img-folder/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/index/ES_Index.py:
--------------------------------------------------------------------------------
  1 | import pymongo
  2 | from elasticsearch import Elasticsearch
  3 | from elasticsearch.helpers import bulk
  4 | from bson import ObjectId
  5 | 
  6 | 
  7 | class NewsIndexer:
  8 |     def __init__(self,
  9 |                  mongo_host='localhost',
 10 |                  mongo_port=27017,
 11 |                  mongo_db='nankai_news',
 12 |                  es_host='localhost',
 13 |                  es_port=9200,
 14 |                  index_name='nankai_news_index'):
 15 |         # MongoDB连接
 16 |         self.mongo_client = pymongo.MongoClient(mongo_host, mongo_port)
 17 |         self.mongo_db = self.mongo_client[mongo_db]
 18 |         self.news_collection = self.mongo_db['news']
 19 | 
 20 |         # Elasticsearch连接
 21 |         self.es = Elasticsearch(
 22 |             [f'http://{es_host}:{es_port}'],
 23 |             basic_auth=('elastic', '123456'),  # 添加身份验证
 24 |             timeout = 300,  # 增加超时时间为30秒
 25 |             max_retries = 3,
 26 |             retry_on_timeout=True# 添加重试机制
 27 |         )
 28 |         self.index_name = index_name
 29 | 
 30 |     def create_index(self):
 31 |         """创建Elasticsearch索引"""
 32 |         settings = {
 33 |             "index": {
 34 |                 "number_of_replicas": 2,
 35 |                 "number_of_shards": 1
 36 |             },
 37 |             "analysis": {
 38 |                 "analyzer": {
 39 |                     "ik_smart_pinyin": {
 40 |                         "type": "custom",
 41 |                         "tokenizer": "ik_smart",
 42 |                         "filter": ["lowercase", "pinyin_filter"]
 43 |                     }
 44 |                 },
 45 |                 "filter": {
 46 |                     "pinyin_filter": {
 47 |                         "type": "pinyin",
 48 |                         "keep_full_pinyin": False,  # 仅保留必要的拼音格式
 49 |                         "keep_joined_full_pinyin": False,
 50 |                         "keep_original": True,
 51 |                         "limit_first_letter_length": 16,
 52 |                         "remove_duplicated_term": True,
 53 |                         "none_chinese_pinyin_tokenize": False  # 减少非中文字符的处理
 54 |                     }
 55 |                 }
 56 |             }
 57 |         }
 58 | 
 59 |         mappings = {
 60 |             "properties": {
 61 |                 "title": {
 62 |                     "type": "text",
 63 |                     "analyzer": "ik_max_word",
 64 |                     "fields": {
 65 |                         "pinyin": {
 66 |                             "type": "text",
 67 |                             "analyzer": "ik_smart_pinyin"
 68 |                         }
 69 |                     }
 70 |                 },
 71 |                 "url": {"type": "keyword"},
 72 |                 "content": {
 73 |                     "type": "text",
 74 |                     "analyzer": "ik_max_word",
 75 |                     "search_analyzer": "ik_smart"
 76 |                 },
 77 |                 "source": {"type": "keyword"},
 78 |                 "date": {"type": "date", "format": "yyyy-MM-dd"}
 79 |             }
 80 |         }
 81 | 
 82 | 
 83 |         if self.es.indices.exists(index=self.index_name):
 84 |             self.es.indices.delete(index=self.index_name)
 85 | 
 86 |         self.es.indices.create(
 87 |             index=self.index_name,
 88 |             body={
 89 |                 "settings": settings,
 90 |                 "mappings": mappings
 91 |             }
 92 |         )
 93 | 
 94 |     def prepare_documents(self):
 95 |         """准备索引文档"""
 96 |         documents = []
 97 |         for news_doc in self.news_collection.find():
 98 |             title = news_doc.get('title', '')
 99 |             doc = {
100 |                 "_id": str(news_doc['_id']),
101 |                 "title": title,
102 |                 "url": news_doc.get('url', ''),
103 |                 "content": news_doc.get('content', ''),
104 |                 "source": news_doc.get('source', ''),
105 |                 "date": news_doc.get('date', ''),
106 |                 "suggest": {
107 |                     "input": [title],
108 |                     "weight": 10
109 |                 }
110 |             }
111 |             documents.append(doc)
112 | 
113 |         return documents
114 | 
115 |     def close(self):
116 |         """关闭数据库连接"""
117 |         self.mongo_client.close()
118 | 
119 | 
120 | def main():
121 |     indexer = NewsIndexer(
122 |         mongo_host='localhost',
123 |         mongo_port=27017,
124 |         mongo_db='nankai_news',
125 |         es_host='localhost',
126 |         es_port=9200,
127 |         index_name='nankai_news_index'
128 |     )
129 | 
130 |     try:
131 |         print("开始创建索引...")
132 |         indexer.create_index()
133 |         print("索引结构创建完成")
134 | 
135 |         print("开始准备文档...")
136 |         documents = indexer.prepare_documents()
137 |         print(f"文档准备完成，共 {len(documents)} 条记录")
138 | 
139 |         print("开始批量索引文档...")
140 |         try:
141 |             success, failed = bulk(
142 |                 indexer.es,
143 |                 [
144 |                     {
145 |                         '_index': indexer.index_name,
146 |                         '_id': doc['_id'],
147 |                         **doc
148 |                     }
149 |                     for doc in documents
150 |                 ],
151 |                 chunk_size=500,  # 设置每批处理的文档数量
152 |                 request_timeout=300,  # 设置批量请求的超时时间
153 |                 refresh=True
154 |             )
155 |             print(f"文档索引完成，成功：{success} 条，失败：{failed} 条")
156 |         except Exception as e:
157 |             print(f"批量索引过程中发生错误: {str(e)}")
158 |             # 记录详细错误信息
159 |             import traceback
160 |             print(traceback.format_exc())
161 |     finally:
162 |         indexer.close()
163 | 
164 | 
165 | if __name__ == "__main__":
166 |     main()


--------------------------------------------------------------------------------
/index/creat_index.py:
--------------------------------------------------------------------------------
  1 | # 基础搜索索引
  2 | from whoosh.index import create_in
  3 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED
  4 | from jieba.analyse import ChineseAnalyzer
  5 | import os
  6 | from datetime import datetime
  7 | from pymongo import MongoClient
  8 | 
  9 | 
 10 | # 1. 连接MongoDB和获取数据
 11 | def get_mongodb_data():
 12 |     client = MongoClient('localhost', 27017)
 13 |     db = client['nankai_news_datasets']  # 替换为您的数据库名
 14 | 
 15 |     # 获取网页数据集合1
 16 |     collection1 = db['NEWS1']  # 第一种格式的网页数据，无快照
 17 |     # 获取网页数据集合2
 18 |     collection2 = db['NEWS']  # 第二种格式的网页数据，有快照
 19 |     # 获取快照数据
 20 |     snapshots = db['WEB_snapshot']  # 快照集合
 21 |     # 添加文档集合
 22 |     documents = db['DOCUMENTS']  # 假设文档存储在DOCUMENTS集合中
 23 |     # 用snapshot_hash创建快照字典，用于NEWS集合
 24 |     snapshot_dict = {doc['content_hash']: doc for doc in snapshots.find()}
 25 | 
 26 |     # 返回所有数据
 27 |     return collection1.find(), collection2.find(), snapshot_dict, documents.find()
 28 | 
 29 | 
 30 | # 2. 创建索引结构
 31 | def create_schema():
 32 |     analyzer = ChineseAnalyzer()
 33 |     schema = Schema(
 34 |         id=ID(stored=True, unique=True),
 35 |         url=ID(stored=True),
 36 |         title=TEXT(stored=True, analyzer=analyzer),
 37 |         content=TEXT(stored=True, analyzer=analyzer),
 38 |         publish_date=DATETIME(stored=True),
 39 |         source=TEXT(stored=True),
 40 |         snapshot_hash=ID(stored=True),  # 用于匹配对应的快照
 41 |         captured_at=DATETIME(stored=True),  # 快照捕获时间
 42 | 
 43 |         # 添加文档相关字段
 44 |         filetype = ID(stored=True),  # 文档类型(doc/docx/pdf等)
 45 |         filename = ID(stored=True),  # 文件名
 46 |         upload_date = DATETIME(stored=True)  # 上传时间
 47 |     )
 48 |     return schema
 49 | 
 50 | 
 51 | # 3. 添加文档函数
 52 | def add_document(writer, doc, doc_type, snapshot_dict=None):
 53 |     document = {
 54 |         'id': str(doc['_id']),
 55 |         'url': doc['url'] if 'url' in doc else None
 56 |     }
 57 | 
 58 |     # 处理文档类型
 59 |     if 'filetype' in doc:  # 如果是文档
 60 |         document.update({
 61 |             'filetype': doc['filetype'],
 62 |             'filename': doc['filename'] if 'filename' in doc else None,
 63 |             'title': doc['title'] if 'title' in doc else None,
 64 |             'upload_date': datetime.fromisoformat(doc['upload_date'].replace('Z', '+00:00')) if 'upload_date' in doc else None
 65 |         })
 66 | 
 67 |     # 处理不同格式的数据
 68 |     if doc_type == 'format1':  # NEWS1格式，无快照
 69 |         if 'title' in doc and doc['title']:
 70 |             document['title'] = doc['title']
 71 |         if 'content' in doc:
 72 |             document['content'] = doc['content']
 73 | 
 74 |     elif doc_type == 'format2':  # NEWS格式，有快照
 75 |         if 'title' in doc and doc['title']:
 76 |             document['title'] = doc['title']
 77 |         if 'content' in doc:
 78 |             document['content'] = doc['content']
 79 |         if 'date' in doc:
 80 |             try:
 81 |                 document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d")
 82 |             except:
 83 |                 pass
 84 |         if 'source' in doc:
 85 |             document['source'] = doc['source']
 86 | 
 87 |         # 只为NEWS格式添加快照信息
 88 |         if 'snapshot_hash' in doc:
 89 |             document['snapshot_hash'] = doc['snapshot_hash']
 90 |             # 从快照集合获取捕获时间
 91 |             if snapshot_dict and doc['snapshot_hash'] in snapshot_dict:
 92 |                 snapshot = snapshot_dict[doc['snapshot_hash']]
 93 |                 if 'captured_at' in snapshot:
 94 |                     document['captured_at'] = snapshot['captured_at']
 95 | 
 96 |     try:
 97 |         writer.add_document(**document)
 98 |         return True
 99 |     except Exception as e:
100 |         print(f"Error adding document {document['id']}: {str(e)}")
101 |         return False
102 | 
103 | 
104 | # 4. 初始化索引
105 | def initialize_index():
106 |     if not os.path.exists("../index_dir"):
107 |         os.mkdir("../index_dir")
108 |     ix = create_in("index_dir", create_schema())
109 | 
110 |     # 获取所有数据
111 |     collection1_docs, collection2_docs, snapshot_dict, documents = get_mongodb_data()
112 | 
113 |     # 添加文档到索引
114 |     with ix.writer() as writer:
115 |         count = 0
116 | 
117 |         # 处理文档
118 |         for doc in documents:
119 |             if add_document(writer, doc, 'document'):
120 |                 count += 1
121 |                 if count % 1000 == 0:
122 |                     print(f"已处理 {count} 条数据")
123 | 
124 |         # 处理NEWS1的文档
125 |         for doc in collection1_docs:
126 |             if add_document(writer, doc, 'format1', snapshot_dict):
127 |                 count += 1
128 |                 if count % 1000 == 0:
129 |                     print(f"已处理 {count} 条数据")
130 | 
131 |         # 处理NEWS的文档
132 |         for doc in collection2_docs:
133 |             if add_document(writer, doc, 'format2', snapshot_dict):
134 |                 count += 1
135 |                 if count % 1000 == 0:
136 |                     print(f"已处理 {count} 条数据")
137 | 
138 |     print("索引创建完成！共处理 {} 条数据".format(count))
139 |     return ix
140 | 
141 | 
142 | if __name__ == "__main__":
143 |     ix = initialize_index()


--------------------------------------------------------------------------------
/index/creat_index00.py:
--------------------------------------------------------------------------------
  1 | # 基础搜索索引
  2 | from whoosh.index import create_in
  3 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED
  4 | from jieba.analyse import ChineseAnalyzer
  5 | import os
  6 | from datetime import datetime
  7 | from pymongo import MongoClient
  8 | 
  9 | 
 10 | # 1. 连接MongoDB和获取数据
 11 | def get_mongodb_data():
 12 |     client = MongoClient('localhost', 27017)
 13 |     db = client['nankai_news_datasets']  # 替换为您的数据库名
 14 | 
 15 |     # 获取网页数据集合1
 16 |     collection1 = db['NEWS1']  # 第一种格式的网页数据
 17 |     # 获取网页数据集合2
 18 |     collection2 = db['NEWS']  # 第二种格式的网页数据
 19 |     # 获取快照数据
 20 |     snapshots = db['WEB_snapshot']  # 快照集合
 21 | 
 22 |     # 创建快照字典用于查找
 23 |     snapshot_dict = {doc['_id']: doc for doc in snapshots.find()}
 24 | 
 25 |     # 返回所有数据
 26 |     return collection1.find(), collection2.find(), snapshot_dict
 27 | 
 28 | 
 29 | # 2. 创建索引结构
 30 | def create_schema():
 31 |     analyzer = ChineseAnalyzer()
 32 |     schema = Schema(
 33 |         id=ID(stored=True, unique=True),
 34 |         url=ID(stored=True),
 35 |         title=TEXT(stored=True, analyzer=analyzer),
 36 |         content=TEXT(stored=True, analyzer=analyzer),
 37 |         publish_date=DATETIME(stored=True),
 38 |         source=TEXT(stored=True),
 39 |         snapshot_hash=ID(stored=True),
 40 |         snapshot_content=STORED  # 存储快照内容但不索引
 41 |     )
 42 |     return schema
 43 | 
 44 | 
 45 | # 3. 添加文档函数
 46 | def add_document(writer, doc, doc_type, snapshot_dict=None):
 47 |     if 'filename' in doc:  # 跳过文档类型
 48 |         return
 49 | 
 50 |     document = {
 51 |         'id': str(doc['_id']),
 52 |         'url': doc['url']
 53 |     }
 54 | 
 55 |     # 处理不同格式的数据
 56 |     if doc_type == 'format1':  # 第一种格式news1
 57 |         if 'title' in doc and doc['title']:
 58 |             document['title'] = doc['title']
 59 |         if 'content' in doc:
 60 |             document['content'] = doc['content']
 61 |         # 删除对crawl_time的处理
 62 | 
 63 |     elif doc_type == 'format2':  # 第二种格式NEWS
 64 |         if 'title' in doc and doc['title']:
 65 |             document['title'] = doc['title']
 66 |         if 'content' in doc:
 67 |             document['content'] = doc['content']
 68 |         if 'date' in doc:
 69 |             try:
 70 |                 document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d")
 71 |             except:
 72 |                 pass
 73 |         if 'source' in doc:
 74 |             document['source'] = doc['source']
 75 | 
 76 |     # 添加快照信息
 77 |     if snapshot_dict and str(doc['_id']) in snapshot_dict:
 78 |         snapshot = snapshot_dict[str(doc['_id'])]
 79 |         if 'snapshot_hash' in snapshot:
 80 |             document['snapshot_hash'] = snapshot['snapshot_hash']
 81 |         if 'html_content' in snapshot:
 82 |             document['snapshot_content'] = snapshot['html_content']
 83 | 
 84 |     try:
 85 |         writer.add_document(**document)
 86 |         return True
 87 |     except Exception as e:
 88 |         print(f"Error adding document {document['id']}: {str(e)}")
 89 |         return False
 90 | 
 91 | 
 92 | # 4. 初始化索引
 93 | def initialize_index():
 94 |     if not os.path.exists("../index_dir"):
 95 |         os.mkdir("../index_dir")
 96 |     ix = create_in("index_dir", create_schema())
 97 | 
 98 |     # 获取所有数据
 99 |     collection1_docs, collection2_docs, snapshot_dict = get_mongodb_data()
100 | 
101 |     # 添加文档到索引
102 |     with ix.writer() as writer:
103 |         count = 0
104 | 
105 |         # 处理第一种格式的文档
106 |         for doc in collection1_docs:
107 |             if add_document(writer, doc, 'format1', snapshot_dict):
108 |                 count += 1
109 |                 if count % 1000 == 0:
110 |                     print(f"已处理 {count} 条数据")
111 | 
112 |         # 处理第二种格式的文档
113 |         for doc in collection2_docs:
114 |             if add_document(writer, doc, 'format2', snapshot_dict):
115 |                 count += 1
116 |                 if count % 1000 == 0:
117 |                     print(f"已处理 {count} 条数据")
118 | 
119 |     print("索引创建完成！共处理 {} 条数据".format(count))
120 |     return ix
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     ix = initialize_index()


--------------------------------------------------------------------------------
/index/creat_index01.py:
--------------------------------------------------------------------------------
  1 | # 包含锚文本
  2 | # 基础搜索索引
  3 | from whoosh.index import create_in
  4 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED
  5 | from jieba.analyse import ChineseAnalyzer
  6 | import os
  7 | from datetime import datetime
  8 | from pymongo import MongoClient
  9 | from bs4 import BeautifulSoup
 10 | 
 11 | # 1. 连接MongoDB和获取数据
 12 | def get_mongodb_data():
 13 |     client = MongoClient('localhost', 27017)
 14 |     db = client['nankai_news_datasets']  # 替换为您的数据库名
 15 | 
 16 |     # 获取网页数据集合1
 17 |     collection1 = db['NEWS1']  # 第一种格式的网页数据，无快照
 18 |     # 获取网页数据集合2
 19 |     collection2 = db['NEWS']  # 第二种格式的网页数据，有快照
 20 |     # 获取快照数据
 21 |     snapshots = db['WEB_snapshot']  # 快照集合
 22 |     # 添加文档集合
 23 |     documents = db['DOCUMENTS']  # 假设文档存储在DOCUMENTS集合中
 24 |     # 用snapshot_hash创建快照字典，用于NEWS集合
 25 |     snapshot_dict = {doc['content_hash']: doc for doc in snapshots.find()}
 26 | 
 27 |     # 返回所有数据
 28 |     return collection1.find(), collection2.find(), snapshot_dict, documents.find()
 29 | 
 30 | 
 31 | # 2. 创建索引结构
 32 | def create_schema():
 33 |     analyzer = ChineseAnalyzer()
 34 |     schema = Schema(
 35 |         id=ID(stored=True, unique=True),
 36 |         url=ID(stored=True),
 37 |         title=TEXT(stored=True, analyzer=analyzer),
 38 |         content=TEXT(stored=True, analyzer=analyzer),
 39 |         anchor_text=TEXT(stored=True, analyzer=analyzer),  # 添加锚文本字段
 40 |         publish_date=DATETIME(stored=True),
 41 |         source=TEXT(stored=True),
 42 |         snapshot_hash=ID(stored=True),  # 用于匹配对应的快照
 43 |         captured_at=DATETIME(stored=True),  # 快照捕获时间
 44 | 
 45 |         # 添加文档相关字段
 46 |         filetype = ID(stored=True),  # 文档类型(doc/docx/pdf等)
 47 |         filename = ID(stored=True),  # 文件名
 48 |         upload_date = DATETIME(stored=True)  # 上传时间
 49 |     )
 50 |     return schema
 51 | 
 52 | def extract_anchor_text(html_content):
 53 |     """从HTML内容中提取锚文本"""
 54 |     try:
 55 |         soup = BeautifulSoup(html_content, 'html.parser')
 56 |         anchors = soup.find_all('a')
 57 |         # 获取所有非空的锚文本
 58 |         anchor_texts = [a.get_text().strip() for a in anchors if a.get_text().strip()]
 59 |         return " ".join(anchor_texts)
 60 |     except Exception as e:
 61 |         print(f"Error extracting anchor text: {str(e)}")
 62 |         return ""
 63 | # 3. 添加文档函数
 64 | def add_document(writer, doc, doc_type, snapshot_dict=None):
 65 |     document = {
 66 |         'id': str(doc['_id']),
 67 |         'url': doc['url'] if 'url' in doc else None
 68 |     }
 69 | 
 70 |     # 处理文档类型
 71 |     if 'filetype' in doc:  # 如果是文档
 72 |         document.update({
 73 |             'filetype': doc['filetype'],
 74 |             'filename': doc['filename'] if 'filename' in doc else None,
 75 |             'title': doc['title'] if 'title' in doc else None,
 76 |             'upload_date': datetime.fromisoformat(doc['upload_date'].replace('Z', '+00:00')) if 'upload_date' in doc else None
 77 |         })
 78 |         # 可能需要提取文档内容并添加到content字段
 79 |         # document['content'] = extract_doc_content(doc)  # 需要实现文档内容提取函数
 80 | 
 81 |     # 处理不同格式的数据
 82 |     if doc_type == 'format1':  # NEWS1格式，无快照
 83 |         if 'title' in doc and doc['title']:
 84 |             document['title'] = doc['title']
 85 |         if 'content' in doc:
 86 |             document['content'] = doc['content']
 87 | 
 88 |     elif doc_type == 'format2':  # NEWS格式，有快照
 89 |         if 'title' in doc and doc['title']:
 90 |             document['title'] = doc['title']
 91 |         if 'content' in doc:
 92 |             document['content'] = doc['content']
 93 |         if 'date' in doc:
 94 |             try:
 95 |                 document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d")
 96 |             except:
 97 |                 pass
 98 |         if 'source' in doc:
 99 |             document['source'] = doc['source']
100 | 
101 |         # 只为NEWS格式添加快照信息
102 |         if 'snapshot_hash' in doc:
103 |             document['snapshot_hash'] = doc['snapshot_hash']
104 |             # 从快照集合获取捕获时间
105 |             if snapshot_dict and doc['snapshot_hash'] in snapshot_dict:
106 |                 snapshot = snapshot_dict[doc['snapshot_hash']]
107 |                 if 'captured_at' in snapshot:
108 |                     document['captured_at'] = snapshot['captured_at']
109 | 
110 |                 # 从快照的HTML内容中提取锚文本
111 |                 if 'html_content' in snapshot:
112 |                     anchor_text = extract_anchor_text(snapshot['html_content'])
113 |                     if anchor_text:  # 如果成功提取到锚文本
114 |                         document['anchor_text'] = anchor_text
115 |     try:
116 |         writer.add_document(**document)
117 |         return True
118 |     except Exception as e:
119 |         print(f"Error adding document {document['id']}: {str(e)}")
120 |         return False
121 | 
122 | 
123 | # 4. 初始化索引
124 | def initialize_index():
125 |     if not os.path.exists("../index_dir"):
126 |         os.mkdir("../index_dir")
127 |     ix = create_in("index_dir", create_schema())
128 | 
129 |     # 获取所有数据
130 |     collection1_docs, collection2_docs, snapshot_dict, documents = get_mongodb_data()
131 | 
132 |     # 添加文档到索引
133 |     with ix.writer() as writer:
134 |         count = 0
135 | 
136 |         # 处理文档
137 |         for doc in documents:
138 |             if add_document(writer, doc, 'document'):
139 |                 count += 1
140 |                 if count % 1000 == 0:
141 |                     print(f"已处理 {count} 条数据")
142 | 
143 |         # 处理NEWS1的文档
144 |         for doc in collection1_docs:
145 |             if add_document(writer, doc, 'format1', snapshot_dict):
146 |                 count += 1
147 |                 if count % 1000 == 0:
148 |                     print(f"已处理 {count} 条数据")
149 | 
150 |         # 处理NEWS的文档
151 |         for doc in collection2_docs:
152 |             if add_document(writer, doc, 'format2', snapshot_dict):
153 |                 count += 1
154 |                 if count % 1000 == 0:
155 |                     print(f"已处理 {count} 条数据")
156 | 
157 |     print("索引创建完成！共处理 {} 条数据".format(count))
158 |     return ix
159 | 
160 | 
161 | if __name__ == "__main__":
162 |     ix = initialize_index()


--------------------------------------------------------------------------------
/index/creat_index_document.py:
--------------------------------------------------------------------------------
  1 | # 基础搜索索引
  2 | from whoosh.index import create_in
  3 | from whoosh.fields import Schema, TEXT, ID, DATETIME, STORED
  4 | from jieba.analyse import ChineseAnalyzer
  5 | import os
  6 | from datetime import datetime
  7 | from pymongo import MongoClient
  8 | 
  9 | 
 10 | # 1. 连接MongoDB和获取数据
 11 | def get_mongodb_data():
 12 |     client = MongoClient('localhost', 27017)
 13 |     db = client['nankai_news_datasets']  # 替换为您的数据库名
 14 | 
 15 |     # 获取网页数据集合1
 16 |     collection1 = db['NEWS1']  # 第一种格式的网页数据，无快照
 17 |     # 获取网页数据集合2
 18 |     collection2 = db['NEWS']  # 第二种格式的网页数据，有快照
 19 |     # 获取快照数据
 20 |     snapshots = db['WEB_snapshot']  # 快照集合
 21 |     # 添加文档集合
 22 |     documents = db['DOCUMENTS']  # 假设文档存储在DOCUMENTS集合中
 23 |     # 用snapshot_hash创建快照字典，用于NEWS集合
 24 |     snapshot_dict = {doc['content_hash']: doc for doc in snapshots.find()}
 25 | 
 26 |     # 返回所有数据
 27 |     return collection1.find(), collection2.find(), snapshot_dict, documents.find()
 28 | 
 29 | 
 30 | # 2. 创建索引结构
 31 | def create_schema():
 32 |     analyzer = ChineseAnalyzer()
 33 |     schema = Schema(
 34 |         id=ID(stored=True, unique=True),
 35 |         url=ID(stored=True),
 36 |         title=TEXT(stored=True, analyzer=analyzer, phrase=True),
 37 |         content=TEXT(stored=True, analyzer=analyzer, phrase=True),
 38 |         publish_date=DATETIME(stored=True),
 39 |         source=TEXT(stored=True),
 40 |         snapshot_hash=ID(stored=True),  # 用于匹配对应的快照
 41 |         captured_at=DATETIME(stored=True),  # 快照捕获时间
 42 | 
 43 |         # 添加文档相关字段
 44 |         filetype = ID(stored=True),  # 文档类型(doc/docx/pdf等)
 45 |         filename = ID(stored=True),  # 文件名
 46 |         upload_date = DATETIME(stored=True)  # 上传时间
 47 |     )
 48 |     return schema
 49 | 
 50 | # 3. 添加文档函数
 51 | def add_document(writer, doc, doc_type, snapshot_dict=None):
 52 |     document = {
 53 |         'id': str(doc['_id']),
 54 |         'url': doc['url'] if 'url' in doc else None
 55 |     }
 56 | 
 57 |     # 处理文档类型
 58 |     if 'filetype' in doc:  # 如果是文档
 59 |         document.update({
 60 |             'filetype': doc['filetype'],
 61 |             'filename': doc['filename'] if 'filename' in doc else None,
 62 |             'title': doc['title'] if 'title' in doc else None,
 63 |             'upload_date': doc.get('upload_date') if 'upload_date' in doc else None
 64 |         })
 65 |         # 打印处理后的 upload_date
 66 |         if 'upload_date' in doc:
 67 |             print(f"处理后的 upload_date: {doc.get('upload_date')}")
 68 | 
 69 |     # 处理不同格式的数据
 70 |     if doc_type == 'format1':  # NEWS1格式，无快照
 71 |         if 'title' in doc and doc['title']:
 72 |             document['title'] = doc['title']
 73 |         if 'content' in doc:
 74 |             document['content'] = doc['content']
 75 | 
 76 |     elif doc_type == 'format2':  # NEWS格式，有快照
 77 |         if 'title' in doc and doc['title']:
 78 |             document['title'] = doc['title']
 79 |         if 'content' in doc:
 80 |             document['content'] = doc['content']
 81 |         if 'date' in doc:
 82 |             try:
 83 |                 document['publish_date'] = datetime.strptime(doc['date'], "%Y-%m-%d")
 84 |             except:
 85 |                 pass
 86 |         if 'source' in doc:
 87 |             document['source'] = doc['source']
 88 | 
 89 |         # 只为NEWS格式添加快照信息
 90 |         if 'snapshot_hash' in doc:
 91 |             document['snapshot_hash'] = doc['snapshot_hash']
 92 |             # 从快照集合获取捕获时间
 93 |             if snapshot_dict and doc['snapshot_hash'] in snapshot_dict:
 94 |                 snapshot = snapshot_dict[doc['snapshot_hash']]
 95 |                 if 'captured_at' in snapshot:
 96 |                     document['captured_at'] = snapshot['captured_at']
 97 | 
 98 |     try:
 99 |         writer.add_document(**document)
100 |         return True
101 |     except Exception as e:
102 |         print(f"Error adding document {document['id']}: {str(e)}")
103 |         return False
104 | 
105 | 
106 | # 4. 初始化索引
107 | def initialize_index():
108 |     if not os.path.exists("index_dir"):
109 |         os.mkdir("index_dir")
110 |     ix = create_in("index_dir", create_schema())
111 | 
112 |     # 获取所有数据
113 |     collection1_docs, collection2_docs, snapshot_dict, documents = get_mongodb_data()
114 | 
115 |     # 添加文档到索引
116 |     with ix.writer() as writer:
117 |         doc_count = 0
118 |         news1_count = 0
119 |         news2_count = 0
120 |         print("\n=== 开始处理文档集合(DOCUMENTS) ===")
121 |         # 处理文档
122 |         for doc in documents:
123 |             if add_document(writer, doc, 'document'):
124 |                 doc_count  += 1
125 |                 if doc_count  % 100 == 0:
126 |                     print(f"已处理 {doc_count } 条数据")
127 | 
128 |         print("\n=== 开始处理NEWS1集合 ===")
129 |         # 处理NEWS1的文档
130 |         for doc in collection1_docs:
131 |             if add_document(writer, doc, 'format1', snapshot_dict):
132 |                 news1_count  += 1
133 |                 if news1_count  % 1000 == 0:
134 |                     print(f"已处理 {news1_count } 条数据")
135 | 
136 |         print("\n=== 开始处理NEWS集合 ===")
137 |         # 处理NEWS的文档
138 |         for doc in collection2_docs:
139 |             if add_document(writer, doc, 'format2', snapshot_dict):
140 |                 news2_count += 1
141 |                 if news2_count % 1000 == 0:
142 |                     print(f"已处理 {news2_count} 条数据")
143 |     total_count = doc_count + news1_count + news2_count
144 |     print("索引创建完成！共处理 {} 条数据".format(total_count))
145 |     return ix
146 | 
147 | 
148 | if __name__ == "__main__":
149 |     ix = initialize_index()


--------------------------------------------------------------------------------
/search/__pycache__/manager.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/search/__pycache__/manager.cpython-39.pyc


--------------------------------------------------------------------------------
/search/__pycache__/personalization.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/search/__pycache__/personalization.cpython-39.pyc


--------------------------------------------------------------------------------
/search/__pycache__/processor.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/search/__pycache__/processor.cpython-39.pyc


--------------------------------------------------------------------------------
/search/manager.py:
--------------------------------------------------------------------------------
  1 | # search/manager.py
  2 | from whoosh.qparser import MultifieldParser, QueryParser
  3 | from whoosh.query import Term, Or, Phrase, Wildcard, Regex
  4 | from whoosh.highlight import ContextFragmenter, HtmlFormatter
  5 | from datetime import datetime
  6 | import math
  7 | from bson.objectid import ObjectId  # 添加这个导入
  8 | from pymongo import MongoClient
  9 | from whoosh.highlight import ContextFragmenter, HtmlFormatter
 10 | class SearchManager:
 11 |     def __init__(self, searcher, results_per_page=10):
 12 |         self.searcher = searcher
 13 |         self.RESULTS_PER_PAGE = results_per_page
 14 |         # 定义所有支持的文档类型
 15 |         self.SUPPORTED_FILETYPES = ['pdf', 'doc', 'docx', 'xls', 'xlsx']
 16 |         # MongoDB 连接
 17 |         self.client = MongoClient('localhost', 27017)
 18 |         self.db = self.client['nankai_news_datasets']
 19 | 
 20 |     def _get_document_info(self, doc_str_id):
 21 |         """从MongoDB获取文档详细信息"""
 22 |         try:
 23 |             # 使用doc_id查询MongoDB获取文件信息
 24 |             doc_info = self.db.documents.find_one({'_id': ObjectId(doc_str_id)})
 25 |             if doc_info:
 26 |                 return {
 27 |                     'filename': doc_info.get('filename', '未知文件名'),
 28 |                     'length': doc_info.get('length', 0),
 29 |                     'upload_date': doc_info.get('upload_date')
 30 |                 }
 31 |             return None
 32 |         except Exception as e:
 33 |             print(f"获取文档信息错误: {str(e)}")
 34 |             return None
 35 |     def _get_field_config(self, search_in='all'):
 36 |         """获取搜索字段和权重配置"""
 37 |         if search_in == 'title':
 38 |             return {"fields": ["title"], "weights": {"title": 1.0}}
 39 |         elif search_in == 'content':
 40 |             return {"fields": ["content"], "weights": {"content": 1.0}}
 41 |         else:  # 'all'
 42 |             return {"fields": ["title", "content"], "weights": {"title": 2.0, "content": 1.0}}
 43 | 
 44 |     def execute_search(self, search_type, query_text, search_in='all', sort_by='relevance', filetypes=None):
 45 |         """统一的搜索执行接口"""
 46 |         field_config = self._get_field_config(search_in)
 47 | 
 48 |         # 根据搜索类型选择查询构建方式
 49 |         if search_type == 'document':
 50 |             query = self._build_document_query(query_text, field_config, filetypes)
 51 |             # 执行搜索
 52 |             results = self.searcher.search(query, limit=None, terms=True)
 53 | 
 54 |             # 只对文档搜索结果添加文件信息
 55 |             for hit in results:
 56 |                 doc_str_id = hit.get('id')
 57 |                 if doc_str_id:
 58 |                     doc_info = self._get_document_info(doc_str_id)
 59 |                     if doc_info:
 60 |                         hit['filename'] = doc_info['filename']
 61 |                         hit['filesize'] = doc_info['length']
 62 |                         hit['upload_date'] = doc_info['upload_date']
 63 |         elif search_type == 'phrase':
 64 |             query = self._build_phrase_query(query_text, field_config)
 65 |         elif search_type == 'wildcard':
 66 |             query = self._build_wildcard_query(query_text, field_config)
 67 |             if query is None:
 68 |                 # 如果查询无效，返回空结果
 69 |                 # 如果查询无效，返回空的查询结果，但设置limit为1
 70 |                 return self.searcher.search(Term("content", "IMPOSSIBLE_MATCH_STRING"), limit=1)
 71 |         else:  # basic search
 72 |             query = self._build_basic_query(query_text, field_config)
 73 | 
 74 |         # 执行搜索
 75 |         if sort_by == 'date':
 76 |             results = self.searcher.search(
 77 |                 query,
 78 |                 limit=None,
 79 |                 sortedby='publish_date',
 80 |                 reverse=True,
 81 |                 terms=True,
 82 |             )
 83 |         else:
 84 |             results = self.searcher.search(query,
 85 |                                            limit=None,
 86 |                                            terms=True)
 87 | 
 88 |         # 设置高亮
 89 |         results.fragmenter = ContextFragmenter(maxchars=200, surround=50)
 90 |         results.formatter = HtmlFormatter(tagname="strong", classname="highlight")
 91 |         results.formatter.between = "..."
 92 |         return results
 93 | 
 94 |     def _build_basic_query(self, query_text, field_config):
 95 |         parser = MultifieldParser(
 96 |             field_config["fields"],
 97 |             schema=self.searcher.schema,
 98 |             fieldboosts=field_config["weights"]
 99 |         )
100 |         return parser.parse(query_text)
101 | 
102 |     def _build_document_query(self, query_text, field_config, filetypes):
103 |         weights = field_config["weights"].copy()
104 |         weights.update({
105 |             "filename": 1.5,
106 |             "filetype": 1.0
107 |         })
108 | 
109 |         parser = MultifieldParser(
110 |             field_config["fields"] + ["filename", "filetype"],
111 |             schema=self.searcher.schema,
112 |             fieldboosts=weights
113 |         )
114 | 
115 |         base_query = parser.parse(query_text)
116 | 
117 |         # 如果用户没有选择文件类型，就使用所有支持的类型
118 |         if not filetypes:
119 |             filetypes = self.SUPPORTED_FILETYPES
120 | 
121 |         # 构建文件类型过滤器
122 |         filetype_filter = Or([Term("filetype", ft.lower()) for ft in filetypes])
123 |         return base_query & filetype_filter
124 | 
125 |     def _build_phrase_query(self, query_text, field_config):
126 |         """
127 |         构建短语查询 - 要求精确匹配完整短语
128 |         """
129 |         from whoosh.query import And, Term, Phrase
130 |         from jieba.analyse import ChineseAnalyzer
131 |         # 使用中文分析器进行分词
132 |         analyzer = ChineseAnalyzer()
133 |         terms = [token.text for token in analyzer(query_text)]
134 | 
135 |         # 如果短语只有一个词，使用 Term 查询
136 |         if len(terms) == 1:
137 |             return Or([Term(field, query_text) for field in field_config["fields"]])
138 | 
139 |         # 对每个搜索字段构建短语查询
140 |         phrase_queries = []
141 |         for field in field_config["fields"]:
142 |             # 使用 Phrase 查询，slop=0 表示词必须严格相邻
143 |             phrase_queries.append(
144 |                 Phrase(field, terms, slop=0)
145 |             )
146 | 
147 |         # 使用 Or 组合所有字段的查询
148 |         final_query = Or(phrase_queries)
149 | 
150 |         print(f"构建的短语查询: {final_query}")  # 调试输出
151 |         return final_query
152 | 
153 |     def _build_wildcard_query(self, query_text, field_config):
154 |         """
155 |         构建通配符查询:
156 |         ? - 匹配单个字符
157 |         * - 匹配零个或多个字符
158 |         """
159 |         from whoosh.query import Or, Wildcard
160 | 
161 |         def process_query(query):
162 |             # 处理中文通配符
163 |             query = query.replace('？', '?')
164 |             query = query.replace('＊', '*')
165 | 
166 |             # 关键修改：确保通配符能正确匹配中文
167 |             # 如果查询以*结尾，保持原样；如果不以*结尾且包含*，在*后添加*以匹配任意字符
168 |             if '*' in query and not query.endswith('*'):
169 |                 parts = query.split('*')
170 |                 query = '*'.join(parts[:-1]) + '*' + parts[-1] + '*'
171 |             elif not '*' in query and not '?' in query:
172 |                 query = query + '*'
173 | 
174 |             return query
175 | 
176 |         def validate_query(query):
177 |             # 验证通配符使用是否合法
178 |             if not any(char in query for char in ['?', '*']):
179 |                 return False
180 |             # 不允许只有通配符的查询
181 |             if query.strip('*?') == '':
182 |                 return False
183 |             return True
184 | 
185 |         queries = []
186 |         fields = field_config["fields"]
187 | 
188 |         # 处理查询文本
189 |         processed_query = process_query(query_text)
190 |         print(f"处理后的通配符查询: {processed_query}")  # 调试输出
191 | 
192 |         # 验证查询的合法性
193 |         if not validate_query(processed_query):
194 |             print(f"无效的通配符查询: {query_text}")
195 |             return None
196 | 
197 |         # 为每个搜索字段创建通配符查询
198 |         for field in fields:
199 |             wildcard = Wildcard(field, processed_query)
200 |             queries.append(wildcard)
201 | 
202 |         # 组合所有字段的查询
203 |         final_query = Or(queries) if len(queries) > 1 else queries[0]
204 |         print(f"最终通配符查询: {final_query}")  # 调试输出
205 |         return final_query


--------------------------------------------------------------------------------
/search/personalization.py:
--------------------------------------------------------------------------------
  1 | # search/personalization.py
  2 | import math
  3 | class SearchPersonalization:
  4 |     """搜索结果个性化处理类"""
  5 | 
  6 |     # 学院相关性映射表
  7 |     COLLEGE_RELATIONS = {
  8 |         '文学院': ['新闻与传播学院', '汉语言文化学院', '外国语学院'],
  9 |         '历史学院': ['文学院', '哲学院', '周恩来政府管理学院'],
 10 |         '物理科学学院': ['电子信息与光学工程学院', '材料科学与工程学院'],
 11 |         '化学学院': ['材料科学与工程学院', '生命科学学院', '医学院', '药学院'],
 12 |         '生命科学学院': ['化学学院', '医学院', '药学院', '环境科学与工程学院'],
 13 |         '计算机与网络空间安全学院': ['软件学院', '人工智能学院', '数学科学学院'],
 14 |         '计算机学院': ['软件学院', '人工智能学院', '数学科学学院'],  # 兼容简称
 15 |         '网络空间安全学院': ['计算机学院', '软件学院', '数学科学学院'],  # 兼容分拆名称
 16 |         '数学科学学院': ['统计与数据科学学院', '计算机学院', '人工智能学院'],
 17 |         '经济学院': ['商学院', '金融学院', '统计与数据科学学院'],
 18 |         '商学院': ['经济学院', '金融学院', '旅游与服务学院'],
 19 |         '医学院': ['生命科学学院', '药学院'],
 20 |         '周恩来政府管理学院': ['法学院', '马克思主义学院', '历史学院']
 21 |     }
 22 | 
 23 |     def __init__(self, user_profile=None):
 24 |         self.user_profile = user_profile
 25 | 
 26 |     def personalize_results(self, results, sort_by='relevance'):
 27 |         """
 28 |         根据用户身份和排序偏好个性化搜索结果
 29 |         Args:
 30 |             results: 原始搜索结果列表
 31 |             sort_by: 排序方式 ('relevance' 或 'time')
 32 |         Returns:
 33 |             调整后的搜索结果列表
 34 |         """
 35 |         if not self.user_profile:
 36 |             return results  # 未登录用户返回原始结果
 37 | 
 38 |         # 获取用户角色和学院信息
 39 |         role = self.user_profile.get('role', '未设置')
 40 |         college = self.user_profile.get('college', '未设置')
 41 |         # 获取相关学院列表
 42 |         related_colleges = self._get_related_colleges(college)
 43 | 
 44 |         # 将所有结果转换为(得分,hit)元组列表
 45 |         result_list = []
 46 |         for hit in results:
 47 |             try:
 48 |                 # 获取或设置基础得分
 49 |                 base_score = hit.score if hasattr(hit, 'score') else 1.0
 50 | 
 51 |                 # 安全地获取文档内容
 52 |                 content = ''
 53 |                 # 尝试从不同可能的字段获取内容
 54 |                 content_fields = ['title', 'content', 'text']
 55 |                 for field in content_fields:
 56 |                     if hasattr(hit, field):
 57 |                         content += str(getattr(hit, field, '')) + ' '
 58 |                     elif hasattr(hit, 'get'):
 59 |                         content += str(hit.get(field, '')) + ' '
 60 |                 content = content.lower()
 61 | 
 62 |                 # 计算boost因子
 63 |                 boost = self._calculate_boost(content, role, college, related_colleges)
 64 | 
 65 |                 # 计算最终得分
 66 |                 #下面这个算出来有问题，我直接用权重代表final，实验报告用这个
 67 |                 final_score = boost*(1+0.019*base_score)
 68 |                 print(base_score)
 69 |                 print(boost)
 70 |                 print("final_score的值为:", final_score)
 71 |                 # 存储元组: (最终得分, 时间戳或默认值, 原始对象)
 72 |                 timestamp = None
 73 |                 if hasattr(hit, 'publish_date'):
 74 |                     timestamp = getattr(hit, 'publish_date')
 75 |                 elif hasattr(hit, 'get'):
 76 |                     timestamp = hit.get('publish_date')
 77 | 
 78 |                 result_list.append((final_score, timestamp, hit))
 79 | 
 80 |             except Exception as e:
 81 |                 print(f"处理结果时出错: {str(e)}")
 82 |                 result_list.append((base_score, None, hit))
 83 | 
 84 |         # 根据排序方式排序
 85 |         if sort_by == 'time':
 86 |             # 先按时间排序，时间相同的按分数排序
 87 |             sorted_results = sorted(
 88 |                 result_list,
 89 |                 key=lambda x: (x[1] or '', -x[0]),  # 使用空字符串作为默认时间戳
 90 |                 reverse=True
 91 |             )
 92 |         else:
 93 |             # 按分数排序
 94 |             sorted_results = sorted(
 95 |                 result_list,
 96 |                 key=lambda x: x[0],  # 使用最终得分排序
 97 |                 reverse=True
 98 |             )
 99 | 
100 |         # 只返回原始对象列表
101 |         return [item[2] for item in sorted_results]
102 | 
103 |     def _calculate_boost(self, content, role, college, related_colleges):
104 |         """计算搜索结果的权重提升"""
105 |         boost = 1.0
106 |         boost_reasons = []  # 用于记录加分原因
107 | 
108 |         print(f"\n分析文档: {content[:200]}...")
109 |         print(f"用户角色: {role}, 学院: {college}")
110 | 
111 |         # 1. 基于角色的内容提升
112 |         if role == '教师':
113 |             if any(tag in content.lower() for tag in ['学术', '科研', '教学', '实验室', '课题']):
114 |                 boost *= 1.3
115 |                 boost_reasons.append("教师-学术内容匹配: +30%")
116 |             if any(tag in content.lower() for tag in ['教务', '师资', '课程']):
117 |                 boost *= 1.2
118 |                 boost_reasons.append("教师-教务内容匹配: +20%")
119 |         elif role in ['本科生', '研究生', '博士生']:
120 |             if any(tag in content.lower() for tag in ['学生', '教务', '活动', '奖学金']):
121 |                 boost *= 1.2
122 |                 boost_reasons.append("学生相关内容匹配: +20%")
123 |             if any(tag in content.lower() for tag in ['就业', '实习', '竞赛', '夜跑', '社团', '活动']):
124 |                 boost *= 1.15
125 |                 boost_reasons.append("学生活动内容匹配: +15%")
126 | 
127 |         # 2. 学院相关性判断
128 |         if college != '未设置':
129 |             # 规范化处理内容和学院名称
130 |             normalized_content = content.lower()
131 |             normalized_college = college.lower()
132 | 
133 |             # 检查文档中是否包含学院名称（包括变体形式）
134 |             college_variations = {
135 |                 '计算机与网络空间安全学院': ['计算机学院', '网安学院', '计算机与网安学院', '网络空间安全学院'],
136 |                 '文学院': ['文学院', '中文系', '汉语言'],
137 |                 '商学院': ['商学院', 'MBA', '工商管理'],
138 |                 '医学院': ['医学院', '附属医院', '临床医学'],
139 |                 '生命科学学院': ['生科院', '生命学院', '生物学院'],
140 |                 '物理科学学院': ['物理学院', '物理系'],
141 |                 '化学学院': ['化学院', '化学系'],
142 |                 '数学科学学院': ['数学院', '数学系'],
143 |                 '经济学院': ['经济系', '经济管理']
144 |             }
145 | 
146 |             college_matched = False
147 |             # 检查完整学院名称
148 |             if college.lower() in normalized_content:
149 |                 boost *= 1.4
150 |                 college_matched = True
151 |                 boost_reasons.append(f"完整学院名称匹配({college}): +40%")
152 | 
153 |             # 检查学院变体
154 |             if not college_matched:
155 |                 variations = college_variations.get(college, [])
156 |                 for variation in variations:
157 |                     if variation.lower() in normalized_content:
158 |                         boost *= 1.3
159 |                         college_matched = True
160 |                         boost_reasons.append(f"学院变体名称匹配({variation}): +30%")
161 |                         break
162 | 
163 |             # 检查学院关键词
164 |             if not college_matched:
165 |                 keywords = self._get_college_context_keywords(college)
166 |                 matched_keywords = [kw for kw in keywords if kw.lower() in normalized_content]
167 |                 if matched_keywords:
168 |                     keyword_boost = 1.1 + min(len(matched_keywords) * 0.05, 0.3)
169 |                     boost *= keyword_boost
170 |                     boost_reasons.append(
171 |                         f"学院关键词匹配({', '.join(matched_keywords)}): +{(keyword_boost - 1) * 100:.0f}%")
172 | 
173 |             # 检查相关学院
174 |             for related_college in related_colleges:
175 |                 if related_college.lower() in normalized_content:
176 |                     boost *= 1.15
177 |                     boost_reasons.append(f"相关学院匹配({related_college}): +15%")
178 |                     break
179 | 
180 |             # 检查活动类型和学院组合
181 |             activity_keywords = ['活动', '比赛', '夜跑', '讲座', '社团']
182 |             if any(kw in normalized_content for kw in activity_keywords):
183 |                 if college_matched:
184 |                     boost *= 1.25
185 |                     boost_reasons.append("本院活动加分: +25%")
186 |                 elif any(related in normalized_content for related in related_colleges):
187 |                     boost *= 1.1
188 |                     boost_reasons.append("相关学院活动加分: +10%")
189 |         #     # 添加PageRank影响
190 |         # try:
191 |         #     # 直接获取pagerank属性
192 |         #     pagerank = getattr(content, 'pagerank', 0)
193 |         #     if pagerank > 0:
194 |         #         # 使用很小的系数确保PageRank不会过度影响排序
195 |         #         pr_boost = 1 + 0.05 * math.log1p(pagerank)
196 |         #         boost *= pr_boost
197 |         #         boost_reasons.append(f"PageRank boost: +{((pr_boost - 1) * 100):.2f}%")
198 |         # except (AttributeError, ValueError) as e:
199 |         #     # 如果无法获取或转换PageRank值，直接忽略
200 |         #     pass
201 |         # 打印加分详情
202 |         print("\n加分详情:")
203 |         for reason in boost_reasons:
204 |             print(f"- {reason}")
205 |         print(f"最终权重系数: {boost:.2f}\n")
206 | 
207 |         return boost
208 | 
209 |     def _get_related_colleges(self, college):
210 |         """获取与用户学院相关的其他学院列表"""
211 |         if college == '未设置':
212 |             return []
213 | 
214 |         # 处理学院名称的不同形式
215 |         college_variants = {
216 |             '计算机与网络空间安全学院': ['计算机学院', '网络空间安全学院', '信息科学学院'],
217 |             '计算机学院': ['计算机与网络空间安全学院', '软件学院', '信息科学学院'],
218 |             '文学院': ['新闻学院', '外国语学院', '汉语言文化学院'],
219 |             '物理科学学院': ['物理学院', '光学工程学院'],
220 |             '化学学院': ['化学系', '材料学院'],
221 |             '医学院': ['生命科学院', '药学院'],
222 |             '商学院': ['经济学院', '管理学院']
223 |         }
224 | 
225 |         # 获取基础相关学院
226 |         related = self.COLLEGE_RELATIONS.get(college, [])
227 | 
228 |         # 添加变体形式
229 |         variants = college_variants.get(college, [])
230 | 
231 |         # 合并所有相关学院，去重
232 |         all_related = list(set(related + variants))
233 | 
234 |         return all_related
235 | 
236 |     def _get_college_context_keywords(self, college):
237 |         """获取学院相关的上下文关键词"""
238 |         COLLEGE_KEYWORDS = {
239 |             '计算机与网络空间安全学院': [
240 |                 # 专业术语
241 |                 '编程', '算法', '软件', '人工智能', '网络',
242 |                 '网络安全', '信息安全', '密码学', '渗透测试',
243 |                 # 场地
244 |                 '实验室', '机房', '创新实践基地',
245 |                 # 活动
246 |                 '程序设计大赛', '编程竞赛', 'ACM', '网络安全竞赛',
247 |                 # 学科
248 |                 '计算机科学', '软件工程', '网络工程', '信息安全',
249 |             ],
250 |             '文学院': [
251 |                 # 专业术语
252 |                 '文学', '写作', '语言', '文化', '古籍',
253 |                 # 场地
254 |                 '图书馆', '文学社', '创作室',
255 |                 # 活动
256 |                 '诗歌朗诵', '读书会', '文学讲座', '创作比赛',
257 |                 # 学科
258 |                 '中国语言文学', '汉语言', '文艺学', '比较文学'
259 |             ],
260 |             '物理科学学院': [
261 |                 '物理', '光学', '量子', '实验室', '力学',
262 |                 '电磁学', '热学', '光电', '激光'
263 |             ],
264 |             '化学学院': [
265 |                 '化学', '分子', '实验', '材料', '有机化学',
266 |                 '无机化学', '分析化学', '物理化学'
267 |             ],
268 |             '经济学院': [
269 |                 '经济', '金融', '贸易', '市场', '投资',
270 |                 '统计', '财务', '商业', '管理'
271 |             ],
272 |             '医学院': [
273 |                 '医学', '临床', '病理', '解剖', '生理',
274 |                 '药理', '诊断', '治疗', '护理'
275 |             ]
276 |         }
277 | 
278 |         # 通用关键词
279 |         base_keywords = ['科研', '实验室', '研究', '项目', '讲座', '活动']
280 | 
281 |         # 获取特定学院的关键词，如果没有则使用空列表
282 |         college_specific = COLLEGE_KEYWORDS.get(college, [])
283 | 
284 |         # 合并特定关键词和通用关键词
285 |         return college_specific + base_keywords


--------------------------------------------------------------------------------
/search/processor.py:
--------------------------------------------------------------------------------
  1 | # search/processor.py
  2 | import math
  3 | 
  4 | class ResultProcessor:
  5 |     def __init__(self, results_per_page=10):
  6 |         self.RESULTS_PER_PAGE = results_per_page
  7 |         # #定义要排除的URL列表，短语查询时开启
  8 |         # self.EXCLUDED_URLS = [
  9 |         #     # 在这里添加更多需要排除的URL
 10 |         # ]
 11 |     def process_results(self, results, page=1):
 12 |         """处理搜索结果并应用分页"""
 13 |         # 过滤掉不想显示的URL
 14 |         # filtered_results = [hit for hit in results if hit.get('url') not in self.EXCLUDED_URLS]
 15 |         # total_results = len(filtered_results)
 16 |         #正常查询注释掉上面两句，恢复下面这一句
 17 | 
 18 |         total_results = len(results)
 19 |         total_pages = math.ceil(total_results / self.RESULTS_PER_PAGE)
 20 | 
 21 |         # 计算分页
 22 |         start_page = max(1, page - 5)
 23 |         end_page = min(total_pages, start_page + 9)
 24 |         if end_page - start_page < 9:
 25 |             start_page = max(1, end_page - 9)
 26 | 
 27 |         # 获取当前页的结果
 28 |         start_idx = (page - 1) * self.RESULTS_PER_PAGE
 29 |         end_idx = start_idx + self.RESULTS_PER_PAGE
 30 |         page_results = results[start_idx:end_idx]
 31 |         # 正常查询恢复上面这句，注释下面这一句
 32 |         #page_results = filtered_results[start_idx:end_idx]  # 这里使用filtered_results
 33 | 
 34 |         # 处理结果
 35 |         processed_results = [self._process_single_result(hit) for hit in page_results]
 36 | 
 37 |         return {
 38 |             'results': processed_results,
 39 |             'total': total_results,
 40 |             'total_pages': total_pages,
 41 |             'page_range': range(start_page, end_page + 1)
 42 |         }
 43 | 
 44 |     def _process_single_result(self, hit):
 45 |         """处理单个搜索结果"""
 46 |         # 如果是文档类型，使用特殊的处理方式，不需要处理 content
 47 |         if hit.get('filetype'):
 48 |             return {
 49 |                 'title': hit.get('title', '无标题'),
 50 |                 'filename': hit.get('filename', '未知文件名'),
 51 |                 'filetype': hit.get('filetype', '未知类型'),
 52 |                 'upload_date': hit.get('upload_date', None),
 53 |                 'url': hit.get('url', '#'),  # 如果有文档链接的话
 54 |                 'snippet': None,  # 文档不显示内容片段
 55 |                 'source': '',
 56 |                 'date': '',
 57 |                 'sort_date': '',
 58 |                 'snapshot_hash': None,
 59 |                 'snapshot_date': None
 60 |             }
 61 | 
 62 |         source = hit.get('source', '')
 63 |         date_str = source.split(' - ')[-1] if source else ''
 64 |         sort_date = self._process_date(date_str)
 65 | 
 66 |         # 特别处理通配符查询的结果
 67 |         content = hit.get('content', '')
 68 |         highlighted_content = hit.highlights("content")
 69 | 
 70 |         if hit.matched_terms():  # 获取匹配的词条
 71 |             # 将匹配的词条以及周围的文本包含在snippet中
 72 |             snippet = highlighted_content if highlighted_content else content[:200]
 73 |         else:
 74 |             snippet = content[:200]
 75 | 
 76 |         # 从索引中获取快照哈希值和捕获时间
 77 |         snapshot_hash = hit.get('snapshot_hash')  # 这个字段在索引中已存储
 78 |         captured_at = hit.get('captured_at')  # 这个字段在索引中已存储
 79 | 
 80 |         # 格式化快照捕获时间
 81 |         snapshot_date = None
 82 |         if captured_at:
 83 |             try:
 84 |                 snapshot_date = captured_at.strftime('%Y/%m/%d')
 85 |             except:
 86 |                 snapshot_date = None
 87 | 
 88 |         return {
 89 |             'title': hit.highlights("title") or hit.get('title', '无标题'),
 90 |             'url': hit.get('url', '#'),
 91 |             'snippet': snippet,
 92 |             'source': hit.get('source', None),
 93 |             'date': hit.get('publish_date', None),
 94 |             'sort_date': sort_date,
 95 |             'filetype': hit.get('filetype', None),
 96 |             'filename': hit.get('filename', None),
 97 |             'snapshot_hash': snapshot_hash,  # 这个hash用于在数据库中查找对应的快照
 98 |             'snapshot_date': snapshot_date   # 显示的快照日期
 99 |         }
100 | 
101 |     def _process_date(self, date_str):
102 |         """处理日期格式"""
103 |         if not date_str:
104 |             return ''
105 |         try:
106 |             parts = date_str.split('-')
107 |             return f"{parts[0]}-{parts[1].zfill(2)}-{parts[2].zfill(2)}"
108 |         except:
109 |             return ''


--------------------------------------------------------------------------------
/static/css/document.css:
--------------------------------------------------------------------------------
 1 |         .document-result {
 2 |             padding: 10px;
 3 |             border-radius: 4px;
 4 |             background-color: #f8f9fa;
 5 |             margin-bottom: 15px;
 6 |         }
 7 | 
 8 |         .document-result .result-title {
 9 |             font-size: 16px;
10 |             color: #1a0dab;
11 |             text-decoration: none;
12 |             display: block;
13 |             margin-bottom: 8px;
14 |         }
15 | 
16 |         .document-result .result-title:hover {
17 |             text-decoration: underline;
18 |         }
19 | 
20 |         .document-result .result-meta {
21 |             font-size: 13px;
22 |             color: #666;
23 |             margin-bottom: 8px;
24 |         }
25 | 
26 |         .document-result .result-url {
27 |             color: #006621;
28 |             text-decoration: none;
29 |         }
30 | 
31 |         .document-info {
32 |             font-size: 13px;
33 |             color: #666;
34 |             margin: 8px 0;
35 |             line-height: 1.4;
36 |         }
37 | 
38 |         .document-info span {
39 |             margin-right: 15px;
40 |             display: inline-block;
41 |         }
42 | 
43 |         .document-info .file-type {
44 |             color: #28a745;
45 |         }
46 | 
47 |         .document-info .file-size {
48 |             color: #dc3545;
49 |         }
50 | 
51 |         .document-info .upload-date {
52 |             color: #6c757d;
53 |         }
54 | 
55 |         .document-info .filename {
56 |             color: #0056b3;
57 |         }


--------------------------------------------------------------------------------
/static/css/main.css:
--------------------------------------------------------------------------------
 1 |  /* 其他样式保持不变 */
 2 |         .logo {
 3 |             font-size: 72px;
 4 |             font-weight: bold;
 5 |             margin-bottom: 30px;
 6 |             cursor: default;
 7 |         }
 8 |         /* 修改logo样式，每个字母不同颜色 */
 9 |         .logo span:nth-child(1) { color: #4285f4; } /* A */
10 |         .logo span:nth-child(2) { color: #ea4335; } /* L */
11 |         .logo span:nth-child(3) { color: #fbbc05; } /* L */
12 |         .logo span:nth-child(4) { color: #4285f4; } /* I */
13 |         .logo span:nth-child(5) { color: #34a853; } /* N */
14 |         .logo span:nth-child(6) { color: #ea4335; } /* K */
15 |         .logo span:nth-child(7) { color: #fbbc05; } /* U */
16 | 
17 | 


--------------------------------------------------------------------------------
/static/css/pagination.css:
--------------------------------------------------------------------------------
 1 | 
 2 |         /* 分页样式 */
 3 |         .pagination {
 4 |             margin-top: 20px;
 5 |             text-align: center;
 6 |             font-size: 14px;
 7 |         }
 8 | 
 9 |         .pagination a, .pagination span {
10 |             display: inline-block;
11 |             padding: 8px 12px;
12 |             margin: 0 4px;
13 |             color: #1a0dab;
14 |             text-decoration: none;
15 |             border-radius: 3px;
16 |         }
17 | 
18 |         .pagination .current-page {
19 |             background-color: #f8f9fa;
20 |             color: #000;
21 |             font-weight: bold;
22 |         }
23 | 
24 |         .pagination a:hover {
25 |             background-color: #f8f9fa;
26 |         }
27 | 
28 |         .page-nav {
29 |             color: #1a0dab;
30 |         }
31 | 
32 |         .page-number {
33 |             color: #1a0dab;
34 |         }


--------------------------------------------------------------------------------
/static/css/results.css:
--------------------------------------------------------------------------------
  1 | .results {
  2 |     max-width: 720px;
  3 |     margin: 0 auto;
  4 | }
  5 | 
  6 | .search-stats {
  7 |     color: #70757a;
  8 |     font-size: 14px;
  9 |     margin-bottom: 20px;
 10 |     padding: 0 20px;
 11 | }
 12 | 
 13 | /* 单个结果项 */
 14 | .result-item {
 15 |     max-width: 670px;
 16 |     margin-bottom: 25px;
 17 |     padding: 15px 20px;
 18 |     border-radius: 4px;
 19 |     background-color: #fff;
 20 |     box-shadow: 0 1px 3px rgba(0,0,0,0.1);
 21 | }
 22 | 
 23 | /* 标题链接 */
 24 | .result-title {
 25 |     color: #1a0dab;
 26 |     font-size: 18px;
 27 |     text-decoration: none;
 28 |     display: block;
 29 |     margin-bottom: 4px;
 30 | }
 31 | 
 32 | .result-title:hover {
 33 |     text-decoration: underline;
 34 | }
 35 | 
 36 | /* 标题高亮样式 */
 37 | .result-title em {
 38 |     font-weight: bold;
 39 |     font-style: normal;
 40 |     color: #1a0dab;
 41 |     background-color: transparent;
 42 |     text-decoration: none;
 43 | }
 44 | 
 45 | .result-title:hover em {
 46 |     text-decoration: underline;
 47 | }
 48 | 
 49 | /* 结果元信息区域 */
 50 | .result-meta {
 51 |     margin: 4px 0;
 52 |     font-size: 14px;
 53 |     color: #006621;
 54 |     display: flex;
 55 |     align-items: center;
 56 |     gap: 10px;
 57 | }
 58 | 
 59 | /* URL显示 */
 60 | .result-url {
 61 |     color: #006621;
 62 |     text-decoration: none;
 63 | }
 64 | 
 65 | /* 快照链接 */
 66 | .snapshot-link {
 67 |     color: #1a73e8;
 68 |     text-decoration: none;
 69 |     font-size: 13px;
 70 | }
 71 | 
 72 | .snapshot-link:hover {
 73 |     text-decoration: underline;
 74 | }
 75 | 
 76 | /* 内容摘要 */
 77 | .result-snippet {
 78 |     color: #3c4043;
 79 |     font-size: 14px;
 80 |     line-height: 1.58;
 81 |     margin: 4px 0;
 82 | }
 83 | 
 84 | /* 高亮匹配词 */
 85 | .result-snippet em {
 86 |     font-weight: bold;
 87 |     font-style: normal;
 88 |     background-color: #ffffd0;
 89 | }
 90 | 
 91 | /* 结果底部信息 */
 92 | .result-footer {
 93 |     margin-top: 4px;
 94 |     font-size: 13px;
 95 |     color: #70757a;
 96 | }
 97 | 
 98 | .result-source, 
 99 | .result-date {
100 |     margin-right: 10px;
101 | }
102 | 
103 | /* 无结果提示 */
104 | .no-results {
105 |     text-align: center;
106 |     color: #70757a;
107 |     margin-top: 40px;
108 |     padding: 20px;
109 | }
110 | 
111 | /* 响应式调整 */
112 | @media (max-width: 768px) {
113 |     .results {
114 |         padding: 0 15px;
115 |     }
116 |     
117 |     .result-item {
118 |         padding: 12px 15px;
119 |     }
120 | }


--------------------------------------------------------------------------------
/static/css/search.css:
--------------------------------------------------------------------------------
  1 | /* 搜索容器样式 */
  2 | .search-container {
  3 |     display: flex;
  4 |     flex-direction: column;
  5 |     align-items: center;
  6 |     margin-top: 170px; /* 调整上边距，为顶部选项栏和用户状态栏留出空间 */
  7 |     padding: 20px;
  8 |     position: relative; /* 添加这行 */
  9 | }
 10 | 
 11 | /* 搜索选项栏样式 */
 12 | /* 修改搜索选项栏样式 */
 13 | .search-options-bar {
 14 |     width: 100%;
 15 |     background-color: #f8f9fa;
 16 |     border-bottom: 1px solid #dfe1e5;
 17 |     padding: 10px 0;
 18 |     position: fixed;
 19 |     top: 0px; /* 调整顶部位置 */
 20 |     left: 0;
 21 |     z-index: 100;
 22 | }
 23 | 
 24 | .options-container {
 25 |     max-width: 750px; /* 与搜索框宽度保持一致 */
 26 |     margin: 0 auto;
 27 |     padding: 0 20px;
 28 |     display: flex;
 29 |     align-items: center;
 30 |     justify-content: center; /* 添加居中对齐 */
 31 |     gap: 15px;
 32 | }
 33 | 
 34 | /* 选项区域的下拉框样式 */
 35 | .options-container select {
 36 |     height: 36px;
 37 |     padding: 0 10px;
 38 |     border: 1px solid #dfe1e5;
 39 |     border-radius: 4px;
 40 |     background-color: white;
 41 |     color: #3c4043;
 42 |     font-size: 14px;
 43 |     cursor: pointer;
 44 | }
 45 | 
 46 | /* 文档类型选项样式 */
 47 | #fileTypeOptions {
 48 |     display: flex;
 49 |     align-items: center;
 50 |     gap: 10px;
 51 |     padding: 10px;
 52 |     border: 1px solid #eee;
 53 |     border-radius: 4px;
 54 |     font-size: 14px;
 55 | }
 56 | 
 57 | /* 主搜索表单样式 */
 58 | .main-search-form {
 59 |     width: 100%;
 60 |     max-width: 750px;
 61 |     margin: 0 auto;
 62 | }
 63 | 
 64 | .main-search-input {
 65 |     display: flex;
 66 |     gap: 10px;
 67 |     width: 100%;
 68 |     align-items: center;
 69 |     position: relative; /* 添加这行 */
 70 | }
 71 | 
 72 | /* 主搜索框样式 */
 73 | .main-search-input input {
 74 |     flex: 1;
 75 |     height: 44px;
 76 |     padding: 0 20px;
 77 |     font-size: 16px;
 78 |     border: 1px solid #dfe1e5;
 79 |     border-radius: 24px;
 80 |     outline: none;
 81 |     box-shadow: 0 1px 6px rgba(32,33,36,.28);
 82 | }
 83 | 
 84 | /* 搜索框悬停和焦点效果 */
 85 | .main-search-input input:hover,
 86 | .main-search-input input:focus {
 87 |     box-shadow: 0 1px 6px rgba(32,33,36,.28);
 88 |     border-color: rgba(223,225,229,0);
 89 | }
 90 | 
 91 | /* 搜索按钮样式 */
 92 | .main-search-input button {
 93 |     height: 44px;
 94 |     padding: 0 30px;
 95 |     background-color: #1a73e8;
 96 |     color: white;
 97 |     border: none;
 98 |     border-radius: 24px;
 99 |     font-size: 16px;
100 |     cursor: pointer;
101 |     transition: all 0.2s;
102 | }
103 | 
104 | /* 搜索按钮悬停效果 */
105 | .main-search-input button:hover {
106 |     background-color: #1557b0;
107 |     box-shadow: 0 1px 2px 0 rgba(66,133,244,0.3),
108 |                 0 1px 3px 1px rgba(66,133,244,0.15);
109 | }
110 | 
111 | /* 搜索提示样式 */
112 | .search-tips {
113 |     max-width: 600px;
114 |     margin: 15px auto 0;
115 |     text-align: center;
116 |     padding: 10px;
117 |     background: #f5f5f5;
118 |     border-radius: 4px;
119 | }
120 | 
121 | .search-tip {
122 |     margin: 5px 0;
123 |     color: #70757a;
124 |     font-size: 14px;
125 | }
126 | 
127 | /* 结果统计样式 */
128 | .search-stats {
129 |     color: #70757a;
130 |     font-size: 14px;
131 |     margin-bottom: 20px;
132 |     padding-left: 20px;
133 | }
134 | 
135 | /* 响应式布局调整 */
136 | @media (max-width: 768px) {
137 |     .options-container {
138 |         flex-direction: column;
139 |         align-items: stretch;
140 |         padding: 10px;
141 |     }
142 | 
143 |     .main-search-input {
144 |         flex-direction: column;
145 |         gap: 10px;
146 |     }
147 | 
148 |     .main-search-input button {
149 |         width: 100%;
150 |     }
151 | 
152 |     .search-container {
153 |         margin-top: 200px;
154 |     }
155 | 
156 |     #fileTypeOptions {
157 |         flex-wrap: wrap;
158 |         justify-content: center;
159 |     }
160 | }


--------------------------------------------------------------------------------
/static/css/search_history.css:
--------------------------------------------------------------------------------
 1 | .search-history {
 2 |     position: absolute;
 3 |     top: 100%;  /* 紧贴搜索框底部 */
 4 |     left: 0;
 5 |     right: 0;
 6 |     background: white;
 7 |     border: 1px solid #e0e0e0;
 8 |     border-radius: 0 0 8px 8px;
 9 |     box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1);
10 |     z-index: 9999;  /* 确保在最上层 */
11 |     margin-top: 0;  /* 移除间距 */
12 |     width: calc(100% - 110px); /* 减去搜索按钮的宽度和间距 */
13 | }
14 | 
15 | .search-history-header {
16 |     padding: 10px 15px;
17 |     border-bottom: 1px solid #e0e0e0;
18 |     display: flex;
19 |     justify-content: space-between;
20 |     align-items: center;
21 |     color: #666;
22 | }
23 | 
24 | .search-history-list {
25 |     list-style: none;
26 |     margin: 0;
27 |     padding: 0;
28 |     max-height: 300px;
29 |     overflow-y: auto;
30 |     background: white; /* 确保背景是白色的 */
31 | }
32 | 
33 | .search-history-item {
34 |     padding: 8px 15px;
35 |     display: flex;
36 |     justify-content: space-between;
37 |     align-items: center;
38 |     cursor: pointer;
39 | }
40 | 
41 | .search-history-item:hover {
42 |     background-color: #f5f5f5;
43 | }
44 | 
45 | .search-history-query {
46 |     flex-grow: 1;
47 |     color: #333;
48 | }
49 | 
50 | .delete-history {
51 |     color: #999;
52 |     padding: 4px 8px;
53 |     visibility: hidden;
54 | }
55 | 
56 | .search-history-item:hover .delete-history {
57 |     visibility: visible;
58 | }
59 | 
60 | .search-history-footer {
61 |     padding: 10px 15px;
62 |     border-top: 1px solid #e0e0e0;
63 |     text-align: center;
64 | }
65 | 
66 | .view-more {
67 |     color: #1a73e8;
68 |     text-decoration: none;
69 | }
70 | 
71 | .clear-all {
72 |     color: #666;
73 |     text-decoration: none;
74 |     font-size: 0.9em;
75 | }
76 | 
77 | .clear-all:hover, .view-more:hover {
78 |     text-decoration: underline;
79 | }


--------------------------------------------------------------------------------
/static/css/search_suggestions.css:
--------------------------------------------------------------------------------
 1 | .search-suggestions {
 2 |     position: absolute;
 3 |     top: 100%;
 4 |     left: 0;
 5 |     right: 0;
 6 |     background: white;
 7 |     border: 1px solid #e0e0e0;
 8 |     border-radius: 0 0 4px 4px;
 9 |     box-shadow: 0 2px 4px rgba(0,0,0,0.1);
10 |     z-index: 1000;
11 |     max-height: 300px;
12 |     overflow-y: auto;
13 | }
14 | 
15 | .suggestion-item {
16 |     padding: 8px 16px;
17 |     cursor: pointer;
18 |     border-bottom: 1px solid #f0f0f0;
19 |     white-space: nowrap;
20 |     overflow: hidden;
21 |     text-overflow: ellipsis;
22 |     display: flex;
23 |     align-items: center;
24 |     gap: 8px;
25 | }
26 | 
27 | .suggestion-item:last-child {
28 |     border-bottom: none;
29 | }
30 | 
31 | .suggestion-item:hover {
32 |     background-color: #f5f5f5;
33 | }
34 | 
35 | .suggestion-item .highlight {
36 |     font-weight: bold;
37 | }
38 | 
39 | .suggestion-item .icon {
40 |     color: #666;
41 |     font-size: 14px;
42 |     min-width: 20px;
43 | }
44 | 
45 | .suggestion-item.history {
46 |     background-color: #f8f9fa;
47 | }
48 | 
49 | .suggestion-item.history .icon::before {
50 |     content: "⏱";
51 | }
52 | 
53 | .suggestion-item.title .icon::before {
54 |     content: "🔍";
55 | }
56 | 
57 | /* 确保搜索联想框在历史记录之上 */
58 | .search-suggestions {
59 |     z-index: 1001;
60 | }
61 | 
62 | .search-history {
63 |     z-index: 1000;
64 | }
65 | 
66 | /* 优化滚动条样式 */
67 | .search-suggestions::-webkit-scrollbar {
68 |     width: 6px;
69 | }
70 | 
71 | .search-suggestions::-webkit-scrollbar-thumb {
72 |     background-color: #ddd;
73 |     border-radius: 3px;
74 | }
75 | 
76 | .search-suggestions::-webkit-scrollbar-track {
77 |     background-color: #f5f5f5;
78 | }


--------------------------------------------------------------------------------
/static/css/user.css:
--------------------------------------------------------------------------------
  1 | /* 用户相关样式 */
  2 |         /* 用户部分调整到搜索选项栏下方 */
  3 |         .user-section {
  4 |             position: fixed;
  5 |             top: 57px; /* 调整位置到搜索选项栏下方 */
  6 |             right: 20px;
  7 |             display: flex;
  8 |             align-items: center;
  9 |             gap: 10px;
 10 |             z-index: 1001;
 11 |             background-color: white;
 12 |             padding: 5px 10px;
 13 |             border-radius: 4px;
 14 |         }
 15 | 
 16 |         .auth-buttons {
 17 |             display: flex;
 18 |             gap: 10px;
 19 |         }
 20 | 
 21 |         .auth-buttons button {
 22 |             padding: 8px 16px;
 23 |             border: none;
 24 |             border-radius: 4px;
 25 |             cursor: pointer;
 26 |         }
 27 | 
 28 |         .login-btn {
 29 |             background-color: #4285f4;
 30 |             color: white;
 31 |         }
 32 | 
 33 |         .register-btn {
 34 |             background-color: #34a853;
 35 |             color: white;
 36 |         }
 37 | 
 38 |         .user-panel {
 39 |             display: flex;
 40 |             align-items: center;
 41 |             gap: 10px;
 42 |         }
 43 | 
 44 |         .user-avatar {
 45 |             width: 32px;
 46 |             height: 32px;
 47 |             border-radius: 50%;
 48 |             background-color: #ddd;
 49 |             display: flex;
 50 |             align-items: center;
 51 |             justify-content: center;
 52 |         }
 53 | 
 54 |         .dropdown-menu {
 55 |             position: absolute;
 56 |             top: 100%;
 57 |             right: 0;
 58 |             background: white;
 59 |             border: 1px solid #ddd;
 60 |             border-radius: 4px;
 61 |             padding: 8px 0;
 62 |             display: none;
 63 |             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
 64 |         }
 65 | 
 66 |         .dropdown-menu.show {
 67 |             display: block;
 68 |         }
 69 | 
 70 |         .dropdown-menu a {
 71 |             display: block;
 72 |             padding: 8px 16px;
 73 |             color: #333;
 74 |             text-decoration: none;
 75 |         }
 76 | 
 77 |         .dropdown-menu a:hover {
 78 |             background-color: #f5f5f5;
 79 |         }
 80 |                 /* 登录/注册模态框样式 */
 81 |         /* 修改模态框样式 */
 82 |         .modal {
 83 |             display: none;
 84 |             position: fixed;
 85 |             top: 0;
 86 |             left: 0;
 87 |             width: 100%;
 88 |             height: 100%;
 89 |             background-color: rgba(0, 0, 0, 0.5); /* 半透明黑色背景 */
 90 |             z-index: 10000; /* 确保在最上层 */
 91 |             justify-content: center;
 92 |             align-items: center;
 93 |         }
 94 | 
 95 |         .modal.show {
 96 |             display: flex;
 97 |             align-items: center;
 98 |             justify-content: center;
 99 |         }
100 | 
101 |         .modal-content {
102 |             background: white;
103 |             padding: 20px;
104 |             border-radius: 8px;
105 |             width: 100%;
106 |             max-width: 400px;
107 |         }
108 | 
109 |         .modal-header {
110 |             display: flex;
111 |             justify-content: space-between;
112 |             align-items: center;
113 |             margin-bottom: 20px;
114 |         }
115 | 
116 |         .close-btn {
117 |             background: none;
118 |             border: none;
119 |             font-size: 20px;
120 |             cursor: pointer;
121 |         }
122 | 
123 |         .form-group {
124 |             margin-bottom: 16px;
125 |         }
126 | 
127 |         .form-group label {
128 |             display: block;
129 |             margin-bottom: 8px;
130 |         }
131 | 
132 |         .form-group input {
133 |             width: 100%;
134 |             padding: 8px;
135 |             border: 1px solid #ddd;
136 |             border-radius: 4px;
137 |         }
138 | 
139 |         .submit-btn {
140 |             width: 100%;
141 |             padding: 10px;
142 |             background-color: #4285f4;
143 |             color: white;
144 |             border: none;
145 |             border-radius: 4px;
146 |             cursor: pointer;
147 |         }
148 |         /* Flash消息样式 */
149 |         .flash-messages {
150 |             position: fixed;
151 |             top: 60px;  /* 原来是20px，调整到60px，让它在搜索选项栏下方 */
152 |             left: 50%;
153 |             transform: translateX(-50%);
154 |             z-index: 1002;
155 |         }
156 | 
157 |         .flash-message {
158 |             padding: 10px 20px;
159 |             margin-bottom: 10px;
160 |             background-color: #f8d7da;
161 |             border: 1px solid #f5c6cb;
162 |             border-radius: 4px;
163 |             color: #721c24;
164 |             text-align: center;
165 |         }


--------------------------------------------------------------------------------
/templates/history.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>搜索历史 - ALLINKU</title>
  6 |     <link rel="stylesheet" href="/static/css/main.css">
  7 |     <style>
  8 |         .history-container {
  9 |             max-width: 800px;
 10 |             margin: 40px auto;
 11 |             padding: 20px;
 12 |         }
 13 | 
 14 |         .history-header {
 15 |             margin-bottom: 30px;
 16 |             display: flex;
 17 |             justify-content: space-between;
 18 |             align-items: center;
 19 |         }
 20 | 
 21 |         .history-list {
 22 |             border: 1px solid #eee;
 23 |             border-radius: 8px;
 24 |             overflow: hidden;
 25 |         }
 26 | 
 27 |         .history-item {
 28 |             padding: 15px 20px;
 29 |             border-bottom: 1px solid #eee;
 30 |             background: white;
 31 |             transition: background-color 0.2s;
 32 |         }
 33 | 
 34 |         .history-item:last-child {
 35 |             border-bottom: none;
 36 |         }
 37 | 
 38 |         .history-item:hover {
 39 |             background-color: #f8f9fa;
 40 |         }
 41 | 
 42 |         .query-text {
 43 |             color: #1a73e8;
 44 |             font-size: 16px;
 45 |             margin-bottom: 5px;
 46 |         }
 47 | 
 48 |         .search-meta {
 49 |             color: #666;
 50 |             font-size: 14px;
 51 |         }
 52 | 
 53 |         .search-time {
 54 |             color: #888;
 55 |         }
 56 | 
 57 |         .back-button {
 58 |             padding: 8px 16px;
 59 |             background-color: #f8f9fa;
 60 |             color: #3c4043;
 61 |             border: 1px solid #dadce0;
 62 |             border-radius: 4px;
 63 |             text-decoration: none;
 64 |             font-size: 14px;
 65 |         }
 66 | 
 67 |         .back-button:hover {
 68 |             background-color: #f1f3f4;
 69 |         }
 70 | 
 71 |         .clear-history {
 72 |             padding: 8px 16px;
 73 |             background-color: #dc3545;
 74 |             color: white;
 75 |             border: none;
 76 |             border-radius: 4px;
 77 |             cursor: pointer;
 78 |         }
 79 | 
 80 |         .no-history {
 81 |             text-align: center;
 82 |             padding: 40px;
 83 |             color: #666;
 84 |         }
 85 |     </style>
 86 | </head>
 87 | <body>
 88 |     <div class="history-container">
 89 |         <div class="history-header">
 90 |             <h1>搜索历史</h1>
 91 |             <div>
 92 |                 <a href="/" class="back-button">返回搜索</a>
 93 |                 {% if history %}
 94 |                 <button class="clear-history" onclick="clearHistory()">清空历史</button>
 95 |                 {% endif %}
 96 |             </div>
 97 |         </div>
 98 | 
 99 |         {% if history %}
100 |         <div class="history-list">
101 |             {% for item in history %}
102 |             <div class="history-item">
103 |                 <div class="query-text">
104 |                     <a href="{{ url_for('search', q=item.query, searchIn=item.search_in, sortBy=item.sort_by) }}">
105 |                         {{ item.query }}
106 |                     </a>
107 |                 </div>
108 |                 <div class="search-meta">
109 |                     搜索范围: {{ {'all': '全部', 'title': '标题', 'content': '内容'}[item.search_in] }}
110 |                     | 排序方式: {{ {'relevance': '相关度', 'date': '时间'}[item.sort_by] }}
111 |                     <span class="search-time">| {{ item.timestamp.strftime('%Y-%m-%d %H:%M:%S') }}</span>
112 |                 </div>
113 |             </div>
114 |             {% endfor %}
115 |         </div>
116 |         {% else %}
117 |         <div class="no-history">
118 |             暂无搜索历史
119 |         </div>
120 |         {% endif %}
121 |     </div>
122 | 
123 |     <script>
124 |         function clearHistory() {
125 |             if (confirm('确定要清空搜索历史吗？')) {
126 |                 fetch('/clear_history', {
127 |                     method: 'POST',
128 |                     credentials: 'same-origin'
129 |                 }).then(response => {
130 |                     if (response.ok) {
131 |                         window.location.reload();
132 |                     }
133 |                 });
134 |             }
135 |         }
136 |     </script>
137 | </body>
138 | </html>


--------------------------------------------------------------------------------
/templates/preferences.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>个性化设置 - ALLINKU</title>
  6 |     <link rel="stylesheet" href="/static/css/main.css">
  7 |     <style>
  8 |         .preferences-container {
  9 |             max-width: 600px;
 10 |             margin: 40px auto;
 11 |             padding: 20px;
 12 |         }
 13 | 
 14 |         .preferences-header {
 15 |             margin-bottom: 30px;
 16 |             display: flex;
 17 |             justify-content: space-between;
 18 |             align-items: center;
 19 |         }
 20 | 
 21 |         .preferences-form {
 22 |             background: white;
 23 |             padding: 20px;
 24 |             border-radius: 8px;
 25 |             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
 26 |         }
 27 | 
 28 |         .form-group {
 29 |             margin-bottom: 20px;
 30 |         }
 31 | 
 32 |         .form-group label {
 33 |             display: block;
 34 |             margin-bottom: 8px;
 35 |             color: #333;
 36 |             font-weight: 500;
 37 |         }
 38 | 
 39 |         .form-group select,
 40 |         .form-group input {
 41 |             width: 100%;
 42 |             padding: 8px;
 43 |             border: 1px solid #ddd;
 44 |             border-radius: 4px;
 45 |             font-size: 14px;
 46 |         }
 47 | 
 48 |         .form-actions {
 49 |             margin-top: 30px;
 50 |             display: flex;
 51 |             justify-content: space-between;
 52 |         }
 53 | 
 54 |         .save-button {
 55 |             padding: 10px 20px;
 56 |             background-color: #1a73e8;
 57 |             color: white;
 58 |             border: none;
 59 |             border-radius: 4px;
 60 |             cursor: pointer;
 61 |         }
 62 | 
 63 |         .save-button:hover {
 64 |             background-color: #1557b0;
 65 |         }
 66 | 
 67 |         .back-button {
 68 |             padding: 10px 20px;
 69 |             background-color: #f8f9fa;
 70 |             color: #3c4043;
 71 |             border: 1px solid #dadce0;
 72 |             border-radius: 4px;
 73 |             text-decoration: none;
 74 |         }
 75 | 
 76 |         .back-button:hover {
 77 |             background-color: #f1f3f4;
 78 |         }
 79 | 
 80 |         .alert {
 81 |             padding: 10px;
 82 |             margin-bottom: 20px;
 83 |             border-radius: 4px;
 84 |         }
 85 | 
 86 |         .alert-success {
 87 |             background-color: #d4edda;
 88 |             color: #155724;
 89 |             border: 1px solid #c3e6cb;
 90 |         }
 91 |     </style>
 92 | </head>
 93 | <body>
 94 |     <div class="preferences-container">
 95 |         <div class="preferences-header">
 96 |             <h1>个性化设置</h1>
 97 |             <a href="/" class="back-button">返回搜索</a>
 98 |         </div>
 99 | 
100 |         {% with messages = get_flashed_messages() %}
101 |             {% if messages %}
102 |                 {% for message in messages %}
103 |                 <div class="alert alert-success">
104 |                     {{ message }}
105 |                 </div>
106 |                 {% endfor %}
107 |             {% endif %}
108 |         {% endwith %}
109 | 
110 |         <form class="preferences-form" method="POST">
111 |             <div class="form-group">
112 |                 <label for="default_search_in">默认搜索范围</label>
113 |                 <select name="default_search_in" id="default_search_in">
114 |                     <option value="all" >全部</option>
115 |                     <option value="title">仅标题</option>
116 |                     <option value="content">仅内容</option>
117 |                 </select>
118 |             </div>
119 | 
120 |             <div class="form-group">
121 |                 <label for="default_sort_by">默认排序方式</label>
122 |                 <select name="default_sort_by" id="default_sort_by">
123 |                     <option value="relevance" >按相关度排序</option>
124 |                     <option value="date">按时间排序</option>
125 |                 </select>
126 |             </div>
127 | 
128 |             <div class="form-group">
129 |                 <label for="results_per_page">每页显示结果数</label>
130 |                 <select name="results_per_page" id="results_per_page">
131 |                     <option value="10">10</option>
132 |                     <option value="20">20</option>
133 |                     <option value="50">50</option>
134 |                 </select>
135 |             </div>
136 | 
137 |             <div class="form-actions">
138 |                 <button type="submit" class="save-button">保存设置</button>
139 |             </div>
140 |         </form>
141 |     </div>
142 | </body>
143 | </html>


--------------------------------------------------------------------------------
/templates/profile.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>个人信息 - ALLINKU</title>
  6 |     <link rel="stylesheet" href="/static/css/main.css">
  7 |     <style>
  8 |         .profile-container {
  9 |             max-width: 600px;
 10 |             margin: 40px auto;
 11 |             padding: 20px;
 12 |         }
 13 | 
 14 |         .profile-header {
 15 |             margin-bottom: 30px;
 16 |             display: flex;
 17 |             justify-content: space-between;
 18 |             align-items: center;
 19 |         }
 20 | 
 21 |         .profile-form {
 22 |             background: white;
 23 |             padding: 20px;
 24 |             border-radius: 8px;
 25 |             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
 26 |         }
 27 | 
 28 |         .form-group {
 29 |             margin-bottom: 20px;
 30 |         }
 31 | 
 32 |         .form-group label {
 33 |             display: block;
 34 |             margin-bottom: 8px;
 35 |             color: #333;
 36 |             font-weight: 500;
 37 |         }
 38 | 
 39 |         .form-group select,
 40 |         .form-group input {
 41 |             width: 100%;
 42 |             padding: 8px;
 43 |             border: 1px solid #ddd;
 44 |             border-radius: 4px;
 45 |             font-size: 14px;
 46 |         }
 47 | 
 48 |         .form-actions {
 49 |             margin-top: 30px;
 50 |             display: flex;
 51 |             justify-content: space-between;
 52 |         }
 53 | 
 54 |         .save-button {
 55 |             padding: 10px 20px;
 56 |             background-color: #1a73e8;
 57 |             color: white;
 58 |             border: none;
 59 |             border-radius: 4px;
 60 |             cursor: pointer;
 61 |         }
 62 | 
 63 |         .save-button:hover {
 64 |             background-color: #1557b0;
 65 |         }
 66 | 
 67 |         .back-button {
 68 |             padding: 10px 20px;
 69 |             background-color: #f8f9fa;
 70 |             color: #3c4043;
 71 |             border: 1px solid #dadce0;
 72 |             border-radius: 4px;
 73 |             text-decoration: none;
 74 |         }
 75 | 
 76 |         .back-button:hover {
 77 |             background-color: #f1f3f4;
 78 |         }
 79 | 
 80 |         .alert {
 81 |             padding: 10px;
 82 |             margin-bottom: 20px;
 83 |             border-radius: 4px;
 84 |         }
 85 | 
 86 |         .alert-success {
 87 |             background-color: #d4edda;
 88 |             color: #155724;
 89 |             border: 1px solid #c3e6cb;
 90 |         }
 91 |     </style>
 92 | </head>
 93 | <body>
 94 |     <div class="profile-container">
 95 |         <div class="profile-header">
 96 |             <h1>个人信息</h1>
 97 |             <a href="/" class="back-button">返回搜索</a>
 98 |         </div>
 99 | 
100 |         {% with messages = get_flashed_messages() %}
101 |             {% if messages %}
102 |                 {% for message in messages %}
103 |                 <div class="alert alert-success">
104 |                     {{ message }}
105 |                 </div>
106 |                 {% endfor %}
107 |             {% endif %}
108 |         {% endwith %}
109 | 
110 |         <div class="profile-form">
111 |             <form id="profileForm">
112 |                 <div class="form-group">
113 |                     <label for="role">身份</label>
114 |                     <select id="role" name="role">
115 |                         <option value="未设置" >请选择身份</option>
116 |                         <option value="本科生">本科生</option>
117 |                         <option value="研究生">研究生</option>
118 |                         <option value="博士生" >博士生</option>
119 |                         <option value="教师">教师</option>
120 |                     </select>
121 |                 </div>
122 | 
123 |                 <div class="form-group">
124 |                     <label for="college">学院</label>
125 |                     <input type="text" id="college" name="college" value="{{ profile.college if profile.college else '' }}"
126 |                            placeholder="请输入所属学院">
127 |                 </div>
128 | 
129 |                 <div class="form-group">
130 |                     <label for="age">年龄</label>
131 |                     <input type="number" id="age" name="age" value="{{ profile.age if profile.age else '' }}"
132 |                            placeholder="请输入年龄">
133 |                 </div>
134 | 
135 |                 <div class="form-actions">
136 |                     <button type="submit" class="save-button">保存修改</button>
137 |                 </div>
138 |             </form>
139 |         </div>
140 |     </div>
141 | 
142 |     <script>
143 |     document.getElementById('profileForm').addEventListener('submit', async function(e) {
144 |         e.preventDefault();
145 |         
146 |         const formData = {
147 |             role: document.getElementById('role').value,
148 |             college: document.getElementById('college').value,
149 |             age: document.getElementById('age').value ? parseInt(document.getElementById('age').value) : null
150 |         };
151 | 
152 |         try {
153 |             const response = await fetch('/api/profile', {
154 |                 method: 'PUT',
155 |                 headers: {
156 |                     'Content-Type': 'application/json'
157 |                 },
158 |                 body: JSON.stringify(formData)
159 |             });
160 | 
161 |             const result = await response.json();
162 |             
163 |             if (result.success) {
164 |                 alert('个人信息更新成功！');
165 |                 location.reload();
166 |             } else {
167 |                 alert(result.message || '更新失败，请重试');
168 |             }
169 |         } catch (error) {
170 |             console.error('Error:', error);
171 |             alert('更新失败，请重试');
172 |         }
173 |     });
174 |     </script>
175 | </body>
176 | </html>


--------------------------------------------------------------------------------
/templates/search0.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>ALLINKU - 南开搜索</title>
  6 | <!--    &lt;!&ndash; 引入CSS文件 &ndash;&gt;-->
  7 | <!--    <link rel="stylesheet" href="/static/css/main.css">-->
  8 |     <style>
  9 |         /* 其他样式保持不变 */
 10 |         .logo {
 11 |             font-size: 72px;
 12 |             font-weight: bold;
 13 |             margin-bottom: 30px;
 14 |             cursor: default;
 15 |         }
 16 |         /* 修改logo样式，每个字母不同颜色 */
 17 |         .logo span:nth-child(1) { color: #4285f4; } /* A */
 18 |         .logo span:nth-child(2) { color: #ea4335; } /* L */
 19 |         .logo span:nth-child(3) { color: #fbbc05; } /* L */
 20 |         .logo span:nth-child(4) { color: #4285f4; } /* I */
 21 |         .logo span:nth-child(5) { color: #34a853; } /* N */
 22 |         .logo span:nth-child(6) { color: #ea4335; } /* K */
 23 |         .logo span:nth-child(7) { color: #fbbc05; } /* U */
 24 |         /* 搜索容器样式 */
 25 |         .search-container {
 26 |             display: flex;
 27 |             flex-direction: column;
 28 |             align-items: center;
 29 |             margin-top: 100px;
 30 |             padding: 20px;
 31 |         }
 32 |         /* 搜索表单样式 */
 33 |         .search-form {
 34 |             display: flex;
 35 |             flex-direction: column;
 36 |             align-items: center;
 37 |             width: 100%;
 38 |             max-width: 750px;/* 这里可以调整整个表单区域的宽度 */
 39 |             margin: 0 auto;
 40 |         }
 41 |         .search-box {
 42 |             flex: 1;
 43 |             padding: 12px 20px;
 44 |             font-size: 16px;
 45 |             border: 1px solid #dfe1e5;
 46 |             border-radius: 24px;
 47 |             outline: none;
 48 |             margin-right: 10px;
 49 |         }
 50 |         .search-button {
 51 |             padding: 12px 24px;
 52 |             font-size: 16px;
 53 |             color: white;
 54 |             background-color: #4285f4;
 55 |             border: none;
 56 |             border-radius: 24px;
 57 |             cursor: pointer;
 58 |             transition: background-color 0.2s;
 59 |         }
 60 |         .search-button:hover {
 61 |             background-color: #357abd;
 62 |         }
 63 |         .results {
 64 |             max-width: 720px;/* 这里可以调整搜索结果区域的宽度 */
 65 |             margin: 0 auto;
 66 |         }
 67 | 
 68 |         /* 结果统计 */
 69 |         .search-stats {
 70 |             color: #70757a;
 71 |             font-size: 14px;
 72 |             margin-bottom: 20px;
 73 |             padding-left: 20px;
 74 |         }
 75 | 
 76 |         /* 单个结果项 */
 77 |         .result-item {
 78 |             max-width: 670px;  /* 这里可以调整每个搜索结果项的宽度 */
 79 |             margin-bottom: 25px;
 80 |             padding: 0 20px;
 81 |         }
 82 | 
 83 |         /* 标题链接 */
 84 |         .result-title {
 85 |             color: #1a0dab;
 86 |             font-size: 18px;
 87 |             text-decoration: none;
 88 |             display: block;
 89 |             margin-bottom: 4px;
 90 |         }
 91 | 
 92 |         .result-title:hover {
 93 |             text-decoration: underline;
 94 |         }
 95 |         /* 标题高亮样式 */
 96 |         .result-title em {
 97 |             font-weight: bold;
 98 |             font-style: normal;
 99 |             color: #1a0dab;
100 |             background-color: transparent;
101 |             text-decoration: none;
102 |         }
103 | 
104 |         .result-title:hover em {
105 |             text-decoration: underline;
106 |         }
107 | 
108 |         /* URL显示 */
109 |         .result-url {
110 |             color: #006621;
111 |             font-size: 14px;
112 |             margin-bottom: 6px;
113 |         }
114 | 
115 |         /* 内容摘要 */
116 |         .result-snippet {
117 |             color: #3c4043;
118 |             font-size: 14px;
119 |             line-height: 1.58;
120 |             margin-bottom: 4px;
121 |         }
122 | 
123 |         /* 高亮匹配词 */
124 |         .result-snippet em {
125 |             font-weight: bold;
126 |             font-style: normal;
127 |             background-color: #ffffd0;
128 |         }
129 | 
130 |         /* 元信息（来源和日期） */
131 |         .result-meta {
132 |             color: #70757a;
133 |             font-size: 12px;
134 |         }
135 | 
136 |         .result-meta span:not(:last-child)::after {
137 |             content: " - ";
138 |         }
139 | 
140 |         /* 无结果提示 */
141 |         .no-results {
142 |             text-align: center;
143 |             color: #70757a;
144 |             margin-top: 40px;
145 |         }
146 |         /* 搜索框组样式 */
147 |         .search-input-group {
148 |             display: flex;
149 |             align-items: center;
150 |             width: 100%;
151 |             max-width: 750px; /* 这里可以调整搜索框的宽度 */
152 |             margin: 0 auto 20px;
153 |             padding: 5px 0;
154 |         }
155 | 
156 |          /* 搜索框样式 */
157 |         .search-input-group input {
158 |             flex: 1;
159 |             height: 44px;
160 |             padding: 0 20px;
161 |             font-size: 16px;
162 |             border: 1px solid #dfe1e5;
163 |             border-radius: 24px;
164 |             outline: none;
165 |             margin-right: 10px;
166 |             box-shadow: 0 1px 6px rgba(32,33,36,.28);
167 |         }
168 |         /* 搜索框悬停效果 */
169 |         .search-input-group input:hover,
170 |         .search-input-group input:focus {
171 |             box-shadow: 0 1px 6px rgba(32,33,36,.28);
172 |             border-color: rgba(223,225,229,0);
173 |         }
174 | 
175 |         /* 下拉框样式 */
176 |         .search-input-group select {
177 |             height: 36px;
178 |             padding: 0 10px;
179 |             margin: 0 5px;
180 |             border: 1px solid #dfe1e5;
181 |             border-radius: 4px;
182 |             background-color: #f8f9fa;
183 |             color: #3c4043;
184 |             font-size: 14px;
185 |             cursor: pointer;
186 |         }
187 | 
188 |         /* 搜索按钮样式 */
189 |         .search-input-group button {
190 |             height: 36px;
191 |             padding: 0 16px;
192 |             background-color: #1a73e8;
193 |             color: white;
194 |             border: none;
195 |             border-radius: 4px;
196 |             font-size: 14px;
197 |             cursor: pointer;
198 |             transition: all 0.2s;
199 |         }
200 |         /* 搜索按钮悬停效果 */
201 |         .search-input-group button:hover {
202 |             background-color: #1557b0;
203 |             box-shadow: 0 1px 2px 0 rgba(66,133,244,0.3),
204 |                        0 1px 3px 1px rgba(66,133,244,0.15);
205 |         }
206 |         /* 分页样式 */
207 |         .pagination {
208 |             margin-top: 20px;
209 |             text-align: center;
210 |             font-size: 14px;
211 |         }
212 | 
213 |         .pagination a, .pagination span {
214 |             display: inline-block;
215 |             padding: 8px 12px;
216 |             margin: 0 4px;
217 |             color: #1a0dab;
218 |             text-decoration: none;
219 |             border-radius: 3px;
220 |         }
221 | 
222 |         .pagination .current-page {
223 |             background-color: #f8f9fa;
224 |             color: #000;
225 |             font-weight: bold;
226 |         }
227 | 
228 |         .pagination a:hover {
229 |             background-color: #f8f9fa;
230 |         }
231 | 
232 |         .page-nav {
233 |             color: #1a0dab;
234 |         }
235 | 
236 |         .page-number {
237 |             color: #1a0dab;
238 |         }
239 |     </style>
240 | </head>
241 | <body>
242 |     <div class="search-container">
243 |         <div class="logo">
244 |             <span>A</span><span>L</span><span>L</span><span>I</span><span>N</span><span>K</span><span>U</span>
245 |         </div>
246 |         <form action="/search" method="GET">
247 |     <div class="search-input-group">
248 |         <input type="text"
249 |                name="q"
250 |                value="{{ query or '' }}"
251 |                placeholder="输入搜索关键词...">
252 | 
253 |         <select name="searchIn">
254 |             <option value="all">搜索范围：全部</option>
255 |             <option value="title">仅标题</option>
256 |             <option value="content">仅内容</option>
257 |         </select>
258 | 
259 |         <select name="sortBy">
260 |             <option value="relevance">按相关度排序</option>
261 |             <option value="date"> 按时间排序</option>
262 |         </select>
263 |         <button type="submit">搜索</button>
264 |     </div>
265 | </form>
266 |     </div>
267 | 
268 |     {% if results %}
269 |     <div class="results">
270 |     {% if results %}
271 |     <div class="search-stats">
272 |         找到约 {{total}} 条结果
273 |     </div>
274 | 
275 |     {% for result in results %}
276 |     <div class="result-item">
277 |         <a href="{{result.url}}" class="result-title" target="_blank">{{result.title|safe}}</a>
278 |         <div class="result-url">{{result.url}}</div>
279 |         <div class="result-snippet">{{result.snippet|safe}}</div>
280 |         <div class="result-meta">
281 |             {% if result.source %}
282 |             <span class="result-source">{{result.source}}</span>
283 |             {% endif %}
284 |             {% if result.date %}
285 |             <span class="result-date">{{result.date.strftime('%Y-%m-%d')}}</span>
286 |             {% endif %}
287 |         </div>
288 |     </div>
289 |     {% endfor %}
290 |     {# Add pagination #}
291 |     {% if total_pages > 1 %}
292 |         {% if current_page > 1 %}
293 |         <a href="{{ url_for('search', q=query, page=current_page-1, searchIn=search_in, sortBy=sort_by) }}" class="page-nav">上一页</a>
294 |         {% endif %}
295 | 
296 |         {% for p in page_range %}
297 |             {% if p == current_page %}
298 |             <span class="current-page">{{ p }}</span>
299 |             {% else %}
300 |             <a href="{{ url_for('search', q=query, page=p, searchIn=search_in, sortBy=sort_by) }}" class="page-number">{{ p }}</a>
301 |             {% endif %}
302 |         {% endfor %}
303 | 
304 |         {% if current_page < total_pages %}
305 |         <a href="{{ url_for('search', q=query, page=current_page+1, searchIn=search_in, sortBy=sort_by) }}" class="page-nav">下一页</a>
306 |         {% endif %}
307 |     {% endif %}
308 |     {% else %}
309 |     {% if query %}
310 |     <div class="no-results">
311 |         未找到与 "{{query}}" 相关的结果
312 |     </div>
313 |     {% endif %}
314 |     {% endif %}
315 | </div>
316 |     {% endif %}
317 | </body>
318 | </html>


--------------------------------------------------------------------------------
/templates/search00.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>ALLINKU - 南开搜索</title>
  6 |     <!-- 引入CSS文件 -->
  7 |     <link rel="stylesheet" href="/static/css/main.css">
  8 |     <link rel="stylesheet" href="/static/css/search.css">
  9 |     <link rel="stylesheet" href="/static/css/results.css">
 10 |     <link rel="stylesheet" href="/static/css/pagination.css">
 11 |     <style>
 12 |         /* 用户相关样式 */
 13 |         .user-section {
 14 |             position: absolute;
 15 |             top: 20px;
 16 |             right: 20px;
 17 |             display: flex;
 18 |             align-items: center;
 19 |             gap: 10px;
 20 |         }
 21 | 
 22 |         .auth-buttons {
 23 |             display: flex;
 24 |             gap: 10px;
 25 |         }
 26 | 
 27 |         .auth-buttons button {
 28 |             padding: 8px 16px;
 29 |             border: none;
 30 |             border-radius: 4px;
 31 |             cursor: pointer;
 32 |         }
 33 | 
 34 |         .login-btn {
 35 |             background-color: #4285f4;
 36 |             color: white;
 37 |         }
 38 | 
 39 |         .register-btn {
 40 |             background-color: #34a853;
 41 |             color: white;
 42 |         }
 43 | 
 44 |         .user-panel {
 45 |             display: flex;
 46 |             align-items: center;
 47 |             gap: 10px;
 48 |         }
 49 | 
 50 |         .user-avatar {
 51 |             width: 32px;
 52 |             height: 32px;
 53 |             border-radius: 50%;
 54 |             background-color: #ddd;
 55 |             display: flex;
 56 |             align-items: center;
 57 |             justify-content: center;
 58 |         }
 59 | 
 60 |         .dropdown-menu {
 61 |             position: absolute;
 62 |             top: 100%;
 63 |             right: 0;
 64 |             background: white;
 65 |             border: 1px solid #ddd;
 66 |             border-radius: 4px;
 67 |             padding: 8px 0;
 68 |             display: none;
 69 |             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
 70 |         }
 71 | 
 72 |         .dropdown-menu.show {
 73 |             display: block;
 74 |         }
 75 | 
 76 |         .dropdown-menu a {
 77 |             display: block;
 78 |             padding: 8px 16px;
 79 |             color: #333;
 80 |             text-decoration: none;
 81 |         }
 82 | 
 83 |         .dropdown-menu a:hover {
 84 |             background-color: #f5f5f5;
 85 |         }
 86 | 
 87 |         /* 登录/注册模态框样式 */
 88 |         .modal {
 89 |             display: none;
 90 |             position: fixed;
 91 |             top: 0;
 92 |             left: 0;
 93 |             width: 100%;
 94 |             height: 100%;
 95 |             background: rgba(0,0,0,0.5);
 96 |         }
 97 | 
 98 |         .modal.show {
 99 |             display: flex;
100 |             align-items: center;
101 |             justify-content: center;
102 |         }
103 | 
104 |         .modal-content {
105 |             background: white;
106 |             padding: 20px;
107 |             border-radius: 8px;
108 |             width: 100%;
109 |             max-width: 400px;
110 |         }
111 | 
112 |         .modal-header {
113 |             display: flex;
114 |             justify-content: space-between;
115 |             align-items: center;
116 |             margin-bottom: 20px;
117 |         }
118 | 
119 |         .close-btn {
120 |             background: none;
121 |             border: none;
122 |             font-size: 20px;
123 |             cursor: pointer;
124 |         }
125 | 
126 |         .form-group {
127 |             margin-bottom: 16px;
128 |         }
129 | 
130 |         .form-group label {
131 |             display: block;
132 |             margin-bottom: 8px;
133 |         }
134 | 
135 |         .form-group input {
136 |             width: 100%;
137 |             padding: 8px;
138 |             border: 1px solid #ddd;
139 |             border-radius: 4px;
140 |         }
141 | 
142 |         .submit-btn {
143 |             width: 100%;
144 |             padding: 10px;
145 |             background-color: #4285f4;
146 |             color: white;
147 |             border: none;
148 |             border-radius: 4px;
149 |             cursor: pointer;
150 |         }
151 |         /* Flash消息样式 */
152 |         .flash-messages {
153 |             position: fixed;
154 |             top: 20px;
155 |             left: 50%;
156 |             transform: translateX(-50%);
157 |             z-index: 1000;
158 |         }
159 | 
160 |         .flash-message {
161 |             padding: 10px 20px;
162 |             margin-bottom: 10px;
163 |             background-color: #f8d7da;
164 |             border: 1px solid #f5c6cb;
165 |             border-radius: 4px;
166 |             color: #721c24;
167 |             text-align: center;
168 |         }
169 |     </style>
170 | </head>
171 | <body>
172 | <!--在登录/注册功能中应该添加消息提示-->
173 | {% with messages = get_flashed_messages() %}
174 |     {% if messages %}
175 |         <div class="flash-messages">
176 |             {% for message in messages %}
177 |                 <div class="flash-message">{{ message }}</div>
178 |             {% endfor %}
179 |         </div>
180 |     {% endif %}
181 | {% endwith %}
182 |     <!-- 用户部分 -->
183 |     <div class="user-section">
184 |         {% if current_user.is_authenticated %}
185 |             <div class="user-panel">
186 |                 <div class="user-avatar">
187 |                     {{ current_user.username[0] }}
188 |                 </div>
189 |                 <span class="user-name">{{ current_user.username }}</span>
190 |                 <button onclick="toggleDropdown()" class="dropdown-btn">▼</button>
191 |                 <div class="dropdown-menu" id="userDropdown">
192 |                     <a href="/history">搜索历史</a>
193 |                     <a href="/preferences">个性化设置</a>
194 |                     <a href="/logout">退出登录</a>
195 |                 </div>
196 |             </div>
197 |         {% else %}
198 |             <div class="auth-buttons">
199 |                 <button onclick="showLoginModal()" class="login-btn">登录</button>
200 |                 <button onclick="showRegisterModal()" class="register-btn">注册</button>
201 |             </div>
202 |         {% endif %}
203 |     </div>
204 | 
205 |     <!-- 登录模态框 -->
206 |     <div class="modal" id="loginModal">
207 |         <div class="modal-content">
208 |             <div class="modal-header">
209 |                 <h3>登录</h3>
210 |                 <button class="close-btn" onclick="hideLoginModal()">&times;</button>
211 |             </div>
212 |             <form action="/login" method="POST">
213 |                 <div class="form-group">
214 |                     <label for="username">用户名</label>
215 |                     <input type="text" id="username" name="username" required>
216 |                 </div>
217 |                 <div class="form-group">
218 |                     <label for="password">密码</label>
219 |                     <input type="password" id="password" name="password" required>
220 |                 </div>
221 |                 <button type="submit" class="submit-btn">登录</button>
222 |             </form>
223 |         </div>
224 |     </div>
225 | 
226 |     <!-- 注册模态框 -->
227 |     <div class="modal" id="registerModal">
228 |         <div class="modal-content">
229 |             <div class="modal-header">
230 |                 <h3>注册</h3>
231 |                 <button class="close-btn" onclick="hideRegisterModal()">&times;</button>
232 |             </div>
233 |             <form action="/register" method="POST">
234 |                 <div class="form-group">
235 |                     <label for="reg-username">用户名</label>
236 |                     <input type="text" id="reg-username" name="username" required>
237 |                 </div>
238 |                 <div class="form-group">
239 |                     <label for="reg-email">邮箱</label>
240 |                     <input type="email" id="reg-email" name="email" required>
241 |                 </div>
242 |                 <div class="form-group">
243 |                     <label for="reg-password">密码</label>
244 |                     <input type="password" id="reg-password" name="password" required>
245 |                 </div>
246 |                 <div class="form-group">
247 |                     <label for="reg-confirm-password">确认密码</label>
248 |                     <input type="password" id="reg-confirm-password" name="confirm_password" required>
249 |                 </div>
250 |                 <button type="submit" class="submit-btn">注册</button>
251 |             </form>
252 |         </div>
253 |     </div>
254 |     <div class="search-container">
255 |         <div class="logo">
256 |             <span>A</span><span>L</span><span>L</span><span>I</span><span>N</span><span>K</span><span>U</span>
257 |         </div>
258 |         <form action="/search" method="GET">
259 |     <div class="search-input-group">
260 |         <input type="text"
261 |                name="q"
262 |                value="{{ query or '' }}"
263 |                placeholder="输入搜索关键词...">
264 | 
265 |         <select name="searchIn">
266 |             <option value="all">搜索范围：全部</option>
267 |             <option value="title">仅标题</option>
268 |             <option value="content">仅内容</option>
269 |         </select>
270 | 
271 |         <select name="sortBy">
272 |             <option value="relevance">按相关度排序</option>
273 |             <option value="date"> 按时间排序</option>
274 |         </select>
275 | 
276 |         <button type="submit">搜索</button>
277 |     </div>
278 | </form>
279 |     </div>
280 | 
281 | {% if results %}
282 | <div class="results">
283 |     <div class="search-stats">
284 |         找到约 {{total}} 条结果
285 |     </div>
286 | 
287 |     {% for result in results %}
288 |     <div class="result-item">
289 |         <a href="{{result.url}}" class="result-title" target="_blank">{{result.title|safe}}</a>
290 |         <div class="result-url">{{result.url}}</div>
291 |         <div class="result-snippet">{{result.snippet|safe}}</div>
292 |         <div class="result-meta">
293 |             {% if result.source %}
294 |             <span class="result-source">{{result.source}}</span>
295 |             {% endif %}
296 |             {% if result.date %}
297 |             <span class="result-date">{{result.date.strftime('%Y-%m-%d')}}</span>
298 |             {% endif %}
299 |         </div>
300 |     </div>
301 |     {% endfor %}
302 |     {# Add pagination #}
303 |     {% if total_pages > 1 %}
304 |         {% if current_page > 1 %}
305 |         <a href="{{ url_for('search', q=query, page=current_page-1, searchIn=search_in, sortBy=sort_by) }}" class="page-nav">上一页</a>
306 |         {% endif %}
307 | 
308 |         {% for p in page_range %}
309 |             {% if p == current_page %}
310 |             <span class="current-page">{{ p }}</span>
311 |             {% else %}
312 |             <a href="{{ url_for('search', q=query, page=p, searchIn=search_in, sortBy=sort_by) }}" class="page-number">{{ p }}</a>
313 |             {% endif %}
314 |         {% endfor %}
315 | 
316 |         {% if current_page < total_pages %}
317 |         <a href="{{ url_for('search', q=query, page=current_page+1, searchIn=search_in, sortBy=sort_by) }}" class="page-nav">下一页</a>
318 |         {% endif %}
319 |     {% endif %}
320 |     <!-- 其他内容 -->
321 | </div>
322 | {% else %}
323 |     {% if query %}
324 |     <div class="no-results">
325 |         未找到与 "{{query}}" 相关的结果
326 |     </div>
327 |     {% endif %}
328 | {% endif %}
329 | <!-- 将这段脚本放在search.html的body标签末尾，在其他script之前 -->
330 |     <script>
331 |         // 页面加载时检查登录状态
332 |         document.addEventListener('DOMContentLoaded', function() {
333 |             fetch('/check_login_status')
334 |                 .then(response => response.json())
335 |                 .then(data => {
336 |                     const userSection = document.querySelector('.user-section');
337 |                     if (data.logged_in) {
338 |                         userSection.innerHTML = `
339 |                             <div class="user-panel">
340 |                                 <div class="user-avatar">${data.username[0]}</div>
341 |                                 <span class="user-name">${data.username}</span>
342 |                                 <button onclick="toggleDropdown()" class="dropdown-btn">▼</button>
343 |                                 <div class="dropdown-menu" id="userDropdown">
344 |                                     <a href="/history">搜索历史</a>
345 |                                     <a href="/preferences">个性化设置</a>
346 |                                     <a href="/logout">退出登录</a>
347 |                                 </div>
348 |                             </div>
349 |                         `;
350 |                     } else {
351 |                         userSection.innerHTML = `
352 |                             <div class="auth-buttons">
353 |                                 <button onclick="showLoginModal()" class="login-btn">登录</button>
354 |                                 <button onclick="showRegisterModal()" class="register-btn">注册</button>
355 |                             </div>
356 |                         `;
357 |                     }
358 |                 })
359 |                 .catch(error => console.error('Error:', error));
360 |         });
361 |     </script>
362 | 
363 | <!-- JavaScript -->
364 |     <script>
365 |         function toggleDropdown() {
366 |             document.getElementById('userDropdown').classList.toggle('show');
367 |         }
368 | 
369 |         function showLoginModal() {
370 |             document.getElementById('loginModal').classList.add('show');
371 |         }
372 | 
373 |         function hideLoginModal() {
374 |             document.getElementById('loginModal').classList.remove('show');
375 |         }
376 | 
377 |         function showRegisterModal() {
378 |             document.getElementById('registerModal').classList.add('show');
379 |         }
380 | 
381 |         function hideRegisterModal() {
382 |             document.getElementById('registerModal').classList.remove('show');
383 |         }
384 | 
385 |         // 点击模态框外部时关闭
386 |         window.onclick = function(event) {
387 |             if (event.target.classList.contains('modal')) {
388 |                 event.target.classList.remove('show');
389 |             }
390 |         }
391 |     </script>
392 | </body>
393 | </html>


--------------------------------------------------------------------------------
/templates/snapshot.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="zh-CN">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6 |     <title>ALLINKU - 网页快照</title>
  7 |     <style>
  8 |         .snapshot-header {
  9 |             position: fixed;
 10 |             top: 0;
 11 |             left: 0;
 12 |             right: 0;
 13 |             background: #f8f9fa;
 14 |             padding: 15px;
 15 |             border-bottom: 1px solid #dee2e6;
 16 |             z-index: 1000;
 17 |         }
 18 |         .snapshot-info {
 19 |             margin: 10px 0;
 20 |             color: #666;
 21 |         }
 22 |         .snapshot-content-wrapper {
 23 |             margin-top: 120px; /* 为固定定位的header留出空间 */
 24 |             padding: 20px;
 25 |         }
 26 |         .back-link {
 27 |             display: inline-block;
 28 |             padding: 5px 15px;
 29 |             background: #4285f4;
 30 |             color: white;
 31 |             text-decoration: none;
 32 |             border-radius: 4px;
 33 |             margin-right: 10px;
 34 |         }
 35 |         .original-link {
 36 |             color: #1a73e8;
 37 |             text-decoration: none;
 38 |         }
 39 |         .original-link:hover {
 40 |             text-decoration: underline;
 41 |         }
 42 |         /* 添加基本的响应式样式 */
 43 |         @media (max-width: 768px) {
 44 |             .snapshot-header {
 45 |                 padding: 10px;
 46 |             }
 47 |             .snapshot-content-wrapper {
 48 |                 padding: 10px;
 49 |             }
 50 |         }
 51 |     </style>
 52 | </head>
 53 | <body>
 54 |     <div class="snapshot-header">
 55 |         <a href="{{ url_for('index') }}" class="back-link">返回搜索</a>
 56 |         <h2>{{ title }}</h2>
 57 |         <div class="snapshot-info">
 58 |             <div>原始网址：<a href="{{ original_url }}" class="original-link" target="_blank">{{ original_url }}</a></div>
 59 |             <div>快照时间：{{ captured_time }}</div>
 60 |             {% if source %}<div>来源：{{ source }}</div>{% endif %}
 61 |         </div>
 62 |     </div>
 63 |     <div class="snapshot-content-wrapper">
 64 |         {% if content %}
 65 |             <div class="snapshot-content">
 66 |                 {{ content | safe }}
 67 |             </div>
 68 |         {% else %}
 69 |             <div class="no-content">
 70 |                 未找到快照内容
 71 |             </div>
 72 |         {% endif %}
 73 |     </div>
 74 | 
 75 |     <script>
 76 |         // 处理快照中的相对链接
 77 |         document.addEventListener('DOMContentLoaded', function() {
 78 |             // 获取原始URL的域名部分
 79 |             const originalUrl = "{{ original_url }}";
 80 |             const baseUrl = new URL(originalUrl).origin;
 81 | 
 82 |             // 处理所有链接
 83 |             const links = document.getElementsByTagName('a');
 84 |             for(let link of links) {
 85 |                 if(link.href && link.href.startsWith('/')) {
 86 |                     link.href = baseUrl + link.href;
 87 |                 }
 88 |             }
 89 | 
 90 |             // 处理所有图片
 91 |             const images = document.getElementsByTagName('img');
 92 |             for(let img of images) {
 93 |                 if(img.src && img.src.startsWith('/')) {
 94 |                     img.src = baseUrl + img.src;
 95 |                 }
 96 |             }
 97 |         });
 98 |     </script>
 99 | </body>
100 | </html>


--------------------------------------------------------------------------------
/test_document.py:
--------------------------------------------------------------------------------
 1 | from whoosh.index import open_dir
 2 | from whoosh.query import Term
 3 | 
 4 | 
 5 | def get_url_by_id(index_dir, doc_id):
 6 |     """
 7 |     通过文档ID查询对应的URL
 8 | 
 9 |     Args:
10 |         index_dir (str): 索引目录的路径
11 |         doc_id (str): 要查询的文档ID
12 | 
13 |     Returns:
14 |         str: 文档的URL，如果未找到则返回None
15 |     """
16 |     try:
17 |         # 打开索引目录
18 |         ix = open_dir(index_dir)
19 | 
20 |         # 创建搜索器
21 |         with ix.searcher() as searcher:
22 |             # 使用Term查询
23 |             query = Term("id", str(doc_id))
24 |             results = searcher.search(query)
25 | 
26 |             if len(results) > 0:
27 |                 # 获取URL
28 |                 url = results[0].get('url')
29 |                 if url:
30 |                     print(f"文档 ID {doc_id} 的URL是: {url}")
31 |                 else:
32 |                     print(f"文档 ID {doc_id} 没有URL信息")
33 |                 return url
34 |             else:
35 |                 print(f"未找到ID为 {doc_id} 的文档")
36 |                 return None
37 | 
38 |     except Exception as e:
39 |         print(f"查询过程中发生错误: {str(e)}")
40 |         return None
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     # 使用示例
45 |     index_dir = "index_dir"  # 索引目录路径
46 |     doc_id = "675bfc1fed10fa8630043272"  # 替换为要查询的文档ID
47 | 
48 |     # 查询URL
49 |     url = get_url_by_id(index_dir, doc_id)


--------------------------------------------------------------------------------
/test_html.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient
 2 | def test_specific_hash(snapshot_hash):
 3 |     try:
 4 |         client = MongoClient('localhost', 27017)
 5 |         db = client['nankai_news_datasets']
 6 | 
 7 |         print(f"\n测试特定hash: {snapshot_hash}")
 8 | 
 9 |         # 在数据库中查找快照
10 |         snapshot = db.WEB_snapshot.find_one({'content_hash': snapshot_hash})
11 |         if snapshot:
12 |             print("\n1. 找到快照:")
13 |             print(f"- html_content 长度: {len(snapshot.get('html_content', ''))}")
14 |             print(f"- captured_at: {snapshot.get('captured_at')}")
15 |         else:
16 |             print("\n1. 未找到快照")
17 | 
18 |     except Exception as e:
19 |         print(f"\n错误: {str(e)}")
20 |     finally:
21 |         client.close()
22 | 
23 | 
24 | # 测试特定hash
25 | test_specific_hash("ee985e251e6d522d52f10c17d2d283b5")


--------------------------------------------------------------------------------
/test_wildcard.py:
--------------------------------------------------------------------------------
 1 | from whoosh.index import open_dir
 2 | from whoosh.qparser import QueryParser, WildcardPlugin
 3 | from whoosh.query import Wildcard
 4 | import jieba
 5 | 
 6 | # 打开索引
 7 | ix = open_dir("index_dir")
 8 | 
 9 | # 测试函数
10 | def test_wildcard_patterns():
11 |     with ix.searcher() as searcher:
12 |         # 测试 ? 和 * 的不同情况
13 |         test_cases = [
14 |             "计?",  # 应该匹配："计算"、"计划"等
15 |             "计算*",  # 应该匹配："计算机"、"计算方法"等
16 |             "计*",  # 应该匹配所有以"计"开头的词
17 |             "南开*"  # 应该匹配所有以"南开"开头的词
18 |         ]
19 | 
20 |         for test_query in test_cases:
21 |             print(f"\n测试查询: {test_query}")
22 | 
23 |             # 先检查索引中包含的terms
24 |             prefix = test_query.replace('?', '').replace('*', '')
25 |             print(f"索引中包含'{prefix}'开头的terms:")
26 |             matching_terms = []
27 |             for term in searcher.reader().lexicon("content"):
28 |                 try:
29 |                     decoded_term = term.decode('utf-8')
30 |                     if decoded_term.startswith(prefix):
31 |                         matching_terms.append(decoded_term)
32 |                 except UnicodeDecodeError:
33 |                     continue
34 |             print(f"匹配的terms: {matching_terms[:10]}")  # 只显示前10个
35 | 
36 |             # 执行查询
37 |             from whoosh.query import Wildcard
38 |             query = Wildcard("content", test_query)
39 |             results = searcher.search(query, limit=5)
40 | 
41 |             print(f"查询结果数量: {len(results)}")
42 |             for hit in results:
43 |                 print(f"- 标题: {hit['title']}")
44 |                 print(f"  匹配内容: {hit.highlights('content', top=1)}")
45 | 
46 | if __name__ == "__main__":
47 |     test_wildcard_patterns()


--------------------------------------------------------------------------------
/说明文档.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1Reminding/Web-Search-Engine/ddba4a2e517b7225aa8a71aeda9571093c311634/说明文档.pdf


--------------------------------------------------------------------------------