├── .gitignore ├── LICENSE ├── README.md ├── basic_dedup ├── README.md ├── find_duplicates.py ├── readme.txt └── write_meta_data_pkl.py ├── convert ├── README.md ├── convert.py └── wudao_convert.py ├── corpus_processing ├── blacklist.txt ├── clean_file.py ├── decp936messy.py ├── extract.py ├── move_file.py ├── passwords.txt └── readme.txt ├── parallel_dedup ├── README.md ├── convert_jsonl_to_csv.py ├── multiprocess_deduplication.py ├── reset_csv.py └── write_output_to_jsonl.py ├── requirements.txt ├── utils ├── customSimhash.py ├── redisSimhash.py └── utils.py └── words_dedup ├── add_jsonl_detailed_simhash.py └── alltext_simhash.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | converted/ --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 aplmikex 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 去重部分 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 将外界输入的文件以文件md5和文件大小进行重复检测,删除不同来源的同一文件。 8 | 2. 将大量文本文件(目前仅有txt文件)转换为格式化的、易于查询的数据。 9 | 3. 在个人电脑上,实现对百万个文件量级的快速去重操作。 10 | 4. (TODO)在集群上,对全部类型的文件进行重复检测。 11 | 12 | ### 环境安装 13 | 14 | 1. 从github下载本项目 15 | ```shell 16 | git clone https://github.com/aplmikex/deduplication_mnbvc 17 | ``` 18 | 2. 使用 `pip`命令安装所需的库 19 | ```shell 20 | # 进入这个库的目录 21 | cd deduplication_mnbvc 22 | # 安装项目所需要的依赖 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ### jsonl格式说明 27 | 28 | 1. 对于每个jsonl文件,其大小略大于500MiB,这个数值定义在 `utils.py`中的 `max_size`,可根据需要更改 29 | 2. 对于每一个文件,它的json结构层次如下: 30 | 31 | ```python 32 | { 33 | '文件名': '文件.txt', 34 | '是否待查文件': False, 35 | '是否重复文件': False, 36 | '文件大小': 1024, 37 | 'simhash': 0, 38 | '最长段落长度': 0, 39 | '段落数': 0, 40 | '去重段落数': 0, 41 | '低质量段落数': 0, 42 | '段落': [] 43 | } 44 | ``` 45 | 46 | 将每一行作为一个段落,段落的json结构层次如下: 47 | 48 | ```python 49 | { 50 | '行号': line_number, 51 | '是否重复': False, 52 | '是否跨文件重复': False, 53 | 'md5': md5, 54 | '内容': line 55 | } 56 | ``` 57 |
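### 读取jsonl的参考示例

下面是一个读取上述jsonl格式的最小示例(仅供参考):假设某个转换结果保存在 `converted/0.jsonl`(该路径只是示例中的假设),脚本会跳过被标记为重复或待查的文件,以及文件内重复或跨文件重复的段落,只输出剩余段落的内容。

```python
import jsonlines

# 示例路径,仅为假设,请替换为实际生成的jsonl文件
jsonl_path = 'converted/0.jsonl'

with jsonlines.open(jsonl_path) as reader:
    for file_json in reader:
        # 跳过被标记为重复或待查的文件
        if file_json['是否重复文件'] or file_json['是否待查文件']:
            continue
        for para in file_json['段落']:
            # 跳过文件内重复或跨文件重复的段落
            if para['是否重复'] or para['是否跨文件重复']:
                continue
            print(para['行号'], para['内容'])
```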
-------------------------------------------------------------------------------- /basic_dedup/README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 基本去重 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 指定一个文件夹,设定定时任务,定时将新增的外部来源文件的信息追加至pkl二进制文件中。 8 | 2. 按照文件大小与文件md5值,输出完全相同的文件至一个txt中。 9 | 3. 根据用户需要,用户手动写脚本删除txt中的完全相同的文件名。 10 | 11 | ### 使用说明 12 | 13 | 1. #### write_meta_data_pkl.py 14 | 15 | 1. 使用说明: 16 | 17 | - `write_meta_data_pkl.py`是把文件夹内不同格式文件的元信息追加写入到pkl文件中。 18 | - 可以对一个文件夹反复运行此代码,只要路径不改变,pkl文件不会重复添加。 19 | - 如果改变原始文件路径,请删除pkl重新生成,不然会多次删除同一个文件。 20 | - 在运行中若增加或删减文件,可能导致文件出错或者pkl文件较大,建议运行一段时间后删除pkl,重新生成。 21 | 2. 运行 `write_meta_data_pkl.py`文件并设置必要的参数。 22 | 23 | ```bash 24 | python write_meta_data_pkl.py --dir_path /path/to/directory --pkl_file file.pkl 25 | ``` 26 | 2. #### find_duplicates.py 27 | 28 | 1. 使用说明: 29 | 30 | - `find_duplicates.py`是输入pkl文件,输出除了第一次出现以外其他完全重复的文件。 31 | - 它的结果默认会输出到duplicates.txt文件中,是覆盖写,所以建议每次去重完直接删除duplicates.txt中的完全相同的文件名。 32 | - 去重后的pkl会覆盖到原来pkl文件。 33 | 2. 运行 `find_duplicates.py`文件并设置必要的参数。 34 | 35 | ```bash 36 | python find_duplicates.py --pkl_file file.pkl 37 | ``` 38 | 39 | ### DEMO示例 40 | 41 | 按照上面示例的使用说明执行就行了。 42 |
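### 删除重复文件的参考脚本

`find_duplicates.py` 生成的 `duplicates.txt` 每行是一个除第一次出现以外的重复文件路径。如果确认这些文件确实可以删除,可以参考下面的最小示例脚本(仅供参考,删除操作不可恢复,建议先抽查文件列表再执行):

```python
import os

# duplicates.txt 由 find_duplicates.py 生成,每行一个重复文件的路径
with open('duplicates.txt') as f:
    paths = [line.strip() for line in f if line.strip()]

removed = 0
for path in paths:
    try:
        os.remove(path)
        removed += 1
    except OSError as e:
        print('删除失败: {} ({})'.format(path, e))

print('共删除 {} 个重复文件'.format(removed))
```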
-------------------------------------------------------------------------------- /basic_dedup/find_duplicates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | 5 | # 查找重复文件 6 | def find_duplicates(pkl_file): 7 | df = pd.read_pickle(pkl_file) 8 | duplicates = df[df.duplicated(['SHA256', 'Size'], keep=False)] 9 | groups = duplicates.groupby(['SHA256', 'Size']) 10 | with open('duplicates.txt', 'w') as f: 11 | for _, group in groups: 12 | files = group['File'].tolist() 13 | for file in files[1:]: 14 | f.write(file+'\n') 15 | df.drop_duplicates(subset=['SHA256', 'Size'], keep='first', inplace=True) 16 | df.to_pickle(pkl_file) 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--pkl_file', required=True, help='The pickle file to read from') 21 | 22 | args = parser.parse_args() 23 | find_duplicates(args.pkl_file)
-------------------------------------------------------------------------------- /basic_dedup/readme.txt: -------------------------------------------------------------------------------- 1 | write_meta_data_pkl.py 2 | 把原始文件的元信息(路径、大小、SHA256),不限文件内容格式,保存到pandas的表中 3 | 添加方式为追加 4 | 以二进制pkl的格式保存下来 5 | find_duplicates.py 6 | 输入pkl文件,输出除了第一次出现以外完全重复的文件 7 | 输出到duplicates.txt文件中,覆盖写 8 | 把去重后的pkl覆盖到原来pkl文件
-------------------------------------------------------------------------------- /basic_dedup/write_meta_data_pkl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import pandas as pd 4 | import argparse 5 | 6 | # 计算文件的 SHA256 哈希值 7 | def sha256(filename): 8 | with open(filename, 'rb') as f: 9 | content = f.read() 10 | return hashlib.sha256(content).hexdigest() 11 | 12 | # 递归遍历目录并输出文件路径、文件大小和 SHA256 哈希值 13 | def get_all_files_list(dir_path): 14 | file_path_list = [] 15 | for root, _, files in os.walk(dir_path): 16 | for file in files: 17 | file_path = os.path.join(root, file) 18 | file_path_list.append(file_path) 19 | file_path_list = sorted(file_path_list) 20 | return file_path_list 21 | 22 | 23 | # 将文件路径、大小和哈希值写入 PKL 文件 24 | def write_to_csv(dir_path, pkl_file='files.pkl'): 25 | try: 26 | existing_df = pd.read_pickle(pkl_file) 27 | except FileNotFoundError: 28 | existing_df = pd.DataFrame({'File': [], 'Size': [], 'SHA256': []}) 29 | 30 | data = {'File': [], 'Size': [], 'SHA256': []} 31 | file_path_set = set(get_all_files_list(dir_path)) 32 | 33 | file_path_set -= set(existing_df['File']) 34 | 35 | for filepath in file_path_set: 36 | try: 37 | file_size = os.path.getsize(filepath) 38 | file_sha256 = sha256(filepath) 39 | data['File'].append(filepath) 40 | data['Size'].append(file_size) 41 | data['SHA256'].append(file_sha256) 42 | except OSError: 43 | print('cannot read file: {}'.format(filepath)) 44 | 45 | df = pd.concat([existing_df, pd.DataFrame(data)], ignore_index=True) 46 | 47 | # 将 DataFrame 写入 pickle 文件 48 | df.to_pickle(pkl_file) 49 | 50 | # 示例用法 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--dir_path', required=True, help='The directory to traverse') 54 | parser.add_argument('--pkl_file', required=True, help='The pickle file to write to') 55 | 56 | args = parser.parse_args() 57 | write_to_csv(args.dir_path, args.pkl_file)
-------------------------------------------------------------------------------- /convert/README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 格式化 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 将大量文本文件转换为格式化的、易于查询的jsonl数据。 8 | 2. 快速标注同一文件内是否有明显重复的情况,统一放在 `problem_i.jsonl`里面。 9 | 10 | ### convert.py 使用说明 11 | 12 | 1. 使用说明: 13 | * `convert.py`是快速把txt文件转化为jsonl文件,并挑出明显自我重复的txt文件留待观察。 14 | * 本项目假设所有需要被去重的txt文件编码均为UTF-8编码,批量转换请参考[charset-mnbvc](https://github.com/alanshi/charset_mnbvc)。 15 | * 本项目暂时只实现了从txt到jsonl的转换,暂未考虑其他类型数据。 16 | * 本项目删去了原始txt文件中的空行,以及行首与行尾的空白符。 17 | 2. 运行 `convert.py`文件并设置必要的参数。 18 | ```shell 19 | python convert.py --src_dir /path/to/source/directory --dst_dir /path/to/destination/directory --n_process 4 --threshold 0.7 20 | ``` 21 | 22 | 其中 `--src_dir`参数是必须的,它指定了要转换的源文件夹路径。如果未提供此参数,则会引发错误。 23 | 3. 可选参数 24 | * `--src`:指定源文件类型,默认为 `txt`。 25 | * `--dst`:指定目标文件类型,默认为 `jsonl`。 26 | * `--dst_dir`:指定转换后文件的输出目录,默认为 `converted/`。 27 | * `--n_process`:指定要使用的进程数,默认为1。另有 `--threshold`参数用于设定待查文件的判定阈值(去重段落数/段落数低于该值则标记为待查),默认为0.5。 28 | 29 | ### 输出的jsonl格式说明 30 | 31 | 1. 根据文件内段落的重复率是否高于给定的阈值,将文件分成正常文件和待查文件,其中正常文件数字加jsonl,如 `10.jsonl`,而待查文件则是problem_加数字加jsonl,如 `problem_7.jsonl` 32 | 2. 对于每个jsonl文件,其大小略大于500MiB,这个数值定义在 `utils.py`中的 `max_size`,可根据需要更改 33 | 3.
对于每一个文件,他的json结构层次如下: 34 | 35 | ```python 36 | { 37 | '文件名': '文件.txt', 38 | '是否待查文件': False, 39 | '是否重复文件': False, 40 | '文件大小': 1024, 41 | 'simhash': 0, 42 | '最长段落长度': 0, 43 | '段落数': 0, 44 | '去重段落数': 0, 45 | '低质量段落数': 0, 46 | '段落': [] 47 | } 48 | ``` 49 | 50 | 将每一行为一个段落,段落的json结构层次如下: 51 | 52 | ```python 53 | { 54 | '行号': line_number, 55 | '是否重复': False, 56 | '是否跨文件重复': False, 57 | 'md5': md5, 58 | '内容': line 59 | } 60 | ``` 61 | 62 | ### DEMO示例 63 | 64 | ```bash 65 | python convert.py --src_dir /home/xiang/文档/mnbvcfiles --dst_dir ./mnbvcfiles --n_process 8 --threshold 0.7 66 | ``` 67 | -------------------------------------------------------------------------------- /convert/convert.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | current_path = os.path.abspath(__file__) 3 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 4 | from multiprocessing import Process 5 | import multiprocessing 6 | import argparse 7 | import tqdm 8 | from utils.utils import max_size, get_all_files 9 | import jsonlines 10 | import hashlib 11 | import utils.customSimhash as customSimhash 12 | import logging 13 | 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 15 | 16 | def from_txt_to_json(file_path, threshold): 17 | 18 | # 定义json结构 19 | file_json = {'文件名': os.path.abspath(file_path), 20 | '是否待查文件': False, 21 | '是否重复文件': False, 22 | '文件大小': os.path.getsize(file_path), 23 | 'simhash': 0, 24 | '最长段落长度': 0, 25 | '段落数': 0, 26 | '去重段落数': 0, 27 | '低质量段落数': 0, 28 | '段落': []} 29 | # 定义用于去重的set 30 | hashs = set() 31 | 32 | # 读取每一行 33 | with open(file_path, 'r', encoding='utf-8', errors='strict') as f: 34 | texts = [] 35 | for line_number, line in enumerate(f): 36 | # 去除行首尾空格 37 | line = line.strip() 38 | # 计算最长段落长度 39 | file_json['最长段落长度'] = max(file_json['最长段落长度'], len(line)) 40 | # 删除空行 41 | if len(line) == 0: 42 | continue 43 | # 计算每一行的md5值 44 | md5 = hashlib.md5(line.encode()).hexdigest() 45 | # 将每一行内容添加到json中 46 | file_json['段落'].append({'行号': line_number, 47 | '是否重复': md5 in hashs, 48 | '是否跨文件重复': False, 49 | 'md5': md5, 50 | '内容': line 51 | }) 52 | if md5 not in hashs: 53 | texts.append(line) 54 | 55 | # 将md5值添加到set中,用于去重 56 | hashs.add(md5) 57 | 58 | if len(hashs) == 0: 59 | return None 60 | # 计算段落数和去重段落数 61 | file_json['段落数'] = len(file_json['段落']) 62 | file_json['去重段落数'] = len(hashs) 63 | # 计算simhash 64 | file_json['simhash'] = customSimhash.Simhash(texts).value 65 | # 判断是否是待查文件 66 | if (file_json['去重段落数'] / file_json['段落数']) < threshold: 67 | file_json['是否待查文件'] = True 68 | return file_json 69 | 70 | 71 | def run_process(file_path_queue, json_to_write_queue, threshold): 72 | # 不断从队列中获取文件路径 73 | while not file_path_queue.empty(): 74 | # 获取文件路径 75 | try: 76 | file_path = file_path_queue.get(timeout=0.2) 77 | except: 78 | break 79 | # 将文件转换为json 80 | try: 81 | one_json = from_txt_to_json(file_path, threshold) 82 | except UnicodeDecodeError: 83 | logging.error(f"Error: {file_path} is not encoded in utf-8.") 84 | json_to_write_queue.put(UnicodeDecodeError) 85 | exit(-1) 86 | # 把json写入到队列中 87 | json_to_write_queue.put(one_json) 88 | 89 | 90 | def write_jsonl(json_to_write_queue, file_nums, dst_dir): 91 | 92 | # 定义文件名 93 | file_name = 0 94 | problem_file_name = 0 95 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 96 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 97 | if os.path.exists(os.path.join(dst_dir, 'problem_' + str(problem_file_name) + 
'.jsonl')): 98 | logging.warning('Warning: problem_' + str(problem_file_name) + '.jsonl' + ' already exists.') 99 | # 遍历文件数量 100 | for _ in tqdm.tqdm(range(file_nums)): 101 | # 从队列中获取一个json 102 | one_json = json_to_write_queue.get() 103 | if one_json is None: 104 | continue 105 | if one_json == UnicodeDecodeError: 106 | return -1 107 | # 根据是否待查文件,写入不同的文件 108 | if one_json['是否待查文件']: 109 | with jsonlines.open(os.path.join(dst_dir, 'problem_' + str(problem_file_name) + '.jsonl'), 110 | mode='a') as last_problem_file: 111 | last_problem_file.write(one_json) 112 | # 如果当前文件大小超过限制,则更换文件名 113 | if last_problem_file._fp.tell() > max_size: 114 | problem_file_name += 1 115 | if os.path.exists(os.path.join(dst_dir, 'problem_' + str(problem_file_name) + '.jsonl')): 116 | logging.warning('Warning: problem_' + str(problem_file_name) + '.jsonl' + ' already exists.') 117 | else: 118 | with jsonlines.open(os.path.join(dst_dir, str(file_name) + '.jsonl'), mode='a') as last_file: 119 | last_file.write(one_json) 120 | # 如果当前文件大小超过限制,则更换文件名 121 | if last_file._fp.tell() > max_size: 122 | file_name += 1 123 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 124 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 125 | return 0 126 | 127 | def convert(src_dir, src='txt', dst='jsonl', dst_dir='converted/', n_process=4, threshold=0.95): 128 | # 检查输入参数是否合理 129 | assert os.path.exists(src_dir) 130 | assert src in ['txt', 'jsonl'] 131 | assert dst in ['txt', 'jsonl'] 132 | 133 | # 如果输出目录不存在,则创建 134 | if not os.path.exists(dst_dir): 135 | os.mkdir(dst_dir) 136 | 137 | # 如果源文件和目标文件类型不匹配,则抛出异常 138 | if src != 'txt' or dst != 'jsonl': 139 | raise NotImplementedError('Only support converting from txt to jsonl now.') 140 | 141 | # 获取源文件列表 142 | file_path_queue, file_nums = get_all_files(src_dir) 143 | json_to_write_queue = multiprocessing.Queue(200) 144 | 145 | # 启动多进程,将源文件转换为json 146 | processes = [] 147 | for _ in range(n_process): 148 | p = Process(target=run_process, args=(file_path_queue, json_to_write_queue, threshold)) 149 | p.start() 150 | processes.append(p) 151 | 152 | # 将json写入文件 153 | exit_code = write_jsonl(json_to_write_queue, file_nums, dst_dir) 154 | 155 | if exit_code == -1: 156 | for p in processes: 157 | p.terminate() 158 | else: 159 | for p in processes: 160 | p.join() 161 | 162 | 163 | if __name__ == '__main__': 164 | # 设置参数解析器 165 | parser = argparse.ArgumentParser() 166 | # 添加必须指定的参数 167 | parser.add_argument('--src_dir', type=str, required=True, help="源文件夹路径") 168 | # 添加可选参数,指定源文件类型,默认为txt 169 | parser.add_argument('--src', type=str, default='txt', help="指定源文件类型,默认为txt") 170 | # 添加可选参数,指定目标文件类型,默认为jsonl 171 | parser.add_argument('--dst', type=str, default='jsonl', help="指定目标文件类型,默认为jsonl") 172 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 173 | parser.add_argument('--dst_dir', type=str, default='converted/', help="指定转换后文件存放路径,默认为converted/") 174 | # 添加可选参数,指定进程数,默认为1 175 | parser.add_argument('--n_process', type=int, default=1, help="指定进程数,默认为1") 176 | # 添加可选参数,指定去重阈值,默认为0.5 177 | parser.add_argument('--threshold', type=float, default=0.5, help="指定去重阈值,默认为0.5") 178 | # 解析参数 179 | args = parser.parse_args() 180 | # 调用convert函数 181 | convert(args.src_dir, args.src, args.dst, args.dst_dir, args.n_process, args.threshold) 182 | -------------------------------------------------------------------------------- /convert/wudao_convert.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | current_path = 
os.path.abspath(__file__) 3 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 4 | from multiprocessing import Process 5 | import multiprocessing 6 | import argparse 7 | import tqdm 8 | from utils.utils import max_size, get_all_files 9 | import jsonlines, json 10 | import hashlib 11 | import utils.customSimhash as customSimhash 12 | import logging 13 | 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 15 | 16 | def from_wudaojson_to_json(file_path, one_json): 17 | 18 | # 定义json结构 19 | file_json = {'文件名': os.path.abspath(file_path)+':'+one_json['title'], 20 | '是否待查文件': False, 21 | '是否重复文件': False, 22 | '文件大小': len(json.dumps(one_json)), 23 | 'simhash': 0, 24 | '最长段落长度': len(one_json['content']), 25 | '数据类型': one_json['dataType'], 26 | '段落数': 1, 27 | '去重段落数': 1, 28 | '低质量段落数': 0, 29 | '段落': []} 30 | 31 | lines = [one_json['content']] 32 | 33 | # 定义用于去重的set 34 | hashs = set() 35 | 36 | 37 | texts = [] 38 | for line in lines: 39 | # 去除行首尾空格 40 | line = line.strip() 41 | # 计算最长段落长度 42 | file_json['最长段落长度'] = max(file_json['最长段落长度'], len(line)) 43 | # 删除空行 44 | if len(line) == 0: 45 | continue 46 | # 计算每一行的md5值 47 | md5 = hashlib.md5(line.encode()).hexdigest() 48 | # 将每一行内容添加到json中 49 | file_json['段落'].append({'行号': 1, 50 | '是否重复': md5 in hashs, 51 | '是否跨文件重复': False, 52 | 'md5': md5, 53 | '内容': line 54 | }) 55 | if md5 not in hashs: 56 | texts.append(line) 57 | 58 | # 将md5值添加到set中,用于去重 59 | hashs.add(md5) 60 | 61 | if len(hashs) == 0: 62 | return None 63 | 64 | # 计算simhash 65 | file_json['simhash'] = customSimhash.Simhash(texts).value 66 | 67 | return file_json 68 | 69 | 70 | 71 | 72 | def convert(src_dir, dst_dir='converted/'): 73 | # 检查输入参数是否合理 74 | assert os.path.exists(src_dir) 75 | 76 | file_name = 0 77 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 78 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 79 | 80 | # 如果输出目录不存在,则创建 81 | os.makedirs(dst_dir, exist_ok=True) 82 | 83 | # 获取源文件列表 84 | file_path_list, file_nums = get_all_files(src_dir,legal_file_type=('.json', ), return_file_type='list') 85 | for _ in tqdm.tqdm(range(file_nums)): 86 | file = file_path_list.pop() 87 | with open(file, 'r', encoding='utf-8') as f: 88 | file_json = json.load(f) 89 | for one_json in file_json: 90 | one_json = from_wudaojson_to_json(file, one_json) 91 | 92 | with jsonlines.open(os.path.join(dst_dir, str(file_name) + '.jsonl'), mode='a') as last_file: 93 | last_file.write(one_json) 94 | # 如果当前文件大小超过限制,则更换文件名 95 | if last_file._fp.tell() > max_size: 96 | file_name += 1 97 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 98 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 99 | 100 | 101 | if __name__ == '__main__': 102 | # 设置参数解析器 103 | parser = argparse.ArgumentParser() 104 | # 添加必须指定的参数 105 | parser.add_argument('--src_dir', type=str, required=True, help="源文件夹路径") 106 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 107 | parser.add_argument('--dst_dir', type=str, default='converted/', help="指定转换后文件存放路径,默认为converted/") 108 | # 解析参数 109 | args = parser.parse_args() 110 | # 调用convert函数 111 | convert(args.src_dir, args.dst_dir) 112 | -------------------------------------------------------------------------------- /corpus_processing/blacklist.txt: -------------------------------------------------------------------------------- 1 | tmp 2 | ini 3 | jpg 4 | png 5 | jpeg 6 | gif 7 | css 8 | swf 9 | bmp 10 | tiff 11 | tif 12 | raw 13 | svg 14 | webp 15 | ico 16 
| psd 17 | ai 18 | cdr 19 | wmf 20 | pcx 21 | dng 22 | avi 23 | mp4 24 | mov 25 | wmv 26 | flv 27 | mkv 28 | webm 29 | m4v -------------------------------------------------------------------------------- /corpus_processing/clean_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | 5 | 6 | def clean_file(folder_path, blacklist_file): 7 | with open(blacklist_file, 'r') as f: 8 | balsklist = f.readlines() 9 | 10 | balsklist = [x.strip() for x in balsklist] 11 | 12 | with open('tobereomve.txt', 'w') as f: 13 | for root, _, files in os.walk(folder_path): 14 | for file in files: 15 | file_path = os.path.join(root, file) 16 | filename, extension = os.path.splitext(file_path) 17 | 18 | extension = extension.lower() 19 | 20 | # 扩展名带前面的.的,要多算一个 21 | if extension == '' or len(extension) > 7: 22 | f.write(file_path+'\n') 23 | elif extension[1:] in balsklist: 24 | f.write(file_path+'\n') 25 | elif not all(ord(c) < 128 for c in extension): 26 | f.write(file_path+'\n') 27 | 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--folder_path', type=str, required=True, help="所有文件路径") 33 | parser.add_argument('--blacklist_file', type=str, required=True, help="后缀名黑名单文件路径") 34 | 35 | args = parser.parse_args() 36 | clean_file(args.folder_path, args.blacklist_file) -------------------------------------------------------------------------------- /corpus_processing/decp936messy.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | from charset_mnbvc import api 3 | import argparse, shutil 4 | 5 | def get_all_files_list(dir_path): 6 | file_path_list = [] 7 | for root, _, files in os.walk(dir_path): 8 | for file in files: 9 | file_path = os.path.join(root, file) 10 | file_path_list.append(file_path) 11 | return file_path_list, len(file_path_list) 12 | 13 | 14 | if __name__ == '__main__': 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--folder_path', type=str, required=True, help="乱码文件的路径,请输入最后一个正常非乱码的文件夹,会在同文件夹生成新的文件夹,但不会删除原来的") 17 | args = parser.parse_args() 18 | 19 | file_path_list, file_nums = get_all_files_list(args.folder_path) 20 | for file in file_path_list: 21 | relative_path = os.path.relpath(file, args.folder_path) 22 | try: 23 | coding_name = api.from_data(data=relative_path.encode('cp437'), mode=2) 24 | 25 | ret = api.convert_encoding( 26 | source_data=relative_path.encode('cp437'), 27 | source_encoding=coding_name, 28 | target_encoding="utf-8", 29 | ) 30 | os.makedirs(os.path.dirname(os.path.join(args.folder_path, ret)), exist_ok=True) 31 | shutil.move(file, os.path.join(args.folder_path, ret)) 32 | except UnicodeEncodeError: 33 | print(f"{file} :为非cp437的路径,不改变") 34 | except Exception as e: 35 | print(f"Move {file} failed: {e}") 36 | -------------------------------------------------------------------------------- /corpus_processing/extract.py: -------------------------------------------------------------------------------- 1 | import os, hashlib 2 | import argparse 3 | import shutil, io 4 | import tarfile 5 | import zipfile 6 | import bz2 7 | import gzip 8 | import rarfile 9 | import py7zr 10 | import os, sys 11 | from charset_mnbvc import api 12 | from better_zipfile import fixcharset_zipfile 13 | import shutil 14 | 15 | def get_directory_size(directory): 16 | total_size = 0 17 | for dirpath, dirnames, filenames in os.walk(directory): 18 | for filename in filenames: 19 | filepath = 
os.path.join(dirpath, filename) 20 | if not os.path.islink(filepath): 21 | total_size += os.path.getsize(filepath) 22 | return total_size 23 | 24 | 25 | def get_extension(file_path): 26 | filename, extension = os.path.splitext(file_path) 27 | 28 | extensions = [] 29 | if extension: 30 | extensions.insert(0, extension) 31 | filename_1, extension = os.path.splitext(filename) 32 | if extension == '.tar': 33 | extensions.insert(0, extension) 34 | filename = filename_1 35 | return filename, ''.join(extensions) 36 | 37 | 38 | def check_long_name(extract_full_path, zip_file_name):# longname返回true 39 | paths = zip_file_name.split('/') 40 | file_name = paths[-1] 41 | if len(file_name.encode()) > 255 and len(os.path.join(extract_full_path, zip_file_name).encode()) < 4095: 42 | print(f"File name too long: \n{os.path.join(extract_full_path, zip_file_name)} \n") 43 | basename, extensions = get_extension(file_name) 44 | length = (255-len(extensions.encode())-8)//2 45 | basename = basename.encode()[:length].decode('utf-8', errors='ignore')+hashlib.md5(file_name.encode()).hexdigest()[:8]+basename.encode()[-length:].decode('utf-8', errors='ignore') 46 | new_name = basename + extensions 47 | return os.path.join(extract_full_path, '/'.join(paths[:-1]), new_name), True 48 | elif any(len(path.encode()) > 255 for path in paths) or len(os.path.join(extract_full_path, zip_file_name).encode()) > 4095: 49 | print(f"File name too long: \n {os.path.join(extract_full_path, zip_file_name)} \n") 50 | 51 | length = min(255, 4096-len(os.path.join(extract_full_path, 'long_name').encode()))-8 52 | 53 | new_name = zip_file_name.encode()[:length//2-1].decode('utf-8', errors='ignore') +hashlib.md5(zip_file_name.encode()).hexdigest()[:8]+ zip_file_name.encode()[1-length//2:].decode('utf-8', errors='ignore') 54 | new_name = '_'.join(new_name.split('/')) 55 | return os.path.join(extract_full_path, 'long_name', new_name), True 56 | 57 | return os.path.join(extract_full_path, zip_file_name), False 58 | 59 | 60 | def extract_zip(file, password, extract_full_path): 61 | 62 | with fixcharset_zipfile.ZipFile(file, 'r') as zip: 63 | zip.setpassword(password) 64 | 65 | auto_filelists = [] 66 | 67 | for file in zip.namelist(): 68 | problem = False 69 | if file.endswith('/'): 70 | continue 71 | 72 | new_file_path, if_long_name = check_long_name(extract_full_path, file) 73 | if if_long_name: 74 | problem = True 75 | 76 | if problem: 77 | basename = os.path.dirname(new_file_path) 78 | os.makedirs(basename, exist_ok=True) 79 | with zip.open(file, 'r') as f_in: 80 | data = f_in.read() 81 | with open(new_file_path, 'wb') as f_out: 82 | f_out.write(data) 83 | else: 84 | auto_filelists.append(file) 85 | 86 | zip.extractall(extract_full_path, auto_filelists) 87 | 88 | 89 | 90 | 91 | 92 | def extract_archive(file_path, extract_full_path, file, password=None): 93 | 94 | filename, extension = get_extension(file) 95 | extract_succcessful = True 96 | try: 97 | if extension == '.tar': 98 | with tarfile.open(file_path, 'r') as tar: 99 | tar.extractall(extract_full_path) 100 | elif extension == '.tbz2' or extension == '.tar.bz2': 101 | with tarfile.open(file_path, 'r:bz2') as tar: 102 | tar.extractall(extract_full_path) 103 | elif extension == '.tgz' or extension == '.tar.gz' or extension == '.tar.Z': 104 | with tarfile.open(file_path, 'r:gz') as tar: 105 | tar.extractall(extract_full_path) 106 | elif extension == '.tar.xz': 107 | with tarfile.open(file_path, 'r:xz') as tar: 108 | tar.extractall(extract_full_path) 109 | elif extension == '.bz2': 110 | if 
not os.path.exists(extract_full_path): 111 | os.mkdir(extract_full_path) 112 | with bz2.open(file_path, 'rb') as f_in: 113 | with open(os.path.join(extract_full_path, filename), 'wb') as f_out: 114 | shutil.copyfileobj(f_in, f_out) 115 | elif extension == '.rar': 116 | with rarfile.RarFile(file_path, 'r') as rar: 117 | rar.setpassword(password) 118 | 119 | problem = False 120 | 121 | for file in rar.namelist(): 122 | if file.endswith('/'): 123 | continue 124 | new_file_path, if_long_name = check_long_name(extract_full_path, file) 125 | if if_long_name: 126 | problem = True 127 | break 128 | 129 | if problem: 130 | for file in rar.namelist(): 131 | if file.endswith('/'): 132 | continue 133 | new_file_path, _ = check_long_name(extract_full_path, file) 134 | basename = os.path.dirname(new_file_path) 135 | 136 | os.makedirs(basename, exist_ok=True) 137 | with rar.open(file, 'r') as f_in: 138 | data = f_in.read() 139 | with open(new_file_path, 'wb') as f_out: 140 | f_out.write(data) 141 | # print(f"File extract to: {new_file_path}") 142 | else: 143 | rar.extractall(extract_full_path) 144 | 145 | elif extension == '.gz': 146 | if not os.path.exists(extract_full_path): 147 | os.mkdir(extract_full_path) 148 | 149 | with gzip.open(file_path, 'rb') as f_in: 150 | with open(os.path.join(extract_full_path, filename), 'wb') as f_out: 151 | shutil.copyfileobj(f_in, f_out) 152 | elif extension in ('.zip', '.exe'): 153 | extract_zip(file_path, password, extract_full_path) 154 | 155 | elif extension == '.7z': 156 | with py7zr.SevenZipFile(file_path, mode='r', password=password) as seven_zip: 157 | seven_zip.extractall(extract_full_path) 158 | else: 159 | print(f"Unsupported file format: {extension}") 160 | extract_succcessful = False 161 | 162 | except Exception as e: 163 | print(f"Extracting {file_path} failed: {e}") 164 | extract_succcessful = False 165 | 166 | extract_dir_size = get_directory_size(extract_full_path) 167 | file_size = os.path.getsize(file_path) 168 | if extract_succcessful and file_size <= extract_dir_size: 169 | os.remove(file_path) 170 | print(f"文件 '{file_path}' 已成功删除。") 171 | elif os.path.isfile(extract_full_path) and file_size <= os.path.getsize(extract_full_path): 172 | #有时解压出来不是dir而是file,目前看到gz包有这种情况,具体原因还需分析 173 | os.remove(file_path) 174 | print(f"文件 '{file_path}' 已成功删除。") 175 | else: 176 | print(f"解压结果为 '{extract_succcessful}'。 解压前文件大小为'{file_size}',解压后文件夹大小为'{extract_dir_size}'") 177 | #检查路径长度,避免删除风险 178 | if len(extract_full_path) >= 20: 179 | # 确保路径存在,并且实际上是一个目录 180 | if os.path.isdir(extract_full_path): 181 | try: 182 | shutil.rmtree(extract_full_path) 183 | print(f"目录 '{extract_full_path}' 已删除。") 184 | except: 185 | print(f"Error:目录 '{extract_full_path}' 删除报错。") 186 | else: 187 | print(f"Error:提供的路径 '{extract_full_path}' 不是有效的目录。") 188 | else: 189 | print(f"Error:路径 '{extract_full_path}' 长度不足,为了安全起见,路径长度至少需要20个字符。") 190 | 191 | return extract_succcessful 192 | 193 | 194 | def traverse_directory(folder_path, passwords=None): 195 | if not os.path.exists(folder_path): 196 | print(f"{folder_path} does not exist!") 197 | return 198 | if not passwords is None: 199 | with open(passwords, 'r') as f: 200 | balsklist = f.readlines() 201 | passwords = [x.strip() for x in balsklist] 202 | else : 203 | passwords = [] 204 | 205 | 206 | for root, dirs, files in os.walk(folder_path): 207 | extract_path_set = set(dirs) 208 | 209 | for file in files: 210 | # 判断文件是否为压缩包类型 211 | if file.endswith(('.tar', '.tbz2', '.tgz', '.tar.bz2', '.tar.gz', '.tar.xz', '.tar.Z', '.bz2', '.rar', '.gz', 
'.zip', '.xz', '.7z', '.exe')): 212 | 213 | file_path = os.path.join(root, file) 214 | # 把压缩包解压到的文件夹名 215 | extract_path, _ = get_extension(file) 216 | 217 | if extract_path in extract_path_set: 218 | for i in range(1, 10000): 219 | if f"{extract_path}_{i}" not in extract_path_set: 220 | extract_path = f"{extract_path}_{i}" 221 | break 222 | if i == 9999: 223 | print(f"Too many files in {root}") 224 | raise Exception(f"Too many files in {root}") 225 | 226 | extract_full_path = os.path.join(root, extract_path) 227 | if not os.path.islink(file_path): 228 | extract_succcessful = extract_archive(file_path, extract_full_path, file) 229 | 230 | if not extract_succcessful: 231 | for password in passwords: 232 | print(f"Try password: {password}") 233 | extract_succcessful = extract_archive(file_path, extract_full_path, file, password=password.encode()) 234 | if extract_succcessful: 235 | break 236 | 237 | # if extract_succcessful: 238 | # traverse_directory(extract_full_path) 239 | 240 | extract_path_set.add(extract_path) 241 | 242 | 243 | if __name__ == '__main__': 244 | parser = argparse.ArgumentParser() 245 | parser.add_argument('--folder_path', type=str, required=True, help="压缩包路径") 246 | parser.add_argument('--passwords_files', type=str, default=None, help="压缩包密码文件路径") 247 | args = parser.parse_args() 248 | 249 | traverse_directory(args.folder_path, args.passwords_files) -------------------------------------------------------------------------------- /corpus_processing/move_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import argparse 4 | 5 | def move_files(input_dir, output_dir, suffix, keywords:str): 6 | if os.path.exists(input_dir) == False: 7 | raise ValueError('输入目录不存在') 8 | if os.path.abspath(input_dir) == os.path.abspath(output_dir): 9 | raise ValueError('输入目录和输出目录不能相同') 10 | 11 | keywords = [keyword for keyword in keywords.split(',') if keyword != ''] 12 | 13 | os.makedirs(output_dir, exist_ok=True) 14 | for root, _, files in os.walk(input_dir): 15 | # 获取相对于输入目录的路径 16 | relative_path = os.path.relpath(root, input_dir) 17 | 18 | # 创建目标目录 19 | target_dir = os.path.join(output_dir, relative_path) 20 | first_create = True 21 | # 移动符合条件的文件 22 | for file in files: 23 | file_path = os.path.join(root, file) 24 | # 检查file_path是否包含全部关键词 25 | in_keywords = True 26 | for keyword in keywords: 27 | if keyword not in file_path: 28 | in_keywords = False 29 | break 30 | if file.lower().endswith(suffix) and in_keywords: 31 | try: 32 | if first_create: 33 | os.makedirs(target_dir, exist_ok=True) 34 | first_create = False 35 | source_file = os.path.join(root, file) 36 | target_file = os.path.join(target_dir, file) 37 | target_file = target_file[:len(target_file) - len(suffix)] + suffix 38 | shutil.move(source_file, target_file) 39 | except Exception as e: 40 | print(f"移动文件时出现异常: {e}") 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--input_dir', type=str, required=True, help="输入目录") 45 | parser.add_argument('--output_dir', type=str, required=True, help="输出目录") 46 | parser.add_argument('--suffix', type=str, required=True, help="后缀名") 47 | parser.add_argument('--keywords', type=str, default='', help="关键词") 48 | 49 | args = parser.parse_args() 50 | move_files(args.input_dir, args.output_dir, args.suffix, args.keywords) -------------------------------------------------------------------------------- /corpus_processing/passwords.txt: 
-------------------------------------------------------------------------------- 1 | 253874 -------------------------------------------------------------------------------- /corpus_processing/readme.txt: -------------------------------------------------------------------------------- 1 | extract.py是压缩包解压的代码,可以嵌套解压,并且会原始压缩包删除 2 | 如果有密码或者不支持的类型,会报错 3 | 仅测试过几种类型,待测试更多 4 | 解压炸弹没处理 5 | 同一个名字的压缩包将会在后面加_数字,生成新的目录,小概率有空目录 6 | cleanfile.py是清理后缀扩展名的代码 7 | 黑名单制度,在黑名单上的会被删除 8 | move_file.py是移动文件的代码,可以在保留文件目录的前提下把文件移动到目标目录下 -------------------------------------------------------------------------------- /parallel_dedup/README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 单机多进程文件去重 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 在个人电脑上,实现对百万个文件的量级的快速去重操作。 8 | 9 | ### 使用说明 10 | 11 | 1. #### convert_jsonl_to_csv.py 12 | 13 | 1. 使用说明: 14 | 15 | - `convert_jsonl_to_csv.py`是把jsonl文件转化为对应txt的元数据组成的csv文件,用于储存文件的md5的集合。 16 | - 本项目依赖的是从 `convert.py`输出的jsonl文件,请不要使用其他格式。 17 | - 本项目使用对jsonl文件名的MD5值以及对原始文件的MD5值连接起来作为输出csv的文件名,会有极小概率丢失文件元数据,导致去重过程不包含该文件,如对全部文件的完整度非常在意,请检查生成的csv文件数与原始txt文件数量。 18 | 2. 运行 `convert_jsonl_to_csv.py`文件并设置必要的参数。 19 | 20 | ```bash 21 | python convert_jsonl_to_csv.py --src_dir /path/to/source/directory --dst_dir /path/to/destination/directory 22 | ``` 23 | 24 | 其中 `--src_dir`参数是必须的,它指定了要转换的jsonl源文件夹路径。如果未提供此参数,则会引发错误。 25 | 3. 可选参数 26 | 27 | - `--dst_dir`:指定转换后文件的输出目录,默认为 `output_csv/`。 28 | 2. #### multiprocess_deduplication.py 29 | 30 | 1. 使用说明: 31 | 32 | - `multiprocess_deduplication.py`是MNBVC单机多进程文件去重部分的主要代码。 33 | - 本项目依赖的是从 `convert_jsonl_to_csv.py`输出的csv文件,请不要使用其他格式。 34 | - 本项目最少使用2个进程,最多无上限,建议使用电脑cpu核心数的进程数量。其中一个进程进行去重的比较工作,其他所有进程用于读取重复的文件的csv列表,用于二次验证是否重复。 35 | 2. 运行 `multiprocess_deduplication.py`文件并设置必要的参数。 36 | 37 | ```bash 38 | python multiprocess_deduplication.py --src_dir /path/to/source/directory --n_process 10 --simhash_threshold 3 --jaccard_thresold 0.8 39 | ``` 40 | 41 | 其中 `--src_dir`参数是必须的,它指定了要转换的csv源文件夹路径。如果未提供此参数,则会引发错误。 42 | 3. 可选参数 43 | 44 | - `--simhash_threshold`:指定simhash阈值,默认设置为3,效果较好。这个值如果大于5,则会极慢无比。 45 | - `--jaccard_thresold`:指定jaccard阈值,默认为0.8。低于这个数的可以手动看,进行决策是否重复。一般来说simhash阈值在3的时候,极少有真正重复的。 46 | - `--n_process`:指定要使用的进程数,默认为13。 47 | 3. #### reset_csv.py 48 | 49 | 1. 使用说明: 50 | 51 | - `reset_csv.py`是清除csv文件去重结果代码。我们会用 `multiprocess_deduplication.py`把去重结果写到csv文件中,若某次 `multiprocess_deduplication.py`参数选择出错,可以用本代码清除csv状态,重新进行去重。 52 | 2. 运行 `reset_csv.py`文件并设置必要的参数。 53 | 54 | ```bash 55 | python reset_csv.py --src_dir /path/to/source/directory 56 | ``` 57 | 58 | 其中 `--src_dir`参数是必须的,它指定了要转换的csv源文件夹路径。如果未提供此参数,则会引发错误。 59 | 4. #### write_output_to_jsonl.py 60 | 61 | 1. 使用说明: 62 | 63 | - `write_output_to_jsonl.py`是将csv去重的结果保存到原始的jsonl文件中去,属于去重最后一步。 64 | 2. 运行 `write_output_to_jsonl.py`文件并设置必要的参数。 65 | 66 | ```bash 67 | python write_output_to_jsonl.py --csv_dir /path/to/source/csvdirectory --jsonl_dir /path/to/source/jsonldirectory 68 | ``` 69 | 70 | 其中 `--csv_dir`参数是必须的,它指定了csv源文件夹路径。如果未提供此参数,则会引发错误。 71 | 72 | 其中 `--jsonl_dir`参数是必须的,它指定了jsonl文件夹路径。如果未提供此参数,则会引发错误。 73 | 74 | ### 输出的csv格式说明 75 | 76 | 1. 对于每个jsonl文件,输出他jsonl路径名的MD5哈希以及对应每个txt文件MD5哈希的csv名,放入指定文件夹中 77 | 2. 
对于每一个文件,他的csv文件结构层次如下: 78 | 79 | 第一行: 80 | 81 | | 是否重复(0代表不重复,1代表重复) | jsonl文件名 | txt文件名 | simhash值 | 82 | | :--------------------------------: | :---------: | :-------: | :-------: | 83 | | 0 | jsonl文件名 | txt文件名 | simhash值 | 84 | 85 | 第二行: 86 | 87 | MD5列表,每一列对应一个MD5值,为节省空间算力,只截取第8位到24位,既中间16位的MD5值。 88 | 89 | ### Demo示例 90 | 91 | ```bash 92 | python convert_jsonl_to_csv.py --src_dir ./mnbvcfiles --dst_dir ./output_csv 93 | # 假如simhash参数设置错误,simhash_threshold设置成12,导致速度极慢 94 | python multiprocess_deduplication.py --src_dir ./output_csv --n_process 15 --simhash_threshold 12 --jaccard_thresold 0.8 95 | # 先强行结束,在运行reset 96 | python reset_csv.py --src_dir ./output_csv 97 | # 正常跑一遍 98 | python multiprocess_deduplication.py --src_dir ./output_csv --n_process 15 --simhash_threshold 3 --jaccard_thresold 0.8 99 | python write_output_to_jsonl.py --csv_dir ./output_csv --jsonl_dir ./mnbvcfiles 100 | ``` 101 | -------------------------------------------------------------------------------- /parallel_dedup/convert_jsonl_to_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tqdm 3 | import os, sys 4 | current_path = os.path.abspath(__file__) 5 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 6 | from utils.utils import get_all_files 7 | import jsonlines 8 | import csv 9 | import hashlib 10 | 11 | def convert_jsonl_to_csv(src_dir, dst_dir): 12 | os.makedirs(dst_dir, exist_ok=True) 13 | 14 | # 获取所有jsonl文件 15 | file_path_list, file_nums = get_all_files(src_dir, ['.jsonl'], 'list') 16 | 17 | for i in tqdm.tqdm(range(file_nums)): 18 | with jsonlines.open(file_path_list[i]) as reader: 19 | file_path_list[i] = os.path.abspath(file_path_list[i]) 20 | for one_json in reader: 21 | file_name = hashlib.md5(file_path_list[i].encode('utf-8')).hexdigest() + hashlib.md5(one_json['文件名'].encode('utf-8')).hexdigest() + '.csv' 22 | with open(os.path.join(dst_dir, file_name), 'w', encoding='utf-8') as f: 23 | writer = csv.writer(f) 24 | row = [0, file_path_list[i], one_json['文件名'], one_json['simhash']] 25 | writer.writerow(row) 26 | md5s = {one_json['段落'][i]['md5'][8:-8] for i in range(len(one_json['段落']))} 27 | writer.writerow(md5s) 28 | 29 | 30 | ### 31 | # @description: 将jsonl文件转换为csv文件 32 | # @param src_dir: jsonl源文件夹路径 33 | # @param dst_dir: 转换后文件存放路径 34 | # 输出csv,第一行第一个是是否重复,第二个是jsonl文件名,第三个是txt文件名,第四个是simhash 35 | # 第二行是md5集合 36 | ### 37 | 38 | if __name__ == '__main__': 39 | # 设置参数解析器 40 | parser = argparse.ArgumentParser() 41 | # 添加必须指定的参数 42 | parser.add_argument('--src_dir', type=str, required=True, help="jsonl源文件夹路径") 43 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 44 | parser.add_argument('--dst_dir', type=str, default='output_csv/', help="指定转换后文件存放路径,默认为 output_csv/") 45 | 46 | # 解析参数 47 | args = parser.parse_args() 48 | # 调用convert函数 49 | convert_jsonl_to_csv(args.src_dir, args.dst_dir) 50 | -------------------------------------------------------------------------------- /parallel_dedup/multiprocess_deduplication.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os, sys 3 | current_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 5 | from utils.utils import jaccard_distance, get_all_files 6 | import tqdm 7 | import multiprocessing 8 | import utils.customSimhash as customSimhash 9 | import csv, pickle 10 | 11 | 12 | def deduplication(file_path_list, rs_pkl, simhash_threshold, similar_file_queue, flag): 13 | 14 | lsh = 
customSimhash.SimhashIndex([], f=64, k=simhash_threshold) 15 | 16 | if rs_pkl != None: 17 | try: 18 | lsh.bucket = pickle.load(open(rs_pkl, 'rb')) 19 | except: 20 | print('不存在该pkl文件,无法读取') 21 | 22 | count_dedup = 0 23 | for i in tqdm.tqdm(range(len(file_path_list))): 24 | with open(file_path_list[i], encoding="utf-8") as csvfile: 25 | reader = csv.reader(csvfile) 26 | row = next(reader) 27 | simhash_value = customSimhash.Simhash(int(row[3])) 28 | similar= lsh.add(file_path_list[i], simhash_value, return_similar=True) 29 | if(similar != ""): 30 | count_dedup += 1 31 | similar_file = [file_path_list[i], similar] 32 | similar_file_queue.put(similar_file) 33 | 34 | print('一共有:', count_dedup, '个重复文件被检查出来') 35 | 36 | if rs_pkl != None: 37 | pickle.dump(lsh.bucket, open(rs_pkl, 'wb')) 38 | print('已经把文件记录保存到', rs_pkl, '中') 39 | 40 | flag.value = True 41 | 42 | def check_similar_file(similar_file_queue, jaccard_thresold, flag): 43 | while True: 44 | try: 45 | similar_file = similar_file_queue.get(timeout=0.2) 46 | except: 47 | if(flag.value): 48 | break 49 | else: 50 | continue 51 | 52 | with open(similar_file[0], encoding="utf-8") as csvfile: 53 | reader = csv.reader(csvfile) 54 | next(reader) 55 | md5_set1 = set(next(reader)) 56 | with open(similar_file[1], encoding="utf-8") as csvfile: 57 | reader = csv.reader(csvfile) 58 | next(reader) 59 | md5_set2 = set(next(reader)) 60 | if(jaccard_distance(md5_set1, md5_set2) < jaccard_thresold): 61 | print(similar_file[0], similar_file[1],'jaccard相似度检查失败') 62 | print('相似度为', jaccard_distance(md5_set1, md5_set2)) 63 | else: 64 | with open(similar_file[0], 'r+') as file: 65 | # 将第一个字符替换成'1' 66 | file.write('1') 67 | 68 | 69 | def files_deplication(src_dir = 'output_csv/', rs_pkl = None, simhash_threshold = 3, jaccard_thresold = 0.8 , n_process = 13): 70 | """ 71 | 将多个csv中的文件进行去重 72 | :param src_dir: csv文件路径 73 | :param rs_pkl: 保存去重结果的pkl文件路径 74 | :param simhash_threshold: 指定去重阈值,默认为3,也就是simhash值相差3以内算相似 75 | :param n_process: 指定进程数,最低是2,也就是一个主进程一个检验去重结果进程,默认是13 76 | """ 77 | # 获取所有jsonl文件 78 | file_path_list, file_nums = get_all_files(src_dir, ['.csv'], 'list') 79 | similar_file_queue = multiprocessing.Queue(200) 80 | flag = multiprocessing.Value('b', False) 81 | for _ in range(n_process-1): 82 | p = multiprocessing.Process(target=check_similar_file, args=(similar_file_queue, jaccard_thresold, flag)) 83 | p.start() 84 | 85 | deduplication(file_path_list, rs_pkl, simhash_threshold, similar_file_queue, flag) 86 | 87 | 88 | if __name__ == '__main__': 89 | # 设置参数解析器 90 | parser = argparse.ArgumentParser() 91 | # 添加必须指定的参数 92 | parser.add_argument('--src_dir', type=str, default='output_csv/', help="源文件夹路径") 93 | # read save pkl 94 | parser.add_argument('--rs_pkl', required=False, help="源文件夹路径") 95 | # 添加可选参数,指定去重阈值 96 | parser.add_argument('--simhash_threshold', type=int, default=3, help="指定simhash去重阈值,默认为3") 97 | # 添加可选参数,指定jaccard相似度阈值 98 | parser.add_argument('--jaccard_thresold', type=float, default=0.8, help="指定jaccard相似度阈值,默认为0.8") 99 | # 添加可选参数,指定进程数,默认为13 100 | parser.add_argument('--n_process', type=int, default=13, help="指定进程数,默认为13") 101 | # 解析参数 102 | args = parser.parse_args() 103 | # 调用convert函数 104 | files_deplication(args.src_dir, args.rs_pkl, args.simhash_threshold, args.jaccard_thresold, args.n_process) -------------------------------------------------------------------------------- /parallel_dedup/reset_csv.py: -------------------------------------------------------------------------------- 1 | import argparse, tqdm 2 | import os, sys 3 | 
current_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 5 | from utils.utils import get_all_files 6 | 7 | if __name__ == '__main__': 8 | # 设置参数解析器 9 | parser = argparse.ArgumentParser() 10 | # 添加必须指定的参数 11 | parser.add_argument('--src_dir', type=str, required=True, help="csv源文件夹路径") 12 | 13 | # 解析参数 14 | args = parser.parse_args() 15 | 16 | # 获取所有jsonl文件 17 | file_path_list, file_nums = get_all_files(args.src_dir, ['.csv'], 'list') 18 | 19 | for i in tqdm.tqdm(range(file_nums)): 20 | with open(file_path_list[i], 'r+', encoding='utf-8') as f: 21 | f.write('0') 22 | -------------------------------------------------------------------------------- /parallel_dedup/write_output_to_jsonl.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | current_path = os.path.abspath(__file__) 3 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 4 | import argparse 5 | import tqdm 6 | from utils.utils import get_all_files 7 | import jsonlines 8 | import hashlib 9 | import tempfile 10 | import json 11 | 12 | def write_output_to_jsonl(csv_dir, jsonl_dir): 13 | 14 | # 获取所有jsonl文件 15 | file_path_list, file_nums = get_all_files(jsonl_dir, ['.jsonl'], 'list') 16 | for i in tqdm.tqdm(range(file_nums)): 17 | with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: 18 | with jsonlines.open(file_path_list[i]) as reader: 19 | file_path_list[i] = os.path.abspath(file_path_list[i]) 20 | for one_json in reader: 21 | file_name = hashlib.md5(file_path_list[i].encode('utf-8')).hexdigest() + hashlib.md5(one_json['文件名'].encode('utf-8')).hexdigest() + '.csv' 22 | with open(os.path.join(csv_dir, file_name), 'r', encoding='utf-8') as f: 23 | if f.read(1) == '1': 24 | one_json['是否重复文件'] = True 25 | else: 26 | one_json['是否重复文件'] = False 27 | temp_file.write(json.dumps(one_json) + '\n') 28 | os.replace(temp_file.name, file_path_list[i]) 29 | 30 | 31 | 32 | if __name__ == '__main__': 33 | # 设置参数解析器 34 | parser = argparse.ArgumentParser() 35 | # 添加必须指定的参数 36 | parser.add_argument('--csv_dir', type=str, required=True, help="csv源文件夹路径") 37 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 38 | parser.add_argument('--jsonl_dir', type=str, required=True, help="jsonl源文件夹路径") 39 | 40 | # 解析参数 41 | args = parser.parse_args() 42 | # 调用convert函数 43 | write_output_to_jsonl(args.csv_dir, args.jsonl_dir) 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines==3.1.0 2 | tqdm==4.64.1 3 | argparse==1.4.0 4 | simhash==2.1.2 5 | cityhash 6 | better-zipfile>=0.0.3 7 | charset-mnbvc>=0.0.12 -------------------------------------------------------------------------------- /utils/customSimhash.py: -------------------------------------------------------------------------------- 1 | # Created by 1e0n in 2013 2 | from __future__ import division, unicode_literals 3 | 4 | import collections 5 | import hashlib 6 | import logging 7 | import numbers 8 | import re 9 | import sys 10 | 11 | import numpy as np 12 | 13 | try: 14 | from collections.abc import Iterable 15 | except ImportError: 16 | from collections import Iterable 17 | 18 | if sys.version_info[0] >= 3: 19 | basestring = str 20 | unicode = str 21 | long = int 22 | 23 | def int_to_bytes(n, length): 24 | return n.to_bytes(length, 'big') 25 | 26 | def bytes_to_int(b): 27 | return int.from_bytes(b, 'big') 28 | else: 29 | range = xrange 30 | 31 | 
def int_to_bytes(n, length): 32 | return '{:0{}x}'.format(n, length * 2).decode('hex') 33 | 34 | def bytes_to_int(b): 35 | return int(b.encode('hex'), 16) 36 | 37 | def _hashfunc(x): 38 | return hashlib.md5(x).digest() 39 | 40 | def count_elements(features): 41 | 42 | result = {} 43 | current_key = None 44 | count = 0 45 | 46 | for feature in sorted(features): 47 | if feature != current_key: 48 | if current_key is not None: 49 | result[current_key] = count 50 | current_key = feature 51 | count = 1 52 | else: 53 | count += 1 54 | 55 | # 处理最后一个分组 56 | if current_key is not None: 57 | result[current_key] = count 58 | 59 | return result 60 | 61 | class Simhash(object): 62 | # Constants used in calculating simhash. Larger values will use more RAM. 63 | large_weight_cutoff = 50 64 | batch_size = 200 65 | 66 | def __init__( 67 | self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=_hashfunc, log=None 68 | ): 69 | """ 70 | `f` is the dimensions of fingerprints, in bits. Must be a multiple of 8. 71 | 72 | `reg` is meaningful only when `value` is basestring and describes 73 | what is considered to be a letter inside parsed string. Regexp 74 | object can also be specified (some attempt to handle any letters 75 | is to specify reg=re.compile(r'\w', re.UNICODE)) 76 | 77 | `hashfunc` accepts a utf-8 encoded string and returns either bytes 78 | (preferred) or an unsigned integer, in at least `f // 8` bytes. 79 | """ 80 | if f % 8: 81 | raise ValueError('f must be a multiple of 8') 82 | 83 | self.f = f 84 | self.f_bytes = f // 8 85 | self.reg = reg 86 | self.value = None 87 | self.hashfunc = hashfunc 88 | self.hashfunc_returns_int = isinstance(hashfunc(b"test"), numbers.Integral) 89 | 90 | if log is None: 91 | self.log = logging.getLogger("simhash") 92 | else: 93 | self.log = log 94 | 95 | if isinstance(value, Simhash): 96 | self.value = value.value 97 | elif isinstance(value, basestring): 98 | self.build_by_text(unicode(value)) 99 | elif isinstance(value, Iterable): 100 | self.build_by_features(value) 101 | elif isinstance(value, numbers.Integral): 102 | self.value = value 103 | else: 104 | raise Exception('Bad parameter with type {}'.format(type(value))) 105 | 106 | def __eq__(self, other): 107 | """ 108 | Compare two simhashes by their value. 109 | 110 | :param Simhash other: The Simhash object to compare to 111 | """ 112 | return self.value == other.value 113 | 114 | def _slide(self, content, width=4): 115 | return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))] 116 | 117 | def _tokenize(self, content): 118 | content = content.lower() 119 | content = ''.join(re.findall(self.reg, content)) 120 | ans = self._slide(content) 121 | return ans 122 | 123 | def build_by_text(self, content): 124 | features = self._tokenize(content) 125 | features = count_elements(features) 126 | return self.build_by_features(features) 127 | 128 | def build_by_features(self, features): 129 | """ 130 | `features` might be a list of unweighted tokens (a weight of 1 131 | will be assumed), a list of (token, weight) tuples or 132 | a token -> weight dict. 
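For example (illustrative only), `Simhash(['a', 'a', 'b'])`, `Simhash([('a', 2), ('b', 1)])` and `Simhash({'a': 2, 'b': 1})` are weighted identically and therefore produce the same fingerprint value.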
133 | """ 134 | sums = [] 135 | batch = [] 136 | count = 0 137 | w = 1 138 | truncate_mask = 2 ** self.f - 1 139 | if isinstance(features, dict): 140 | features = features.items() 141 | 142 | for f in features: 143 | skip_batch = False 144 | if not isinstance(f, basestring): 145 | f, w = f 146 | skip_batch = w > self.large_weight_cutoff or not isinstance(w, int) 147 | 148 | count += w 149 | if self.hashfunc_returns_int: 150 | h = int_to_bytes(self.hashfunc(f.encode('utf-8')) & truncate_mask, self.f_bytes) 151 | else: 152 | h = self.hashfunc(f.encode('utf-8'))[-self.f_bytes:] 153 | 154 | if skip_batch: 155 | sums.append(self._bitarray_from_bytes(h) * w) 156 | else: 157 | batch.append(h * w) 158 | if len(batch) >= self.batch_size: 159 | sums.append(self._sum_hashes(batch)) 160 | batch = [] 161 | 162 | if len(sums) >= self.batch_size: 163 | sums = [np.sum(sums, 0)] 164 | 165 | if batch: 166 | sums.append(self._sum_hashes(batch)) 167 | 168 | combined_sums = np.sum(sums, 0) 169 | self.value = bytes_to_int(np.packbits(combined_sums > count / 2).tobytes()) 170 | 171 | def _sum_hashes(self, digests): 172 | bitarray = self._bitarray_from_bytes(b''.join(digests)) 173 | rows = np.reshape(bitarray, (-1, self.f)) 174 | return np.sum(rows, 0) 175 | 176 | @staticmethod 177 | def _bitarray_from_bytes(b): 178 | return np.unpackbits(np.frombuffer(b, dtype='>B')) 179 | 180 | def distance(self, another): 181 | assert self.f == another.f 182 | x = (self.value ^ another.value) & ((1 << self.f) - 1) 183 | ans = 0 184 | while x: 185 | ans += 1 186 | x &= x - 1 187 | return ans 188 | 189 | 190 | class SimhashIndex(object): 191 | 192 | def __init__(self, objs, f=64, k=2, log=None): 193 | """ 194 | `objs` is a list of (obj_id, simhash) 195 | obj_id is a string, simhash is an instance of Simhash 196 | `f` is the same with the one for Simhash 197 | `k` is the tolerance 198 | """ 199 | self.k = k 200 | self.f = f 201 | count = len(objs) 202 | 203 | if log is None: 204 | self.log = logging.getLogger("simhash") 205 | else: 206 | self.log = log 207 | 208 | self.log.info('Initializing %s data.', count) 209 | 210 | self.bucket = collections.defaultdict(set) 211 | 212 | for i, q in enumerate(objs): 213 | if i % 10000 == 0 or i == count - 1: 214 | self.log.info('%s/%s', i + 1, count) 215 | 216 | self.add(*q) 217 | 218 | def get_near_dups(self, simhash): 219 | """ 220 | `simhash` is an instance of Simhash 221 | return a list of obj_id, which is in type of str 222 | """ 223 | assert simhash.f == self.f 224 | 225 | ans = set() 226 | 227 | for key in self.get_keys(simhash): 228 | dups = self.bucket[key] 229 | self.log.debug('key:%s', key) 230 | if len(dups) > 200: 231 | self.log.warning('Big bucket found. key:%s, len:%s', key, len(dups)) 232 | 233 | for dup in dups: 234 | sim2, obj_id = dup.split(',', 1) 235 | sim2 = Simhash(long(sim2, 16), self.f) 236 | 237 | d = simhash.distance(sim2) 238 | if d <= self.k: 239 | ans.add(obj_id) 240 | return list(ans) 241 | 242 | def get_near_dup(self, simhash): 243 | """ 244 | `simhash` is an instance of Simhash 245 | return a list of obj_id, which is in type of str 246 | """ 247 | assert simhash.f == self.f 248 | 249 | for key in self.get_keys(simhash): 250 | dups = self.bucket[key] 251 | self.log.debug('key:%s', key) 252 | if len(dups) > 200: 253 | self.log.warning('Big bucket found. 
key:%s, len:%s', key, len(dups)) 254 | 255 | for dup in dups: 256 | sim2, obj_id = dup.split(',', 1) 257 | sim2 = Simhash(long(sim2, 16), self.f) 258 | 259 | d = simhash.distance(sim2) 260 | if d <= self.k: 261 | return obj_id 262 | return '' 263 | 264 | def add(self, obj_id, simhash, return_similar=False): 265 | """ 266 | `obj_id` is a string 267 | `simhash` is an instance of Simhash 268 | `return_similar` is a bool, if True, return the similar obj_id 269 | """ 270 | assert simhash.f == self.f 271 | 272 | similar = '' 273 | for key in self.get_keys(simhash): 274 | v = '%x,%s' % (simhash.value, obj_id) 275 | # 如果当前文件已经在bucket里面,就直接返回 276 | if v in self.bucket[key]: 277 | return '' 278 | 279 | if return_similar and similar == '': 280 | for dup in self.bucket[key]: 281 | sim2, obj_id2 = dup.split(',', 1) 282 | sim2 = Simhash(long(sim2, 16), self.f) 283 | 284 | d = simhash.distance(sim2) 285 | if d <= self.k: 286 | similar = obj_id2 287 | 288 | self.bucket[key].add(v) 289 | 290 | return similar 291 | 292 | def delete(self, obj_id, simhash): 293 | """ 294 | `obj_id` is a string 295 | `simhash` is an instance of Simhash 296 | """ 297 | assert simhash.f == self.f 298 | 299 | for key in self.get_keys(simhash): 300 | v = '%x,%s' % (simhash.value, obj_id) 301 | if v in self.bucket[key]: 302 | self.bucket[key].remove(v) 303 | 304 | @property 305 | def offsets(self): 306 | """ 307 | You may optimize this method according to 308 | """ 309 | return [self.f // (self.k + 1) * i for i in range(self.k + 1)] 310 | 311 | def get_keys(self, simhash): 312 | for i, offset in enumerate(self.offsets): 313 | if i == (len(self.offsets) - 1): 314 | m = 2 ** (self.f - offset) - 1 315 | else: 316 | m = 2 ** (self.offsets[i + 1] - offset) - 1 317 | c = simhash.value >> offset & m 318 | yield '%x:%x' % (c, i) 319 | 320 | def bucket_size(self): 321 | return len(self.bucket) 322 | -------------------------------------------------------------------------------- /utils/redisSimhash.py: -------------------------------------------------------------------------------- 1 | # # Created by 1e0n in 2013 2 | # from __future__ import division, unicode_literals 3 | 4 | # import collections 5 | # import hashlib 6 | # import logging 7 | # import numbers 8 | # import re 9 | # import sys 10 | # from itertools import groupby 11 | 12 | # import numpy as np 13 | 14 | # try: 15 | # from collections.abc import Iterable 16 | # except ImportError: 17 | # from collections import Iterable 18 | # import redis 19 | 20 | 21 | # if sys.version_info[0] >= 3: 22 | # basestring = str 23 | # unicode = str 24 | # long = int 25 | 26 | # def int_to_bytes(n, length): 27 | # return n.to_bytes(length, 'big') 28 | 29 | # def bytes_to_int(b): 30 | # return int.from_bytes(b, 'big') 31 | # else: 32 | # range = xrange 33 | 34 | # def int_to_bytes(n, length): 35 | # return '{:0{}x}'.format(n, length * 2).decode('hex') 36 | 37 | # def bytes_to_int(b): 38 | # return int(b.encode('hex'), 16) 39 | 40 | # def _hashfunc(x): 41 | # return hashlib.md5(x).digest() 42 | 43 | 44 | # class Simhash(object): 45 | # # Constants used in calculating simhash. Larger values will use more RAM. 46 | # large_weight_cutoff = 50 47 | # batch_size = 200 48 | 49 | # def __init__( 50 | # self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=_hashfunc, log=None 51 | # ): 52 | # """ 53 | # `f` is the dimensions of fingerprints, in bits. Must be a multiple of 8. 
54 | 55 | # `reg` is meaningful only when `value` is basestring and describes 56 | # what is considered to be a letter inside parsed string. Regexp 57 | # object can also be specified (some attempt to handle any letters 58 | # is to specify reg=re.compile(r'\w', re.UNICODE)) 59 | 60 | # `hashfunc` accepts a utf-8 encoded string and returns either bytes 61 | # (preferred) or an unsigned integer, in at least `f // 8` bytes. 62 | # """ 63 | # if f % 8: 64 | # raise ValueError('f must be a multiple of 8') 65 | 66 | # self.f = f 67 | # self.f_bytes = f // 8 68 | # self.reg = reg 69 | # self.value = None 70 | # self.hashfunc = hashfunc 71 | # self.hashfunc_returns_int = isinstance(hashfunc(b"test"), numbers.Integral) 72 | 73 | # if log is None: 74 | # self.log = logging.getLogger("simhash") 75 | # else: 76 | # self.log = log 77 | 78 | # if isinstance(value, Simhash): 79 | # self.value = value.value 80 | # elif isinstance(value, basestring): 81 | # self.build_by_text(unicode(value)) 82 | # elif isinstance(value, Iterable): 83 | # self.build_by_features(value) 84 | # elif isinstance(value, numbers.Integral): 85 | # self.value = value 86 | # else: 87 | # raise Exception('Bad parameter with type {}'.format(type(value))) 88 | 89 | # def __eq__(self, other): 90 | # """ 91 | # Compare two simhashes by their value. 92 | 93 | # :param Simhash other: The Simhash object to compare to 94 | # """ 95 | # return self.value == other.value 96 | 97 | # def _slide(self, content, width=4): 98 | # return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))] 99 | 100 | # def _tokenize(self, content): 101 | # content = content.lower() 102 | # content = ''.join(re.findall(self.reg, content)) 103 | # ans = self._slide(content) 104 | # return ans 105 | 106 | # def build_by_text(self, content): 107 | # features = self._tokenize(content) 108 | # features = {k:sum(1 for _ in g) for k, g in groupby(sorted(features))} 109 | # return self.build_by_features(features) 110 | 111 | # def build_by_features(self, features): 112 | # """ 113 | # `features` might be a list of unweighted tokens (a weight of 1 114 | # will be assumed), a list of (token, weight) tuples or 115 | # a token -> weight dict. 
116 | # """ 117 | # sums = [] 118 | # batch = [] 119 | # count = 0 120 | # w = 1 121 | # truncate_mask = 2 ** self.f - 1 122 | # if isinstance(features, dict): 123 | # features = features.items() 124 | 125 | # for f in features: 126 | # skip_batch = False 127 | # if not isinstance(f, basestring): 128 | # f, w = f 129 | # skip_batch = w > self.large_weight_cutoff or not isinstance(w, int) 130 | 131 | # count += w 132 | # if self.hashfunc_returns_int: 133 | # h = int_to_bytes(self.hashfunc(f.encode('utf-8')) & truncate_mask, self.f_bytes) 134 | # else: 135 | # h = self.hashfunc(f.encode('utf-8'))[-self.f_bytes:] 136 | 137 | # if skip_batch: 138 | # sums.append(self._bitarray_from_bytes(h) * w) 139 | # else: 140 | # batch.append(h * w) 141 | # if len(batch) >= self.batch_size: 142 | # sums.append(self._sum_hashes(batch)) 143 | # batch = [] 144 | 145 | # if len(sums) >= self.batch_size: 146 | # sums = [np.sum(sums, 0)] 147 | 148 | # if batch: 149 | # sums.append(self._sum_hashes(batch)) 150 | 151 | # combined_sums = np.sum(sums, 0) 152 | # self.value = bytes_to_int(np.packbits(combined_sums > count / 2).tobytes()) 153 | 154 | # def _sum_hashes(self, digests): 155 | # bitarray = self._bitarray_from_bytes(b''.join(digests)) 156 | # rows = np.reshape(bitarray, (-1, self.f)) 157 | # return np.sum(rows, 0) 158 | 159 | # @staticmethod 160 | # def _bitarray_from_bytes(b): 161 | # return np.unpackbits(np.frombuffer(b, dtype='>B')) 162 | 163 | # def distance(self, another): 164 | # assert self.f == another.f 165 | # x = (self.value ^ another.value) & ((1 << self.f) - 1) 166 | # ans = 0 167 | # while x: 168 | # ans += 1 169 | # x &= x - 1 170 | # return ans 171 | 172 | 173 | # class SimhashIndex(object): 174 | 175 | # def __init__(self, objs, r, f=64, k=2, log=None): 176 | # """ 177 | # `objs` is a list of (obj_id, simhash) 178 | # obj_id is a string, simhash is an instance of Simhash 179 | # `f` is the same with the one for Simhash 180 | # `k` is the tolerance 181 | # """ 182 | # self.k = k 183 | # self.f = f 184 | # count = len(objs) 185 | 186 | # if log is None: 187 | # self.log = logging.getLogger("simhash") 188 | # else: 189 | # self.log = log 190 | 191 | # self.log.info('Initializing %s data.', count) 192 | 193 | # self.r = r 194 | 195 | 196 | # for i, q in enumerate(objs): 197 | # if i % 10000 == 0 or i == count - 1: 198 | # self.log.info('%s/%s', i + 1, count) 199 | 200 | # self.add(*q) 201 | 202 | # # def get_near_dups(self, simhash): 203 | # # """ 204 | # # `simhash` is an instance of Simhash 205 | # # return a list of obj_id, which is in type of str 206 | # # """ 207 | # # assert simhash.f == self.f 208 | 209 | # # for key in self.get_keys(simhash): 210 | # # dups = self.r.smembers(key) 211 | # # self.log.debug('key:%s', key) 212 | # # if len(dups) > 200: 213 | # # self.log.warning('Big bucket found. 
key:%s, len:%s', key, len(dups)) 214 | 215 | # # for dup in dups: 216 | # # sim2, obj_id = dup.split(',', 1) 217 | # # sim2 = Simhash(long(sim2, 16), self.f) 218 | 219 | # # d = simhash.distance(sim2) 220 | # # if d <= self.k: 221 | # # return obj_id 222 | # # return "" 223 | 224 | # def add(self, obj_id, simhash): 225 | # """ 226 | # `obj_id` is a string 227 | # `simhash` is an instance of Simhash 228 | # """ 229 | # assert simhash.f == self.f 230 | 231 | # similar = "" 232 | # for key in self.get_keys(simhash): 233 | # v = '%x,%s' % (simhash.value, obj_id) 234 | # if(similar == ""): 235 | # dups = self.r.smembers(key) 236 | # if(len(dups)>0): 237 | # for dup in dups: 238 | # dup = str(dup.decode('utf-8')) 239 | # sim2, obj_id = dup.split(',', 1) 240 | # sim2 = Simhash(long(sim2[2:], 16), self.f) 241 | 242 | # d = simhash.distance(sim2) 243 | # if d <= self.k: 244 | # similar = obj_id 245 | # break 246 | # self.r.sadd(key, v) 247 | 248 | # return similar 249 | 250 | # def delete(self, obj_id, simhash): 251 | # """ 252 | # `obj_id` is a string 253 | # `simhash` is an instance of Simhash 254 | # """ 255 | # assert simhash.f == self.f 256 | 257 | # for key in self.get_keys(simhash): 258 | # v = '%x,%s' % (simhash.value, obj_id) 259 | # if v in self.r.smembers(key): 260 | # self.r.srem(key, v) 261 | 262 | # @property 263 | # def offsets(self): 264 | # """ 265 | # You may optimize this method according to 266 | # """ 267 | # return [self.f // (self.k + 1) * i for i in range(self.k + 1)] 268 | 269 | # def get_keys(self, simhash): 270 | # for i, offset in enumerate(self.offsets): 271 | # if i == (len(self.offsets) - 1): 272 | # m = 2 ** (self.f - offset) - 1 273 | # else: 274 | # m = 2 ** (self.offsets[i + 1] - offset) - 1 275 | # c = simhash.value >> offset & m 276 | # yield '%x:%x' % (c, i) 277 | 278 | # def bucket_size(self): 279 | # return len(self.bucket) 280 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import multiprocessing 3 | 4 | str_encode = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 5 | max_size = 500 * 1024 * 1024 6 | max_memory = 1024 * 1024 * 1024 7 | 8 | 9 | # 递归读取文件夹下所有文件 10 | def get_all_files(dir_path, legal_file_type=('.txt',), return_file_type='queue'): 11 | if return_file_type == 'queue': 12 | return get_all_files_queue(dir_path, legal_file_type) 13 | elif return_file_type == 'list': 14 | return get_all_files_list(dir_path, legal_file_type) 15 | 16 | 17 | def get_all_files_queue(dir_path, legal_file_type=('.txt',)): 18 | file_nums = 0 19 | file_path_queue = multiprocessing.Manager().Queue() 20 | 21 | for root, _, files in os.walk(dir_path): 22 | for file in files: 23 | if os.path.splitext(file)[-1] not in legal_file_type: 24 | continue 25 | file_path = os.path.join(root, file) 26 | file_path_queue.put(file_path) 27 | file_nums += 1 28 | return file_path_queue, file_nums 29 | 30 | 31 | def get_all_files_list(dir_path, legal_file_type=('.txt',)): 32 | file_path_list = [] 33 | for root, _, files in os.walk(dir_path): 34 | for file in files: 35 | if os.path.splitext(file)[-1] not in legal_file_type: 36 | continue 37 | file_path = os.path.join(root, file) 38 | file_path_list.append(file_path) 39 | file_path_list = sorted(file_path_list) 40 | return file_path_list, len(file_path_list) 41 | 42 | 43 | def get_common_prefix_and_removed_list(strs): 44 | if not strs: 45 | return "", [] 46 | prefix = strs[0] 47 | 
for s in strs: 48 | while not s.startswith(prefix): 49 | prefix = prefix[:-1] 50 | if not prefix: 51 | return "", strs 52 | return prefix, [s[len(prefix):] for s in strs] 53 | 54 | 55 | def jaccard_distance(md5_list1, md5_list2): 56 | nominator = md5_list1.intersection(md5_list2) 57 | # 求集合 A 和集合 B 的并集 58 | denominator = md5_list1.union(md5_list2) 59 | # 计算比率 60 | similarity = len(nominator) / len(denominator) 61 | return similarity 62 | 63 | 64 | # 递归读取文件夹下所有文件夹 65 | def get_dictory_path(dir_path, return_file_type='queue'): 66 | if return_file_type == 'queue': 67 | 68 | def get_dictory_path_queue(dir_path): 69 | dictory_path_queue = multiprocessing.Queue() 70 | for root, dirs, _ in os.walk(dir_path): 71 | for dir in dirs: 72 | dictory_path = os.path.join(root, dir) 73 | dictory_path_queue.put(dictory_path) 74 | return dictory_path_queue 75 | 76 | return get_dictory_path_queue(dir_path) 77 | 78 | elif return_file_type == 'list': 79 | 80 | def get_dictory_path_list(dir_path): 81 | dictory_path_list = [] 82 | for root, dirs, _ in os.walk(dir_path): 83 | for dir in dirs: 84 | dictory_path = os.path.join(root, dir) 85 | dictory_path_list.append(dictory_path) 86 | return dictory_path_list 87 | 88 | return get_dictory_path_list(dir_path) 89 | 90 | 91 | # 不递归的读取当前文件夹的文件 92 | def get_files(dir_path, legal_file_type=('.txt',), return_file_type='queue'): 93 | if return_file_type == 'queue': 94 | def get_files_queue(dir_path, legal_file_type=('.txt',)): 95 | file_path_queue = multiprocessing.Queue() 96 | file_nums = 0 97 | for file in os.listdir(dir_path): 98 | if os.path.splitext(file)[-1] not in legal_file_type: 99 | continue 100 | file_path = os.path.join(dir_path, file) 101 | file_path_queue.put(file_path) 102 | file_nums += 1 103 | return file_path_queue, file_nums 104 | 105 | return get_files_queue(dir_path, legal_file_type) 106 | 107 | elif return_file_type == 'list': 108 | 109 | def get_files_list(dir_path, legal_file_type=('.txt',)): 110 | file_path_list = [] 111 | for file in os.listdir(dir_path): 112 | if os.path.splitext(file)[-1] not in legal_file_type: 113 | continue 114 | file_path = os.path.join(dir_path, file) 115 | file_path_list.append(file_path) 116 | return file_path_list, len(file_path_list) 117 | 118 | return get_files_list(dir_path, legal_file_type) 119 | -------------------------------------------------------------------------------- /words_dedup/add_jsonl_detailed_simhash.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import jsonlines 3 | import tqdm, os 4 | import tempfile 5 | import os, sys 6 | current_path = os.path.abspath(__file__) 7 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 8 | import utils.customSimhash as customSimhash 9 | from utils.utils import max_size, get_all_files 10 | import multiprocessing 11 | 12 | from cityhash import CityHash64 13 | 14 | def hashfunc(x): 15 | return CityHash64(x) 16 | 17 | def calculate_simhash(args): 18 | one_json, hashfunc = args 19 | text = '' 20 | for line_json in one_json['段落']: 21 | text += line_json['内容'] 22 | 23 | simhash = customSimhash.Simhash(text, hashfunc=hashfunc) 24 | one_json['alltext_simhash'] = simhash.value 25 | 26 | return one_json 27 | 28 | def convert(src_dir, num_processes): 29 | file_path_list, file_nums = get_all_files(src_dir, ['.jsonl'], 'list') 30 | 31 | for i in tqdm.tqdm(range(file_nums)): 32 | with jsonlines.open(file_path_list[i]) as input_file, tempfile.NamedTemporaryFile(mode='w', delete=False) as output_file: 33 | with 
multiprocessing.Pool(num_processes) as pool:
34 |                 # read every record of the jsonl file and hash them in parallel
35 |                 args = [(one_json, hashfunc) for one_json in input_file]
36 |                 results = pool.imap_unordered(calculate_simhash, args)
37 | 
38 |                 writer = jsonlines.Writer(output_file)
39 |                 for result in results:
40 |                     writer.write(result)
41 | 
42 |             # swap the rewritten temporary file in place of the original jsonl
43 |             output_file.close()
44 |             os.replace(output_file.name, file_path_list[i])
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     parser = argparse.ArgumentParser()
49 |     parser.add_argument("-d", "--directory", help="Directory to convert", required=True)
50 |     parser.add_argument("-p", "--processes", help="Number of processes to use", type=int, default=multiprocessing.cpu_count())
51 |     args = parser.parse_args()
52 | 
53 |     convert(args.directory, args.processes)
--------------------------------------------------------------------------------
/words_dedup/alltext_simhash.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import tqdm
4 | import jsonlines
5 | import sys
6 | current_path = os.path.abspath(__file__)
7 | sys.path.append(os.path.dirname(os.path.dirname(current_path)))
8 | import utils.customSimhash as customSimhash
9 | import pickle
10 | import tempfile
11 | from utils.utils import get_all_files
12 | 
13 | 
14 | def deduplication(file_path_list, rs_pkl, simhash_threshold):
15 |     lsh = customSimhash.SimhashIndex([], f=64, k=simhash_threshold)
16 | 
17 |     if rs_pkl is not None:
18 |         try:
19 |             lsh.bucket = pickle.load(open(rs_pkl, 'rb'))
20 |         except (OSError, pickle.UnpicklingError):
21 |             print('The pkl file does not exist or cannot be read; starting with an empty index')
22 | 
23 |     count_dedup = 0
24 |     for i in tqdm.tqdm(range(len(file_path_list))):
25 |         with jsonlines.open(file_path_list[i]) as input_file, tempfile.NamedTemporaryFile(mode='w', delete=False) as output_file:
26 |             writer = jsonlines.Writer(output_file)
27 |             for one_json in input_file:
28 |                 # rebuild the Simhash object from the integer fingerprint written by add_jsonl_detailed_simhash.py
29 |                 simhash_value = customSimhash.Simhash(one_json['alltext_simhash'])
30 |                 similar = lsh.add(file_path_list[i] + one_json['文件名'], simhash_value, return_similar=True)
31 |                 if similar != "":
32 |                     count_dedup += 1
33 |                     one_json['是否重复文件'] = True
34 |                     with open('重复文件.txt', 'a') as f:
35 |                         f.write(file_path_list[i] + one_json['文件名'] + ' is a duplicate of ' + similar + '\n')
36 | 
37 |                 writer.write(one_json)
38 | 
39 |             output_file.close()
40 |             os.replace(output_file.name, file_path_list[i])
41 | 
42 |     print('Found', count_dedup, 'duplicate files in total')
43 | 
44 |     if rs_pkl is not None:
45 |         pickle.dump(lsh.bucket, open(rs_pkl, 'wb'))
46 |         print('The simhash index has been saved to', rs_pkl)
47 | 
48 | 
49 | def files_deduplication(src_dir='output_csv/', rs_pkl=None, simhash_threshold=3):
50 |     """
51 |     Deduplicate the records stored across multiple jsonl files
52 |     :param src_dir: directory containing the jsonl files
53 |     :param rs_pkl: path of the pkl file used to persist the simhash index between runs (optional)
54 |     :param simhash_threshold: simhash threshold; fingerprints within this Hamming distance (default 3) are treated as near-duplicates
55 |     """
56 |     file_path_list, file_nums = get_all_files(src_dir, ['.jsonl'], 'list')
57 | 
58 |     deduplication(file_path_list, rs_pkl, simhash_threshold)
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     parser = argparse.ArgumentParser()
63 |     parser.add_argument('--src_dir', type=str, default='xxqgfilesjsonl copy/', help="source directory containing the jsonl files")
64 |     parser.add_argument('--rs_pkl', required=False, help="path of the pkl file used to persist the simhash index")
65 |     parser.add_argument('--simhash_threshold', type=int, default=3, help="simhash Hamming-distance threshold, default 3")
66 | 
67 |     args = parser.parse_args()
68 |     # truncate the duplicate-file log at the start of each run
69 |     with open('重复文件.txt', 'w') as f:
70 |         f.write('')
71 |     files_deduplication(args.src_dir, args.rs_pkl, args.simhash_threshold)
--------------------------------------------------------------------------------
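
As a usage sketch (not part of the repository), the snippet below exercises `utils/customSimhash.py` directly: it fingerprints two near-identical strings, checks their Hamming distance, and queries a `SimhashIndex` with tolerance `k`. It assumes it is run from the repository root (so the `utils` package is importable, as the `words_dedup` scripts arrange via `sys.path`) and that the `Simhash` constructor accepts raw text with a default hash function, matching the commented-out `redisSimhash.py` version; the example strings and ids are made up.

```python
# Hedged sketch: fingerprint two near-identical texts and query the index.
# Run from the repository root so that the `utils` package is importable.
import utils.customSimhash as customSimhash

a = customSimhash.Simhash('MNBVC deduplication simhash example text')
b = customSimhash.Simhash('MNBVC deduplication simhash sample text')

print(a.distance(b))  # Hamming distance between the two 64-bit fingerprints

# k=3: fingerprints differing in at most 3 bits are guaranteed to share a band key
index = customSimhash.SimhashIndex([('doc_a', a)], f=64, k=3)
print(index.get_near_dups(b))                      # ['doc_a'] if distance(a, b) <= 3
print(index.add('doc_b', b, return_similar=True))  # 'doc_a' if b is within tolerance, else ''
```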
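
The candidate lookup in `SimhashIndex` rests on a pigeonhole argument: `offsets` cuts the `f`-bit fingerprint into `k + 1` contiguous bands (for `f=64, k=2` the offsets are `[0, 21, 42]`, giving band widths of 21, 21 and 22 bits), and two fingerprints that differ in at most `k` bits must agree exactly on at least one band, so `get_keys` produces at least one shared bucket key for them. The sketch below reproduces that key computation outside the class; the fingerprint value is made up.

```python
# Reproduces the band-key computation of SimhashIndex.get_keys for f=64, k=2.
f, k = 64, 2
value = 0x0123456789ABCDEF  # made-up 64-bit fingerprint

offsets = [f // (k + 1) * i for i in range(k + 1)]   # [0, 21, 42]
keys = []
for i, offset in enumerate(offsets):
    # the last band absorbs the remaining bits (22 here), the others are 21 bits wide
    width = f - offset if i == len(offsets) - 1 else offsets[i + 1] - offset
    band = (value >> offset) & ((1 << width) - 1)
    keys.append('%x:%x' % (band, i))

print(keys)  # one bucket key per band; near-duplicates share at least one of them
```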
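
For the two `words_dedup` scripts, a plausible end-to-end invocation looks like the following; the directory and pkl names are placeholders, while the flags are the ones defined in each script's argparse block.

```shell
# 1. Compute a whole-text simhash for every record of every .jsonl file, in place.
python words_dedup/add_jsonl_detailed_simhash.py -d ./jsonl_data -p 8

# 2. Mark near-duplicate files (Hamming distance <= 3), log them to 重复文件.txt,
#    and persist the simhash index to index.pkl for later incremental runs.
python words_dedup/alltext_simhash.py --src_dir ./jsonl_data --rs_pkl index.pkl --simhash_threshold 3
```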
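
Two small helpers in `utils/utils.py` are easy to sanity-check in isolation. The snippet below (run from the repository root, with made-up md5 prefixes and paths) shows the Jaccard computation over paragraph-md5 sets and the common-prefix stripping used for path lists; note that despite its name, `jaccard_distance` returns a similarity, not a distance.

```python
# Run from the repository root so the `utils` package is importable.
from utils.utils import jaccard_distance, get_common_prefix_and_removed_list

# jaccard_distance expects two sets (e.g. paragraph md5s) and, despite the name,
# returns the Jaccard similarity |A ∩ B| / |A ∪ B|.
a = {'d41d8cd9', '9e107d9d', '45c48cce'}   # made-up, truncated md5 values
b = {'9e107d9d', '45c48cce', 'c4ca4238'}
print(jaccard_distance(a, b))              # 2 shared / 4 total -> 0.5

# Strips the longest common string prefix from a list of paths.
prefix, rest = get_common_prefix_and_removed_list(['data/a/1.txt', 'data/a/2.txt', 'data/b/3.txt'])
print(prefix)  # 'data/'
print(rest)    # ['a/1.txt', 'a/2.txt', 'b/3.txt']
```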