├── .gitignore ├── LICENSE ├── README.md ├── basic_dedup ├── README.md ├── find_duplicates.py ├── readme.txt └── write_meta_data_pkl.py ├── convert ├── README.md ├── convert.py └── wudao_convert.py ├── corpus_processing ├── blacklist.txt ├── clean_file.py ├── decp936messy.py ├── extract.py ├── move_file.py ├── passwords.txt └── readme.txt ├── parallel_dedup ├── README.md ├── convert_jsonl_to_csv.py ├── multiprocess_deduplication.py ├── reset_csv.py └── write_output_to_jsonl.py ├── requirements.txt ├── utils ├── customSimhash.py ├── redisSimhash.py └── utils.py └── words_dedup ├── add_jsonl_detailed_simhash.py └── alltext_simhash.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | converted/ --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 aplmikex 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 去重部分 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 将外界输入的文件以文件md5和文件大小进行重复检测,删除不同来源的同一文件。 8 | 2. 将大量文本文件(目前仅有txt文件)转换为格式化的、易于查询的数据。 9 | 3. 在个人电脑上,实现对百万个文件量级的快速去重操作。 10 | 4. (TODO)在集群上,对全部类型的文件进行重复检测。 11 | 12 | ### 环境安装 13 | 14 | 1. 从github下载本项目 15 | ```shell 16 | git clone https://github.com/aplmikex/deduplication_mnbvc 17 | ``` 18 | 2. 使用 `pip`命令安装所需的库 19 | ```shell 20 | # 进入这个库的目录 21 | cd deduplication_mnbvc 22 | # 安装项目所需要的依赖 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ### jsonl格式说明 27 | 28 | 1. 对于每个jsonl文件,其大小略大于500MiB,这个数值定义在 `utils.py`中的 `max_size`,可根据需要更改 29 | 2. 对于每一个文件,它的json结构层次如下: 30 | 31 | ```python 32 | { 33 | '文件名': '文件.txt', 34 | '是否待查文件': False, 35 | '是否重复文件': False, 36 | '文件大小': 1024, 37 | 'simhash': 0, 38 | '最长段落长度': 0, 39 | '段落数': 0, 40 | '去重段落数': 0, 41 | '低质量段落数': 0, 42 | '段落': [] 43 | } 44 | ``` 45 | 46 | 将每一行作为一个段落,段落的json结构层次如下: 47 | 48 | ```python 49 | { 50 | '行号': line_number, 51 | '是否重复': False, 52 | '是否跨文件重复': False, 53 | 'md5': md5, 54 | '内容': line 55 | } 56 | ``` 57 |
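### 读取jsonl的参考示例

下面是一个读取上述jsonl格式的最小示例(仅供参考):假设某个转换结果保存在 `converted/0.jsonl`(该路径只是示例中的假设),脚本会跳过被标记为重复或待查的文件,以及文件内重复或跨文件重复的段落,只输出剩余段落的内容。

```python
import jsonlines

# 示例路径,仅为假设,请替换为实际生成的jsonl文件
jsonl_path = 'converted/0.jsonl'

with jsonlines.open(jsonl_path) as reader:
    for file_json in reader:
        # 跳过被标记为重复或待查的文件
        if file_json['是否重复文件'] or file_json['是否待查文件']:
            continue
        for para in file_json['段落']:
            # 跳过文件内重复或跨文件重复的段落
            if para['是否重复'] or para['是否跨文件重复']:
                continue
            print(para['行号'], para['内容'])
```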
-------------------------------------------------------------------------------- /basic_dedup/README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 基本去重 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 指定一个文件夹,设定定时任务,定时将新增的外部来源文件的信息追加至pkl二进制文件中。 8 | 2. 按照文件大小与文件md5值,输出完全相同的文件至一个txt中。 9 | 3. 根据用户需要,用户手动写脚本删除txt中的完全相同的文件名。 10 | 11 | ### 使用说明 12 | 13 | 1. #### write_meta_data_pkl.py 14 | 15 | 1. 使用说明: 16 | 17 | - `write_meta_data_pkl.py`是把文件夹内不同格式文件的元信息追加写入到pkl文件中。 18 | - 可以对一个文件夹反复运行此代码,只要路径不改变,pkl文件不会重复添加。 19 | - 如果改变原始文件路径,请删除pkl重新生成,不然会多次删除同一个文件。 20 | - 在运行中若增加或删减文件,可能导致文件出错或者pkl文件较大,建议运行一段时间后删除pkl,重新生成。 21 | 2. 运行 `write_meta_data_pkl.py`文件并设置必要的参数。 22 | 23 | ```bash 24 | python write_meta_data_pkl.py --dir_path /path/to/directory --pkl_file file.pkl 25 | ``` 26 | 2. #### find_duplicates.py 27 | 28 | 1. 使用说明: 29 | 30 | - `find_duplicates.py`是输入pkl文件,输出除了第一次出现以外其他完全重复的文件。 31 | - 它的结果默认会输出到duplicates.txt文件中,是覆盖写,所以建议每次去重完直接删除duplicates.txt中的完全相同的文件名。 32 | - 去重后的pkl会覆盖到原来pkl文件。 33 | 2. 运行 `find_duplicates.py`文件并设置必要的参数。 34 | 35 | ```bash 36 | python find_duplicates.py --pkl_file file.pkl 37 | ``` 38 | 39 | ### DEMO示例 40 | 41 | 按照上面示例的使用说明执行就行了。 42 |
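### 删除重复文件的参考脚本

`find_duplicates.py` 生成的 `duplicates.txt` 每行是一个除第一次出现以外的重复文件路径。如果确认这些文件确实可以删除,可以参考下面的最小示例脚本(仅供参考,删除操作不可恢复,建议先抽查文件列表再执行):

```python
import os

# duplicates.txt 由 find_duplicates.py 生成,每行一个重复文件的路径
with open('duplicates.txt') as f:
    paths = [line.strip() for line in f if line.strip()]

removed = 0
for path in paths:
    try:
        os.remove(path)
        removed += 1
    except OSError as e:
        print('删除失败: {} ({})'.format(path, e))

print('共删除 {} 个重复文件'.format(removed))
```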
-------------------------------------------------------------------------------- /basic_dedup/find_duplicates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | 5 | # 查找重复文件 6 | def find_duplicates(pkl_file): 7 | df = pd.read_pickle(pkl_file) 8 | duplicates = df[df.duplicated(['SHA256', 'Size'], keep=False)] 9 | groups = duplicates.groupby(['SHA256', 'Size']) 10 | with open('duplicates.txt', 'w') as f: 11 | for _, group in groups: 12 | files = group['File'].tolist() 13 | for file in files[1:]: 14 | f.write(file+'\n') 15 | df.drop_duplicates(subset=['SHA256', 'Size'], keep='first', inplace=True) 16 | df.to_pickle(pkl_file) 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--pkl_file', required=True, help='The pickle file to read from') 21 | 22 | args = parser.parse_args() 23 | find_duplicates(args.pkl_file)
-------------------------------------------------------------------------------- /basic_dedup/readme.txt: -------------------------------------------------------------------------------- 1 | write_meta_data_pkl.py 2 | 把原始文件的元信息(路径、大小、SHA256),不限文件内容格式,保存到pandas的表中 3 | 添加方式为追加 4 | 以二进制pkl的格式保存下来 5 | find_duplicates.py 6 | 输入pkl文件,输出除了第一次出现以外完全重复的文件 7 | 输出到duplicates.txt文件中,覆盖写 8 | 把去重后的pkl覆盖到原来pkl文件
-------------------------------------------------------------------------------- /basic_dedup/write_meta_data_pkl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import pandas as pd 4 | import argparse 5 | 6 | # 计算文件的 SHA256 哈希值 7 | def sha256(filename): 8 | with open(filename, 'rb') as f: 9 | content = f.read() 10 | return hashlib.sha256(content).hexdigest() 11 | 12 | # 递归遍历目录并输出文件路径、文件大小和 SHA256 哈希值 13 | def get_all_files_list(dir_path): 14 | file_path_list = [] 15 | for root, _, files in os.walk(dir_path): 16 | for file in files: 17 | file_path = os.path.join(root, file) 18 | file_path_list.append(file_path) 19 | file_path_list = sorted(file_path_list) 20 | return file_path_list 21 | 22 | 23 | # 将文件路径、大小和哈希值写入 PKL 文件 24 | def write_to_csv(dir_path, pkl_file='files.pkl'): 25 | try: 26 | existing_df = pd.read_pickle(pkl_file) 27 | except FileNotFoundError: 28 | existing_df = pd.DataFrame({'File': [], 'Size': [], 'SHA256': []}) 29 | 30 | data = {'File': [], 'Size': [], 'SHA256': []} 31 | file_path_set = set(get_all_files_list(dir_path)) 32 | 33 | file_path_set -= set(existing_df['File']) 34 | 35 | for filepath in file_path_set: 36 | try: 37 | file_size = os.path.getsize(filepath) 38 | file_sha256 = sha256(filepath) 39 | data['File'].append(filepath) 40 | data['Size'].append(file_size) 41 | data['SHA256'].append(file_sha256) 42 | except OSError: 43 | print('cannot read file: {}'.format(filepath)) 44 | 45 | df = pd.concat([existing_df, pd.DataFrame(data)], ignore_index=True) 46 | 47 | # 将 DataFrame 写入 pickle 文件 48 | df.to_pickle(pkl_file) 49 | 50 | # 示例用法 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--dir_path', required=True, help='The directory to traverse') 54 | parser.add_argument('--pkl_file', required=True, help='The pickle file to write to') 55 | 56 | args = parser.parse_args() 57 | write_to_csv(args.dir_path, args.pkl_file)
-------------------------------------------------------------------------------- /convert/README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 格式化 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 将大量文本文件转换为格式化的、易于查询的jsonl数据。 8 | 2. 快速标注同一文件内是否有明显重复的情况,统一放在 `problem_i.jsonl`里面。 9 | 10 | ### convert.py 使用说明 11 | 12 | 1. 使用说明: 13 | * `convert.py`是快速把txt文件转化为jsonl文件,并挑出明显自我重复的txt文件留待观察。 14 | * 本项目假设所有需要被去重的txt文件编码均为UTF-8编码,批量转换请参考[charset-mnbvc](https://github.com/alanshi/charset_mnbvc)。 15 | * 本项目暂时只实现了从txt到jsonl的转换,暂未考虑其他类型数据。 16 | * 本项目删去了原始txt文件中的空行,以及行首与行尾的空白符。 17 | 2. 运行 `convert.py`文件并设置必要的参数。 18 | ```shell 19 | python convert.py --src_dir /path/to/source/directory --dst_dir /path/to/destination/directory --n_process 4 --threshold 0.7 20 | ``` 21 | 22 | 其中 `--src_dir`参数是必须的,它指定了要转换的源文件夹路径。如果未提供此参数,则会引发错误。 23 | 3. 可选参数 24 | * `--src`:指定源文件类型,默认为 `txt`。 25 | * `--dst`:指定目标文件类型,默认为 `jsonl`。 26 | * `--dst_dir`:指定转换后文件的输出目录,默认为 `converted/`。 27 | * `--n_process`:指定要使用的进程数,默认为1。另有 `--threshold`参数用于设定待查文件的判定阈值(去重段落数/段落数低于该值则标记为待查),默认为0.5。 28 | 29 | ### 输出的jsonl格式说明 30 | 31 | 1. 根据文件内段落的重复率是否高于给定的阈值,将文件分成正常文件和待查文件,其中正常文件数字加jsonl,如 `10.jsonl`,而待查文件则是problem_加数字加jsonl,如 `problem_7.jsonl` 32 | 2. 对于每个jsonl文件,其大小略大于500MiB,这个数值定义在 `utils.py`中的 `max_size`,可根据需要更改 33 | 3.
对于每一个文件,他的json结构层次如下: 34 | 35 | ```python 36 | { 37 | '文件名': '文件.txt', 38 | '是否待查文件': False, 39 | '是否重复文件': False, 40 | '文件大小': 1024, 41 | 'simhash': 0, 42 | '最长段落长度': 0, 43 | '段落数': 0, 44 | '去重段落数': 0, 45 | '低质量段落数': 0, 46 | '段落': [] 47 | } 48 | ``` 49 | 50 | 将每一行为一个段落,段落的json结构层次如下: 51 | 52 | ```python 53 | { 54 | '行号': line_number, 55 | '是否重复': False, 56 | '是否跨文件重复': False, 57 | 'md5': md5, 58 | '内容': line 59 | } 60 | ``` 61 | 62 | ### DEMO示例 63 | 64 | ```bash 65 | python convert.py --src_dir /home/xiang/文档/mnbvcfiles --dst_dir ./mnbvcfiles --n_process 8 --threshold 0.7 66 | ``` 67 | -------------------------------------------------------------------------------- /convert/convert.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | current_path = os.path.abspath(__file__) 3 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 4 | from multiprocessing import Process 5 | import multiprocessing 6 | import argparse 7 | import tqdm 8 | from utils.utils import max_size, get_all_files 9 | import jsonlines 10 | import hashlib 11 | import utils.customSimhash as customSimhash 12 | import logging 13 | 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 15 | 16 | def from_txt_to_json(file_path, threshold): 17 | 18 | # 定义json结构 19 | file_json = {'文件名': os.path.abspath(file_path), 20 | '是否待查文件': False, 21 | '是否重复文件': False, 22 | '文件大小': os.path.getsize(file_path), 23 | 'simhash': 0, 24 | '最长段落长度': 0, 25 | '段落数': 0, 26 | '去重段落数': 0, 27 | '低质量段落数': 0, 28 | '段落': []} 29 | # 定义用于去重的set 30 | hashs = set() 31 | 32 | # 读取每一行 33 | with open(file_path, 'r', encoding='utf-8', errors='strict') as f: 34 | texts = [] 35 | for line_number, line in enumerate(f): 36 | # 去除行首尾空格 37 | line = line.strip() 38 | # 计算最长段落长度 39 | file_json['最长段落长度'] = max(file_json['最长段落长度'], len(line)) 40 | # 删除空行 41 | if len(line) == 0: 42 | continue 43 | # 计算每一行的md5值 44 | md5 = hashlib.md5(line.encode()).hexdigest() 45 | # 将每一行内容添加到json中 46 | file_json['段落'].append({'行号': line_number, 47 | '是否重复': md5 in hashs, 48 | '是否跨文件重复': False, 49 | 'md5': md5, 50 | '内容': line 51 | }) 52 | if md5 not in hashs: 53 | texts.append(line) 54 | 55 | # 将md5值添加到set中,用于去重 56 | hashs.add(md5) 57 | 58 | if len(hashs) == 0: 59 | return None 60 | # 计算段落数和去重段落数 61 | file_json['段落数'] = len(file_json['段落']) 62 | file_json['去重段落数'] = len(hashs) 63 | # 计算simhash 64 | file_json['simhash'] = customSimhash.Simhash(texts).value 65 | # 判断是否是待查文件 66 | if (file_json['去重段落数'] / file_json['段落数']) < threshold: 67 | file_json['是否待查文件'] = True 68 | return file_json 69 | 70 | 71 | def run_process(file_path_queue, json_to_write_queue, threshold): 72 | # 不断从队列中获取文件路径 73 | while not file_path_queue.empty(): 74 | # 获取文件路径 75 | try: 76 | file_path = file_path_queue.get(timeout=0.2) 77 | except: 78 | break 79 | # 将文件转换为json 80 | try: 81 | one_json = from_txt_to_json(file_path, threshold) 82 | except UnicodeDecodeError: 83 | logging.error(f"Error: {file_path} is not encoded in utf-8.") 84 | json_to_write_queue.put(UnicodeDecodeError) 85 | exit(-1) 86 | # 把json写入到队列中 87 | json_to_write_queue.put(one_json) 88 | 89 | 90 | def write_jsonl(json_to_write_queue, file_nums, dst_dir): 91 | 92 | # 定义文件名 93 | file_name = 0 94 | problem_file_name = 0 95 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 96 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 97 | if os.path.exists(os.path.join(dst_dir, 'problem_' + str(problem_file_name) + 
'.jsonl')): 98 | logging.warning('Warning: problem_' + str(problem_file_name) + '.jsonl' + ' already exists.') 99 | # 遍历文件数量 100 | for _ in tqdm.tqdm(range(file_nums)): 101 | # 从队列中获取一个json 102 | one_json = json_to_write_queue.get() 103 | if one_json is None: 104 | continue 105 | if one_json == UnicodeDecodeError: 106 | return -1 107 | # 根据是否待查文件,写入不同的文件 108 | if one_json['是否待查文件']: 109 | with jsonlines.open(os.path.join(dst_dir, 'problem_' + str(problem_file_name) + '.jsonl'), 110 | mode='a') as last_problem_file: 111 | last_problem_file.write(one_json) 112 | # 如果当前文件大小超过限制,则更换文件名 113 | if last_problem_file._fp.tell() > max_size: 114 | problem_file_name += 1 115 | if os.path.exists(os.path.join(dst_dir, 'problem_' + str(problem_file_name) + '.jsonl')): 116 | logging.warning('Warning: problem_' + str(problem_file_name) + '.jsonl' + ' already exists.') 117 | else: 118 | with jsonlines.open(os.path.join(dst_dir, str(file_name) + '.jsonl'), mode='a') as last_file: 119 | last_file.write(one_json) 120 | # 如果当前文件大小超过限制,则更换文件名 121 | if last_file._fp.tell() > max_size: 122 | file_name += 1 123 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 124 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 125 | return 0 126 | 127 | def convert(src_dir, src='txt', dst='jsonl', dst_dir='converted/', n_process=4, threshold=0.95): 128 | # 检查输入参数是否合理 129 | assert os.path.exists(src_dir) 130 | assert src in ['txt', 'jsonl'] 131 | assert dst in ['txt', 'jsonl'] 132 | 133 | # 如果输出目录不存在,则创建 134 | if not os.path.exists(dst_dir): 135 | os.mkdir(dst_dir) 136 | 137 | # 如果源文件和目标文件类型不匹配,则抛出异常 138 | if src != 'txt' or dst != 'jsonl': 139 | raise NotImplementedError('Only support converting from txt to jsonl now.') 140 | 141 | # 获取源文件列表 142 | file_path_queue, file_nums = get_all_files(src_dir) 143 | json_to_write_queue = multiprocessing.Queue(200) 144 | 145 | # 启动多进程,将源文件转换为json 146 | processes = [] 147 | for _ in range(n_process): 148 | p = Process(target=run_process, args=(file_path_queue, json_to_write_queue, threshold)) 149 | p.start() 150 | processes.append(p) 151 | 152 | # 将json写入文件 153 | exit_code = write_jsonl(json_to_write_queue, file_nums, dst_dir) 154 | 155 | if exit_code == -1: 156 | for p in processes: 157 | p.terminate() 158 | else: 159 | for p in processes: 160 | p.join() 161 | 162 | 163 | if __name__ == '__main__': 164 | # 设置参数解析器 165 | parser = argparse.ArgumentParser() 166 | # 添加必须指定的参数 167 | parser.add_argument('--src_dir', type=str, required=True, help="源文件夹路径") 168 | # 添加可选参数,指定源文件类型,默认为txt 169 | parser.add_argument('--src', type=str, default='txt', help="指定源文件类型,默认为txt") 170 | # 添加可选参数,指定目标文件类型,默认为jsonl 171 | parser.add_argument('--dst', type=str, default='jsonl', help="指定目标文件类型,默认为jsonl") 172 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 173 | parser.add_argument('--dst_dir', type=str, default='converted/', help="指定转换后文件存放路径,默认为converted/") 174 | # 添加可选参数,指定进程数,默认为1 175 | parser.add_argument('--n_process', type=int, default=1, help="指定进程数,默认为1") 176 | # 添加可选参数,指定去重阈值,默认为0.5 177 | parser.add_argument('--threshold', type=float, default=0.5, help="指定去重阈值,默认为0.5") 178 | # 解析参数 179 | args = parser.parse_args() 180 | # 调用convert函数 181 | convert(args.src_dir, args.src, args.dst, args.dst_dir, args.n_process, args.threshold) 182 | -------------------------------------------------------------------------------- /convert/wudao_convert.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | current_path = 
os.path.abspath(__file__) 3 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 4 | from multiprocessing import Process 5 | import multiprocessing 6 | import argparse 7 | import tqdm 8 | from utils.utils import max_size, get_all_files 9 | import jsonlines, json 10 | import hashlib 11 | import utils.customSimhash as customSimhash 12 | import logging 13 | 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 15 | 16 | def from_wudaojson_to_json(file_path, one_json): 17 | 18 | # 定义json结构 19 | file_json = {'文件名': os.path.abspath(file_path)+':'+one_json['title'], 20 | '是否待查文件': False, 21 | '是否重复文件': False, 22 | '文件大小': len(json.dumps(one_json)), 23 | 'simhash': 0, 24 | '最长段落长度': len(one_json['content']), 25 | '数据类型': one_json['dataType'], 26 | '段落数': 1, 27 | '去重段落数': 1, 28 | '低质量段落数': 0, 29 | '段落': []} 30 | 31 | lines = [one_json['content']] 32 | 33 | # 定义用于去重的set 34 | hashs = set() 35 | 36 | 37 | texts = [] 38 | for line in lines: 39 | # 去除行首尾空格 40 | line = line.strip() 41 | # 计算最长段落长度 42 | file_json['最长段落长度'] = max(file_json['最长段落长度'], len(line)) 43 | # 删除空行 44 | if len(line) == 0: 45 | continue 46 | # 计算每一行的md5值 47 | md5 = hashlib.md5(line.encode()).hexdigest() 48 | # 将每一行内容添加到json中 49 | file_json['段落'].append({'行号': 1, 50 | '是否重复': md5 in hashs, 51 | '是否跨文件重复': False, 52 | 'md5': md5, 53 | '内容': line 54 | }) 55 | if md5 not in hashs: 56 | texts.append(line) 57 | 58 | # 将md5值添加到set中,用于去重 59 | hashs.add(md5) 60 | 61 | if len(hashs) == 0: 62 | return None 63 | 64 | # 计算simhash 65 | file_json['simhash'] = customSimhash.Simhash(texts).value 66 | 67 | return file_json 68 | 69 | 70 | 71 | 72 | def convert(src_dir, dst_dir='converted/'): 73 | # 检查输入参数是否合理 74 | assert os.path.exists(src_dir) 75 | 76 | file_name = 0 77 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 78 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 79 | 80 | # 如果输出目录不存在,则创建 81 | os.makedirs(dst_dir, exist_ok=True) 82 | 83 | # 获取源文件列表 84 | file_path_list, file_nums = get_all_files(src_dir,legal_file_type=('.json', ), return_file_type='list') 85 | for _ in tqdm.tqdm(range(file_nums)): 86 | file = file_path_list.pop() 87 | with open(file, 'r', encoding='utf-8') as f: 88 | file_json = json.load(f) 89 | for one_json in file_json: 90 | one_json = from_wudaojson_to_json(file, one_json) 91 | 92 | with jsonlines.open(os.path.join(dst_dir, str(file_name) + '.jsonl'), mode='a') as last_file: 93 | last_file.write(one_json) 94 | # 如果当前文件大小超过限制,则更换文件名 95 | if last_file._fp.tell() > max_size: 96 | file_name += 1 97 | if os.path.exists(os.path.join(dst_dir, str(file_name) + '.jsonl')): 98 | logging.warning('Warning: ' + str(file_name) + '.jsonl' + ' already exists.') 99 | 100 | 101 | if __name__ == '__main__': 102 | # 设置参数解析器 103 | parser = argparse.ArgumentParser() 104 | # 添加必须指定的参数 105 | parser.add_argument('--src_dir', type=str, required=True, help="源文件夹路径") 106 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 107 | parser.add_argument('--dst_dir', type=str, default='converted/', help="指定转换后文件存放路径,默认为converted/") 108 | # 解析参数 109 | args = parser.parse_args() 110 | # 调用convert函数 111 | convert(args.src_dir, args.dst_dir) 112 | -------------------------------------------------------------------------------- /corpus_processing/blacklist.txt: -------------------------------------------------------------------------------- 1 | tmp 2 | ini 3 | jpg 4 | png 5 | jpeg 6 | gif 7 | css 8 | swf 9 | bmp 10 | tiff 11 | tif 12 | raw 13 | svg 14 | webp 15 | ico 16 
| psd 17 | ai 18 | cdr 19 | wmf 20 | pcx 21 | dng 22 | avi 23 | mp4 24 | mov 25 | wmv 26 | flv 27 | mkv 28 | webm 29 | m4v -------------------------------------------------------------------------------- /corpus_processing/clean_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | 5 | 6 | def clean_file(folder_path, blacklist_file): 7 | with open(blacklist_file, 'r') as f: 8 | balsklist = f.readlines() 9 | 10 | balsklist = [x.strip() for x in balsklist] 11 | 12 | with open('tobereomve.txt', 'w') as f: 13 | for root, _, files in os.walk(folder_path): 14 | for file in files: 15 | file_path = os.path.join(root, file) 16 | filename, extension = os.path.splitext(file_path) 17 | 18 | extension = extension.lower() 19 | 20 | # 扩展名带前面的.的,要多算一个 21 | if extension == '' or len(extension) > 7: 22 | f.write(file_path+'\n') 23 | elif extension[1:] in balsklist: 24 | f.write(file_path+'\n') 25 | elif not all(ord(c) < 128 for c in extension): 26 | f.write(file_path+'\n') 27 | 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--folder_path', type=str, required=True, help="所有文件路径") 33 | parser.add_argument('--blacklist_file', type=str, required=True, help="后缀名黑名单文件路径") 34 | 35 | args = parser.parse_args() 36 | clean_file(args.folder_path, args.blacklist_file) -------------------------------------------------------------------------------- /corpus_processing/decp936messy.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | from charset_mnbvc import api 3 | import argparse, shutil 4 | 5 | def get_all_files_list(dir_path): 6 | file_path_list = [] 7 | for root, _, files in os.walk(dir_path): 8 | for file in files: 9 | file_path = os.path.join(root, file) 10 | file_path_list.append(file_path) 11 | return file_path_list, len(file_path_list) 12 | 13 | 14 | if __name__ == '__main__': 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--folder_path', type=str, required=True, help="乱码文件的路径,请输入最后一个正常非乱码的文件夹,会在同文件夹生成新的文件夹,但不会删除原来的") 17 | args = parser.parse_args() 18 | 19 | file_path_list, file_nums = get_all_files_list(args.folder_path) 20 | for file in file_path_list: 21 | relative_path = os.path.relpath(file, args.folder_path) 22 | try: 23 | coding_name = api.from_data(data=relative_path.encode('cp437'), mode=2) 24 | 25 | ret = api.convert_encoding( 26 | source_data=relative_path.encode('cp437'), 27 | source_encoding=coding_name, 28 | target_encoding="utf-8", 29 | ) 30 | os.makedirs(os.path.dirname(os.path.join(args.folder_path, ret)), exist_ok=True) 31 | shutil.move(file, os.path.join(args.folder_path, ret)) 32 | except UnicodeEncodeError: 33 | print(f"{file} :为非cp437的路径,不改变") 34 | except Exception as e: 35 | print(f"Move {file} failed: {e}") 36 | -------------------------------------------------------------------------------- /corpus_processing/extract.py: -------------------------------------------------------------------------------- 1 | import os, hashlib 2 | import argparse 3 | import shutil, io 4 | import tarfile 5 | import zipfile 6 | import bz2 7 | import gzip 8 | import rarfile 9 | import py7zr 10 | import os, sys 11 | from charset_mnbvc import api 12 | from better_zipfile import fixcharset_zipfile 13 | import shutil 14 | 15 | def get_directory_size(directory): 16 | total_size = 0 17 | for dirpath, dirnames, filenames in os.walk(directory): 18 | for filename in filenames: 19 | filepath = 
os.path.join(dirpath, filename) 20 | if not os.path.islink(filepath): 21 | total_size += os.path.getsize(filepath) 22 | return total_size 23 | 24 | 25 | def get_extension(file_path): 26 | filename, extension = os.path.splitext(file_path) 27 | 28 | extensions = [] 29 | if extension: 30 | extensions.insert(0, extension) 31 | filename_1, extension = os.path.splitext(filename) 32 | if extension == '.tar': 33 | extensions.insert(0, extension) 34 | filename = filename_1 35 | return filename, ''.join(extensions) 36 | 37 | 38 | def check_long_name(extract_full_path, zip_file_name):# longname返回true 39 | paths = zip_file_name.split('/') 40 | file_name = paths[-1] 41 | if len(file_name.encode()) > 255 and len(os.path.join(extract_full_path, zip_file_name).encode()) < 4095: 42 | print(f"File name too long: \n{os.path.join(extract_full_path, zip_file_name)} \n") 43 | basename, extensions = get_extension(file_name) 44 | length = (255-len(extensions.encode())-8)//2 45 | basename = basename.encode()[:length].decode('utf-8', errors='ignore')+hashlib.md5(file_name.encode()).hexdigest()[:8]+basename.encode()[-length:].decode('utf-8', errors='ignore') 46 | new_name = basename + extensions 47 | return os.path.join(extract_full_path, '/'.join(paths[:-1]), new_name), True 48 | elif any(len(path.encode()) > 255 for path in paths) or len(os.path.join(extract_full_path, zip_file_name).encode()) > 4095: 49 | print(f"File name too long: \n {os.path.join(extract_full_path, zip_file_name)} \n") 50 | 51 | length = min(255, 4096-len(os.path.join(extract_full_path, 'long_name').encode()))-8 52 | 53 | new_name = zip_file_name.encode()[:length//2-1].decode('utf-8', errors='ignore') +hashlib.md5(zip_file_name.encode()).hexdigest()[:8]+ zip_file_name.encode()[1-length//2:].decode('utf-8', errors='ignore') 54 | new_name = '_'.join(new_name.split('/')) 55 | return os.path.join(extract_full_path, 'long_name', new_name), True 56 | 57 | return os.path.join(extract_full_path, zip_file_name), False 58 | 59 | 60 | def extract_zip(file, password, extract_full_path): 61 | 62 | with fixcharset_zipfile.ZipFile(file, 'r') as zip: 63 | zip.setpassword(password) 64 | 65 | auto_filelists = [] 66 | 67 | for file in zip.namelist(): 68 | problem = False 69 | if file.endswith('/'): 70 | continue 71 | 72 | new_file_path, if_long_name = check_long_name(extract_full_path, file) 73 | if if_long_name: 74 | problem = True 75 | 76 | if problem: 77 | basename = os.path.dirname(new_file_path) 78 | os.makedirs(basename, exist_ok=True) 79 | with zip.open(file, 'r') as f_in: 80 | data = f_in.read() 81 | with open(new_file_path, 'wb') as f_out: 82 | f_out.write(data) 83 | else: 84 | auto_filelists.append(file) 85 | 86 | zip.extractall(extract_full_path, auto_filelists) 87 | 88 | 89 | 90 | 91 | 92 | def extract_archive(file_path, extract_full_path, file, password=None): 93 | 94 | filename, extension = get_extension(file) 95 | extract_succcessful = True 96 | try: 97 | if extension == '.tar': 98 | with tarfile.open(file_path, 'r') as tar: 99 | tar.extractall(extract_full_path) 100 | elif extension == '.tbz2' or extension == '.tar.bz2': 101 | with tarfile.open(file_path, 'r:bz2') as tar: 102 | tar.extractall(extract_full_path) 103 | elif extension == '.tgz' or extension == '.tar.gz' or extension == '.tar.Z': 104 | with tarfile.open(file_path, 'r:gz') as tar: 105 | tar.extractall(extract_full_path) 106 | elif extension == '.tar.xz': 107 | with tarfile.open(file_path, 'r:xz') as tar: 108 | tar.extractall(extract_full_path) 109 | elif extension == '.bz2': 110 | if 
not os.path.exists(extract_full_path): 111 | os.mkdir(extract_full_path) 112 | with bz2.open(file_path, 'rb') as f_in: 113 | with open(os.path.join(extract_full_path, filename), 'wb') as f_out: 114 | shutil.copyfileobj(f_in, f_out) 115 | elif extension == '.rar': 116 | with rarfile.RarFile(file_path, 'r') as rar: 117 | rar.setpassword(password) 118 | 119 | problem = False 120 | 121 | for file in rar.namelist(): 122 | if file.endswith('/'): 123 | continue 124 | new_file_path, if_long_name = check_long_name(extract_full_path, file) 125 | if if_long_name: 126 | problem = True 127 | break 128 | 129 | if problem: 130 | for file in rar.namelist(): 131 | if file.endswith('/'): 132 | continue 133 | new_file_path, _ = check_long_name(extract_full_path, file) 134 | basename = os.path.dirname(new_file_path) 135 | 136 | os.makedirs(basename, exist_ok=True) 137 | with rar.open(file, 'r') as f_in: 138 | data = f_in.read() 139 | with open(new_file_path, 'wb') as f_out: 140 | f_out.write(data) 141 | # print(f"File extract to: {new_file_path}") 142 | else: 143 | rar.extractall(extract_full_path) 144 | 145 | elif extension == '.gz': 146 | if not os.path.exists(extract_full_path): 147 | os.mkdir(extract_full_path) 148 | 149 | with gzip.open(file_path, 'rb') as f_in: 150 | with open(os.path.join(extract_full_path, filename), 'wb') as f_out: 151 | shutil.copyfileobj(f_in, f_out) 152 | elif extension in ('.zip', '.exe'): 153 | extract_zip(file_path, password, extract_full_path) 154 | 155 | elif extension == '.7z': 156 | with py7zr.SevenZipFile(file_path, mode='r', password=password) as seven_zip: 157 | seven_zip.extractall(extract_full_path) 158 | else: 159 | print(f"Unsupported file format: {extension}") 160 | extract_succcessful = False 161 | 162 | except Exception as e: 163 | print(f"Extracting {file_path} failed: {e}") 164 | extract_succcessful = False 165 | 166 | extract_dir_size = get_directory_size(extract_full_path) 167 | file_size = os.path.getsize(file_path) 168 | if extract_succcessful and file_size <= extract_dir_size: 169 | os.remove(file_path) 170 | print(f"文件 '{file_path}' 已成功删除。") 171 | elif os.path.isfile(extract_full_path) and file_size <= os.path.getsize(extract_full_path): 172 | #有时解压出来不是dir而是file,目前看到gz包有这种情况,具体原因还需分析 173 | os.remove(file_path) 174 | print(f"文件 '{file_path}' 已成功删除。") 175 | else: 176 | print(f"解压结果为 '{extract_succcessful}'。 解压前文件大小为'{file_size}',解压后文件夹大小为'{extract_dir_size}'") 177 | #检查路径长度,避免删除风险 178 | if len(extract_full_path) >= 20: 179 | # 确保路径存在,并且实际上是一个目录 180 | if os.path.isdir(extract_full_path): 181 | try: 182 | shutil.rmtree(extract_full_path) 183 | print(f"目录 '{extract_full_path}' 已删除。") 184 | except: 185 | print(f"Error:目录 '{extract_full_path}' 删除报错。") 186 | else: 187 | print(f"Error:提供的路径 '{extract_full_path}' 不是有效的目录。") 188 | else: 189 | print(f"Error:路径 '{extract_full_path}' 长度不足,为了安全起见,路径长度至少需要20个字符。") 190 | 191 | return extract_succcessful 192 | 193 | 194 | def traverse_directory(folder_path, passwords=None): 195 | if not os.path.exists(folder_path): 196 | print(f"{folder_path} does not exist!") 197 | return 198 | if not passwords is None: 199 | with open(passwords, 'r') as f: 200 | balsklist = f.readlines() 201 | passwords = [x.strip() for x in balsklist] 202 | else : 203 | passwords = [] 204 | 205 | 206 | for root, dirs, files in os.walk(folder_path): 207 | extract_path_set = set(dirs) 208 | 209 | for file in files: 210 | # 判断文件是否为压缩包类型 211 | if file.endswith(('.tar', '.tbz2', '.tgz', '.tar.bz2', '.tar.gz', '.tar.xz', '.tar.Z', '.bz2', '.rar', '.gz', 
'.zip', '.xz', '.7z', '.exe')): 212 | 213 | file_path = os.path.join(root, file) 214 | # 把压缩包解压到的文件夹名 215 | extract_path, _ = get_extension(file) 216 | 217 | if extract_path in extract_path_set: 218 | for i in range(1, 10000): 219 | if f"{extract_path}_{i}" not in extract_path_set: 220 | extract_path = f"{extract_path}_{i}" 221 | break 222 | if i == 9999: 223 | print(f"Too many files in {root}") 224 | raise Exception(f"Too many files in {root}") 225 | 226 | extract_full_path = os.path.join(root, extract_path) 227 | if not os.path.islink(file_path): 228 | extract_succcessful = extract_archive(file_path, extract_full_path, file) 229 | 230 | if not extract_succcessful: 231 | for password in passwords: 232 | print(f"Try password: {password}") 233 | extract_succcessful = extract_archive(file_path, extract_full_path, file, password=password.encode()) 234 | if extract_succcessful: 235 | break 236 | 237 | # if extract_succcessful: 238 | # traverse_directory(extract_full_path) 239 | 240 | extract_path_set.add(extract_path) 241 | 242 | 243 | if __name__ == '__main__': 244 | parser = argparse.ArgumentParser() 245 | parser.add_argument('--folder_path', type=str, required=True, help="压缩包路径") 246 | parser.add_argument('--passwords_files', type=str, default=None, help="压缩包密码文件路径") 247 | args = parser.parse_args() 248 | 249 | traverse_directory(args.folder_path, args.passwords_files) -------------------------------------------------------------------------------- /corpus_processing/move_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import argparse 4 | 5 | def move_files(input_dir, output_dir, suffix, keywords:str): 6 | if os.path.exists(input_dir) == False: 7 | raise ValueError('输入目录不存在') 8 | if os.path.abspath(input_dir) == os.path.abspath(output_dir): 9 | raise ValueError('输入目录和输出目录不能相同') 10 | 11 | keywords = [keyword for keyword in keywords.split(',') if keyword != ''] 12 | 13 | os.makedirs(output_dir, exist_ok=True) 14 | for root, _, files in os.walk(input_dir): 15 | # 获取相对于输入目录的路径 16 | relative_path = os.path.relpath(root, input_dir) 17 | 18 | # 创建目标目录 19 | target_dir = os.path.join(output_dir, relative_path) 20 | first_create = True 21 | # 移动符合条件的文件 22 | for file in files: 23 | file_path = os.path.join(root, file) 24 | # 检查file_path是否包含全部关键词 25 | in_keywords = True 26 | for keyword in keywords: 27 | if keyword not in file_path: 28 | in_keywords = False 29 | break 30 | if file.lower().endswith(suffix) and in_keywords: 31 | try: 32 | if first_create: 33 | os.makedirs(target_dir, exist_ok=True) 34 | first_create = False 35 | source_file = os.path.join(root, file) 36 | target_file = os.path.join(target_dir, file) 37 | target_file = target_file[:len(target_file) - len(suffix)] + suffix 38 | shutil.move(source_file, target_file) 39 | except Exception as e: 40 | print(f"移动文件时出现异常: {e}") 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--input_dir', type=str, required=True, help="输入目录") 45 | parser.add_argument('--output_dir', type=str, required=True, help="输出目录") 46 | parser.add_argument('--suffix', type=str, required=True, help="后缀名") 47 | parser.add_argument('--keywords', type=str, default='', help="关键词") 48 | 49 | args = parser.parse_args() 50 | move_files(args.input_dir, args.output_dir, args.suffix, args.keywords) -------------------------------------------------------------------------------- /corpus_processing/passwords.txt: 
-------------------------------------------------------------------------------- 1 | 253874 -------------------------------------------------------------------------------- /corpus_processing/readme.txt: -------------------------------------------------------------------------------- 1 | extract.py是压缩包解压的代码,可以嵌套解压,并且会原始压缩包删除 2 | 如果有密码或者不支持的类型,会报错 3 | 仅测试过几种类型,待测试更多 4 | 解压炸弹没处理 5 | 同一个名字的压缩包将会在后面加_数字,生成新的目录,小概率有空目录 6 | cleanfile.py是清理后缀扩展名的代码 7 | 黑名单制度,在黑名单上的会被删除 8 | move_file.py是移动文件的代码,可以在保留文件目录的前提下把文件移动到目标目录下 -------------------------------------------------------------------------------- /parallel_dedup/README.md: -------------------------------------------------------------------------------- 1 | # MNBVC 单机多进程文件去重 2 | 3 | ### 项目描述 4 | 5 | 本项目的主要目的是: 6 | 7 | 1. 在个人电脑上,实现对百万个文件的量级的快速去重操作。 8 | 9 | ### 使用说明 10 | 11 | 1. #### convert_jsonl_to_csv.py 12 | 13 | 1. 使用说明: 14 | 15 | - `convert_jsonl_to_csv.py`是把jsonl文件转化为对应txt的元数据组成的csv文件,用于储存文件的md5的集合。 16 | - 本项目依赖的是从 `convert.py`输出的jsonl文件,请不要使用其他格式。 17 | - 本项目使用对jsonl文件名的MD5值以及对原始文件的MD5值连接起来作为输出csv的文件名,会有极小概率丢失文件元数据,导致去重过程不包含该文件,如对全部文件的完整度非常在意,请检查生成的csv文件数与原始txt文件数量。 18 | 2. 运行 `convert_jsonl_to_csv.py`文件并设置必要的参数。 19 | 20 | ```bash 21 | python convert_jsonl_to_csv.py --src_dir /path/to/source/directory --dst_dir /path/to/destination/directory 22 | ``` 23 | 24 | 其中 `--src_dir`参数是必须的,它指定了要转换的jsonl源文件夹路径。如果未提供此参数,则会引发错误。 25 | 3. 可选参数 26 | 27 | - `--dst_dir`:指定转换后文件的输出目录,默认为 `output_csv/`。 28 | 2. #### multiprocess_deduplication.py 29 | 30 | 1. 使用说明: 31 | 32 | - `multiprocess_deduplication.py`是MNBVC单机多进程文件去重部分的主要代码。 33 | - 本项目依赖的是从 `convert_jsonl_to_csv.py`输出的csv文件,请不要使用其他格式。 34 | - 本项目最少使用2个进程,最多无上限,建议使用电脑cpu核心数的进程数量。其中一个进程进行去重的比较工作,其他所有进程用于读取重复的文件的csv列表,用于二次验证是否重复。 35 | 2. 运行 `multiprocess_deduplication.py`文件并设置必要的参数。 36 | 37 | ```bash 38 | python multiprocess_deduplication.py --src_dir /path/to/source/directory --n_process 10 --simhash_threshold 3 --jaccard_thresold 0.8 39 | ``` 40 | 41 | 其中 `--src_dir`参数是必须的,它指定了要转换的csv源文件夹路径。如果未提供此参数,则会引发错误。 42 | 3. 可选参数 43 | 44 | - `--simhash_threshold`:指定simhash阈值,默认设置为3,效果较好。这个值如果大于5,则会极慢无比。 45 | - `--jaccard_thresold`:指定jaccard阈值,默认为0.8。低于这个数的可以手动看,进行决策是否重复。一般来说simhash阈值在3的时候,极少有真正重复的。 46 | - `--n_process`:指定要使用的进程数,默认为13。 47 | 3. #### reset_csv.py 48 | 49 | 1. 使用说明: 50 | 51 | - `reset_csv.py`是清除csv文件去重结果代码。我们会用 `multiprocess_deduplication.py`把去重结果写到csv文件中,若某次 `multiprocess_deduplication.py`参数选择出错,可以用本代码清除csv状态,重新进行去重。 52 | 2. 运行 `reset_csv.py`文件并设置必要的参数。 53 | 54 | ```bash 55 | python reset_csv.py --src_dir /path/to/source/directory 56 | ``` 57 | 58 | 其中 `--src_dir`参数是必须的,它指定了要转换的csv源文件夹路径。如果未提供此参数,则会引发错误。 59 | 4. #### write_output_to_jsonl.py 60 | 61 | 1. 使用说明: 62 | 63 | - `write_output_to_jsonl.py`是将csv去重的结果保存到原始的jsonl文件中去,属于去重最后一步。 64 | 2. 运行 `write_output_to_jsonl.py`文件并设置必要的参数。 65 | 66 | ```bash 67 | python write_output_to_jsonl.py --csv_dir /path/to/source/csvdirectory --jsonl_dir /path/to/source/jsonldirectory 68 | ``` 69 | 70 | 其中 `--csv_dir`参数是必须的,它指定了csv源文件夹路径。如果未提供此参数,则会引发错误。 71 | 72 | 其中 `--jsonl_dir`参数是必须的,它指定了jsonl文件夹路径。如果未提供此参数,则会引发错误。 73 | 74 | ### 输出的csv格式说明 75 | 76 | 1. 对于每个jsonl文件,输出他jsonl路径名的MD5哈希以及对应每个txt文件MD5哈希的csv名,放入指定文件夹中 77 | 2. 
对于每一个文件,他的csv文件结构层次如下: 78 | 79 | 第一行: 80 | 81 | | 是否重复(0代表不重复,1代表重复) | jsonl文件名 | txt文件名 | simhash值 | 82 | | :--------------------------------: | :---------: | :-------: | :-------: | 83 | | 0 | jsonl文件名 | txt文件名 | simhash值 | 84 | 85 | 第二行: 86 | 87 | MD5列表,每一列对应一个MD5值,为节省空间算力,只截取第8位到24位,既中间16位的MD5值。 88 | 89 | ### Demo示例 90 | 91 | ```bash 92 | python convert_jsonl_to_csv.py --src_dir ./mnbvcfiles --dst_dir ./output_csv 93 | # 假如simhash参数设置错误,simhash_threshold设置成12,导致速度极慢 94 | python multiprocess_deduplication.py --src_dir ./output_csv --n_process 15 --simhash_threshold 12 --jaccard_thresold 0.8 95 | # 先强行结束,在运行reset 96 | python reset_csv.py --src_dir ./output_csv 97 | # 正常跑一遍 98 | python multiprocess_deduplication.py --src_dir ./output_csv --n_process 15 --simhash_threshold 3 --jaccard_thresold 0.8 99 | python write_output_to_jsonl.py --csv_dir ./output_csv --jsonl_dir ./mnbvcfiles 100 | ``` 101 | -------------------------------------------------------------------------------- /parallel_dedup/convert_jsonl_to_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tqdm 3 | import os, sys 4 | current_path = os.path.abspath(__file__) 5 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 6 | from utils.utils import get_all_files 7 | import jsonlines 8 | import csv 9 | import hashlib 10 | 11 | def convert_jsonl_to_csv(src_dir, dst_dir): 12 | os.makedirs(dst_dir, exist_ok=True) 13 | 14 | # 获取所有jsonl文件 15 | file_path_list, file_nums = get_all_files(src_dir, ['.jsonl'], 'list') 16 | 17 | for i in tqdm.tqdm(range(file_nums)): 18 | with jsonlines.open(file_path_list[i]) as reader: 19 | file_path_list[i] = os.path.abspath(file_path_list[i]) 20 | for one_json in reader: 21 | file_name = hashlib.md5(file_path_list[i].encode('utf-8')).hexdigest() + hashlib.md5(one_json['文件名'].encode('utf-8')).hexdigest() + '.csv' 22 | with open(os.path.join(dst_dir, file_name), 'w', encoding='utf-8') as f: 23 | writer = csv.writer(f) 24 | row = [0, file_path_list[i], one_json['文件名'], one_json['simhash']] 25 | writer.writerow(row) 26 | md5s = {one_json['段落'][i]['md5'][8:-8] for i in range(len(one_json['段落']))} 27 | writer.writerow(md5s) 28 | 29 | 30 | ### 31 | # @description: 将jsonl文件转换为csv文件 32 | # @param src_dir: jsonl源文件夹路径 33 | # @param dst_dir: 转换后文件存放路径 34 | # 输出csv,第一行第一个是是否重复,第二个是jsonl文件名,第三个是txt文件名,第四个是simhash 35 | # 第二行是md5集合 36 | ### 37 | 38 | if __name__ == '__main__': 39 | # 设置参数解析器 40 | parser = argparse.ArgumentParser() 41 | # 添加必须指定的参数 42 | parser.add_argument('--src_dir', type=str, required=True, help="jsonl源文件夹路径") 43 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 44 | parser.add_argument('--dst_dir', type=str, default='output_csv/', help="指定转换后文件存放路径,默认为 output_csv/") 45 | 46 | # 解析参数 47 | args = parser.parse_args() 48 | # 调用convert函数 49 | convert_jsonl_to_csv(args.src_dir, args.dst_dir) 50 | -------------------------------------------------------------------------------- /parallel_dedup/multiprocess_deduplication.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os, sys 3 | current_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 5 | from utils.utils import jaccard_distance, get_all_files 6 | import tqdm 7 | import multiprocessing 8 | import utils.customSimhash as customSimhash 9 | import csv, pickle 10 | 11 | 12 | def deduplication(file_path_list, rs_pkl, simhash_threshold, similar_file_queue, flag): 13 | 14 | lsh = 
customSimhash.SimhashIndex([], f=64, k=simhash_threshold) 15 | 16 | if rs_pkl != None: 17 | try: 18 | lsh.bucket = pickle.load(open(rs_pkl, 'rb')) 19 | except: 20 | print('不存在该pkl文件,无法读取') 21 | 22 | count_dedup = 0 23 | for i in tqdm.tqdm(range(len(file_path_list))): 24 | with open(file_path_list[i], encoding="utf-8") as csvfile: 25 | reader = csv.reader(csvfile) 26 | row = next(reader) 27 | simhash_value = customSimhash.Simhash(int(row[3])) 28 | similar= lsh.add(file_path_list[i], simhash_value, return_similar=True) 29 | if(similar != ""): 30 | count_dedup += 1 31 | similar_file = [file_path_list[i], similar] 32 | similar_file_queue.put(similar_file) 33 | 34 | print('一共有:', count_dedup, '个重复文件被检查出来') 35 | 36 | if rs_pkl != None: 37 | pickle.dump(lsh.bucket, open(rs_pkl, 'wb')) 38 | print('已经把文件记录保存到', rs_pkl, '中') 39 | 40 | flag.value = True 41 | 42 | def check_similar_file(similar_file_queue, jaccard_thresold, flag): 43 | while True: 44 | try: 45 | similar_file = similar_file_queue.get(timeout=0.2) 46 | except: 47 | if(flag.value): 48 | break 49 | else: 50 | continue 51 | 52 | with open(similar_file[0], encoding="utf-8") as csvfile: 53 | reader = csv.reader(csvfile) 54 | next(reader) 55 | md5_set1 = set(next(reader)) 56 | with open(similar_file[1], encoding="utf-8") as csvfile: 57 | reader = csv.reader(csvfile) 58 | next(reader) 59 | md5_set2 = set(next(reader)) 60 | if(jaccard_distance(md5_set1, md5_set2) < jaccard_thresold): 61 | print(similar_file[0], similar_file[1],'jaccard相似度检查失败') 62 | print('相似度为', jaccard_distance(md5_set1, md5_set2)) 63 | else: 64 | with open(similar_file[0], 'r+') as file: 65 | # 将第一个字符替换成'1' 66 | file.write('1') 67 | 68 | 69 | def files_deplication(src_dir = 'output_csv/', rs_pkl = None, simhash_threshold = 3, jaccard_thresold = 0.8 , n_process = 13): 70 | """ 71 | 将多个csv中的文件进行去重 72 | :param src_dir: csv文件路径 73 | :param rs_pkl: 保存去重结果的pkl文件路径 74 | :param simhash_threshold: 指定去重阈值,默认为3,也就是simhash值相差3以内算相似 75 | :param n_process: 指定进程数,最低是2,也就是一个主进程一个检验去重结果进程,默认是13 76 | """ 77 | # 获取所有jsonl文件 78 | file_path_list, file_nums = get_all_files(src_dir, ['.csv'], 'list') 79 | similar_file_queue = multiprocessing.Queue(200) 80 | flag = multiprocessing.Value('b', False) 81 | for _ in range(n_process-1): 82 | p = multiprocessing.Process(target=check_similar_file, args=(similar_file_queue, jaccard_thresold, flag)) 83 | p.start() 84 | 85 | deduplication(file_path_list, rs_pkl, simhash_threshold, similar_file_queue, flag) 86 | 87 | 88 | if __name__ == '__main__': 89 | # 设置参数解析器 90 | parser = argparse.ArgumentParser() 91 | # 添加必须指定的参数 92 | parser.add_argument('--src_dir', type=str, default='output_csv/', help="源文件夹路径") 93 | # read save pkl 94 | parser.add_argument('--rs_pkl', required=False, help="源文件夹路径") 95 | # 添加可选参数,指定去重阈值 96 | parser.add_argument('--simhash_threshold', type=int, default=3, help="指定simhash去重阈值,默认为3") 97 | # 添加可选参数,指定jaccard相似度阈值 98 | parser.add_argument('--jaccard_thresold', type=float, default=0.8, help="指定jaccard相似度阈值,默认为0.8") 99 | # 添加可选参数,指定进程数,默认为13 100 | parser.add_argument('--n_process', type=int, default=13, help="指定进程数,默认为13") 101 | # 解析参数 102 | args = parser.parse_args() 103 | # 调用convert函数 104 | files_deplication(args.src_dir, args.rs_pkl, args.simhash_threshold, args.jaccard_thresold, args.n_process) -------------------------------------------------------------------------------- /parallel_dedup/reset_csv.py: -------------------------------------------------------------------------------- 1 | import argparse, tqdm 2 | import os, sys 3 | 
current_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 5 | from utils.utils import get_all_files 6 | 7 | if __name__ == '__main__': 8 | # 设置参数解析器 9 | parser = argparse.ArgumentParser() 10 | # 添加必须指定的参数 11 | parser.add_argument('--src_dir', type=str, required=True, help="csv源文件夹路径") 12 | 13 | # 解析参数 14 | args = parser.parse_args() 15 | 16 | # 获取所有jsonl文件 17 | file_path_list, file_nums = get_all_files(args.src_dir, ['.csv'], 'list') 18 | 19 | for i in tqdm.tqdm(range(file_nums)): 20 | with open(file_path_list[i], 'r+', encoding='utf-8') as f: 21 | f.write('0') 22 | -------------------------------------------------------------------------------- /parallel_dedup/write_output_to_jsonl.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | current_path = os.path.abspath(__file__) 3 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 4 | import argparse 5 | import tqdm 6 | from utils.utils import get_all_files 7 | import jsonlines 8 | import hashlib 9 | import tempfile 10 | import json 11 | 12 | def write_output_to_jsonl(csv_dir, jsonl_dir): 13 | 14 | # 获取所有jsonl文件 15 | file_path_list, file_nums = get_all_files(jsonl_dir, ['.jsonl'], 'list') 16 | for i in tqdm.tqdm(range(file_nums)): 17 | with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: 18 | with jsonlines.open(file_path_list[i]) as reader: 19 | file_path_list[i] = os.path.abspath(file_path_list[i]) 20 | for one_json in reader: 21 | file_name = hashlib.md5(file_path_list[i].encode('utf-8')).hexdigest() + hashlib.md5(one_json['文件名'].encode('utf-8')).hexdigest() + '.csv' 22 | with open(os.path.join(csv_dir, file_name), 'r', encoding='utf-8') as f: 23 | if f.read(1) == '1': 24 | one_json['是否重复文件'] = True 25 | else: 26 | one_json['是否重复文件'] = False 27 | temp_file.write(json.dumps(one_json) + '\n') 28 | os.replace(temp_file.name, file_path_list[i]) 29 | 30 | 31 | 32 | if __name__ == '__main__': 33 | # 设置参数解析器 34 | parser = argparse.ArgumentParser() 35 | # 添加必须指定的参数 36 | parser.add_argument('--csv_dir', type=str, required=True, help="csv源文件夹路径") 37 | # 添加可选参数,指定转换后文件存放路径,默认为converted/ 38 | parser.add_argument('--jsonl_dir', type=str, required=True, help="jsonl源文件夹路径") 39 | 40 | # 解析参数 41 | args = parser.parse_args() 42 | # 调用convert函数 43 | write_output_to_jsonl(args.csv_dir, args.jsonl_dir) 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines==3.1.0 2 | tqdm==4.64.1 3 | argparse==1.4.0 4 | simhash==2.1.2 5 | cityhash 6 | better-zipfile>=0.0.3 7 | charset-mnbvc>=0.0.12 -------------------------------------------------------------------------------- /utils/customSimhash.py: -------------------------------------------------------------------------------- 1 | # Created by 1e0n in 2013 2 | from __future__ import division, unicode_literals 3 | 4 | import collections 5 | import hashlib 6 | import logging 7 | import numbers 8 | import re 9 | import sys 10 | 11 | import numpy as np 12 | 13 | try: 14 | from collections.abc import Iterable 15 | except ImportError: 16 | from collections import Iterable 17 | 18 | if sys.version_info[0] >= 3: 19 | basestring = str 20 | unicode = str 21 | long = int 22 | 23 | def int_to_bytes(n, length): 24 | return n.to_bytes(length, 'big') 25 | 26 | def bytes_to_int(b): 27 | return int.from_bytes(b, 'big') 28 | else: 29 | range = xrange 30 | 31 | 
def int_to_bytes(n, length): 32 | return '{:0{}x}'.format(n, length * 2).decode('hex') 33 | 34 | def bytes_to_int(b): 35 | return int(b.encode('hex'), 16) 36 | 37 | def _hashfunc(x): 38 | return hashlib.md5(x).digest() 39 | 40 | def count_elements(features): 41 | 42 | result = {} 43 | current_key = None 44 | count = 0 45 | 46 | for feature in sorted(features): 47 | if feature != current_key: 48 | if current_key is not None: 49 | result[current_key] = count 50 | current_key = feature 51 | count = 1 52 | else: 53 | count += 1 54 | 55 | # 处理最后一个分组 56 | if current_key is not None: 57 | result[current_key] = count 58 | 59 | return result 60 | 61 | class Simhash(object): 62 | # Constants used in calculating simhash. Larger values will use more RAM. 63 | large_weight_cutoff = 50 64 | batch_size = 200 65 | 66 | def __init__( 67 | self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=_hashfunc, log=None 68 | ): 69 | """ 70 | `f` is the dimensions of fingerprints, in bits. Must be a multiple of 8. 71 | 72 | `reg` is meaningful only when `value` is basestring and describes 73 | what is considered to be a letter inside parsed string. Regexp 74 | object can also be specified (some attempt to handle any letters 75 | is to specify reg=re.compile(r'\w', re.UNICODE)) 76 | 77 | `hashfunc` accepts a utf-8 encoded string and returns either bytes 78 | (preferred) or an unsigned integer, in at least `f // 8` bytes. 79 | """ 80 | if f % 8: 81 | raise ValueError('f must be a multiple of 8') 82 | 83 | self.f = f 84 | self.f_bytes = f // 8 85 | self.reg = reg 86 | self.value = None 87 | self.hashfunc = hashfunc 88 | self.hashfunc_returns_int = isinstance(hashfunc(b"test"), numbers.Integral) 89 | 90 | if log is None: 91 | self.log = logging.getLogger("simhash") 92 | else: 93 | self.log = log 94 | 95 | if isinstance(value, Simhash): 96 | self.value = value.value 97 | elif isinstance(value, basestring): 98 | self.build_by_text(unicode(value)) 99 | elif isinstance(value, Iterable): 100 | self.build_by_features(value) 101 | elif isinstance(value, numbers.Integral): 102 | self.value = value 103 | else: 104 | raise Exception('Bad parameter with type {}'.format(type(value))) 105 | 106 | def __eq__(self, other): 107 | """ 108 | Compare two simhashes by their value. 109 | 110 | :param Simhash other: The Simhash object to compare to 111 | """ 112 | return self.value == other.value 113 | 114 | def _slide(self, content, width=4): 115 | return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))] 116 | 117 | def _tokenize(self, content): 118 | content = content.lower() 119 | content = ''.join(re.findall(self.reg, content)) 120 | ans = self._slide(content) 121 | return ans 122 | 123 | def build_by_text(self, content): 124 | features = self._tokenize(content) 125 | features = count_elements(features) 126 | return self.build_by_features(features) 127 | 128 | def build_by_features(self, features): 129 | """ 130 | `features` might be a list of unweighted tokens (a weight of 1 131 | will be assumed), a list of (token, weight) tuples or 132 | a token -> weight dict. 
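For example (illustrative only), `Simhash(['a', 'a', 'b'])`, `Simhash([('a', 2), ('b', 1)])` and `Simhash({'a': 2, 'b': 1})` are weighted identically and therefore produce the same fingerprint value.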
133 | """ 134 | sums = [] 135 | batch = [] 136 | count = 0 137 | w = 1 138 | truncate_mask = 2 ** self.f - 1 139 | if isinstance(features, dict): 140 | features = features.items() 141 | 142 | for f in features: 143 | skip_batch = False 144 | if not isinstance(f, basestring): 145 | f, w = f 146 | skip_batch = w > self.large_weight_cutoff or not isinstance(w, int) 147 | 148 | count += w 149 | if self.hashfunc_returns_int: 150 | h = int_to_bytes(self.hashfunc(f.encode('utf-8')) & truncate_mask, self.f_bytes) 151 | else: 152 | h = self.hashfunc(f.encode('utf-8'))[-self.f_bytes:] 153 | 154 | if skip_batch: 155 | sums.append(self._bitarray_from_bytes(h) * w) 156 | else: 157 | batch.append(h * w) 158 | if len(batch) >= self.batch_size: 159 | sums.append(self._sum_hashes(batch)) 160 | batch = [] 161 | 162 | if len(sums) >= self.batch_size: 163 | sums = [np.sum(sums, 0)] 164 | 165 | if batch: 166 | sums.append(self._sum_hashes(batch)) 167 | 168 | combined_sums = np.sum(sums, 0) 169 | self.value = bytes_to_int(np.packbits(combined_sums > count / 2).tobytes()) 170 | 171 | def _sum_hashes(self, digests): 172 | bitarray = self._bitarray_from_bytes(b''.join(digests)) 173 | rows = np.reshape(bitarray, (-1, self.f)) 174 | return np.sum(rows, 0) 175 | 176 | @staticmethod 177 | def _bitarray_from_bytes(b): 178 | return np.unpackbits(np.frombuffer(b, dtype='>B')) 179 | 180 | def distance(self, another): 181 | assert self.f == another.f 182 | x = (self.value ^ another.value) & ((1 << self.f) - 1) 183 | ans = 0 184 | while x: 185 | ans += 1 186 | x &= x - 1 187 | return ans 188 | 189 | 190 | class SimhashIndex(object): 191 | 192 | def __init__(self, objs, f=64, k=2, log=None): 193 | """ 194 | `objs` is a list of (obj_id, simhash) 195 | obj_id is a string, simhash is an instance of Simhash 196 | `f` is the same with the one for Simhash 197 | `k` is the tolerance 198 | """ 199 | self.k = k 200 | self.f = f 201 | count = len(objs) 202 | 203 | if log is None: 204 | self.log = logging.getLogger("simhash") 205 | else: 206 | self.log = log 207 | 208 | self.log.info('Initializing %s data.', count) 209 | 210 | self.bucket = collections.defaultdict(set) 211 | 212 | for i, q in enumerate(objs): 213 | if i % 10000 == 0 or i == count - 1: 214 | self.log.info('%s/%s', i + 1, count) 215 | 216 | self.add(*q) 217 | 218 | def get_near_dups(self, simhash): 219 | """ 220 | `simhash` is an instance of Simhash 221 | return a list of obj_id, which is in type of str 222 | """ 223 | assert simhash.f == self.f 224 | 225 | ans = set() 226 | 227 | for key in self.get_keys(simhash): 228 | dups = self.bucket[key] 229 | self.log.debug('key:%s', key) 230 | if len(dups) > 200: 231 | self.log.warning('Big bucket found. key:%s, len:%s', key, len(dups)) 232 | 233 | for dup in dups: 234 | sim2, obj_id = dup.split(',', 1) 235 | sim2 = Simhash(long(sim2, 16), self.f) 236 | 237 | d = simhash.distance(sim2) 238 | if d <= self.k: 239 | ans.add(obj_id) 240 | return list(ans) 241 | 242 | def get_near_dup(self, simhash): 243 | """ 244 | `simhash` is an instance of Simhash 245 | return a list of obj_id, which is in type of str 246 | """ 247 | assert simhash.f == self.f 248 | 249 | for key in self.get_keys(simhash): 250 | dups = self.bucket[key] 251 | self.log.debug('key:%s', key) 252 | if len(dups) > 200: 253 | self.log.warning('Big bucket found. 
key:%s, len:%s', key, len(dups)) 254 | 255 | for dup in dups: 256 | sim2, obj_id = dup.split(',', 1) 257 | sim2 = Simhash(long(sim2, 16), self.f) 258 | 259 | d = simhash.distance(sim2) 260 | if d <= self.k: 261 | return obj_id 262 | return '' 263 | 264 | def add(self, obj_id, simhash, return_similar=False): 265 | """ 266 | `obj_id` is a string 267 | `simhash` is an instance of Simhash 268 | `return_similar` is a bool, if True, return the similar obj_id 269 | """ 270 | assert simhash.f == self.f 271 | 272 | similar = '' 273 | for key in self.get_keys(simhash): 274 | v = '%x,%s' % (simhash.value, obj_id) 275 | # 如果当前文件已经在bucket里面,就直接返回 276 | if v in self.bucket[key]: 277 | return '' 278 | 279 | if return_similar and similar == '': 280 | for dup in self.bucket[key]: 281 | sim2, obj_id2 = dup.split(',', 1) 282 | sim2 = Simhash(long(sim2, 16), self.f) 283 | 284 | d = simhash.distance(sim2) 285 | if d <= self.k: 286 | similar = obj_id2 287 | 288 | self.bucket[key].add(v) 289 | 290 | return similar 291 | 292 | def delete(self, obj_id, simhash): 293 | """ 294 | `obj_id` is a string 295 | `simhash` is an instance of Simhash 296 | """ 297 | assert simhash.f == self.f 298 | 299 | for key in self.get_keys(simhash): 300 | v = '%x,%s' % (simhash.value, obj_id) 301 | if v in self.bucket[key]: 302 | self.bucket[key].remove(v) 303 | 304 | @property 305 | def offsets(self): 306 | """ 307 | You may optimize this method according to 308 | """ 309 | return [self.f // (self.k + 1) * i for i in range(self.k + 1)] 310 | 311 | def get_keys(self, simhash): 312 | for i, offset in enumerate(self.offsets): 313 | if i == (len(self.offsets) - 1): 314 | m = 2 ** (self.f - offset) - 1 315 | else: 316 | m = 2 ** (self.offsets[i + 1] - offset) - 1 317 | c = simhash.value >> offset & m 318 | yield '%x:%x' % (c, i) 319 | 320 | def bucket_size(self): 321 | return len(self.bucket) 322 | -------------------------------------------------------------------------------- /utils/redisSimhash.py: -------------------------------------------------------------------------------- 1 | # # Created by 1e0n in 2013 2 | # from __future__ import division, unicode_literals 3 | 4 | # import collections 5 | # import hashlib 6 | # import logging 7 | # import numbers 8 | # import re 9 | # import sys 10 | # from itertools import groupby 11 | 12 | # import numpy as np 13 | 14 | # try: 15 | # from collections.abc import Iterable 16 | # except ImportError: 17 | # from collections import Iterable 18 | # import redis 19 | 20 | 21 | # if sys.version_info[0] >= 3: 22 | # basestring = str 23 | # unicode = str 24 | # long = int 25 | 26 | # def int_to_bytes(n, length): 27 | # return n.to_bytes(length, 'big') 28 | 29 | # def bytes_to_int(b): 30 | # return int.from_bytes(b, 'big') 31 | # else: 32 | # range = xrange 33 | 34 | # def int_to_bytes(n, length): 35 | # return '{:0{}x}'.format(n, length * 2).decode('hex') 36 | 37 | # def bytes_to_int(b): 38 | # return int(b.encode('hex'), 16) 39 | 40 | # def _hashfunc(x): 41 | # return hashlib.md5(x).digest() 42 | 43 | 44 | # class Simhash(object): 45 | # # Constants used in calculating simhash. Larger values will use more RAM. 46 | # large_weight_cutoff = 50 47 | # batch_size = 200 48 | 49 | # def __init__( 50 | # self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=_hashfunc, log=None 51 | # ): 52 | # """ 53 | # `f` is the dimensions of fingerprints, in bits. Must be a multiple of 8. 
54 | 55 | # `reg` is meaningful only when `value` is basestring and describes 56 | # what is considered to be a letter inside parsed string. Regexp 57 | # object can also be specified (some attempt to handle any letters 58 | # is to specify reg=re.compile(r'\w', re.UNICODE)) 59 | 60 | # `hashfunc` accepts a utf-8 encoded string and returns either bytes 61 | # (preferred) or an unsigned integer, in at least `f // 8` bytes. 62 | # """ 63 | # if f % 8: 64 | # raise ValueError('f must be a multiple of 8') 65 | 66 | # self.f = f 67 | # self.f_bytes = f // 8 68 | # self.reg = reg 69 | # self.value = None 70 | # self.hashfunc = hashfunc 71 | # self.hashfunc_returns_int = isinstance(hashfunc(b"test"), numbers.Integral) 72 | 73 | # if log is None: 74 | # self.log = logging.getLogger("simhash") 75 | # else: 76 | # self.log = log 77 | 78 | # if isinstance(value, Simhash): 79 | # self.value = value.value 80 | # elif isinstance(value, basestring): 81 | # self.build_by_text(unicode(value)) 82 | # elif isinstance(value, Iterable): 83 | # self.build_by_features(value) 84 | # elif isinstance(value, numbers.Integral): 85 | # self.value = value 86 | # else: 87 | # raise Exception('Bad parameter with type {}'.format(type(value))) 88 | 89 | # def __eq__(self, other): 90 | # """ 91 | # Compare two simhashes by their value. 92 | 93 | # :param Simhash other: The Simhash object to compare to 94 | # """ 95 | # return self.value == other.value 96 | 97 | # def _slide(self, content, width=4): 98 | # return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))] 99 | 100 | # def _tokenize(self, content): 101 | # content = content.lower() 102 | # content = ''.join(re.findall(self.reg, content)) 103 | # ans = self._slide(content) 104 | # return ans 105 | 106 | # def build_by_text(self, content): 107 | # features = self._tokenize(content) 108 | # features = {k:sum(1 for _ in g) for k, g in groupby(sorted(features))} 109 | # return self.build_by_features(features) 110 | 111 | # def build_by_features(self, features): 112 | # """ 113 | # `features` might be a list of unweighted tokens (a weight of 1 114 | # will be assumed), a list of (token, weight) tuples or 115 | # a token -> weight dict. 
116 | # """ 117 | # sums = [] 118 | # batch = [] 119 | # count = 0 120 | # w = 1 121 | # truncate_mask = 2 ** self.f - 1 122 | # if isinstance(features, dict): 123 | # features = features.items() 124 | 125 | # for f in features: 126 | # skip_batch = False 127 | # if not isinstance(f, basestring): 128 | # f, w = f 129 | # skip_batch = w > self.large_weight_cutoff or not isinstance(w, int) 130 | 131 | # count += w 132 | # if self.hashfunc_returns_int: 133 | # h = int_to_bytes(self.hashfunc(f.encode('utf-8')) & truncate_mask, self.f_bytes) 134 | # else: 135 | # h = self.hashfunc(f.encode('utf-8'))[-self.f_bytes:] 136 | 137 | # if skip_batch: 138 | # sums.append(self._bitarray_from_bytes(h) * w) 139 | # else: 140 | # batch.append(h * w) 141 | # if len(batch) >= self.batch_size: 142 | # sums.append(self._sum_hashes(batch)) 143 | # batch = [] 144 | 145 | # if len(sums) >= self.batch_size: 146 | # sums = [np.sum(sums, 0)] 147 | 148 | # if batch: 149 | # sums.append(self._sum_hashes(batch)) 150 | 151 | # combined_sums = np.sum(sums, 0) 152 | # self.value = bytes_to_int(np.packbits(combined_sums > count / 2).tobytes()) 153 | 154 | # def _sum_hashes(self, digests): 155 | # bitarray = self._bitarray_from_bytes(b''.join(digests)) 156 | # rows = np.reshape(bitarray, (-1, self.f)) 157 | # return np.sum(rows, 0) 158 | 159 | # @staticmethod 160 | # def _bitarray_from_bytes(b): 161 | # return np.unpackbits(np.frombuffer(b, dtype='>B')) 162 | 163 | # def distance(self, another): 164 | # assert self.f == another.f 165 | # x = (self.value ^ another.value) & ((1 << self.f) - 1) 166 | # ans = 0 167 | # while x: 168 | # ans += 1 169 | # x &= x - 1 170 | # return ans 171 | 172 | 173 | # class SimhashIndex(object): 174 | 175 | # def __init__(self, objs, r, f=64, k=2, log=None): 176 | # """ 177 | # `objs` is a list of (obj_id, simhash) 178 | # obj_id is a string, simhash is an instance of Simhash 179 | # `f` is the same with the one for Simhash 180 | # `k` is the tolerance 181 | # """ 182 | # self.k = k 183 | # self.f = f 184 | # count = len(objs) 185 | 186 | # if log is None: 187 | # self.log = logging.getLogger("simhash") 188 | # else: 189 | # self.log = log 190 | 191 | # self.log.info('Initializing %s data.', count) 192 | 193 | # self.r = r 194 | 195 | 196 | # for i, q in enumerate(objs): 197 | # if i % 10000 == 0 or i == count - 1: 198 | # self.log.info('%s/%s', i + 1, count) 199 | 200 | # self.add(*q) 201 | 202 | # # def get_near_dups(self, simhash): 203 | # # """ 204 | # # `simhash` is an instance of Simhash 205 | # # return a list of obj_id, which is in type of str 206 | # # """ 207 | # # assert simhash.f == self.f 208 | 209 | # # for key in self.get_keys(simhash): 210 | # # dups = self.r.smembers(key) 211 | # # self.log.debug('key:%s', key) 212 | # # if len(dups) > 200: 213 | # # self.log.warning('Big bucket found. 
key:%s, len:%s', key, len(dups)) 214 | 215 | # # for dup in dups: 216 | # # sim2, obj_id = dup.split(',', 1) 217 | # # sim2 = Simhash(long(sim2, 16), self.f) 218 | 219 | # # d = simhash.distance(sim2) 220 | # # if d <= self.k: 221 | # # return obj_id 222 | # # return "" 223 | 224 | # def add(self, obj_id, simhash): 225 | # """ 226 | # `obj_id` is a string 227 | # `simhash` is an instance of Simhash 228 | # """ 229 | # assert simhash.f == self.f 230 | 231 | # similar = "" 232 | # for key in self.get_keys(simhash): 233 | # v = '%x,%s' % (simhash.value, obj_id) 234 | # if(similar == ""): 235 | # dups = self.r.smembers(key) 236 | # if(len(dups)>0): 237 | # for dup in dups: 238 | # dup = str(dup.decode('utf-8')) 239 | # sim2, obj_id = dup.split(',', 1) 240 | # sim2 = Simhash(long(sim2[2:], 16), self.f) 241 | 242 | # d = simhash.distance(sim2) 243 | # if d <= self.k: 244 | # similar = obj_id 245 | # break 246 | # self.r.sadd(key, v) 247 | 248 | # return similar 249 | 250 | # def delete(self, obj_id, simhash): 251 | # """ 252 | # `obj_id` is a string 253 | # `simhash` is an instance of Simhash 254 | # """ 255 | # assert simhash.f == self.f 256 | 257 | # for key in self.get_keys(simhash): 258 | # v = '%x,%s' % (simhash.value, obj_id) 259 | # if v in self.r.smembers(key): 260 | # self.r.srem(key, v) 261 | 262 | # @property 263 | # def offsets(self): 264 | # """ 265 | # You may optimize this method according to 266 | # """ 267 | # return [self.f // (self.k + 1) * i for i in range(self.k + 1)] 268 | 269 | # def get_keys(self, simhash): 270 | # for i, offset in enumerate(self.offsets): 271 | # if i == (len(self.offsets) - 1): 272 | # m = 2 ** (self.f - offset) - 1 273 | # else: 274 | # m = 2 ** (self.offsets[i + 1] - offset) - 1 275 | # c = simhash.value >> offset & m 276 | # yield '%x:%x' % (c, i) 277 | 278 | # def bucket_size(self): 279 | # return len(self.bucket) 280 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import multiprocessing 3 | 4 | str_encode = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 5 | max_size = 500 * 1024 * 1024 6 | max_memory = 1024 * 1024 * 1024 7 | 8 | 9 | # 递归读取文件夹下所有文件 10 | def get_all_files(dir_path, legal_file_type=('.txt',), return_file_type='queue'): 11 | if return_file_type == 'queue': 12 | return get_all_files_queue(dir_path, legal_file_type) 13 | elif return_file_type == 'list': 14 | return get_all_files_list(dir_path, legal_file_type) 15 | 16 | 17 | def get_all_files_queue(dir_path, legal_file_type=('.txt',)): 18 | file_nums = 0 19 | file_path_queue = multiprocessing.Manager().Queue() 20 | 21 | for root, _, files in os.walk(dir_path): 22 | for file in files: 23 | if os.path.splitext(file)[-1] not in legal_file_type: 24 | continue 25 | file_path = os.path.join(root, file) 26 | file_path_queue.put(file_path) 27 | file_nums += 1 28 | return file_path_queue, file_nums 29 | 30 | 31 | def get_all_files_list(dir_path, legal_file_type=('.txt',)): 32 | file_path_list = [] 33 | for root, _, files in os.walk(dir_path): 34 | for file in files: 35 | if os.path.splitext(file)[-1] not in legal_file_type: 36 | continue 37 | file_path = os.path.join(root, file) 38 | file_path_list.append(file_path) 39 | file_path_list = sorted(file_path_list) 40 | return file_path_list, len(file_path_list) 41 | 42 | 43 | def get_common_prefix_and_removed_list(strs): 44 | if not strs: 45 | return "", [] 46 | prefix = strs[0] 47 | 
for s in strs: 48 | while not s.startswith(prefix): 49 | prefix = prefix[:-1] 50 | if not prefix: 51 | return "", strs 52 | return prefix, [s[len(prefix):] for s in strs] 53 | 54 | 55 | def jaccard_distance(md5_list1, md5_list2): 56 | nominator = md5_list1.intersection(md5_list2) 57 | # 求集合 A 和集合 B 的并集 58 | denominator = md5_list1.union(md5_list2) 59 | # 计算比率 60 | similarity = len(nominator) / len(denominator) 61 | return similarity 62 | 63 | 64 | # 递归读取文件夹下所有文件夹 65 | def get_dictory_path(dir_path, return_file_type='queue'): 66 | if return_file_type == 'queue': 67 | 68 | def get_dictory_path_queue(dir_path): 69 | dictory_path_queue = multiprocessing.Queue() 70 | for root, dirs, _ in os.walk(dir_path): 71 | for dir in dirs: 72 | dictory_path = os.path.join(root, dir) 73 | dictory_path_queue.put(dictory_path) 74 | return dictory_path_queue 75 | 76 | return get_dictory_path_queue(dir_path) 77 | 78 | elif return_file_type == 'list': 79 | 80 | def get_dictory_path_list(dir_path): 81 | dictory_path_list = [] 82 | for root, dirs, _ in os.walk(dir_path): 83 | for dir in dirs: 84 | dictory_path = os.path.join(root, dir) 85 | dictory_path_list.append(dictory_path) 86 | return dictory_path_list 87 | 88 | return get_dictory_path_list(dir_path) 89 | 90 | 91 | # 不递归的读取当前文件夹的文件 92 | def get_files(dir_path, legal_file_type=('.txt',), return_file_type='queue'): 93 | if return_file_type == 'queue': 94 | def get_files_queue(dir_path, legal_file_type=('.txt',)): 95 | file_path_queue = multiprocessing.Queue() 96 | file_nums = 0 97 | for file in os.listdir(dir_path): 98 | if os.path.splitext(file)[-1] not in legal_file_type: 99 | continue 100 | file_path = os.path.join(dir_path, file) 101 | file_path_queue.put(file_path) 102 | file_nums += 1 103 | return file_path_queue, file_nums 104 | 105 | return get_files_queue(dir_path, legal_file_type) 106 | 107 | elif return_file_type == 'list': 108 | 109 | def get_files_list(dir_path, legal_file_type=('.txt',)): 110 | file_path_list = [] 111 | for file in os.listdir(dir_path): 112 | if os.path.splitext(file)[-1] not in legal_file_type: 113 | continue 114 | file_path = os.path.join(dir_path, file) 115 | file_path_list.append(file_path) 116 | return file_path_list, len(file_path_list) 117 | 118 | return get_files_list(dir_path, legal_file_type) 119 | -------------------------------------------------------------------------------- /words_dedup/add_jsonl_detailed_simhash.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import jsonlines 3 | import tqdm, os 4 | import tempfile 5 | import os, sys 6 | current_path = os.path.abspath(__file__) 7 | sys.path.append(os.path.dirname(os.path.dirname(current_path))) 8 | import utils.customSimhash as customSimhash 9 | from utils.utils import max_size, get_all_files 10 | import multiprocessing 11 | 12 | from cityhash import CityHash64 13 | 14 | def hashfunc(x): 15 | return CityHash64(x) 16 | 17 | def calculate_simhash(args): 18 | one_json, hashfunc = args 19 | text = '' 20 | for line_json in one_json['段落']: 21 | text += line_json['内容'] 22 | 23 | simhash = customSimhash.Simhash(text, hashfunc=hashfunc) 24 | one_json['alltext_simhash'] = simhash.value 25 | 26 | return one_json 27 | 28 | def convert(src_dir, num_processes): 29 | file_path_list, file_nums = get_all_files(src_dir, ['.jsonl'], 'list') 30 | 31 | for i in tqdm.tqdm(range(file_nums)): 32 | with jsonlines.open(file_path_list[i]) as input_file, tempfile.NamedTemporaryFile(mode='w', delete=False) as output_file: 33 | with 
multiprocessing.Pool(num_processes) as pool:
34 |                 # read every record of the jsonl file and hash them in parallel
35 |                 args = [(one_json, hashfunc) for one_json in input_file]
36 |                 results = pool.imap_unordered(calculate_simhash, args)
37 | 
38 |                 writer = jsonlines.Writer(output_file)
39 |                 for result in results:
40 |                     writer.write(result)
41 | 
42 |             # swap the rewritten temporary file in place of the original jsonl
43 |             output_file.close()
44 |             os.replace(output_file.name, file_path_list[i])
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     parser = argparse.ArgumentParser()
49 |     parser.add_argument("-d", "--directory", help="Directory to convert", required=True)
50 |     parser.add_argument("-p", "--processes", help="Number of processes to use", type=int, default=multiprocessing.cpu_count())
51 |     args = parser.parse_args()
52 | 
53 |     convert(args.directory, args.processes)
--------------------------------------------------------------------------------
/words_dedup/alltext_simhash.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import tqdm
4 | import jsonlines
5 | import sys
6 | current_path = os.path.abspath(__file__)
7 | sys.path.append(os.path.dirname(os.path.dirname(current_path)))
8 | import utils.customSimhash as customSimhash
9 | import pickle
10 | import tempfile
11 | from utils.utils import get_all_files
12 | 
13 | 
14 | def deduplication(file_path_list, rs_pkl, simhash_threshold):
15 |     lsh = customSimhash.SimhashIndex([], f=64, k=simhash_threshold)
16 | 
17 |     if rs_pkl is not None:
18 |         try:
19 |             lsh.bucket = pickle.load(open(rs_pkl, 'rb'))
20 |         except (OSError, pickle.UnpicklingError):
21 |             print('The pkl file does not exist or cannot be read; starting with an empty index')
22 | 
23 |     count_dedup = 0
24 |     for i in tqdm.tqdm(range(len(file_path_list))):
25 |         with jsonlines.open(file_path_list[i]) as input_file, tempfile.NamedTemporaryFile(mode='w', delete=False) as output_file:
26 |             writer = jsonlines.Writer(output_file)
27 |             for one_json in input_file:
28 |                 # rebuild the Simhash object from the integer fingerprint written by add_jsonl_detailed_simhash.py
29 |                 simhash_value = customSimhash.Simhash(one_json['alltext_simhash'])
30 |                 similar = lsh.add(file_path_list[i] + one_json['文件名'], simhash_value, return_similar=True)
31 |                 if similar != "":
32 |                     count_dedup += 1
33 |                     one_json['是否重复文件'] = True
34 |                     with open('重复文件.txt', 'a') as f:
35 |                         f.write(file_path_list[i] + one_json['文件名'] + ' is a duplicate of ' + similar + '\n')
36 | 
37 |                 writer.write(one_json)
38 | 
39 |             output_file.close()
40 |             os.replace(output_file.name, file_path_list[i])
41 | 
42 |     print('Found', count_dedup, 'duplicate files in total')
43 | 
44 |     if rs_pkl is not None:
45 |         pickle.dump(lsh.bucket, open(rs_pkl, 'wb'))
46 |         print('The simhash index has been saved to', rs_pkl)
47 | 
48 | 
49 | def files_deduplication(src_dir='output_csv/', rs_pkl=None, simhash_threshold=3):
50 |     """
51 |     Deduplicate the records stored across multiple jsonl files
52 |     :param src_dir: directory containing the jsonl files
53 |     :param rs_pkl: path of the pkl file used to persist the simhash index between runs (optional)
54 |     :param simhash_threshold: simhash threshold; fingerprints within this Hamming distance (default 3) are treated as near-duplicates
55 |     """
56 |     file_path_list, file_nums = get_all_files(src_dir, ['.jsonl'], 'list')
57 | 
58 |     deduplication(file_path_list, rs_pkl, simhash_threshold)
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     parser = argparse.ArgumentParser()
63 |     parser.add_argument('--src_dir', type=str, default='xxqgfilesjsonl copy/', help="source directory containing the jsonl files")
64 |     parser.add_argument('--rs_pkl', required=False, help="path of the pkl file used to persist the simhash index")
65 |     parser.add_argument('--simhash_threshold', type=int, default=3, help="simhash Hamming-distance threshold, default 3")
66 | 
67 |     args = parser.parse_args()
68 |     # truncate the duplicate-file log at the start of each run
69 |     with open('重复文件.txt', 'w') as f:
70 |         f.write('')
71 |     files_deduplication(args.src_dir, args.rs_pkl, args.simhash_threshold)
--------------------------------------------------------------------------------
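
As a usage sketch (not part of the repository), the snippet below exercises `utils/customSimhash.py` directly: it fingerprints two near-identical strings, checks their Hamming distance, and queries a `SimhashIndex` with tolerance `k`. It assumes it is run from the repository root (so the `utils` package is importable, as the `words_dedup` scripts arrange via `sys.path`) and that the `Simhash` constructor accepts raw text with a default hash function, matching the commented-out `redisSimhash.py` version; the example strings and ids are made up.

```python
# Hedged sketch: fingerprint two near-identical texts and query the index.
# Run from the repository root so that the `utils` package is importable.
import utils.customSimhash as customSimhash

a = customSimhash.Simhash('MNBVC deduplication simhash example text')
b = customSimhash.Simhash('MNBVC deduplication simhash sample text')

print(a.distance(b))  # Hamming distance between the two 64-bit fingerprints

# k=3: fingerprints differing in at most 3 bits are guaranteed to share a band key
index = customSimhash.SimhashIndex([('doc_a', a)], f=64, k=3)
print(index.get_near_dups(b))                      # ['doc_a'] if distance(a, b) <= 3
print(index.add('doc_b', b, return_similar=True))  # 'doc_a' if b is within tolerance, else ''
```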
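
The candidate lookup in `SimhashIndex` rests on a pigeonhole argument: `offsets` cuts the `f`-bit fingerprint into `k + 1` contiguous bands (for `f=64, k=2` the offsets are `[0, 21, 42]`, giving band widths of 21, 21 and 22 bits), and two fingerprints that differ in at most `k` bits must agree exactly on at least one band, so `get_keys` produces at least one shared bucket key for them. The sketch below reproduces that key computation outside the class; the fingerprint value is made up.

```python
# Reproduces the band-key computation of SimhashIndex.get_keys for f=64, k=2.
f, k = 64, 2
value = 0x0123456789ABCDEF  # made-up 64-bit fingerprint

offsets = [f // (k + 1) * i for i in range(k + 1)]   # [0, 21, 42]
keys = []
for i, offset in enumerate(offsets):
    # the last band absorbs the remaining bits (22 here), the others are 21 bits wide
    width = f - offset if i == len(offsets) - 1 else offsets[i + 1] - offset
    band = (value >> offset) & ((1 << width) - 1)
    keys.append('%x:%x' % (band, i))

print(keys)  # one bucket key per band; near-duplicates share at least one of them
```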
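
For the two `words_dedup` scripts, a plausible end-to-end invocation looks like the following; the directory and pkl names are placeholders, while the flags are the ones defined in each script's argparse block.

```shell
# 1. Compute a whole-text simhash for every record of every .jsonl file, in place.
python words_dedup/add_jsonl_detailed_simhash.py -d ./jsonl_data -p 8

# 2. Mark near-duplicate files (Hamming distance <= 3), log them to 重复文件.txt,
#    and persist the simhash index to index.pkl for later incremental runs.
python words_dedup/alltext_simhash.py --src_dir ./jsonl_data --rs_pkl index.pkl --simhash_threshold 3
```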
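
Two small helpers in `utils/utils.py` are easy to sanity-check in isolation. The snippet below (run from the repository root, with made-up md5 prefixes and paths) shows the Jaccard computation over paragraph-md5 sets and the common-prefix stripping used for path lists; note that despite its name, `jaccard_distance` returns a similarity, not a distance.

```python
# Run from the repository root so the `utils` package is importable.
from utils.utils import jaccard_distance, get_common_prefix_and_removed_list

# jaccard_distance expects two sets (e.g. paragraph md5s) and, despite the name,
# returns the Jaccard similarity |A ∩ B| / |A ∪ B|.
a = {'d41d8cd9', '9e107d9d', '45c48cce'}   # made-up, truncated md5 values
b = {'9e107d9d', '45c48cce', 'c4ca4238'}
print(jaccard_distance(a, b))              # 2 shared / 4 total -> 0.5

# Strips the longest common string prefix from a list of paths.
prefix, rest = get_common_prefix_and_removed_list(['data/a/1.txt', 'data/a/2.txt', 'data/b/3.txt'])
print(prefix)  # 'data/'
print(rest)    # ['a/1.txt', 'a/2.txt', 'b/3.txt']
```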