├── README.md
└── package
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-35.pyc
    │   └── deal_file.cpython-35.pyc
    └── deal_file.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Python large-file deduplication

Tested: deduplicating one million records takes less than a second.

Usage:

After cloning from git, import the module:

```
from package.deal_file import LargeFileRemoveDuplication

# File to deduplicate
oldfile = '/home/wry/workspace/large_file_remove/files/article_link.txt'
# File to write the deduplicated result to
newfile = '/home/wry/workspace/large_file_remove/files/test_space.txt'
# Run
LargeFileRemoveDuplication(oldfile, newfile)
```

--------------------------------------------------------------------------------
/package/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wrydeveloper/large_file_remove_duplication/305cc2844414a435ccd09d49861c84c694af1ac8/package/__init__.py

--------------------------------------------------------------------------------
/package/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wrydeveloper/large_file_remove_duplication/305cc2844414a435ccd09d49861c84c694af1ac8/package/__pycache__/__init__.cpython-35.pyc

--------------------------------------------------------------------------------
/package/__pycache__/deal_file.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wrydeveloper/large_file_remove_duplication/305cc2844414a435ccd09d49861c84c694af1ac8/package/__pycache__/deal_file.cpython-35.pyc

--------------------------------------------------------------------------------
/package/deal_file.py:
--------------------------------------------------------------------------------
import os


# Deduplicate a large file: split it into hash buckets on disk, then
# deduplicate each bucket in memory so the whole file never has to fit in RAM.
class LargeFileRemoveDuplication:

    def __init__(self, oldfilepath, newfilepath, split_count=10, author='wry', temp_dir='temp_dir/'):
        self.oldfilepath = oldfilepath
        self.newfilepath = newfilepath
        self.split_count = split_count
        self.author = author
        self.temp_dir = temp_dir

        handle_file, files = self.generate_file()
        self.calcu_hash(handle_file)
        self.close_file(handle_file)
        self.data_uniq(files)

    # Step 1: create the temporary split files and keep their open handles.
    def generate_file(self):
        # files holds the split-file paths, handle_file the open file objects
        handle_file, files = [], []
        if not os.path.exists(self.temp_dir):
            os.makedirs(self.temp_dir)

        for i in range(self.split_count):
            path = self.temp_dir + 'split_' + str(i)
            files.append(path)
            handle_file.append(open(path, 'w'))

        return handle_file, files

    # Step 2: distribute every line to a split file by the hash of its content,
    # so identical lines always land in the same file.
    def calcu_hash(self, handle_file):
        with open(self.oldfilepath, 'r') as f:
            for line in f:
                handle_file[hash(line) % self.split_count].write(line)

    # Step 3: close all split files.
    def close_file(self, handle_file):
        for handle in handle_file:
            handle.close()

    # Step 4: read each split file, deduplicate it in memory, and append the
    # unique lines to the output file.
    def data_uniq(self, files):
        new_file = open(self.newfilepath, 'w')
        for filename in files:
            dataset = dict()
            with open(filename, 'r') as f:
                for line in f:
                    dataset[line] = 1

            for key in dataset.keys():
                new_file.write(key)

        new_file.close()
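

# A minimal usage sketch. The paths below are hypothetical examples, not files
# shipped with the repository. The constructor runs the whole
# split/deduplicate pipeline, so instantiating the class is all that is needed.
if __name__ == '__main__':
    LargeFileRemoveDuplication('files/article_link.txt', 'files/article_link_uniq.txt', split_count=10)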
--------------------------------------------------------------------------------