├── README.md
└── package
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-35.pyc
    │   └── deal_file.cpython-35.pyc
    └── deal_file.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Python large-file deduplication

Tested: deduplicating one million records takes less than a second.

Usage:

After cloning from git, import the module:

```
from package.deal_file import LargeFileRemoveDuplication

# File to deduplicate
oldfile = '/home/wry/workspace/large_file_remove/files/article_link.txt'
# File to write the deduplicated result to
newfile = '/home/wry/workspace/large_file_remove/files/test_space.txt'
# Run
LargeFileRemoveDuplication(oldfile, newfile)
```

--------------------------------------------------------------------------------
/package/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wrydeveloper/large_file_remove_duplication/305cc2844414a435ccd09d49861c84c694af1ac8/package/__init__.py

--------------------------------------------------------------------------------
/package/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wrydeveloper/large_file_remove_duplication/305cc2844414a435ccd09d49861c84c694af1ac8/package/__pycache__/__init__.cpython-35.pyc

--------------------------------------------------------------------------------
/package/__pycache__/deal_file.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wrydeveloper/large_file_remove_duplication/305cc2844414a435ccd09d49861c84c694af1ac8/package/__pycache__/deal_file.cpython-35.pyc

--------------------------------------------------------------------------------
/package/deal_file.py:
--------------------------------------------------------------------------------
import os


# Deduplicate a large file: split it into hash buckets on disk, then
# deduplicate each bucket in memory so the whole file never has to fit in RAM.
class LargeFileRemoveDuplication:

    def __init__(self, oldfilepath, newfilepath, split_count=10, author='wry', temp_dir='temp_dir/'):
        self.oldfilepath = oldfilepath
        self.newfilepath = newfilepath
        self.split_count = split_count
        self.author = author
        self.temp_dir = temp_dir

        handle_file, files = self.generate_file()
        self.calcu_hash(handle_file)
        self.close_file(handle_file)
        self.data_uniq(files)

    # Step 1: create the temporary split files and keep their open handles.
    def generate_file(self):
        # files holds the split-file paths, handle_file the open file objects
        handle_file, files = [], []
        if not os.path.exists(self.temp_dir):
            os.makedirs(self.temp_dir)

        for i in range(self.split_count):
            path = self.temp_dir + 'split_' + str(i)
            files.append(path)
            handle_file.append(open(path, 'w'))

        return handle_file, files

    # Step 2: distribute every line to a split file by the hash of its content,
    # so identical lines always land in the same file.
    def calcu_hash(self, handle_file):
        with open(self.oldfilepath, 'r') as f:
            for line in f:
                handle_file[hash(line) % self.split_count].write(line)

    # Step 3: close all split files.
    def close_file(self, handle_file):
        for handle in handle_file:
            handle.close()

    # Step 4: read each split file, deduplicate it in memory, and append the
    # unique lines to the output file.
    def data_uniq(self, files):
        new_file = open(self.newfilepath, 'w')
        for filename in files:
            dataset = dict()
            with open(filename, 'r') as f:
                for line in f:
                    dataset[line] = 1

            for key in dataset.keys():
                new_file.write(key)

        new_file.close()
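

# A minimal usage sketch. The paths below are hypothetical examples, not files
# shipped with the repository. The constructor runs the whole
# split/deduplicate pipeline, so instantiating the class is all that is needed.
if __name__ == '__main__':
    LargeFileRemoveDuplication('files/article_link.txt', 'files/article_link_uniq.txt', split_count=10)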
--------------------------------------------------------------------------------