├── LICENSE ├── README.md ├── dedup_files.py └── util.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dedup_files 2 | for file deduplication(文件去重) 3 | 本程序的原理是扫描目标目录的所有文件,计算他们的md5值,如果一样,说明是一个文件。 4 | 统计出来所有md5值后,找到副本数量大于1的文件,只保留一份副本,然后用这份副本做硬链接到其它的几个副本文件名处。 5 | 从而尝试释放空间。 6 | **由于FAT/FAT32/EXFAT等文件系统不支持硬链接,所以这样的文件系统是不被支持的,也就不用尝试了。** 7 | (当前一般windows都是ntfs文件系统,支持硬链接) 8 | 最初写来是用来给微信数据文件夹瘦身的,因为微信的各种文件转发,图片转发什么的,都是copy,不是hard link。 9 | **注:不知个人微信在哪个版本开始,文件/图片转发,已经从copy改成hard link了。从当前测试的win10上的微信3.9.7.25版本看,转发时,微信会智能做硬链接了。** 10 | 当然这么久了,肯定有历史积累,有历史包袱,历史包袱部分,还都是副本,不是硬链接,用本程序还是可以去重 11 | 对于我个人而言,改成hard link后,个人的30G占用空间,一下子就少了10G。 12 | 当然非微信的目录也是一样的可以文件去重。第一次执行时,它会自动生成一个dedup_files.ini的配置文件, 13 | 其中dirs里面指定了哪些目录是要扫描进行去重的, 14 | cache_file:参数指定存放cache文件的路径,默认放在当前程序路径下 15 | md5_key_file:以md5为key的hash dict,文件存放路径 16 | to_del_file: 以md5为key的hash dict,放置了后面要删除文件改用硬链接的文件名称 17 | ask_before_del:删除文件前,是否进行询问 18 | max_workers:算md5时的,最大并发线程数量 19 | 20 | 21 | 增加了统计预计可以释放的空间大小(自动忽略已经做了硬链接的文件),要副本改硬链接的文件数量,以及实际执行完后释放了的空间大小。 22 | 增加了多线程并发,多个文件同时并发计算md5,加快速度。但python的GIL限制,也就能快个2-3倍左右 23 | -------------------------------------------------------------------------------- /dedup_files.py: -------------------------------------------------------------------------------- 1 | import os, sys, re, json, time 2 | import hashlib 3 | from collections import defaultdict 4 | import ast 5 | import util 6 | import concurrent.futures 7 | import threading 8 | sys.stdout.reconfigure(encoding='utf-8') 9 | 10 | SUPPORT_HARD_LINK_FS=['ext2','ext2','ext4','xfs','zfs','ntfs'] #fat和exfat文件系统,都不支持硬链接 11 | 12 | #本程序,扫描给定目录及其子目录的所有文件,并计算每个文件的md5值(假定md5没有碰撞),结果存入md5_dict 13 | 14 | app_path = os.path.dirname(os.path.abspath(sys.argv[0])) 15 | log_file = os.path.basename(sys.argv[0]).split('.')[0] + '.log' 16 | cfg_file=os.path.basename(sys.argv[0]).split('.')[0] + '.ini' 17 | 18 | md5_dict=defaultdict(list) 19 | ino_dict=defaultdict(list) 20 | md5_array=[] 21 | 22 | mylog=util.get_logger(log_file) #设置写入日志函数 23 | cfg=util.get_cfg(cfg_file) #从配置文件中读入配置,如果文件不存在,则创建一个 24 | mylog.info(json.dumps(cfg,indent=2,ensure_ascii=True)) 25 | 26 | # 创建锁对象 27 | md5_lock = threading.Lock() 28 | ino_lock = threading.Lock() 29 | array_lock = threading.Lock() 30 | 31 | #检查配置的去重目录是否支持硬链接,如果不支持,进行警告提醒,并直接退出 32 | for dir in cfg['dirs']: 33 | fs_type=util.get_fs_type(dir) 34 | if fs_type in SUPPORT_HARD_LINK_FS: 35 | mylog.info(f"{dir}所在文件系统为{fs_type},支持硬链接") 36 | else: 37 | print() 38 | mylog.error(f"{dir}所在文件系统为{fs_type},不支持硬链接,请重新调整{cfg_file}中dirs的设置") 39 | print() 40 | input("按回车键退出") 41 | sys.exit(-1) 42 | 43 | def write_cache_file(cache_data,fname=None): #把所有的内存中的每个文件的md5的数据,按一条条格式,全部dump到cache_file里面去 44 | if not fname: 45 | fname=cfg['cache_file'] 46 | with open(fname, 'w', encoding='utf-8') as file: 47 | for entry in cache_data: 48 | # 使用repr()将字典转换为字符串,并写入文件 49 | file.write(repr(entry) + '\n') 50 | 51 | def append_record_to_cache(record_data): #把当前这么一条数据,按照字典一行的模式,追加到cache_file中去 52 | # 使用'a'模式打开文件,以便追加数据 53 | with open(cfg['cache_file'], 'a', encoding='utf-8') as file: 54 | # 使用repr()将字典转换为字符串,并追加到文件末尾 55 | file.write(repr(record_data) + '\n') 56 | 57 | def read_cache_file(): #把cache中所有文件的md5的数值,全部读入到内存中来,返回cache_data的数组 58 | cache_data = [] 59 | if not os.path.isfile(cfg['cache_file']): 60 | return [] 61 | with open(cfg['cache_file'], 'r', encoding='utf-8') as file: 62 | for line in file: 63 | # 使用ast.literal_eval将文本行解析为字典 64 | data = ast.literal_eval(line.strip()) 65 | cache_data.append(data) 66 | return cache_data 67 | 68 | def convert_array_to_dict(input_array): #把md5的{'file_path':file_path,'size':size,'md5':md5_value}数组,转换成基于每一个md5的hash dict中,dict的key为md5值 69 | result_dict =defaultdict(list) 70 | for item in input_array: 71 | if item['md5'] not in result_dict: 72 | result_dict[item['md5']] = [item] 73 | else: 74 | result_dict[item['md5']].append(item) 75 | return result_dict 76 | 77 | def convert_array_to_ino_dict(input_array): #把md5的{'file_path':file_path,'size':size,'md5':md5_value}数组,转换成基于每一个ino的hash dict中,dict的key为ino值 78 | result_dict =defaultdict(list) 79 | for item in input_array: 80 | if item['ino'] not in result_dict: 81 | result_dict[item['ino']] = [item] 82 | else: 83 | result_dict[item['ino']].append(item) 84 | return result_dict 85 | 86 | def convert_dict_to_array(input_dict): #把基于md5做hash key的dict,转换为md5的{'file_path':file_path,'size':size,'md5':md5_value}数组 87 | result = [] 88 | for uuid in input_dict: 89 | result.append(input_dict[uuid]) 90 | return result 91 | 92 | def find_record_by_file_path(file_path): 93 | #给定一个file_path,检查是否已经在md5_array列表里面,如果在的话,返回True,否则False 94 | #如果没缓存过,或者没变化,会返回None 95 | global md5_array 96 | for record in md5_array: 97 | if file_path== record['file_path']: 98 | return True 99 | return None 100 | 101 | def calculate_md5(file_path, md5_dict, ino_dict, md5_array): 102 | try: 103 | md5_value = util.md5_file(file_path) 104 | size = os.stat(file_path).st_size 105 | ino = os.stat(file_path).st_ino 106 | item_record = {'file_path': file_path, 'size': size, 'md5': md5_value, 'ino': ino} 107 | if ino in ino_dict: 108 | return # Skip hard links 109 | # 使用锁来确保对字典和数组的访问是线程安全的 110 | with md5_lock: 111 | md5_dict[md5_value].append(item_record) 112 | with ino_lock: 113 | ino_dict[ino].append(item_record) 114 | with array_lock: 115 | md5_array.append(item_record) 116 | with array_lock: 117 | append_record_to_cache(item_record) 118 | except Exception as e: 119 | mylog.error(f"Error calculating MD5 for {file_path}: {e}") 120 | 121 | 122 | def get_md5_info(): 123 | # 用于存储文件MD5哈希值和路径的字典 124 | global md5_array, md5_dict, ino_dict 125 | i = 0 126 | with concurrent.futures.ThreadPoolExecutor(max_workers=cfg['max_workers']) as executor: 127 | for directory in cfg['dirs']: 128 | mylog.info(f"遍历{directory}获得每一个文件中……") 129 | for root, _, files in os.walk(directory): 130 | for file in files: 131 | file_path = os.path.join(root, file) 132 | i += 1 133 | ss_file_path = util.remove_unprintable_chars(file_path) 134 | log_ss = f"{i=},{ss_file_path}," 135 | if find_record_by_file_path(file_path): 136 | mylog.info(f"{log_ss} in cache, will pass") 137 | continue 138 | mylog.info(f"{log_ss} not in cache, will calculate md5 now") 139 | executor.submit(calculate_md5, file_path, md5_dict, ino_dict, md5_array) 140 | 141 | open(cfg['md5_key_file'], 'w', encoding='utf8').write(json.dumps(md5_dict, indent=2, ensure_ascii=False)) 142 | return md5_dict 143 | 144 | 145 | #从cache文件中,载入已经算过的md5数据 146 | md5_array=read_cache_file() 147 | md5_dict = convert_array_to_dict(md5_array) 148 | ino_dict = convert_array_to_ino_dict(md5_array) 149 | 150 | #开始计算每一个文件的md5 151 | stime=time.time() 152 | md5_dict = get_md5_info() 153 | etime=time.time() 154 | mylog.info(f"遍历所有目录完成,耗时{etime-stime}秒,现在比对重复文件") 155 | 156 | #统计当前有哪些文件有多份副本,如果删除它们,可以节约多少空间 157 | duplicate_records=[] 158 | save_size=0 #可以节约的空间 159 | to_del_md5_dict=defaultdict(list) #存储需要删除文件清单 160 | for md5_value in md5_dict: 161 | records=md5_dict[md5_value] 162 | src_file=records[0]['file_path'] #取第一个记录作为源 163 | try: 164 | src_ino=os.stat(src_file).st_ino #由于有cache缓存里记录了这个文件,就会用最开始的inode number,即使后面删除文件,做了硬链接,也没有更新cache.dat和md5_key_files.dat,因此这里现获得一遍 165 | except Exception as e: 166 | mylog.warning(f"{src_file}这个文件可能没有了,现在跳过这个文件,reason:{e}") 167 | continue 168 | if len(records) > 1: #只有副本数量>1的,才是重复文件 169 | for other_record in records[1:]: #把副本的第一个记录去掉,剩下的几个记录,就都是可以节约空间的 170 | cur_file=other_record['file_path'] 171 | try: 172 | cur_ino=os.stat(cur_file).st_ino 173 | except Exception as e: 174 | mylog.warning(f"{cur_file}这个文件可能没有了,现在跳过这个文件,reason:{e}") 175 | continue 176 | if src_ino==cur_ino: #如果两个文件的inode number相同,认为是一个硬链接,那么跳过这个 177 | continue 178 | if md5_value in to_del_md5_dict: 179 | to_del_md5_dict[md5_value].append(other_record) 180 | else: 181 | to_del_md5_dict[md5_value]=[records[0],other_record] 182 | save_size+=other_record['size'] 183 | 184 | cnt_to_del_couple_files=0 185 | cnt_to_del_files=0 186 | 187 | for md5_value in to_del_md5_dict: 188 | cnt_to_del_couple_files+=1 189 | cnt_to_del_files+=len(to_del_md5_dict[md5_value]) 190 | 191 | if save_size==0: 192 | mylog.info("当前没有重复副本需要优化,可优化空间为0字节") 193 | input("按回车键退出") 194 | sys.exit(0) 195 | 196 | open(cfg['to_del_file'],'w',encoding='utf8').write(json.dumps(to_del_md5_dict,indent=2,ensure_ascii=False)) 197 | 198 | if cfg['ask_before_del']: 199 | a=input(f"共有{cnt_to_del_couple_files}文件组(共{cnt_to_del_files}个文件),预计可节约{save_size/1024/1024:0.2f}MB空间,是否继续?(Y/N)") 200 | if not a.lstrip().rstrip().lower()=='y': 201 | mylog.info("你没有选择Y,本程序将会退出,不进行删除副本操作") 202 | print() 203 | input("按回车键退出") 204 | sys.exit() 205 | 206 | cnt_real_del_files=0 #实际删除文件数量 207 | size_real_del_files=0 #实际删除掉的文件,释放的空间数量 208 | # 创建硬链接 209 | for md5_value in md5_dict: 210 | records=md5_dict[md5_value] 211 | if len(records) > 1: 212 | # 找到重复的文件,保留一个,删除其它的,然后创建硬链接 213 | reference_file = records[0]['file_path'] 214 | ss_reference_file=util.remove_unprintable_chars(reference_file) 215 | for other_record in records[1:]: 216 | duplicate_file=other_record['file_path'] 217 | ss_duplicate_file = util.remove_unprintable_chars(duplicate_file) 218 | try: 219 | mylog.warning(f"deleting {ss_duplicate_file}") 220 | except Exception as e: 221 | mylog.error(e) 222 | continue 223 | try: 224 | os.remove(duplicate_file) 225 | cnt_real_del_files += 1 226 | size_real_del_files += other_record['size'] 227 | except Exception as e: 228 | mylog.error(f"remove file failed,reason:{e}") 229 | continue 230 | try: 231 | mylog.info(f"link from {ss_reference_file} to {ss_duplicate_file}") 232 | except Exception as e: 233 | mylog.error(e) 234 | continue 235 | try: 236 | os.link(reference_file, duplicate_file) 237 | except Exception as e: 238 | mylog.error(f"link file failed,reason:{e}") 239 | continue 240 | mylog.info(f"共整理{cnt_real_del_files}个文件,释放了{size_real_del_files/1024/1024:0.2f}MB空间") 241 | print() 242 | input("按回车键退出") 243 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import sys,os,logging,json 3 | import hashlib 4 | import winreg 5 | import platform 6 | import subprocess 7 | import ctypes 8 | import ctypes as ct 9 | from ctypes import wintypes as w 10 | import multiprocessing 11 | 12 | app_path = os.path.dirname(os.path.abspath(sys.argv[0])) 13 | 14 | def read_registry_value(key_path, value_name): 15 | try: 16 | key = winreg.OpenKey(winreg.HKEY_CURRENT_USER, key_path) 17 | value, _ = winreg.QueryValueEx(key, value_name) 18 | winreg.CloseKey(key) 19 | return value 20 | except FileNotFoundError: 21 | print("Registry key not found.") 22 | except PermissionError: 23 | print("Permission denied.") 24 | except Exception as e: 25 | print("Error occurred:", str(e)) 26 | 27 | 28 | def get_wechat_path(): 29 | try: 30 | user = getpass.getuser() 31 | dirs = [] #函数返回结果,存放微信存储数据的目录(可能又历史遗留,多个地方装了,这个就都扫描一遍,返回一个路径的数组) 32 | dic = { 33 | 'pc': 'C:\\Users\\' + user + '\\Documents\\WeChat Files', 34 | 'forwin10': 'C:\\Users\\' + user + '\\AppData\\Local\\Packages\\TencentWeChatLimited.forWindows10_sdtnhv12zgd7a\\LocalCache\\Roaming\\Tencent\\WeChatAppStore\\WeChatAppStore Files', 35 | 'foruwp': 'C:\\Users\\' + user + '\\AppData\\Local\\Packages\\TencentWeChatLimited.WeChatUWP_sdtnhv12zgd7a\\LocalCache\\Roaming\\Tencent\\WeChatAppStore\\WeChatAppStore Files' 36 | } 37 | for key in dic: 38 | if os.path.exists(dic[key]): 39 | dirs.append(dic[key]) 40 | # 注册表路径和字段名 41 | registry_key_path = r"software\tencent\wechat" 42 | value_name = "FileSavePath" 43 | # 读取注册表里面微信的存储路劲 44 | value = read_registry_value(registry_key_path, value_name) 45 | if not value in dirs: 46 | dirs.append(value) 47 | return dirs 48 | except Exception as e: 49 | return [os.getcwd()] #如果没获得到微信数据目录,那么返回当前程序所在目录,否则返回None,不太友好 50 | 51 | def get_fs_type_old(path): #给定一个path,获得这个path所在文件系统的类型 52 | system = platform.system() 53 | if system == 'Windows': 54 | # 设置命令行编码为437(英文) 55 | subprocess.run('chcp 437', shell=True) #修改page页,后续命令英文输出 56 | drive, _ = os.path.splitdrive(path) 57 | command = f'fsutil fsinfo volumeinfo {drive}' 58 | output = subprocess.check_output(command, shell=True).decode() 59 | # 提取文件系统类型 60 | file_system_type = "" 61 | for line in output.split('\n'): 62 | if "File System Name : " in line: 63 | file_system_type = line.split("File System Name : ")[1].strip().lstrip().lower() 64 | break 65 | return file_system_type 66 | elif system == 'Linux': 67 | command = 'df -PT '+path 68 | output = subprocess.check_output(command, shell=True).decode() 69 | # 解析Linux df命令的输出,获取文件系统类型 70 | lines = output.split('\n') 71 | if len(lines) > 1: 72 | fields = lines[1].split() 73 | if len(fields) >= 2: 74 | file_system_type = fields[1] 75 | return file_system_type.lower() 76 | return "Unknown" 77 | 78 | def get_fs_type(path): #给定一个path,获得这个path所在文件系统的类型 79 | system = platform.system() 80 | try: 81 | if system == 'Windows': 82 | if system == 'Windows': 83 | drive, _ = os.path.splitdrive(path) 84 | target_disk=drive+"\\" 85 | volumeNameBuffer = ct.create_unicode_buffer(w.MAX_PATH + 1) 86 | fileSystemNameBuffer = ct.create_unicode_buffer(w.MAX_PATH + 1) 87 | volume_name_buffer = ctypes.create_unicode_buffer(drive) 88 | serial_number = w.DWORD() 89 | max_component_length = w.DWORD() 90 | file_system_flags = w.DWORD() 91 | result = ctypes.windll.kernel32.GetVolumeInformationW( 92 | target_disk, 93 | volumeNameBuffer, ct.sizeof(volumeNameBuffer), 94 | ct.byref(serial_number), 95 | ct.byref(max_component_length), 96 | ct.byref(file_system_flags), 97 | fileSystemNameBuffer, ct.sizeof(fileSystemNameBuffer)) 98 | print(f"{result=},{serial_number.value=},{file_system_flags.value=},{fileSystemNameBuffer.value=}") 99 | if result != 0: 100 | return fileSystemNameBuffer.value.lower() 101 | elif system == 'Linux': 102 | command = 'df -PT '+path 103 | output = subprocess.check_output(command, shell=True).decode() 104 | # 解析Linux df命令的输出,获取文件系统类型 105 | lines = output.split('\n') 106 | if len(lines) > 1: 107 | fields = lines[1].split() 108 | if len(fields) >= 2: 109 | file_system_type = fields[1] 110 | return file_system_type.lower() 111 | return "Unknown" 112 | except Exception as e: 113 | print(f"ERROR:{e}") 114 | return None 115 | 116 | def get_logger(log_file): 117 | # 定log输出格式,配置同时输出到标准输出与log文件,返回logger这个对象 118 | logger = logging.getLogger('mylogger') 119 | logger.setLevel(logging.DEBUG) 120 | log_format = logging.Formatter( 121 | '%(asctime)s - %(filename)s- %(levelname)s - %(message)s') 122 | log_fh = logging.FileHandler(log_file) 123 | log_fh.setLevel(logging.DEBUG) 124 | log_fh.setFormatter(log_format) 125 | log_ch = logging.StreamHandler() 126 | log_ch.setLevel(logging.DEBUG) 127 | log_ch.setFormatter(log_format) 128 | logger.addHandler(log_fh) 129 | logger.addHandler(log_ch) 130 | return logger 131 | 132 | 133 | def get_cpu_cores(): #获取cpu的核数,用于后面给允许几个线程算md5做参考 134 | try: 135 | # 使用os模块获取CPU核数 136 | num_cores = os.cpu_count() 137 | if num_cores is None: 138 | # 如果os.cpu_count()返回None,使用multiprocessing模块获取CPU核数 139 | num_cores = multiprocessing.cpu_count() 140 | return num_cores 141 | except Exception as e: 142 | return 1 #如果异常的话,也要给个数字1 143 | 144 | def get_cfg(cfg_file): #读取配置文件,获得系统配置 145 | cfg={} 146 | if os.path.isfile(cfg_file): 147 | ss=open(cfg_file,'r',encoding='utf8').read() 148 | cfg=json.loads(ss) 149 | else: 150 | cfg['dirs']=get_wechat_path() #尝试获得微信的存储目录,作为要被扫描的目录 151 | cfg['cache_file'] = os.path.join(app_path,'cache.dat') #cache文件存放位置,默认放在当前程序目录下,如果空间紧张,可以把它放别处 152 | cfg['md5_key_file']=os.path.join(app_path,'md5_key_files.dat') #以md5为key的hash dict,文件存放路径 153 | cfg['to_del_file'] = os.path.join(app_path, 'to_del_files.dat') # 以md5为key的hash dict,但是存放的只是要清理的文件信息,存放路径 154 | cfg['ask_before_del']=True #批量删除前,先进行确认下 155 | cfg['max_workers']=get_cpu_cores() #起多个线程计算md5 156 | open(cfg_file,'w',encoding='utf8').write(json.dumps(cfg,indent=2,ensure_ascii=True)) 157 | return cfg 158 | 159 | def md5_file(file_path): #计算文件的md5值 160 | with open(file_path, 'rb') as f: 161 | md5_hash = hashlib.md5() 162 | while True: 163 | data = f.read(8192) 164 | if not data: 165 | break 166 | md5_hash.update(data) 167 | md5_value = md5_hash.hexdigest() 168 | return md5_value 169 | 170 | def remove_unprintable_chars(input_str): #把不能打印出来的字符删掉,否则print屏幕的时候,直接报错中断 171 | # 使用可打印字符的Unicode范围过滤字符串 172 | printable_chars = [char for char in input_str if char.isprintable()] 173 | # 将过滤后的字符重新连接成一个字符串 174 | filtered_str = ''.join(printable_chars) 175 | return filtered_str 176 | 177 | def cmp_files(file1,file2): #通过查看文件的inode number,比较两个文件,是否是同一个文件 178 | file1_stat=os.stat(file1) 179 | file2_stat = os.stat(file2) 180 | if not file1_stat.st_size==file2_stat.st_size: #如果两个文件大小不等,直接判断不是一个文件 181 | return False 182 | if file1_stat.st_ino==file2_stat.st_ino: #如果前面大小想等,那么这里判断是不是inode number一样 183 | return True 184 | else: 185 | return False 186 | 187 | if __name__=='__main__': 188 | file1=r"D:\\wechat2\\WeChat Files\\wxid_cpyn7pe119rs21\\Applet\\wx0bc2c17d023b213d\\usrmmkvstorage0\\wx0bc2c17d023b213d.crc" 189 | file2=r"D:\\wechat2\\WeChat Files\\wxid_cpyn7pe119rs21\\Applet\\wxff2aab9aa679ef93\\usrmmkvstorage1\\wxff2aab9aa679ef93.crc" 190 | print(cmp_files(file1,file2)) --------------------------------------------------------------------------------