├── .gitignore
├── LICENSE
├── README.md
├── dest_files
│   └── .empty
├── main.go
└── source_files
    ├── a.bin
    ├── a1.bin
    ├── b.bin
    ├── c.bin
    └── c1.bin

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
apfs-compactor
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib

# Test binary, build with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ralph Caraveo III

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# apfs-compactor

This is a proof-of-concept userland tool for compacting files on Apple's new file system, APFS. *This code should be regarded as a prototype and only used on test files or data that you can easily reproduce from backups.*

APFS is applied only to those macOS systems that have upgraded to High Sierra and have an SSD as their boot drive. Additionally, older HFS+ drives can be manually converted to APFS using the built-in Disk Utility tool.

## Use cases

Suppose you have many potentially duplicate files dispersed through various folders and want to exploit APFS to save disk space. This tool will identify all the files that are duplicates and re-copy them, exploiting APFS's cloning features while preserving the original file names.

## Motivation

APFS has a few innovations around saving disk space: it implements copy-on-write and smarter metadata linking for file creation and copy operations. Currently, when a drive is converted from HFS+ to APFS, no deduplication is applied to existing files. The magic happens when files are copied: APFS is smart enough to understand when to make logical vs. physical copies of data.
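
You can observe this cloning behavior without this tool. The snippet below is an illustration only (not part of this repository, and the file names are placeholders); it shells out to the macOS `cp` command, whose `-c` flag requests a clone via `clonefile(2)`:

```go
package main

import (
	"log"
	"os/exec"
)

func main() {
	// -c asks cp to clone rather than physically copy; on an APFS volume
	// the clone shares data blocks with the original until either side
	// is modified (copy-on-write).
	if err := exec.Command("cp", "-c", "big.bin", "big-clone.bin").Run(); err != nil {
		log.Fatal(err)
	}
}
```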
Additionally, it's smart enough to implement a delta-copy strategy, much like a Git repository does, when differences are made to files.

The end result of this behavior can be considerable disk savings and a speed benefit during copy operations.

Again, no deduplication takes place by default except when attempts to copy data are made. This means that if you simply convert a drive from HFS+ to APFS, you likely won't notice much of a difference initially, because the conversion process will simply carry files over byte for byte from the source volume to the destination volume.

That is where this tool comes in: it effectively compacts a folder (recursively) by identifying duplicate files, first by file size (for speed), then by calculating a SHA-1 hash against only those files that share the same byte size.

Once these exact duplicates are identified, they'll be copied to a destination folder, and upon copying, APFS will intelligently recognize that they are duplicates and only write metadata. None of this magic happens within the Go source code; it is a feature of APFS that is being exploited.

Once the duplicate files are copied (with the original file names preserved; note that this prototype flattens everything into a single destination folder), the non-duplicate files will additionally be copied over.

Upon completion of this operation you should have two folders with identical contents at the file-system level, but thanks to APFS's smarts, disk space will have been used intelligently and the newly recreated destination folder will logically be the same as the source folder.

Lastly, after verification, the source folder can be destroyed, at which point APFS simply updates its metadata to ensure things are preserved as needed.

## Trying this out

1. Clone this repository onto a USB thumb drive that is HFS+ formatted.
2. Use Get Info on the thumb drive and make a note of how much disk space is used.
3. Once cloned, run Disk Utility to convert the thumb drive to APFS.
4. Run `go run main.go source_files` to start the compaction process.
5. Compare the `source_files` and `dest_files` folders.
6. Finally, delete the `source_files` folder.
7. Use Get Info on the thumb drive and observe that it has more free space than before the migration was applied. The end result is fewer overall bytes used: for the files identified as exact duplicates, APFS was smart enough to detect them and perform metadata writes instead of a real byte-for-byte copy.

# Notes

## Test Data

The files I was working with were generated with the following command, run with various settings:

```sh
# https://www.skorks.com/2010/03/how-to-quickly-generate-a-large-file-on-the-command-line-with-linux/
dd if=/dev/urandom of=file.txt bs=2048 count=10
```

These files are not anything special and are opaque to this application; in fact, this should work on any kind of file, whether it be video, audio, text, system files, etc.

## File timestamps

I didn't bother to preserve timestamps when the files are copied over to the dest directory, so whatever happens by default with this code is the case, although I'm sure this code could be updated to better preserve "stat"-based data.
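
A minimal sketch of what that could look like, assuming a hypothetical helper (`preserveTimes` is not part of main.go) called after each copy:

```go
package main

import "os"

// preserveTimes stamps the source file's modification time onto the
// destination after a copy. os.FileInfo does not portably expose access
// time, so ModTime is reused for both the atime and mtime arguments.
func preserveTimes(src, dst string) error {
	fi, err := os.Stat(src)
	if err != nil {
		return err
	}
	return os.Chtimes(dst, fi.ModTime(), fi.ModTime())
}
```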
--------------------------------------------------------------------------------
/dest_files/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deckarep/apfs-compactor/ce7ecc0aa59931c92aeb092b33a0436d725407cd/dest_files/.empty

--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
/*
Open Source Initiative OSI - The MIT License (MIT):Licensing
The MIT License (MIT)
Copyright (c) 2018 Ralph Caraveo (deckarep@gmail.com)
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

package main

import (
	"crypto/md5"
	"crypto/sha1"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path"
	"path/filepath"
	"strings"

	mapset "github.com/deckarep/golang-set"
)

const (
	destFolder = "dest_files"
)

var (
	// Only files with these extensions are considered for compaction.
	fileTypes = mapset.NewSetFromSlice([]interface{}{"bin"})
	// Every matching file encountered during the walk.
	allFilesSeen = mapset.NewSet()
	// Files bucketed by byte size; only same-size files get hashed.
	fileSizeCorpus = make(map[int64][]string)
	// Files bucketed by the SHA-1 hash of their contents.
	fileHashCorpus = make(map[string][]string)
)

func main() {
	flag.Parse()

	rootFolder := flag.Arg(0)
	// Walk the file-system for a given root folder.
	err := filepath.Walk(rootFolder, visit)
	if err != nil {
		log.Println("Walk failed with err: ", err)
	}

	// Identifies files first by byte size.
	// Then for matching files will hash them using sha1.
	fmt.Println("Identifying and compacting duplicates...")
	fmt.Println(strings.Repeat("-", 30))
	identifyDuplicates()

	// Every file that shared a byte size with another file lands in this
	// map; buckets holding more than one path are exact duplicates. Note
	// that copies are flattened into destFolder by base name, so files
	// with the same name in different folders would collide.
	for h, items := range fileHashCorpus {
		fmt.Printf("h: %s\n", h)
		for _, f := range items {
			fmt.Printf("\tRecreating file as clone: %s\n", f)
			err := copyFile(f, path.Join(destFolder, path.Base(f)))
			if err != nil {
				log.Fatalf("Couldn't copy file source: %s to dest: %s", f, path.Join(destFolder, path.Base(f)))
			}
		}
	}

	// Move over anything that wasn't a duplicate.
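	// Files whose byte size was unique never made it into fileHashCorpus,
	// so a second pass copies every walked file that doesn't already
	// exist in the destination folder.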
79 | fmt.Println("Moving over non-duplicates...") 80 | fmt.Println(strings.Repeat("-", 30)) 81 | copyNonExistingFiles(rootFolder, destFolder) 82 | } 83 | 84 | func copyNonExistingFiles(srcFolder, dstFolder string) error { 85 | allFilesSeen.Each(func(item interface{}) bool { 86 | f := item.(string) 87 | baseFile := path.Base(f) 88 | destFile := path.Join(dstFolder, baseFile) 89 | if _, err := os.Stat(destFile); os.IsNotExist(err) { 90 | fmt.Println("Copying non-duplicate file", destFile) 91 | err := copyFile(f, destFile) 92 | if err != nil { 93 | log.Fatalf("Coudn't copy non-duplicate file: %s", f) 94 | } 95 | } 96 | return false 97 | }) 98 | return nil 99 | } 100 | 101 | func copyFile(src, dst string) error { 102 | in, err := os.Open(src) 103 | if err != nil { 104 | return err 105 | } 106 | defer in.Close() 107 | 108 | out, err := os.Create(dst) 109 | if err != nil { 110 | return err 111 | } 112 | defer out.Close() 113 | 114 | _, err = io.Copy(out, in) 115 | if err != nil { 116 | return err 117 | } 118 | return out.Close() 119 | } 120 | 121 | func identifyDuplicates() { 122 | for _, v := range fileSizeCorpus { 123 | if len(v) > 1 { 124 | for _, path := range v { 125 | h := hashFile(path) 126 | item, ok := fileHashCorpus[h] 127 | if ok { 128 | item = append(item, path) 129 | fileHashCorpus[h] = item 130 | } else { 131 | fileHashCorpus[h] = []string{path} 132 | } 133 | } 134 | } 135 | } 136 | } 137 | 138 | func visit(path string, f os.FileInfo, err error) error { 139 | if !f.IsDir() { 140 | pieces := strings.Split(path, ".") 141 | ext := pieces[len(pieces)-1] 142 | 143 | if fileTypes.Contains(ext) { 144 | allFilesSeen.Add(path) 145 | 146 | fi, err := os.Open(path) 147 | if err != nil { 148 | panic("Could open file!") 149 | } 150 | 151 | defer fi.Close() 152 | 153 | in, err := fi.Stat() 154 | if err != nil { 155 | panic("Could get file info!") 156 | } 157 | 158 | item, ok := fileSizeCorpus[in.Size()] 159 | if ok { 160 | item = append(item, path) 161 | fileSizeCorpus[in.Size()] = item 162 | } else { 163 | fileSizeCorpus[in.Size()] = []string{path} 164 | } 165 | } 166 | } 167 | return nil 168 | } 169 | 170 | func hashString(s string) string { 171 | h := md5.New() 172 | h.Write([]byte(s)) 173 | return fmt.Sprintf("%x", h.Sum(nil)) 174 | } 175 | 176 | func hashFile(path string) string { 177 | h := sha1.New() 178 | f, err := os.Open(path) 179 | if err != nil { 180 | log.Fatal("Couldn't open file: ", err) 181 | } 182 | defer f.Close() 183 | 184 | _, err = io.Copy(h, f) 185 | if err != nil { 186 | log.Fatal("Couldn't copy file bytes over to hash: ", err) 187 | } 188 | return fmt.Sprintf("%x", h.Sum(nil)) 189 | } 190 | -------------------------------------------------------------------------------- /source_files/b.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source_files/c.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source_files/c1.bin: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------