├── README.md ├── clean ├── .DS_Store ├── duplicates │ ├── entries.go │ ├── entries_test.go │ ├── index.go │ └── index_test.go ├── main.go ├── testdata │ ├── .DS_Store │ ├── copy.txt │ └── text.txt └── utils │ ├── utils.go │ └── utils_test.go └── spaghet └── main.go /README.md: -------------------------------------------------------------------------------- 1 | # Cleaning Code in Go 2 | 3 | ## Introduction 4 | So, this article is a little different from my others. Instead of focusing on a specific product, or solving a specific problem, we will be looking at something a little more abstract. This article will be focusing on writing "clean code" with golang. The article will be starting with a short introduction as to what is defined by "clean code" and then, we will move onto a practical example, in which we refactor an example application, into a cleaner version. You can find all code for this article at https://github.com/Pungyeon/clean-go 5 | 6 | ## What is "Clean Code" 7 | The idea of clean code, is not something that is particularly rigid in definition. In my opinion, the closest thing to a de facto standard, are the books produced by Robert C. Martin (also known as "Uncle Bob"), who has written the "Clean Code" series, as well as having produced an excellent and extensive video series on the topic. 8 | 9 | However, I will attempt to give a brief summary of what I believe to be clean code: 10 | 11 | 1. Easy to read code 12 | - Clean code is easy to read. In fact, it should be almost as easy to read as prose. If there is a need for comments or the like, the code most likely isn't clean. Its intentions should be very clear, just from skimming the code. 13 | 2. Independent of rest of code base 14 | - Clean code ensures that if code changes in one part of the codebase, the rest of the codebase is essentially unaffected. In other words, code is segregated into functionality silos, independent of the rest of the code base. 15 | 3. 
Testable 16 | - If code is not testable, we can be very sure that it's not clean. Of course, *all code* should be tested; this is not necessarily something that is strictly related to clean code. Making code testable, however, is a big aspect of clean code. 17 | 18 | There are many other additions to these sentiments. Code shouldn't be duplicated, functions shouldn't be very long etc. However, we will cover this later. These three rules are, in my opinion, the most important aspects to writing clean code. 19 | 20 | Whereas most aspects of clean code make sense and seem extremely intuitive, there are also some counterintuitive aspects of clean code. Writing clean code can potentially produce more lines of code than dirty code (also referred to as smelly or spaghetti code). It's therefore very important to recognize, that writing clean code is not making the code "fat free" exclusively. The main goal of writing clean code is to make future development of code easier, and to reduce / eliminate the introduction of bugs to applications. 21 | 22 | > NOTE: In this article, I will not be writing tests along with the refactoring. Writing tests before refactoring (and before developing for that matter), is extremely important when writing clean code. However, I typically find that explaining TDD in text, rather than in video is not enjoyable for the writer, nor the reader. However, please please please, write tests when refactoring, to ensure that your refactoring is not destroying your code. I have provided some test examples in the source code for this article. 23 | 24 | ## Our Application 25 | So, let's get right to it. I made a simple program, which traverses a file system and returns a list of duplicate files, based on their file contents. The way we are doing this is by reading the file and hashing the contents as a `sha1` string, which is stored in a hash table, and then comparing the files on each traversal iteration. 
26 | 27 | ### Sphagetti Code 28 | This is my first iteration of the program. Which was written pretty fast, without the consideration of anyone else going reading the code: 29 | 30 | ```go 31 | package main 32 | 33 | import ( 34 | "crypto/sha1" 35 | "flag" 36 | "fmt" 37 | "io/ioutil" 38 | "os" 39 | "path" 40 | "strconv" 41 | "sync/atomic" 42 | ) 43 | 44 | func traverseDir(hashes, duplicates map[string]string, dupeSize *int64, entries []os.FileInfo, directory string) { 45 | for _, entry := range entries { 46 | fullpath := (path.Join(directory, entry.Name())) 47 | 48 | if !entry.Mode().IsDir() && !entry.Mode().IsRegular() { 49 | continue 50 | } 51 | 52 | if entry.IsDir() { 53 | dirFiles, err := ioutil.ReadDir(fullpath) 54 | if err != nil { 55 | panic(err) 56 | } 57 | traverseDir(hashes, duplicates, dupeSize, dirFiles, fullpath) 58 | continue 59 | } 60 | file, err := ioutil.ReadFile(fullpath) 61 | if err != nil { 62 | panic(err) 63 | } 64 | hash := sha1.New() 65 | if _, err := hash.Write(file); err != nil { 66 | panic(err) 67 | } 68 | hashSum := hash.Sum(nil) 69 | hashString := fmt.Sprintf("%x", hashSum) 70 | if hashEntry, ok := hashes[hashString]; ok { 71 | duplicates[hashEntry] = fullpath 72 | atomic.AddInt64(dupeSize, entry.Size()) 73 | } else { 74 | hashes[hashString] = fullpath 75 | } 76 | } 77 | } 78 | 79 | func toReadableSize(nbytes int64) string { 80 | if nbytes > 1000*1000*1000*1000 { 81 | return strconv.FormatInt(nbytes/(1000*1000*1000*1000), 10) + " TB" 82 | } 83 | if nbytes > 1000*1000*1000 { 84 | return strconv.FormatInt(nbytes/(1000*1000*1000), 10) + " GB" 85 | } 86 | if nbytes > 1000*1000 { 87 | return strconv.FormatInt(nbytes/(1000*1000), 10) + " MB" 88 | } 89 | if nbytes > 1000 { 90 | return strconv.FormatInt(nbytes/1000, 10) + " KB" 91 | } 92 | return strconv.FormatInt(nbytes, 10) + " B" 93 | } 94 | 95 | func main() { 96 | var err error 97 | dir := flag.String("path", "", "the path to traverse searching for duplicates") 98 | flag.Parse() 99 | 100 | 
if *dir == "" { 101 | *dir, err = os.Getwd() 102 | if err != nil { 103 | panic(err) 104 | } 105 | } 106 | 107 | hashes := map[string]string{} 108 | duplicates := map[string]string{} 109 | var dupeSize int64 110 | 111 | entries, err := ioutil.ReadDir(*dir) 112 | if err != nil { 113 | panic(err) 114 | } 115 | 116 | traverseDir(hashes, duplicates, &dupeSize, entries, *dir) 117 | 118 | fmt.Println("DUPLICATES") 119 | for key, val := range duplicates { 120 | fmt.Printf("key: %s, val: %s\n", key, val) 121 | } 122 | fmt.Println("TOTAL FILES:", len(hashes)) 123 | fmt.Println("DUPLICATES:", len(duplicates)) 124 | fmt.Println("TOTAL DUPLICATE SIZE:", toReadableSize(dupeSize)) 125 | } 126 | 127 | // running into problems of not being able to open directories inside .app folders 128 | ``` 129 | 130 | Going through the code via. the `main` method, we are parsing an input parameter `path`, and using this to read files from a directory. These files will be sent to the function `traverseDir`, in which we are also parsing two hash `map` objects `hashes` (all file hashes) and `duplicates` (all duplicate file hashes). Lastly, we are also inputting the `dupeSize` parameter, which will indicate the cummultative file size of our duplicate files. 131 | 132 | Finally, we print out our results in a 'human readable' format. Instead of presenting our results as byte count, we will convert them to the appropriate size unit (KB, MB, GB etc.). 133 | 134 | ## Refactoring 135 | 136 | ### Refactoring `toReadableSize` 137 | 138 | First, we are going to be picking the low-hanging-fruits. The function `toReadableSize` looks pretty ugly. Firstly, we are using multiples of `1000`. For everyone who knows what this number represents, it makes sense, however, for anyone reading the code for the first time, this number is rather ambiguous. Therefore, we will establish some global constants for the different values of the sizes that we are returning (GB, MB etc.). 
We use this when determining the readable size of `nbytes`, and change the if statement blocks into switch statements. As you might have noticed, we are only returning integers, where it would make more sense to return floats: 139 | 140 | ```go 141 | const ( 142 | TB = GB * 1000.0 143 | GB = MB * 1000.0 144 | MB = KB * 1000.0 145 | KB = 1000.0 146 | ) 147 | 148 | 149 | func ToReadableSize(nbytes int64) string { 150 | switch { 151 | case nbytes > TB: 152 | return strconv.FormatFloat(float64(nbytes)/TB, 'f', 2, 64) + " TB" 153 | case nbytes > GB: 154 | return strconv.FormatFloat(float64(nbytes)/GB, 'f', 2, 64) + " GB" 155 | case nbytes > MB: 156 | return strconv.FormatFloat(float64(nbytes)/MB, 'f', 2, 64) + " MB" 157 | case nbytes > KB: 158 | return strconv.FormatFloat(float64(nbytes)/KB, 'f', 2, 64) + " KB" 159 | } 160 | return strconv.FormatFloat(float64(nbytes), 'f', 2, 64) + " B" 161 | } 162 | ``` 163 | 164 | However, this is still very ugly and just as (if not more unreadable) than before. There is a lot of code duplication here, which we should get rid of. So let's make our own `toFloatString` function: 165 | 166 | ```go 167 | func toFloatString(nbytes int64, divider float64) string { 168 | return strconv.FormatFloat(float64(nbytes)/divider, 'f', 2, 64) 169 | } 170 | 171 | func ToReadableSize(nbytes int64) string { 172 | switch { 173 | case nbytes > TB: 174 | return toFloatString(nbytes, TB) + " TB" 175 | case nbytes > GB: 176 | return toFloatString(nbytes, GB) + " GB" 177 | case nbytes > MB: 178 | return toFloatString(nbytes, MB) + " MB" 179 | case nbytes > KB: 180 | return toFloatString(nbytes, KB) + " KB" 181 | } 182 | return strconv.FormatInt(nbytes, 10) + " B" 183 | } 184 | ``` 185 | 186 | Now, our function is nice and readable again. This refactor obviously isn't game changing, but it's a good example to start off with. The intention of this function is now much clearer, with very little effort. 
187 | 188 | ### Refactoring `traverseDir` 189 | 190 | Ok, let's go to the more interesting function, `traverseDir`. Why do we want to refactor this function? A good way to think about this, is to think of how you would describe this function in pseudo code and then compare it to your actual code. I'm thinking that this function could be reduced to the following pseudo-code. 191 | 192 | ``` 193 | traverseDir: 194 | for each entry in directory: 195 | if dir: 196 | return traverseDir 197 | if file: 198 | check file is duplicate 199 | ``` 200 | 201 | That is a lot less lines than what we have now... and definitely more readable than what we have now. Pseudo code is a pretty good way to establish a 'goal' for what your clean code should look like. At the very least, you should aim to make your actual code as readable as pseudo code. We can do this, by moving code into functions with descriptive names. This however, is an iterative process. We will start small and bit by bit, we will find a solution as to how to make our code simple and readable. 202 | 203 | So let's look for code, which we can move out of this function... Something that is nice about golang's, otherwise very criticised, error handling system, is that it's quite easy to spot when there is potential for refactoring. Whenever you see two `if err != nil` statements in the same function, you know you can split this out to a single function. In our case, this: 204 | 205 | ```go 206 | func traverseDir(...) 207 | ... 208 | file, err := ioutil.ReadFile(fullpath) 209 | if err != nil { 210 | panic(err) 211 | } 212 | hash := sha1.New() 213 | if _, err := hash.Write(file); err != nil { 214 | panic(err) 215 | } 216 | hashSum := hash.Sum(nil) 217 | hashString := fmt.Sprintf("%x", hashSum) 218 | ... 219 | } 220 | ``` 221 | 222 | Can be refactored to the following: 223 | 224 | ```go 225 | func traverseDir(...) { 226 | ... 227 | hash, err := newFileHash(fullpath) 228 | if err != nil { 229 | panic(err) 230 | } 231 | ... 
232 | } 233 | 234 | func newFileHash(path string) (string, error) { 235 | file, err := ioutil.ReadFile(path) 236 | if err != nil { 237 | return "", err 238 | } 239 | hash := sha1.New() 240 | if _, err := hash.Write(file); err != nil { 241 | return "", err 242 | } 243 | hashSum := hash.Sum(nil) 244 | return fmt.Sprintf("%x", hashSum), nil 245 | } 246 | ``` 247 | 248 | The justification behind this, is that when reading our `traverseDir` we aren't immediately concerned with how we are creating a new file hash (sum). We just need to know that we are creating a new file hash. If we want to dig into the details of this, then we can by looking at our `newFileHash`. In other words, we are removing unecessary clutter from the function, improving readability. 249 | 250 | Looking for more low-hanging fruites, we are still panicking in the case of an error, this is pretty dirty, so let's clean it up a little, by making `traverseDir` return an `error`, by adding `error` to the end of the function delcaration and replacing `panic(err)` with `return err`: 251 | 252 | ```go 253 | func traverseDir(hashes, duplicates map[string]string, dupeSize *int64, entries []os.FileInfo, directory string) error { 254 | ... 255 | return err 256 | ... 257 | return nil 258 | } 259 | ``` 260 | 261 | Now, looking at the function signature, we can see that it's a bit... long? We are expecting five input parameters. Not only does this make our function signature super long, it can also makes it very confusing to read on invokation. Consider the following code (taken from the golang rabbitmq tutorial): 262 | 263 | ```go 264 | q, err := ch.QueueDeclare("hello",false,false,false,false,nil) 265 | ``` 266 | There is absolutely no chance of understanding what this means. We know that we are declaring a queue, but all the boolean inputs... well, they could be anything? So, we have to either look at the source code or look at the documentation. 
This is tedious and slows down development speed and increases the risk of mistakes. A good rule of thumb is to have two input parameters (three at most), to try to avoid this type of confusion. 267 | 268 | Generally, if there are more input parameters it is recommended to extract a type (creating a new type, which will be used as the input parameters). As an example: 269 | 270 | ```go 271 | type QueueOptions struct { 272 | Name string 273 | Durable bool 274 | DeleteWhenUsed bool 275 | Exclusive bool 276 | NoWait bool 277 | Arguments interface{} 278 | } 279 | ``` 280 | 281 | Now, our declaration of our queue, could look something like the following: 282 | 283 | ```go 284 | q, err := ch.NewQueue(QueueOptions{ 285 | Name: "hello", 286 | Durable: true, 287 | DeleteWhenUsed: false, 288 | Exclusive: false, 289 | NoWait: false, 290 | Arguments: nil, 291 | }) 292 | ``` 293 | 294 | Now there is, at the very least, less confusion as to what kind of queue that we are declaring. We can very easily identify that our queue name is `hello` and is a `durable` queue. Another way to go about this, is to create a wrapper function, which explains the type of queue we are creating. This is preferable, when you have no control over the code, such as when using a library: 295 | 296 | ```go 297 | func DeclareDurableQueue() (ch.Queue, error) { 298 | return ch.QueueDeclare("hello", true, false, false, false, nil) 299 | } 300 | ``` 301 | 302 | So, how do we go about solving this issue for our `traverseDir`? We need all the values, which is why we are passing them to the function. However, when we see this kind of pattern, it's usually a sign, that we need to extract a `type`. 
So, let's make a new `type`, which holds the paremeters that we need: 303 | 304 | Here we create `DuplicateIndex` for keeping track of our hashes and duplicates: 305 | 306 | ```go 307 | type DuplicateIndex struct { 308 | hashes map[string]string 309 | duplicates map[string]string 310 | dupeSize int64 311 | } 312 | 313 | func NewDuplicateIndex() *DuplicateIndex { 314 | return &DuplicateIndex{ 315 | hashes: map[string]string{}, 316 | duplicates: map[string]string{}, 317 | } 318 | } 319 | 320 | func (index *DuplicateIndex) AddEntry(hash, path string, size int64) { 321 | if entry, ok := index.hashes[hash]; ok { 322 | index.duplicates[entry] = path 323 | index.dupeSize += size 324 | return 325 | } 326 | index.hashes[hash] = path 327 | } 328 | ``` 329 | 330 | With this, we can actually replace `hashes`, `duplicates` and `dupeSize` from our function parameters and also replace our insert of the hash: 331 | 332 | ```go 333 | func traverseDir(index *DuplicateIndex, entries []os.FileInfo, directory string) { 334 | ... 335 | index.AddEntry(hash, fullpath, entry.Size()) 336 | ... 337 | } 338 | ``` 339 | 340 | But ah! We can actually make this a method on the `DuplicateIndex` `type` and that way, we now only have two input parameters :clap: We can also move the reading of the directory out of the for loop, and just accept a single parameter `path`. 
So now our method looks like this: 341 | 342 | ```go 343 | func (index *DuplicateIndex) TraverseDirRecursively(directory string) error { 344 | entries, err := ioutil.ReadDir(directory) 345 | if err != nil { 346 | return err 347 | } 348 | for _, entry := range entries { 349 | fullpath := (path.Join(directory, entry.Name())) 350 | 351 | if entry.IsDir() { 352 | index.TraverseDirRecursively(fullpath) 353 | continue 354 | } 355 | if !entry.Mode().IsRegular() { 356 | continue 357 | } 358 | 359 | hash, err := newFileHash(fullpath) 360 | if err != nil { 361 | return err 362 | } 363 | index.AddEntry(hash, fullpath, entry.Size()) 364 | } 365 | return nil 366 | } 367 | ``` 368 | 369 | > Notice that we have also renamed our function name for clarity 370 | 371 | Now we are almost happy. However, the for lopp in `TraverseDirRecursively` is still smelling a little... what should we do? Well, one way we can get rid of the code smell, is to get rid of the `if` statements inside, by creating an `interface` together with a factory-like constructor. This means we will return the appropriate type determined by the input of the constructor. This returned type will implement an `interface`, which implements a single function: `Handle`. This function will perform the appropriate action associated with the type. Let's see what this looks like in action. 372 | 373 | We will need quite a lot of code, but please don't let that scare you off! 
374 | 375 | ```go 376 | type EntryHandler interface { 377 | Handle(*DuplicateIndex) error 378 | } 379 | 380 | type DirEntry struct { 381 | fullpath string 382 | } 383 | 384 | type FileEntry struct { 385 | fullpath string 386 | size int64 387 | } 388 | 389 | type NilEntry struct{} 390 | 391 | func NewEntryHandler(entry os.FileInfo, directory string) EntryHandler { 392 | fullpath := path.Join(directory, entry.Name()) 393 | if entry.Mode().IsDir() { 394 | return &DirEntry{fullpath} 395 | } 396 | if entry.Mode().IsRegular() { 397 | return &FileEntry{fullpath, entry.Size()} 398 | } 399 | return &NilEntry{} 400 | } 401 | 402 | func (entry *DirEntry) Handle(index *DuplicateIndex) error { 403 | return index.TraverseDirRecursively(entry.fullpath) 404 | } 405 | 406 | func (entry *FileEntry) Handle(index *DuplicateIndex) error { 407 | hash, err := newFileHash(entry.fullpath) 408 | if err != nil { 409 | return err 410 | } 411 | index.AddEntry(hash, entry.fullpath, entry.size) 412 | return nil 413 | } 414 | 415 | func (entry *NilEntry) Handle(index *DuplicateIndex) error { 416 | return nil 417 | } 418 | ``` 419 | 420 | With these types, our `TraverseDirRecursively` can now be refactored to the following: 421 | 422 | ```go 423 | func (index *DuplicateIndex) TraverseDirRecursively(directory string) error { 424 | entries, err := ioutil.ReadDir(directory) 425 | if err != nil { 426 | return err 427 | } 428 | for _, entry := range entries { 429 | if err := NewEntryHandler(entry, directory).Handle(index); err != nil { 430 | return err 431 | } 432 | } 433 | return nil 434 | } 435 | ``` 436 | 437 | It may seem extensive, to add almost 40 lines of code just to remove 14. However, there is a reason behind the madness. When looking at the `TraverseDirRecursively` function, it is now only 9 lines of code. This is very easily digestable by the brain, whereas 23 lines might be hard to contain at first. 
The big gain though, is that we are isolating code, we can test all of our functions very easily and understand exactly what they do, with very little effort. Another great advantage of this isolation, is that we are also making our `TraverseDirRecursively` more dynamic. If we find out, that there is a new type of entry that we need to handle (Shortcut for example), we can just add a new type implementing `EntryHandler` and add it to our mini-factory `NewEntryHandler`. We are now only changing the logic of `NewEntryHandler` as every other code addition, is completely separate. The obvious advantage of this is, it makes it easier to implement new code, without it breaking the rest of our code. We like this :thumbs_up: 438 | 439 | We can also attach the `newFileHash` function to our `FileEntry` `type` and remove the `path` input parameter. We can also rename this function. Since it's attached to our `FileEntry`, there is no need to specify that we are creating a `FileHash`: 440 | 441 | ```go 442 | func (entry *FileEntry) newHash() (string, error) { 443 | file, err := ioutil.ReadFile(entry.fullpath) 444 | if err != nil { 445 | return "", err 446 | } 447 | hash := sha1.New() 448 | if _, err := hash.Write(file); err != nil { 449 | return "", err 450 | } 451 | return fmt.Sprintf("%x", hash.Sum(nil)), nil 452 | } 453 | ``` 454 | 455 | ### Refactoring `main` 456 | 457 | So, now we are pretty much all refactored on the functions of the program. All we need to do now, is to refactor the main function. 458 | 459 | Firstly, I don't like the `var err error` that has to go! Whenever we see this, it's a sign that we are doing something wrong (in my opinion :)) Normally, this indicates that we should move our code into a new function, but in this case, we can actually just move the logic around a little... 
460 | 461 | The last thing we are going to do, is that we will create a `Result()` function on the `DuplicateIndex`, which will return a similar string to what we are printing now: 462 | 463 | ```go 464 | func (index *DuplicateIndex) Result() string { 465 | buf := &bytes.Buffer{} 466 | buf.WriteString("DUPLICATES\n") 467 | for key, val := range index.duplicates { 468 | buf.WriteString( 469 | fmt.Sprintf("key: %s, val: %s\n", key, val), 470 | ) 471 | } 472 | buf.WriteString(fmt.Sprintln("TOTAL FILES:", len(index.hashes))) 473 | buf.WriteString(fmt.Sprintln("DUPLICATES:", len(index.duplicates))) 474 | buf.WriteString(fmt.Sprintln("TOTAL DUPLICATE SIZE:", toReadableSize(index.dupeSize))) 475 | return buf.String() 476 | } 477 | ``` 478 | 479 | this makes our final `main` function, look like this: 480 | 481 | ```go 482 | func main() { 483 | defaultPath, err := os.Getwd() 484 | if err != nil { 485 | panic(err) 486 | } 487 | 488 | dir := flag.String("path", defaultPath, "the path to traverse searching for duplicates") 489 | flag.Parse() 490 | 491 | index := NewDuplicateIndex() 492 | if err := index.TraverseDirRecursively(*dir); err != nil { 493 | panic(err) 494 | } 495 | 496 | fmt.Println(index.Result()) 497 | } 498 | ``` 499 | 500 | We could do some more refactoring, but for this short article, I think this is a good point to stop. Of course, in actual code, we would separate these functions into packages, to separate / isolate the responsibility of the code. However, again for the brevity of this article, I have decided to omit this refactoring step. You can, however, see how I decided to do this in the source code. 501 | 502 | Now, let's sum up the result of our code refactor: 503 | * Our code is now easy to implement for other developers. 504 | * It's much easier to read than before. We can skim the code to begin with, and then go into detail on the parts that we wish to. There is less ambiguous / vague code, making everything generally easier to comprehend. 
505 | * Our code is super easy to test. This makes further development a lot easier and decreases the chances for bugs, for this very reason. 506 | 507 | As mentioned to begin with 'clean code' is not necessarily super well defined and sometimes comes down to subjective opinion on what is 'more readable' or 'nicer looking'. However, I hope this article gave some insight as to why it's important to refactor your code, as well as how easy it actually is! 508 | 509 | Let me know if you have any feedback or questions on this articles content, by sending me an e-mail at lasse@jakobsen.dev thanks! :wave: 510 | -------------------------------------------------------------------------------- /clean/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pungyeon/clean-go/7b1861fa92fc9cd84f5e4c649f1b71190ce1f14e/clean/.DS_Store -------------------------------------------------------------------------------- /clean/duplicates/entries.go: -------------------------------------------------------------------------------- 1 | package duplicates 2 | 3 | import ( 4 | "crypto/sha1" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "path" 9 | ) 10 | 11 | type EntryHandler interface { 12 | Handle(*DuplicateIndex) error 13 | } 14 | 15 | type DirEntry struct { 16 | fullpath string 17 | } 18 | 19 | type FileEntry struct { 20 | fullpath string 21 | size int64 22 | } 23 | 24 | type NilEntry struct{} 25 | 26 | func NewEntryHandler(entry os.FileInfo, directory string) EntryHandler { 27 | fullpath := path.Join(directory, entry.Name()) 28 | if entry.Mode().IsDir() { 29 | return &DirEntry{fullpath} 30 | } 31 | if entry.Mode().IsRegular() { 32 | return &FileEntry{fullpath, entry.Size()} 33 | } 34 | return &NilEntry{} 35 | } 36 | 37 | func (entry *DirEntry) Handle(index *DuplicateIndex) error { 38 | return index.TraverseDirRecursively(entry.fullpath) 39 | } 40 | 41 | func (entry *FileEntry) Handle(index *DuplicateIndex) error { 42 | hash, 
err := entry.newHash() 43 | if err != nil { 44 | return err 45 | } 46 | index.AddEntry(hash, entry.fullpath, entry.size) 47 | return nil 48 | } 49 | 50 | func (entry *FileEntry) newHash() (string, error) { 51 | file, err := ioutil.ReadFile(entry.fullpath) 52 | if err != nil { 53 | return "", err 54 | } 55 | hash := sha1.New() 56 | if _, err := hash.Write(file); err != nil { 57 | return "", err 58 | } 59 | return fmt.Sprintf("%x", hash.Sum(nil)), nil 60 | } 61 | 62 | func (entry *NilEntry) Handle(index *DuplicateIndex) error { 63 | return nil 64 | } 65 | -------------------------------------------------------------------------------- /clean/duplicates/entries_test.go: -------------------------------------------------------------------------------- 1 | package duplicates 2 | 3 | import ( 4 | "io/ioutil" 5 | "testing" 6 | ) 7 | 8 | const ( 9 | filehash = "2123251bdbfbb162fcd77b74f4954726461e8093" 10 | ) 11 | 12 | func TestFileEntry(t *testing.T) { 13 | tt := []struct { 14 | name string 15 | fullpath string 16 | size int64 17 | expectError bool 18 | }{ 19 | {"handle existing file", "../testdata/text.txt", 100, false}, 20 | {"handle non existing file", "../testdata/does_not_exist.txt", 100, true}, 21 | } 22 | 23 | for _, tc := range tt { 24 | t.Run(tc.name, func(t *testing.T) { 25 | fileEntry := FileEntry{ 26 | fullpath: tc.fullpath, 27 | size: tc.size, 28 | } 29 | err := fileEntry.Handle(NewDuplicateIndex()) 30 | if err != nil && tc.expectError == false { 31 | t.Errorf("expected error: %v, actual error: %v", err, tc.expectError) 32 | } 33 | }) 34 | } 35 | } 36 | 37 | func TestFileHash(t *testing.T) { 38 | fileEntry := FileEntry{ 39 | fullpath: "../testdata/text.txt", 40 | size: 100, 41 | } 42 | 43 | hash, err := fileEntry.newHash() 44 | if err != nil { 45 | t.Error(err) 46 | } 47 | 48 | if hash != filehash { 49 | t.Error(hash) 50 | } 51 | } 52 | 53 | func TestNilEntry(t *testing.T) { 54 | nilEntry := NilEntry{} 55 | 56 | result := nilEntry.Handle(&DuplicateIndex{}) 57 
| if result != nil { 58 | t.Error("nil entry should always return nil on handle, but instead return: " + result.Error()) 59 | } 60 | } 61 | 62 | func TestEntryHandlers(t *testing.T) { 63 | entries, err := ioutil.ReadDir("../testdata") 64 | if err != nil { 65 | t.Fatal(err) 66 | } 67 | index := NewDuplicateIndex() 68 | 69 | for _, entry := range entries { 70 | if err := NewEntryHandler(entry, "../testdata").Handle(index); err != nil { 71 | t.Error(err) 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /clean/duplicates/index.go: -------------------------------------------------------------------------------- 1 | package duplicates 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io/ioutil" 7 | 8 | "github.com/Pungyeon/clean-go-code/clean/utils" 9 | ) 10 | 11 | type DuplicateIndex struct { 12 | hashes map[string]string 13 | duplicates map[string]string 14 | dupeSize int64 15 | } 16 | 17 | func NewDuplicateIndex() *DuplicateIndex { 18 | return &DuplicateIndex{ 19 | hashes: map[string]string{}, 20 | duplicates: map[string]string{}, 21 | } 22 | } 23 | 24 | func (index *DuplicateIndex) AddEntry(hash, path string, size int64) { 25 | if entry, ok := index.hashes[hash]; ok { 26 | index.duplicates[entry] = path 27 | index.dupeSize += size 28 | return 29 | } 30 | index.hashes[hash] = path 31 | } 32 | 33 | func (index *DuplicateIndex) TraverseDirRecursively(directory string) error { 34 | entries, err := ioutil.ReadDir(directory) 35 | if err != nil { 36 | return err 37 | } 38 | for _, entry := range entries { 39 | if err := NewEntryHandler(entry, directory).Handle(index); err != nil { 40 | return err 41 | } 42 | } 43 | return nil 44 | } 45 | 46 | func (index *DuplicateIndex) Result() string { 47 | buf := &bytes.Buffer{} 48 | buf.WriteString("DUPLICATES\n") 49 | for key, val := range index.duplicates { 50 | buf.WriteString( 51 | fmt.Sprintf("key: %s, val: %s\n", key, val), 52 | ) 53 | } 54 | buf.WriteString(fmt.Sprintln("TOTAL 
FILES:", len(index.hashes))) 55 | buf.WriteString(fmt.Sprintln("DUPLICATES:", len(index.duplicates))) 56 | buf.WriteString(fmt.Sprintln("TOTAL DUPLICATE SIZE:", utils.ToReadableSize(index.dupeSize))) 57 | return buf.String() 58 | } 59 | -------------------------------------------------------------------------------- /clean/duplicates/index_test.go: -------------------------------------------------------------------------------- 1 | package duplicates 2 | 3 | import "testing" 4 | 5 | const ( 6 | result = `DUPLICATES 7 | key: ../testdata/copy.txt, val: ../testdata/text.txt 8 | TOTAL FILES: 2 9 | DUPLICATES: 1 10 | TOTAL DUPLICATE SIZE: 41 B 11 | ` 12 | ) 13 | 14 | func TestTraverseDir(t *testing.T) { 15 | tt := []struct { 16 | name string 17 | directory string 18 | expectError bool 19 | }{ 20 | {"traverse existing directory", "../testdata", false}, 21 | {"traverse non-existing directory", "../does_not_exist", true}, 22 | } 23 | 24 | for _, tc := range tt { 25 | t.Run(tc.name, func(t *testing.T) { 26 | err := NewDuplicateIndex().TraverseDirRecursively(tc.directory) 27 | if err != nil && tc.expectError == false { 28 | t.Error(err) 29 | } 30 | }) 31 | } 32 | } 33 | 34 | func TestTraverseDirResult(t *testing.T) { 35 | index := NewDuplicateIndex() 36 | if err := index.TraverseDirRecursively("../testdata"); err != nil { 37 | t.Error(err) 38 | } 39 | if index.Result() != result { 40 | t.Error("unexpected result") 41 | t.Error(index.Result()) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /clean/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/Pungyeon/clean-go-code/clean/duplicates" 9 | ) 10 | 11 | func main() { 12 | defaultPath, err := os.Getwd() 13 | if err != nil { 14 | panic(err) 15 | } 16 | 17 | dir := flag.String("path", defaultPath, "the path to traverse searching for duplicates") 18 | 
// Decimal (SI) byte-size units. They are untyped floating-point constants so
// the same names serve both as comparison thresholds against an int64 byte
// count and as float64 dividers when formatting.
const (
	TB = GB * 1000.0
	GB = MB * 1000.0
	MB = KB * 1000.0
	KB = 1000.0
)

// toFloatString divides nbytes by divider and formats the quotient in
// fixed-point notation with exactly two decimal places.
func toFloatString(nbytes int64, divider float64) string {
	return strconv.FormatFloat(float64(nbytes)/divider, 'f', 2, 64)
}

// ToReadableSize renders a byte count using the largest decimal unit
// (KB, MB, GB, TB) that the count reaches, with two decimal places.
// Counts below one kilobyte, including zero and negative values, are
// printed as a plain integer followed by " B".
//
// The thresholds are inclusive: an exact multiple of a unit is reported in
// that unit (1000 -> "1.00 KB") instead of overflowing the smaller one
// ("1000 B").
func ToReadableSize(nbytes int64) string {
	switch {
	case nbytes >= TB:
		return toFloatString(nbytes, TB) + " TB"
	case nbytes >= GB:
		return toFloatString(nbytes, GB) + " GB"
	case nbytes >= MB:
		return toFloatString(nbytes, MB) + " MB"
	case nbytes >= KB:
		return toFloatString(nbytes, KB) + " KB"
	}
	return strconv.FormatInt(nbytes, 10) + " B"
}
-------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import "testing" 4 | 5 | func TestToReadableSize(t *testing.T) { 6 | tt := []struct { 7 | name string 8 | input int64 9 | expected string 10 | }{ 11 | {"byte return", 125, "125 B"}, 12 | {"kilobyte return", 1010, "1.01 KB"}, 13 | {"megabyte return", 1988909, "1.99 MB"}, 14 | {"gigabyte return", 29121988909, "29.12 GB"}, 15 | {"gigabyte return", 890929121988909, "890.93 TB"}, 16 | } 17 | 18 | for _, tc := range tt { 19 | t.Run(tc.name, func(t *testing.T) { 20 | output := ToReadableSize(tc.input) 21 | if output != tc.expected { 22 | t.Errorf("input %d, unexpected output: %s", tc.input, output) 23 | } 24 | }) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spaghet/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "crypto/sha1" 5 | "flag" 6 | "fmt" 7 | "io/ioutil" 8 | "os" 9 | "path" 10 | "strconv" 11 | "sync/atomic" 12 | ) 13 | 14 | func traverseDir(hashes, duplicates map[string]string, dupeSize *int64, entries []os.FileInfo, directory string) { 15 | for _, entry := range entries { 16 | fullpath := (path.Join(directory, entry.Name())) 17 | 18 | if !entry.Mode().IsDir() && !entry.Mode().IsRegular() { 19 | continue 20 | } 21 | 22 | if entry.IsDir() { 23 | dirFiles, err := ioutil.ReadDir(fullpath) 24 | if err != nil { 25 | panic(err) 26 | } 27 | traverseDir(hashes, duplicates, dupeSize, dirFiles, fullpath) 28 | continue 29 | } 30 | file, err := ioutil.ReadFile(fullpath) 31 | if err != nil { 32 | panic(err) 33 | } 34 | hash := sha1.New() 35 | if _, err := hash.Write(file); err != nil { 36 | panic(err) 37 | } 38 | hashSum := hash.Sum(nil) 39 | hashString := fmt.Sprintf("%x", hashSum) 40 | if hashEntry, ok := hashes[hashString]; ok { 41 | duplicates[hashEntry] = fullpath 42 | atomic.AddInt64(dupeSize, entry.Size()) 43 | } else { 44 | 
// toReadableSize renders a byte count using the largest decimal unit that
// fits (TB, GB, MB, KB, otherwise B). The value is divided with integer
// division, so any fractional part of the unit is truncated, e.g. 1999 bytes
// is reported as "1 KB".
func toReadableSize(nbytes int64) string {
	const (
		kilobyte int64 = 1000
		megabyte       = 1000 * kilobyte
		gigabyte       = 1000 * megabyte
		terabyte       = 1000 * gigabyte
	)

	// Boundaries are inclusive so that an exact unit (e.g. 1000 bytes) is
	// reported as "1 KB" instead of "1000 B".
	switch {
	case nbytes >= terabyte:
		return strconv.FormatInt(nbytes/terabyte, 10) + " TB"
	case nbytes >= gigabyte:
		return strconv.FormatInt(nbytes/gigabyte, 10) + " GB"
	case nbytes >= megabyte:
		return strconv.FormatInt(nbytes/megabyte, 10) + " MB"
	case nbytes >= kilobyte:
		return strconv.FormatInt(nbytes/kilobyte, 10) + " KB"
	}
	return strconv.FormatInt(nbytes, 10) + " B"
}