├── .gitignore ├── AUTHORS ├── MIT-LICENSE.txt ├── README.md ├── bin └── traversaltime.go ├── cwalk.go ├── go.mod └── tests └── error-handling.go /.gitignore: -------------------------------------------------------------------------------- 1 | ~* 2 | *.exe 3 | .idea/ 4 | bin/traversaltime 5 | tests/error-handling 6 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | AUTHOR / MAINTAINER 2 | 3 | Igor Afanasyev 4 | 5 | CONTRIBUTORS 6 | 7 | Seth 8 | -------------------------------------------------------------------------------- /MIT-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Igor Afanasyev, https://github.com/iafan/cwalk 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### cwalk = Concurrent filepath.Walk 2 | 3 | A concurrent version of the https://golang.org/pkg/path/filepath/#Walk function 4 | that scans files in a directory tree and runs a callback for each file. 5 | 6 | Since scanning (and callback execution) is done from within goroutines, 7 | this may result in a significant performance boost on multicore systems 8 | in cases when the bottleneck is the CPU, not the I/O. 9 | 10 | My tests showed ~3.5x average speed increase on an 8-core CPU and 8 workers. 11 | For measurements, I used the provided `bin/traversaltime.go` utility that measures 12 | directory traversal time for both concurrent (`cwalk.Walk()`) and standard 13 | (`filepath.Walk()`) functions. 14 | 15 | Here are two common use cases when `cwalk` might be useful: 16 | 17 | 1. You're doing subsequent scans of the same directory 18 | (e.g. monitoring it for changes), which means that the directory structure 19 | is likely cached in memory by the OS; 20 | 21 | 2. You're doing some CPU-heavy processing for each file in the callback. 22 | 23 | ### Installation 24 | 25 | ```shell 26 | $ go get github.com/iafan/cwalk 27 | ``` 28 | 29 | ### Usage 30 | 31 | ```go 32 | import "github.com/iafan/cwalk" 33 | 34 | ... 35 | 36 | func walkFunc(path string, info os.FileInfo, err error) error { 37 | ... 38 | } 39 | 40 | ... 41 | 42 | err := cwalk.Walk("/path/to/dir", walkFunc) 43 | ``` 44 | 45 | ### Errors 46 | An error such as a file limit being exceeded will be reported as `too many open files` for a particular file. Each occurrence of this is available in the returned error via the `type WalkerErrorList struct`. When errors are encountered, the file walk will end prematurely: not all paths/files will be walked. 
You can check for and access the errors like this: 47 | 48 | ``` 49 | if err != nil { 50 | fmt.Printf("Error : %s\n", err.Error()) 51 | for _, errors := range err.(cwalk.WalkerErrorList).ErrorList { 52 | fmt.Println(errors) 53 | } 54 | } 55 | ``` 56 | 57 | ### Differences from filepath.Walk 58 | 59 | `filepath.Walk` sorts directory results while traversing the tree, which makes processing repeatable between runs. `cwalk.Walk()` processes files concurrently, so there's no way to guarantee the order in which files or even folders are processed. If needed, you can sort the results once the entire tree is processed. 60 | -------------------------------------------------------------------------------- /bin/traversaltime.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | "sync/atomic" 9 | "time" 10 | 11 | "github.com/iafan/cwalk" 12 | ) 13 | 14 | var fileCount int32 15 | var folderCount int32 16 | var errorCount int32 17 | 18 | var followSymlinks bool 19 | var processingTime time.Duration 20 | 21 | // This callback simply counts files and folders. 22 | // 23 | // Note that the callback function should be thread-safe 24 | // (this is why we use "atomic.AddInt32()" function to increment counters). 
25 | func callback(path string, info os.FileInfo, err error) error { 26 | if err != nil { 27 | atomic.AddInt32(&errorCount, 1) 28 | } else { 29 | if info.IsDir() { 30 | atomic.AddInt32(&folderCount, 1) 31 | } else { 32 | atomic.AddInt32(&fileCount, 1) 33 | } 34 | } 35 | if processingTime > 0 { 36 | time.Sleep(processingTime) 37 | } 38 | return nil 39 | } 40 | 41 | func init() { 42 | flag.BoolVar(&followSymlinks, "follow-symlinks", false, "When specified, directory symlinks will be processed and followed") 43 | flag.BoolVar(&followSymlinks, "f", false, "Shorthand for -follow-symlinks") 44 | 45 | flag.DurationVar(&processingTime, "file-processing-time", 0, "An artificial delay, for each file processed, to imitate actual work. Omitting this parameter means no delay. Example: 50ms") 46 | flag.DurationVar(&processingTime, "t", 0, "Shorthand for -file-processing-time") 47 | } 48 | 49 | func main() { 50 | flag.Parse() 51 | 52 | if len(flag.Args()) < 1 || flag.Args()[0] == "" { 53 | fmt.Println("Usage:") 54 | fmt.Println(" traversaltime [-f] [-t N] ") 55 | fmt.Println("Options:") 56 | flag.PrintDefaults() 57 | os.Exit(0) 58 | } 59 | dir := flag.Args()[0] 60 | fmt.Println("Directory:", dir) 61 | 62 | // run the concurrent version 63 | 64 | folderCount = 0 65 | fileCount = 0 66 | errorCount = 0 67 | 68 | start := time.Now() 69 | var err error 70 | 71 | if followSymlinks { 72 | fmt.Printf("Running a concurrent version that follows symlinks with %d workers and %s file processing time... ", cwalk.NumWorkers, processingTime) 73 | err = cwalk.WalkWithSymlinks(dir, callback) 74 | } else { 75 | fmt.Printf("Running a concurrent version that doesn't follow symlinks with %d workers and %s file processing time... 
", cwalk.NumWorkers, processingTime) 76 | err = cwalk.Walk(dir, callback) 77 | } 78 | 79 | fmt.Printf("done in %s\n", time.Since(start)) 80 | fmt.Printf("\t%d directories found\n", folderCount) 81 | fmt.Printf("\t%d files found\n", fileCount) 82 | fmt.Printf("\t%d errors found\n", errorCount) 83 | if err != nil { 84 | fmt.Printf("\nErrors: %s\n\n", err) 85 | } 86 | 87 | // run the standard (single-threaded) version 88 | 89 | folderCount = 0 90 | fileCount = 0 91 | errorCount = 0 92 | 93 | fmt.Printf("Running a standard version (single-threaded, doesn't follow symlinks) with %s file processing time... ", processingTime) 94 | start = time.Now() 95 | 96 | err = filepath.Walk(dir, callback) 97 | 98 | fmt.Printf("done in %s\n", time.Since(start)) 99 | fmt.Printf("\t%d directories found\n", folderCount) 100 | fmt.Printf("\t%d files found\n", fileCount) 101 | fmt.Printf("\t%d errors found\n", errorCount) 102 | if err != nil { 103 | fmt.Printf("\nError: %s\n\n", err) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /cwalk.go: -------------------------------------------------------------------------------- 1 | package cwalk 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | "runtime" 9 | "strings" 10 | "sync" 11 | ) 12 | 13 | // NumWorkers defines how many workers to run 14 | // on each Walk() function invocation 15 | var NumWorkers = runtime.GOMAXPROCS(0) 16 | 17 | // BufferSize defines the size of the job buffer 18 | var BufferSize = NumWorkers 19 | 20 | // ErrNotDir indicates that the path, which is being passed 21 | // to a walker function, does not point to a directory 22 | var ErrNotDir = errors.New("Not a directory") 23 | 24 | // WalkerError struct stores individual errors reported from each worker routine 25 | type WalkerError struct { 26 | error error 27 | path string 28 | } 29 | 30 | // WalkerErrorList struct store a list of errors reported from all worker routines 31 | type 
WalkerErrorList struct { 32 | ErrorList []WalkerError 33 | } 34 | 35 | // Error implements the error interface for WalkerError by returning the message of the wrapped error 36 | func (we WalkerError) Error() string { 37 | return we.error.Error() 38 | } 39 | 40 | // Error implements the error interface for WalkerErrorList: the individual messages joined with newlines, or "" when the list is empty 41 | func (wel WalkerErrorList) Error() string { 42 | if len(wel.ErrorList) > 0 { 43 | out := make([]string, len(wel.ErrorList)) 44 | for i, err := range wel.ErrorList { 45 | out[i] = err.Error() 46 | } 47 | return strings.Join(out, "\n") 48 | } 49 | return "" 50 | } 51 | 52 | // Walker is constructed for each Walk() function invocation 53 | type Walker struct { 54 | wg sync.WaitGroup 55 | ewg sync.WaitGroup // a separate wg for error collection 56 | jobs chan string 57 | root string 58 | followSymlinks bool 59 | walkFunc filepath.WalkFunc 60 | errors chan WalkerError 61 | errorList WalkerErrorList // this is where we store the errors as we go 62 | } 63 | 64 | // the readDirNames function below was taken from the original 65 | // implementation (see https://golang.org/src/path/filepath/path.go) 66 | // but has sorting removed (sorting doesn't make sense 67 | // in concurrent execution, anyway) 68 | 69 | // readDirNames reads the directory named by dirname and returns 70 | // a list of directory entries. 
71 | func readDirNames(dirname string) ([]string, error) { 72 | f, err := os.Open(dirname) 73 | if err != nil { 74 | return nil, err 75 | } 76 | names, err := f.Readdirnames(-1) 77 | f.Close() 78 | if err != nil { 79 | return nil, err 80 | } 81 | return names, nil 82 | } 83 | 84 | // lstat is a wrapper for os.Lstat which accepts a path 85 | // relative to Walker.root and also follows symlinks 86 | func (w *Walker) lstat(relpath string) (info os.FileInfo, err error) { 87 | path := filepath.Join(w.root, relpath) 88 | info, err = os.Lstat(path) 89 | if err != nil { 90 | return nil, err 91 | } 92 | // check if this is a symlink 93 | if w.followSymlinks && info.Mode()&os.ModeSymlink > 0 { 94 | path, err = filepath.EvalSymlinks(path) 95 | if err != nil { 96 | return nil, err 97 | } 98 | info, err = os.Lstat(path) 99 | if err != nil { 100 | return nil, err 101 | } 102 | } 103 | return 104 | } 105 | 106 | // collectErrors processes any any errors passed via the error channel 107 | // and stores them in the errorList 108 | func (w *Walker) collectErrors() { 109 | defer w.ewg.Done() 110 | for err := range w.errors { 111 | w.errorList.ErrorList = append(w.errorList.ErrorList, err) 112 | } 113 | } 114 | 115 | // processPath processes one directory and adds 116 | // its subdirectories to the queue for further processing 117 | func (w *Walker) processPath(relpath string) error { 118 | defer w.wg.Done() 119 | 120 | path := filepath.Join(w.root, relpath) 121 | names, err := readDirNames(path) 122 | if err != nil { 123 | return err 124 | } 125 | 126 | for _, name := range names { 127 | subpath := filepath.Join(relpath, name) 128 | info, err := w.lstat(subpath) 129 | 130 | err = w.walkFunc(subpath, info, err) 131 | 132 | if err == filepath.SkipDir { 133 | return nil 134 | } 135 | 136 | if err != nil { 137 | w.errors <- WalkerError{ 138 | error: err, 139 | path: subpath, 140 | } 141 | continue 142 | } 143 | 144 | if info == nil { 145 | w.errors <- WalkerError{ 146 | error: 
fmt.Errorf("Broken symlink: %s", subpath), 147 | path: subpath, 148 | } 149 | continue 150 | } 151 | 152 | if info.IsDir() { 153 | w.addJob(subpath) 154 | } 155 | } 156 | return nil 157 | } 158 | 159 | // addJob increments the job counter 160 | // and pushes the path to the jobs channel 161 | func (w *Walker) addJob(path string) { 162 | w.wg.Add(1) 163 | select { 164 | // try to push the job to the channel 165 | case w.jobs <- path: // ok 166 | default: // buffer overflow 167 | // process job synchronously 168 | err := w.processPath(path) 169 | if err != nil { 170 | w.errors <- WalkerError{ 171 | error: err, 172 | path: path, 173 | } 174 | } 175 | } 176 | } 177 | 178 | // worker processes all the jobs 179 | // until the jobs channel is explicitly closed 180 | func (w *Walker) worker() { 181 | for path := range w.jobs { 182 | err := w.processPath(path) 183 | if err != nil { 184 | w.errors <- WalkerError{ 185 | error: err, 186 | path: path, 187 | } 188 | } 189 | } 190 | 191 | } 192 | 193 | // Walk recursively descends into subdirectories, 194 | // calling walkFn for each file or directory 195 | // in the tree, including the root directory. 
196 | func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error { 197 | w.errors = make(chan WalkerError, BufferSize) 198 | w.jobs = make(chan string, BufferSize) 199 | w.walkFunc = walkFn 200 | 201 | w.ewg.Add(1) // a separate error waitgroup so we wait until all errors are reported before exiting 202 | go w.collectErrors() 203 | 204 | info, err := w.lstat(relpath) 205 | err = w.walkFunc(relpath, info, err) 206 | if err == filepath.SkipDir { 207 | return nil 208 | } 209 | if err != nil { 210 | return err 211 | } 212 | 213 | if info == nil { 214 | return fmt.Errorf("Broken symlink: %s", relpath) 215 | } 216 | 217 | if !info.IsDir() { 218 | return ErrNotDir 219 | } 220 | 221 | // spawn workers 222 | for n := 1; n <= NumWorkers; n++ { 223 | go w.worker() 224 | } 225 | w.addJob(relpath) // add this path as a first job 226 | w.wg.Wait() // wait till all paths are processed 227 | close(w.jobs) // signal workers to close 228 | close(w.errors) // signal errors to close 229 | w.ewg.Wait() // wait for all errors to be collected 230 | 231 | if len(w.errorList.ErrorList) > 0 { 232 | return w.errorList 233 | } 234 | return nil 235 | } 236 | 237 | // Walk is a wrapper function for the Walker object 238 | // that mimics the behavior of filepath.Walk, 239 | // and doesn't follow symlinks. 240 | func Walk(root string, walkFn filepath.WalkFunc) error { 241 | w := Walker{ 242 | root: root, 243 | } 244 | return w.Walk("", walkFn) 245 | } 246 | 247 | // WalkWithSymlinks is a wrapper function for the Walker object 248 | // that mimics the behavior of filepath.Walk, but follows 249 | // directory symlinks. 
250 | func WalkWithSymlinks(root string, walkFn filepath.WalkFunc) error { 251 | w := Walker{ 252 | root: root, 253 | followSymlinks: true, 254 | } 255 | return w.Walk("", walkFn) 256 | } 257 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/iafan/cwalk 2 | 3 | go 1.15 4 | -------------------------------------------------------------------------------- /tests/error-handling.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | "sync/atomic" 8 | "time" 9 | 10 | "github.com/iafan/cwalk" 11 | ) 12 | 13 | // This example takes bin/traversaltime.go program as a basis, 14 | // but for the sake of testing, starts to accumulate open handles, 15 | // processing them slowly to artificially cause errors 16 | // related to exceeding the number of open file handles 17 | 18 | var fileCount int32 19 | var folderCount int32 20 | var errorCount int32 21 | var openFiles = make(chan string) 22 | var fileSlice []*os.File 23 | 24 | // This callback simply counts files and folders. 25 | // 26 | // Note that the callback function should be thread-safe 27 | // (this is why we use "atomic.AddInt32()" function to increment counters). 
28 | func callback(path string, info os.FileInfo, err error) error { 29 | if err != nil { 30 | atomic.AddInt32(&errorCount, 1) 31 | return err 32 | } else { 33 | if info.IsDir() { 34 | atomic.AddInt32(&folderCount, 1) 35 | } else { 36 | atomic.AddInt32(&fileCount, 1) 37 | } 38 | } 39 | return nil 40 | } 41 | 42 | // This callback simply counts files and folders and also opens 43 | // the file in order to artificially cause errors related 44 | // to exceeding the number of open file handles 45 | // 46 | // Note that the callback function should be thread-safe 47 | // (this is why we use "atomic.AddInt32()" function to increment counters). 48 | func errorCallback(path string, info os.FileInfo, err error) error { 49 | if err != nil { 50 | atomic.AddInt32(&errorCount, 1) 51 | return err 52 | } else { 53 | if info.IsDir() { 54 | atomic.AddInt32(&folderCount, 1) 55 | } else { 56 | // open the file in order to artificially cause errors 57 | f, _ := os.Open(path) 58 | fileSlice = append(fileSlice, f) 59 | atomic.AddInt32(&fileCount, 1) 60 | } 61 | } 62 | return nil 63 | } 64 | 65 | func main() { 66 | if len(os.Args) < 2 || os.Args[1] == "" { 67 | fmt.Println("Usage: error-handling ") 68 | os.Exit(0) 69 | } 70 | dir := os.Args[1] 71 | 72 | // run the concurrent version 73 | 74 | folderCount = 0 75 | fileCount = 0 76 | errorCount = 0 77 | 78 | fmt.Print("Running concurrent version... 
") 79 | start := time.Now() 80 | 81 | err := cwalk.Walk(dir, callback) 82 | 83 | fmt.Printf("done in %s\n", time.Since(start)) 84 | fmt.Printf("\t%d directories found\n", folderCount) 85 | fmt.Printf("\t%d files found\n", fileCount) 86 | fmt.Printf("\t%d errors detected by the callback\n", errorCount) 87 | 88 | if err != nil { 89 | fmt.Printf("\t%d errors returned by cwalk\n", len(err.(cwalk.WalkerErrorList).ErrorList)) 90 | fmt.Printf("Error :\n%s\n", err.Error()) 91 | } 92 | 93 | // run the standard (single-threaded) version 94 | 95 | folderCount = 0 96 | fileCount = 0 97 | errorCount = 0 98 | 99 | fmt.Print("Running standard version... ") 100 | start = time.Now() 101 | 102 | err = filepath.Walk(dir, callback) 103 | 104 | fmt.Printf("done in %s\n", time.Since(start)) 105 | fmt.Printf("\t%d directories found\n", folderCount) 106 | fmt.Printf("\t%d files found\n", fileCount) 107 | fmt.Printf("\t%d errors detected by the callback\n", errorCount) 108 | 109 | if err != nil { 110 | fmt.Printf("Error : %s\n", err.Error()) 111 | } 112 | 113 | // run the concurrent triggering errors version 114 | 115 | folderCount = 0 116 | fileCount = 0 117 | errorCount = 0 118 | 119 | fmt.Print("Running concurrent version to trigger errors...\n") 120 | fmt.Print("Errors are triggered by exceeding file limits by opening every file... 
") 121 | 122 | start = time.Now() 123 | 124 | //start the open files func 125 | time.Sleep(1000) 126 | 127 | err = cwalk.Walk(dir, errorCallback) 128 | 129 | close(openFiles) 130 | 131 | fmt.Printf("done in %s\n", time.Since(start)) 132 | fmt.Printf("\t%d directories found\n", folderCount) 133 | fmt.Printf("\t%d files found\n", fileCount) 134 | fmt.Printf("\t%d errors detected by the callback\n", errorCount) 135 | 136 | if err != nil { 137 | fmt.Printf("\t%d errors returned by cwalk\n", len(err.(cwalk.WalkerErrorList).ErrorList)) 138 | fmt.Printf("Error :\n%s\n", err.Error()) 139 | } 140 | } 141 | --------------------------------------------------------------------------------