├── .gitignore ├── LICENSE ├── README.md ├── doc.go ├── walker.go └── walker_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | 25 | test_files/ 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Stretchr, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Powerwalk 2 | 3 | Go package for walking files and concurrently calling user code to handle each file. This package walks the file system in the same way `filepath.Walk` does, except that instead of calling the `walkFn` inline, it uses goroutines to allow the files to be handled concurrently. 4 | 5 | Powerwalk functions by walking concurrently over many files. In order to realize any benefits from this approach, you must tell the runtime to use multiple CPUs. For example: 6 | 7 | ``` 8 | runtime.GOMAXPROCS(runtime.NumCPU()) 9 | ``` 10 | 11 | ## Usage 12 | 13 | Powerwalk is a drop-in replacement for the `filepath.Walk` function ([read about that for more details](http://golang.org/pkg/path/filepath/#Walk)), and so has the same signature, including the `filepath.WalkFunc` callback type. Note that `filepath.SkipDir` is not supported and that the `walkFn` is called concurrently. 14 | 15 | ``` 16 | powerwalk.Walk(root string, walkFn filepath.WalkFunc) error 17 | ``` 18 | 19 | By default, Powerwalk will call the `walkFn` for `powerwalk.DefaultConcurrentWalks` (currently `100`) files at a time. To be specific about the number of concurrent files to walk, use the `WalkLimit` alternative. 20 | 21 | ``` 22 | powerwalk.WalkLimit(root string, walkFn filepath.WalkFunc, limit int) error 23 | ``` 24 | 25 | The `WalkLimit` function does the same as `Walk`, except that it allows you to specify the number of files to concurrently walk using the `limit` argument. The `limit` argument must be one or higher (i.e. `>0`). Specifying a limit that's too high causes unnecessary overhead, so sensible numbers are encouraged but not enforced. 26 | 27 | See the [godoc documentation](http://godoc.org/github.com/stretchr/powerwalk) for more information.
28 | 29 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Package powerwalk concurrently walks file trees. 2 | // Aside from SkipDir functionality not working and the fact that the 3 | // WalkFunc gets run concurrently, this is a drop-in replacement 4 | // for filepath.Walk. 5 | package powerwalk 6 | -------------------------------------------------------------------------------- /walker.go: -------------------------------------------------------------------------------- 1 | package powerwalk 2 | 3 | import ( 4 | "errors" 5 | "os" 6 | "path/filepath" 7 | "sync" 8 | ) 9 | 10 | // DefaultConcurrentWalks is the default number of files that will be walked at the 11 | // same time when the Walk function is called. 12 | // To use a value other than this one, use the WalkLimit function. 13 | const DefaultConcurrentWalks int = 100 14 | 15 | // Walk walks the file tree rooted at root, calling walkFn for each file or 16 | // directory in the tree, including root. All errors that arise visiting files 17 | // and directories are filtered by walkFn. The order in which items are handled is non-deterministic. 18 | // Walk does not follow symbolic links. 19 | // 20 | // Walk is shorthand for WalkLimit with a limit of DefaultConcurrentWalks: walkFn 21 | // is called concurrently, so it must be safe for concurrent use, and a maximum of 22 | // DefaultConcurrentWalks walkFns will be called at any one time. 23 | func Walk(root string, walkFn filepath.WalkFunc) error { 24 | return WalkLimit(root, walkFn, DefaultConcurrentWalks) 25 | } 26 | 27 | // WalkLimit walks the file tree rooted at root, calling walkFn for each file or 28 | // directory in the tree, including root. All errors that arise visiting files 29 | // and directories are filtered by walkFn. The output is non-deterministic. 30 | // WalkLimit does not follow symbolic links.
31 | // 32 | // For each file and directory encountered, Walk will trigger a new Go routine 33 | // allowing you to handle each item concurrently. A maximum of limit walkFns will 34 | // be called at any one time. 35 | func WalkLimit(root string, walkFn filepath.WalkFunc, limit int) error { 36 | 37 | // make sure limit is sensible 38 | if limit < 1 { 39 | panic("powerwalk: limit must be greater than zero.") 40 | } 41 | 42 | // filesMg is a wait group that waits for all files to 43 | // be processed before finishing. 44 | var filesWg sync.WaitGroup 45 | 46 | // files is a channel that receives lists of channels 47 | files := make(chan *walkArgs) 48 | kill := make(chan struct{}) 49 | errs := make(chan error) 50 | 51 | for i := 0; i < limit; i++ { 52 | go func(i int) { 53 | for { 54 | select { 55 | case file, ok := <-files: 56 | if !ok { 57 | continue 58 | } 59 | if err := walkFn(file.path, file.info, file.err); err != nil { 60 | errs <- err 61 | } 62 | filesWg.Done() 63 | case <-kill: 64 | return 65 | } 66 | } 67 | }(i) 68 | } 69 | 70 | var walkErr error 71 | 72 | // check for errors 73 | go func() { 74 | select { 75 | case walkErr = <-errs: 76 | close(kill) 77 | case <-kill: 78 | return 79 | } 80 | }() 81 | 82 | // setup a waitgroup and wait for everything to 83 | // be done 84 | var walkerWg sync.WaitGroup 85 | walkerWg.Add(1) 86 | 87 | go func() { 88 | 89 | filepath.Walk(root, func(p string, info os.FileInfo, err error) error { 90 | select { 91 | case <-kill: 92 | close(files) 93 | return errors.New("kill received while walking") 94 | default: 95 | filesWg.Add(1) 96 | select { 97 | case files <- &walkArgs{path: p, info: info, err: err}: 98 | } 99 | return nil 100 | } 101 | }) 102 | 103 | // everything is done 104 | walkerWg.Done() 105 | 106 | }() 107 | 108 | // wait for all walker calls 109 | walkerWg.Wait() 110 | 111 | if walkErr == nil { 112 | filesWg.Wait() 113 | close(kill) 114 | } 115 | 116 | return walkErr 117 | } 118 | 119 | // walkArgs holds the arguments 
that were passed to the Walk or WalkLimit 120 | // functions' walkFn callback, i.e. the path, info and err of one visited item. 121 | type walkArgs struct { 122 | path string 123 | info os.FileInfo 124 | err error 125 | } 126 | -------------------------------------------------------------------------------- /walker_test.go: -------------------------------------------------------------------------------- 1 | package powerwalk 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "path" 9 | "path/filepath" 10 | "runtime" 11 | "sync" 12 | "testing" 13 | "time" 14 | 15 | "github.com/stretchr/testify/assert" 16 | ) 17 | 18 | const testFiles string = "./test_files" // root of the fixture tree shared by every test and benchmark 19 | 20 | func makeTestFiles(dirs, files int) { // creates dirs directories under testFiles, each holding files small files; panics if a directory cannot be created 21 | var counter int 22 | for i := 1; i < dirs+1; i++ { 23 | dir := fmt.Sprintf("%s/dir_%02d", testFiles, i) 24 | if err := os.MkdirAll(dir, 0777); err == nil { 25 | for j := 1; j < files+1; j++ { 26 | counter++ 27 | filename := fmt.Sprintf("%s/file-%03d", dir, counter) 28 | ioutil.WriteFile(filename, []byte(fmt.Sprintf("This is file %d", counter)), 0777) // NOTE(review): the WriteFile error is discarded 29 | } 30 | } else { 31 | panic(fmt.Sprintf("%s", err)) 32 | } 33 | } 34 | } 35 | func deleteTestFiles() { // removes the fixture tree. NOTE(review): the path literal duplicates the testFiles constant 36 | os.RemoveAll("./test_files") 37 | } 38 | 39 | // BenchmarkWalkFilepath uses the default Go implementation of filepath.Walk 40 | func BenchmarkWalkFilepath(b *testing.B) { 41 | 42 | // let the runtime use every available CPU 43 | runtime.GOMAXPROCS(runtime.NumCPU()) 44 | 45 | b.StopTimer() // keep fixture creation out of the measurement 46 | makeTestFiles(10, 20) 47 | 48 | walkFunc := func(p string, info os.FileInfo, err error) error { 49 | time.Sleep(10 * time.Millisecond) // simulated per-file work 50 | return nil 51 | } 52 | 53 | b.StartTimer() 54 | 55 | for i := 0; i < b.N; i++ { 56 | filepath.Walk(testFiles, walkFunc) 57 | } 58 | 59 | b.StopTimer() 60 | deleteTestFiles() 61 | 62 | } 63 | 64 | // BenchmarkPowerwalk uses the power walker.
65 | func BenchmarkPowerwalk(b *testing.B) { 66 | 67 | // let the runtime use every available CPU 68 | runtime.GOMAXPROCS(runtime.NumCPU()) 69 | 70 | b.StopTimer() // keep fixture creation out of the measurement 71 | makeTestFiles(10, 20) 72 | 73 | walkFunc := func(p string, info os.FileInfo, err error) error { 74 | time.Sleep(10 * time.Millisecond) // simulated per-file work 75 | return nil 76 | } 77 | 78 | b.StartTimer() 79 | 80 | for i := 0; i < b.N; i++ { 81 | Walk(testFiles, walkFunc) 82 | } 83 | 84 | b.StopTimer() 85 | deleteTestFiles() 86 | 87 | } 88 | 89 | func TestWalkFilepath(t *testing.T) { // baseline: runs the same checks against the standard library's filepath.Walk 90 | 91 | // let the runtime use every available CPU 92 | runtime.GOMAXPROCS(runtime.NumCPU()) 93 | 94 | makeTestFiles(10, 20) 95 | defer deleteTestFiles() 96 | 97 | seen := make(map[string]bool) 98 | walkFunc := func(p string, info os.FileInfo, err error) error { 99 | if !info.IsDir() { 100 | filename := path.Base(p) 101 | seen[filename] = true 102 | } 103 | return nil 104 | } 105 | 106 | assert.NoError(t, filepath.Walk(testFiles, walkFunc)) 107 | 108 | // check that at least one file was recorded. NOTE(review): the count is not compared against the 200 files created above 109 | if assert.NotEqual(t, len(seen), 0, "Walker should visit at least one file.") { 110 | for k, v := range seen { 111 | assert.True(t, v, k) 112 | } 113 | } 114 | 115 | } 116 | 117 | func TestPowerWalk(t *testing.T) { 118 | 119 | // let the runtime use every available CPU 120 | runtime.GOMAXPROCS(runtime.NumCPU()) 121 | 122 | makeTestFiles(10, 20) 123 | defer deleteTestFiles() 124 | 125 | var seenLock sync.Mutex // guards seen, since Walk calls walkFunc from several goroutines 126 | seen := make(map[string]bool) 127 | walkFunc := func(p string, info os.FileInfo, err error) error { 128 | if !info.IsDir() { 129 | filename := path.Base(p) 130 | seenLock.Lock() 131 | defer seenLock.Unlock() 132 | seen[filename] = true 133 | } 134 | return nil 135 | } 136 | 137 | assert.NoError(t, Walk(testFiles, walkFunc)) 138 | 139 | // check that at least one file was recorded. NOTE(review): the count is not compared against the 200 files created above 140 | if assert.NotEqual(t, len(seen), 0, "Walker should visit at least one file.") { 141 | for k, v := range seen { 142 | assert.True(t, v, k) 143 | } 144 | } 145 | 146 | } 147 | 148 | /* 149 | // This test is commented out as it takes
an extremely long time. 150 | func TestPowerWalkMassive(t *testing.T) { 151 | 152 | // max concurrency out 153 | runtime.GOMAXPROCS(runtime.NumCPU()) 154 | 155 | rand.Seed(time.Now().UnixNano()) 156 | 157 | makeTestFiles(200, 100) 158 | defer deleteTestFiles() 159 | 160 | count := 0 161 | total := 200 * 100 162 | 163 | var seenLock sync.Mutex 164 | seen := make(map[string]bool) 165 | walkFunc := func(p string, info os.FileInfo, err error) error { 166 | if !info.IsDir() { 167 | filename := path.Base(p) 168 | seenLock.Lock() 169 | seen[filename] = true 170 | count++ 171 | seenLock.Unlock() 172 | 173 | // simulate some processing 174 | time.Sleep(time.Duration(rand.Int31n(1000)) * time.Millisecond) 175 | os.Stdout.Sync() 176 | } 177 | return nil 178 | } 179 | 180 | assert.NoError(t, Walk(testFiles, walkFunc)) 181 | 182 | // make sure everything was seen 183 | if assert.NotEqual(t, len(seen), 0, "Walker should visit at least one file.") { 184 | for k, v := range seen { 185 | assert.True(t, v, k) 186 | } 187 | } 188 | 189 | } 190 | */ 191 | 192 | func TestPowerWalkLimit(t *testing.T) { 193 | 194 | // let the runtime use every available CPU 195 | runtime.GOMAXPROCS(runtime.NumCPU()) 196 | 197 | makeTestFiles(10, 20) 198 | defer deleteTestFiles() 199 | 200 | var seenLock sync.Mutex // guards seen while walkFunc is called by WalkLimit 201 | seen := make(map[string]bool) 202 | walkFunc := func(p string, info os.FileInfo, err error) error { 203 | if !info.IsDir() { 204 | filename := path.Base(p) 205 | seenLock.Lock() 206 | defer seenLock.Unlock() 207 | seen[filename] = true 208 | } 209 | return nil 210 | } 211 | 212 | assert.NoError(t, WalkLimit(testFiles, walkFunc, 1)) // a limit of 1 is the smallest value WalkLimit accepts 213 | 214 | // check that at least one file was recorded. NOTE(review): the count is not compared against the 200 files created above 215 | if assert.NotEqual(t, len(seen), 0, "Walker should visit at least one file.") { 216 | for k, v := range seen { 217 | assert.True(t, v, k) 218 | } 219 | } 220 | 221 | } 222 | 223 | func TestPowerWalkLimitInvalidArgs(t *testing.T) { 224 | 225 | makeTestFiles(10, 20) 226 | defer deleteTestFiles() 227 | 228 | walkFunc := func(p
string, info os.FileInfo, err error) error { 229 | return nil 230 | } 231 | assert.Panics(t, func() { 232 | WalkLimit(testFiles, walkFunc, 0) 233 | }) 234 | 235 | } 236 | 237 | func TestPowerWalkLimitUselessThreadsDontBlock(t *testing.T) { 238 | 239 | makeTestFiles(10, 20) 240 | defer deleteTestFiles() 241 | 242 | walkFunc := func(p string, info os.FileInfo, err error) error { 243 | return nil 244 | } 245 | assert.NoError(t, WalkLimit(testFiles, walkFunc, 500)) 246 | 247 | } 248 | 249 | func TestPowerWalkError(t *testing.T) { 250 | 251 | // let the runtime use every available CPU 252 | runtime.GOMAXPROCS(runtime.NumCPU()) 253 | 254 | makeTestFiles(10, 20) 255 | defer deleteTestFiles() 256 | 257 | theErr := errors.New("kaboom") 258 | var seenLock sync.Mutex // guards seen, since Walk calls walkFunc from several goroutines 259 | seen := make(map[string]bool) 260 | walkFunc := func(p string, info os.FileInfo, err error) error { 261 | if !info.IsDir() { 262 | filename := path.Base(p) 263 | seenLock.Lock() 264 | defer seenLock.Unlock() 265 | if len(seen) > 20 { // fail once more than 20 files have been recorded 266 | return theErr 267 | } 268 | seen[filename] = true 269 | } 270 | return nil 271 | } 272 | 273 | assert.Equal(t, Walk(testFiles, walkFunc), theErr) 274 | 275 | // the walk is aborted by theErr, so only check that some files were recorded before it fired 276 | if assert.NotEqual(t, len(seen), 0, "Walker should visit at least one file.") { 277 | for k, v := range seen { 278 | assert.True(t, v, k) 279 | } 280 | } 281 | 282 | } 283 | --------------------------------------------------------------------------------