├── .github ├── dependabot.yml └── workflows │ └── build.yml ├── .gitignore ├── .golangci.yml ├── AUTHORS ├── LICENSE ├── Makefile ├── README.md ├── assert_test.go ├── cmd └── pprof │ └── main.go ├── doc.go ├── docs ├── how-it-works.drawio └── how-it-works.svg ├── example_test.go ├── go.mod ├── go.sum ├── internal ├── helper.go ├── helper_test.go ├── logger.go └── logger_test.go ├── reader.go ├── reader_test.go ├── scripts ├── killweb.sh └── pprof.sh └── testdata ├── empty.csv ├── example_products.csv ├── file_with_header.csv ├── file_with_quote_in_unquoted_field.csv ├── file_without_header.csv └── invalid_row.csv /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: ['*'] 6 | tags: ['v*'] 7 | pull_request: 8 | branches: ['*'] 9 | 10 | jobs: 11 | build: 12 | name: Build 13 | timeout-minutes: 10 14 | strategy: 15 | matrix: 16 | go-version: [1.23.x, 1.24.x] 17 | platform: [ubuntu-latest, macos-latest, windows-latest] 18 | runs-on: ${{ matrix.platform }} 19 | 20 | steps: 21 | - name: Install Go 22 | uses: actions/setup-go@v5 23 | with: 24 | go-version: ${{ matrix.go-version }} 25 | 26 | - name: Checkout code 27 | uses: actions/checkout@v4 28 | 29 | - name: Load cached dependencies 30 | uses: actions/cache@v4 31 | with: 32 | path: ~/go/pkg/mod 33 | key: ${{ runner.os }}-go-${{ matrix.go-version }}-${{ hashFiles('**/go.sum') }} 34 | restore-keys: | 35 | ${{ runner.os }}-go-${{ matrix.go-version }}- 36 | 37 | - name: Download dependencies 38 | run: make setup 39 | 40 | - name: Run linter 41 | run: make lint 42 | 43 | - name: Run unit tests 44 | run: 
make clean cover 45 | 46 | - name: Upload coverage to coveralls.io 47 | if: matrix.platform == 'ubuntu-latest' && matrix.go-version == '1.24.x' 48 | uses: coverallsapp/github-action@v2 49 | with: 50 | file: cover.out 51 | flag-name: ${{ runner.os }}-go-${{ matrix.go-version }} 52 | fail-on-error: false 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | .vscode/* 3 | bin/* 4 | 5 | .DS_Store 6 | 7 | *.exe 8 | *.exe~ 9 | *.so 10 | 11 | cover.out 12 | cover.html 13 | *.prof -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | run: 4 | tests: true 5 | 6 | linters: 7 | default: none 8 | enable: 9 | - goconst 10 | - gocritic 11 | - godot 12 | - govet 13 | - ineffassign 14 | - intrange 15 | - misspell 16 | - nlreturn 17 | - noctx 18 | - revive 19 | - staticcheck 20 | - unused 21 | - whitespace 22 | 23 | formatters: 24 | enable: 25 | - gofmt 26 | settings: 27 | gofmt: 28 | rewrite-rules: 29 | - pattern: 'interface{}' 30 | replacement: 'any' 31 | - pattern: 'a[b:len(a)]' 32 | replacement: 'a[b:]' 33 | - pattern: 'a[0:b]' 34 | replacement: 'a[:b]' 35 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the official list of actforgood/bigcsvreader authors for copyright purposes. 
2 | 3 | Bogdan Constantinescu (https://github.com/bogcon) 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 The ActForGood Authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LINTER_VERSION=v2.1.6 2 | LINTER=./bin/golangci-lint 3 | ifeq ($(OS),Windows_NT) 4 | LINTER=./bin/golangci-lint.exe 5 | endif 6 | pkgs=$(shell go list ./... | grep -v /cmd/) 7 | 8 | .PHONY: all 9 | all: clean setup lint test ## Run sequentially clean, setup, lint and test. 10 | 11 | .PHONY: lint 12 | lint: ## Run linter and detect go mod tidy changes. 
13 | $(LINTER) run -c ./.golangci.yml --fix 14 | @make tidy 15 | @if ! git diff --quiet; then \ 16 | echo "'go mod tidy' resulted in changes or working tree is dirty:"; \ 17 | git --no-pager diff; \ 18 | fi 19 | 20 | .PHONY: setup 21 | setup: ## Download dependencies. 22 | go mod download 23 | @if [ ! -f "$(LINTER)" ]; then \ 24 | curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s $(LINTER_VERSION); \ 25 | fi 26 | 27 | .PHONY: test 28 | test: ## Run tests (with race condition detection). 29 | go test -race -timeout=10m $(pkgs) 30 | 31 | .PHONY: bench 32 | bench: ## Run benchmarks. 33 | go test -race -timeout=15m -benchmem -benchtime=2x -bench . 34 | 35 | .PHONY: cover 36 | cover: ## Run tests with coverage. Generates "cover.out" profile and its html representation. 37 | go test -race -timeout=10m -coverprofile=cover.out -coverpkg=./... $(pkgs) 38 | go tool cover -html=cover.out -o cover.html 39 | 40 | .PHONY: tidy 41 | tidy: ## Simply runs 'go mod tidy'. 42 | go mod tidy 43 | 44 | .PHONY: clean 45 | clean: ## Clean up go tests cache and coverage generated files. 
46 | 	go clean -testcache 47 | 	@for file in cover.html cover.out; do \ 48 | 		if [ -f $$file ]; then \ 49 | 			echo "rm -f $$file"; \ 50 | 			rm -f $$file; \ 51 | 		fi \ 52 | 	done 53 | 54 | .PHONY: help 55 | # Absolutely awesome: https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html 56 | help: 57 | 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 58 | 59 | .DEFAULT_GOAL := help 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigCsvReader 2 | 3 | [![Build Status](https://github.com/actforgood/bigcsvreader/actions/workflows/build.yml/badge.svg)](https://github.com/actforgood/bigcsvreader/actions/workflows/build.yml) 4 | [![License](https://img.shields.io/badge/license-MIT-blue)](https://raw.githubusercontent.com/actforgood/bigcsvreader/main/LICENSE) 5 | [![Coverage Status](https://coveralls.io/repos/github/actforgood/bigcsvreader/badge.svg?branch=main)](https://coveralls.io/github/actforgood/bigcsvreader?branch=main) 6 | [![Goreportcard](https://goreportcard.com/badge/github.com/actforgood/bigcsvreader)](https://goreportcard.com/report/github.com/actforgood/bigcsvreader) 7 | [![Go Reference](https://pkg.go.dev/badge/github.com/actforgood/bigcsvreader.svg)](https://pkg.go.dev/github.com/actforgood/bigcsvreader) 8 | 9 | --- 10 | 11 | Package `bigcsvreader` offers a multi-threaded approach for reading a large CSV file in order to improve the time of reading and processing it. 12 | It spawns multiple goroutines, each reading a piece of the file. 13 | Read rows are put into channels equal in number to the spawned goroutines, in this way also the processing of those rows can be parallelized. 
14 | 15 | 16 | ### Installation 17 | 18 | ```shell 19 | $ go get github.com/actforgood/bigcsvreader 20 | ``` 21 | 22 | ### Example 23 | 24 | Please refer to this [example](https://pkg.go.dev/github.com/actforgood/bigcsvreader#example-CsvReader). 25 | 26 | 27 | ### How it is designed to work 28 | ![BigCsvReader-HowItWorks](docs/how-it-works.svg) 29 | 30 | 31 | ### Benchmarks 32 | ``` 33 | go version go1.22.1 darwin/amd64 34 | go test -timeout=15m -benchmem -benchtime=2x -bench . 35 | goos: darwin 36 | goarch: amd64 37 | pkg: github.com/actforgood/bigcsvreader 38 | cpu: Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz 39 | Benchmark50000Rows_50Mb_withBigCsvReader-8 2 8076491568 ns/op 61744680 B/op 100269 allocs/op 40 | Benchmark50000Rows_50Mb_withStdGoCsvReaderReadAll-8 2 65237799108 ns/op 67924264 B/op 100043 allocs/op 41 | Benchmark50000Rows_50Mb_withStdGoCsvReaderReadOneByOneAndReuseRecord-8 2 66750849960 ns/op 57606432 B/op 50020 allocs/op 42 | Benchmark50000Rows_50Mb_withStdGoCsvReaderReadOneByOneProcessParalell-8 2 8184433872 ns/op 61607624 B/op 100040 allocs/op 43 | ``` 44 | 45 | Benchmarks are made with a file of ~`50Mb` in size, also a fake processing of any given row of `1ms` was taken into consideration. 46 | bigcsvreader was launched with `8` goroutines. 47 | Other benchmarks are made using directly the `encoding/csv` go package. 48 | As you can see, bigcsvreader reads and processes all rows in ~`8s`. 49 | Go standard csv package reads and processes all rows in ~`65s` (sequentially). 50 | Go standard csv package read and a parallel processing of rows timing is comparable to the one of bigcsvreader (so this strategy is a good alternative to this package). 51 | `ReadAll` API has the disadvantage of keeping all rows into memory. 52 | `Read` rows one by one API with `ReuseRecord` flag set has the advantage of fewer allocations, but has the cost of sequentially reading rows. 
53 | > Note: It's a coincidence that parallelized version timing was ~equal to sequential timing divided by no of started goroutines. You should not take this as a rule. 54 | 55 | Below are some process stats captured with unix `TOP` command while running each benchmark. 56 | | Bench | %CPU | MEM | 57 | | --- | --- | --- | 58 | | Benchmark50000Rows_50Mb_withBigCsvReader | 17.3 | 9652K | 59 | | Benchmark50000Rows_50Mb_withStdGoCsvReaderReadAll | 5.8 | 66M | 60 | | Benchmark50000Rows_50Mb_withStdGoCsvReaderReadOneByOneAndReuseRecord | 11.3 | 6908K | 61 | 62 | 63 | **(!) Known issue**: 64 | This package does not work as expected with multiline columns. 65 | 66 | 67 | ### License 68 | This package is released under an MIT license. See [LICENSE](LICENSE). 69 | -------------------------------------------------------------------------------- /assert_test.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 5 | 6 | package bigcsvreader_test 7 | 8 | import ( 9 | "reflect" 10 | "testing" 11 | ) 12 | 13 | // Note: this file contains some assertion utilities. 14 | 15 | // assertEqual checks if 2 values are equal. 16 | // Returns successful assertion status. 17 | func assertEqual(t *testing.T, expected, actual any) bool { 18 | 	t.Helper() 19 | 	if !reflect.DeepEqual(expected, actual) { 20 | 		t.Errorf( 21 | 			"\n\t"+`expected "%+v" (%T),`+ 22 | 				"\n\t"+`but got "%+v" (%T)`+"\n", 23 | 			expected, expected, 24 | 			actual, actual, 25 | 		) 26 | 27 | 		return false 28 | 	} 29 | 30 | 	return true 31 | } 32 | 33 | // assertNotNil checks if value passed is not nil. 34 | // Returns successful assertion status. 
35 | func assertNotNil(t *testing.T, actual any) bool { 36 | t.Helper() 37 | if isNil(actual) { 38 | t.Error("should not be nil") 39 | 40 | return false 41 | } 42 | 43 | return true 44 | } 45 | 46 | // assertNil checks if value passed is nil. 47 | // Returns successful assertion status. 48 | func assertNil(t *testing.T, actual any) bool { 49 | t.Helper() 50 | if !isNil(actual) { 51 | t.Errorf("expected nil, but got %+v", actual) 52 | 53 | return false 54 | } 55 | 56 | return true 57 | } 58 | 59 | // assertTrue checks if value passed is true. 60 | // Returns successful assertion status. 61 | func assertTrue(t *testing.T, actual bool) bool { 62 | t.Helper() 63 | if !actual { 64 | t.Error("should be true") 65 | 66 | return false 67 | } 68 | 69 | return true 70 | } 71 | 72 | // isNil checks an interface if it is nil. 73 | func isNil(object any) bool { 74 | if object == nil { 75 | return true 76 | } 77 | 78 | value := reflect.ValueOf(object) 79 | 80 | kind := value.Kind() 81 | switch kind { 82 | case reflect.Ptr: 83 | return value.IsNil() 84 | case reflect.Slice: 85 | return value.IsNil() 86 | case reflect.Map: 87 | return value.IsNil() 88 | case reflect.Interface: 89 | return value.IsNil() 90 | case reflect.Func: 91 | return value.IsNil() 92 | case reflect.Chan: 93 | return value.IsNil() 94 | } 95 | 96 | return false 97 | } 98 | -------------------------------------------------------------------------------- /cmd/pprof/main.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 5 | 6 | // Package main contains an executable for profiling different strategies of reading a CSV. 7 | // Note: this file is only for dev only. 
8 | package main 9 | 10 | import ( 11 | "context" 12 | "encoding/csv" 13 | "flag" 14 | "io" 15 | "log" 16 | "os" 17 | "runtime" 18 | "runtime/pprof" 19 | "strconv" 20 | "sync" 21 | "sync/atomic" 22 | 23 | "github.com/actforgood/bigcsvreader" 24 | ) 25 | 26 | var generateProfileFor = flag.String("for", "bigcsvreader", "Generate memory and cpu profile for given case. Can be one of bigcsvreader/gocsvreadall/gocsvreadonebyone.") 27 | 28 | const rowsCount = 5e4 29 | 30 | func main() { 31 | flag.Parse() 32 | 33 | // create a file 34 | fName, err := setUpTmpCsvFile(rowsCount) 35 | if err != nil { 36 | log.Fatal("prerequisite failed: could not generate CSV file: ", err) 37 | } 38 | defer tearDownTmpCsvFile(fName) 39 | 40 | // enable cpu profiling 41 | fCPU, err := os.Create("./cpu_" + *generateProfileFor + ".prof") 42 | if err != nil { 43 | log.Println("could not create CPU profile: ", err) 44 | 45 | return 46 | } 47 | defer fCPU.Close() 48 | if err := pprof.StartCPUProfile(fCPU); err != nil { 49 | log.Println("could not start CPU profile: ", err) 50 | 51 | return 52 | } 53 | defer pprof.StopCPUProfile() 54 | 55 | switch *generateProfileFor { 56 | case "gocsvreadall": 57 | goStandardCsvReaderReadAll(fName) 58 | case "gocsvreadonebyone": 59 | goStandardCsvReaderReadOneByOne(fName) 60 | default: 61 | bigCsvReader(fName) 62 | } 63 | 64 | // enable memory profiling 65 | fMem, err := os.Create("./mem_" + *generateProfileFor + ".prof") 66 | if err != nil { 67 | log.Println("could not create memory profile: ", err) 68 | 69 | return 70 | } 71 | defer fMem.Close() 72 | runtime.GC() // get up-to-date statistics 73 | if err := pprof.WriteHeapProfile(fMem); err != nil { 74 | log.Println("could not write memory profile: ", err) 75 | 76 | return 77 | } 78 | } 79 | 80 | // setUpTmpCsvFile creates a CSV file in the OS's temp directory, like `/tmp/bigcsvreder_-.csv` . 81 | // The file will have the provided number of rows. 
82 | // Rows look like: 83 | // 84 | // 1,Product_1,"Lorem ipsum...",150.99,35\n 85 | // 2,Product_2,"Lorem ipsum...",150.99,35\n 86 | // , Product_, static text: Lorem ipsum..., static price: 150.99, static stock qty: 35 EOL 87 | func setUpTmpCsvFile(rowsCount int64) (string, error) { 88 | filePattern := "bigcsvreader_" + strconv.FormatInt(rowsCount, 10) + "-*.csv" 89 | f, err := os.CreateTemp("", filePattern) 90 | if err != nil { 91 | return "", err 92 | } 93 | fName := f.Name() 94 | 95 | var id int64 96 | buf := make([]byte, 0, 1280) 97 | bufLenConst := 4 + 2 + 1 + len(colValueNamePrefix) + len(colValueDescription) + len(colValuePrice) + len(colValueStock) // 4 x comma, 2 x quote, 1 x \n, 98 | for id = 1; id <= rowsCount; id++ { 99 | buf = buf[0:0:1280] 100 | idStr := strconv.FormatInt(id, 10) 101 | buf = append(buf, idStr...) 102 | buf = append(buf, ',') 103 | buf = append(buf, colValueNamePrefix...) 104 | buf = append(buf, idStr...) 105 | buf = append(buf, `,"`...) 106 | buf = append(buf, colValueDescription...) 107 | buf = append(buf, `",`...) 108 | buf = append(buf, colValuePrice...) 109 | buf = append(buf, ',') 110 | buf = append(buf, colValueStock...) 111 | buf = append(buf, "\n"...) 112 | bufLen := bufLenConst + 2*len(idStr) 113 | _, err := f.Write(buf[:bufLen]) 114 | if err != nil { 115 | _ = f.Close() 116 | tearDownTmpCsvFile(fName) 117 | 118 | return "", err 119 | } 120 | } 121 | 122 | _ = f.Close() 123 | 124 | return fName, nil 125 | } 126 | 127 | // tearDownTmpCsvFile deletes the file provided. 128 | func tearDownTmpCsvFile(filePath string) { 129 | _ = os.Remove(filePath) 130 | } 131 | 132 | const ( 133 | colValueNamePrefix = "Product_" 134 | colValueDescription = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. 
Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean" 135 | colValuePrice = "150.99" 136 | colValueStock = "35" 137 | ) 138 | 139 | func goStandardCsvReaderReadOneByOne(fName string) { 140 | var count int64 141 | f, err := os.Open(fName) 142 | if err != nil { 143 | log.Fatal("could not open CSV file", err) 144 | } 145 | defer f.Close() 146 | subject := csv.NewReader(f) 147 | subject.FieldsPerRecord = 5 148 | subject.ReuseRecord = true 149 | subject.Comma = ',' 150 | for { 151 | record, err := subject.Read() 152 | if err != nil { 153 | if err == io.EOF { 154 | break 155 | } 156 | log.Println("Read error: ", err) 157 | } else { // "consume" row 158 | count++ 159 | _ = record 160 | } 161 | } 162 | log.Println("Rows Count: ", count) 163 | } 164 | 165 | func goStandardCsvReaderReadAll(fName string) { 166 | var count int64 167 | f, err := os.Open(fName) 168 | if err != nil { 169 | log.Fatal("could not open CSV file", err) 170 | } 171 | defer f.Close() 172 | subject := csv.NewReader(f) 173 | subject.FieldsPerRecord = 5 174 | subject.Comma = ',' 175 | rows, err := subject.ReadAll() 176 | if err != nil { 177 | log.Println("ReadAll error: ", err) 178 | } else { 179 | for _, record := range rows { // "consume" rows 180 | count++ 181 | _ = record 182 | } 183 | } 184 | log.Println("Rows Count: ", 
count) 185 | } 186 | 187 | func bigCsvReader(fName string) { 188 | subject := bigcsvreader.New() 189 | subject.SetFilePath(fName) 190 | subject.ColumnsCount = 5 191 | subject.MaxGoroutinesNo = 8 192 | ctx, cancelCtx := context.WithCancel(context.Background()) 193 | defer cancelCtx() 194 | var count int64 195 | 196 | rowsChans, errsChan := subject.Read(ctx) 197 | count = consumeBigCsvReaderResults(rowsChans, errsChan) 198 | log.Println("Rows Count: ", count) 199 | } 200 | 201 | // consumeBigCsvReaderResults just counts the records received from big csv reader. 202 | func consumeBigCsvReaderResults(rowsChans []bigcsvreader.RowsChan, errsChan bigcsvreader.ErrsChan) int64 { 203 | var ( 204 | count int64 205 | wg sync.WaitGroup 206 | ) 207 | 208 | for i := range rowsChans { 209 | wg.Add(1) 210 | go func(rowsChan bigcsvreader.RowsChan, waitGr *sync.WaitGroup) { 211 | var localCount int64 212 | for record := range rowsChan { 213 | localCount++ 214 | _ = record 215 | } 216 | atomic.AddInt64(&count, localCount) 217 | waitGr.Done() 218 | }(rowsChans[i], &wg) 219 | } 220 | 221 | wg.Add(1) 222 | go func(errsCh bigcsvreader.ErrsChan, waitGr *sync.WaitGroup) { 223 | for err := range errsCh { 224 | log.Println("Read error: ", err) 225 | } 226 | waitGr.Done() 227 | }(errsChan, &wg) 228 | 229 | wg.Wait() 230 | 231 | return count 232 | } 233 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 5 | 6 | // Package bigcsvreader offers a multi-threaded approach for reading a large CSV file 7 | // in order to improve the time of reading and processing it. 8 | // It spawns multiple goroutines, each reading a piece of the file. 
9 | // Read rows are put into channels equal in number to the spawned goroutines, 10 | // in this way also the processing of those rows can be parallelized. 11 | package bigcsvreader 12 | -------------------------------------------------------------------------------- /docs/how-it-works.drawio: -------------------------------------------------------------------------------- 1 | 7V1tc6M4Ev41rrr9EBcgwPAxL5Ns1c3dZme29mY+EizbzGDkA5w48+tXgIRBLRvZAWwneKomIEDgbvXT/bRaeIRul5uH2Fst/kOmOBwZ2nQzQncjw9B1x6Z/spbXosV2zKJhHgdTdtK24WvwC7NGjbWugylOaiemhIRpsKo3+iSKsJ/W2rw4Ji/102YkrN915c0xaPjqeyFs/V8wTRdFq2Np2/bfcTBf8DvrGjuy9PjJrCFZeFPyUmlCn0boNiYkLbaWm1scZsLjcimuu99xtHywGEepygV/Ot9+/PEt+nvx43m2+fXwZfEz/X5lsW6evXDNvvFNML9Nnr9gb4pj9uTpKxcH/RKrbHO9DO9jb0k3b14WQYq/rjw/a3+ho4C2LdJlSPd0ujn1kgWesp0ZiVKmY93MrmUi1emz06u4IFG2NwvC8JaEJM7vjO7zD22H35t/BxyneFNpYnJ4wGSJ0/iVnsKOcpWwMcmV+bJV8MRhbYuKci0+KD02qOZlz1u50w0m+kPUYAE1XK9WYeB7aUCid6oFR2tUQwkD/agBGsPIsMM0k1/wXFOB/f91Zrc3ta159pdCDr33gplPcTVtyDsozjiyz6Groauhq5N1xa94isUWsWcBrBuQmWP5cjPPIqjxk5cE/nhK/PUyh9YbLwzmET3Bp7sUUtBN6D3h8JEkQe4bKgcy6KUuI/wsnLAMptPsacoTrlmX5QEB5WezmeH72cOlMfmJK0em9pNt2e3g/6QO/7oJ8b9sq+K/0Rn+TyT4L2gzDCJcCoZHhLogwojkJ3HNhXiW7pN+QgdHEM3/InRo3F3p25bP+YV3aNvyhfvmfHCnXuo95U+mScZFXJx7syJBlOaism5G1l3eEqe3JKJfwgty9WEvSV9wkoqRQgtaNic1LSNdh1qWBVtORzp2gI6p5vDYT56Bruk3TAVjrdkDU/MezQPTBLqXoUNM1tE014EmidbyffaQRgsK0jVBQzwYrWjIligIdWWF9gRoAk8pKWO7dOwuyJxEXvhp2yrIbHvOZ5JbVaa8HzhNX5kkvXVK6qqlAoxfv7Hr853v2c7Y4rt3m+rBu1eJegy2XwFMLf/sM6WErGMf74tKmaBTL57jdB98sR4zae3VfIxDSiye6xRXpkd26WOGIBWTdgTkts2xI9hr8bDsQmFAlE/yhkhdA2b8QOgYSDN4NjQdjqAwDFaJAlMC5lZ3jVMLO1NT5hod4wnZLblGZIu+0S4lXDFLV2aWqCtyJGNHopSj6XWWdMmgL/QSGsnUhduMbDLTwZsgLS2TblcMk+5t7TLb4Wa5NeexfphB53uPOA6o2DLsvuvJes0TWa9hOeOJVe+l+FbAekFfyBKRwAJ9dY0EBhiWWe5K6kLywDgbhjgJflUip8oIVffcqqMXDp295sUSmezpRmWuSgE8lAfG2wIo91LdM+rEPRuqBs4z3+0Z+Jv0aOx3oRJwv2wX6qISmk7lQo3BhXZqYZxr9u1CkYZac6GUj/XtQvkovHQXalyKCy0nCfv1oT34QqRoqa51Xr4Q7fWF/31nvhDpk/GpXaF9kCtkebVaclKY3nz0Ugo3Ud5iaAgkaY2jnedODTSO3op0LdlMJmt7qzvSRHc
kpOZUfZHYkWEKHXXtiFzoiLIKDjD1crvwKMSH0kxPXceHmmd9CBQfyQSJlf2Tma2df9oxWxvMkMAU0ERqtGNr9xh6k9kiBbPlM1r+Og5fb2LP/5m5giZNbNWW64XC6+9qOmo20wMKQwxxtgLCpCGTeFezFaYCY3hH8pbNAcrkbdogRm5P5DAWACIfZgEPVLRIciZQz73OAprmoOSup3qpA5NEmf2qGdbVDWpuWc26g06sZIWyjffkIx1XNhcodZMwldSe1GGsDqQ+5DKPz5DwSLvrXCYSSAYdSGPjyFymYTn1vnR3bIuddcwhLTitcZHJTG5fZ5/MtGAo9RgTHyc13i6pscwEdJXkErvOmLyx2oxklZdFCmAny9cPY/n7anQ6LHZ3NauO4qYpm46S+U0xCdOe5g7LwZ0LhB8I4MdjNR/ZjVjN81ZdY7UtLFzRJ2gsph5UsdoR50dtNNaFbG/XUA0Dt8d1dtmFQTW3o/OHalj8OkB1M1TLA+5+ofoyo+3+oNpRhGpOVvuG6jysttuBarAAomOctmFIfZk4fSkhtQ2z/gNON+K0bLKmV5C2jQGk94E0H9bN1SGtV0qqgTQy9dbiaYSEx+kapOG01UWCNDei8wdpmPf4l/8bbaD/jCv6H1WIdu2n9yR+IAQmoM5vddlRy5cOWO9v1i3EsiWTgDwn0stKswn0s9zBZnJQcrHaDhf7x78r64OL3nYsEI4XZPm0ThR8LlilO/M1MG7yI9z7VrSrdatd0xXyCZJiFSTxxt0pF3rjVwyjnua6Swkq7pj/eyJpSpa5xyy9PFnhqGhhinD2auw46+T+Xz/G/x+4ouJ4/z9R9f/K+TQ29q60sWPyx+DoUuy9MUKwxFFtq81mUO17r5XT2Izt7vuImT12n13Rhg2yd7Xz6UbxBMLV/HHIbJbgTsKQCfSKEXnfRrc1tO9VO2uah7SqZqef3ug4VWo0OjbkrlqyMDHbprmqi4lhX6aYH4d9Ncbz3duIA/Mpn6CJqFaYe8mqePHZLNhkY7ipzP/+voGaAzspr2glTKir24UxoKySwegqSnAkqZZ1sgDqOL9o/agVHAcUH2tiPAfLfHp9LYQDCW5TcThMv7zj4nAXlXUbp6sPd6D7HxjV0VApatiSabhXUuXAgsqBVJ2UVHGLa2/mq3dSlS15VawSeyuvqtxKlVrBS07HrhxYsfOx2JWuaH11dqWdgfWpLk3vmF1ZlmqVTyO7oi2gr3NgV7Cq6MOyK4NPUpyMXkkqjgZ6NZLQK8eW1YP2yrBcGLw3MSy4Mv79Miz5qvmeCZYrW9E0EKxWCBbSXaDeXtmVC8O7gV2dlF1xc2uvZGXLrmxzUmdXZjshn/hyImSKaNQWudJ33Wknt7IarjgdtXJh2DhQq1NSK3XTcxRNr11qBWLtiQuKx5SpFRLswoR9nQG1cmE0/2GpFXJgLNgrtXLhkoCBWsmoleyV873yKp2/pWhfYfDR1df7pNlh9bWYDTJlr26RRc9ud2I2IDxRvxrvY7Afhr2atnMG84O6JitTHlALopYpqY7tGbUUXp5y/HKGfeJ82xse+uKGJaw3RqglMp3JCwPFKS4kBimqix4mAgMs4aS3n4H4wMl+y6xHpKak4rrXiFTXul5fzN15W7HTmaxcM23omHtduaZrw/riBqxXXWCsjvUtg7jpumP70OlYjuOO2JehmDNsDcklv+hzkevXSlM64QI2urv9XeFCQ9tfZ0af/gE= -------------------------------------------------------------------------------- /docs/how-it-works.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
BigCsvReader
BigCsvReader
Application
Application
row/header
row
row
row
row
row
row
row
row
row
row
row
row

row/header...
file.csv
file.csv
Goroutine 1
Goroutine 1
Read
Read
Goroutine 2
Goroutine 2
Read
Read
Goroutine N
Goroutine N
Rows
Channel 1
Rows...
Read
Read
Process
Rows
Process...
Pull
Pull
Process
Rows
Process...
Pull
Pull
Process
Rows
Process...
Pull
Pull
(c) 2022-2023 ActForGood
(c) 2022-2023 ActForGood
OK
OK
yes
yes
no
no
E
E
Push
Push
Rows
Channel 2
Rows...
OK
OK
yes
yes
no
no
E
E
Push
Push
Rows
Channel N
Rows...
OK
OK
yes
yes
no
no
E
E
Push
Push
Errors
Channel
Errors...
Push
Push
E
E
Process
Errors
Process...
Pull
Pull
Text is not SVG - cannot display
-------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 5 | 6 | package bigcsvreader_test 7 | 8 | import ( 9 | "context" 10 | "fmt" 11 | "strconv" 12 | "sync" 13 | 14 | "github.com/actforgood/bigcsvreader" 15 | ) 16 | 17 | const ( 18 | columnProductID = iota 19 | columnProductName 20 | columnProductDescription 21 | columnProductPrice 22 | columnProductQty 23 | ) 24 | 25 | const noOfColumns = 5 26 | 27 | type Product struct { 28 | ID int 29 | Name string 30 | Desc string 31 | Price float64 32 | Qty int 33 | } 34 | 35 | func ExampleCsvReader() { 36 | // initialize the big csv reader 37 | bigCSV := bigcsvreader.New() 38 | bigCSV.SetFilePath("testdata/example_products.csv") 39 | bigCSV.ColumnsCount = noOfColumns 40 | bigCSV.MaxGoroutinesNo = 16 41 | 42 | ctx, cancelCtx := context.WithCancel(context.Background()) 43 | defer cancelCtx() 44 | var wg sync.WaitGroup 45 | 46 | // start multi-thread reading 47 | rowsChans, errsChan := bigCSV.Read(ctx) 48 | 49 | // process rows and errors: 50 | 51 | for i := range rowsChans { 52 | wg.Add(1) 53 | go rowWorker(rowsChans[i], &wg) 54 | } 55 | 56 | wg.Add(1) 57 | go errWorker(errsChan, &wg) 58 | 59 | wg.Wait() 60 | 61 | // Unordered output: 62 | // {ID:1 Name:Apple iPhone 13 Desc:Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. 
Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean Price:1025.99 Qty:100} 63 | // {ID:2 Name:Samsung Galaxy S22 Desc:Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean Price:400.99 Qty:12} 64 | // {ID:3 Name:Apple MacBook Air Desc:Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. 
Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean Price:700.99 Qty:34} 65 | // {ID:4 Name:Lenovo ThinkPad X1 Desc:Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. 
Integer scelerisque tempor mauris, ac posuere sem aenean Price:550.99 Qty:90} 66 | // {ID:5 Name:Logitech Mouse G203 Desc:Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean Price:30.5 Qty:35} 67 | } 68 | 69 | func rowWorker(rowsChan bigcsvreader.RowsChan, waitGr *sync.WaitGroup) { 70 | for row := range rowsChan { 71 | processRow(row) 72 | } 73 | waitGr.Done() 74 | } 75 | 76 | func errWorker(errsChan bigcsvreader.ErrsChan, waitGr *sync.WaitGroup) { 77 | for err := range errsChan { 78 | handleError(err) 79 | } 80 | waitGr.Done() 81 | } 82 | 83 | // processRow can be used to implement business logic 84 | // like validation / converting to a struct / persisting row into a storage. 
85 | func processRow(row []string) { 86 | id, _ := strconv.Atoi(row[columnProductID]) 87 | price, _ := strconv.ParseFloat(row[columnProductPrice], 64) 88 | qty, _ := strconv.Atoi(row[columnProductQty]) 89 | name := row[columnProductName] 90 | desc := row[columnProductDescription] 91 | 92 | product := Product{ 93 | ID: id, 94 | Name: name, 95 | Desc: desc, 96 | Price: price, 97 | Qty: qty, 98 | } 99 | 100 | fmt.Printf("%+v\n", product) 101 | } 102 | 103 | // handleError handles the error. 104 | // errors can be fatal like file does not exist, or row related like a given row could not be parsed, etc... 105 | func handleError(err error) { 106 | fmt.Println(err) 107 | } 108 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/actforgood/bigcsvreader 2 | 3 | go 1.23 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/actforgood/bigcsvreader/ba0745bcdccc752ef3a48bd0c1761c83dcf7682c/go.sum -------------------------------------------------------------------------------- /internal/helper.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 5 | 6 | package internal 7 | 8 | // ComputeGoroutineOffsets computes how many goroutines will handle totalBytes, and their [start, end] offset intervals. 9 | // First arguments represents the total bytes to be handled. 10 | // Second argument represents the maximum goroutines that will handle total bytes. 11 | // Third argument represents the minimum bytes a goroutine should handle. 
12 | // It returns a slice (up to maxGoroutines in length) of [start, end] intervals each goroutine should handle. 13 | func ComputeGoroutineOffsets(totalBytes, maxGoroutines, minBytesReadByAGoroutine int) [][2]int { 14 | // make some checks 15 | if totalBytes <= 0 { 16 | return nil 17 | } 18 | if minBytesReadByAGoroutine <= 0 { 19 | minBytesReadByAGoroutine = 1 20 | } 21 | if maxGoroutines <= 0 { 22 | maxGoroutines = 1 23 | } 24 | 25 | // skip rest of computations and return immediately if total bytes < min bytes, 26 | // it means we only have 1 goroutine reading total bytes. 27 | if totalBytes <= minBytesReadByAGoroutine { 28 | return [][2]int{{0, totalBytes - 1}} 29 | } 30 | 31 | totalGoroutines := totalBytes / minBytesReadByAGoroutine 32 | if totalGoroutines == 1 { 33 | return [][2]int{{0, totalBytes - 1}} 34 | } 35 | if totalGoroutines > maxGoroutines { 36 | totalGoroutines = maxGoroutines 37 | } 38 | bytesPerGoroutine := totalBytes / totalGoroutines 39 | distribution := make([][2]int, totalGoroutines) 40 | start, end := 0, bytesPerGoroutine-1 41 | for goroutineNo := range totalGoroutines - 1 { 42 | distribution[goroutineNo] = [2]int{start, end} 43 | start = end + 1 44 | end += bytesPerGoroutine 45 | } 46 | distribution[totalGoroutines-1] = [2]int{start, totalBytes - 1} 47 | 48 | return distribution 49 | } 50 | -------------------------------------------------------------------------------- /internal/helper_test.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 
5 | 6 | package internal_test 7 | 8 | import ( 9 | "reflect" 10 | "testing" 11 | 12 | "github.com/actforgood/bigcsvreader/internal" 13 | ) 14 | 15 | func TestComputeGoroutineOffsets(t *testing.T) { 16 | t.Parallel() 17 | 18 | // arrange 19 | tests := [...]struct { 20 | name string 21 | inputTotalBytes int 22 | inputMaxGoroutines int 23 | inputMinBytesReadByAGoroutine int 24 | expectedResult [][2]int 25 | }{ 26 | { 27 | name: "even distribution, min bytes 0 is counted as 1", 28 | inputTotalBytes: 18, 29 | inputMaxGoroutines: 3, 30 | inputMinBytesReadByAGoroutine: 0, // this is adjusted to 1 31 | expectedResult: [][2]int{{0, 5}, {6, 11}, {12, 17}}, 32 | }, 33 | { 34 | name: "even distribution, min bytes 1", 35 | inputTotalBytes: 18, 36 | inputMaxGoroutines: 3, 37 | inputMinBytesReadByAGoroutine: 1, 38 | expectedResult: [][2]int{{0, 5}, {6, 11}, {12, 17}}, 39 | }, 40 | { 41 | name: "even distribution, min bytes 3", 42 | inputTotalBytes: 18, 43 | inputMaxGoroutines: 3, 44 | inputMinBytesReadByAGoroutine: 3, 45 | expectedResult: [][2]int{{0, 5}, {6, 11}, {12, 17}}, 46 | }, 47 | { 48 | name: "even distribution, min bytes is maximum to reach max goroutines", 49 | inputTotalBytes: 18, 50 | inputMaxGoroutines: 3, 51 | inputMinBytesReadByAGoroutine: 6, // 6 x 3 = 18 52 | expectedResult: [][2]int{{0, 5}, {6, 11}, {12, 17}}, 53 | }, 54 | { 55 | name: "max goroutines is not reached, even distribution 1", 56 | inputTotalBytes: 18, 57 | inputMaxGoroutines: 3, 58 | inputMinBytesReadByAGoroutine: 6 + 1, 59 | expectedResult: [][2]int{{0, 8}, {9, 17}}, 60 | }, 61 | { 62 | name: "max goroutines is not reached, even distribution 2", 63 | inputTotalBytes: 18, 64 | inputMaxGoroutines: 3, 65 | inputMinBytesReadByAGoroutine: 6 + 3, 66 | expectedResult: [][2]int{{0, 8}, {9, 17}}, 67 | }, 68 | { 69 | name: "max goroutines is not reached, with reminder", 70 | inputTotalBytes: 19, 71 | inputMaxGoroutines: 3, 72 | inputMinBytesReadByAGoroutine: 9, 73 | expectedResult: [][2]int{{0, 8}, {9, 17 + 
1}}, 74 | }, 75 | { 76 | name: "max goroutines is not reached, total bytes is < min bytes", 77 | inputTotalBytes: 18, 78 | inputMaxGoroutines: 3, 79 | inputMinBytesReadByAGoroutine: 19, 80 | expectedResult: [][2]int{{0, 17}}, 81 | }, 82 | { 83 | name: "max goroutines is not reached, total bytes is = min bytes", 84 | inputTotalBytes: 18, 85 | inputMaxGoroutines: 3, 86 | inputMinBytesReadByAGoroutine: 18, 87 | expectedResult: [][2]int{{0, 17}}, 88 | }, 89 | { 90 | name: "max goroutines is not reached, total bytes is a little bigger than min bytes", 91 | inputTotalBytes: 18, 92 | inputMaxGoroutines: 3, 93 | inputMinBytesReadByAGoroutine: 17, 94 | expectedResult: [][2]int{{0, 17}}, 95 | }, 96 | { 97 | name: "reminder gets to last goroutine", 98 | inputTotalBytes: 20, 99 | inputMaxGoroutines: 3, 100 | inputMinBytesReadByAGoroutine: 1, 101 | expectedResult: [][2]int{{0, 5}, {6, 11}, {12, 17 + 2}}, 102 | }, 103 | { 104 | name: "0 total bytes returns nil result", 105 | inputTotalBytes: 0, 106 | inputMaxGoroutines: 3, 107 | inputMinBytesReadByAGoroutine: 10, 108 | expectedResult: nil, 109 | }, 110 | { 111 | name: "0 max goroutines is counted as 1 goroutine", 112 | inputTotalBytes: 10, 113 | inputMaxGoroutines: 0, 114 | inputMinBytesReadByAGoroutine: 10, 115 | expectedResult: [][2]int{{0, 9}}, 116 | }, 117 | } 118 | 119 | for _, testData := range tests { 120 | t.Run(testData.name, func(t *testing.T) { 121 | // act 122 | result := internal.ComputeGoroutineOffsets( 123 | testData.inputTotalBytes, 124 | testData.inputMaxGoroutines, 125 | testData.inputMinBytesReadByAGoroutine, 126 | ) 127 | 128 | // assert 129 | if !reflect.DeepEqual(result, testData.expectedResult) { 130 | t.Errorf("expected %v, but got %v | %s", testData.expectedResult, result, testData.name) 131 | } 132 | }) 133 | } 134 | } 135 | 136 | func BenchmarkComputeGoroutineOffsets_1(b *testing.B) { 137 | b.ReportAllocs() 138 | for range b.N { 139 | _ = internal.ComputeGoroutineOffsets(1024, 32, 1) 140 | } 141 | } 
// Logger receives diagnostic key/value pairs emitted while a CSV file is processed.
// Implementations decide where (and whether) the pairs are written.
type Logger interface {
	// Debug logs debug information, given as alternating key/value pairs.
	Debug(keyValues ...any)
	// Error logs any error occurred, given as alternating key/value pairs.
	Error(keyValues ...any)
}

// NopLogger is a [Logger] implementation that discards everything.
// It is the default logger used by the reader when none is configured.
type NopLogger struct{}

// Debug does nothing.
func (NopLogger) Debug(...any) {}

// Error does nothing.
func (NopLogger) Error(...any) {}
5 | 6 | package internal_test 7 | 8 | import ( 9 | "testing" 10 | 11 | "github.com/actforgood/bigcsvreader/internal" 12 | ) 13 | 14 | func init() { 15 | var _ internal.Logger = (*internal.NopLogger)(nil) // ensure NopLogger is a Logger 16 | } 17 | 18 | func TestNopLogger(t *testing.T) { 19 | // Note: this test is more for coverage, does not test anything after all. 20 | t.Parallel() 21 | 22 | // arrange 23 | subject := internal.NopLogger{} 24 | 25 | // act 26 | subject.Error("foo", "bar", "abc", 123) 27 | subject.Debug("foo", "bar", "abc", 123, "err", "some error") 28 | } 29 | -------------------------------------------------------------------------------- /reader.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 5 | 6 | package bigcsvreader 7 | 8 | import ( 9 | "bufio" 10 | "bytes" 11 | "context" 12 | "encoding/csv" 13 | "errors" 14 | "fmt" 15 | "io" 16 | "os" 17 | "path" 18 | "runtime" 19 | "sync" 20 | 21 | "github.com/actforgood/bigcsvreader/internal" 22 | ) 23 | 24 | const ( 25 | chanSize = 256 26 | minBytesToReadByAGoroutine = 2048 27 | ) 28 | 29 | // ErrEmptyFile is an error returned if CSV file is empty. 30 | var ErrEmptyFile = errors.New("empty csv file") 31 | 32 | // RowsChan is the channel where read rows will be pushed into. 33 | // Has a buffer of 256 entries. 34 | type RowsChan <-chan []string 35 | 36 | // ErrsChan is the channel where error(s) will be pushed in case 37 | // an error occurs during file read. Has a buffer of 256 entries. 38 | // Some errors can be fatal, like file does not exist, some errors like 39 | // rows parsing may occur for each affected row. 40 | type ErrsChan <-chan error 41 | 42 | // CsvReader reads async rows from a CSV file. 
43 | // It does that by initializing multiple goroutines, each of them handling 44 | // a chunk of data from the file. 45 | type CsvReader struct { 46 | // MaxGoroutinesNo is the maximum goroutines to start parsing the CSV file. 47 | // Minimum required bytes to start a new goroutine is 2048 bytes. 48 | // Defaults to [runtime.NumCPU]. 49 | MaxGoroutinesNo int 50 | // FileHasHeader is a flag indicating if file's first row is the header (columns names). 51 | // If so, the header line is disregarded and not returned as a row. 52 | // Defaults to false. 53 | FileHasHeader bool 54 | // ColumnsCount is the number of columns the CSV file has. 55 | ColumnsCount int 56 | // ColumnsDelimiter is the delimiter char between columns. Defaults to comma. 57 | ColumnsDelimiter rune 58 | // BufferSize is used internally for [bufio.Reader] size. Has a default value of 4096. 59 | // If you have lines bigger than this value, adjust it not to get "buffer full" error. 60 | BufferSize int 61 | // Logger can be set to perform some debugging/error logging. 62 | // Defaults to a no-operation logger (no log is performed). 63 | // You can enable logging by passing a logger that implements [internal.Logger] contract. 64 | Logger internal.Logger 65 | // filePath is the CSV file path. 66 | filePath string 67 | // fileBaseName is the base name of the file extracted from filePath. 68 | // Is used in logging. 69 | fileBaseName string 70 | // LazyQuotes is a flag used to allow quotes in an unquoted field and non-doubled quotes 71 | // in a quoted field 72 | LazyQuotes bool 73 | } 74 | 75 | // New instantiates a new CsvReader object with some default fields preset. 76 | func New() *CsvReader { 77 | return &CsvReader{ 78 | MaxGoroutinesNo: runtime.NumCPU(), 79 | ColumnsDelimiter: ',', 80 | Logger: internal.NopLogger{}, 81 | BufferSize: 4096, 82 | } 83 | } 84 | 85 | // SetFilePath sets the CSV file path. 
86 | func (cr *CsvReader) SetFilePath(csvFilePath string) { 87 | cr.filePath = csvFilePath 88 | cr.fileBaseName = path.Base(csvFilePath) 89 | } 90 | 91 | // Read extracts asynchronously CSV rows, each started goroutine putting them into a RowsChan. 92 | // Error(s) occurred during parsing are sent through ErrsChan. 93 | func (cr *CsvReader) Read(ctx context.Context) ([]RowsChan, ErrsChan) { 94 | cr.Logger.Debug( 95 | "msg", "starting file reading", 96 | "filePath", cr.filePath, 97 | "fileColumnsCount", cr.ColumnsCount, 98 | "fileHasHeader", cr.FileHasHeader, 99 | "maxThreads", cr.MaxGoroutinesNo, 100 | ) 101 | 102 | errsChan := make(chan error, chanSize) 103 | fileSize, err := cr.getFileSize() 104 | if err != nil { 105 | errsChan <- fmt.Errorf( 106 | "bigcsvreader: file size error (%w)", 107 | err, 108 | ) 109 | close(errsChan) 110 | cr.Logger.Error( 111 | "msg", "file size error", 112 | "err", err, 113 | "file", cr.fileBaseName, 114 | ) 115 | 116 | return nil, errsChan 117 | } 118 | 119 | threadsInfo := internal.ComputeGoroutineOffsets(fileSize, cr.MaxGoroutinesNo, minBytesToReadByAGoroutine) 120 | totalThreads := len(threadsInfo) 121 | cr.Logger.Debug( 122 | "msg", "stats", 123 | "file", cr.fileBaseName, "fileSize", fileSize, 124 | "totalThreads", totalThreads, "initialOffsetsDistribution", threadsInfo, 125 | ) 126 | 127 | rowsChans := make([]RowsChan, totalThreads) 128 | rowsChs := make([]chan<- []string, totalThreads) 129 | for i := range totalThreads { 130 | rowsChan := make(chan []string, chanSize) 131 | rowsChans[i] = rowsChan 132 | rowsChs[i] = rowsChan 133 | } 134 | 135 | go cr.readAsync(ctx, threadsInfo, rowsChs, errsChan) 136 | 137 | return rowsChans, errsChan 138 | } 139 | 140 | func (cr *CsvReader) readAsync( 141 | ctx context.Context, 142 | threadsInfo [][2]int, 143 | rowsChans []chan<- []string, 144 | errsChan chan<- error, 145 | ) { 146 | defer func() { 147 | close(errsChan) 148 | for i := range rowsChans { 149 | close(rowsChans[i]) 150 | } 151 | 
}() 152 | totalThreads := len(threadsInfo) 153 | 154 | // create a wait group pool as we need to wait for all goroutines to terminate. 155 | var wg sync.WaitGroup 156 | wg.Add(totalThreads) 157 | worker := cr.readBetweenOffsetsAsync 158 | for thread := range totalThreads { 159 | go worker( 160 | ctx, 161 | thread+1, 162 | threadsInfo[thread][0], // start offset 163 | threadsInfo[thread][1], // end offset 164 | &wg, 165 | rowsChans[thread], 166 | errsChan, 167 | ) 168 | } 169 | wg.Wait() 170 | 171 | cr.Logger.Debug("msg", "finished file reading", "file", cr.fileBaseName) 172 | } 173 | 174 | // readBetweenOffsetsAsync reads the piece of file allocated to a given thread. 175 | func (cr *CsvReader) readBetweenOffsetsAsync( 176 | ctx context.Context, 177 | currentThreadNo, offsetStart, offsetEnd int, 178 | wg *sync.WaitGroup, 179 | rowsChan chan<- []string, 180 | errsChan chan<- error, 181 | ) { 182 | defer wg.Done() 183 | 184 | f := cr.openFile(currentThreadNo, errsChan) 185 | if f == nil { 186 | return 187 | } 188 | defer f.Close() 189 | 190 | var line []byte 191 | 192 | // move offset to startOffset and skip the whole line. 
193 | r := bufio.NewReaderSize(f, cr.BufferSize) 194 | _, _ = f.Seek(int64(offsetStart), io.SeekStart) 195 | if currentThreadNo != 1 || cr.FileHasHeader { 196 | line = cr.readLine(r, currentThreadNo, offsetStart, errsChan) 197 | if line == nil { 198 | return 199 | } 200 | } 201 | realOffsetStart := offsetStart + len(line) 202 | currentOffsetPos := realOffsetStart 203 | 204 | bytesReader := bytes.NewReader(line) 205 | csvReader := csv.NewReader(bytesReader) 206 | csvReader.Comma = cr.ColumnsDelimiter 207 | csvReader.FieldsPerRecord = cr.ColumnsCount 208 | csvReader.LazyQuotes = cr.LazyQuotes 209 | 210 | ForLoop: 211 | for { 212 | select { 213 | case <-ctx.Done(): 214 | if ctx.Err() != nil { 215 | errsChan <- fmt.Errorf( 216 | "bigcsvreader: thread #%d received context error (%w)", 217 | currentThreadNo, ctx.Err(), 218 | ) 219 | } 220 | 221 | return 222 | default: 223 | line = cr.readLine(r, currentThreadNo, currentOffsetPos, errsChan) 224 | if line == nil { 225 | break ForLoop 226 | } 227 | 228 | // pass read line through standard go CSV reader. 229 | bytesReader.Reset(line) 230 | record, err := csvReader.Read() 231 | if err != nil { 232 | errsChan <- fmt.Errorf( 233 | "bigcsvreader: thread #%d could not parse row at offset %d (%w)", 234 | currentThreadNo, currentOffsetPos, err, 235 | ) 236 | cr.Logger.Error( 237 | "msg", "could not parse row", "err", err, 238 | "file", cr.fileBaseName, "thread", currentThreadNo, 239 | "offset", currentOffsetPos, "row", string(line), 240 | ) 241 | } else { 242 | rowsChan <- record 243 | } 244 | 245 | currentOffsetPos += len(line) 246 | if currentOffsetPos-1 > offsetEnd { 247 | break ForLoop // next thread will handle eventual next lines. 
248 | } 249 | } 250 | } 251 | 252 | cr.Logger.Debug( 253 | "msg", "done", 254 | "file", cr.fileBaseName, "thread", currentThreadNo, 255 | "offsetStart", offsetStart, "offsetEnd", offsetEnd, 256 | "realOffsetStart", realOffsetStart, "realOffsetEnd", currentOffsetPos-1, 257 | "bytesCount", currentOffsetPos-realOffsetStart, 258 | ) 259 | } 260 | 261 | // openFile returns the fd of CSV file or nil if the file could not be opened. 262 | func (cr *CsvReader) openFile(thread int, errsChan chan<- error) *os.File { 263 | f, err := os.Open(cr.filePath) 264 | if err == nil { 265 | return f 266 | } 267 | 268 | errsChan <- fmt.Errorf( 269 | "bigcsvreader: thread #%d could not open file (%w)", 270 | thread, err, 271 | ) 272 | cr.Logger.Error( 273 | "msg", "could not open file", "err", err, 274 | "file", cr.fileBaseName, "thread", thread, 275 | ) 276 | 277 | return nil 278 | } 279 | 280 | // readLine reads returns a row from file, or nil if something bad happens or [io.EOF] is encountered. 281 | func (cr *CsvReader) readLine(r *bufio.Reader, thread, offsetPos int, errsChan chan<- error) []byte { 282 | // did not use [bufio.Reader.ReadLine] as it disregards end line delimiter(s) (\n / \r\n) 283 | // and we need the whole line length in advancing offset. 284 | // [bufio.Reader.ReadSlice] also has the advantage of returning the subslice of buffered bytes, 285 | // without allocating another slice. 
286 | line, err := r.ReadSlice('\n') 287 | if err == nil { 288 | return line 289 | } 290 | if err == io.EOF { 291 | if len(line) != 0 { 292 | return line 293 | } 294 | } else { 295 | errsChan <- fmt.Errorf( 296 | "bigcsvreader: thread #%d could not read line at offset %d (%w)", 297 | thread, offsetPos, err, 298 | ) 299 | cr.Logger.Error( 300 | "msg", "could not read line", "err", err, 301 | "file", cr.fileBaseName, "thread", thread, 302 | "offset", offsetPos, 303 | ) 304 | } 305 | 306 | return nil 307 | } 308 | 309 | // getFileSize returns file's size as each goroutine will 310 | // read approx. fileSize/totalGoroutines bytes. 311 | func (cr *CsvReader) getFileSize() (int, error) { 312 | fileInfo, err := os.Stat(cr.filePath) 313 | if err != nil { 314 | return 0, err 315 | } 316 | fileSize := int(fileInfo.Size()) 317 | if fileSize < 1 { 318 | return 0, ErrEmptyFile 319 | } 320 | 321 | return fileSize, nil 322 | } 323 | -------------------------------------------------------------------------------- /reader_test.go: -------------------------------------------------------------------------------- 1 | // Copyright The ActForGood Authors. 2 | // Use of this source code is governed by an MIT-style 3 | // license that can be found in the LICENSE file or at 4 | // https://github.com/actforgood/bigcsvreader/blob/main/LICENSE. 
5 | 6 | package bigcsvreader_test 7 | 8 | import ( 9 | "context" 10 | "encoding/csv" 11 | "errors" 12 | "io" 13 | "os" 14 | "reflect" 15 | "runtime" 16 | "strconv" 17 | "strings" 18 | "sync" 19 | "sync/atomic" 20 | "testing" 21 | "time" 22 | 23 | "github.com/actforgood/bigcsvreader" 24 | ) 25 | 26 | func TestCsvReader(t *testing.T) { 27 | t.Parallel() 28 | 29 | t.Run("csv file with header", testCsvReaderByHeader(true)) 30 | t.Run("csv file without header", testCsvReaderByHeader(false)) 31 | t.Run("empty file", testCsvReaderWithEmptyFile) 32 | t.Run("not found file", testCsvReaderWithNotFoundFile) 33 | t.Run("with 10k rows file", testCsvReaderWithDifferentFileSizesAndMaxGoroutines(1e4)) 34 | t.Run("with 100k rows file", testCsvReaderWithDifferentFileSizesAndMaxGoroutines(1e5)) 35 | t.Run("with 500k rows file", testCsvReaderWithDifferentFileSizesAndMaxGoroutines(5e5)) 36 | t.Run("context is canceled", testCsvReaderWithContextCanceled) 37 | t.Run("invalid row", testCsvReaderWithInvalidRow) 38 | t.Run("small buffer size", testCsvReaderWithSmallBufferSize) 39 | t.Run("quotes in unquoted field", testCsvReaderWithLazyQuotes) 40 | } 41 | 42 | func testCsvReaderByHeader(withHeader bool) func(t *testing.T) { 43 | return func(t *testing.T) { 44 | t.Parallel() 45 | 46 | // arrange 47 | subject := bigcsvreader.New() 48 | subject.ColumnsCount = 3 49 | if withHeader { 50 | subject.SetFilePath("testdata/file_with_header.csv") 51 | subject.FileHasHeader = true 52 | subject.ColumnsDelimiter = ';' 53 | } else { 54 | subject.SetFilePath("testdata/file_without_header.csv") 55 | } 56 | 57 | ctx, cancelCtx := context.WithTimeout(context.Background(), 15*time.Second) 58 | defer cancelCtx() 59 | expectedRecords := [][]string{ 60 | {"1", "John", "33"}, 61 | {"2", "Jane", "30"}, 62 | {"3", "Mike", "18"}, 63 | {"4", "Ronaldinho", "23"}, 64 | {"5", "Elisabeth", "45"}, 65 | } 66 | 67 | // act 68 | rowsChans, errsChan := subject.Read(ctx) 69 | records, err := gatherRecords(rowsChans, errsChan) 
70 | 71 | // assert 72 | assertNil(t, err) 73 | assertEqual(t, len(expectedRecords), len(records)) 74 | for _, expectedRecord := range expectedRecords { 75 | found := false 76 | for _, record := range records { 77 | if reflect.DeepEqual(expectedRecord, record) { 78 | found = true 79 | 80 | break 81 | } 82 | } 83 | if !found { 84 | t.Errorf("record '%v' was expected to be found, but was not", expectedRecord) 85 | } 86 | } 87 | } 88 | } 89 | 90 | func testCsvReaderWithEmptyFile(t *testing.T) { 91 | t.Parallel() 92 | 93 | // arrange 94 | subject := bigcsvreader.New() 95 | subject.SetFilePath("testdata/empty.csv") 96 | ctx, cancelCtx := context.WithTimeout(context.Background(), 15*time.Second) 97 | defer cancelCtx() 98 | expectedErr := bigcsvreader.ErrEmptyFile 99 | 100 | // act 101 | rowsChans, errsChan := subject.Read(ctx) 102 | records, err := gatherRecords(rowsChans, errsChan) 103 | 104 | // assert 105 | assertTrue(t, errors.Is(err, expectedErr)) 106 | assertNil(t, records) 107 | } 108 | 109 | func testCsvReaderWithNotFoundFile(t *testing.T) { 110 | t.Parallel() 111 | 112 | // arrange 113 | subject := bigcsvreader.New() 114 | subject.SetFilePath("testdata/this_file_does_not_exist.csv") 115 | ctx, cancelCtx := context.WithTimeout(context.Background(), 15*time.Second) 116 | defer cancelCtx() 117 | expectedErr := os.ErrNotExist 118 | 119 | // act 120 | rowsChans, errsChan := subject.Read(ctx) 121 | records, err := gatherRecords(rowsChans, errsChan) 122 | 123 | // assert 124 | assertTrue(t, errors.Is(err, expectedErr)) 125 | assertNil(t, records) 126 | } 127 | 128 | func testCsvReaderWithDifferentFileSizesAndMaxGoroutines(rowsCount int64) func(t *testing.T) { 129 | return func(t *testing.T) { 130 | t.Parallel() 131 | 132 | // arrange 133 | fName, err := setUpTmpCsvFile(t.TempDir(), rowsCount) 134 | if err != nil { 135 | t.Fatalf("prerequisite failed: could not generate CSV file: %v", err) 136 | } 137 | subject := bigcsvreader.New() 138 | subject.SetFilePath(fName) 139 
| subject.ColumnsCount = 5 140 | ctx, cancelCtx := context.WithCancel(context.Background()) 141 | defer cancelCtx() 142 | var sumIDs int64 143 | var wg sync.WaitGroup 144 | 145 | for maxGoroutines := 1; maxGoroutines <= 16; maxGoroutines++ { 146 | subject.MaxGoroutinesNo = maxGoroutines 147 | sumIDs = 0 148 | expectedSumIDs := rowsCount * (rowsCount + 1) / 2 149 | 150 | // act 151 | rowsChans, errsChan := subject.Read(ctx) 152 | 153 | // assert 154 | for i := range rowsChans { 155 | wg.Add(1) 156 | go func(rowsChan bigcsvreader.RowsChan, waitGr *sync.WaitGroup) { 157 | var localSumIDs int64 158 | for record := range rowsChan { 159 | if !assertEqual(t, 5, len(record)) { 160 | continue 161 | } 162 | id, _ := strconv.ParseInt(record[colID], 10, 64) 163 | localSumIDs += id 164 | expectedColName := colValueNamePrefix + record[colID] 165 | assertEqual(t, expectedColName, record[colName]) 166 | assertEqual(t, colValueDescription, record[colDescription]) 167 | assertEqual(t, colValuePrice, record[colPrice]) 168 | assertEqual(t, colValueStock, record[colStock]) 169 | } 170 | atomic.AddInt64(&sumIDs, localSumIDs) 171 | waitGr.Done() 172 | }(rowsChans[i], &wg) 173 | } 174 | for err := range errsChan { 175 | assertNil(t, err) 176 | } 177 | wg.Wait() 178 | assertEqual(t, expectedSumIDs, sumIDs) 179 | } 180 | } 181 | } 182 | 183 | func testCsvReaderWithLazyQuotes(t *testing.T) { 184 | t.Parallel() 185 | 186 | // arrange 187 | subject := bigcsvreader.New() 188 | subject.SetFilePath("testdata/file_with_quote_in_unquoted_field.csv") 189 | subject.ColumnsCount = 3 190 | subject.FileHasHeader = false 191 | subject.LazyQuotes = true 192 | 193 | expectedRecords := [][]string{ 194 | {"1", "John \"The Bomb\" Miguel", "33"}, 195 | {"2", "Jane", "30"}, 196 | {"3", "Mike", "18"}, 197 | {"4", "Ronaldinho", "23"}, 198 | {"5", "Elisabeth", "45"}, 199 | } 200 | 201 | ctx, cancelCtx := context.WithTimeout(context.Background(), 15*time.Second) 202 | defer cancelCtx() 203 | 204 | // act 205 | 
rowsChans, errsChan := subject.Read(ctx) 206 | records, err := gatherRecords(rowsChans, errsChan) 207 | 208 | // assert 209 | assertNil(t, err) 210 | assertEqual(t, len(expectedRecords), len(records)) 211 | for _, expectedRecord := range expectedRecords { 212 | found := false 213 | for _, record := range records { 214 | if reflect.DeepEqual(expectedRecord, record) { 215 | found = true 216 | 217 | break 218 | } 219 | } 220 | if !found { 221 | t.Errorf("record '%v' was expected to be found, but was not", expectedRecord) 222 | } 223 | } 224 | } 225 | 226 | func testCsvReaderWithInvalidRow(t *testing.T) { 227 | t.Parallel() 228 | 229 | // arrange 230 | subject := bigcsvreader.New() 231 | subject.SetFilePath("testdata/invalid_row.csv") 232 | subject.ColumnsCount = 3 233 | subject.FileHasHeader = true 234 | var expectedErr *csv.ParseError 235 | 236 | ctx, cancelCtx := context.WithTimeout(context.Background(), 15*time.Second) 237 | defer cancelCtx() 238 | 239 | // act 240 | rowsChans, errsChan := subject.Read(ctx) 241 | records, err := gatherRecords(rowsChans, errsChan) 242 | 243 | // assert 244 | assertTrue(t, errors.As(err, &expectedErr)) 245 | assertNil(t, records) 246 | } 247 | 248 | func testCsvReaderWithContextCanceled(t *testing.T) { 249 | t.Parallel() 250 | 251 | // arrange 252 | subject := bigcsvreader.New() 253 | subject.SetFilePath("testdata/file_without_header.csv") 254 | subject.ColumnsCount = 3 255 | expectedErr := context.Canceled 256 | 257 | ctx, cancelCtx := context.WithCancel(context.Background()) 258 | 259 | // act 260 | cancelCtx() 261 | rowsChans, errsChan := subject.Read(ctx) 262 | records, err := gatherRecords(rowsChans, errsChan) 263 | 264 | // assert 265 | assertTrue(t, errors.Is(err, expectedErr)) 266 | assertNil(t, records) 267 | } 268 | 269 | func testCsvReaderWithSmallBufferSize(t *testing.T) { 270 | t.Parallel() 271 | 272 | // arrange 273 | subject := bigcsvreader.New() 274 | subject.SetFilePath("testdata/file_without_header.csv") 275 | 
subject.ColumnsCount = 3 276 | subject.BufferSize = 16 // min buffer size set by bufio - Ronaldinho line has len 17 and err should arise 277 | 278 | ctx, cancelCtx := context.WithCancel(context.Background()) 279 | defer cancelCtx() 280 | 281 | // act 282 | rowsChans, errsChan := subject.Read(ctx) 283 | records, err := gatherRecords(rowsChans, errsChan) 284 | 285 | // assert 286 | if assertNotNil(t, err) { 287 | assertTrue(t, strings.Contains(err.Error(), "buffer full")) 288 | } 289 | assertNil(t, records) 290 | } 291 | 292 | // gatherRecords returns the rows from big csv reader, or an error if something bad happened. 293 | func gatherRecords(rowsChans []bigcsvreader.RowsChan, errsChan bigcsvreader.ErrsChan) ([][]string, error) { 294 | var ( 295 | mu sync.Mutex 296 | wg sync.WaitGroup 297 | records = make([][]string, 0) 298 | ) 299 | for i := range rowsChans { 300 | wg.Add(1) 301 | go func(rowsChan bigcsvreader.RowsChan, mutex *sync.Mutex, waitGr *sync.WaitGroup) { 302 | for record := range rowsChan { 303 | mutex.Lock() 304 | records = append(records, record) 305 | mu.Unlock() 306 | } 307 | waitGr.Done() 308 | }(rowsChans[i], &mu, &wg) 309 | } 310 | 311 | for err := range errsChan { 312 | return nil, err 313 | } 314 | wg.Wait() 315 | 316 | return records, nil 317 | } 318 | 319 | const ( 320 | colID = iota 321 | colName 322 | colDescription 323 | colPrice 324 | colStock 325 | ) 326 | 327 | const ( 328 | colValueNamePrefix = "Product_" 329 | colValueDescription = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. 
Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean" 330 | colValuePrice = "150.99" 331 | colValueStock = "35" 332 | ) 333 | 334 | // setUpTmpCsvFile creates a CSV file in the OS's temp directory, like `/tmp/bigcsvreader_-.csv` . 335 | // The file will have the provided number of rows. 336 | // Rows look like: 337 | // 338 | // 1,Product_1,"Lorem ipsum...",150.99,35\n 339 | // 2,Product_2,"Lorem ipsum...",150.99,35\n 340 | // , Product_, static text: Lorem ipsum..., static price: 150.99, static stock qty: 35 EOL 341 | func setUpTmpCsvFile(tmpDir string, rowsCount int64) (string, error) { 342 | filePattern := "bigcsvreader_" + strconv.FormatInt(rowsCount, 10) + "-*.csv" 343 | f, err := os.CreateTemp(tmpDir, filePattern) 344 | if err != nil { 345 | return "", err 346 | } 347 | defer f.Close() 348 | fName := f.Name() 349 | 350 | var id int64 351 | buf := make([]byte, 0, 1280) 352 | bufLenConst := 4 + 2 + 1 + len(colValueNamePrefix) + len(colValueDescription) + len(colValuePrice) + len(colValueStock) // 4 x comma, 2 x quote, 1 x \n, 353 | for id = 1; id <= rowsCount; id++ { 354 | buf = buf[0:0:1280] 355 | idStr := strconv.FormatInt(id, 10) 356 | buf = append(buf, idStr...) 357 | buf = append(buf, ',') 358 | buf = append(buf, colValueNamePrefix...) 359 | buf = append(buf, idStr...) 360 | buf = append(buf, `,"`...) 361 | buf = append(buf, colValueDescription...) 362 | buf = append(buf, `",`...) 363 | buf = append(buf, colValuePrice...) 364 | buf = append(buf, ',') 365 | buf = append(buf, colValueStock...) 
366 | buf = append(buf, "\n"...) 367 | bufLen := bufLenConst + 2*len(idStr) 368 | _, err := f.Write(buf[:bufLen]) 369 | if err != nil { 370 | return "", err 371 | } 372 | } 373 | 374 | return fName, nil 375 | } 376 | 377 | // fakeProcessRow simulates the processing a row from the CSV file. 378 | // normally record should be validated / converted to a struct / saved into a db / sent over an API... 379 | // here simulates an operation with the cost of 1ms. 380 | func fakeProcessRow(_ []string) { 381 | time.Sleep(time.Millisecond) 382 | } 383 | 384 | func benchmarkBigCsvReader(rowsCount int64) func(b *testing.B) { 385 | return func(b *testing.B) { 386 | fName, err := setUpTmpCsvFile(b.TempDir(), rowsCount) 387 | if err != nil { 388 | b.Fatalf("prerequisite failed: could not generate CSV file: %v", err) 389 | } 390 | subject := bigcsvreader.New() 391 | subject.SetFilePath(fName) 392 | subject.ColumnsCount = 5 393 | subject.MaxGoroutinesNo = 8 394 | ctx, cancelCtx := context.WithCancel(context.Background()) 395 | defer cancelCtx() 396 | var count int64 397 | 398 | b.ReportAllocs() 399 | b.ResetTimer() 400 | 401 | for range b.N { 402 | rowsChans, errsChan := subject.Read(ctx) 403 | count = consumeBenchResults(rowsChans, errsChan) 404 | if count != rowsCount { 405 | b.Errorf("expected %d, but got %d", rowsCount, count) 406 | } 407 | } 408 | } 409 | } 410 | 411 | // consumeBenchResults just counts the records received from big csv reader and applies a delay of 1ms. 
412 | func consumeBenchResults(rowsChans []bigcsvreader.RowsChan, _ bigcsvreader.ErrsChan) int64 { 413 | var ( 414 | count int64 415 | wg sync.WaitGroup 416 | ) 417 | 418 | for i := range rowsChans { 419 | wg.Add(1) 420 | go func(rowsChan bigcsvreader.RowsChan, waitGr *sync.WaitGroup) { 421 | var localCount int64 422 | for record := range rowsChan { 423 | localCount++ 424 | fakeProcessRow(record) 425 | } 426 | atomic.AddInt64(&count, localCount) 427 | waitGr.Done() 428 | }(rowsChans[i], &wg) 429 | } 430 | wg.Wait() 431 | 432 | return count 433 | } 434 | 435 | func benchmarkStdGoCsvReaderReadAll(rowsCount int64) func(b *testing.B) { 436 | return func(b *testing.B) { 437 | fName, err := setUpTmpCsvFile(b.TempDir(), rowsCount) 438 | if err != nil { 439 | b.Fatalf("prerequisite failed: could not generate CSV file: %v", err) 440 | } 441 | var count int64 442 | 443 | b.ReportAllocs() 444 | b.ResetTimer() 445 | 446 | for range b.N { 447 | f, err := os.Open(fName) 448 | if err != nil { 449 | b.Fatal(err) 450 | } 451 | subject := csv.NewReader(f) 452 | subject.FieldsPerRecord = 5 453 | subject.Comma = ',' 454 | count = 0 455 | rows, err := subject.ReadAll() 456 | if err != nil { 457 | b.Error(err) 458 | } else { 459 | for _, record := range rows { // "consume" rows 460 | count++ 461 | fakeProcessRow(record) 462 | } 463 | } 464 | _ = f.Close() 465 | if count != rowsCount { 466 | b.Errorf("expected %d, but got %d", rowsCount, count) 467 | } 468 | } 469 | } 470 | } 471 | 472 | func benchmarkStdGoCsvReaderReadOneByOneWithReuseRecord(rowsCount int64) func(b *testing.B) { 473 | return func(b *testing.B) { 474 | fName, err := setUpTmpCsvFile(b.TempDir(), rowsCount) 475 | if err != nil { 476 | b.Fatalf("prerequisite failed: could not generate CSV file: %v", err) 477 | } 478 | var count int64 479 | 480 | b.ReportAllocs() 481 | b.ResetTimer() 482 | 483 | for range b.N { 484 | f, err := os.Open(fName) 485 | if err != nil { 486 | b.Fatal(err) 487 | } 488 | subject := csv.NewReader(f) 
489 | subject.FieldsPerRecord = 5 490 | subject.Comma = ',' 491 | subject.ReuseRecord = true 492 | count = 0 493 | 494 | for { 495 | record, err := subject.Read() 496 | if err != nil { 497 | if err == io.EOF { 498 | break 499 | } 500 | b.Error(err) 501 | } else { // "consume" row 502 | count++ 503 | fakeProcessRow(record) 504 | } 505 | } 506 | _ = f.Close() 507 | if count != rowsCount { 508 | b.Errorf("expected %d, but got %d", rowsCount, count) 509 | } 510 | } 511 | } 512 | } 513 | 514 | func benchmarkStdGoCsvReaderReadOneByOneProcessParalell(rowsCount int64) func(b *testing.B) { 515 | return func(b *testing.B) { 516 | fName, err := setUpTmpCsvFile(b.TempDir(), rowsCount) 517 | if err != nil { 518 | b.Fatalf("prerequisite failed: could not generate CSV file: %v", err) 519 | } 520 | 521 | numWorkers := runtime.GOMAXPROCS(0) 522 | 523 | b.ReportAllocs() 524 | b.ResetTimer() 525 | 526 | for range b.N { 527 | // setup workers for parallel processing 528 | rowsChan := make(chan []string, numWorkers) 529 | var ( 530 | count int64 531 | wg sync.WaitGroup 532 | ) 533 | for range numWorkers { 534 | wg.Add(1) 535 | go func() { 536 | var localCount int64 537 | for record := range rowsChan { 538 | localCount++ 539 | fakeProcessRow(record) 540 | } 541 | atomic.AddInt64(&count, localCount) 542 | wg.Done() 543 | }() 544 | } 545 | 546 | // sequential reading 547 | f, err := os.Open(fName) 548 | if err != nil { 549 | b.Fatal(err) 550 | } 551 | subject := csv.NewReader(f) 552 | subject.FieldsPerRecord = 5 553 | subject.Comma = ',' 554 | count = 0 555 | 556 | for { 557 | record, err := subject.Read() 558 | if err != nil { 559 | if err == io.EOF { 560 | break 561 | } 562 | b.Error(err) 563 | } else { // "consume" row 564 | rowsChan <- record 565 | } 566 | } 567 | close(rowsChan) 568 | wg.Wait() 569 | _ = f.Close() 570 | if count != rowsCount { 571 | b.Errorf("expected %d, but got %d", rowsCount, count) 572 | } 573 | } 574 | } 575 | } 576 | 577 | func 
Benchmark50000Rows_50Mb_withBigCsvReader(b *testing.B) { 578 | benchmarkBigCsvReader(5e4)(b) 579 | } 580 | 581 | func Benchmark50000Rows_50Mb_withStdGoCsvReaderReadAll(b *testing.B) { 582 | benchmarkStdGoCsvReaderReadAll(5e4)(b) 583 | } 584 | 585 | func Benchmark50000Rows_50Mb_withStdGoCsvReaderReadOneByOneAndReuseRecord(b *testing.B) { 586 | benchmarkStdGoCsvReaderReadOneByOneWithReuseRecord(5e4)(b) 587 | } 588 | 589 | func Benchmark50000Rows_50Mb_withStdGoCsvReaderReadOneByOneProcessParalell(b *testing.B) { 590 | benchmarkStdGoCsvReaderReadOneByOneProcessParalell(5e4)(b) 591 | } 592 | -------------------------------------------------------------------------------- /scripts/killweb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # This script kills the http servers started in background with pprof.sh 5 | # 6 | 7 | for port in 8084 8085 8086 8087 8088 8089; do \ 8 | ps aux | grep "go tool pprof -http=:${port}" | grep -v "grep go tool pprof -http=:${port}" | awk '{print $2}' | xargs kill 9 | done 10 | -------------------------------------------------------------------------------- /scripts/pprof.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # This script generates memory and cpu profiles. 
5 | # 6 | 7 | SCRIPT_PATH=$(dirname "$(readlink -f "$0")") 8 | 9 | "${SCRIPT_PATH}/killweb.sh" 10 | 11 | profilesFor=(bigcsvreader gocsvreadall gocsvreadonebyone) 12 | port=8084 13 | for profileFor in "${profilesFor[@]}" 14 | do 15 | echo "Handling profiles for ${profileFor}" 16 | go run "${SCRIPT_PATH}/../cmd/pprof/main.go" -for="${profileFor}" 17 | go tool pprof -http=":${port}" "mem_${profileFor}.prof" & 18 | port=$(( port + 1 )) 19 | go tool pprof -http=":${port}" "cpu_${profileFor}.prof" & 20 | port=$(( port + 1 )) 21 | done 22 | -------------------------------------------------------------------------------- /testdata/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/actforgood/bigcsvreader/ba0745bcdccc752ef3a48bd0c1761c83dcf7682c/testdata/empty.csv -------------------------------------------------------------------------------- /testdata/example_products.csv: -------------------------------------------------------------------------------- 1 | 1,Apple iPhone 13,"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. 
Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean",1025.99,100 2 | 2,Samsung Galaxy S22,"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean",400.99,12 3 | 3,Apple MacBook Air,"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. 
Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean",700.99,34 4 | 4,Lenovo ThinkPad X1,"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean",550.99,90 5 | 5,Logitech Mouse G203,"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eleifend felis quis magna auctor, ut lacinia eros efficitur. Maecenas mattis dolor a pharetra gravida. Aenean at eros sed metus posuere feugiat in vitae libero. Morbi a diam volutpat, tempor lacus sed, sagittis velit. Donec eget dignissim mauris, sed aliquam ex. Duis eros dolor, vestibulum ac aliquam eget, viverra in enim. Aenean ut turpis quis purus porta lobortis. Etiam sollicitudin lectus vitae velit tincidunt, ut volutpat justo aliquam. Aenean vitae vehicula arcu. 
Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc viverra enim nec risus mollis elementum nec dictum ex. Nunc lorem eros, vulputate a rutrum nec, scelerisque non augue. Sed in egestas eros. Quisque felis lorem, vehicula ac venenatis vel, tristique id sapien. Morbi vitae odio eget orci facilisis suscipit. Cras sodales, augue vitae tincidunt tempus, diam turpis volutpat est, vitae fringilla augue leo semper augue. Integer scelerisque tempor mauris, ac posuere sem aenean",30.50,35 6 | -------------------------------------------------------------------------------- /testdata/file_with_header.csv: -------------------------------------------------------------------------------- 1 | "ID";"Name";"Age" 2 | 1;"John";33 3 | 2;"Jane";30 4 | 3;"Mike";18 5 | 4;"Ronaldinho";23 6 | 5;Elisabeth;45 -------------------------------------------------------------------------------- /testdata/file_with_quote_in_unquoted_field.csv: -------------------------------------------------------------------------------- 1 | 1,John "The Bomb" Miguel,33 2 | 2,"Jane",30 3 | 3,"Mike",18 4 | 4,"Ronaldinho",23 5 | 5,Elisabeth,45 -------------------------------------------------------------------------------- /testdata/file_without_header.csv: -------------------------------------------------------------------------------- 1 | 1,"John",33 2 | 2,"Jane",30 3 | 3,"Mike",18 4 | 4,"Ronaldinho",23 5 | 5,Elisabeth,45 6 | -------------------------------------------------------------------------------- /testdata/invalid_row.csv: -------------------------------------------------------------------------------- 1 | "ID","Name","Age" 2 | 1,"John",33 3 | 2,"Jane",30 4 | 3,"INVALID_NO_OF_COLUMNS" 5 | 4,"Fernando",23 6 | 5,Elisabeth,45 7 | --------------------------------------------------------------------------------