├── .cargo └── config.toml ├── .envrc ├── .github ├── FUNDING.yml └── workflows │ ├── gem-push.yml │ └── ruby.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── README.md ├── Rakefile ├── benchmark ├── benchmark.sh ├── comparison_benchmark.rb ├── profile.sh └── ruby_profiling_script.rb ├── ext └── osv │ ├── Cargo.toml │ ├── extconf.rb │ └── src │ ├── allocator.rs │ ├── csv │ ├── builder.rs │ ├── header_cache.rs │ ├── mod.rs │ ├── parser.rs │ ├── record.rs │ ├── record_reader.rs │ └── ruby_reader.rs │ ├── lib.rs │ ├── reader.rs │ └── utils.rs ├── flake.lock ├── flake.nix ├── lib ├── osv.rb ├── osv.rbi └── osv │ └── version.rb ├── osv.gemspec ├── overlay.nix └── test ├── big_test.rb ├── concurrency_test.rb ├── core_functionality_test.rb ├── encoding_test.rb ├── format_options_test.rb ├── gc_stress_test.rb ├── io_handling_test.rb ├── memory_safety_test.rb ├── performance_test.rb ├── stress_test.rb ├── test.csv └── test.tsv /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [profile.profiling] 2 | inherits = "release" 3 | debug = true 4 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.5/direnvrc" "sha256-RuwIS+QKFj/T9M2TFXScjBsLR6V3A17YVoEW/Q6AZ1w=" 2 | 3 | nix_direnv_manual_reload 4 | 5 | use flake . --fallback --accept-flake-config 6 | 7 | # When running in a nix shell, the build assumes it's happening in CI and forces a release build. 8 | # Setting this env var forces it to respect the RB_SYS_CARGO_PROFILE env var. 9 | export RB_SYS_TEST=1 10 | 11 | dotenv 12 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [njaremko] 4 | -------------------------------------------------------------------------------- /.github/workflows/gem-push.yml: -------------------------------------------------------------------------------- 1 | name: Ruby Gem 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build: 8 | name: Build + Publish 9 | runs-on: ubuntu-latest 10 | 11 | permissions: 12 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 13 | contents: write # IMPORTANT: this permission is required for `rake release` to push the release tag 14 | 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | include: 19 | - platform: x86_64-linux 20 | target: x86_64-unknown-linux-gnu 21 | - platform: x86_64-linux-musl 22 | target: x86_64-unknown-linux-musl 23 | - platform: aarch64-linux 24 | target: aarch64-unknown-linux-gnu 25 | - platform: aarch64-linux-musl 26 | target: aarch64-unknown-linux-musl 27 | - platform: x86_64-darwin 28 | target: x86_64-apple-darwin 29 | - platform: arm64-darwin 30 | target: aarch64-apple-darwin 31 | - platform: normal 32 | target: normal 33 | 34 | steps: 35 | - uses: actions/checkout@v4 36 | 37 | - name: Set up Ruby 3.2 38 | uses: ruby/setup-ruby@v1 39 | with: 40 | ruby-version: 3.2 41 | 42 | # For some reason, I need to manually install this, even though it's seemingly automated below... 
43 | - name: rb-sys 44 | run: | 45 | gem install rb_sys 46 | 47 | - uses: oxidize-rb/actions/cross-gem@v1 48 | if: ${{ matrix.target != 'normal' }} 49 | id: cross-gem 50 | with: 51 | platform: ${{ matrix.platform }} 52 | ruby-versions: "3.4,3.3,3.2" 53 | 54 | - uses: actions/upload-artifact@v4 55 | if: ${{ matrix.target != 'normal' }} 56 | with: 57 | name: cross-gem-${{ matrix.platform }} 58 | path: ${{ steps.cross-gem.outputs.gem-path }} 59 | 60 | - name: Set remote URL 61 | shell: bash 62 | run: | 63 | # Attribute commits to the last committer on HEAD 64 | git config --global user.email "$(git log -1 --pretty=format:'%ae')" 65 | git config --global user.name "$(git log -1 --pretty=format:'%an')" 66 | git remote set-url origin "https://x-access-token:${{ github.token }}@github.com/$GITHUB_REPOSITORY" 67 | 68 | - name: Configure trusted publishing credentials 69 | uses: rubygems/configure-rubygems-credentials@v1.0.0 70 | 71 | - name: Download patch 72 | shell: bash 73 | run: | 74 | wget https://raw.githubusercontent.com/rubygems/release-gem/refs/heads/v1/rubygems-attestation-patch.rb 75 | 76 | - name: Run release rake task 77 | if: ${{ matrix.target != 'normal' }} 78 | shell: bash 79 | env: 80 | RUBYOPT: "${{ format('-r{0}/rubygems-attestation-patch.rb {1}', github.workspace, env.RUBYOPT) || env.RUBYOPT }}" 81 | run: | 82 | gem push --key rubygems ${{ steps.cross-gem.outputs.gem-path }} 83 | 84 | - name: Run release rake task 85 | if: ${{ matrix.target == 'normal' }} 86 | shell: bash 87 | env: 88 | RUBYOPT: "${{ format('-r{0}/rubygems-attestation-patch.rb {1}', github.workspace, env.RUBYOPT) || env.RUBYOPT }}" 89 | run: | 90 | gem build osv.gemspec 91 | gem push --key rubygems osv-*.gem 92 | -------------------------------------------------------------------------------- /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby 7 | 8 | name: Ruby 9 | 10 | on: 11 | push: 12 | branches: ["main"] 13 | pull_request: 14 | branches: ["main"] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | test: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | ruby-version: ["3.2"] 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Ruby 29 | uses: ruby/setup-ruby@v1 30 | with: 31 | ruby-version: ${{ matrix.ruby-version }} 32 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 33 | - name: Run tests 34 | run: bundle exec rake 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target* 2 | /tmp 3 | /**/*.bundle 4 | .direnv 5 | pkg/ 6 | **/.DS_Store 7 | .env 8 | /benchmark/*.csv* 9 | profile.json 10 | flamegraph.svg 11 | CLAUDE.md 12 | generate_context.py 13 | context.txt 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.5.3 4 | 5 | - Fix a bug dealing with header interning. 
We weren't actually storing the reference to the interned string, so we kept interning every time, and Ruby seems to have a bug that triggered occasional, random segfaults. 6 | 7 | ## 0.5.2 8 | 9 | - Lots of new tests 10 | - One bug fix with extremely wide CSVs 11 | - Do not intern headers when parsing with result type set to array 12 | 13 | ## 0.5.1 14 | 15 | - Attempting to determine if the value being read is a `StringIO` is difficult to do safely, so we just treat it as an `IO`-like object. 16 | 17 | ## 0.5.0 18 | 19 | - Got rid of surprising behaviour that bypassed Ruby if the provided IO had a file descriptor. It led to confusing bugs where people would write a custom read method that was ignored because we read the file descriptor directly. 20 | - No longer read the file into memory when reading gzipped data 21 | - Clean up the reader implementation in general 22 | 23 | ## 0.4.4 24 | 25 | - Added support for cross-compilation for multiple platforms 26 | 27 | ## 0.4.2 and 0.4.3 28 | 29 | - Fix occasional segfault when parsing with `result_type: :hash` 30 | 31 | ## 0.4.1 32 | 33 | - Fix bug with lossy not being respected when parsing headers 34 | 35 | ## 0.4.0 36 | 37 | - Added `lossy` option to `for_each` that allows replacing invalid UTF-8 characters with a replacement character 38 | - Removed `flexible_default` option from `for_each` 39 | 40 | ## 0.3.21 41 | 42 | - Fix bug where `ignore_null_bytes` was not being respected in enumerators. 43 | 44 | ## 0.3.19 and 0.3.20 45 | 46 | - Added `ignore_null_bytes` option to `for_each` that allows ignoring null bytes in fields 47 | - The latter release just removes an unneeded string copy when filtering out null bytes 48 | 49 | ## 0.3.18 50 | 51 | - Fix handling of passing in explicit nil for optional arguments. 52 | 53 | ## 0.3.17 54 | 55 | - Remove multi-threaded parsing. It was a bad idea. Performance is better without it. Code is simpler. 56 | 57 | ## 0.3.16 58 | 59 | - Optimize hash construction by interning key strings 60 | 61 | ## 0.3.15 62 | 63 | - Some internal refactoring to improve maintainability 64 | - More optimizations for parsing IO-like objects without an underlying file handle 65 | 66 | ## 0.3.14 67 | 68 | After quite a bit of profiling: 69 | 70 | - When you give `OSV` a file handle IO object, we have an optimization to grab the underlying open file handle and do all reading directly in Rust. This release adds lots of optimizations for parsing objects that implement `IO`'s `read` method without having an underlying file handle available. 71 | - This release adds a lot of optimizations for parsing `StringIO` objects, as well as anything that doesn't implement `IO`'s `read` method, but does implement `to_str` or `to_s` methods. 72 | - Further optimizations to string allocations in Rust code. 73 | 74 | ## 0.3.13 75 | 76 | - Turns out, gemspec descriptions cannot be markdown. Fixing that. 77 | 78 | ## 0.3.12 79 | 80 | - Attempt at improving the RubyGems page for the gem 81 | 82 | ## 0.3.11 83 | 84 | - Set license to MIT in gemspec 85 | 86 | ## 0.3.10 87 | 88 | - Added `trim` option to `for_each` that allows trimming of fields and headers 89 | 90 | ## 0.3.9 91 | 92 | - Some optimizations, and a fix for a bug where file handles weren't being closed 93 | 94 | ## 0.3.8 95 | 96 | - Added `flexible` option to `for_each` that allows flexible parsing of CSV files without a default value 97 | 98 | ## 0.3.7 99 | 100 | - Added `flexible_default` option to `for_each` that allows flexible parsing of CSV files when set to a string. Defaults to `nil`.
101 | 102 | ## 0.3.6 103 | 104 | - Fix bug introduced in 0.3.5 where `nil_string` was not being parsed correctly 105 | 106 | ## 0.3.5 107 | 108 | - `nil_string` no longer defaults to an empty string. It now defaults to `nil`. Which means that empty strings are interpreted as empty strings. 109 | 110 | ## 0.3.4 111 | 112 | - Added support for handling non-file backed IO objects in single threaded mode 113 | - General refactoring to improve performance and reduce allocations 114 | 115 | ## 0.3.3 116 | 117 | - Added support for gzip files 118 | 119 | ## 0.3.2 120 | 121 | - Intern strings used as keys in hashes until no longer referenced by Ruby to get rid of extra allocations 122 | 123 | ## 0.3.0 124 | 125 | - Got rid of `for_each_compat`. Now use `for_each(result_type: "array")` or `for_each(result_type: :array)` 126 | - Added `result_type` option to `parse_csv` 127 | - Added `buffer_size` option to `parse_csv` 128 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "adler2" 7 | version = "2.0.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" 10 | 11 | [[package]] 12 | name = "ahash" 13 | version = "0.8.11" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" 16 | dependencies = [ 17 | "cfg-if", 18 | "getrandom 0.2.15", 19 | "once_cell", 20 | "version_check", 21 | "zerocopy", 22 | ] 23 | 24 | [[package]] 25 | name = "aho-corasick" 26 | version = "1.1.3" 27 | source = "registry+https://github.com/rust-lang/crates.io-index" 28 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 29 | dependencies = [ 30 | "memchr", 31 | ] 32 | 33 | [[package]] 34 | name = "bindgen" 35 | version = "0.69.5" 36 | source = "registry+https://github.com/rust-lang/crates.io-index" 37 | checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" 38 | dependencies = [ 39 | "bitflags", 40 | "cexpr", 41 | "clang-sys", 42 | "itertools 0.12.1", 43 | "lazy_static", 44 | "lazycell", 45 | "proc-macro2", 46 | "quote", 47 | "regex", 48 | "rustc-hash", 49 | "shlex", 50 | "syn", 51 | ] 52 | 53 | [[package]] 54 | name = "bitflags" 55 | version = "2.6.0" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" 58 | 59 | [[package]] 60 | name = "cc" 61 | version = "1.2.7" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7" 64 | dependencies = [ 65 | "shlex", 66 | ] 67 | 68 | [[package]] 69 | name = "cexpr" 70 | version = "0.6.0" 71 | source = "registry+https://github.com/rust-lang/crates.io-index" 72 | checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" 73 | dependencies = [ 74 | "nom", 75 | ] 76 | 77 | [[package]] 78 | name = "cfg-if" 79 | version = "1.0.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 82 | 83 | [[package]] 84 | name = "clang-sys" 85 | version = "1.8.1" 86 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" 88 | dependencies = [ 89 | "glob", 90 | "libc", 91 | "libloading", 92 | ] 93 | 94 | [[package]] 95 | name = "crc32fast" 96 | version = "1.4.2" 97 | source = "registry+https://github.com/rust-lang/crates.io-index" 98 | checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" 99 | dependencies = [ 100 | "cfg-if", 101 | ] 102 | 103 | [[package]] 104 | name = "csv" 105 | version = "1.3.1" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" 108 | dependencies = [ 109 | "csv-core", 110 | "itoa", 111 | "ryu", 112 | "serde", 113 | ] 114 | 115 | [[package]] 116 | name = "csv-core" 117 | version = "0.1.11" 118 | source = "registry+https://github.com/rust-lang/crates.io-index" 119 | checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" 120 | dependencies = [ 121 | "memchr", 122 | ] 123 | 124 | [[package]] 125 | name = "either" 126 | version = "1.13.0" 127 | source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" 129 | 130 | [[package]] 131 | name = "errno" 132 | version = "0.3.10" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" 135 | dependencies = [ 136 | "libc", 137 | "windows-sys", 138 | ] 139 | 140 | [[package]] 141 | name = "fastrand" 142 | version = "2.3.0" 143 | source = "registry+https://github.com/rust-lang/crates.io-index" 144 | checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" 145 | 146 | [[package]] 147 | name = "flate2" 148 | version = "1.0.35" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" 151 | dependencies = [ 152 | "crc32fast", 153 | "miniz_oxide", 154 | ] 155 | 156 | [[package]] 157 | name = "getrandom" 158 | version = "0.2.15" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 161 | dependencies = [ 162 | "cfg-if", 163 | "libc", 164 | "wasi 0.11.0+wasi-snapshot-preview1", 165 | ] 166 | 167 | [[package]] 168 | name = "getrandom" 169 | version = "0.3.1" 170 | source = "registry+https://github.com/rust-lang/crates.io-index" 171 | checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" 172 | dependencies = [ 173 | "cfg-if", 174 | "libc", 175 | "wasi 0.13.3+wasi-0.2.2", 176 | "windows-targets", 177 | ] 178 | 179 | [[package]] 180 | name = "glob" 181 | version = "0.3.1" 182 | source = "registry+https://github.com/rust-lang/crates.io-index" 183 | checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" 184 | 185 | [[package]] 186 | name = "itertools" 187 | version = "0.12.1" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" 190 | dependencies = [ 191 | "either", 192 | ] 193 | 194 | [[package]] 195 | name = "itertools" 196 | version = "0.14.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" 199 | dependencies = [ 200 | 
"either", 201 | ] 202 | 203 | [[package]] 204 | name = "itoa" 205 | version = "1.0.14" 206 | source = "registry+https://github.com/rust-lang/crates.io-index" 207 | checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" 208 | 209 | [[package]] 210 | name = "jemalloc-sys" 211 | version = "0.5.4+5.3.0-patched" 212 | source = "registry+https://github.com/rust-lang/crates.io-index" 213 | checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2" 214 | dependencies = [ 215 | "cc", 216 | "libc", 217 | ] 218 | 219 | [[package]] 220 | name = "jemallocator" 221 | version = "0.5.4" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc" 224 | dependencies = [ 225 | "jemalloc-sys", 226 | "libc", 227 | ] 228 | 229 | [[package]] 230 | name = "lazy_static" 231 | version = "1.5.0" 232 | source = "registry+https://github.com/rust-lang/crates.io-index" 233 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 234 | 235 | [[package]] 236 | name = "lazycell" 237 | version = "1.3.0" 238 | source = "registry+https://github.com/rust-lang/crates.io-index" 239 | checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" 240 | 241 | [[package]] 242 | name = "libc" 243 | version = "0.2.169" 244 | source = "registry+https://github.com/rust-lang/crates.io-index" 245 | checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" 246 | 247 | [[package]] 248 | name = "libloading" 249 | version = "0.8.6" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" 252 | dependencies = [ 253 | "cfg-if", 254 | "windows-targets", 255 | ] 256 | 257 | [[package]] 258 | name = "libmimalloc-sys" 259 | version = "0.1.39" 260 | source = "registry+https://github.com/rust-lang/crates.io-index" 261 | checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" 262 | dependencies = [ 263 | "cc", 264 | "libc", 265 | ] 266 | 267 | [[package]] 268 | name = "linux-raw-sys" 269 | version = "0.4.15" 270 | source = "registry+https://github.com/rust-lang/crates.io-index" 271 | checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" 272 | 273 | [[package]] 274 | name = "magnus" 275 | version = "0.7.1" 276 | source = "registry+https://github.com/rust-lang/crates.io-index" 277 | checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab" 278 | dependencies = [ 279 | "magnus-macros", 280 | "rb-sys", 281 | "rb-sys-env", 282 | "seq-macro", 283 | ] 284 | 285 | [[package]] 286 | name = "magnus-macros" 287 | version = "0.6.0" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3" 290 | dependencies = [ 291 | "proc-macro2", 292 | "quote", 293 | "syn", 294 | ] 295 | 296 | [[package]] 297 | name = "memchr" 298 | version = "2.7.4" 299 | source = "registry+https://github.com/rust-lang/crates.io-index" 300 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 301 | 302 | [[package]] 303 | name = "mimalloc" 304 | version = "0.1.43" 305 | source = "registry+https://github.com/rust-lang/crates.io-index" 306 | checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" 307 | dependencies = [ 308 | "libmimalloc-sys", 309 | ] 310 | 311 | [[package]] 312 | name = 
"minimal-lexical" 313 | version = "0.2.1" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 316 | 317 | [[package]] 318 | name = "miniz_oxide" 319 | version = "0.8.2" 320 | source = "registry+https://github.com/rust-lang/crates.io-index" 321 | checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" 322 | dependencies = [ 323 | "adler2", 324 | ] 325 | 326 | [[package]] 327 | name = "nom" 328 | version = "7.1.3" 329 | source = "registry+https://github.com/rust-lang/crates.io-index" 330 | checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" 331 | dependencies = [ 332 | "memchr", 333 | "minimal-lexical", 334 | ] 335 | 336 | [[package]] 337 | name = "once_cell" 338 | version = "1.20.2" 339 | source = "registry+https://github.com/rust-lang/crates.io-index" 340 | checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" 341 | 342 | [[package]] 343 | name = "osv" 344 | version = "0.1.0" 345 | dependencies = [ 346 | "ahash", 347 | "csv", 348 | "flate2", 349 | "itertools 0.14.0", 350 | "jemallocator", 351 | "magnus", 352 | "mimalloc", 353 | "rb-sys", 354 | "serde", 355 | "serde_magnus", 356 | "tempfile", 357 | "thiserror", 358 | ] 359 | 360 | [[package]] 361 | name = "proc-macro2" 362 | version = "1.0.92" 363 | source = "registry+https://github.com/rust-lang/crates.io-index" 364 | checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" 365 | dependencies = [ 366 | "unicode-ident", 367 | ] 368 | 369 | [[package]] 370 | name = "quote" 371 | version = "1.0.37" 372 | source = "registry+https://github.com/rust-lang/crates.io-index" 373 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 374 | dependencies = [ 375 | "proc-macro2", 376 | ] 377 | 378 | [[package]] 379 | name = "rb-sys" 380 | version = "0.9.104" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "e2e26425f064a90404ed5e33fee2137b02a9c6d1c83e19394f4d8a476b9d76a2" 383 | dependencies = [ 384 | "rb-sys-build", 385 | ] 386 | 387 | [[package]] 388 | name = "rb-sys-build" 389 | version = "0.9.104" 390 | source = "registry+https://github.com/rust-lang/crates.io-index" 391 | checksum = "c9802c9003c5648ee0a067e9aa8960d402d5f764f682f93c1ed49eec72f6d7fc" 392 | dependencies = [ 393 | "bindgen", 394 | "lazy_static", 395 | "proc-macro2", 396 | "quote", 397 | "regex", 398 | "shell-words", 399 | "syn", 400 | ] 401 | 402 | [[package]] 403 | name = "rb-sys-env" 404 | version = "0.1.2" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb" 407 | 408 | [[package]] 409 | name = "regex" 410 | version = "1.11.1" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 413 | dependencies = [ 414 | "aho-corasick", 415 | "memchr", 416 | "regex-automata", 417 | "regex-syntax", 418 | ] 419 | 420 | [[package]] 421 | name = "regex-automata" 422 | version = "0.4.9" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 425 | dependencies = [ 426 | "aho-corasick", 427 | "memchr", 428 | "regex-syntax", 429 | ] 430 | 431 | [[package]] 432 | name = "regex-syntax" 433 | version = "0.8.5" 434 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 435 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 436 | 437 | [[package]] 438 | name = "rustc-hash" 439 | version = "1.1.0" 440 | source = "registry+https://github.com/rust-lang/crates.io-index" 441 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 442 | 443 | [[package]] 444 | name = "rustix" 445 | version = "0.38.44" 446 | source = "registry+https://github.com/rust-lang/crates.io-index" 447 | checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" 448 | dependencies = [ 449 | "bitflags", 450 | "errno", 451 | "libc", 452 | "linux-raw-sys", 453 | "windows-sys", 454 | ] 455 | 456 | [[package]] 457 | name = "ryu" 458 | version = "1.0.18" 459 | source = "registry+https://github.com/rust-lang/crates.io-index" 460 | checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" 461 | 462 | [[package]] 463 | name = "seq-macro" 464 | version = "0.3.5" 465 | source = "registry+https://github.com/rust-lang/crates.io-index" 466 | checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" 467 | 468 | [[package]] 469 | name = "serde" 470 | version = "1.0.216" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" 473 | dependencies = [ 474 | "serde_derive", 475 | ] 476 | 477 | [[package]] 478 | name = "serde_derive" 479 | version = "1.0.216" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" 482 | dependencies = [ 483 | "proc-macro2", 484 | "quote", 485 | "syn", 486 | ] 487 | 488 | [[package]] 489 | name = "serde_magnus" 490 | version = "0.9.0" 491 | source = "registry+https://github.com/rust-lang/crates.io-index" 492 | checksum = "51b8b945a2dadb221f1c5490cfb411cab6c3821446b8eca50ee07e5a3893ec51" 493 | dependencies = [ 494 | "magnus", 495 | "serde", 496 | "tap", 497 | ] 498 | 499 | [[package]] 500 | name = "shell-words" 501 | version = "1.1.0" 502 | source = "registry+https://github.com/rust-lang/crates.io-index" 503 | checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde" 504 | 505 | [[package]] 506 | name = "shlex" 507 | version = "1.3.0" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 510 | 511 | [[package]] 512 | name = "syn" 513 | version = "2.0.91" 514 | source = "registry+https://github.com/rust-lang/crates.io-index" 515 | checksum = "d53cbcb5a243bd33b7858b1d7f4aca2153490815872d86d955d6ea29f743c035" 516 | dependencies = [ 517 | "proc-macro2", 518 | "quote", 519 | "unicode-ident", 520 | ] 521 | 522 | [[package]] 523 | name = "tap" 524 | version = "1.0.1" 525 | source = "registry+https://github.com/rust-lang/crates.io-index" 526 | checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" 527 | 528 | [[package]] 529 | name = "tempfile" 530 | version = "3.17.1" 531 | source = "registry+https://github.com/rust-lang/crates.io-index" 532 | checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" 533 | dependencies = [ 534 | "cfg-if", 535 | "fastrand", 536 | "getrandom 0.3.1", 537 | "once_cell", 538 | "rustix", 539 | "windows-sys", 540 | ] 541 | 542 | [[package]] 543 | name = "thiserror" 544 | version = "2.0.9" 545 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 546 | checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" 547 | dependencies = [ 548 | "thiserror-impl", 549 | ] 550 | 551 | [[package]] 552 | name = "thiserror-impl" 553 | version = "2.0.9" 554 | source = "registry+https://github.com/rust-lang/crates.io-index" 555 | checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" 556 | dependencies = [ 557 | "proc-macro2", 558 | "quote", 559 | "syn", 560 | ] 561 | 562 | [[package]] 563 | name = "unicode-ident" 564 | version = "1.0.14" 565 | source = "registry+https://github.com/rust-lang/crates.io-index" 566 | checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" 567 | 568 | [[package]] 569 | name = "version_check" 570 | version = "0.9.5" 571 | source = "registry+https://github.com/rust-lang/crates.io-index" 572 | checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 573 | 574 | [[package]] 575 | name = "wasi" 576 | version = "0.11.0+wasi-snapshot-preview1" 577 | source = "registry+https://github.com/rust-lang/crates.io-index" 578 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 579 | 580 | [[package]] 581 | name = "wasi" 582 | version = "0.13.3+wasi-0.2.2" 583 | source = "registry+https://github.com/rust-lang/crates.io-index" 584 | checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" 585 | dependencies = [ 586 | "wit-bindgen-rt", 587 | ] 588 | 589 | [[package]] 590 | name = "windows-sys" 591 | version = "0.59.0" 592 | source = "registry+https://github.com/rust-lang/crates.io-index" 593 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 594 | dependencies = [ 595 | "windows-targets", 596 | ] 597 | 598 | [[package]] 599 | name = "windows-targets" 600 | version = "0.52.6" 601 | source = "registry+https://github.com/rust-lang/crates.io-index" 602 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 603 | dependencies = [ 604 | "windows_aarch64_gnullvm", 605 | "windows_aarch64_msvc", 606 | "windows_i686_gnu", 607 | "windows_i686_gnullvm", 608 | "windows_i686_msvc", 609 | "windows_x86_64_gnu", 610 | "windows_x86_64_gnullvm", 611 | "windows_x86_64_msvc", 612 | ] 613 | 614 | [[package]] 615 | name = "windows_aarch64_gnullvm" 616 | version = "0.52.6" 617 | source = "registry+https://github.com/rust-lang/crates.io-index" 618 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 619 | 620 | [[package]] 621 | name = "windows_aarch64_msvc" 622 | version = "0.52.6" 623 | source = "registry+https://github.com/rust-lang/crates.io-index" 624 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 625 | 626 | [[package]] 627 | name = "windows_i686_gnu" 628 | version = "0.52.6" 629 | source = "registry+https://github.com/rust-lang/crates.io-index" 630 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 631 | 632 | [[package]] 633 | name = "windows_i686_gnullvm" 634 | version = "0.52.6" 635 | source = "registry+https://github.com/rust-lang/crates.io-index" 636 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 637 | 638 | [[package]] 639 | name = "windows_i686_msvc" 640 | version = "0.52.6" 641 | source = "registry+https://github.com/rust-lang/crates.io-index" 642 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 643 | 644 | [[package]] 645 | name = "windows_x86_64_gnu" 
646 | version = "0.52.6" 647 | source = "registry+https://github.com/rust-lang/crates.io-index" 648 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 649 | 650 | [[package]] 651 | name = "windows_x86_64_gnullvm" 652 | version = "0.52.6" 653 | source = "registry+https://github.com/rust-lang/crates.io-index" 654 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 655 | 656 | [[package]] 657 | name = "windows_x86_64_msvc" 658 | version = "0.52.6" 659 | source = "registry+https://github.com/rust-lang/crates.io-index" 660 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 661 | 662 | [[package]] 663 | name = "wit-bindgen-rt" 664 | version = "0.33.0" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" 667 | dependencies = [ 668 | "bitflags", 669 | ] 670 | 671 | [[package]] 672 | name = "zerocopy" 673 | version = "0.7.35" 674 | source = "registry+https://github.com/rust-lang/crates.io-index" 675 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 676 | dependencies = [ 677 | "zerocopy-derive", 678 | ] 679 | 680 | [[package]] 681 | name = "zerocopy-derive" 682 | version = "0.7.35" 683 | source = "registry+https://github.com/rust-lang/crates.io-index" 684 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 685 | dependencies = [ 686 | "proc-macro2", 687 | "quote", 688 | "syn", 689 | ] 690 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["./ext/osv"] 3 | resolver = "2" 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem "rb_sys", "~> 0.9.56" 4 | gem "rake" 5 | 6 | # Use local version of osv 7 | gemspec 8 | 9 | group :development, :test do 10 | gem "csv" 11 | gem "minitest", "~> 5.0" 12 | gem "benchmark-ips", "~> 2.12" 13 | gem "fastcsv", "~> 0.0.7" 14 | end 15 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | osv (0.5.2) 5 | rb_sys (~> 0.9.39) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | benchmark-ips (2.14.0) 11 | csv (3.3.2) 12 | fastcsv (0.0.7) 13 | minitest (5.25.4) 14 | rake (13.2.1) 15 | rake-compiler (1.2.0) 16 | rake 17 | rb_sys (0.9.104) 18 | 19 | PLATFORMS 20 | arm64-darwin-23 21 | ruby 22 | 23 | DEPENDENCIES 24 | benchmark-ips (~> 2.12) 25 | csv 26 | fastcsv (~> 0.0.7) 27 | minitest (~> 5.0) 28 | osv! 
29 | rake 30 | rake-compiler (~> 1.2.0) 31 | rb_sys (~> 0.9.56) 32 | 33 | BUNDLED WITH 34 | 2.5.11 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Nathan Jaremko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OSV 2 | 3 | [![Gem Version](https://badge.fury.io/rb/osv.svg)](https://badge.fury.io/rb/osv) 4 | 5 | OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [csv-rs](https://github.com/BurntSushi/rust-csv) crate. 6 | 7 | It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats. 8 | 9 | The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode. 
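To make that difference concrete, here is a small illustrative sketch (the inline data is made up; the outputs shown in comments are what you would expect, not captured program output):

```ruby
require "osv"
require "stringio"

csv_text = "name,age\nJohn,25"

# Hash mode (default): each row is a Hash keyed by the header row
OSV.for_each(StringIO.new(csv_text)) { |row| p row } # => {"name"=>"John", "age"=>"25"}

# Array mode: each row is an Array in column order
OSV.for_each(StringIO.new(csv_text), result_type: :array) { |row| p row } # => ["John", "25"]
```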
10 | 11 | ## Installation 12 | 13 | Add this line to your application's Gemfile: 14 | 15 | ```ruby 16 | gem 'osv' 17 | ``` 18 | 19 | And then execute: 20 | 21 | ```bash 22 | bundle install 23 | ``` 24 | 25 | Or install it directly: 26 | 27 | ```bash 28 | gem install osv 29 | ``` 30 | 31 | ## Usage 32 | 33 | ### Reading CSV Files 34 | 35 | ```ruby 36 | require 'osv' 37 | 38 | # Basic usage - each row as a hash 39 | OSV.for_each("data.csv") do |row| 40 | puts row["name"] # => "John" 41 | puts row["age"] # => "25" 42 | end 43 | 44 | # Return an enumerator instead of using a block 45 | rows = OSV.for_each("data.csv") 46 | rows.each { |row| puts row["name"] } 47 | 48 | # High-performance array mode 49 | OSV.for_each("data.csv", result_type: :array) do |row| 50 | puts row[0] # First column 51 | puts row[1] # Second column 52 | end 53 | ``` 54 | 55 | ### Input Sources 56 | 57 | ```ruby 58 | # From a file path 59 | OSV.for_each("data.csv") { |row| puts row["name"] } 60 | 61 | # From a gzipped file path 62 | OSV.for_each("data.csv.gz") { |row| puts row["name"] } 63 | 64 | # From an IO object 65 | File.open("data.csv") { |file| OSV.for_each(file) { |row| puts row["name"] } } 66 | 67 | # From a string 68 | data = StringIO.new("name,age\nJohn,25") 69 | OSV.for_each(data) { |row| puts row["name"] } 70 | ``` 71 | 72 | ### Configuration Options 73 | 74 | ```ruby 75 | OSV.for_each("data.csv", 76 | # Input formatting 77 | has_headers: true, # First row contains headers (default: true) 78 | col_sep: ",", # Column separator (default: ",") 79 | quote_char: '"', # Quote character (default: '"') 80 | 81 | # Output formatting 82 | result_type: :hash, # :hash or :array (hash is default) 83 | nil_string: nil, # String to interpret as nil when parsing (default: nil) 84 | 85 | # Parsing behavior 86 | flexible: false, # Allow varying number of fields (default: false) 87 | trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil) 88 | buffer_size: 1024, # Number of rows to buffer in memory (default: 1024) 89 | ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false) 90 | lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false) 91 | ) 92 | ``` 93 | 94 | #### Available Options 95 | 96 | - `has_headers`: Boolean indicating if the first row contains headers (default: true) 97 | - `col_sep`: String specifying the field separator (default: ",") 98 | - `quote_char`: String specifying the quote character (default: "\"") 99 | - `nil_string`: String that should be interpreted as nil 100 | - by default, empty strings are interpreted as empty strings 101 | - if you want to interpret empty strings as nil, set this to an empty string 102 | - `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024) 103 | - `result_type`: String or Symbol specifying the output format ("hash", "array", :hash, or :array) 104 | - `flexible`: Boolean specifying if the parser should be flexible (default: false) 105 | - `trim`: String or Symbol specifying the trim mode ("all", "headers", "fields", :all, :headers, or :fields) 106 | - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false) 107 | - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false) 108 | 109 | When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
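As a concrete illustration of combining several of these options, here is a hedged sketch for a headerless, tab-separated input (the inline data and the "NULL" marker are made up for this example):

```ruby
require "osv"
require "stringio"

# Two tab-separated rows, no header row, "NULL" standing in for missing values
data = StringIO.new("John\t25\nJane\tNULL")

OSV.for_each(data, has_headers: false, col_sep: "\t", nil_string: "NULL") do |row|
  # With has_headers: false, keys are generated as "c0", "c1", ...
  name = row["c0"]
  age  = row["c1"] # nil for the "NULL" field because of nil_string
  puts "#{name}: #{age.inspect}"
end
```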
110 | 111 | ## Requirements 112 | 113 | - Ruby >= 3.1.0 114 | - Rust toolchain (for installation from source) 115 | 116 | ## Performance 117 | 118 | This library is faster than the standard Ruby CSV library. It's also faster than any other CSV gem I've been able to find. 119 | 120 | Here's some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file. 121 | 122 | ### 1,000,000 records 123 | 124 | ``` 125 | 🏃 Running benchmarks... 126 | Benchmarking with 3000001 lines of data 127 | 128 | ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24] 129 | Warming up -------------------------------------- 130 | CSV - StringIO 1.000 i/100ms 131 | FastCSV - StringIO 1.000 i/100ms 132 | OSV - StringIO 1.000 i/100ms 133 | CSV - Hash output 1.000 i/100ms 134 | OSV - Hash output 1.000 i/100ms 135 | CSV - Array output 1.000 i/100ms 136 | OSV - Array output 1.000 i/100ms 137 | FastCSV - Array output 138 | 1.000 i/100ms 139 | OSV - Direct Open Array output 140 | 1.000 i/100ms 141 | OSV - Gzipped 1.000 i/100ms 142 | OSV - Gzipped Direct 1.000 i/100ms 143 | FastCSV - Gzipped 1.000 i/100ms 144 | CSV - Gzipped 1.000 i/100ms 145 | Calculating ------------------------------------- 146 | CSV - StringIO 0.081 (± 0.0%) i/s (12.36 s/i) - 3.000 in 37.155983s 147 | FastCSV - StringIO 0.367 (± 0.0%) i/s (2.73 s/i) - 11.000 in 30.182262s 148 | OSV - StringIO 0.673 (± 0.0%) i/s (1.49 s/i) - 20.000 in 30.247575s 149 | CSV - Hash output 0.056 (± 0.0%) i/s (17.73 s/i) - 2.000 in 35.464673s 150 | OSV - Hash output 0.266 (± 0.0%) i/s (3.77 s/i) - 8.000 in 30.511406s 151 | CSV - Array output 0.068 (± 0.0%) i/s (14.76 s/i) - 3.000 in 44.371496s 152 | OSV - Array output 0.631 (± 0.0%) i/s (1.59 s/i) - 19.000 in 30.896566s 153 | FastCSV - Array output 154 | 0.369 (± 0.0%) i/s (2.71 s/i) - 12.000 in 32.518984s 155 | OSV - Direct Open Array output 156 | 0.642 (± 0.0%) i/s (1.56 s/i) - 19.000 in 30.162703s 157 | OSV - Gzipped 0.519 (± 0.0%) i/s (1.93 s/i) - 16.000 in 31.551051s 158 | OSV - Gzipped Direct 0.512 (± 0.0%) i/s (1.95 s/i) - 16.000 in 31.630035s 159 | FastCSV - Gzipped 0.321 (± 0.0%) i/s (3.12 s/i) - 10.000 in 31.795400s 160 | CSV - Gzipped 0.058 (± 0.0%) i/s (17.34 s/i) - 2.000 in 34.686451s 161 | 162 | Comparison: 163 | OSV - StringIO: 0.7 i/s 164 | OSV - Direct Open Array output: 0.6 i/s - 1.05x slower 165 | OSV - Array output: 0.6 i/s - 1.07x slower 166 | OSV - Gzipped: 0.5 i/s - 1.30x slower 167 | OSV - Gzipped Direct: 0.5 i/s - 1.31x slower 168 | FastCSV - Array output: 0.4 i/s - 1.82x slower 169 | FastCSV - StringIO: 0.4 i/s - 1.83x slower 170 | FastCSV - Gzipped: 0.3 i/s - 2.10x slower 171 | OSV - Hash output: 0.3 i/s - 2.53x slower 172 | CSV - StringIO: 0.1 i/s - 8.31x slower 173 | CSV - Array output: 0.1 i/s - 9.93x slower 174 | CSV - Gzipped: 0.1 i/s - 11.66x slower 175 | CSV - Hash output: 0.1 i/s - 11.92x slower 176 | ``` 177 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "rake/testtask" 4 | require "rb_sys/extensiontask" 5 | 6 | task default: :test 7 | 8 | GEMSPEC = Gem::Specification.load("osv.gemspec") 9 | 10 | platforms = [ 11 | "x86_64-linux", 12 | "x86_64-linux-musl", 13 | "aarch64-linux", 14 | "aarch64-linux-musl", 15 | "x86_64-darwin", 16 | "arm64-darwin" 17 | ] 18 | 19 | RbSys::ExtensionTask.new("osv", GEMSPEC) do |ext| 20 | ext.lib_dir = 
"lib/osv" 21 | ext.ext_dir = "ext/osv" 22 | ext.cross_compile = true 23 | ext.cross_platform = platforms 24 | ext.cross_compiling do |spec| 25 | spec.dependencies.reject! { |dep| dep.name == "rb_sys" } 26 | spec.files.reject! { |file| File.fnmatch?("ext/*", file, File::FNM_EXTGLOB) } 27 | end 28 | end 29 | 30 | Rake::TestTask.new do |t| 31 | t.deps << :compile 32 | t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)] 33 | t.libs << "lib" 34 | t.libs << "test" 35 | end 36 | 37 | task :release do 38 | sh "bundle exec rake test" 39 | sh "mkdir -p pkg" 40 | sh "gem build osv.gemspec -o pkg/osv-#{OSV::VERSION}.gem" 41 | sh "gem push pkg/osv-#{OSV::VERSION}.gem" 42 | end 43 | -------------------------------------------------------------------------------- /benchmark/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | export RB_SYS_CARGO_PROFILE=profiling 5 | 6 | # echo "🧹 Cleaning previous build..." 7 | # cargo clean 8 | 9 | echo "📦 Installing Ruby dependencies..." 10 | bundle install 11 | 12 | echo "🔨 Compiling Rust extension..." 13 | bundle exec rake compile 14 | 15 | echo "🏃 Running benchmarks..." 16 | bundle exec benchmark/comparison_benchmark.rb 17 | -------------------------------------------------------------------------------- /benchmark/comparison_benchmark.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "benchmark/ips" 5 | require "csv" 6 | require "osv" 7 | require "fastcsv" 8 | require "stringio" 9 | require "zlib" 10 | require "fileutils" 11 | 12 | RubyVM::YJIT.enable 13 | 14 | # Generate a larger test file for more meaningful benchmarks 15 | def generate_test_data(rows = 1_000_000) 16 | if File.exist?("benchmark/test.csv") 17 | age_total = 0 18 | CSV.foreach("benchmark/test.csv", headers: true) { |row| age_total += row["age"].to_i } 19 | return StringIO.new(File.read("benchmark/test.csv")), age_total 20 | end 21 | 22 | age = 0 23 | headers = %w[ 24 | id 25 | name 26 | age 27 | email 28 | city 29 | country 30 | salary 31 | department 32 | hire_date 33 | manager_id 34 | performance_score 35 | project_count 36 | active 37 | notes 38 | last_login 39 | description 40 | skills 41 | address 42 | ] 43 | CSV.open("benchmark/test.csv", "w", write_headers: true, headers: headers) do |csv| 44 | rows.times do |i| 45 | row_age = rand(18..80) 46 | age += row_age 47 | csv << [ 48 | i, 49 | "Person#{i}", 50 | row_age, 51 | "person#{i}@example.com", 52 | "City#{i}", 53 | "Country#{i}", 54 | rand(30_000..200_000), 55 | %w[Engineering Sales Marketing HR Finance].sample, 56 | "2020-#{rand(1..12)}-#{rand(1..28)}", 57 | rand(1..1000), 58 | rand(1..5).to_f, 59 | rand(1..10), 60 | [true, false].sample, 61 | "", 62 | "", 63 | # Large quoted text with commas and quotes 64 | "A very long description of person #{i}'s background, including multiple, comma-separated clauses. 
The person has \"special\" skills and experience in various fields.", 65 | # Array-like quoted text with commas 66 | "Ruby,Python,JavaScript,\"DevOps\",\"Cloud Architecture\"", 67 | # Address with embedded newlines and quotes 68 | "123 Main St.\nApt \"B\"\nSuite 100" 69 | ] 70 | end 71 | end 72 | 73 | file_string = File.read("benchmark/test.csv") 74 | 75 | Zlib::GzipWriter.open("benchmark/test.csv.gz") do |gz| 76 | CSV 77 | .new(gz, write_headers: true, headers: headers) 78 | .tap { |csv| CSV.parse(file_string, headers: true) { |row| csv << row } } 79 | end 80 | 81 | str = StringIO.new(file_string) 82 | [str, age] 83 | end 84 | 85 | TEST_FILES = %w[benchmark/test.csv benchmark/test.csv.gz].freeze 86 | 87 | begin 88 | # Create test files 89 | test_data, age = generate_test_data 90 | 91 | # Create gzipped version 92 | 93 | puts "Benchmarking with #{`wc -l benchmark/test.csv`.to_i} lines of data\n\n" 94 | 95 | Benchmark.ips do |x| 96 | x.config(time: 30, warmup: 5) 97 | 98 | x.report("CSV - StringIO") do 99 | count = 0 100 | io = StringIO.new(test_data.string) 101 | CSV.new(io).each { |row| count += row[2].to_i } 102 | io.close 103 | raise "Age mismatch: #{age} != #{count}" if age != count 104 | end 105 | 106 | x.report("FastCSV - StringIO") do 107 | count = 0 108 | io = StringIO.new(test_data.string) 109 | FastCSV.raw_parse(io) { |row| count += row[2].to_i } 110 | 111 | raise "Age mismatch: #{age} != #{count}" if age != count 112 | end 113 | 114 | x.report("OSV - StringIO") do 115 | count = 0 116 | io = StringIO.new(test_data.string) 117 | OSV.for_each(io, result_type: :array) { |row| count += row[2].to_i } 118 | raise "Age mismatch: #{age} != #{count}" if age != count 119 | end 120 | 121 | x.report("CSV - Hash output") do 122 | count = 0 123 | File.open("benchmark/test.csv") { |f| CSV.new(f, headers: true).each { |row| count += row["age"].to_i } } 124 | raise "Age mismatch: #{age} != #{count}" if age != count 125 | end 126 | 127 | x.report("OSV - Hash output") do 128 | count = 0 129 | File.open("benchmark/test.csv") { |f| OSV.for_each(f) { |row| count += row["age"].to_i } } 130 | raise "Age mismatch: #{age} != #{count}" if age != count 131 | end 132 | 133 | x.report("CSV - Array output") do 134 | count = 0 135 | File.open("benchmark/test.csv") { |f| CSV.new(f).each { |row| count += row[2].to_i } } 136 | raise "Age mismatch: #{age} != #{count}" if age != count 137 | end 138 | 139 | x.report("OSV - Array output") do 140 | count = 0 141 | File.open("benchmark/test.csv") { |f| OSV.for_each(f, result_type: :array) { |row| count += row[2].to_i } } 142 | raise "Age mismatch: #{age} != #{count}" if age != count 143 | end 144 | 145 | x.report("FastCSV - Array output") do 146 | count = 0 147 | File.open("benchmark/test.csv") { |f| FastCSV.raw_parse(f) { |row| count += row[2].to_i } } 148 | raise "Age mismatch: #{age} != #{count}" if age != count 149 | end 150 | 151 | x.report("OSV - Direct Open Array output") do 152 | count = 0 153 | OSV.for_each("benchmark/test.csv", result_type: :array) { |row| count += row[2].to_i } 154 | raise "Age mismatch: #{age} != #{count}" if age != count 155 | end 156 | 157 | x.report("OSV - Gzipped") do 158 | count = 0 159 | Zlib::GzipReader.open("benchmark/test.csv.gz") do |gz| 160 | OSV.for_each(gz, result_type: :array) { |row| count += row[2].to_i } 161 | end 162 | raise "Age mismatch: #{age} != #{count}" if age != count 163 | end 164 | 165 | x.report("OSV - Gzipped Direct") do 166 | count = 0 167 | OSV.for_each("benchmark/test.csv.gz", result_type: :array) { |row| 
count += row[2].to_i } 168 | raise "Age mismatch: #{age} != #{count}" if age != count 169 | end 170 | 171 | x.report("FastCSV - Gzipped") do 172 | count = 0 173 | Zlib::GzipReader.open("benchmark/test.csv.gz") { |gz| FastCSV.raw_parse(gz) { |row| count += row[2].to_i } } 174 | raise "Age mismatch: #{age} != #{count}" if age != count 175 | end 176 | 177 | x.report("CSV - Gzipped") do 178 | count = 0 179 | Zlib::GzipReader.open("benchmark/test.csv.gz") do |gz| 180 | CSV.new(gz, headers: true).each { |row| count += row["age"].to_i } 181 | end 182 | raise "Age mismatch: #{age} != #{count}" if age != count 183 | end 184 | 185 | x.compare! 186 | end 187 | ensure 188 | # Cleanup test files even if the script fails or is interrupted 189 | # FileUtils.rm_f(TEST_FILES) 190 | end 191 | -------------------------------------------------------------------------------- /benchmark/profile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | export RB_SYS_CARGO_PROFILE=profiling 5 | 6 | echo "📦 Installing Ruby dependencies..." 7 | bundle install 8 | 9 | echo "🔨 Compiling Rust extension..." 10 | bundle exec rake compile 11 | 12 | # cargo install flamegraph 13 | sudo flamegraph -o flamegraph.svg -- bundle exec benchmark/ruby_profiling_script.rb 14 | -------------------------------------------------------------------------------- /benchmark/ruby_profiling_script.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "osv" 5 | require "fastcsv" 6 | require "stringio" 7 | require "time" 8 | 9 | # Generate a larger test file for more meaningful benchmarks 10 | def generate_test_data(rows = 1_000_000) 11 | headers = %w[ 12 | id 13 | name 14 | age 15 | email 16 | city 17 | country 18 | salary 19 | department 20 | hire_date 21 | manager_id 22 | performance_score 23 | project_count 24 | active 25 | notes 26 | last_login 27 | ] 28 | StringIO.new.tap do |io| 29 | io.puts headers.join(",") 30 | rows.times do |i| 31 | row = [ 32 | i, 33 | "Person#{i}", 34 | rand(18..80), 35 | "person#{i}@example.com", 36 | "City#{i}", 37 | "Country#{i}", 38 | rand(30_000..200_000), 39 | %w[Engineering Sales Marketing HR Finance].sample, 40 | "2020-#{rand(1..12)}-#{rand(1..28)}", 41 | rand(1..1000), 42 | rand(1..5).to_f, 43 | rand(1..10), 44 | [true, false].sample, 45 | "", 46 | "" 47 | ] 48 | io.puts row.join(",") 49 | end 50 | io.rewind 51 | end 52 | end 53 | 54 | # Generate test data and write to file 55 | test_data = generate_test_data.string 56 | File.write("benchmark_test.csv", test_data) 57 | 58 | io = StringIO.new(test_data) 59 | 60 | # Process the file in a loop for 10 seconds 61 | end_time = Time.now + 30 62 | iterations = 0 63 | 64 | while Time.now < end_time 65 | count = 0 66 | OSV.for_each(io, result_type: :array) { |row| count += row[2].to_i } 67 | # FastCSV.raw_parse(io) { |row| count += row[2].to_i } 68 | io.rewind 69 | iterations += 1 70 | end 71 | 72 | puts "Completed #{iterations} iterations in 10 seconds" 73 | 74 | # Cleanup 75 | File.delete("benchmark_test.csv") 76 | -------------------------------------------------------------------------------- /ext/osv/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "osv" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | crate-type = ["cdylib"] 8 | 9 | [dependencies] 10 | ahash = "0.8" 11 | csv = "^1.3" 12 | flate2 
= "1.0.35" 13 | magnus = { version = "0.7", features = ["rb-sys"] } 14 | rb-sys = "^0.9" 15 | serde = { version = "1.0", features = ["derive"] } 16 | serde_magnus = "0.9.0" 17 | thiserror = "2.0" 18 | itertools = "^0.14" 19 | tempfile = "3.17.1" 20 | 21 | [target.'cfg(target_os = "linux")'.dependencies] 22 | jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } 23 | 24 | [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies] 25 | mimalloc = { version = "0.1", default-features = false } 26 | -------------------------------------------------------------------------------- /ext/osv/extconf.rb: -------------------------------------------------------------------------------- 1 | require "mkmf" 2 | require "rb_sys/mkmf" 3 | 4 | create_rust_makefile("osv/osv") 5 | -------------------------------------------------------------------------------- /ext/osv/src/allocator.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_os = "linux")] 2 | use jemallocator::Jemalloc; 3 | 4 | #[cfg(not(any(target_os = "linux", target_os = "windows")))] 5 | use mimalloc::MiMalloc; 6 | 7 | #[global_allocator] 8 | #[cfg(target_os = "linux")] 9 | static ALLOC: Jemalloc = Jemalloc; 10 | 11 | #[global_allocator] 12 | #[cfg(not(any(target_os = "linux", target_os = "windows")))] 13 | static ALLOC: MiMalloc = MiMalloc; 14 | -------------------------------------------------------------------------------- /ext/osv/src/csv/builder.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | header_cache::{CacheError, StringCache}, 3 | parser::RecordParser, 4 | record_reader::{RecordReader, READ_BUFFER_SIZE}, 5 | ruby_reader::RubyReader, 6 | }; 7 | use magnus::{Error as MagnusError, RString, Ruby, Value}; 8 | use std::{ 9 | borrow::Cow, 10 | io::{self, BufReader}, 11 | marker::PhantomData, 12 | }; 13 | 14 | use thiserror::Error; 15 | 16 | /// Errors that can occur when building a RecordReader 17 | #[derive(Error, Debug)] 18 | pub enum ReaderError { 19 | #[error("Failed to get file descriptor: {0}")] 20 | FileDescriptor(String), 21 | #[error("Invalid file descriptor: {0}")] 22 | InvalidFileDescriptor(i32), 23 | #[error("Failed to open file: {0}")] 24 | FileOpen(#[from] io::Error), 25 | #[error("Failed to intern headers: {0}")] 26 | HeaderIntern(#[from] CacheError), 27 | #[error("Invalid flexible default value: {0}")] 28 | InvalidFlexibleDefault(String), 29 | #[error("Invalid null string value: {0}")] 30 | InvalidNullString(String), 31 | #[error("Failed to parse CSV record: {0}")] 32 | CsvParse(#[from] csv::Error), 33 | #[error("Invalid UTF-8: {0}")] 34 | InvalidUtf8(String), 35 | #[error("Ruby error: {0}")] 36 | Ruby(String), 37 | } 38 | 39 | impl From for ReaderError { 40 | fn from(err: MagnusError) -> Self { 41 | Self::Ruby(err.to_string()) 42 | } 43 | } 44 | 45 | impl From for MagnusError { 46 | fn from(err: ReaderError) -> Self { 47 | let ruby = Ruby::get().unwrap(); 48 | match err { 49 | ReaderError::CsvParse(csv_err) => { 50 | if csv_err.to_string().contains("invalid utf-8") { 51 | MagnusError::new(ruby.exception_encoding_error(), csv_err.to_string()) 52 | } else { 53 | MagnusError::new(ruby.exception_runtime_error(), csv_err.to_string()) 54 | } 55 | } 56 | ReaderError::InvalidUtf8(utf8_err) => { 57 | MagnusError::new(ruby.exception_encoding_error(), utf8_err.to_string()) 58 | } 59 | _ => MagnusError::new(ruby.exception_runtime_error(), err.to_string()), 60 | } 61 | } 62 | } 63 | 64 | 
/// Builder for configuring and creating a RecordReader instance. 65 | /// 66 | /// This struct provides a fluent interface for setting up CSV parsing options 67 | /// and creating a RecordReader with the specified configuration. 68 | pub struct RecordReaderBuilder<'a, 'r, T: RecordParser<'a>> { 69 | ruby: &'r Ruby, 70 | to_read: Value, 71 | has_headers: bool, 72 | delimiter: u8, 73 | quote_char: u8, 74 | null_string: Option, 75 | flexible: bool, 76 | trim: csv::Trim, 77 | ignore_null_bytes: bool, 78 | lossy: bool, 79 | _phantom: PhantomData, 80 | _phantom_a: PhantomData<&'a ()>, 81 | } 82 | 83 | impl<'a, 'r, T: RecordParser<'a>> RecordReaderBuilder<'a, 'r, T> { 84 | /// Creates a new builder instance with default settings. 85 | pub fn new(ruby: &'r Ruby, to_read: Value) -> Self { 86 | Self { 87 | ruby, 88 | to_read, 89 | has_headers: true, 90 | delimiter: b',', 91 | quote_char: b'"', 92 | null_string: None, 93 | flexible: false, 94 | trim: csv::Trim::None, 95 | ignore_null_bytes: false, 96 | lossy: false, 97 | _phantom: PhantomData, 98 | _phantom_a: PhantomData, 99 | } 100 | } 101 | 102 | /// Sets whether the CSV file has headers. 103 | #[must_use] 104 | pub fn has_headers(mut self, has_headers: bool) -> Self { 105 | self.has_headers = has_headers; 106 | self 107 | } 108 | 109 | /// Sets the delimiter character for the CSV. 110 | #[must_use] 111 | pub fn delimiter(mut self, delimiter: u8) -> Self { 112 | self.delimiter = delimiter; 113 | self 114 | } 115 | 116 | /// Sets the quote character for the CSV. 117 | #[must_use] 118 | pub fn quote_char(mut self, quote_char: u8) -> Self { 119 | self.quote_char = quote_char; 120 | self 121 | } 122 | 123 | /// Sets the string that should be interpreted as null. 124 | #[must_use] 125 | pub fn null_string(mut self, null_string: Option) -> Self { 126 | self.null_string = null_string; 127 | self 128 | } 129 | 130 | /// Sets whether the reader should be flexible with field counts. 131 | #[must_use] 132 | pub fn flexible(mut self, flexible: bool) -> Self { 133 | self.flexible = flexible; 134 | self 135 | } 136 | 137 | /// Sets the trimming mode for fields. 138 | #[must_use] 139 | pub fn trim(mut self, trim: csv::Trim) -> Self { 140 | self.trim = trim; 141 | self 142 | } 143 | 144 | #[must_use] 145 | pub fn ignore_null_bytes(mut self, ignore_null_bytes: bool) -> Self { 146 | self.ignore_null_bytes = ignore_null_bytes; 147 | self 148 | } 149 | 150 | #[must_use] 151 | pub fn lossy(mut self, lossy: bool) -> Self { 152 | self.lossy = lossy; 153 | self 154 | } 155 | 156 | /// Builds the RecordReader with the configured options. 157 | pub fn build(self) -> Result, ReaderError> { 158 | let readable = RubyReader::try_from(self.to_read)?; 159 | 160 | let flexible = self.flexible; 161 | let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable); 162 | 163 | let mut reader = csv::ReaderBuilder::new() 164 | .has_headers(self.has_headers) 165 | .delimiter(self.delimiter) 166 | .quote(self.quote_char) 167 | .flexible(flexible) 168 | .trim(self.trim) 169 | .from_reader(reader); 170 | 171 | let mut headers = 172 | RecordReader::::get_headers(self.ruby, &mut reader, self.has_headers, self.lossy)?; 173 | 174 | if self.ignore_null_bytes { 175 | headers = headers.iter().map(|h| h.replace("\0", "")).collect(); 176 | } 177 | 178 | let static_headers = if T::uses_headers() { 179 | StringCache::intern_many(&headers)? 
180 | } else { 181 | Vec::new() 182 | }; 183 | 184 | let null_string = self 185 | .null_string 186 | .map(|s| { 187 | RString::new(&s) 188 | .to_interned_str() 189 | .as_str() 190 | .map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e))) 191 | }) 192 | .transpose()? 193 | .map(Cow::Borrowed); 194 | 195 | Ok(RecordReader::new( 196 | self.ruby, 197 | reader, 198 | static_headers, 199 | null_string, 200 | self.ignore_null_bytes, 201 | self.lossy, 202 | )) 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /ext/osv/src/csv/header_cache.rs: -------------------------------------------------------------------------------- 1 | /// This module exists to avoid cloning header keys in returned HashMaps. 2 | /// Since the underlying RString creation already involves cloning, 3 | /// this caching layer aims to reduce redundant allocations. 4 | /// 5 | /// Note: Performance testing on macOS showed minimal speed improvements, 6 | /// so this optimization could be removed if any issues arise. 7 | use std::{ 8 | collections::HashMap, 9 | sync::{LazyLock, Mutex}, 10 | }; 11 | 12 | use magnus::{ 13 | r_string::FString, 14 | value::{InnerValue, Opaque}, 15 | IntoValue, RString, Ruby, Value, 16 | }; 17 | 18 | use thiserror::Error; 19 | 20 | #[derive(Debug, Clone, Error)] 21 | pub enum CacheError { 22 | #[error("Failed to acquire lock: {0}")] 23 | LockError(String), 24 | #[error("Failed to convert Ruby String to interned string: {0}")] 25 | RStringConversion(String), 26 | } 27 | 28 | static STRING_CACHE: LazyLock>> = 29 | LazyLock::new(|| Mutex::new(HashMap::with_capacity(100))); 30 | 31 | pub struct StringCache; 32 | 33 | #[derive(Copy, Clone)] 34 | pub struct StringCacheKey(Opaque); 35 | 36 | impl StringCacheKey { 37 | pub fn new(string: &str) -> Result { 38 | let rstr = RString::new(string); 39 | let fstr = rstr.to_interned_str(); 40 | // FStrings should not be collected by the GC anyway, but just in case. 
41 | magnus::gc::register_mark_object(fstr); 42 | Ok(Self(Opaque::from(fstr))) 43 | } 44 | 45 | pub fn as_fstr(&self, handle: &Ruby) -> FString { 46 | self.0.get_inner_with(handle) 47 | } 48 | 49 | pub fn as_str(&self, handle: &Ruby) -> Result<&'static str, CacheError> { 50 | self.0 51 | .get_inner_with(handle) 52 | .as_str() 53 | .map_err(|e| CacheError::RStringConversion(e.to_string())) 54 | } 55 | } 56 | 57 | impl IntoValue for StringCacheKey { 58 | fn into_value_with(self, handle: &Ruby) -> Value { 59 | handle.into_value(self.0) 60 | } 61 | } 62 | 63 | impl IntoValue for &StringCacheKey { 64 | fn into_value_with(self, handle: &Ruby) -> Value { 65 | handle.into_value(self.0) 66 | } 67 | } 68 | 69 | impl StringCache { 70 | pub fn intern_many>( 71 | strings: &[AsStr], 72 | ) -> Result, CacheError> { 73 | let mut cache = STRING_CACHE 74 | .lock() 75 | .map_err(|e| CacheError::LockError(e.to_string()))?; 76 | 77 | let mut result: Vec = Vec::with_capacity(strings.len()); 78 | for string in strings { 79 | if let Some((_, interned_string)) = cache.get_key_value(string.as_ref()) { 80 | result.push(*interned_string); 81 | } else { 82 | let interned = StringCacheKey::new(string.as_ref())?; 83 | cache.insert(string.as_ref().to_string(), interned); 84 | result.push(interned); 85 | } 86 | } 87 | Ok(result) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /ext/osv/src/csv/mod.rs: -------------------------------------------------------------------------------- 1 | mod builder; 2 | mod header_cache; 3 | mod parser; 4 | mod record; 5 | mod record_reader; 6 | mod ruby_reader; 7 | 8 | pub use builder::RecordReaderBuilder; 9 | pub use record::CowStr; 10 | pub use record::CsvRecord; 11 | -------------------------------------------------------------------------------- /ext/osv/src/csv/parser.rs: -------------------------------------------------------------------------------- 1 | use super::builder::ReaderError; 2 | use super::header_cache::StringCacheKey; 3 | use super::CowStr; 4 | use magnus::Ruby; 5 | use std::borrow::Cow; 6 | use std::collections::HashMap; 7 | use std::hash::BuildHasher; 8 | 9 | pub enum CsvRecordType { 10 | String(csv::StringRecord), 11 | Byte(csv::ByteRecord), 12 | } 13 | 14 | pub trait RecordParser<'a> { 15 | type Output; 16 | 17 | fn parse( 18 | handle: &Ruby, 19 | headers: &[StringCacheKey], 20 | record: &CsvRecordType, 21 | null_string: Option>, 22 | ignore_null_bytes: bool, 23 | ) -> Result; 24 | 25 | fn uses_headers() -> bool; 26 | } 27 | 28 | impl<'a, S: BuildHasher + Default> RecordParser<'a> 29 | for HashMap<&'static str, Option>, S> 30 | { 31 | type Output = Self; 32 | 33 | #[inline] 34 | fn uses_headers() -> bool { 35 | true 36 | } 37 | 38 | #[inline] 39 | fn parse( 40 | handle: &Ruby, 41 | headers: &[StringCacheKey], 42 | record: &CsvRecordType, 43 | null_string: Option>, 44 | ignore_null_bytes: bool, 45 | ) -> Result { 46 | let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default()); 47 | let shared_empty = Cow::Borrowed(""); 48 | 49 | for (i, header) in headers.iter().enumerate() { 50 | let value = match record { 51 | CsvRecordType::String(s) => s.get(i).and_then(|field| { 52 | convert_field_to_cow_str( 53 | field, 54 | null_string.as_deref(), 55 | ignore_null_bytes, 56 | &shared_empty, 57 | ) 58 | }), 59 | CsvRecordType::Byte(b) => b.get(i).and_then(|field| { 60 | let field = String::from_utf8_lossy(field); 61 | convert_field_to_cow_str( 62 | &field, 63 | null_string.as_deref(), 64 | ignore_null_bytes, 65 
| &shared_empty, 66 | ) 67 | }), 68 | }; 69 | 70 | map.insert(header.as_str(handle)?, value); 71 | } 72 | 73 | Ok(map) 74 | } 75 | } 76 | 77 | impl<'a> RecordParser<'a> for Vec>> { 78 | type Output = Self; 79 | 80 | #[inline] 81 | fn uses_headers() -> bool { 82 | false 83 | } 84 | 85 | #[inline] 86 | fn parse( 87 | _handle: &Ruby, 88 | headers: &[StringCacheKey], 89 | record: &CsvRecordType, 90 | null_string: Option>, 91 | ignore_null_bytes: bool, 92 | ) -> Result { 93 | let target_len = headers.len(); 94 | let mut vec = Vec::with_capacity(target_len); 95 | let shared_empty = Cow::Borrowed(""); 96 | 97 | match record { 98 | CsvRecordType::String(record) => { 99 | for field in record.iter() { 100 | let value = convert_field_to_cow_str( 101 | field, 102 | null_string.as_deref(), 103 | ignore_null_bytes, 104 | &shared_empty, 105 | ); 106 | vec.push(value); 107 | } 108 | } 109 | CsvRecordType::Byte(record) => { 110 | for field in record.iter() { 111 | let field = String::from_utf8_lossy(field); 112 | let value = convert_field_to_cow_str( 113 | &field, 114 | null_string.as_deref(), 115 | ignore_null_bytes, 116 | &shared_empty, 117 | ); 118 | vec.push(value); 119 | } 120 | } 121 | } 122 | 123 | Ok(vec) 124 | } 125 | } 126 | 127 | #[inline] 128 | fn convert_field_to_cow_str<'a>( 129 | field: &str, 130 | null_string: Option<&str>, 131 | ignore_null_bytes: bool, 132 | shared_empty: &Cow<'a, str>, 133 | ) -> Option> { 134 | if Some(field) == null_string { 135 | None 136 | } else if field.is_empty() { 137 | Some(CowStr(shared_empty.clone())) 138 | } else if ignore_null_bytes { 139 | Some(CowStr(Cow::Owned(field.replace("\0", "")))) 140 | } else { 141 | Some(CowStr(Cow::Owned(field.to_string()))) 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /ext/osv/src/csv/record.rs: -------------------------------------------------------------------------------- 1 | use itertools::Itertools; 2 | use magnus::{value::ReprValue, IntoValue, Ruby, Value}; 3 | use std::{borrow::Cow, collections::HashMap, hash::BuildHasher}; 4 | 5 | #[derive(Debug)] 6 | pub enum CsvRecord<'a, S: BuildHasher + Default> { 7 | Vec(Vec>>), 8 | Map(HashMap<&'static str, Option>, S>), 9 | } 10 | 11 | impl IntoValue for CsvRecord<'_, S> { 12 | #[inline] 13 | fn into_value_with(self, handle: &Ruby) -> Value { 14 | match self { 15 | CsvRecord::Vec(vec) => { 16 | let ary = handle.ary_new_capa(vec.len()); 17 | vec.into_iter().try_for_each(|v| ary.push(v)).unwrap(); 18 | ary.into_value_with(handle) 19 | } 20 | CsvRecord::Map(map) => { 21 | // Pre-allocate the hash with the known size 22 | let hash = handle.hash_new_capa(map.len()); 23 | 24 | let mut values: [Value; 128] = [handle.qnil().as_value(); 128]; 25 | let mut i = 0; 26 | 27 | for chunk in &map.into_iter().chunks(64) { 28 | for (k, v) in chunk { 29 | values[i] = handle.into_value(k); 30 | values[i + 1] = handle.into_value(v); 31 | i += 2; 32 | } 33 | hash.bulk_insert(&values[..i]).unwrap(); 34 | 35 | // Zero out used values 36 | values[..i].fill(handle.qnil().as_value()); 37 | i = 0; 38 | } 39 | 40 | hash.into_value_with(handle) 41 | } 42 | } 43 | } 44 | } 45 | 46 | #[derive(Debug, Clone)] 47 | pub struct CowStr<'a>(pub Cow<'a, str>); 48 | 49 | impl IntoValue for CowStr<'_> { 50 | fn into_value_with(self, handle: &Ruby) -> Value { 51 | self.0.into_value_with(handle) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /ext/osv/src/csv/record_reader.rs: 
-------------------------------------------------------------------------------- 1 | use super::builder::ReaderError; 2 | use super::header_cache::StringCacheKey; 3 | use super::parser::{CsvRecordType, RecordParser}; 4 | use super::ruby_reader::RubyReader; 5 | use magnus::{Error, Ruby}; 6 | use std::borrow::Cow; 7 | use std::io::{BufReader, Read}; 8 | 9 | /// Size of the internal buffer used for reading CSV records 10 | pub(crate) const READ_BUFFER_SIZE: usize = 16384; 11 | 12 | /// A reader that processes CSV records using a specified parser. 13 | /// 14 | /// This struct implements Iterator to provide a streaming interface for CSV records. 15 | pub struct RecordReader<'a, 'r, T: RecordParser<'a>> { 16 | handle: &'r Ruby, 17 | reader: csv::Reader>, 18 | headers: Vec, 19 | null_string: Option>, 20 | string_record: CsvRecordType, 21 | parser: std::marker::PhantomData, 22 | ignore_null_bytes: bool, 23 | } 24 | 25 | impl<'a, 'r, T: RecordParser<'a>> RecordReader<'a, 'r, T> { 26 | /// Reads and processes headers from a CSV reader. 27 | /// 28 | /// # Arguments 29 | /// * `ruby` - Ruby VM context for error handling 30 | /// * `reader` - CSV reader instance 31 | /// * `has_headers` - Whether the CSV file contains headers 32 | /// 33 | /// # Returns 34 | /// A vector of header strings or generated column names if `has_headers` is false 35 | #[inline] 36 | pub(crate) fn get_headers( 37 | ruby: &Ruby, 38 | reader: &mut csv::Reader, 39 | has_headers: bool, 40 | lossy: bool, 41 | ) -> Result, Error> { 42 | let headers = if lossy { 43 | let first_row = reader.byte_headers().map_err(|e| { 44 | Error::new( 45 | ruby.exception_runtime_error(), 46 | format!("Failed to read headers: {e}"), 47 | ) 48 | })?; 49 | if has_headers { 50 | first_row 51 | .iter() 52 | .map(String::from_utf8_lossy) 53 | .map(|x| x.to_string()) 54 | .collect() 55 | } else { 56 | (0..first_row.len()).map(|i| format!("c{i}")).collect() 57 | } 58 | } else { 59 | let first_row = reader.headers().map_err(|e| { 60 | Error::new( 61 | ruby.exception_runtime_error(), 62 | format!("Failed to read headers: {e}"), 63 | ) 64 | })?; 65 | if has_headers { 66 | first_row.iter().map(String::from).collect() 67 | } else { 68 | (0..first_row.len()).map(|i| format!("c{i}")).collect() 69 | } 70 | }; 71 | 72 | Ok(headers) 73 | } 74 | 75 | /// Creates a new RecordReader instance. 76 | pub(crate) fn new( 77 | handle: &'r Ruby, 78 | reader: csv::Reader>, 79 | headers: Vec, 80 | null_string: Option>, 81 | ignore_null_bytes: bool, 82 | lossy: bool, 83 | ) -> Self { 84 | let headers_len = headers.len(); 85 | Self { 86 | handle, 87 | reader, 88 | headers, 89 | null_string, 90 | string_record: if lossy { 91 | CsvRecordType::Byte(csv::ByteRecord::with_capacity( 92 | READ_BUFFER_SIZE, 93 | headers_len, 94 | )) 95 | } else { 96 | CsvRecordType::String(csv::StringRecord::with_capacity( 97 | READ_BUFFER_SIZE, 98 | headers_len, 99 | )) 100 | }, 101 | parser: std::marker::PhantomData, 102 | ignore_null_bytes, 103 | } 104 | } 105 | 106 | /// Attempts to read the next record, returning any errors encountered. 
107 | fn try_next(&mut self) -> Result, ReaderError> { 108 | let record = match self.string_record { 109 | CsvRecordType::String(ref mut record) => self.reader.read_record(record), 110 | CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record), 111 | }?; 112 | if record { 113 | Ok(Some(T::parse( 114 | self.handle, 115 | &self.headers, 116 | &self.string_record, 117 | self.null_string.clone(), 118 | self.ignore_null_bytes, 119 | )?)) 120 | } else { 121 | Ok(None) 122 | } 123 | } 124 | } 125 | 126 | impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, '_, T> { 127 | type Item = Result; 128 | 129 | #[inline] 130 | fn next(&mut self) -> Option { 131 | match self.try_next() { 132 | Ok(Some(record)) => Some(Ok(record)), 133 | Ok(None) => None, 134 | Err(e) => Some(Err(e)), 135 | } 136 | } 137 | 138 | #[inline] 139 | fn size_hint(&self) -> (usize, Option) { 140 | (0, None) // Cannot determine size without reading entire file 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /ext/osv/src/csv/ruby_reader.rs: -------------------------------------------------------------------------------- 1 | use flate2::bufread::GzDecoder; 2 | use magnus::{ 3 | value::{Opaque, ReprValue}, 4 | RString, Ruby, Value, 5 | }; 6 | use std::{ 7 | fs::File, 8 | io::{self, BufReader, Read, Write}, 9 | }; 10 | 11 | use super::{builder::ReaderError, record_reader::READ_BUFFER_SIZE}; 12 | 13 | /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects) 14 | /// and provide a standard Read implementation for them. 15 | pub enum RubyReader { 16 | String { 17 | inner: Opaque, 18 | offset: usize, 19 | }, 20 | RubyIoLike { 21 | inner: Opaque, 22 | }, 23 | NativeProxyIoLike { 24 | proxy_file: Box, 25 | }, 26 | } 27 | 28 | impl RubyReader { 29 | fn is_io_like(value: &Value) -> bool { 30 | value.respond_to("read", false).unwrap_or(false) 31 | } 32 | } 33 | 34 | impl TryFrom for RubyReader { 35 | type Error = ReaderError; 36 | 37 | fn try_from(value: Value) -> Result { 38 | let ruby = unsafe { Ruby::get_unchecked() }; 39 | if RubyReader::is_io_like(&value) { 40 | Ok(RubyReader::RubyIoLike { 41 | inner: Opaque::from(value), 42 | }) 43 | } else if value.is_kind_of(ruby.class_string()) { 44 | let ruby_string = value.to_r_string()?; 45 | let file_path = ruby_string.to_string()?; 46 | let file = File::open(&file_path)?; 47 | 48 | let x: Box = if file_path.ends_with(".gz") { 49 | let decoder = GzDecoder::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)); 50 | Box::new(decoder) 51 | } else { 52 | Box::new(file) 53 | }; 54 | 55 | Ok(RubyReader::NativeProxyIoLike { proxy_file: x }) 56 | } else { 57 | // Try calling `to_str`, and if that fails, try `to_s` 58 | let string_content = value 59 | .funcall::<_, _, RString>("to_str", ()) 60 | .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?; 61 | Ok(RubyReader::String { 62 | inner: Opaque::from(string_content), 63 | offset: 0, 64 | }) 65 | } 66 | } 67 | } 68 | 69 | impl Read for RubyReader { 70 | fn read(&mut self, mut buf: &mut [u8]) -> io::Result { 71 | let ruby = unsafe { Ruby::get_unchecked() }; 72 | match self { 73 | RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf), 74 | RubyReader::String { inner, offset } => { 75 | let unwrapped_inner = ruby.get_inner(*inner); 76 | 77 | let string_buffer = unsafe { unwrapped_inner.as_slice() }; 78 | if *offset >= string_buffer.len() { 79 | return Ok(0); // EOF 80 | } 81 | 82 | let remaining = string_buffer.len() - *offset; 
83 | let copy_size = remaining.min(buf.len()); 84 | buf[..copy_size].copy_from_slice(&string_buffer[*offset..*offset + copy_size]); 85 | 86 | *offset += copy_size; 87 | 88 | Ok(copy_size) 89 | } 90 | RubyReader::RubyIoLike { inner } => { 91 | let unwrapped_inner = ruby.get_inner(*inner); 92 | 93 | let bytes = unwrapped_inner 94 | .funcall::<_, _, Option>("read", (buf.len(),)) 95 | .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; 96 | 97 | match bytes { 98 | Some(bytes) => { 99 | let string_buffer = unsafe { bytes.as_slice() }; 100 | buf.write_all(string_buffer)?; 101 | Ok(string_buffer.len()) 102 | } 103 | None => Ok(0), 104 | } 105 | } 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /ext/osv/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod allocator; 2 | mod csv; 3 | mod reader; 4 | mod utils; 5 | 6 | use crate::reader::*; 7 | 8 | use magnus::{Error, Ruby}; 9 | 10 | /// Initializes the Ruby extension and defines methods. 11 | #[magnus::init] 12 | fn init(ruby: &Ruby) -> Result<(), Error> { 13 | let module = ruby.define_module("OSV")?; 14 | module.define_module_function("for_each", magnus::method!(parse_csv, -1))?; 15 | Ok(()) 16 | } 17 | -------------------------------------------------------------------------------- /ext/osv/src/reader.rs: -------------------------------------------------------------------------------- 1 | use crate::csv::{CowStr, CsvRecord, RecordReaderBuilder}; 2 | use crate::utils::*; 3 | use ahash::RandomState; 4 | use csv::Trim; 5 | use magnus::value::ReprValue; 6 | use magnus::{Error, IntoValue, KwArgs, Ruby, Symbol, Value}; 7 | use std::collections::HashMap; 8 | 9 | /// Valid result types for CSV parsing 10 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 11 | enum ResultType { 12 | Hash, 13 | Array, 14 | } 15 | 16 | impl ResultType { 17 | fn from_str(s: &str) -> Option { 18 | match s { 19 | "hash" => Some(Self::Hash), 20 | "array" => Some(Self::Array), 21 | _ => None, 22 | } 23 | } 24 | } 25 | 26 | /// Arguments for creating an enumerator 27 | #[derive(Debug)] 28 | struct EnumeratorArgs { 29 | rb_self: Value, 30 | to_read: Value, 31 | has_headers: bool, 32 | delimiter: u8, 33 | quote_char: u8, 34 | null_string: Option, 35 | result_type: String, 36 | flexible: bool, 37 | trim: Option, 38 | ignore_null_bytes: bool, 39 | lossy: bool, 40 | } 41 | 42 | /// Parses a CSV file with the given configuration. 43 | /// 44 | /// # Safety 45 | /// This function uses unsafe code to get the Ruby runtime and leak memory for static references. 46 | /// This is necessary for Ruby integration but should be used with caution. 
47 | pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result { 48 | // SAFETY: We're in a Ruby callback, so Ruby runtime is guaranteed to be initialized 49 | let ruby = unsafe { Ruby::get_unchecked() }; 50 | 51 | let ReadCsvArgs { 52 | to_read, 53 | has_headers, 54 | delimiter, 55 | quote_char, 56 | null_string, 57 | result_type, 58 | flexible, 59 | trim, 60 | ignore_null_bytes, 61 | lossy, 62 | } = parse_read_csv_args(&ruby, args)?; 63 | 64 | if !ruby.block_given() { 65 | return create_enumerator( 66 | &ruby, 67 | EnumeratorArgs { 68 | rb_self, 69 | to_read, 70 | has_headers, 71 | delimiter, 72 | quote_char, 73 | null_string, 74 | result_type, 75 | flexible, 76 | trim: match trim { 77 | Trim::All => Some("all".to_string()), 78 | Trim::Headers => Some("headers".to_string()), 79 | Trim::Fields => Some("fields".to_string()), 80 | _ => None, 81 | }, 82 | ignore_null_bytes, 83 | lossy, 84 | }, 85 | ) 86 | .map(|yield_enum| yield_enum.into_value_with(&ruby)); 87 | } 88 | 89 | let result_type = ResultType::from_str(&result_type).ok_or_else(|| { 90 | Error::new( 91 | ruby.exception_runtime_error(), 92 | "Invalid result type, expected 'hash' or 'array'", 93 | ) 94 | })?; 95 | 96 | match result_type { 97 | ResultType::Hash => { 98 | let builder = RecordReaderBuilder::< 99 | HashMap<&'static str, Option>, RandomState>, 100 | >::new(&ruby, to_read) 101 | .has_headers(has_headers) 102 | .flexible(flexible) 103 | .trim(trim) 104 | .delimiter(delimiter) 105 | .quote_char(quote_char) 106 | .null_string(null_string) 107 | .ignore_null_bytes(ignore_null_bytes) 108 | .lossy(lossy) 109 | .build()?; 110 | 111 | let ruby = unsafe { Ruby::get_unchecked() }; 112 | for result in builder { 113 | let record = result?; 114 | let _: Value = ruby.yield_value(CsvRecord::::Map(record))?; 115 | } 116 | } 117 | ResultType::Array => { 118 | let builder = RecordReaderBuilder::>>>::new(&ruby, to_read) 119 | .has_headers(has_headers) 120 | .flexible(flexible) 121 | .trim(trim) 122 | .delimiter(delimiter) 123 | .quote_char(quote_char) 124 | .null_string(null_string) 125 | .ignore_null_bytes(ignore_null_bytes) 126 | .lossy(lossy) 127 | .build()?; 128 | 129 | let ruby = unsafe { Ruby::get_unchecked() }; 130 | for result in builder { 131 | let record = result?; 132 | let _: Value = ruby.yield_value(CsvRecord::::Vec(record))?; 133 | } 134 | } 135 | } 136 | 137 | let ruby = unsafe { Ruby::get_unchecked() }; 138 | Ok(ruby.qnil().into_value_with(&ruby)) 139 | } 140 | 141 | /// Creates an enumerator for lazy CSV parsing 142 | fn create_enumerator(ruby: &Ruby, args: EnumeratorArgs) -> Result { 143 | let kwargs = ruby.hash_new(); 144 | kwargs.aset(Symbol::new("has_headers"), args.has_headers)?; 145 | kwargs.aset( 146 | Symbol::new("col_sep"), 147 | String::from_utf8(vec![args.delimiter]).unwrap(), 148 | )?; 149 | kwargs.aset( 150 | Symbol::new("quote_char"), 151 | String::from_utf8(vec![args.quote_char]).unwrap(), 152 | )?; 153 | kwargs.aset(Symbol::new("nil_string"), args.null_string)?; 154 | kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?; 155 | kwargs.aset(Symbol::new("flexible"), args.flexible)?; 156 | kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?; 157 | kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?; 158 | kwargs.aset(Symbol::new("lossy"), args.lossy)?; 159 | Ok(args 160 | .rb_self 161 | .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)))) 162 | } 163 | -------------------------------------------------------------------------------- /ext/osv/src/utils.rs: 
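reader.rs above also handles the block-less path: when no block is given, parse_csv packs the parsed options back into a kwargs hash and calls enumeratorize("for_each", ...), so the caller gets a lazy Enumerator that re-enters for_each with the same settings on each enumeration. A short sketch of that behaviour, using the bundled test/test.csv fixture:

require "osv"

rows = OSV.for_each("test/test.csv", result_type: :hash, trim: :all)
rows.class        # => Enumerator
rows.next         # => first row, parsed on demand
rows.to_a.length  # => 3 (re-enumerates from the start via the stored kwargs)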
-------------------------------------------------------------------------------- 1 | use magnus::{ 2 | scan_args::{get_kwargs, scan_args}, 3 | value::ReprValue, 4 | Error, RString, Ruby, Symbol, Value, 5 | }; 6 | 7 | fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> { 8 | if value.is_nil() { 9 | Ok(None) 10 | } else if value.is_kind_of(ruby.class_string()) { 11 | RString::from_value(value) 12 | .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))? 13 | .to_string() 14 | .map(Some) 15 | } else if value.is_kind_of(ruby.class_symbol()) { 16 | Symbol::from_value(value) 17 | .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))? 18 | .funcall("to_s", ()) 19 | .map(Some) 20 | } else { 21 | Err(Error::new( 22 | magnus::exception::type_error(), 23 | "Value must be a String or Symbol", 24 | )) 25 | } 26 | } 27 | 28 | #[derive(Debug)] 29 | pub struct ReadCsvArgs { 30 | pub to_read: Value, 31 | pub has_headers: bool, 32 | pub delimiter: u8, 33 | pub quote_char: u8, 34 | pub null_string: Option<String>, 35 | pub result_type: String, 36 | pub flexible: bool, 37 | pub trim: csv::Trim, 38 | pub ignore_null_bytes: bool, 39 | pub lossy: bool, 40 | } 41 | 42 | /// Parse common arguments for CSV parsing 43 | pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, Error> { 44 | let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?; 45 | let (to_read,) = parsed_args.required; 46 | 47 | let kwargs = get_kwargs::< 48 | _, 49 | (), 50 | ( 51 | Option<Option<bool>>, 52 | Option<Option<String>>, 53 | Option<Option<String>>, 54 | Option<Option<String>>, 55 | Option<Option<Value>>, 56 | Option<Option<bool>>, 57 | Option<Option<Value>>, 58 | Option<Option<bool>>, 59 | Option<Option<bool>>, 60 | ), 61 | (), 62 | >( 63 | parsed_args.keywords, 64 | &[], 65 | &[ 66 | "has_headers", 67 | "col_sep", 68 | "quote_char", 69 | "nil_string", 70 | "result_type", 71 | "flexible", 72 | "trim", 73 | "ignore_null_bytes", 74 | "lossy", 75 | ], 76 | )?; 77 | 78 | let has_headers = kwargs.optional.0.flatten().unwrap_or(true); 79 | 80 | let delimiter = *kwargs 81 | .optional 82 | .1 83 | .flatten() 84 | .unwrap_or_else(|| ",".to_string()) 85 | .as_bytes() 86 | .first() 87 | .ok_or_else(|| { 88 | Error::new( 89 | magnus::exception::runtime_error(), 90 | "Delimiter cannot be empty", 91 | ) 92 | })?; 93 | 94 | let quote_char = *kwargs 95 | .optional 96 | .2 97 | .flatten() 98 | .unwrap_or_else(|| "\"".to_string()) 99 | .as_bytes() 100 | .first() 101 | .ok_or_else(|| { 102 | Error::new( 103 | magnus::exception::runtime_error(), 104 | "Quote character cannot be empty", 105 | ) 106 | })?; 107 | 108 | let null_string = kwargs.optional.3.unwrap_or_default(); 109 | 110 | let result_type = match kwargs 111 | .optional 112 | .4 113 | .flatten() 114 | .map(|value| parse_string_or_symbol(ruby, value)) 115 | { 116 | Some(Ok(Some(parsed))) => match parsed.as_str() { 117 | "hash" | "array" => parsed, 118 | _ => { 119 | return Err(Error::new( 120 | magnus::exception::runtime_error(), 121 | "result_type must be either 'hash' or 'array'", 122 | )) 123 | } 124 | }, 125 | Some(Ok(None)) => String::from("hash"), 126 | Some(Err(_)) => { 127 | return Err(Error::new( 128 | magnus::exception::type_error(), 129 | "result_type must be a String or Symbol", 130 | )) 131 | } 132 | None => String::from("hash"), 133 | }; 134 | 135 | let flexible = kwargs.optional.5.flatten().unwrap_or_default(); 136 | 137 | let trim = match kwargs 138 | .optional 139 | .6 140 | .flatten() 141 | .map(|value| parse_string_or_symbol(ruby, value)) 142 | { 143 | Some(Ok(Some(parsed))) => match parsed.as_str() { 144 | "all" =>
csv::Trim::All, 145 | "headers" => csv::Trim::Headers, 146 | "fields" => csv::Trim::Fields, 147 | invalid => { 148 | return Err(Error::new( 149 | magnus::exception::runtime_error(), 150 | format!( 151 | "trim must be either 'all', 'headers', or 'fields' but got '{}'", 152 | invalid 153 | ), 154 | )) 155 | } 156 | }, 157 | Some(Ok(None)) => csv::Trim::None, 158 | Some(Err(_)) => { 159 | return Err(Error::new( 160 | magnus::exception::type_error(), 161 | "trim must be a String or Symbol", 162 | )) 163 | } 164 | None => csv::Trim::None, 165 | }; 166 | 167 | let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default(); 168 | 169 | let lossy = kwargs.optional.8.flatten().unwrap_or_default(); 170 | 171 | Ok(ReadCsvArgs { 172 | to_read, 173 | has_headers, 174 | delimiter, 175 | quote_char, 176 | null_string, 177 | result_type, 178 | flexible, 179 | trim, 180 | ignore_null_bytes, 181 | lossy, 182 | }) 183 | } 184 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "crane": { 4 | "locked": { 5 | "lastModified": 1734808813, 6 | "narHash": "sha256-3aH/0Y6ajIlfy7j52FGZ+s4icVX0oHhqBzRdlOeztqg=", 7 | "owner": "ipetkov", 8 | "repo": "crane", 9 | "rev": "72e2d02dbac80c8c86bf6bf3e785536acf8ee926", 10 | "type": "github" 11 | }, 12 | "original": { 13 | "owner": "ipetkov", 14 | "repo": "crane", 15 | "type": "github" 16 | } 17 | }, 18 | "flake-parts": { 19 | "inputs": { 20 | "nixpkgs-lib": "nixpkgs-lib" 21 | }, 22 | "locked": { 23 | "lastModified": 1733312601, 24 | "narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=", 25 | "owner": "hercules-ci", 26 | "repo": "flake-parts", 27 | "rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9", 28 | "type": "github" 29 | }, 30 | "original": { 31 | "owner": "hercules-ci", 32 | "repo": "flake-parts", 33 | "type": "github" 34 | } 35 | }, 36 | "flake-utils": { 37 | "inputs": { 38 | "systems": "systems" 39 | }, 40 | "locked": { 41 | "lastModified": 1731533236, 42 | "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", 43 | "owner": "numtide", 44 | "repo": "flake-utils", 45 | "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", 46 | "type": "github" 47 | }, 48 | "original": { 49 | "owner": "numtide", 50 | "repo": "flake-utils", 51 | "type": "github" 52 | } 53 | }, 54 | "napalm": { 55 | "inputs": { 56 | "flake-utils": [ 57 | "flake-utils" 58 | ], 59 | "nixpkgs": [ 60 | "nixpkgs" 61 | ] 62 | }, 63 | "locked": { 64 | "lastModified": 1717929455, 65 | "narHash": "sha256-BiI5xWygriOJuNISnGAeL0KYxrEMnjgpg+7wDskVBhI=", 66 | "owner": "nix-community", 67 | "repo": "napalm", 68 | "rev": "e1babff744cd278b56abe8478008b4a9e23036cf", 69 | "type": "github" 70 | }, 71 | "original": { 72 | "owner": "nix-community", 73 | "repo": "napalm", 74 | "type": "github" 75 | } 76 | }, 77 | "nixpkgs": { 78 | "locked": { 79 | "lastModified": 1735060202, 80 | "narHash": "sha256-5ADWDE/TTw9mKHuWKSmKI6Kyh6HwBh8JJ6QxyaWTnXA=", 81 | "owner": "NixOS", 82 | "repo": "nixpkgs", 83 | "rev": "defcdc88d552bc9758a7afd04b7f4dc8d43aa50b", 84 | "type": "github" 85 | }, 86 | "original": { 87 | "owner": "NixOS", 88 | "ref": "master", 89 | "repo": "nixpkgs", 90 | "type": "github" 91 | } 92 | }, 93 | "nixpkgs-lib": { 94 | "locked": { 95 | "lastModified": 1733096140, 96 | "narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=", 97 | "type": "tarball", 98 | "url": 
"https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" 99 | }, 100 | "original": { 101 | "type": "tarball", 102 | "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" 103 | } 104 | }, 105 | "root": { 106 | "inputs": { 107 | "crane": "crane", 108 | "flake-parts": "flake-parts", 109 | "flake-utils": "flake-utils", 110 | "napalm": "napalm", 111 | "nixpkgs": "nixpkgs", 112 | "rust-overlay": "rust-overlay" 113 | } 114 | }, 115 | "rust-overlay": { 116 | "inputs": { 117 | "nixpkgs": [ 118 | "nixpkgs" 119 | ] 120 | }, 121 | "locked": { 122 | "lastModified": 1735007320, 123 | "narHash": "sha256-NdhUgB9BkLGW9I+Q1GyUUCc3CbDgsg7HLWjG7WZBR5Q=", 124 | "owner": "oxalica", 125 | "repo": "rust-overlay", 126 | "rev": "fb5fdba697ee9a2391ca9ceea3b853b4e3ce37a5", 127 | "type": "github" 128 | }, 129 | "original": { 130 | "owner": "oxalica", 131 | "repo": "rust-overlay", 132 | "type": "github" 133 | } 134 | }, 135 | "systems": { 136 | "locked": { 137 | "lastModified": 1681028828, 138 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 139 | "owner": "nix-systems", 140 | "repo": "default", 141 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 142 | "type": "github" 143 | }, 144 | "original": { 145 | "owner": "nix-systems", 146 | "repo": "default", 147 | "type": "github" 148 | } 149 | } 150 | }, 151 | "root": "root", 152 | "version": 7 153 | } 154 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "Central repository of all giza builds"; 3 | nixConfig = { 4 | max-jobs = 32; 5 | http-connections = 128; 6 | max-substitution-jobs = 128; 7 | substituters = [ 8 | "https://cache.nixos.org?priority=1" 9 | "https://nix-community.cachix.org?priority=2" 10 | ]; 11 | trusted-public-keys = [ 12 | "cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY=" 13 | "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs=" 14 | ]; 15 | # This setting, when true, tries to use symlinks to optimise storage use between nix derivations. 16 | # However, on MacOS, it sometimes runs into issues, and causes stuff to build from scratch... 17 | # Which is strictly worse than using some extra storage sometimes. So we'll force it to false. 18 | auto-optimise-store = false; 19 | }; 20 | inputs = { 21 | flake-utils.url = "github:numtide/flake-utils"; 22 | flake-parts.url = "github:hercules-ci/flake-parts"; 23 | napalm = { 24 | url = "github:nix-community/napalm"; 25 | inputs = { 26 | nixpkgs.follows = "nixpkgs"; 27 | flake-utils.follows = "flake-utils"; 28 | }; 29 | }; 30 | nixpkgs.url = "github:NixOS/nixpkgs/master"; 31 | rust-overlay = { 32 | url = "github:oxalica/rust-overlay"; 33 | inputs = { 34 | nixpkgs.follows = "nixpkgs"; 35 | }; 36 | }; 37 | crane = { 38 | url = "github:ipetkov/crane"; 39 | }; 40 | }; 41 | outputs = inputs: 42 | inputs.flake-parts.lib.mkFlake { inherit inputs; } { 43 | systems = [ "x86_64-linux" "aarch64-linux" "aarch64-darwin" "x86_64-darwin" ]; 44 | imports = [ 45 | 46 | ]; 47 | 48 | perSystem = { config, self', inputs', pkgs, system, ... 
}: 49 | let 50 | linuxSystem = builtins.replaceStrings [ "darwin" ] [ "linux" ] system; 51 | in 52 | { 53 | _module.args.linuxSystem = linuxSystem; 54 | _module.args.pkgs = import inputs.nixpkgs { 55 | inherit system; 56 | overlays = import ./overlay.nix inputs; 57 | config = { 58 | allowUnfree = true; 59 | allowUnsupportedSystem = true; 60 | permittedInsecurePackages = [ 61 | "openssl-1.1.1w" 62 | ]; 63 | }; 64 | }; 65 | _module.args.pkgsLinux = import inputs.nixpkgs { 66 | system = linuxSystem; 67 | overlays = import ./overlay.nix inputs; 68 | config = { 69 | allowUnfree = true; 70 | allowUnsupportedSystem = true; 71 | permittedInsecurePackages = [ 72 | "openssl-1.1.1w" 73 | ]; 74 | }; 75 | }; 76 | legacyPackages.nixpkgs = pkgs; 77 | devShells.default = pkgs.mkShell { 78 | packages = with pkgs;[ 79 | ruby_3_3 80 | bundler 81 | rust-analyzer-unwrapped 82 | rust-dev-toolchain 83 | ]; 84 | }; 85 | }; 86 | }; 87 | } 88 | -------------------------------------------------------------------------------- /lib/osv.rb: -------------------------------------------------------------------------------- 1 | require_relative "osv/version" 2 | 3 | begin 4 | require "osv/#{RUBY_VERSION.to_f}/osv" 5 | rescue LoadError 6 | require "osv/osv" 7 | end 8 | 9 | module OSV 10 | end 11 | -------------------------------------------------------------------------------- /lib/osv.rbi: -------------------------------------------------------------------------------- 1 | # typed: strict 2 | 3 | module OSV 4 | # Options: 5 | # - `has_headers`: Boolean indicating if the first row contains headers 6 | # (default: true) 7 | # - `col_sep`: String specifying the field separator 8 | # (default: ",") 9 | # - `quote_char`: String specifying the quote character 10 | # (default: "\"") 11 | # - `nil_string`: String that should be interpreted as nil 12 | # By default, empty strings are interpreted as empty strings. 13 | # If you want to interpret empty strings as nil, set this to 14 | # an empty string. 
15 | # - `buffer_size`: Integer specifying the read buffer size 16 | # - `result_type`: String specifying the output format 17 | # ("hash" or "array" or :hash or :array) 18 | # - `flexible`: Boolean specifying if the parser should be flexible 19 | # (default: false) 20 | # - `trim`: String specifying the trim mode 21 | # ("all" or "headers" or "fields" or :all or :headers or :fields) 22 | # (default: `nil`) 23 | # - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored 24 | # (default: false) 25 | # - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character 26 | sig do 27 | params( 28 | input: T.any(String, StringIO, IO), 29 | has_headers: T.nilable(T::Boolean), 30 | col_sep: T.nilable(String), 31 | quote_char: T.nilable(String), 32 | nil_string: T.nilable(String), 33 | buffer_size: T.nilable(Integer), 34 | result_type: T.nilable(T.any(String, Symbol)), 35 | flexible: T.nilable(T::Boolean), 36 | ignore_null_bytes: T.nilable(T::Boolean), 37 | trim: T.nilable(T.any(String, Symbol)), 38 | blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void) 39 | ).returns(T.any(Enumerator, T.untyped)) 40 | end 41 | def self.for_each( 42 | input, 43 | has_headers: true, 44 | col_sep: nil, 45 | quote_char: nil, 46 | nil_string: nil, 47 | buffer_size: nil, 48 | result_type: nil, 49 | flexible: nil, 50 | ignore_null_bytes: nil, 51 | trim: nil, 52 | lossy: nil, 53 | &blk 54 | ) 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/osv/version.rb: -------------------------------------------------------------------------------- 1 | module OSV 2 | VERSION = "0.5.3" 3 | end 4 | -------------------------------------------------------------------------------- /osv.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/osv/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "osv" 5 | spec.version = OSV::VERSION 6 | spec.authors = ["Nathan Jaremko"] 7 | spec.email = ["nathan@jaremko.ca"] 8 | 9 | spec.summary = "CSV parser for Ruby" 10 | spec.description = <<-EOF 11 | OSV is a high-performance CSV parser for Ruby, implemented in Rust. 12 | It wraps BurntSushi's csv-rs crate to provide fast CSV parsing with support for both hash-based and array-based row formats. 13 | Features include: Flexible input sources (file paths, gzipped files, IO objects, strings), 14 | configurable parsing options (headers, separators, quote chars), support for both hash and array output formats, 15 | whitespace trimming options, strict or flexible parsing modes, and is significantly faster than Ruby's standard CSV library. 
16 | EOF 17 | spec.homepage = "https://github.com/njaremko/osv" 18 | spec.license = "MIT" 19 | spec.required_ruby_version = ">= 3.1.0" 20 | 21 | spec.metadata["homepage_uri"] = spec.homepage 22 | spec.metadata["source_code_uri"] = "https://github.com/njaremko/osv" 23 | spec.metadata["readme_uri"] = "https://github.com/njaremko/osv/blob/main/README.md" 24 | spec.metadata["changelog_uri"] = "https://github.com/njaremko/osv/blob/main/CHANGELOG.md" 25 | spec.metadata["documentation_uri"] = "https://www.rubydoc.info/gems/osv" 26 | spec.metadata["funding_uri"] = "https://github.com/sponsors/njaremko" 27 | 28 | spec.files = 29 | Dir[ 30 | "{ext,lib}/**/*", 31 | "LICENSE", 32 | "README.md", 33 | "Cargo.*", 34 | "Gemfile", 35 | "Rakefile" 36 | ] 37 | spec.require_paths = ["lib"] 38 | 39 | spec.extensions = ["ext/osv/extconf.rb"] 40 | 41 | # needed until rubygems supports Rust support is out of beta 42 | spec.add_dependency "rb_sys", "~> 0.9.39" 43 | 44 | # only needed when developing or packaging your gem 45 | spec.add_development_dependency "rake-compiler", "~> 1.2.0" 46 | end 47 | -------------------------------------------------------------------------------- /overlay.nix: -------------------------------------------------------------------------------- 1 | inputs: 2 | [ 3 | (import inputs.rust-overlay) 4 | (final: prev: { 5 | bundler = prev.bundler.override { ruby = final.ruby_3_3; }; 6 | bundix = prev.bundix.overrideAttrs (oldAtts: { 7 | ruby = final.ruby_3_3; 8 | }); 9 | ruby_3_3 = ((final.mkRuby { 10 | version = final.mkRubyVersion "3" "3" "6" ""; 11 | hash = "sha256-jcSP/68nD4bxAZBT8o5R5NpMzjKjZ2CgYDqa7mfX/Y0="; 12 | cargoHash = "sha256-GeelTMRFIyvz1QS2L+Q3KAnyQy7jc0ejhx3TdEFVEbk="; 13 | }).override 14 | { 15 | jemallocSupport = true; 16 | }); 17 | craneLib = (inputs.crane.mkLib final).overrideToolchain final.rust-bin.stable.latest.default; 18 | rust-toolchain = prev.rust-bin.stable.latest.default; 19 | # This is an extended rust toolchain with `rust-src` since that's required for IDE stuff 20 | rust-dev-toolchain = prev.rust-bin.stable.latest.default.override { 21 | extensions = [ "rust-src" ]; 22 | }; 23 | }) 24 | ] 25 | -------------------------------------------------------------------------------- /test/big_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | class BigTest < Minitest::Test 8 | def test_parse_csv_with_many_rows 9 | # Generate test data with 2000 rows 10 | Tempfile.create(%w[test_many_rows .csv]) do |test_file| 11 | test_file.write "id,name,age\n" 12 | 2000.times { |i| test_file.write "#{i},Person#{i},#{20 + i % 50}\n" } 13 | test_file.close 14 | 15 | # Parse and verify 16 | actual = [] 17 | OSV.for_each(test_file.path) { |row| actual << row } 18 | 19 | assert_equal 2000, actual.size 20 | end 21 | end 22 | 23 | def test_parse_csv_with_many_rows_stringio 24 | # Generate test data with 2000 rows 25 | io = StringIO.new 26 | io.write "id,name,age\n" 27 | 2000.times { |i| io.write "#{i},Person#{i},#{20 + i % 50}\n" } 28 | io.rewind 29 | 30 | # Parse and verify 31 | actual = [] 32 | OSV.for_each(io) { |row| actual << row } 33 | 34 | assert_equal 2000, actual.size 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /test/concurrency_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | 
require "minitest/autorun" 5 | 6 | # Tests focused on concurrency and thread-safety 7 | class ConcurrencyTest < Minitest::Test 8 | def test_parse_csv_in_multiple_threads 9 | expected = [ 10 | { "id" => "1", "age" => "25", "name" => "John" }, 11 | { "name" => "Jane", "id" => "2", "age" => "30" }, 12 | { "name" => "Jim", "age" => "35", "id" => "3" } 13 | ] 14 | 15 | threads = 16 | 100.times.map do 17 | Thread.new do 18 | result = OSV.for_each("test/test.csv").to_a 19 | assert_equal expected, result 20 | end 21 | end 22 | 23 | threads.each(&:join) 24 | end 25 | 26 | def test_parse_csv_in_multiple_threads_block 27 | expected = [ 28 | { "id" => "1", "age" => "25", "name" => "John" }, 29 | { "name" => "Jane", "id" => "2", "age" => "30" }, 30 | { "name" => "Jim", "age" => "35", "id" => "3" } 31 | ] 32 | 33 | threads = 34 | 100.times.map do 35 | Thread.new do 36 | results = [] 37 | OSV.for_each("test/test.csv") { |row| results << row } 38 | assert_equal expected, results 39 | end 40 | end 41 | 42 | threads.each(&:join) 43 | end 44 | 45 | def test_interleaved_parsing_with_threads 46 | # Create two files to parse 47 | file1 = Tempfile.new(%w[thread1 .csv]) 48 | file2 = Tempfile.new(%w[thread2 .csv]) 49 | 50 | begin 51 | # Write different content to each file 52 | file1.write("id,name\n") 53 | file2.write("code,description\n") 54 | 55 | 100.times do |i| 56 | file1.write("#{i},name#{i}\n") 57 | file2.write("code#{i},desc#{i}\n") 58 | end 59 | 60 | file1.flush 61 | file2.flush 62 | 63 | # Parse both files in interleaved fashion with threads 64 | enum1 = OSV.for_each(file1.path) 65 | enum2 = OSV.for_each(file2.path) 66 | 67 | threads = [] 68 | results1 = Queue.new 69 | results2 = Queue.new 70 | 71 | # Thread 1 processes enum1 72 | threads << Thread.new do 73 | begin 74 | results1 << enum1.next while true 75 | rescue StopIteration 76 | # Expected when enumeration is complete 77 | end 78 | end 79 | 80 | # Thread 2 processes enum2 81 | threads << Thread.new do 82 | begin 83 | results2 << enum2.next while true 84 | rescue StopIteration 85 | # Expected when enumeration is complete 86 | end 87 | end 88 | 89 | # Wait for both threads to complete 90 | threads.each(&:join) 91 | 92 | # Verify results 93 | assert_equal 100, results1.size 94 | assert_equal 100, results2.size 95 | 96 | # Check first and last items from each queue 97 | first1 = results1.pop 98 | assert_equal "0", first1["id"] 99 | assert_equal "name0", first1["name"] 100 | 101 | first2 = results2.pop 102 | assert_equal "code0", first2["code"] 103 | assert_equal "desc0", first2["description"] 104 | ensure 105 | file1.close 106 | file1.unlink 107 | file2.close 108 | file2.unlink 109 | end 110 | end 111 | end -------------------------------------------------------------------------------- /test/core_functionality_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | # Core functionality tests for the OSV CSV parser 8 | class CoreFunctionalityTest < Minitest::Test 9 | def test_parse_csv_with_headers 10 | expected = [ 11 | { "id" => "1", "age" => "25", "name" => "John" }, 12 | { "name" => "Jane", "id" => "2", "age" => "30" }, 13 | { "name" => "Jim", "age" => "35", "id" => "3" } 14 | ] 15 | actual = [] 16 | OSV.for_each("test/test.csv") { |row| actual << row } 17 | assert_equal expected, actual 18 | end 19 | 20 | def test_parse_csv_with_tsv 21 | expected = [ 22 | { "id" => "1", "age" => "25", "name" => "John" 
}, 23 | { "name" => "Jane", "id" => "2", "age" => "30" }, 24 | { "name" => "Jim", "age" => "35", "id" => "3" } 25 | ] 26 | actual = [] 27 | OSV.for_each("test/test.tsv", col_sep: "\t") { |row| actual << row } 28 | assert_equal expected, actual 29 | end 30 | 31 | def test_parse_csv_without_headers 32 | expected = [ 33 | { "c0" => "id", "c1" => "name", "c2" => "age" }, 34 | { "c1" => "John", "c2" => "25", "c0" => "1" }, 35 | { "c1" => "Jane", "c2" => "30", "c0" => "2" }, 36 | { "c0" => "3", "c1" => "Jim", "c2" => "35" } 37 | ] 38 | actual = [] 39 | OSV.for_each("test/test.csv", has_headers: false) { |row| actual << row } 40 | assert_equal expected, actual 41 | end 42 | 43 | def test_parse_csv_with_io 44 | expected = [ 45 | { "id" => "1", "age" => "25", "name" => "John" }, 46 | { "name" => "Jane", "id" => "2", "age" => "30" }, 47 | { "name" => "Jim", "age" => "35", "id" => "3" } 48 | ] 49 | actual = [] 50 | File.open("test/test.csv") { |file| OSV.for_each(file) { |row| actual << row } } 51 | assert_equal expected, actual 52 | end 53 | 54 | def test_parse_csv_with_string_io 55 | expected = [ 56 | { "id" => "1", "age" => "25", "name" => "John" }, 57 | { "name" => "Jane", "id" => "2", "age" => "30" }, 58 | { "name" => "Jim", "age" => "35", "id" => "3" } 59 | ] 60 | actual = [] 61 | csv_data = File.read("test/test.csv") 62 | string_io = StringIO.new(csv_data) 63 | OSV.for_each(string_io) { |row| actual << row } 64 | assert_equal expected, actual 65 | end 66 | 67 | def test_enumerator_raises_stop_iteration 68 | enum = OSV.for_each("test/test.csv") 69 | 3.times { enum.next } # Consume all records 70 | assert_raises(StopIteration) { enum.next } 71 | end 72 | 73 | def test_for_each_without_block 74 | result = OSV.for_each("test/test.csv") 75 | assert_instance_of Enumerator, result 76 | expected = [ 77 | { "id" => "1", "age" => "25", "name" => "John" }, 78 | { "name" => "Jane", "id" => "2", "age" => "30" }, 79 | { "name" => "Jim", "age" => "35", "id" => "3" } 80 | ] 81 | assert_equal expected, result.to_a 82 | end 83 | 84 | def test_for_each_compat_without_block 85 | result = OSV.for_each("test/test.csv", result_type: "array") 86 | assert_instance_of Enumerator, result 87 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 88 | assert_equal expected, result.to_a 89 | end 90 | 91 | def test_parse_csv_compat_without_headers 92 | expected = [%w[id name age], %w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 93 | actual = [] 94 | OSV.for_each("test/test.csv", has_headers: false, result_type: "array") { |row| actual << row } 95 | assert_equal expected, actual 96 | end 97 | 98 | def test_parse_csv_compat_with_headers 99 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 100 | actual = [] 101 | OSV.for_each("test/test.csv", has_headers: true, result_type: "array") { |row| actual << row } 102 | assert_equal expected, actual 103 | end 104 | 105 | def test_parse_csv_compat_with_io_and_headers 106 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 107 | actual = [] 108 | File.open("test/test.csv") { |file| OSV.for_each(file, result_type: "array") { |row| actual << row } } 109 | assert_equal expected, actual 110 | end 111 | 112 | def test_parse_csv_compat_with_io_without_headers 113 | expected = [%w[id name age], %w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 114 | actual = [] 115 | File.open("test/test.csv") do |file| 116 | OSV.for_each(file, has_headers: false, result_type: "array") { |row| actual << row } 117 | end 118 | assert_equal expected, actual 119 | end 120 | end 
-------------------------------------------------------------------------------- /test/encoding_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on encoding handling 7 | class EncodingTest < Minitest::Test 8 | def test_parse_csv_with_invalid_utf8 9 | invalid_utf8 = StringIO.new("id,name\n1,\xFF\xFF\n") 10 | assert_raises(EncodingError) do 11 | OSV.for_each(invalid_utf8) { |_row| } 12 | rescue => e 13 | assert e.message.include?("invalid utf-8") 14 | raise 15 | end 16 | end 17 | 18 | def test_parse_csv_with_invalid_utf8_file 19 | File.write("test/invalid_utf8.csv", "id,name\n1,\xFF\xFF\n") 20 | assert_raises(EncodingError) do 21 | OSV.for_each("test/invalid_utf8.csv") { |_row| } 22 | rescue => e 23 | assert e.message.include?("invalid utf-8") 24 | raise 25 | ensure 26 | begin 27 | File.delete("test/invalid_utf8.csv") 28 | rescue StandardError 29 | nil 30 | end 31 | end 32 | end 33 | 34 | def test_parse_csv_with_invalid_utf8_file_lossy 35 | File.write("test/invalid_utf8.csv", "id,name\n1,\xFF\xFF\n") 36 | actual = [] 37 | OSV.for_each("test/invalid_utf8.csv", lossy: true) { |row| actual << row } 38 | assert_equal [{ "id" => "1", "name" => "��" }], actual 39 | ensure 40 | begin 41 | File.delete("test/invalid_utf8.csv") 42 | rescue StandardError 43 | nil 44 | end 45 | end 46 | 47 | def test_parse_csv_with_invalid_utf8_headers_lossy 48 | File.write("test/invalid_utf8_headers.csv", "\xFF\xFF,name\n1,test\n") 49 | actual = [] 50 | OSV.for_each("test/invalid_utf8_headers.csv", lossy: true) { |row| actual << row } 51 | assert_equal [{ "��" => "1", "name" => "test" }], actual 52 | ensure 53 | begin 54 | File.delete("test/invalid_utf8_headers.csv") 55 | rescue StandardError 56 | nil 57 | end 58 | end 59 | 60 | def test_parse_csv_with_unicode 61 | csv_content = <<~CSV 62 | id,name,description 63 | 1,"José García","Señor developer 👨‍💻" 64 | 2,"Zoë Smith","⭐ Project lead" 65 | CSV 66 | 67 | expected = [ 68 | { "id" => "1", "name" => "José García", "description" => "Señor developer 👨‍💻" }, 69 | { "id" => "2", "name" => "Zoë Smith", "description" => "⭐ Project lead" } 70 | ] 71 | 72 | actual = [] 73 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 74 | assert_equal expected, actual 75 | end 76 | 77 | def test_parse_csv_with_bom 78 | csv_content = "\xEF\xBB\xBF" + <<~CSV 79 | id,name,age 80 | 1,John,25 81 | 2,Jane,30 82 | CSV 83 | 84 | expected = [{ "id" => "1", "name" => "John", "age" => "25" }, { "id" => "2", "name" => "Jane", "age" => "30" }] 85 | 86 | actual = [] 87 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 88 | assert_equal expected, actual 89 | end 90 | 91 | def test_parse_csv_with_null_bytes 92 | csv_content = <<~CSV 93 | id,na\0me,description 94 | 1,Jo\0hn,test 95 | 2,Jane,te\0st 96 | CSV 97 | 98 | expected = [ 99 | { "id" => "1", "name" => "John", "description" => "test" }, 100 | { "id" => "2", "name" => "Jane", "description" => "test" } 101 | ] 102 | 103 | actual = [] 104 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, ignore_null_bytes: true) { |row| actual << row } } 105 | assert_equal expected, actual 106 | 107 | actual = OSV.for_each(StringIO.new(csv_content), ignore_null_bytes: true).to_a 108 | assert_equal expected, actual 109 | 110 | # Without ignore_null_bytes, null bytes are preserved 111 | actual = [] 112 | StringIO.new(csv_content).tap { |io| 
OSV.for_each(io) { |row| actual << row } } 113 | assert_equal [ 114 | { "id" => "1", "na\0me" => "Jo\0hn", "description" => "test" }, 115 | { "id" => "2", "na\0me" => "Jane", "description" => "te\0st" } 116 | ], 117 | actual 118 | end 119 | end -------------------------------------------------------------------------------- /test/format_options_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on parsing options and formatting 7 | class FormatOptionsTest < Minitest::Test 8 | def test_parse_csv_with_headers_null 9 | expected = [ 10 | { "id" => "1", "age" => "25", "name" => "John" }, 11 | { "name" => nil, "id" => "2", "age" => "30" }, 12 | { "name" => "Jim", "age" => "35", "id" => "3" } 13 | ] 14 | actual = [] 15 | OSV.for_each("test/test.csv", nil_string: "Jane") { |row| actual << row } 16 | assert_equal expected, actual 17 | end 18 | 19 | def test_parse_csv_compat_with_headers_null 20 | expected = [%w[1 John 25], ["2", nil, "30"], %w[3 Jim 35]] 21 | actual = [] 22 | OSV.for_each("test/test.csv", has_headers: true, nil_string: "Jane", result_type: "array") { |row| actual << row } 23 | assert_equal expected, actual 24 | end 25 | 26 | def test_parse_csv_with_empty_field 27 | Tempfile.create(%w[test .csv]) do |tempfile| 28 | # Copy existing content and add a line with empty field 29 | content = File.read("test/test.csv") 30 | content += "4,,40\n" 31 | tempfile.write(content) 32 | tempfile.close 33 | 34 | expected = [ 35 | { "id" => "1", "age" => "25", "name" => "John" }, 36 | { "name" => "Jane", "id" => "2", "age" => "30" }, 37 | { "name" => "Jim", "age" => "35", "id" => "3" }, 38 | { "id" => "4", "name" => "", "age" => "40" } 39 | ] 40 | actual = [] 41 | OSV.for_each(tempfile.path) { |row| actual << row } 42 | assert_equal expected, actual 43 | end 44 | end 45 | 46 | def test_parse_csv_with_empty_field_as_nil_string 47 | Tempfile.create(%w[test .csv]) do |tempfile| 48 | # Copy existing content and add a line with empty field 49 | content = File.read("test/test.csv") 50 | content += "4,,40\n" 51 | tempfile.write(content) 52 | tempfile.close 53 | 54 | expected = [ 55 | { "id" => "1", "age" => "25", "name" => "John" }, 56 | { "name" => "Jane", "id" => "2", "age" => "30" }, 57 | { "name" => "Jim", "age" => "35", "id" => "3" }, 58 | { "id" => "4", "name" => nil, "age" => "40" } 59 | ] 60 | actual = [] 61 | OSV.for_each(tempfile.path, nil_string: "") { |row| actual << row } 62 | assert_equal expected, actual 63 | end 64 | end 65 | 66 | def test_parse_csv_with_missing_field_default_strict 67 | Tempfile.create(%w[test .csv]) do |tempfile| 68 | content = File.read("test/test.csv") 69 | content += "4,oops\n" 70 | tempfile.write(content) 71 | tempfile.close 72 | 73 | expected = [ 74 | { "id" => "1", "age" => "25", "name" => "John" }, 75 | { "name" => "Jane", "id" => "2", "age" => "30" }, 76 | { "name" => "Jim", "age" => "35", "id" => "3" } 77 | ] 78 | actual = [] 79 | 80 | assert_raises(RuntimeError) do 81 | OSV.for_each(tempfile.path) { |row| actual << row } 82 | rescue RuntimeError => e 83 | assert e.message.include?("found record with 2 fields, but the previous record has 3 fields") 84 | raise 85 | end 86 | 87 | assert_equal expected, actual 88 | end 89 | end 90 | 91 | def test_parse_csv_with_missing_field_flexible 92 | Tempfile.create(%w[test .csv]) do |tempfile| 93 | content = File.read("test/test.csv") 94 | content += "4,oops\n" 95 | 
tempfile.write(content) 96 | tempfile.close 97 | 98 | expected = [ 99 | { "id" => "1", "age" => "25", "name" => "John" }, 100 | { "name" => "Jane", "id" => "2", "age" => "30" }, 101 | { "name" => "Jim", "age" => "35", "id" => "3" }, 102 | { "id" => "4", "name" => "oops", "age" => nil } 103 | ] 104 | actual = [] 105 | OSV.for_each(tempfile.path, flexible: true) { |row| actual << row } 106 | assert_equal expected, actual 107 | end 108 | end 109 | 110 | def test_parse_csv_with_missing_field_flexible_without_headers 111 | Tempfile.create(%w[test .csv]) do |tempfile| 112 | content = File.read("test/test.csv") 113 | content += "4,oops\n" 114 | tempfile.write(content) 115 | tempfile.close 116 | 117 | expected = [ 118 | { "c2" => "age", "c0" => "id", "c1" => "name" }, 119 | { "c2" => "25", "c0" => "1", "c1" => "John" }, 120 | { "c1" => "Jane", "c2" => "30", "c0" => "2" }, 121 | { "c0" => "3", "c2" => "35", "c1" => "Jim" }, 122 | { "c1" => "oops", "c0" => "4", "c2" => nil } 123 | ] 124 | actual = [] 125 | OSV.for_each(tempfile.path, has_headers: false, flexible: true) { |row| actual << row } 126 | assert_equal expected, actual 127 | end 128 | end 129 | 130 | def test_parse_csv_with_missing_field_flexible_array 131 | Tempfile.create(%w[test .csv]) do |tempfile| 132 | content = File.read("test/test.csv") 133 | content += "4,oops\n" 134 | tempfile.write(content) 135 | tempfile.close 136 | 137 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35], %w[4 oops]] 138 | actual = [] 139 | OSV.for_each(tempfile.path, flexible: true, result_type: :array) { |row| actual << row } 140 | assert_equal expected, actual 141 | end 142 | end 143 | 144 | def test_for_each_trim_all 145 | csv_content = <<~CSV 146 | id , name , age 147 | 1 , John , 25 148 | 2 , Jane , 30 149 | 3 , Jim , 35 150 | CSV 151 | 152 | expected = [ 153 | { "id" => "1", "name" => "John", "age" => "25" }, 154 | { "id" => "2", "name" => "Jane", "age" => "30" }, 155 | { "id" => "3", "name" => "Jim", "age" => "35" } 156 | ] 157 | 158 | actual = [] 159 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, trim: "all") { |row| actual << row } } 160 | assert_equal expected, actual 161 | end 162 | 163 | def test_for_each_trim_headers 164 | csv_content = <<~CSV 165 | id , name , age 166 | 1, John, 25 167 | 2, Jane, 30 168 | 3, Jim, 35 169 | CSV 170 | 171 | expected = [ 172 | { "id" => "1", "name" => " John", "age" => " 25" }, 173 | { "id" => "2", "name" => " Jane", "age" => " 30" }, 174 | { "id" => "3", "name" => " Jim", "age" => " 35" } 175 | ] 176 | 177 | actual = [] 178 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, trim: :headers) { |row| actual << row } } 179 | assert_equal expected, actual 180 | end 181 | 182 | def test_for_each_trim_fields 183 | csv_content = <<~CSV 184 | id,name,age 185 | 1 , John , 25 186 | 2 , Jane , 30 187 | 3 , Jim , 35 188 | CSV 189 | 190 | expected = [ 191 | { "id" => "1", "name" => "John", "age" => "25" }, 192 | { "id" => "2", "name" => "Jane", "age" => "30" }, 193 | { "id" => "3", "name" => "Jim", "age" => "35" } 194 | ] 195 | 196 | actual = [] 197 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, trim: "fields") { |row| actual << row } } 198 | assert_equal expected, actual 199 | end 200 | 201 | def test_parse_csv_with_quoted_commas 202 | csv_content = <<~CSV 203 | id,name,description 204 | 1,"Smith, John","Manager, Sales" 205 | 2,"Doe, Jane","Director, HR" 206 | CSV 207 | 208 | expected = [ 209 | { "id" => "1", "name" => "Smith, John", "description" => "Manager, Sales" }, 210 | { "id" => "2", "name" => 
"Doe, Jane", "description" => "Director, HR" } 211 | ] 212 | 213 | actual = [] 214 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 215 | assert_equal expected, actual 216 | end 217 | 218 | def test_parse_csv_with_escaped_quotes 219 | csv_content = <<~CSV 220 | id,name,quote 221 | 1,"John","He said ""Hello World""" 222 | 2,"Jane","She replied ""Hi there!""" 223 | CSV 224 | 225 | expected = [ 226 | { "id" => "1", "name" => "John", "quote" => 'He said "Hello World"' }, 227 | { "id" => "2", "name" => "Jane", "quote" => 'She replied "Hi there!"' } 228 | ] 229 | 230 | actual = [] 231 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 232 | assert_equal expected, actual 233 | end 234 | 235 | def test_parse_csv_with_newlines_in_quotes 236 | csv_content = <<~CSV 237 | id,name,address 238 | 1,"John Smith","123 Main St. 239 | Apt 4B 240 | New York, NY" 241 | 2,"Jane Doe","456 Park Ave. 242 | Suite 789" 243 | CSV 244 | 245 | expected = [ 246 | { "id" => "1", "name" => "John Smith", "address" => "123 Main St.\nApt 4B\nNew York, NY" }, 247 | { "id" => "2", "name" => "Jane Doe", "address" => "456 Park Ave.\nSuite 789" } 248 | ] 249 | 250 | actual = [] 251 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 252 | assert_equal expected, actual 253 | end 254 | 255 | def test_parse_csv_with_explicit_nil_kwargs 256 | csv_content = <<~CSV 257 | id,name,age 258 | 1,John,25 259 | 2,Jane,30 260 | CSV 261 | 262 | expected = [{ "id" => "1", "name" => "John", "age" => "25" }, { "id" => "2", "name" => "Jane", "age" => "30" }] 263 | 264 | actual = [] 265 | StringIO 266 | .new(csv_content) 267 | .tap do |io| 268 | OSV.for_each( 269 | io, 270 | has_headers: nil, 271 | col_sep: nil, 272 | quote_char: nil, 273 | nil_string: nil, 274 | result_type: nil, 275 | flexible: nil, 276 | trim: nil 277 | ) { |row| actual << row } 278 | end 279 | assert_equal expected, actual 280 | end 281 | end -------------------------------------------------------------------------------- /test/gc_stress_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | require "stringio" 6 | require "tempfile" 7 | 8 | class GCStressTest < Minitest::Test 9 | def setup 10 | # Create a CSV string to test with 11 | csv = String.new("id,header1,header2\n") 12 | 100.times do |i| 13 | csv << "#{i},value_#{i}_1,value_#{i}_2\n" 14 | end 15 | @csv_string = csv 16 | 17 | # Set GC to maximum stress level 18 | GC.stress = true 19 | end 20 | 21 | def teardown 22 | # Reset GC settings 23 | GC.stress = false 24 | end 25 | 26 | def test_parse_with_gc_stress 27 | # Parse the CSV with GC stress enabled 28 | results = [] 29 | 30 | # Use a StringIO to avoid filesystem operations 31 | io = StringIO.new(@csv_string) 32 | 33 | # Parse with OSV 34 | enum = OSV.for_each(io) 35 | 36 | # Read all rows with aggressive GC between each 37 | count = 0 38 | begin 39 | while count < 100 40 | row = enum.next 41 | results << row 42 | count += 1 43 | 44 | # Force garbage collection 45 | GC.start(full_mark: true, immediate_sweep: true) 46 | end 47 | rescue StopIteration 48 | # Expected at end of file 49 | end 50 | 51 | # Verify we read everything 52 | assert_equal 100, results.size 53 | 54 | # Verify some random values 55 | assert_equal "0", results[0]["id"] 56 | assert_equal "value_0_1", results[0]["header1"] 57 | assert_equal "50", results[50]["id"] 58 | assert_equal 
"value_50_1", results[50]["header1"] 59 | assert_equal "99", results[99]["id"] 60 | assert_equal "value_99_2", results[99]["header2"] 61 | end 62 | 63 | def test_file_handle_gc_safety 64 | # Test with file handles that might be garbage collected 65 | file = Tempfile.new(['gc_stress', '.csv']) 66 | begin 67 | # Write CSV data 68 | file.write(@csv_string) 69 | file.flush 70 | 71 | # Create parser from the file 72 | path = file.path 73 | enum = OSV.for_each(path) 74 | 75 | # Read some rows with GC pressure 76 | 10.times do 77 | row = enum.next 78 | assert_equal row["header1"], "value_#{row["id"]}_1" 79 | 80 | # Force GC 81 | GC.start(full_mark: true, immediate_sweep: true) 82 | end 83 | ensure 84 | file.close 85 | file.unlink 86 | end 87 | end 88 | end -------------------------------------------------------------------------------- /test/io_handling_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | # Tests focused on IO handling capabilities 8 | class IoHandlingTest < Minitest::Test 9 | def test_parse_csv_with_gzip 10 | expected = [ 11 | { "id" => "1", "age" => "25", "name" => "John" }, 12 | { "name" => "Jane", "id" => "2", "age" => "30" }, 13 | { "name" => "Jim", "age" => "35", "id" => "3" } 14 | ] 15 | actual = [] 16 | File.open("test/test.csv.gz", "wb") do |gz_file| 17 | gz = Zlib::GzipWriter.new(gz_file) 18 | gz.write(File.read("test/test.csv")) 19 | gz.close 20 | end 21 | OSV.for_each("test/test.csv.gz") { |row| actual << row } 22 | assert_equal expected, actual 23 | ensure 24 | FileUtils.rm_f("test/test.csv.gz") 25 | end 26 | 27 | def test_parse_csv_with_gzip_io 28 | expected = [ 29 | { "id" => "1", "age" => "25", "name" => "John" }, 30 | { "name" => "Jane", "id" => "2", "age" => "30" }, 31 | { "name" => "Jim", "age" => "35", "id" => "3" } 32 | ] 33 | actual = [] 34 | File.open("test/test2.csv.gz", "wb") do |gz_file| 35 | gz = Zlib::GzipWriter.new(gz_file) 36 | gz.write(File.read("test/test.csv")) 37 | gz.close 38 | end 39 | Zlib::GzipReader.open("test/test2.csv.gz") { |gz| OSV.for_each(gz) { |row| actual << row } } 40 | assert_equal expected, actual 41 | ensure 42 | FileUtils.rm_f("test/test2.csv.gz") 43 | end 44 | 45 | def test_parse_with_gzip_corrupted 46 | # Create a corrupted gzip file 47 | File.open("test/corrupted.csv.gz", "wb") do |file| 48 | file.write("This is not a valid gzip file but has .gz extension") 49 | end 50 | 51 | assert_raises(RuntimeError) { OSV.for_each("test/corrupted.csv.gz") { |row| } } 52 | ensure 53 | FileUtils.rm_f("test/corrupted.csv.gz") 54 | end 55 | 56 | def test_parse_input_modified_during_iteration 57 | temp_file = Tempfile.new(%w[dynamic .csv]) 58 | begin 59 | temp_file.write("id,name\n1,John\n2,Jane\n") 60 | temp_file.flush 61 | 62 | enum = OSV.for_each(temp_file.path) 63 | # Get first row 64 | enum.next 65 | 66 | # Modify file between iterations 67 | File.open(temp_file.path, "a") { |f| f.write("3,Modified\n") } 68 | 69 | # Continue iteration 70 | second = enum.next 71 | assert_equal({ "id" => "2", "name" => "Jane" }, second) 72 | 73 | # This might read the newly appended line or might not depending on buffering 74 | # Either way, it shouldn't crash 75 | begin 76 | third = enum.next 77 | assert_equal({ "id" => "3", "name" => "Modified" }, third) 78 | rescue StopIteration 79 | # This is also acceptable 80 | end 81 | ensure 82 | temp_file.close 83 | temp_file.unlink 84 | end 85 | end 86 | 87 | def 
test_parse_with_extremely_large_row 88 | Tempfile.create(%w[large .csv]) do |tempfile| 89 | tempfile.write("id,name,description\n") 90 | tempfile.write("1,test,#{"x" * 10_000_000}\n") # 10MB row 91 | tempfile.flush 92 | 93 | result = nil 94 | # This shouldn't crash, though it might use a lot of memory 95 | OSV.for_each(tempfile.path) do |row| 96 | result = row 97 | break # Only read the first row 98 | end 99 | 100 | assert_equal "1", result["id"] 101 | assert_equal "test", result["name"] 102 | assert_equal 10_000_000, result["description"].length 103 | end 104 | end 105 | 106 | def test_parse_csv_with_mixed_line_endings 107 | csv_content = "id,name,age\r\n1,John,25\n2,Jane,30\r\n3,Jim,35" 108 | 109 | expected = [ 110 | { "id" => "1", "name" => "John", "age" => "25" }, 111 | { "id" => "2", "name" => "Jane", "age" => "30" }, 112 | { "id" => "3", "name" => "Jim", "age" => "35" } 113 | ] 114 | 115 | actual = [] 116 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 117 | assert_equal expected, actual 118 | end 119 | 120 | def test_parse_csv_with_empty_lines 121 | csv_content = <<~CSV 122 | id,name,age 123 | 124 | 1,John,25 125 | 126 | 2,Jane,30 127 | 128 | 3,Jim,35 129 | 130 | CSV 131 | 132 | expected = [ 133 | { "id" => "1", "name" => "John", "age" => "25" }, 134 | { "id" => "2", "name" => "Jane", "age" => "30" }, 135 | { "id" => "3", "name" => "Jim", "age" => "35" } 136 | ] 137 | 138 | actual = [] 139 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 140 | assert_equal expected, actual 141 | end 142 | 143 | def test_parse_empty_file 144 | Tempfile.create(%w[empty .csv]) do |tempfile| 145 | # Empty file 146 | tempfile.flush 147 | 148 | count = 0 149 | OSV.for_each(tempfile.path) { |row| count += 1 } 150 | 151 | assert_equal 0, count 152 | 153 | # Also test with headers but no data 154 | tempfile.write("id,name\n") 155 | tempfile.flush 156 | 157 | count = 0 158 | OSV.for_each(tempfile.path) { |row| count += 1 } 159 | 160 | assert_equal 0, count 161 | end 162 | end 163 | 164 | def test_parse_csv_with_whitespace_and_quotes 165 | csv_content = <<~CSV 166 | id,name,description 167 | 1, John , unquoted spaces 168 | 2," Jane ", " quoted spaces " 169 | 3,"Jim"," mixed " 170 | CSV 171 | 172 | expected = [ 173 | { "id" => "1", "description" => " unquoted spaces", "name" => " John " }, 174 | { "id" => "2", "description" => " \" quoted spaces \"", "name" => " Jane " }, 175 | { "id" => "3", "description" => " mixed ", "name" => "Jim" } 176 | ] 177 | actual = [] 178 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 179 | assert_equal expected, actual 180 | end 181 | 182 | def test_parse_csv_with_empty_quoted_vs_unquoted 183 | csv_content = <<~CSV 184 | id,quoted,unquoted 185 | 1,"", 186 | 2,," " 187 | 3,, 188 | 4," ", 189 | CSV 190 | 191 | expected = [ 192 | { "id" => "1", "quoted" => "", "unquoted" => "" }, 193 | { "id" => "2", "quoted" => "", "unquoted" => " " }, 194 | { "id" => "3", "quoted" => "", "unquoted" => "" }, 195 | { "id" => "4", "quoted" => " ", "unquoted" => "" } 196 | ] 197 | 198 | actual = [] 199 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 200 | assert_equal expected, actual 201 | end 202 | 203 | def test_parse_csv_with_duplicate_headers 204 | csv_content = <<~CSV 205 | id,name,id,name 206 | 1,John,A,Johnny 207 | 2,Jane,B,Janet 208 | CSV 209 | 210 | expected = [%w[1 John A Johnny], %w[2 Jane B Janet]] 211 | 212 | actual = [] 213 | StringIO.new(csv_content).tap 
{ |io| OSV.for_each(io, result_type: :array) { |row| actual << row } } 214 | assert_equal expected, actual 215 | end 216 | end -------------------------------------------------------------------------------- /test/memory_safety_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | class MemorySafetyTest < Minitest::Test 8 | # Test to target potential issues with Ruby string slices in RubyReader 9 | # Focuses on the RubyReader::String variant that uses as_slice() 10 | def test_string_slice_gc_safety 11 | begin 12 | # Create a large string with CSV content 13 | csv_string = "id,name,description\n" 14 | 1000.times do |i| 15 | csv_string += "#{i},name#{i},desc#{i}\n" 16 | end 17 | 18 | # Create a StringIO with the string 19 | string_io = StringIO.new(csv_string) 20 | 21 | # Start parsing 22 | enum = OSV.for_each(string_io) 23 | 24 | # Read a few rows 25 | rows = [] 26 | 5.times { rows << enum.next } 27 | 28 | # Clear the original string reference and force GC 29 | csv_string = nil 30 | GC.start(full_mark: true, immediate_sweep: true) 31 | 32 | # Create memory pressure by allocating large objects 33 | large_objects = [] 34 | 10.times { large_objects << "x" * (1024 * 1024) } 35 | 36 | # Continue reading - this would segfault if RubyReader keeps unsafe references 37 | # to the original string after GC 38 | begin 39 | 20.times { rows << enum.next } 40 | rescue StopIteration 41 | # Expected at end of file 42 | end 43 | 44 | # Verify we read the expected data 45 | assert rows.size > 5 46 | assert_equal "5", rows[5]["id"] if rows.size > 5 47 | end 48 | end 49 | 50 | # Test for potential issues with IO object and its premature garbage collection 51 | def test_io_object_gc_safety 52 | begin 53 | # Create a custom IO-like object that we can control 54 | custom_io = Class.new do 55 | def initialize(data) 56 | @data = data 57 | @position = 0 58 | end 59 | 60 | def read(bytes) 61 | return nil if @position >= @data.length 62 | chunk = @data[@position, [bytes, 100].min] # Read in small chunks 63 | @position += chunk.length 64 | chunk 65 | end 66 | end 67 | 68 | # Create CSV data 69 | csv_data = "id,name,value\n" 70 | 100.times { |i| csv_data += "#{i},name#{i},value#{i}\n" } 71 | 72 | # Create our custom IO object 73 | io_obj = custom_io.new(csv_data) 74 | 75 | # Create an enumerator using the custom IO 76 | enum = OSV.for_each(io_obj) 77 | 78 | # Read a few rows 79 | rows = [] 80 | 5.times { rows << enum.next } 81 | 82 | # Release references to the IO object and force GC 83 | io_obj = nil 84 | GC.start(full_mark: true, immediate_sweep: true) 85 | 86 | # Allocate objects to increase memory pressure 87 | 10.times { "x" * (1024 * 1024) } 88 | 89 | # Try to continue reading after GC (should either work correctly or raise 90 | # a Ruby exception, but shouldn't segfault) 91 | begin 92 | 10.times { rows << enum.next } 93 | rescue => e 94 | # If Rust has unsafe references to Ruby objects that were GC'd, 95 | # this might segfault instead of a proper Ruby exception 96 | assert_match(/io|read|closed/i, e.message) 97 | end 98 | 99 | # Success if we got this far without segfault 100 | assert true 101 | end 102 | end 103 | 104 | # Test to exercise thread safety issues with unchecked Ruby VM access 105 | def test_thread_safety_ruby_vm_access 106 | # Create a CSV file to read 107 | file = Tempfile.new(['thread_safety', '.csv']) 108 | begin 109 | # Create a large CSV file 110 
| file.write("id,name,description,value,extra\n") 111 | 500.times { |i| file.write("#{i},name#{i},desc#{i},value#{i},extra#{i}\n") } 112 | file.flush 113 | 114 | # Create shared data structures 115 | results = Queue.new 116 | error_count = 0 117 | mutex = Mutex.new 118 | 119 | # Create multiple threads that will read from the same file concurrently 120 | # This can expose thread safety issues with Ruby VM access 121 | threads = 8.times.map do |thread_id| 122 | Thread.new do 123 | begin 124 | # Each thread creates its own enumerator 125 | enum = OSV.for_each(file.path) 126 | 127 | # Skip to a different starting point 128 | skip_count = thread_id * 10 129 | skip_count.times { enum.next rescue nil } 130 | 131 | # Read rows with aggressive GC in between 132 | 10.times do |i| 133 | begin 134 | row = enum.next 135 | results << [thread_id, row["id"]] 136 | 137 | # Force GC frequently 138 | GC.start if i % 2 == 0 139 | 140 | # Create temporary objects to increase memory pressure 141 | temp = "x" * (1024 * (thread_id + 1)) 142 | temp = nil 143 | rescue StopIteration 144 | break 145 | rescue => e 146 | mutex.synchronize { error_count += 1 } 147 | break 148 | end 149 | end 150 | rescue => e 151 | mutex.synchronize { error_count += 1 } 152 | end 153 | end 154 | end 155 | 156 | # Wait for all threads to complete 157 | threads.each(&:join) 158 | 159 | # Check that we got results without segfaults 160 | assert results.size > 0 161 | assert_equal 0, error_count, "Expected no errors during concurrent parsing" 162 | ensure 163 | file.close 164 | file.unlink 165 | end 166 | end 167 | 168 | # Test buffer boundary handling which can cause issues with memory safety 169 | def test_buffer_boundary_handling 170 | # Create a file with content designed to test buffer boundaries 171 | file = Tempfile.new(['buffer_boundary', '.csv']) 172 | begin 173 | # The READ_BUFFER_SIZE in the implementation is 16384 bytes 174 | buffer_size = 16384 175 | 176 | # Write CSV header 177 | file.write("id,name,description\n") 178 | 179 | # Row 1: Create a field that ends exactly at buffer boundary 180 | content_size = buffer_size - "1,name1,".length - 1 # -1 for newline 181 | file.write("1,name1,#{"x" * content_size}\n") 182 | 183 | # Row 2: Field that causes buffer boundary to occur right before a delimiter 184 | content_size = buffer_size - "2,".length - 1 # -1 for newline 185 | file.write("2,#{"y" * content_size},desc2\n") 186 | 187 | # Row 3: Field with quoted content that spans across buffer boundary 188 | content_size = buffer_size - 10 189 | file.write("3,\"#{"z" * content_size}\",desc3\n") 190 | 191 | # Row 4: Multiple quoted fields with escaped quotes near buffer boundary 192 | file.write("4,\"#{"a" * (buffer_size/2 - 10)}\"\"#{"b" * 10}\",\"multi\"\"quote\"\n") 193 | 194 | # Flush to ensure content is written 195 | file.flush 196 | 197 | # Try parsing with different options 198 | [ 199 | {}, 200 | { result_type: "array" }, 201 | { flexible: true }, 202 | { lossy: true } 203 | ].each do |opts| 204 | begin 205 | # Parse with each option set 206 | enum = OSV.for_each(file.path, **opts) 207 | 208 | # Read rows while doing aggressive GC 209 | rows = [] 210 | begin 211 | loop do 212 | rows << enum.next 213 | GC.start if rows.size % 2 == 0 214 | end 215 | rescue StopIteration 216 | # Expected at end of file 217 | end 218 | 219 | # Verify we read all rows 220 | assert_equal 4, rows.size, "Should have read 4 rows with options: #{opts}" 221 | rescue => e 222 | raise 223 | end 224 | end 225 | ensure 226 | file.close 227 | file.unlink 228 
| end 229 | end 230 | 231 | # Test with different Ruby string encodings to find encoding-related memory issues 232 | def test_string_encoding_safety 233 | begin 234 | # Create strings with different encodings 235 | utf8_string = "id,name,description\n1,John,Regular\n2,José,Café\n3,你好,世界\n" 236 | ascii_string = utf8_string.encode("ASCII-8BIT", invalid: :replace, undef: :replace) 237 | utf16_string = utf8_string.encode("UTF-16LE") 238 | 239 | # Test with each encoding 240 | [utf8_string, ascii_string, utf16_string].each do |str| 241 | string_io = StringIO.new(str) 242 | 243 | begin 244 | # Parse the string 245 | enum = OSV.for_each(string_io) 246 | 247 | # Read while forcing GC 248 | rows = [] 249 | begin 250 | while rows.size < 10 251 | row = enum.next 252 | rows << row 253 | GC.start if rows.size % 2 == 0 254 | end 255 | rescue StopIteration 256 | # Expected at end 257 | rescue => e 258 | # Some encodings may cause valid errors 259 | if str == utf16_string 260 | assert_match(/invalid|encoding/i, e.message) 261 | else 262 | raise 263 | end 264 | end 265 | rescue => e 266 | # Only UTF-16 is expected to have encoding issues 267 | if str == utf16_string 268 | assert_match(/invalid|encoding/i, e.message) 269 | else 270 | raise 271 | end 272 | end 273 | end 274 | end 275 | end 276 | end -------------------------------------------------------------------------------- /test/performance_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on performance aspects 7 | class PerformanceTest < Minitest::Test 8 | def test_parse_csv_with_many_rows 9 | # Generate test data with 2000 rows 10 | Tempfile.create(%w[test_many_rows .csv]) do |test_file| 11 | test_file.write "id,name,age\n" 12 | 2000.times { |i| test_file.write "#{i},Person#{i},#{20 + i % 50}\n" } 13 | test_file.close 14 | 15 | # Parse and verify 16 | actual = [] 17 | OSV.for_each(test_file.path) { |row| actual << row } 18 | 19 | assert_equal 2000, actual.size 20 | end 21 | end 22 | 23 | def test_parse_csv_with_many_rows_stringio 24 | # Generate test data with 2000 rows 25 | io = StringIO.new 26 | io.write "id,name,age\n" 27 | 2000.times { |i| io.write "#{i},Person#{i},#{20 + i % 50}\n" } 28 | io.rewind 29 | 30 | # Parse and verify 31 | actual = [] 32 | OSV.for_each(io) { |row| actual << row } 33 | 34 | assert_equal 2000, actual.size 35 | end 36 | end -------------------------------------------------------------------------------- /test/stress_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on stress-testing and edge cases 7 | class StressTest < Minitest::Test 8 | def test_segfault_stress_csv_parser_with_many_instances 9 | # This test creates many parser instances simultaneously, 10 | # which can stress the memory management and potentially trigger segfaults 11 | 12 | files = [] 13 | enumerators = [] 14 | 15 | begin 16 | # Create several moderate-sized CSV files 17 | 5.times do |file_idx| 18 | file = Tempfile.new(["stress_#{file_idx}", '.csv']) 19 | files << file 20 | 21 | file.write("id,name,value\n") 22 | 500.times { |i| file.write("#{i},name#{i},value#{i}\n") } 23 | file.flush 24 | end 25 | 26 | # Create many parser instances for each file 27 | files.each do |file| 28 | 10.times do 29 | enumerators << OSV.for_each(file.path) 30 | end 31 | end 32 | 
33 | # Force memory pressure with large temporary objects 34 | temp_strings = [] 35 | 10.times { temp_strings << "x" * (1024 * 1024) } 36 | GC.start 37 | 38 | # Read partially from random enumerators 39 | 100.times do 40 | enum = enumerators.sample 41 | begin 42 | # Read a random number of records, but not too many 43 | rand(1..5).times { enum.next } 44 | rescue StopIteration 45 | # Expected for some enumerators 46 | end 47 | end 48 | 49 | # Force more GC pressure 50 | temp_strings = nil 51 | GC.start(full_mark: true, immediate_sweep: true) 52 | 53 | # Success if we get here without a segfault 54 | assert true 55 | ensure 56 | # Clean up 57 | files.each do |file| 58 | begin 59 | file.close 60 | file.unlink 61 | rescue 62 | # Ignore cleanup errors 63 | end 64 | end 65 | end 66 | end 67 | 68 | def test_parse_csv_with_long_line 69 | long_text = "x" * 1_000_000 70 | csv_content = <<~CSV 71 | id,name,description 72 | 1,John,#{long_text} 73 | 2,Jane,Short description 74 | CSV 75 | 76 | expected = [ 77 | { "id" => "1", "name" => "John", "description" => long_text }, 78 | { "id" => "2", "name" => "Jane", "description" => "Short description" } 79 | ] 80 | 81 | actual = [] 82 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 83 | assert_equal expected, actual 84 | end 85 | 86 | def test_parse_with_garbage_collection_stress 87 | # Create a medium-sized file 88 | Tempfile.create(%w[gc_stress .csv]) do |tempfile| 89 | # Write a decent amount of data 90 | tempfile.write("id,name,value\n") 91 | 1000.times { |i| tempfile.write("#{i},name#{i},value#{i}\n") } 92 | tempfile.flush 93 | 94 | # Enable GC stress mode during parsing 95 | GC.stress = true 96 | begin 97 | count = 0 98 | OSV.for_each(tempfile.path) do |row| 99 | count += 1 100 | # Force some allocations 101 | row.transform_values(&:dup) 102 | # Occasionally force GC 103 | GC.start if count % 100 == 0 104 | end 105 | assert_equal 1000, count 106 | ensure 107 | GC.stress = false 108 | end 109 | end 110 | end 111 | 112 | def test_parse_csv_with_large_data 113 | skip "Skipping large data test in normal test runs" unless ENV["RUN_LARGE_TESTS"] 114 | 115 | # Only run during specific stress test sessions 116 | # Create a large file 117 | Tempfile.create(%w[large_data .csv]) do |tempfile| 118 | tempfile.write("id,name,value\n") 119 | 120 | # Write about 1GB of data 121 | 100_000.times do |i| 122 | # ~10KB per line × 100K = ~1GB 123 | value = "value_#{i}_" + ("x" * 10_000) 124 | tempfile.write("#{i},name#{i},#{value}\n") 125 | end 126 | tempfile.flush 127 | 128 | # Parse the file 129 | count = 0 130 | OSV.for_each(tempfile.path) do |row| 131 | count += 1 132 | # Verify some values to ensure proper parsing 133 | assert_equal count - 1, row["id"].to_i 134 | assert_equal "name#{count - 1}", row["name"] 135 | assert row["value"].start_with?("value_#{count - 1}_") 136 | 137 | # Only read a portion to keep test runtime reasonable 138 | break if count >= 10_000 139 | end 140 | 141 | assert count > 0, "Should have read some rows" 142 | end 143 | end 144 | end -------------------------------------------------------------------------------- /test/test.csv: -------------------------------------------------------------------------------- 1 | id,name,age 2 | 1,John,25 3 | 2,Jane,30 4 | 3,Jim,35 5 | -------------------------------------------------------------------------------- /test/test.tsv: -------------------------------------------------------------------------------- 1 | id name age 2 | 1 John 25 3 | 2 Jane 30 4 | 3 Jim 35 5 | 
--------------------------------------------------------------------------------