├── .cargo └── config.toml ├── .envrc ├── .github ├── FUNDING.yml └── workflows │ ├── gem-push.yml │ └── ruby.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── README.md ├── Rakefile ├── benchmark ├── benchmark.sh ├── comparison_benchmark.rb ├── profile.sh └── ruby_profiling_script.rb ├── ext └── osv │ ├── Cargo.toml │ ├── extconf.rb │ └── src │ ├── allocator.rs │ ├── csv │ ├── builder.rs │ ├── header_cache.rs │ ├── mod.rs │ ├── parser.rs │ ├── record.rs │ ├── record_reader.rs │ └── ruby_reader.rs │ ├── lib.rs │ ├── reader.rs │ └── utils.rs ├── flake.lock ├── flake.nix ├── lib ├── osv.rb ├── osv.rbi └── osv │ └── version.rb ├── osv.gemspec ├── overlay.nix └── test ├── big_test.rb ├── concurrency_test.rb ├── core_functionality_test.rb ├── encoding_test.rb ├── format_options_test.rb ├── gc_stress_test.rb ├── io_handling_test.rb ├── memory_safety_test.rb ├── performance_test.rb ├── stress_test.rb ├── test.csv └── test.tsv /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [profile.profiling] 2 | inherits = "release" 3 | debug = true 4 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.5/direnvrc" "sha256-RuwIS+QKFj/T9M2TFXScjBsLR6V3A17YVoEW/Q6AZ1w=" 2 | 3 | nix_direnv_manual_reload 4 | 5 | use flake . --fallback --accept-flake-config 6 | 7 | # When running in a nix shell, the build assumes it's happening in CI and forces a release build. 8 | # Setting this env var forces it to respect the RB_SYS_CARGO_PROFILE env var. 9 | export RB_SYS_TEST=1 10 | 11 | dotenv 12 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [njaremko] 4 | -------------------------------------------------------------------------------- /.github/workflows/gem-push.yml: -------------------------------------------------------------------------------- 1 | name: Ruby Gem 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build: 8 | name: Build + Publish 9 | runs-on: ubuntu-latest 10 | 11 | permissions: 12 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 13 | contents: write # IMPORTANT: this permission is required for `rake release` to push the release tag 14 | 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | include: 19 | - platform: x86_64-linux 20 | target: x86_64-unknown-linux-gnu 21 | - platform: x86_64-linux-musl 22 | target: x86_64-unknown-linux-musl 23 | - platform: aarch64-linux 24 | target: aarch64-unknown-linux-gnu 25 | - platform: aarch64-linux-musl 26 | target: aarch64-unknown-linux-musl 27 | - platform: x86_64-darwin 28 | target: x86_64-apple-darwin 29 | - platform: arm64-darwin 30 | target: aarch64-apple-darwin 31 | - platform: normal 32 | target: normal 33 | 34 | steps: 35 | - uses: actions/checkout@v4 36 | 37 | - name: Set up Ruby 3.2 38 | uses: ruby/setup-ruby@v1 39 | with: 40 | ruby-version: 3.2 41 | 42 | # For some reason, I need to manually install this, even though it's seemingly automated below... 
43 | - name: rb-sys 44 | run: | 45 | gem install rb_sys 46 | 47 | - uses: oxidize-rb/actions/cross-gem@v1 48 | if: ${{ matrix.target != 'normal' }} 49 | id: cross-gem 50 | with: 51 | platform: ${{ matrix.platform }} 52 | ruby-versions: "3.4,3.3,3.2" 53 | 54 | - uses: actions/upload-artifact@v4 55 | if: ${{ matrix.target != 'normal' }} 56 | with: 57 | name: cross-gem-${{ matrix.platform }} 58 | path: ${{ steps.cross-gem.outputs.gem-path }} 59 | 60 | - name: Set remote URL 61 | shell: bash 62 | run: | 63 | # Attribute commits to the last committer on HEAD 64 | git config --global user.email "$(git log -1 --pretty=format:'%ae')" 65 | git config --global user.name "$(git log -1 --pretty=format:'%an')" 66 | git remote set-url origin "https://x-access-token:${{ github.token }}@github.com/$GITHUB_REPOSITORY" 67 | 68 | - name: Configure trusted publishing credentials 69 | uses: rubygems/configure-rubygems-credentials@v1.0.0 70 | 71 | - name: Download patch 72 | shell: bash 73 | run: | 74 | wget https://raw.githubusercontent.com/rubygems/release-gem/refs/heads/v1/rubygems-attestation-patch.rb 75 | 76 | - name: Run release rake task 77 | if: ${{ matrix.target != 'normal' }} 78 | shell: bash 79 | env: 80 | RUBYOPT: "${{ format('-r{0}/rubygems-attestation-patch.rb {1}', github.workspace, env.RUBYOPT) || env.RUBYOPT }}" 81 | run: | 82 | gem push --key rubygems ${{ steps.cross-gem.outputs.gem-path }} 83 | 84 | - name: Run release rake task 85 | if: ${{ matrix.target == 'normal' }} 86 | shell: bash 87 | env: 88 | RUBYOPT: "${{ format('-r{0}/rubygems-attestation-patch.rb {1}', github.workspace, env.RUBYOPT) || env.RUBYOPT }}" 89 | run: | 90 | gem build osv.gemspec 91 | gem push --key rubygems osv-*.gem 92 | -------------------------------------------------------------------------------- /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby 7 | 8 | name: Ruby 9 | 10 | on: 11 | push: 12 | branches: ["main"] 13 | pull_request: 14 | branches: ["main"] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | test: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | ruby-version: ["3.2"] 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Ruby 29 | uses: ruby/setup-ruby@v1 30 | with: 31 | ruby-version: ${{ matrix.ruby-version }} 32 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 33 | - name: Run tests 34 | run: bundle exec rake 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target* 2 | /tmp 3 | /**/*.bundle 4 | .direnv 5 | pkg/ 6 | **/.DS_Store 7 | .env 8 | /benchmark/*.csv* 9 | profile.json 10 | flamegraph.svg 11 | CLAUDE.md 12 | generate_context.py 13 | context.txt 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.5.3 4 | 5 | - Fix a bug dealing with header interning. 
We weren't actually storing the reference to the interned string, so we kept interning every time, and Ruby seems to have a bug that triggered occasional, random segfaults. 6 | 7 | ## 0.5.2 8 | 9 | - Lots of new tests 10 | - One bug fix with extremely wide CSVs 11 | - Do not intern headers when parsing with result type set to array 12 | 13 | ## 0.5.1 14 | 15 | - Attempting to determine if the value being read is a `StringIO` is difficult to do safely, so we just treat it as an `IO`-like object. 16 | 17 | ## 0.5.0 18 | 19 | - Got rid of surprising behaviour that bypassed Ruby if the provided IO had a file descriptor. It led to confusing bugs where people would write a custom read method that was ignored because we read the file descriptor directly. 20 | - No longer read the file into memory when reading gzipped data 21 | - Clean up the reader implementation in general 22 | 23 | ## 0.4.4 24 | 25 | - Added support for cross-compilation for multiple platforms 26 | 27 | ## 0.4.2 and 0.4.3 28 | 29 | - Fix occasional segfault when parsing with `result_type: :hash` 30 | 31 | ## 0.4.1 32 | 33 | - Fix bug with lossy not being respected when parsing headers 34 | 35 | ## 0.4.0 36 | 37 | - Added `lossy` option to `for_each` that allows replacing invalid UTF-8 characters with a replacement character 38 | - Removed `flexible_default` option from `for_each` 39 | 40 | ## 0.3.21 41 | 42 | - Fix bug where `ignore_null_bytes` was not being respected in enumerators. 43 | 44 | ## 0.3.19 and 0.3.20 45 | 46 | - Added `ignore_null_bytes` option to `for_each` that allows ignoring null bytes in fields 47 | - The latter release just removes an unneeded string copy when filtering out null bytes 48 | 49 | ## 0.3.18 50 | 51 | - Fix handling of passing in explicit nil for optional arguments. 52 | 53 | ## 0.3.17 54 | 55 | - Remove multi-threaded parsing. It was a bad idea. Performance is better without it. Code is simpler. 56 | 57 | ## 0.3.16 58 | 59 | - Optimize hash construction by interning key strings 60 | 61 | ## 0.3.15 62 | 63 | - Some internal refactoring to improve maintainability 64 | - More optimizations for parsing IO-like objects without an underlying file handle 65 | 66 | ## 0.3.14 67 | 68 | After quite a bit of profiling: 69 | 70 | - When you give `OSV` a file handle IO object, we have an optimization to grab the underlying open file handle and do all reading directly in Rust. This release adds lots of optimizations for parsing objects that implement `IO`'s `read` method without having an underlying file handle available. 71 | - This release adds a lot of optimizations for parsing `StringIO` objects, as well as anything that doesn't implement `IO`'s `read` method, but does implement `to_str` or `to_s` methods. 72 | - Further optimizations to string allocations in Rust code. 73 | 74 | ## 0.3.13 75 | 76 | - Turns out, gemspec descriptions cannot be markdown. Fixing that. 77 | 78 | ## 0.3.12 79 | 80 | - Attempt at improving the RubyGems page for the gem 81 | 82 | ## 0.3.11 83 | 84 | - Set license to MIT in gemspec 85 | 86 | ## 0.3.10 87 | 88 | - Added `trim` option to `for_each` that allows trimming of fields and headers 89 | 90 | ## 0.3.9 91 | 92 | - Some optimizations, and a fix for a bug where file handles weren't being closed 93 | 94 | ## 0.3.8 95 | 96 | - Added `flexible` option to `for_each` that allows flexible parsing of CSV files without a default value 97 | 98 | ## 0.3.7 99 | 100 | - Added `flexible_default` option to `for_each` that allows flexible parsing of CSV files when set to a string. Defaults to `nil`.
101 | 102 | ## 0.3.6 103 | 104 | - Fix bug introduced in 0.3.5 where `nil_string` was not being parsed correctly 105 | 106 | ## 0.3.5 107 | 108 | - `nil_string` no longer defaults to an empty string. It now defaults to `nil`. Which means that empty strings are interpreted as empty strings. 109 | 110 | ## 0.3.4 111 | 112 | - Added support for handling non-file backed IO objects in single threaded mode 113 | - General refactoring to improve performance and reduce allocations 114 | 115 | ## 0.3.3 116 | 117 | - Added support for gzip files 118 | 119 | ## 0.3.2 120 | 121 | - Intern strings used as keys in hashes until no longer referenced by Ruby to get rid of extra allocations 122 | 123 | ## 0.3.0 124 | 125 | - Got rid of `for_each_compat`. Now use `for_each(result_type: "array")` or `for_each(result_type: :array)` 126 | - Added `result_type` option to `parse_csv` 127 | - Added `buffer_size` option to `parse_csv` 128 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "adler2" 7 | version = "2.0.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" 10 | 11 | [[package]] 12 | name = "ahash" 13 | version = "0.8.11" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" 16 | dependencies = [ 17 | "cfg-if", 18 | "getrandom 0.2.15", 19 | "once_cell", 20 | "version_check", 21 | "zerocopy", 22 | ] 23 | 24 | [[package]] 25 | name = "aho-corasick" 26 | version = "1.1.3" 27 | source = "registry+https://github.com/rust-lang/crates.io-index" 28 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 29 | dependencies = [ 30 | "memchr", 31 | ] 32 | 33 | [[package]] 34 | name = "bindgen" 35 | version = "0.69.5" 36 | source = "registry+https://github.com/rust-lang/crates.io-index" 37 | checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" 38 | dependencies = [ 39 | "bitflags", 40 | "cexpr", 41 | "clang-sys", 42 | "itertools 0.12.1", 43 | "lazy_static", 44 | "lazycell", 45 | "proc-macro2", 46 | "quote", 47 | "regex", 48 | "rustc-hash", 49 | "shlex", 50 | "syn", 51 | ] 52 | 53 | [[package]] 54 | name = "bitflags" 55 | version = "2.6.0" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" 58 | 59 | [[package]] 60 | name = "cc" 61 | version = "1.2.7" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7" 64 | dependencies = [ 65 | "shlex", 66 | ] 67 | 68 | [[package]] 69 | name = "cexpr" 70 | version = "0.6.0" 71 | source = "registry+https://github.com/rust-lang/crates.io-index" 72 | checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" 73 | dependencies = [ 74 | "nom", 75 | ] 76 | 77 | [[package]] 78 | name = "cfg-if" 79 | version = "1.0.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 82 | 83 | [[package]] 84 | name = "clang-sys" 85 | version = "1.8.1" 86 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" 88 | dependencies = [ 89 | "glob", 90 | "libc", 91 | "libloading", 92 | ] 93 | 94 | [[package]] 95 | name = "crc32fast" 96 | version = "1.4.2" 97 | source = "registry+https://github.com/rust-lang/crates.io-index" 98 | checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" 99 | dependencies = [ 100 | "cfg-if", 101 | ] 102 | 103 | [[package]] 104 | name = "csv" 105 | version = "1.3.1" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" 108 | dependencies = [ 109 | "csv-core", 110 | "itoa", 111 | "ryu", 112 | "serde", 113 | ] 114 | 115 | [[package]] 116 | name = "csv-core" 117 | version = "0.1.11" 118 | source = "registry+https://github.com/rust-lang/crates.io-index" 119 | checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" 120 | dependencies = [ 121 | "memchr", 122 | ] 123 | 124 | [[package]] 125 | name = "either" 126 | version = "1.13.0" 127 | source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" 129 | 130 | [[package]] 131 | name = "errno" 132 | version = "0.3.10" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" 135 | dependencies = [ 136 | "libc", 137 | "windows-sys", 138 | ] 139 | 140 | [[package]] 141 | name = "fastrand" 142 | version = "2.3.0" 143 | source = "registry+https://github.com/rust-lang/crates.io-index" 144 | checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" 145 | 146 | [[package]] 147 | name = "flate2" 148 | version = "1.0.35" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" 151 | dependencies = [ 152 | "crc32fast", 153 | "miniz_oxide", 154 | ] 155 | 156 | [[package]] 157 | name = "getrandom" 158 | version = "0.2.15" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 161 | dependencies = [ 162 | "cfg-if", 163 | "libc", 164 | "wasi 0.11.0+wasi-snapshot-preview1", 165 | ] 166 | 167 | [[package]] 168 | name = "getrandom" 169 | version = "0.3.1" 170 | source = "registry+https://github.com/rust-lang/crates.io-index" 171 | checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" 172 | dependencies = [ 173 | "cfg-if", 174 | "libc", 175 | "wasi 0.13.3+wasi-0.2.2", 176 | "windows-targets", 177 | ] 178 | 179 | [[package]] 180 | name = "glob" 181 | version = "0.3.1" 182 | source = "registry+https://github.com/rust-lang/crates.io-index" 183 | checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" 184 | 185 | [[package]] 186 | name = "itertools" 187 | version = "0.12.1" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" 190 | dependencies = [ 191 | "either", 192 | ] 193 | 194 | [[package]] 195 | name = "itertools" 196 | version = "0.14.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" 199 | dependencies = [ 200 | 
"either", 201 | ] 202 | 203 | [[package]] 204 | name = "itoa" 205 | version = "1.0.14" 206 | source = "registry+https://github.com/rust-lang/crates.io-index" 207 | checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" 208 | 209 | [[package]] 210 | name = "jemalloc-sys" 211 | version = "0.5.4+5.3.0-patched" 212 | source = "registry+https://github.com/rust-lang/crates.io-index" 213 | checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2" 214 | dependencies = [ 215 | "cc", 216 | "libc", 217 | ] 218 | 219 | [[package]] 220 | name = "jemallocator" 221 | version = "0.5.4" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc" 224 | dependencies = [ 225 | "jemalloc-sys", 226 | "libc", 227 | ] 228 | 229 | [[package]] 230 | name = "lazy_static" 231 | version = "1.5.0" 232 | source = "registry+https://github.com/rust-lang/crates.io-index" 233 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 234 | 235 | [[package]] 236 | name = "lazycell" 237 | version = "1.3.0" 238 | source = "registry+https://github.com/rust-lang/crates.io-index" 239 | checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" 240 | 241 | [[package]] 242 | name = "libc" 243 | version = "0.2.169" 244 | source = "registry+https://github.com/rust-lang/crates.io-index" 245 | checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" 246 | 247 | [[package]] 248 | name = "libloading" 249 | version = "0.8.6" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" 252 | dependencies = [ 253 | "cfg-if", 254 | "windows-targets", 255 | ] 256 | 257 | [[package]] 258 | name = "libmimalloc-sys" 259 | version = "0.1.39" 260 | source = "registry+https://github.com/rust-lang/crates.io-index" 261 | checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" 262 | dependencies = [ 263 | "cc", 264 | "libc", 265 | ] 266 | 267 | [[package]] 268 | name = "linux-raw-sys" 269 | version = "0.4.15" 270 | source = "registry+https://github.com/rust-lang/crates.io-index" 271 | checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" 272 | 273 | [[package]] 274 | name = "magnus" 275 | version = "0.7.1" 276 | source = "registry+https://github.com/rust-lang/crates.io-index" 277 | checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab" 278 | dependencies = [ 279 | "magnus-macros", 280 | "rb-sys", 281 | "rb-sys-env", 282 | "seq-macro", 283 | ] 284 | 285 | [[package]] 286 | name = "magnus-macros" 287 | version = "0.6.0" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3" 290 | dependencies = [ 291 | "proc-macro2", 292 | "quote", 293 | "syn", 294 | ] 295 | 296 | [[package]] 297 | name = "memchr" 298 | version = "2.7.4" 299 | source = "registry+https://github.com/rust-lang/crates.io-index" 300 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 301 | 302 | [[package]] 303 | name = "mimalloc" 304 | version = "0.1.43" 305 | source = "registry+https://github.com/rust-lang/crates.io-index" 306 | checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" 307 | dependencies = [ 308 | "libmimalloc-sys", 309 | ] 310 | 311 | [[package]] 312 | name = 
"minimal-lexical" 313 | version = "0.2.1" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 316 | 317 | [[package]] 318 | name = "miniz_oxide" 319 | version = "0.8.2" 320 | source = "registry+https://github.com/rust-lang/crates.io-index" 321 | checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" 322 | dependencies = [ 323 | "adler2", 324 | ] 325 | 326 | [[package]] 327 | name = "nom" 328 | version = "7.1.3" 329 | source = "registry+https://github.com/rust-lang/crates.io-index" 330 | checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" 331 | dependencies = [ 332 | "memchr", 333 | "minimal-lexical", 334 | ] 335 | 336 | [[package]] 337 | name = "once_cell" 338 | version = "1.20.2" 339 | source = "registry+https://github.com/rust-lang/crates.io-index" 340 | checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" 341 | 342 | [[package]] 343 | name = "osv" 344 | version = "0.1.0" 345 | dependencies = [ 346 | "ahash", 347 | "csv", 348 | "flate2", 349 | "itertools 0.14.0", 350 | "jemallocator", 351 | "magnus", 352 | "mimalloc", 353 | "rb-sys", 354 | "serde", 355 | "serde_magnus", 356 | "tempfile", 357 | "thiserror", 358 | ] 359 | 360 | [[package]] 361 | name = "proc-macro2" 362 | version = "1.0.92" 363 | source = "registry+https://github.com/rust-lang/crates.io-index" 364 | checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" 365 | dependencies = [ 366 | "unicode-ident", 367 | ] 368 | 369 | [[package]] 370 | name = "quote" 371 | version = "1.0.37" 372 | source = "registry+https://github.com/rust-lang/crates.io-index" 373 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 374 | dependencies = [ 375 | "proc-macro2", 376 | ] 377 | 378 | [[package]] 379 | name = "rb-sys" 380 | version = "0.9.104" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "e2e26425f064a90404ed5e33fee2137b02a9c6d1c83e19394f4d8a476b9d76a2" 383 | dependencies = [ 384 | "rb-sys-build", 385 | ] 386 | 387 | [[package]] 388 | name = "rb-sys-build" 389 | version = "0.9.104" 390 | source = "registry+https://github.com/rust-lang/crates.io-index" 391 | checksum = "c9802c9003c5648ee0a067e9aa8960d402d5f764f682f93c1ed49eec72f6d7fc" 392 | dependencies = [ 393 | "bindgen", 394 | "lazy_static", 395 | "proc-macro2", 396 | "quote", 397 | "regex", 398 | "shell-words", 399 | "syn", 400 | ] 401 | 402 | [[package]] 403 | name = "rb-sys-env" 404 | version = "0.1.2" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb" 407 | 408 | [[package]] 409 | name = "regex" 410 | version = "1.11.1" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 413 | dependencies = [ 414 | "aho-corasick", 415 | "memchr", 416 | "regex-automata", 417 | "regex-syntax", 418 | ] 419 | 420 | [[package]] 421 | name = "regex-automata" 422 | version = "0.4.9" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 425 | dependencies = [ 426 | "aho-corasick", 427 | "memchr", 428 | "regex-syntax", 429 | ] 430 | 431 | [[package]] 432 | name = "regex-syntax" 433 | version = "0.8.5" 434 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 435 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 436 | 437 | [[package]] 438 | name = "rustc-hash" 439 | version = "1.1.0" 440 | source = "registry+https://github.com/rust-lang/crates.io-index" 441 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 442 | 443 | [[package]] 444 | name = "rustix" 445 | version = "0.38.44" 446 | source = "registry+https://github.com/rust-lang/crates.io-index" 447 | checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" 448 | dependencies = [ 449 | "bitflags", 450 | "errno", 451 | "libc", 452 | "linux-raw-sys", 453 | "windows-sys", 454 | ] 455 | 456 | [[package]] 457 | name = "ryu" 458 | version = "1.0.18" 459 | source = "registry+https://github.com/rust-lang/crates.io-index" 460 | checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" 461 | 462 | [[package]] 463 | name = "seq-macro" 464 | version = "0.3.5" 465 | source = "registry+https://github.com/rust-lang/crates.io-index" 466 | checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" 467 | 468 | [[package]] 469 | name = "serde" 470 | version = "1.0.216" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" 473 | dependencies = [ 474 | "serde_derive", 475 | ] 476 | 477 | [[package]] 478 | name = "serde_derive" 479 | version = "1.0.216" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" 482 | dependencies = [ 483 | "proc-macro2", 484 | "quote", 485 | "syn", 486 | ] 487 | 488 | [[package]] 489 | name = "serde_magnus" 490 | version = "0.9.0" 491 | source = "registry+https://github.com/rust-lang/crates.io-index" 492 | checksum = "51b8b945a2dadb221f1c5490cfb411cab6c3821446b8eca50ee07e5a3893ec51" 493 | dependencies = [ 494 | "magnus", 495 | "serde", 496 | "tap", 497 | ] 498 | 499 | [[package]] 500 | name = "shell-words" 501 | version = "1.1.0" 502 | source = "registry+https://github.com/rust-lang/crates.io-index" 503 | checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde" 504 | 505 | [[package]] 506 | name = "shlex" 507 | version = "1.3.0" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 510 | 511 | [[package]] 512 | name = "syn" 513 | version = "2.0.91" 514 | source = "registry+https://github.com/rust-lang/crates.io-index" 515 | checksum = "d53cbcb5a243bd33b7858b1d7f4aca2153490815872d86d955d6ea29f743c035" 516 | dependencies = [ 517 | "proc-macro2", 518 | "quote", 519 | "unicode-ident", 520 | ] 521 | 522 | [[package]] 523 | name = "tap" 524 | version = "1.0.1" 525 | source = "registry+https://github.com/rust-lang/crates.io-index" 526 | checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" 527 | 528 | [[package]] 529 | name = "tempfile" 530 | version = "3.17.1" 531 | source = "registry+https://github.com/rust-lang/crates.io-index" 532 | checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" 533 | dependencies = [ 534 | "cfg-if", 535 | "fastrand", 536 | "getrandom 0.3.1", 537 | "once_cell", 538 | "rustix", 539 | "windows-sys", 540 | ] 541 | 542 | [[package]] 543 | name = "thiserror" 544 | version = "2.0.9" 545 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 546 | checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" 547 | dependencies = [ 548 | "thiserror-impl", 549 | ] 550 | 551 | [[package]] 552 | name = "thiserror-impl" 553 | version = "2.0.9" 554 | source = "registry+https://github.com/rust-lang/crates.io-index" 555 | checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" 556 | dependencies = [ 557 | "proc-macro2", 558 | "quote", 559 | "syn", 560 | ] 561 | 562 | [[package]] 563 | name = "unicode-ident" 564 | version = "1.0.14" 565 | source = "registry+https://github.com/rust-lang/crates.io-index" 566 | checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" 567 | 568 | [[package]] 569 | name = "version_check" 570 | version = "0.9.5" 571 | source = "registry+https://github.com/rust-lang/crates.io-index" 572 | checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 573 | 574 | [[package]] 575 | name = "wasi" 576 | version = "0.11.0+wasi-snapshot-preview1" 577 | source = "registry+https://github.com/rust-lang/crates.io-index" 578 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 579 | 580 | [[package]] 581 | name = "wasi" 582 | version = "0.13.3+wasi-0.2.2" 583 | source = "registry+https://github.com/rust-lang/crates.io-index" 584 | checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" 585 | dependencies = [ 586 | "wit-bindgen-rt", 587 | ] 588 | 589 | [[package]] 590 | name = "windows-sys" 591 | version = "0.59.0" 592 | source = "registry+https://github.com/rust-lang/crates.io-index" 593 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 594 | dependencies = [ 595 | "windows-targets", 596 | ] 597 | 598 | [[package]] 599 | name = "windows-targets" 600 | version = "0.52.6" 601 | source = "registry+https://github.com/rust-lang/crates.io-index" 602 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 603 | dependencies = [ 604 | "windows_aarch64_gnullvm", 605 | "windows_aarch64_msvc", 606 | "windows_i686_gnu", 607 | "windows_i686_gnullvm", 608 | "windows_i686_msvc", 609 | "windows_x86_64_gnu", 610 | "windows_x86_64_gnullvm", 611 | "windows_x86_64_msvc", 612 | ] 613 | 614 | [[package]] 615 | name = "windows_aarch64_gnullvm" 616 | version = "0.52.6" 617 | source = "registry+https://github.com/rust-lang/crates.io-index" 618 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 619 | 620 | [[package]] 621 | name = "windows_aarch64_msvc" 622 | version = "0.52.6" 623 | source = "registry+https://github.com/rust-lang/crates.io-index" 624 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 625 | 626 | [[package]] 627 | name = "windows_i686_gnu" 628 | version = "0.52.6" 629 | source = "registry+https://github.com/rust-lang/crates.io-index" 630 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 631 | 632 | [[package]] 633 | name = "windows_i686_gnullvm" 634 | version = "0.52.6" 635 | source = "registry+https://github.com/rust-lang/crates.io-index" 636 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 637 | 638 | [[package]] 639 | name = "windows_i686_msvc" 640 | version = "0.52.6" 641 | source = "registry+https://github.com/rust-lang/crates.io-index" 642 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 643 | 644 | [[package]] 645 | name = "windows_x86_64_gnu" 
646 | version = "0.52.6" 647 | source = "registry+https://github.com/rust-lang/crates.io-index" 648 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 649 | 650 | [[package]] 651 | name = "windows_x86_64_gnullvm" 652 | version = "0.52.6" 653 | source = "registry+https://github.com/rust-lang/crates.io-index" 654 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 655 | 656 | [[package]] 657 | name = "windows_x86_64_msvc" 658 | version = "0.52.6" 659 | source = "registry+https://github.com/rust-lang/crates.io-index" 660 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 661 | 662 | [[package]] 663 | name = "wit-bindgen-rt" 664 | version = "0.33.0" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" 667 | dependencies = [ 668 | "bitflags", 669 | ] 670 | 671 | [[package]] 672 | name = "zerocopy" 673 | version = "0.7.35" 674 | source = "registry+https://github.com/rust-lang/crates.io-index" 675 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 676 | dependencies = [ 677 | "zerocopy-derive", 678 | ] 679 | 680 | [[package]] 681 | name = "zerocopy-derive" 682 | version = "0.7.35" 683 | source = "registry+https://github.com/rust-lang/crates.io-index" 684 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 685 | dependencies = [ 686 | "proc-macro2", 687 | "quote", 688 | "syn", 689 | ] 690 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["./ext/osv"] 3 | resolver = "2" 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem "rb_sys", "~> 0.9.56" 4 | gem "rake" 5 | 6 | # Use local version of osv 7 | gemspec 8 | 9 | group :development, :test do 10 | gem "csv" 11 | gem "minitest", "~> 5.0" 12 | gem "benchmark-ips", "~> 2.12" 13 | gem "fastcsv", "~> 0.0.7" 14 | end 15 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | osv (0.5.2) 5 | rb_sys (~> 0.9.39) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | benchmark-ips (2.14.0) 11 | csv (3.3.2) 12 | fastcsv (0.0.7) 13 | minitest (5.25.4) 14 | rake (13.2.1) 15 | rake-compiler (1.2.0) 16 | rake 17 | rb_sys (0.9.104) 18 | 19 | PLATFORMS 20 | arm64-darwin-23 21 | ruby 22 | 23 | DEPENDENCIES 24 | benchmark-ips (~> 2.12) 25 | csv 26 | fastcsv (~> 0.0.7) 27 | minitest (~> 5.0) 28 | osv! 
29 | rake 30 | rake-compiler (~> 1.2.0) 31 | rb_sys (~> 0.9.56) 32 | 33 | BUNDLED WITH 34 | 2.5.11 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Nathan Jaremko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OSV 2 | 3 | [![Gem Version](https://badge.fury.io/rb/osv.svg)](https://badge.fury.io/rb/osv) 4 | 5 | OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [csv-rs](https://github.com/BurntSushi/rust-csv) crate. 6 | 7 | It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats. 8 | 9 | The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode. 
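To make that difference concrete, here is a small illustrative sketch (the inline data is made up; the outputs shown in comments are what you would expect, not captured program output):

```ruby
require "osv"
require "stringio"

csv_text = "name,age\nJohn,25"

# Hash mode (default): each row is a Hash keyed by the header row
OSV.for_each(StringIO.new(csv_text)) { |row| p row } # => {"name"=>"John", "age"=>"25"}

# Array mode: each row is an Array in column order
OSV.for_each(StringIO.new(csv_text), result_type: :array) { |row| p row } # => ["John", "25"]
```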
10 | 11 | ## Installation 12 | 13 | Add this line to your application's Gemfile: 14 | 15 | ```ruby 16 | gem 'osv' 17 | ``` 18 | 19 | And then execute: 20 | 21 | ```bash 22 | bundle install 23 | ``` 24 | 25 | Or install it directly: 26 | 27 | ```bash 28 | gem install osv 29 | ``` 30 | 31 | ## Usage 32 | 33 | ### Reading CSV Files 34 | 35 | ```ruby 36 | require 'osv' 37 | 38 | # Basic usage - each row as a hash 39 | OSV.for_each("data.csv") do |row| 40 | puts row["name"] # => "John" 41 | puts row["age"] # => "25" 42 | end 43 | 44 | # Return an enumerator instead of using a block 45 | rows = OSV.for_each("data.csv") 46 | rows.each { |row| puts row["name"] } 47 | 48 | # High-performance array mode 49 | OSV.for_each("data.csv", result_type: :array) do |row| 50 | puts row[0] # First column 51 | puts row[1] # Second column 52 | end 53 | ``` 54 | 55 | ### Input Sources 56 | 57 | ```ruby 58 | # From a file path 59 | OSV.for_each("data.csv") { |row| puts row["name"] } 60 | 61 | # From a gzipped file path 62 | OSV.for_each("data.csv.gz") { |row| puts row["name"] } 63 | 64 | # From an IO object 65 | File.open("data.csv") { |file| OSV.for_each(file) { |row| puts row["name"] } } 66 | 67 | # From a string 68 | data = StringIO.new("name,age\nJohn,25") 69 | OSV.for_each(data) { |row| puts row["name"] } 70 | ``` 71 | 72 | ### Configuration Options 73 | 74 | ```ruby 75 | OSV.for_each("data.csv", 76 | # Input formatting 77 | has_headers: true, # First row contains headers (default: true) 78 | col_sep: ",", # Column separator (default: ",") 79 | quote_char: '"', # Quote character (default: '"') 80 | 81 | # Output formatting 82 | result_type: :hash, # :hash or :array (hash is default) 83 | nil_string: nil, # String to interpret as nil when parsing (default: nil) 84 | 85 | # Parsing behavior 86 | flexible: false, # Allow varying number of fields (default: false) 87 | trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil) 88 | buffer_size: 1024, # Number of rows to buffer in memory (default: 1024) 89 | ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false) 90 | lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false) 91 | ) 92 | ``` 93 | 94 | #### Available Options 95 | 96 | - `has_headers`: Boolean indicating if the first row contains headers (default: true) 97 | - `col_sep`: String specifying the field separator (default: ",") 98 | - `quote_char`: String specifying the quote character (default: "\"") 99 | - `nil_string`: String that should be interpreted as nil 100 | - by default, empty strings are interpreted as empty strings 101 | - if you want to interpret empty strings as nil, set this to an empty string 102 | - `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024) 103 | - `result_type`: String or Symbol specifying the output format ("hash", "array", :hash, or :array) 104 | - `flexible`: Boolean specifying if the parser should be flexible (default: false) 105 | - `trim`: String or Symbol specifying the trim mode ("all", "headers", "fields", :all, :headers, or :fields) 106 | - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false) 107 | - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false) 108 | 109 | When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
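As a concrete illustration of combining several of these options, here is a hedged sketch for a headerless, tab-separated input (the inline data and the "NULL" marker are made up for this example):

```ruby
require "osv"
require "stringio"

# Two tab-separated rows, no header row, "NULL" standing in for missing values
data = StringIO.new("John\t25\nJane\tNULL")

OSV.for_each(data, has_headers: false, col_sep: "\t", nil_string: "NULL") do |row|
  # With has_headers: false, keys are generated as "c0", "c1", ...
  name = row["c0"]
  age  = row["c1"] # nil for the "NULL" field because of nil_string
  puts "#{name}: #{age.inspect}"
end
```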
110 | 111 | ## Requirements 112 | 113 | - Ruby >= 3.1.0 114 | - Rust toolchain (for installation from source) 115 | 116 | ## Performance 117 | 118 | This library is faster than the standard Ruby CSV library. It's also faster than any other CSV gem I've been able to find. 119 | 120 | Here's some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file. 121 | 122 | ### 1,000,000 records 123 | 124 | ``` 125 | 🏃 Running benchmarks... 126 | Benchmarking with 3000001 lines of data 127 | 128 | ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24] 129 | Warming up -------------------------------------- 130 | CSV - StringIO 1.000 i/100ms 131 | FastCSV - StringIO 1.000 i/100ms 132 | OSV - StringIO 1.000 i/100ms 133 | CSV - Hash output 1.000 i/100ms 134 | OSV - Hash output 1.000 i/100ms 135 | CSV - Array output 1.000 i/100ms 136 | OSV - Array output 1.000 i/100ms 137 | FastCSV - Array output 138 | 1.000 i/100ms 139 | OSV - Direct Open Array output 140 | 1.000 i/100ms 141 | OSV - Gzipped 1.000 i/100ms 142 | OSV - Gzipped Direct 1.000 i/100ms 143 | FastCSV - Gzipped 1.000 i/100ms 144 | CSV - Gzipped 1.000 i/100ms 145 | Calculating ------------------------------------- 146 | CSV - StringIO 0.081 (± 0.0%) i/s (12.36 s/i) - 3.000 in 37.155983s 147 | FastCSV - StringIO 0.367 (± 0.0%) i/s (2.73 s/i) - 11.000 in 30.182262s 148 | OSV - StringIO 0.673 (± 0.0%) i/s (1.49 s/i) - 20.000 in 30.247575s 149 | CSV - Hash output 0.056 (± 0.0%) i/s (17.73 s/i) - 2.000 in 35.464673s 150 | OSV - Hash output 0.266 (± 0.0%) i/s (3.77 s/i) - 8.000 in 30.511406s 151 | CSV - Array output 0.068 (± 0.0%) i/s (14.76 s/i) - 3.000 in 44.371496s 152 | OSV - Array output 0.631 (± 0.0%) i/s (1.59 s/i) - 19.000 in 30.896566s 153 | FastCSV - Array output 154 | 0.369 (± 0.0%) i/s (2.71 s/i) - 12.000 in 32.518984s 155 | OSV - Direct Open Array output 156 | 0.642 (± 0.0%) i/s (1.56 s/i) - 19.000 in 30.162703s 157 | OSV - Gzipped 0.519 (± 0.0%) i/s (1.93 s/i) - 16.000 in 31.551051s 158 | OSV - Gzipped Direct 0.512 (± 0.0%) i/s (1.95 s/i) - 16.000 in 31.630035s 159 | FastCSV - Gzipped 0.321 (± 0.0%) i/s (3.12 s/i) - 10.000 in 31.795400s 160 | CSV - Gzipped 0.058 (± 0.0%) i/s (17.34 s/i) - 2.000 in 34.686451s 161 | 162 | Comparison: 163 | OSV - StringIO: 0.7 i/s 164 | OSV - Direct Open Array output: 0.6 i/s - 1.05x slower 165 | OSV - Array output: 0.6 i/s - 1.07x slower 166 | OSV - Gzipped: 0.5 i/s - 1.30x slower 167 | OSV - Gzipped Direct: 0.5 i/s - 1.31x slower 168 | FastCSV - Array output: 0.4 i/s - 1.82x slower 169 | FastCSV - StringIO: 0.4 i/s - 1.83x slower 170 | FastCSV - Gzipped: 0.3 i/s - 2.10x slower 171 | OSV - Hash output: 0.3 i/s - 2.53x slower 172 | CSV - StringIO: 0.1 i/s - 8.31x slower 173 | CSV - Array output: 0.1 i/s - 9.93x slower 174 | CSV - Gzipped: 0.1 i/s - 11.66x slower 175 | CSV - Hash output: 0.1 i/s - 11.92x slower 176 | ``` 177 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "rake/testtask" 4 | require "rb_sys/extensiontask" 5 | 6 | task default: :test 7 | 8 | GEMSPEC = Gem::Specification.load("osv.gemspec") 9 | 10 | platforms = [ 11 | "x86_64-linux", 12 | "x86_64-linux-musl", 13 | "aarch64-linux", 14 | "aarch64-linux-musl", 15 | "x86_64-darwin", 16 | "arm64-darwin" 17 | ] 18 | 19 | RbSys::ExtensionTask.new("osv", GEMSPEC) do |ext| 20 | ext.lib_dir = 
"lib/osv" 21 | ext.ext_dir = "ext/osv" 22 | ext.cross_compile = true 23 | ext.cross_platform = platforms 24 | ext.cross_compiling do |spec| 25 | spec.dependencies.reject! { |dep| dep.name == "rb_sys" } 26 | spec.files.reject! { |file| File.fnmatch?("ext/*", file, File::FNM_EXTGLOB) } 27 | end 28 | end 29 | 30 | Rake::TestTask.new do |t| 31 | t.deps << :compile 32 | t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)] 33 | t.libs << "lib" 34 | t.libs << "test" 35 | end 36 | 37 | task :release do 38 | sh "bundle exec rake test" 39 | sh "mkdir -p pkg" 40 | sh "gem build osv.gemspec -o pkg/osv-#{OSV::VERSION}.gem" 41 | sh "gem push pkg/osv-#{OSV::VERSION}.gem" 42 | end 43 | -------------------------------------------------------------------------------- /benchmark/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | export RB_SYS_CARGO_PROFILE=profiling 5 | 6 | # echo "🧹 Cleaning previous build..." 7 | # cargo clean 8 | 9 | echo "📦 Installing Ruby dependencies..." 10 | bundle install 11 | 12 | echo "🔨 Compiling Rust extension..." 13 | bundle exec rake compile 14 | 15 | echo "🏃 Running benchmarks..." 16 | bundle exec benchmark/comparison_benchmark.rb 17 | -------------------------------------------------------------------------------- /benchmark/comparison_benchmark.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "benchmark/ips" 5 | require "csv" 6 | require "osv" 7 | require "fastcsv" 8 | require "stringio" 9 | require "zlib" 10 | require "fileutils" 11 | 12 | RubyVM::YJIT.enable 13 | 14 | # Generate a larger test file for more meaningful benchmarks 15 | def generate_test_data(rows = 1_000_000) 16 | if File.exist?("benchmark/test.csv") 17 | age_total = 0 18 | CSV.foreach("benchmark/test.csv", headers: true) { |row| age_total += row["age"].to_i } 19 | return StringIO.new(File.read("benchmark/test.csv")), age_total 20 | end 21 | 22 | age = 0 23 | headers = %w[ 24 | id 25 | name 26 | age 27 | email 28 | city 29 | country 30 | salary 31 | department 32 | hire_date 33 | manager_id 34 | performance_score 35 | project_count 36 | active 37 | notes 38 | last_login 39 | description 40 | skills 41 | address 42 | ] 43 | CSV.open("benchmark/test.csv", "w", write_headers: true, headers: headers) do |csv| 44 | rows.times do |i| 45 | row_age = rand(18..80) 46 | age += row_age 47 | csv << [ 48 | i, 49 | "Person#{i}", 50 | row_age, 51 | "person#{i}@example.com", 52 | "City#{i}", 53 | "Country#{i}", 54 | rand(30_000..200_000), 55 | %w[Engineering Sales Marketing HR Finance].sample, 56 | "2020-#{rand(1..12)}-#{rand(1..28)}", 57 | rand(1..1000), 58 | rand(1..5).to_f, 59 | rand(1..10), 60 | [true, false].sample, 61 | "", 62 | "", 63 | # Large quoted text with commas and quotes 64 | "A very long description of person #{i}'s background, including multiple, comma-separated clauses. 
The person has \"special\" skills and experience in various fields.", 65 | # Array-like quoted text with commas 66 | "Ruby,Python,JavaScript,\"DevOps\",\"Cloud Architecture\"", 67 | # Address with embedded newlines and quotes 68 | "123 Main St.\nApt \"B\"\nSuite 100" 69 | ] 70 | end 71 | end 72 | 73 | file_string = File.read("benchmark/test.csv") 74 | 75 | Zlib::GzipWriter.open("benchmark/test.csv.gz") do |gz| 76 | CSV 77 | .new(gz, write_headers: true, headers: headers) 78 | .tap { |csv| CSV.parse(file_string, headers: true) { |row| csv << row } } 79 | end 80 | 81 | str = StringIO.new(file_string) 82 | [str, age] 83 | end 84 | 85 | TEST_FILES = %w[benchmark/test.csv benchmark/test.csv.gz].freeze 86 | 87 | begin 88 | # Create test files 89 | test_data, age = generate_test_data 90 | 91 | # Create gzipped version 92 | 93 | puts "Benchmarking with #{`wc -l benchmark/test.csv`.to_i} lines of data\n\n" 94 | 95 | Benchmark.ips do |x| 96 | x.config(time: 30, warmup: 5) 97 | 98 | x.report("CSV - StringIO") do 99 | count = 0 100 | io = StringIO.new(test_data.string) 101 | CSV.new(io).each { |row| count += row[2].to_i } 102 | io.close 103 | raise "Age mismatch: #{age} != #{count}" if age != count 104 | end 105 | 106 | x.report("FastCSV - StringIO") do 107 | count = 0 108 | io = StringIO.new(test_data.string) 109 | FastCSV.raw_parse(io) { |row| count += row[2].to_i } 110 | 111 | raise "Age mismatch: #{age} != #{count}" if age != count 112 | end 113 | 114 | x.report("OSV - StringIO") do 115 | count = 0 116 | io = StringIO.new(test_data.string) 117 | OSV.for_each(io, result_type: :array) { |row| count += row[2].to_i } 118 | raise "Age mismatch: #{age} != #{count}" if age != count 119 | end 120 | 121 | x.report("CSV - Hash output") do 122 | count = 0 123 | File.open("benchmark/test.csv") { |f| CSV.new(f, headers: true).each { |row| count += row["age"].to_i } } 124 | raise "Age mismatch: #{age} != #{count}" if age != count 125 | end 126 | 127 | x.report("OSV - Hash output") do 128 | count = 0 129 | File.open("benchmark/test.csv") { |f| OSV.for_each(f) { |row| count += row["age"].to_i } } 130 | raise "Age mismatch: #{age} != #{count}" if age != count 131 | end 132 | 133 | x.report("CSV - Array output") do 134 | count = 0 135 | File.open("benchmark/test.csv") { |f| CSV.new(f).each { |row| count += row[2].to_i } } 136 | raise "Age mismatch: #{age} != #{count}" if age != count 137 | end 138 | 139 | x.report("OSV - Array output") do 140 | count = 0 141 | File.open("benchmark/test.csv") { |f| OSV.for_each(f, result_type: :array) { |row| count += row[2].to_i } } 142 | raise "Age mismatch: #{age} != #{count}" if age != count 143 | end 144 | 145 | x.report("FastCSV - Array output") do 146 | count = 0 147 | File.open("benchmark/test.csv") { |f| FastCSV.raw_parse(f) { |row| count += row[2].to_i } } 148 | raise "Age mismatch: #{age} != #{count}" if age != count 149 | end 150 | 151 | x.report("OSV - Direct Open Array output") do 152 | count = 0 153 | OSV.for_each("benchmark/test.csv", result_type: :array) { |row| count += row[2].to_i } 154 | raise "Age mismatch: #{age} != #{count}" if age != count 155 | end 156 | 157 | x.report("OSV - Gzipped") do 158 | count = 0 159 | Zlib::GzipReader.open("benchmark/test.csv.gz") do |gz| 160 | OSV.for_each(gz, result_type: :array) { |row| count += row[2].to_i } 161 | end 162 | raise "Age mismatch: #{age} != #{count}" if age != count 163 | end 164 | 165 | x.report("OSV - Gzipped Direct") do 166 | count = 0 167 | OSV.for_each("benchmark/test.csv.gz", result_type: :array) { |row| 
count += row[2].to_i } 168 | raise "Age mismatch: #{age} != #{count}" if age != count 169 | end 170 | 171 | x.report("FastCSV - Gzipped") do 172 | count = 0 173 | Zlib::GzipReader.open("benchmark/test.csv.gz") { |gz| FastCSV.raw_parse(gz) { |row| count += row[2].to_i } } 174 | raise "Age mismatch: #{age} != #{count}" if age != count 175 | end 176 | 177 | x.report("CSV - Gzipped") do 178 | count = 0 179 | Zlib::GzipReader.open("benchmark/test.csv.gz") do |gz| 180 | CSV.new(gz, headers: true).each { |row| count += row["age"].to_i } 181 | end 182 | raise "Age mismatch: #{age} != #{count}" if age != count 183 | end 184 | 185 | x.compare! 186 | end 187 | ensure 188 | # Cleanup test files even if the script fails or is interrupted 189 | # FileUtils.rm_f(TEST_FILES) 190 | end 191 | -------------------------------------------------------------------------------- /benchmark/profile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | export RB_SYS_CARGO_PROFILE=profiling 5 | 6 | echo "📦 Installing Ruby dependencies..." 7 | bundle install 8 | 9 | echo "🔨 Compiling Rust extension..." 10 | bundle exec rake compile 11 | 12 | # cargo install flamegraph 13 | sudo flamegraph -o flamegraph.svg -- bundle exec benchmark/ruby_profiling_script.rb 14 | -------------------------------------------------------------------------------- /benchmark/ruby_profiling_script.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "osv" 5 | require "fastcsv" 6 | require "stringio" 7 | require "time" 8 | 9 | # Generate a larger test file for more meaningful benchmarks 10 | def generate_test_data(rows = 1_000_000) 11 | headers = %w[ 12 | id 13 | name 14 | age 15 | email 16 | city 17 | country 18 | salary 19 | department 20 | hire_date 21 | manager_id 22 | performance_score 23 | project_count 24 | active 25 | notes 26 | last_login 27 | ] 28 | StringIO.new.tap do |io| 29 | io.puts headers.join(",") 30 | rows.times do |i| 31 | row = [ 32 | i, 33 | "Person#{i}", 34 | rand(18..80), 35 | "person#{i}@example.com", 36 | "City#{i}", 37 | "Country#{i}", 38 | rand(30_000..200_000), 39 | %w[Engineering Sales Marketing HR Finance].sample, 40 | "2020-#{rand(1..12)}-#{rand(1..28)}", 41 | rand(1..1000), 42 | rand(1..5).to_f, 43 | rand(1..10), 44 | [true, false].sample, 45 | "", 46 | "" 47 | ] 48 | io.puts row.join(",") 49 | end 50 | io.rewind 51 | end 52 | end 53 | 54 | # Generate test data and write to file 55 | test_data = generate_test_data.string 56 | File.write("benchmark_test.csv", test_data) 57 | 58 | io = StringIO.new(test_data) 59 | 60 | # Process the file in a loop for 10 seconds 61 | end_time = Time.now + 30 62 | iterations = 0 63 | 64 | while Time.now < end_time 65 | count = 0 66 | OSV.for_each(io, result_type: :array) { |row| count += row[2].to_i } 67 | # FastCSV.raw_parse(io) { |row| count += row[2].to_i } 68 | io.rewind 69 | iterations += 1 70 | end 71 | 72 | puts "Completed #{iterations} iterations in 10 seconds" 73 | 74 | # Cleanup 75 | File.delete("benchmark_test.csv") 76 | -------------------------------------------------------------------------------- /ext/osv/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "osv" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | crate-type = ["cdylib"] 8 | 9 | [dependencies] 10 | ahash = "0.8" 11 | csv = "^1.3" 12 | flate2 
= "1.0.35" 13 | magnus = { version = "0.7", features = ["rb-sys"] } 14 | rb-sys = "^0.9" 15 | serde = { version = "1.0", features = ["derive"] } 16 | serde_magnus = "0.9.0" 17 | thiserror = "2.0" 18 | itertools = "^0.14" 19 | tempfile = "3.17.1" 20 | 21 | [target.'cfg(target_os = "linux")'.dependencies] 22 | jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } 23 | 24 | [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies] 25 | mimalloc = { version = "0.1", default-features = false } 26 | -------------------------------------------------------------------------------- /ext/osv/extconf.rb: -------------------------------------------------------------------------------- 1 | require "mkmf" 2 | require "rb_sys/mkmf" 3 | 4 | create_rust_makefile("osv/osv") 5 | -------------------------------------------------------------------------------- /ext/osv/src/allocator.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_os = "linux")] 2 | use jemallocator::Jemalloc; 3 | 4 | #[cfg(not(any(target_os = "linux", target_os = "windows")))] 5 | use mimalloc::MiMalloc; 6 | 7 | #[global_allocator] 8 | #[cfg(target_os = "linux")] 9 | static ALLOC: Jemalloc = Jemalloc; 10 | 11 | #[global_allocator] 12 | #[cfg(not(any(target_os = "linux", target_os = "windows")))] 13 | static ALLOC: MiMalloc = MiMalloc; 14 | -------------------------------------------------------------------------------- /ext/osv/src/csv/builder.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | header_cache::{CacheError, StringCache}, 3 | parser::RecordParser, 4 | record_reader::{RecordReader, READ_BUFFER_SIZE}, 5 | ruby_reader::RubyReader, 6 | }; 7 | use magnus::{Error as MagnusError, RString, Ruby, Value}; 8 | use std::{ 9 | borrow::Cow, 10 | io::{self, BufReader}, 11 | marker::PhantomData, 12 | }; 13 | 14 | use thiserror::Error; 15 | 16 | /// Errors that can occur when building a RecordReader 17 | #[derive(Error, Debug)] 18 | pub enum ReaderError { 19 | #[error("Failed to get file descriptor: {0}")] 20 | FileDescriptor(String), 21 | #[error("Invalid file descriptor: {0}")] 22 | InvalidFileDescriptor(i32), 23 | #[error("Failed to open file: {0}")] 24 | FileOpen(#[from] io::Error), 25 | #[error("Failed to intern headers: {0}")] 26 | HeaderIntern(#[from] CacheError), 27 | #[error("Invalid flexible default value: {0}")] 28 | InvalidFlexibleDefault(String), 29 | #[error("Invalid null string value: {0}")] 30 | InvalidNullString(String), 31 | #[error("Failed to parse CSV record: {0}")] 32 | CsvParse(#[from] csv::Error), 33 | #[error("Invalid UTF-8: {0}")] 34 | InvalidUtf8(String), 35 | #[error("Ruby error: {0}")] 36 | Ruby(String), 37 | } 38 | 39 | impl From for ReaderError { 40 | fn from(err: MagnusError) -> Self { 41 | Self::Ruby(err.to_string()) 42 | } 43 | } 44 | 45 | impl From for MagnusError { 46 | fn from(err: ReaderError) -> Self { 47 | let ruby = Ruby::get().unwrap(); 48 | match err { 49 | ReaderError::CsvParse(csv_err) => { 50 | if csv_err.to_string().contains("invalid utf-8") { 51 | MagnusError::new(ruby.exception_encoding_error(), csv_err.to_string()) 52 | } else { 53 | MagnusError::new(ruby.exception_runtime_error(), csv_err.to_string()) 54 | } 55 | } 56 | ReaderError::InvalidUtf8(utf8_err) => { 57 | MagnusError::new(ruby.exception_encoding_error(), utf8_err.to_string()) 58 | } 59 | _ => MagnusError::new(ruby.exception_runtime_error(), err.to_string()), 60 | } 61 | } 62 | } 63 | 64 | 
/// Builder for configuring and creating a RecordReader instance. 65 | /// 66 | /// This struct provides a fluent interface for setting up CSV parsing options 67 | /// and creating a RecordReader with the specified configuration. 68 | pub struct RecordReaderBuilder<'a, 'r, T: RecordParser<'a>> { 69 | ruby: &'r Ruby, 70 | to_read: Value, 71 | has_headers: bool, 72 | delimiter: u8, 73 | quote_char: u8, 74 | null_string: Option, 75 | flexible: bool, 76 | trim: csv::Trim, 77 | ignore_null_bytes: bool, 78 | lossy: bool, 79 | _phantom: PhantomData, 80 | _phantom_a: PhantomData<&'a ()>, 81 | } 82 | 83 | impl<'a, 'r, T: RecordParser<'a>> RecordReaderBuilder<'a, 'r, T> { 84 | /// Creates a new builder instance with default settings. 85 | pub fn new(ruby: &'r Ruby, to_read: Value) -> Self { 86 | Self { 87 | ruby, 88 | to_read, 89 | has_headers: true, 90 | delimiter: b',', 91 | quote_char: b'"', 92 | null_string: None, 93 | flexible: false, 94 | trim: csv::Trim::None, 95 | ignore_null_bytes: false, 96 | lossy: false, 97 | _phantom: PhantomData, 98 | _phantom_a: PhantomData, 99 | } 100 | } 101 | 102 | /// Sets whether the CSV file has headers. 103 | #[must_use] 104 | pub fn has_headers(mut self, has_headers: bool) -> Self { 105 | self.has_headers = has_headers; 106 | self 107 | } 108 | 109 | /// Sets the delimiter character for the CSV. 110 | #[must_use] 111 | pub fn delimiter(mut self, delimiter: u8) -> Self { 112 | self.delimiter = delimiter; 113 | self 114 | } 115 | 116 | /// Sets the quote character for the CSV. 117 | #[must_use] 118 | pub fn quote_char(mut self, quote_char: u8) -> Self { 119 | self.quote_char = quote_char; 120 | self 121 | } 122 | 123 | /// Sets the string that should be interpreted as null. 124 | #[must_use] 125 | pub fn null_string(mut self, null_string: Option) -> Self { 126 | self.null_string = null_string; 127 | self 128 | } 129 | 130 | /// Sets whether the reader should be flexible with field counts. 131 | #[must_use] 132 | pub fn flexible(mut self, flexible: bool) -> Self { 133 | self.flexible = flexible; 134 | self 135 | } 136 | 137 | /// Sets the trimming mode for fields. 138 | #[must_use] 139 | pub fn trim(mut self, trim: csv::Trim) -> Self { 140 | self.trim = trim; 141 | self 142 | } 143 | 144 | #[must_use] 145 | pub fn ignore_null_bytes(mut self, ignore_null_bytes: bool) -> Self { 146 | self.ignore_null_bytes = ignore_null_bytes; 147 | self 148 | } 149 | 150 | #[must_use] 151 | pub fn lossy(mut self, lossy: bool) -> Self { 152 | self.lossy = lossy; 153 | self 154 | } 155 | 156 | /// Builds the RecordReader with the configured options. 157 | pub fn build(self) -> Result, ReaderError> { 158 | let readable = RubyReader::try_from(self.to_read)?; 159 | 160 | let flexible = self.flexible; 161 | let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable); 162 | 163 | let mut reader = csv::ReaderBuilder::new() 164 | .has_headers(self.has_headers) 165 | .delimiter(self.delimiter) 166 | .quote(self.quote_char) 167 | .flexible(flexible) 168 | .trim(self.trim) 169 | .from_reader(reader); 170 | 171 | let mut headers = 172 | RecordReader::::get_headers(self.ruby, &mut reader, self.has_headers, self.lossy)?; 173 | 174 | if self.ignore_null_bytes { 175 | headers = headers.iter().map(|h| h.replace("\0", "")).collect(); 176 | } 177 | 178 | let static_headers = if T::uses_headers() { 179 | StringCache::intern_many(&headers)? 
180 | } else { 181 | Vec::new() 182 | }; 183 | 184 | let null_string = self 185 | .null_string 186 | .map(|s| { 187 | RString::new(&s) 188 | .to_interned_str() 189 | .as_str() 190 | .map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e))) 191 | }) 192 | .transpose()? 193 | .map(Cow::Borrowed); 194 | 195 | Ok(RecordReader::new( 196 | self.ruby, 197 | reader, 198 | static_headers, 199 | null_string, 200 | self.ignore_null_bytes, 201 | self.lossy, 202 | )) 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /ext/osv/src/csv/header_cache.rs: -------------------------------------------------------------------------------- 1 | /// This module exists to avoid cloning header keys in returned HashMaps. 2 | /// Since the underlying RString creation already involves cloning, 3 | /// this caching layer aims to reduce redundant allocations. 4 | /// 5 | /// Note: Performance testing on macOS showed minimal speed improvements, 6 | /// so this optimization could be removed if any issues arise. 7 | use std::{ 8 | collections::HashMap, 9 | sync::{LazyLock, Mutex}, 10 | }; 11 | 12 | use magnus::{ 13 | r_string::FString, 14 | value::{InnerValue, Opaque}, 15 | IntoValue, RString, Ruby, Value, 16 | }; 17 | 18 | use thiserror::Error; 19 | 20 | #[derive(Debug, Clone, Error)] 21 | pub enum CacheError { 22 | #[error("Failed to acquire lock: {0}")] 23 | LockError(String), 24 | #[error("Failed to convert Ruby String to interned string: {0}")] 25 | RStringConversion(String), 26 | } 27 | 28 | static STRING_CACHE: LazyLock>> = 29 | LazyLock::new(|| Mutex::new(HashMap::with_capacity(100))); 30 | 31 | pub struct StringCache; 32 | 33 | #[derive(Copy, Clone)] 34 | pub struct StringCacheKey(Opaque); 35 | 36 | impl StringCacheKey { 37 | pub fn new(string: &str) -> Result { 38 | let rstr = RString::new(string); 39 | let fstr = rstr.to_interned_str(); 40 | // FStrings should not be collected by the GC anyway, but just in case. 
41 | magnus::gc::register_mark_object(fstr); 42 | Ok(Self(Opaque::from(fstr))) 43 | } 44 | 45 | pub fn as_fstr(&self, handle: &Ruby) -> FString { 46 | self.0.get_inner_with(handle) 47 | } 48 | 49 | pub fn as_str(&self, handle: &Ruby) -> Result<&'static str, CacheError> { 50 | self.0 51 | .get_inner_with(handle) 52 | .as_str() 53 | .map_err(|e| CacheError::RStringConversion(e.to_string())) 54 | } 55 | } 56 | 57 | impl IntoValue for StringCacheKey { 58 | fn into_value_with(self, handle: &Ruby) -> Value { 59 | handle.into_value(self.0) 60 | } 61 | } 62 | 63 | impl IntoValue for &StringCacheKey { 64 | fn into_value_with(self, handle: &Ruby) -> Value { 65 | handle.into_value(self.0) 66 | } 67 | } 68 | 69 | impl StringCache { 70 | pub fn intern_many>( 71 | strings: &[AsStr], 72 | ) -> Result, CacheError> { 73 | let mut cache = STRING_CACHE 74 | .lock() 75 | .map_err(|e| CacheError::LockError(e.to_string()))?; 76 | 77 | let mut result: Vec = Vec::with_capacity(strings.len()); 78 | for string in strings { 79 | if let Some((_, interned_string)) = cache.get_key_value(string.as_ref()) { 80 | result.push(*interned_string); 81 | } else { 82 | let interned = StringCacheKey::new(string.as_ref())?; 83 | cache.insert(string.as_ref().to_string(), interned); 84 | result.push(interned); 85 | } 86 | } 87 | Ok(result) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /ext/osv/src/csv/mod.rs: -------------------------------------------------------------------------------- 1 | mod builder; 2 | mod header_cache; 3 | mod parser; 4 | mod record; 5 | mod record_reader; 6 | mod ruby_reader; 7 | 8 | pub use builder::RecordReaderBuilder; 9 | pub use record::CowStr; 10 | pub use record::CsvRecord; 11 | -------------------------------------------------------------------------------- /ext/osv/src/csv/parser.rs: -------------------------------------------------------------------------------- 1 | use super::builder::ReaderError; 2 | use super::header_cache::StringCacheKey; 3 | use super::CowStr; 4 | use magnus::Ruby; 5 | use std::borrow::Cow; 6 | use std::collections::HashMap; 7 | use std::hash::BuildHasher; 8 | 9 | pub enum CsvRecordType { 10 | String(csv::StringRecord), 11 | Byte(csv::ByteRecord), 12 | } 13 | 14 | pub trait RecordParser<'a> { 15 | type Output; 16 | 17 | fn parse( 18 | handle: &Ruby, 19 | headers: &[StringCacheKey], 20 | record: &CsvRecordType, 21 | null_string: Option>, 22 | ignore_null_bytes: bool, 23 | ) -> Result; 24 | 25 | fn uses_headers() -> bool; 26 | } 27 | 28 | impl<'a, S: BuildHasher + Default> RecordParser<'a> 29 | for HashMap<&'static str, Option>, S> 30 | { 31 | type Output = Self; 32 | 33 | #[inline] 34 | fn uses_headers() -> bool { 35 | true 36 | } 37 | 38 | #[inline] 39 | fn parse( 40 | handle: &Ruby, 41 | headers: &[StringCacheKey], 42 | record: &CsvRecordType, 43 | null_string: Option>, 44 | ignore_null_bytes: bool, 45 | ) -> Result { 46 | let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default()); 47 | let shared_empty = Cow::Borrowed(""); 48 | 49 | for (i, header) in headers.iter().enumerate() { 50 | let value = match record { 51 | CsvRecordType::String(s) => s.get(i).and_then(|field| { 52 | convert_field_to_cow_str( 53 | field, 54 | null_string.as_deref(), 55 | ignore_null_bytes, 56 | &shared_empty, 57 | ) 58 | }), 59 | CsvRecordType::Byte(b) => b.get(i).and_then(|field| { 60 | let field = String::from_utf8_lossy(field); 61 | convert_field_to_cow_str( 62 | &field, 63 | null_string.as_deref(), 64 | ignore_null_bytes, 65 
| &shared_empty, 66 | ) 67 | }), 68 | }; 69 | 70 | map.insert(header.as_str(handle)?, value); 71 | } 72 | 73 | Ok(map) 74 | } 75 | } 76 | 77 | impl<'a> RecordParser<'a> for Vec>> { 78 | type Output = Self; 79 | 80 | #[inline] 81 | fn uses_headers() -> bool { 82 | false 83 | } 84 | 85 | #[inline] 86 | fn parse( 87 | _handle: &Ruby, 88 | headers: &[StringCacheKey], 89 | record: &CsvRecordType, 90 | null_string: Option>, 91 | ignore_null_bytes: bool, 92 | ) -> Result { 93 | let target_len = headers.len(); 94 | let mut vec = Vec::with_capacity(target_len); 95 | let shared_empty = Cow::Borrowed(""); 96 | 97 | match record { 98 | CsvRecordType::String(record) => { 99 | for field in record.iter() { 100 | let value = convert_field_to_cow_str( 101 | field, 102 | null_string.as_deref(), 103 | ignore_null_bytes, 104 | &shared_empty, 105 | ); 106 | vec.push(value); 107 | } 108 | } 109 | CsvRecordType::Byte(record) => { 110 | for field in record.iter() { 111 | let field = String::from_utf8_lossy(field); 112 | let value = convert_field_to_cow_str( 113 | &field, 114 | null_string.as_deref(), 115 | ignore_null_bytes, 116 | &shared_empty, 117 | ); 118 | vec.push(value); 119 | } 120 | } 121 | } 122 | 123 | Ok(vec) 124 | } 125 | } 126 | 127 | #[inline] 128 | fn convert_field_to_cow_str<'a>( 129 | field: &str, 130 | null_string: Option<&str>, 131 | ignore_null_bytes: bool, 132 | shared_empty: &Cow<'a, str>, 133 | ) -> Option> { 134 | if Some(field) == null_string { 135 | None 136 | } else if field.is_empty() { 137 | Some(CowStr(shared_empty.clone())) 138 | } else if ignore_null_bytes { 139 | Some(CowStr(Cow::Owned(field.replace("\0", "")))) 140 | } else { 141 | Some(CowStr(Cow::Owned(field.to_string()))) 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /ext/osv/src/csv/record.rs: -------------------------------------------------------------------------------- 1 | use itertools::Itertools; 2 | use magnus::{value::ReprValue, IntoValue, Ruby, Value}; 3 | use std::{borrow::Cow, collections::HashMap, hash::BuildHasher}; 4 | 5 | #[derive(Debug)] 6 | pub enum CsvRecord<'a, S: BuildHasher + Default> { 7 | Vec(Vec>>), 8 | Map(HashMap<&'static str, Option>, S>), 9 | } 10 | 11 | impl IntoValue for CsvRecord<'_, S> { 12 | #[inline] 13 | fn into_value_with(self, handle: &Ruby) -> Value { 14 | match self { 15 | CsvRecord::Vec(vec) => { 16 | let ary = handle.ary_new_capa(vec.len()); 17 | vec.into_iter().try_for_each(|v| ary.push(v)).unwrap(); 18 | ary.into_value_with(handle) 19 | } 20 | CsvRecord::Map(map) => { 21 | // Pre-allocate the hash with the known size 22 | let hash = handle.hash_new_capa(map.len()); 23 | 24 | let mut values: [Value; 128] = [handle.qnil().as_value(); 128]; 25 | let mut i = 0; 26 | 27 | for chunk in &map.into_iter().chunks(64) { 28 | for (k, v) in chunk { 29 | values[i] = handle.into_value(k); 30 | values[i + 1] = handle.into_value(v); 31 | i += 2; 32 | } 33 | hash.bulk_insert(&values[..i]).unwrap(); 34 | 35 | // Zero out used values 36 | values[..i].fill(handle.qnil().as_value()); 37 | i = 0; 38 | } 39 | 40 | hash.into_value_with(handle) 41 | } 42 | } 43 | } 44 | } 45 | 46 | #[derive(Debug, Clone)] 47 | pub struct CowStr<'a>(pub Cow<'a, str>); 48 | 49 | impl IntoValue for CowStr<'_> { 50 | fn into_value_with(self, handle: &Ruby) -> Value { 51 | self.0.into_value_with(handle) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /ext/osv/src/csv/record_reader.rs: 
-------------------------------------------------------------------------------- 1 | use super::builder::ReaderError; 2 | use super::header_cache::StringCacheKey; 3 | use super::parser::{CsvRecordType, RecordParser}; 4 | use super::ruby_reader::RubyReader; 5 | use magnus::{Error, Ruby}; 6 | use std::borrow::Cow; 7 | use std::io::{BufReader, Read}; 8 | 9 | /// Size of the internal buffer used for reading CSV records 10 | pub(crate) const READ_BUFFER_SIZE: usize = 16384; 11 | 12 | /// A reader that processes CSV records using a specified parser. 13 | /// 14 | /// This struct implements Iterator to provide a streaming interface for CSV records. 15 | pub struct RecordReader<'a, 'r, T: RecordParser<'a>> { 16 | handle: &'r Ruby, 17 | reader: csv::Reader>, 18 | headers: Vec, 19 | null_string: Option>, 20 | string_record: CsvRecordType, 21 | parser: std::marker::PhantomData, 22 | ignore_null_bytes: bool, 23 | } 24 | 25 | impl<'a, 'r, T: RecordParser<'a>> RecordReader<'a, 'r, T> { 26 | /// Reads and processes headers from a CSV reader. 27 | /// 28 | /// # Arguments 29 | /// * `ruby` - Ruby VM context for error handling 30 | /// * `reader` - CSV reader instance 31 | /// * `has_headers` - Whether the CSV file contains headers 32 | /// 33 | /// # Returns 34 | /// A vector of header strings or generated column names if `has_headers` is false 35 | #[inline] 36 | pub(crate) fn get_headers( 37 | ruby: &Ruby, 38 | reader: &mut csv::Reader, 39 | has_headers: bool, 40 | lossy: bool, 41 | ) -> Result, Error> { 42 | let headers = if lossy { 43 | let first_row = reader.byte_headers().map_err(|e| { 44 | Error::new( 45 | ruby.exception_runtime_error(), 46 | format!("Failed to read headers: {e}"), 47 | ) 48 | })?; 49 | if has_headers { 50 | first_row 51 | .iter() 52 | .map(String::from_utf8_lossy) 53 | .map(|x| x.to_string()) 54 | .collect() 55 | } else { 56 | (0..first_row.len()).map(|i| format!("c{i}")).collect() 57 | } 58 | } else { 59 | let first_row = reader.headers().map_err(|e| { 60 | Error::new( 61 | ruby.exception_runtime_error(), 62 | format!("Failed to read headers: {e}"), 63 | ) 64 | })?; 65 | if has_headers { 66 | first_row.iter().map(String::from).collect() 67 | } else { 68 | (0..first_row.len()).map(|i| format!("c{i}")).collect() 69 | } 70 | }; 71 | 72 | Ok(headers) 73 | } 74 | 75 | /// Creates a new RecordReader instance. 76 | pub(crate) fn new( 77 | handle: &'r Ruby, 78 | reader: csv::Reader>, 79 | headers: Vec, 80 | null_string: Option>, 81 | ignore_null_bytes: bool, 82 | lossy: bool, 83 | ) -> Self { 84 | let headers_len = headers.len(); 85 | Self { 86 | handle, 87 | reader, 88 | headers, 89 | null_string, 90 | string_record: if lossy { 91 | CsvRecordType::Byte(csv::ByteRecord::with_capacity( 92 | READ_BUFFER_SIZE, 93 | headers_len, 94 | )) 95 | } else { 96 | CsvRecordType::String(csv::StringRecord::with_capacity( 97 | READ_BUFFER_SIZE, 98 | headers_len, 99 | )) 100 | }, 101 | parser: std::marker::PhantomData, 102 | ignore_null_bytes, 103 | } 104 | } 105 | 106 | /// Attempts to read the next record, returning any errors encountered. 
107 | fn try_next(&mut self) -> Result, ReaderError> { 108 | let record = match self.string_record { 109 | CsvRecordType::String(ref mut record) => self.reader.read_record(record), 110 | CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record), 111 | }?; 112 | if record { 113 | Ok(Some(T::parse( 114 | self.handle, 115 | &self.headers, 116 | &self.string_record, 117 | self.null_string.clone(), 118 | self.ignore_null_bytes, 119 | )?)) 120 | } else { 121 | Ok(None) 122 | } 123 | } 124 | } 125 | 126 | impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, '_, T> { 127 | type Item = Result; 128 | 129 | #[inline] 130 | fn next(&mut self) -> Option { 131 | match self.try_next() { 132 | Ok(Some(record)) => Some(Ok(record)), 133 | Ok(None) => None, 134 | Err(e) => Some(Err(e)), 135 | } 136 | } 137 | 138 | #[inline] 139 | fn size_hint(&self) -> (usize, Option) { 140 | (0, None) // Cannot determine size without reading entire file 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /ext/osv/src/csv/ruby_reader.rs: -------------------------------------------------------------------------------- 1 | use flate2::bufread::GzDecoder; 2 | use magnus::{ 3 | value::{Opaque, ReprValue}, 4 | RString, Ruby, Value, 5 | }; 6 | use std::{ 7 | fs::File, 8 | io::{self, BufReader, Read, Write}, 9 | }; 10 | 11 | use super::{builder::ReaderError, record_reader::READ_BUFFER_SIZE}; 12 | 13 | /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects) 14 | /// and provide a standard Read implementation for them. 15 | pub enum RubyReader { 16 | String { 17 | inner: Opaque, 18 | offset: usize, 19 | }, 20 | RubyIoLike { 21 | inner: Opaque, 22 | }, 23 | NativeProxyIoLike { 24 | proxy_file: Box, 25 | }, 26 | } 27 | 28 | impl RubyReader { 29 | fn is_io_like(value: &Value) -> bool { 30 | value.respond_to("read", false).unwrap_or(false) 31 | } 32 | } 33 | 34 | impl TryFrom for RubyReader { 35 | type Error = ReaderError; 36 | 37 | fn try_from(value: Value) -> Result { 38 | let ruby = unsafe { Ruby::get_unchecked() }; 39 | if RubyReader::is_io_like(&value) { 40 | Ok(RubyReader::RubyIoLike { 41 | inner: Opaque::from(value), 42 | }) 43 | } else if value.is_kind_of(ruby.class_string()) { 44 | let ruby_string = value.to_r_string()?; 45 | let file_path = ruby_string.to_string()?; 46 | let file = File::open(&file_path)?; 47 | 48 | let x: Box = if file_path.ends_with(".gz") { 49 | let decoder = GzDecoder::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)); 50 | Box::new(decoder) 51 | } else { 52 | Box::new(file) 53 | }; 54 | 55 | Ok(RubyReader::NativeProxyIoLike { proxy_file: x }) 56 | } else { 57 | // Try calling `to_str`, and if that fails, try `to_s` 58 | let string_content = value 59 | .funcall::<_, _, RString>("to_str", ()) 60 | .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?; 61 | Ok(RubyReader::String { 62 | inner: Opaque::from(string_content), 63 | offset: 0, 64 | }) 65 | } 66 | } 67 | } 68 | 69 | impl Read for RubyReader { 70 | fn read(&mut self, mut buf: &mut [u8]) -> io::Result { 71 | let ruby = unsafe { Ruby::get_unchecked() }; 72 | match self { 73 | RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf), 74 | RubyReader::String { inner, offset } => { 75 | let unwrapped_inner = ruby.get_inner(*inner); 76 | 77 | let string_buffer = unsafe { unwrapped_inner.as_slice() }; 78 | if *offset >= string_buffer.len() { 79 | return Ok(0); // EOF 80 | } 81 | 82 | let remaining = string_buffer.len() - *offset; 
83 | let copy_size = remaining.min(buf.len()); 84 | buf[..copy_size].copy_from_slice(&string_buffer[*offset..*offset + copy_size]); 85 | 86 | *offset += copy_size; 87 | 88 | Ok(copy_size) 89 | } 90 | RubyReader::RubyIoLike { inner } => { 91 | let unwrapped_inner = ruby.get_inner(*inner); 92 | 93 | let bytes = unwrapped_inner 94 | .funcall::<_, _, Option>("read", (buf.len(),)) 95 | .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; 96 | 97 | match bytes { 98 | Some(bytes) => { 99 | let string_buffer = unsafe { bytes.as_slice() }; 100 | buf.write_all(string_buffer)?; 101 | Ok(string_buffer.len()) 102 | } 103 | None => Ok(0), 104 | } 105 | } 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /ext/osv/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod allocator; 2 | mod csv; 3 | mod reader; 4 | mod utils; 5 | 6 | use crate::reader::*; 7 | 8 | use magnus::{Error, Ruby}; 9 | 10 | /// Initializes the Ruby extension and defines methods. 11 | #[magnus::init] 12 | fn init(ruby: &Ruby) -> Result<(), Error> { 13 | let module = ruby.define_module("OSV")?; 14 | module.define_module_function("for_each", magnus::method!(parse_csv, -1))?; 15 | Ok(()) 16 | } 17 | -------------------------------------------------------------------------------- /ext/osv/src/reader.rs: -------------------------------------------------------------------------------- 1 | use crate::csv::{CowStr, CsvRecord, RecordReaderBuilder}; 2 | use crate::utils::*; 3 | use ahash::RandomState; 4 | use csv::Trim; 5 | use magnus::value::ReprValue; 6 | use magnus::{Error, IntoValue, KwArgs, Ruby, Symbol, Value}; 7 | use std::collections::HashMap; 8 | 9 | /// Valid result types for CSV parsing 10 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 11 | enum ResultType { 12 | Hash, 13 | Array, 14 | } 15 | 16 | impl ResultType { 17 | fn from_str(s: &str) -> Option { 18 | match s { 19 | "hash" => Some(Self::Hash), 20 | "array" => Some(Self::Array), 21 | _ => None, 22 | } 23 | } 24 | } 25 | 26 | /// Arguments for creating an enumerator 27 | #[derive(Debug)] 28 | struct EnumeratorArgs { 29 | rb_self: Value, 30 | to_read: Value, 31 | has_headers: bool, 32 | delimiter: u8, 33 | quote_char: u8, 34 | null_string: Option, 35 | result_type: String, 36 | flexible: bool, 37 | trim: Option, 38 | ignore_null_bytes: bool, 39 | lossy: bool, 40 | } 41 | 42 | /// Parses a CSV file with the given configuration. 43 | /// 44 | /// # Safety 45 | /// This function uses unsafe code to get the Ruby runtime and leak memory for static references. 46 | /// This is necessary for Ruby integration but should be used with caution. 
47 | pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result { 48 | // SAFETY: We're in a Ruby callback, so Ruby runtime is guaranteed to be initialized 49 | let ruby = unsafe { Ruby::get_unchecked() }; 50 | 51 | let ReadCsvArgs { 52 | to_read, 53 | has_headers, 54 | delimiter, 55 | quote_char, 56 | null_string, 57 | result_type, 58 | flexible, 59 | trim, 60 | ignore_null_bytes, 61 | lossy, 62 | } = parse_read_csv_args(&ruby, args)?; 63 | 64 | if !ruby.block_given() { 65 | return create_enumerator( 66 | &ruby, 67 | EnumeratorArgs { 68 | rb_self, 69 | to_read, 70 | has_headers, 71 | delimiter, 72 | quote_char, 73 | null_string, 74 | result_type, 75 | flexible, 76 | trim: match trim { 77 | Trim::All => Some("all".to_string()), 78 | Trim::Headers => Some("headers".to_string()), 79 | Trim::Fields => Some("fields".to_string()), 80 | _ => None, 81 | }, 82 | ignore_null_bytes, 83 | lossy, 84 | }, 85 | ) 86 | .map(|yield_enum| yield_enum.into_value_with(&ruby)); 87 | } 88 | 89 | let result_type = ResultType::from_str(&result_type).ok_or_else(|| { 90 | Error::new( 91 | ruby.exception_runtime_error(), 92 | "Invalid result type, expected 'hash' or 'array'", 93 | ) 94 | })?; 95 | 96 | match result_type { 97 | ResultType::Hash => { 98 | let builder = RecordReaderBuilder::< 99 | HashMap<&'static str, Option>, RandomState>, 100 | >::new(&ruby, to_read) 101 | .has_headers(has_headers) 102 | .flexible(flexible) 103 | .trim(trim) 104 | .delimiter(delimiter) 105 | .quote_char(quote_char) 106 | .null_string(null_string) 107 | .ignore_null_bytes(ignore_null_bytes) 108 | .lossy(lossy) 109 | .build()?; 110 | 111 | let ruby = unsafe { Ruby::get_unchecked() }; 112 | for result in builder { 113 | let record = result?; 114 | let _: Value = ruby.yield_value(CsvRecord::::Map(record))?; 115 | } 116 | } 117 | ResultType::Array => { 118 | let builder = RecordReaderBuilder::>>>::new(&ruby, to_read) 119 | .has_headers(has_headers) 120 | .flexible(flexible) 121 | .trim(trim) 122 | .delimiter(delimiter) 123 | .quote_char(quote_char) 124 | .null_string(null_string) 125 | .ignore_null_bytes(ignore_null_bytes) 126 | .lossy(lossy) 127 | .build()?; 128 | 129 | let ruby = unsafe { Ruby::get_unchecked() }; 130 | for result in builder { 131 | let record = result?; 132 | let _: Value = ruby.yield_value(CsvRecord::::Vec(record))?; 133 | } 134 | } 135 | } 136 | 137 | let ruby = unsafe { Ruby::get_unchecked() }; 138 | Ok(ruby.qnil().into_value_with(&ruby)) 139 | } 140 | 141 | /// Creates an enumerator for lazy CSV parsing 142 | fn create_enumerator(ruby: &Ruby, args: EnumeratorArgs) -> Result { 143 | let kwargs = ruby.hash_new(); 144 | kwargs.aset(Symbol::new("has_headers"), args.has_headers)?; 145 | kwargs.aset( 146 | Symbol::new("col_sep"), 147 | String::from_utf8(vec![args.delimiter]).unwrap(), 148 | )?; 149 | kwargs.aset( 150 | Symbol::new("quote_char"), 151 | String::from_utf8(vec![args.quote_char]).unwrap(), 152 | )?; 153 | kwargs.aset(Symbol::new("nil_string"), args.null_string)?; 154 | kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?; 155 | kwargs.aset(Symbol::new("flexible"), args.flexible)?; 156 | kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?; 157 | kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?; 158 | kwargs.aset(Symbol::new("lossy"), args.lossy)?; 159 | Ok(args 160 | .rb_self 161 | .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)))) 162 | } 163 | -------------------------------------------------------------------------------- /ext/osv/src/utils.rs: 
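reader.rs above also handles the block-less path: when no block is given, parse_csv packs the parsed options back into a kwargs hash and calls enumeratorize("for_each", ...), so the caller gets a lazy Enumerator that re-enters for_each with the same settings on each enumeration. A short sketch of that behaviour, using the bundled test/test.csv fixture:

require "osv"

rows = OSV.for_each("test/test.csv", result_type: :hash, trim: :all)
rows.class        # => Enumerator
rows.next         # => first row, parsed on demand
rows.to_a.length  # => 3 (re-enumerates from the start via the stored kwargs)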
-------------------------------------------------------------------------------- 1 | use magnus::{ 2 | scan_args::{get_kwargs, scan_args}, 3 | value::ReprValue, 4 | Error, RString, Ruby, Symbol, Value, 5 | }; 6 | 7 | fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> { 8 | if value.is_nil() { 9 | Ok(None) 10 | } else if value.is_kind_of(ruby.class_string()) { 11 | RString::from_value(value) 12 | .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))? 13 | .to_string() 14 | .map(Some) 15 | } else if value.is_kind_of(ruby.class_symbol()) { 16 | Symbol::from_value(value) 17 | .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))? 18 | .funcall("to_s", ()) 19 | .map(Some) 20 | } else { 21 | Err(Error::new( 22 | magnus::exception::type_error(), 23 | "Value must be a String or Symbol", 24 | )) 25 | } 26 | } 27 | 28 | #[derive(Debug)] 29 | pub struct ReadCsvArgs { 30 | pub to_read: Value, 31 | pub has_headers: bool, 32 | pub delimiter: u8, 33 | pub quote_char: u8, 34 | pub null_string: Option<String>, 35 | pub result_type: String, 36 | pub flexible: bool, 37 | pub trim: csv::Trim, 38 | pub ignore_null_bytes: bool, 39 | pub lossy: bool, 40 | } 41 | 42 | /// Parse common arguments for CSV parsing 43 | pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, Error> { 44 | let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?; 45 | let (to_read,) = parsed_args.required; 46 | 47 | let kwargs = get_kwargs::< 48 | _, 49 | (), 50 | ( 51 | Option<Option<bool>>, 52 | Option<Option<String>>, 53 | Option<Option<String>>, 54 | Option<Option<String>>, 55 | Option<Option<Value>>, 56 | Option<Option<bool>>, 57 | Option<Option<Value>>, 58 | Option<Option<bool>>, 59 | Option<Option<bool>>, 60 | ), 61 | (), 62 | >( 63 | parsed_args.keywords, 64 | &[], 65 | &[ 66 | "has_headers", 67 | "col_sep", 68 | "quote_char", 69 | "nil_string", 70 | "result_type", 71 | "flexible", 72 | "trim", 73 | "ignore_null_bytes", 74 | "lossy", 75 | ], 76 | )?; 77 | 78 | let has_headers = kwargs.optional.0.flatten().unwrap_or(true); 79 | 80 | let delimiter = *kwargs 81 | .optional 82 | .1 83 | .flatten() 84 | .unwrap_or_else(|| ",".to_string()) 85 | .as_bytes() 86 | .first() 87 | .ok_or_else(|| { 88 | Error::new( 89 | magnus::exception::runtime_error(), 90 | "Delimiter cannot be empty", 91 | ) 92 | })?; 93 | 94 | let quote_char = *kwargs 95 | .optional 96 | .2 97 | .flatten() 98 | .unwrap_or_else(|| "\"".to_string()) 99 | .as_bytes() 100 | .first() 101 | .ok_or_else(|| { 102 | Error::new( 103 | magnus::exception::runtime_error(), 104 | "Quote character cannot be empty", 105 | ) 106 | })?; 107 | 108 | let null_string = kwargs.optional.3.unwrap_or_default(); 109 | 110 | let result_type = match kwargs 111 | .optional 112 | .4 113 | .flatten() 114 | .map(|value| parse_string_or_symbol(ruby, value)) 115 | { 116 | Some(Ok(Some(parsed))) => match parsed.as_str() { 117 | "hash" | "array" => parsed, 118 | _ => { 119 | return Err(Error::new( 120 | magnus::exception::runtime_error(), 121 | "result_type must be either 'hash' or 'array'", 122 | )) 123 | } 124 | }, 125 | Some(Ok(None)) => String::from("hash"), 126 | Some(Err(_)) => { 127 | return Err(Error::new( 128 | magnus::exception::type_error(), 129 | "result_type must be a String or Symbol", 130 | )) 131 | } 132 | None => String::from("hash"), 133 | }; 134 | 135 | let flexible = kwargs.optional.5.flatten().unwrap_or_default(); 136 | 137 | let trim = match kwargs 138 | .optional 139 | .6 140 | .flatten() 141 | .map(|value| parse_string_or_symbol(ruby, value)) 142 | { 143 | Some(Ok(Some(parsed))) => match parsed.as_str() { 144 | "all" =>
csv::Trim::All, 145 | "headers" => csv::Trim::Headers, 146 | "fields" => csv::Trim::Fields, 147 | invalid => { 148 | return Err(Error::new( 149 | magnus::exception::runtime_error(), 150 | format!( 151 | "trim must be either 'all', 'headers', or 'fields' but got '{}'", 152 | invalid 153 | ), 154 | )) 155 | } 156 | }, 157 | Some(Ok(None)) => csv::Trim::None, 158 | Some(Err(_)) => { 159 | return Err(Error::new( 160 | magnus::exception::type_error(), 161 | "trim must be a String or Symbol", 162 | )) 163 | } 164 | None => csv::Trim::None, 165 | }; 166 | 167 | let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default(); 168 | 169 | let lossy = kwargs.optional.8.flatten().unwrap_or_default(); 170 | 171 | Ok(ReadCsvArgs { 172 | to_read, 173 | has_headers, 174 | delimiter, 175 | quote_char, 176 | null_string, 177 | result_type, 178 | flexible, 179 | trim, 180 | ignore_null_bytes, 181 | lossy, 182 | }) 183 | } 184 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "crane": { 4 | "locked": { 5 | "lastModified": 1734808813, 6 | "narHash": "sha256-3aH/0Y6ajIlfy7j52FGZ+s4icVX0oHhqBzRdlOeztqg=", 7 | "owner": "ipetkov", 8 | "repo": "crane", 9 | "rev": "72e2d02dbac80c8c86bf6bf3e785536acf8ee926", 10 | "type": "github" 11 | }, 12 | "original": { 13 | "owner": "ipetkov", 14 | "repo": "crane", 15 | "type": "github" 16 | } 17 | }, 18 | "flake-parts": { 19 | "inputs": { 20 | "nixpkgs-lib": "nixpkgs-lib" 21 | }, 22 | "locked": { 23 | "lastModified": 1733312601, 24 | "narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=", 25 | "owner": "hercules-ci", 26 | "repo": "flake-parts", 27 | "rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9", 28 | "type": "github" 29 | }, 30 | "original": { 31 | "owner": "hercules-ci", 32 | "repo": "flake-parts", 33 | "type": "github" 34 | } 35 | }, 36 | "flake-utils": { 37 | "inputs": { 38 | "systems": "systems" 39 | }, 40 | "locked": { 41 | "lastModified": 1731533236, 42 | "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", 43 | "owner": "numtide", 44 | "repo": "flake-utils", 45 | "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", 46 | "type": "github" 47 | }, 48 | "original": { 49 | "owner": "numtide", 50 | "repo": "flake-utils", 51 | "type": "github" 52 | } 53 | }, 54 | "napalm": { 55 | "inputs": { 56 | "flake-utils": [ 57 | "flake-utils" 58 | ], 59 | "nixpkgs": [ 60 | "nixpkgs" 61 | ] 62 | }, 63 | "locked": { 64 | "lastModified": 1717929455, 65 | "narHash": "sha256-BiI5xWygriOJuNISnGAeL0KYxrEMnjgpg+7wDskVBhI=", 66 | "owner": "nix-community", 67 | "repo": "napalm", 68 | "rev": "e1babff744cd278b56abe8478008b4a9e23036cf", 69 | "type": "github" 70 | }, 71 | "original": { 72 | "owner": "nix-community", 73 | "repo": "napalm", 74 | "type": "github" 75 | } 76 | }, 77 | "nixpkgs": { 78 | "locked": { 79 | "lastModified": 1735060202, 80 | "narHash": "sha256-5ADWDE/TTw9mKHuWKSmKI6Kyh6HwBh8JJ6QxyaWTnXA=", 81 | "owner": "NixOS", 82 | "repo": "nixpkgs", 83 | "rev": "defcdc88d552bc9758a7afd04b7f4dc8d43aa50b", 84 | "type": "github" 85 | }, 86 | "original": { 87 | "owner": "NixOS", 88 | "ref": "master", 89 | "repo": "nixpkgs", 90 | "type": "github" 91 | } 92 | }, 93 | "nixpkgs-lib": { 94 | "locked": { 95 | "lastModified": 1733096140, 96 | "narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=", 97 | "type": "tarball", 98 | "url": 
"https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" 99 | }, 100 | "original": { 101 | "type": "tarball", 102 | "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" 103 | } 104 | }, 105 | "root": { 106 | "inputs": { 107 | "crane": "crane", 108 | "flake-parts": "flake-parts", 109 | "flake-utils": "flake-utils", 110 | "napalm": "napalm", 111 | "nixpkgs": "nixpkgs", 112 | "rust-overlay": "rust-overlay" 113 | } 114 | }, 115 | "rust-overlay": { 116 | "inputs": { 117 | "nixpkgs": [ 118 | "nixpkgs" 119 | ] 120 | }, 121 | "locked": { 122 | "lastModified": 1735007320, 123 | "narHash": "sha256-NdhUgB9BkLGW9I+Q1GyUUCc3CbDgsg7HLWjG7WZBR5Q=", 124 | "owner": "oxalica", 125 | "repo": "rust-overlay", 126 | "rev": "fb5fdba697ee9a2391ca9ceea3b853b4e3ce37a5", 127 | "type": "github" 128 | }, 129 | "original": { 130 | "owner": "oxalica", 131 | "repo": "rust-overlay", 132 | "type": "github" 133 | } 134 | }, 135 | "systems": { 136 | "locked": { 137 | "lastModified": 1681028828, 138 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 139 | "owner": "nix-systems", 140 | "repo": "default", 141 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 142 | "type": "github" 143 | }, 144 | "original": { 145 | "owner": "nix-systems", 146 | "repo": "default", 147 | "type": "github" 148 | } 149 | } 150 | }, 151 | "root": "root", 152 | "version": 7 153 | } 154 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "Central repository of all giza builds"; 3 | nixConfig = { 4 | max-jobs = 32; 5 | http-connections = 128; 6 | max-substitution-jobs = 128; 7 | substituters = [ 8 | "https://cache.nixos.org?priority=1" 9 | "https://nix-community.cachix.org?priority=2" 10 | ]; 11 | trusted-public-keys = [ 12 | "cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY=" 13 | "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs=" 14 | ]; 15 | # This setting, when true, tries to use symlinks to optimise storage use between nix derivations. 16 | # However, on MacOS, it sometimes runs into issues, and causes stuff to build from scratch... 17 | # Which is strictly worse than using some extra storage sometimes. So we'll force it to false. 18 | auto-optimise-store = false; 19 | }; 20 | inputs = { 21 | flake-utils.url = "github:numtide/flake-utils"; 22 | flake-parts.url = "github:hercules-ci/flake-parts"; 23 | napalm = { 24 | url = "github:nix-community/napalm"; 25 | inputs = { 26 | nixpkgs.follows = "nixpkgs"; 27 | flake-utils.follows = "flake-utils"; 28 | }; 29 | }; 30 | nixpkgs.url = "github:NixOS/nixpkgs/master"; 31 | rust-overlay = { 32 | url = "github:oxalica/rust-overlay"; 33 | inputs = { 34 | nixpkgs.follows = "nixpkgs"; 35 | }; 36 | }; 37 | crane = { 38 | url = "github:ipetkov/crane"; 39 | }; 40 | }; 41 | outputs = inputs: 42 | inputs.flake-parts.lib.mkFlake { inherit inputs; } { 43 | systems = [ "x86_64-linux" "aarch64-linux" "aarch64-darwin" "x86_64-darwin" ]; 44 | imports = [ 45 | 46 | ]; 47 | 48 | perSystem = { config, self', inputs', pkgs, system, ... 
}: 49 | let 50 | linuxSystem = builtins.replaceStrings [ "darwin" ] [ "linux" ] system; 51 | in 52 | { 53 | _module.args.linuxSystem = linuxSystem; 54 | _module.args.pkgs = import inputs.nixpkgs { 55 | inherit system; 56 | overlays = import ./overlay.nix inputs; 57 | config = { 58 | allowUnfree = true; 59 | allowUnsupportedSystem = true; 60 | permittedInsecurePackages = [ 61 | "openssl-1.1.1w" 62 | ]; 63 | }; 64 | }; 65 | _module.args.pkgsLinux = import inputs.nixpkgs { 66 | system = linuxSystem; 67 | overlays = import ./overlay.nix inputs; 68 | config = { 69 | allowUnfree = true; 70 | allowUnsupportedSystem = true; 71 | permittedInsecurePackages = [ 72 | "openssl-1.1.1w" 73 | ]; 74 | }; 75 | }; 76 | legacyPackages.nixpkgs = pkgs; 77 | devShells.default = pkgs.mkShell { 78 | packages = with pkgs;[ 79 | ruby_3_3 80 | bundler 81 | rust-analyzer-unwrapped 82 | rust-dev-toolchain 83 | ]; 84 | }; 85 | }; 86 | }; 87 | } 88 | -------------------------------------------------------------------------------- /lib/osv.rb: -------------------------------------------------------------------------------- 1 | require_relative "osv/version" 2 | 3 | begin 4 | require "osv/#{RUBY_VERSION.to_f}/osv" 5 | rescue LoadError 6 | require "osv/osv" 7 | end 8 | 9 | module OSV 10 | end 11 | -------------------------------------------------------------------------------- /lib/osv.rbi: -------------------------------------------------------------------------------- 1 | # typed: strict 2 | 3 | module OSV 4 | # Options: 5 | # - `has_headers`: Boolean indicating if the first row contains headers 6 | # (default: true) 7 | # - `col_sep`: String specifying the field separator 8 | # (default: ",") 9 | # - `quote_char`: String specifying the quote character 10 | # (default: "\"") 11 | # - `nil_string`: String that should be interpreted as nil 12 | # By default, empty strings are interpreted as empty strings. 13 | # If you want to interpret empty strings as nil, set this to 14 | # an empty string. 
15 | # - `buffer_size`: Integer specifying the read buffer size 16 | # - `result_type`: String specifying the output format 17 | # ("hash" or "array" or :hash or :array) 18 | # - `flexible`: Boolean specifying if the parser should be flexible 19 | # (default: false) 20 | # - `trim`: String specifying the trim mode 21 | # ("all" or "headers" or "fields" or :all or :headers or :fields) 22 | # (default: `nil`) 23 | # - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored 24 | # (default: false) 25 | # - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character 26 | sig do 27 | params( 28 | input: T.any(String, StringIO, IO), 29 | has_headers: T.nilable(T::Boolean), 30 | col_sep: T.nilable(String), 31 | quote_char: T.nilable(String), 32 | nil_string: T.nilable(String), 33 | buffer_size: T.nilable(Integer), 34 | result_type: T.nilable(T.any(String, Symbol)), 35 | flexible: T.nilable(T::Boolean), 36 | ignore_null_bytes: T.nilable(T::Boolean), 37 | trim: T.nilable(T.any(String, Symbol)), 38 | blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void) 39 | ).returns(T.any(Enumerator, T.untyped)) 40 | end 41 | def self.for_each( 42 | input, 43 | has_headers: true, 44 | col_sep: nil, 45 | quote_char: nil, 46 | nil_string: nil, 47 | buffer_size: nil, 48 | result_type: nil, 49 | flexible: nil, 50 | ignore_null_bytes: nil, 51 | trim: nil, 52 | lossy: nil, 53 | &blk 54 | ) 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/osv/version.rb: -------------------------------------------------------------------------------- 1 | module OSV 2 | VERSION = "0.5.3" 3 | end 4 | -------------------------------------------------------------------------------- /osv.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/osv/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "osv" 5 | spec.version = OSV::VERSION 6 | spec.authors = ["Nathan Jaremko"] 7 | spec.email = ["nathan@jaremko.ca"] 8 | 9 | spec.summary = "CSV parser for Ruby" 10 | spec.description = <<-EOF 11 | OSV is a high-performance CSV parser for Ruby, implemented in Rust. 12 | It wraps BurntSushi's csv-rs crate to provide fast CSV parsing with support for both hash-based and array-based row formats. 13 | Features include: Flexible input sources (file paths, gzipped files, IO objects, strings), 14 | configurable parsing options (headers, separators, quote chars), support for both hash and array output formats, 15 | whitespace trimming options, strict or flexible parsing modes, and is significantly faster than Ruby's standard CSV library. 
16 | EOF 17 | spec.homepage = "https://github.com/njaremko/osv" 18 | spec.license = "MIT" 19 | spec.required_ruby_version = ">= 3.1.0" 20 | 21 | spec.metadata["homepage_uri"] = spec.homepage 22 | spec.metadata["source_code_uri"] = "https://github.com/njaremko/osv" 23 | spec.metadata["readme_uri"] = "https://github.com/njaremko/osv/blob/main/README.md" 24 | spec.metadata["changelog_uri"] = "https://github.com/njaremko/osv/blob/main/CHANGELOG.md" 25 | spec.metadata["documentation_uri"] = "https://www.rubydoc.info/gems/osv" 26 | spec.metadata["funding_uri"] = "https://github.com/sponsors/njaremko" 27 | 28 | spec.files = 29 | Dir[ 30 | "{ext,lib}/**/*", 31 | "LICENSE", 32 | "README.md", 33 | "Cargo.*", 34 | "Gemfile", 35 | "Rakefile" 36 | ] 37 | spec.require_paths = ["lib"] 38 | 39 | spec.extensions = ["ext/osv/extconf.rb"] 40 | 41 | # needed until rubygems supports Rust support is out of beta 42 | spec.add_dependency "rb_sys", "~> 0.9.39" 43 | 44 | # only needed when developing or packaging your gem 45 | spec.add_development_dependency "rake-compiler", "~> 1.2.0" 46 | end 47 | -------------------------------------------------------------------------------- /overlay.nix: -------------------------------------------------------------------------------- 1 | inputs: 2 | [ 3 | (import inputs.rust-overlay) 4 | (final: prev: { 5 | bundler = prev.bundler.override { ruby = final.ruby_3_3; }; 6 | bundix = prev.bundix.overrideAttrs (oldAtts: { 7 | ruby = final.ruby_3_3; 8 | }); 9 | ruby_3_3 = ((final.mkRuby { 10 | version = final.mkRubyVersion "3" "3" "6" ""; 11 | hash = "sha256-jcSP/68nD4bxAZBT8o5R5NpMzjKjZ2CgYDqa7mfX/Y0="; 12 | cargoHash = "sha256-GeelTMRFIyvz1QS2L+Q3KAnyQy7jc0ejhx3TdEFVEbk="; 13 | }).override 14 | { 15 | jemallocSupport = true; 16 | }); 17 | craneLib = (inputs.crane.mkLib final).overrideToolchain final.rust-bin.stable.latest.default; 18 | rust-toolchain = prev.rust-bin.stable.latest.default; 19 | # This is an extended rust toolchain with `rust-src` since that's required for IDE stuff 20 | rust-dev-toolchain = prev.rust-bin.stable.latest.default.override { 21 | extensions = [ "rust-src" ]; 22 | }; 23 | }) 24 | ] 25 | -------------------------------------------------------------------------------- /test/big_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | class BigTest < Minitest::Test 8 | def test_parse_csv_with_many_rows 9 | # Generate test data with 2000 rows 10 | Tempfile.create(%w[test_many_rows .csv]) do |test_file| 11 | test_file.write "id,name,age\n" 12 | 2000.times { |i| test_file.write "#{i},Person#{i},#{20 + i % 50}\n" } 13 | test_file.close 14 | 15 | # Parse and verify 16 | actual = [] 17 | OSV.for_each(test_file.path) { |row| actual << row } 18 | 19 | assert_equal 2000, actual.size 20 | end 21 | end 22 | 23 | def test_parse_csv_with_many_rows_stringio 24 | # Generate test data with 2000 rows 25 | io = StringIO.new 26 | io.write "id,name,age\n" 27 | 2000.times { |i| io.write "#{i},Person#{i},#{20 + i % 50}\n" } 28 | io.rewind 29 | 30 | # Parse and verify 31 | actual = [] 32 | OSV.for_each(io) { |row| actual << row } 33 | 34 | assert_equal 2000, actual.size 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /test/concurrency_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | 
require "minitest/autorun" 5 | 6 | # Tests focused on concurrency and thread-safety 7 | class ConcurrencyTest < Minitest::Test 8 | def test_parse_csv_in_multiple_threads 9 | expected = [ 10 | { "id" => "1", "age" => "25", "name" => "John" }, 11 | { "name" => "Jane", "id" => "2", "age" => "30" }, 12 | { "name" => "Jim", "age" => "35", "id" => "3" } 13 | ] 14 | 15 | threads = 16 | 100.times.map do 17 | Thread.new do 18 | result = OSV.for_each("test/test.csv").to_a 19 | assert_equal expected, result 20 | end 21 | end 22 | 23 | threads.each(&:join) 24 | end 25 | 26 | def test_parse_csv_in_multiple_threads_block 27 | expected = [ 28 | { "id" => "1", "age" => "25", "name" => "John" }, 29 | { "name" => "Jane", "id" => "2", "age" => "30" }, 30 | { "name" => "Jim", "age" => "35", "id" => "3" } 31 | ] 32 | 33 | threads = 34 | 100.times.map do 35 | Thread.new do 36 | results = [] 37 | OSV.for_each("test/test.csv") { |row| results << row } 38 | assert_equal expected, results 39 | end 40 | end 41 | 42 | threads.each(&:join) 43 | end 44 | 45 | def test_interleaved_parsing_with_threads 46 | # Create two files to parse 47 | file1 = Tempfile.new(%w[thread1 .csv]) 48 | file2 = Tempfile.new(%w[thread2 .csv]) 49 | 50 | begin 51 | # Write different content to each file 52 | file1.write("id,name\n") 53 | file2.write("code,description\n") 54 | 55 | 100.times do |i| 56 | file1.write("#{i},name#{i}\n") 57 | file2.write("code#{i},desc#{i}\n") 58 | end 59 | 60 | file1.flush 61 | file2.flush 62 | 63 | # Parse both files in interleaved fashion with threads 64 | enum1 = OSV.for_each(file1.path) 65 | enum2 = OSV.for_each(file2.path) 66 | 67 | threads = [] 68 | results1 = Queue.new 69 | results2 = Queue.new 70 | 71 | # Thread 1 processes enum1 72 | threads << Thread.new do 73 | begin 74 | results1 << enum1.next while true 75 | rescue StopIteration 76 | # Expected when enumeration is complete 77 | end 78 | end 79 | 80 | # Thread 2 processes enum2 81 | threads << Thread.new do 82 | begin 83 | results2 << enum2.next while true 84 | rescue StopIteration 85 | # Expected when enumeration is complete 86 | end 87 | end 88 | 89 | # Wait for both threads to complete 90 | threads.each(&:join) 91 | 92 | # Verify results 93 | assert_equal 100, results1.size 94 | assert_equal 100, results2.size 95 | 96 | # Check first and last items from each queue 97 | first1 = results1.pop 98 | assert_equal "0", first1["id"] 99 | assert_equal "name0", first1["name"] 100 | 101 | first2 = results2.pop 102 | assert_equal "code0", first2["code"] 103 | assert_equal "desc0", first2["description"] 104 | ensure 105 | file1.close 106 | file1.unlink 107 | file2.close 108 | file2.unlink 109 | end 110 | end 111 | end -------------------------------------------------------------------------------- /test/core_functionality_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | # Core functionality tests for the OSV CSV parser 8 | class CoreFunctionalityTest < Minitest::Test 9 | def test_parse_csv_with_headers 10 | expected = [ 11 | { "id" => "1", "age" => "25", "name" => "John" }, 12 | { "name" => "Jane", "id" => "2", "age" => "30" }, 13 | { "name" => "Jim", "age" => "35", "id" => "3" } 14 | ] 15 | actual = [] 16 | OSV.for_each("test/test.csv") { |row| actual << row } 17 | assert_equal expected, actual 18 | end 19 | 20 | def test_parse_csv_with_tsv 21 | expected = [ 22 | { "id" => "1", "age" => "25", "name" => "John" 
}, 23 | { "name" => "Jane", "id" => "2", "age" => "30" }, 24 | { "name" => "Jim", "age" => "35", "id" => "3" } 25 | ] 26 | actual = [] 27 | OSV.for_each("test/test.tsv", col_sep: "\t") { |row| actual << row } 28 | assert_equal expected, actual 29 | end 30 | 31 | def test_parse_csv_without_headers 32 | expected = [ 33 | { "c0" => "id", "c1" => "name", "c2" => "age" }, 34 | { "c1" => "John", "c2" => "25", "c0" => "1" }, 35 | { "c1" => "Jane", "c2" => "30", "c0" => "2" }, 36 | { "c0" => "3", "c1" => "Jim", "c2" => "35" } 37 | ] 38 | actual = [] 39 | OSV.for_each("test/test.csv", has_headers: false) { |row| actual << row } 40 | assert_equal expected, actual 41 | end 42 | 43 | def test_parse_csv_with_io 44 | expected = [ 45 | { "id" => "1", "age" => "25", "name" => "John" }, 46 | { "name" => "Jane", "id" => "2", "age" => "30" }, 47 | { "name" => "Jim", "age" => "35", "id" => "3" } 48 | ] 49 | actual = [] 50 | File.open("test/test.csv") { |file| OSV.for_each(file) { |row| actual << row } } 51 | assert_equal expected, actual 52 | end 53 | 54 | def test_parse_csv_with_string_io 55 | expected = [ 56 | { "id" => "1", "age" => "25", "name" => "John" }, 57 | { "name" => "Jane", "id" => "2", "age" => "30" }, 58 | { "name" => "Jim", "age" => "35", "id" => "3" } 59 | ] 60 | actual = [] 61 | csv_data = File.read("test/test.csv") 62 | string_io = StringIO.new(csv_data) 63 | OSV.for_each(string_io) { |row| actual << row } 64 | assert_equal expected, actual 65 | end 66 | 67 | def test_enumerator_raises_stop_iteration 68 | enum = OSV.for_each("test/test.csv") 69 | 3.times { enum.next } # Consume all records 70 | assert_raises(StopIteration) { enum.next } 71 | end 72 | 73 | def test_for_each_without_block 74 | result = OSV.for_each("test/test.csv") 75 | assert_instance_of Enumerator, result 76 | expected = [ 77 | { "id" => "1", "age" => "25", "name" => "John" }, 78 | { "name" => "Jane", "id" => "2", "age" => "30" }, 79 | { "name" => "Jim", "age" => "35", "id" => "3" } 80 | ] 81 | assert_equal expected, result.to_a 82 | end 83 | 84 | def test_for_each_compat_without_block 85 | result = OSV.for_each("test/test.csv", result_type: "array") 86 | assert_instance_of Enumerator, result 87 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 88 | assert_equal expected, result.to_a 89 | end 90 | 91 | def test_parse_csv_compat_without_headers 92 | expected = [%w[id name age], %w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 93 | actual = [] 94 | OSV.for_each("test/test.csv", has_headers: false, result_type: "array") { |row| actual << row } 95 | assert_equal expected, actual 96 | end 97 | 98 | def test_parse_csv_compat_with_headers 99 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 100 | actual = [] 101 | OSV.for_each("test/test.csv", has_headers: true, result_type: "array") { |row| actual << row } 102 | assert_equal expected, actual 103 | end 104 | 105 | def test_parse_csv_compat_with_io_and_headers 106 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 107 | actual = [] 108 | File.open("test/test.csv") { |file| OSV.for_each(file, result_type: "array") { |row| actual << row } } 109 | assert_equal expected, actual 110 | end 111 | 112 | def test_parse_csv_compat_with_io_without_headers 113 | expected = [%w[id name age], %w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] 114 | actual = [] 115 | File.open("test/test.csv") do |file| 116 | OSV.for_each(file, has_headers: false, result_type: "array") { |row| actual << row } 117 | end 118 | assert_equal expected, actual 119 | end 120 | end 
-------------------------------------------------------------------------------- /test/encoding_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on encoding handling 7 | class EncodingTest < Minitest::Test 8 | def test_parse_csv_with_invalid_utf8 9 | invalid_utf8 = StringIO.new("id,name\n1,\xFF\xFF\n") 10 | assert_raises(EncodingError) do 11 | OSV.for_each(invalid_utf8) { |_row| } 12 | rescue => e 13 | assert e.message.include?("invalid utf-8") 14 | raise 15 | end 16 | end 17 | 18 | def test_parse_csv_with_invalid_utf8_file 19 | File.write("test/invalid_utf8.csv", "id,name\n1,\xFF\xFF\n") 20 | assert_raises(EncodingError) do 21 | OSV.for_each("test/invalid_utf8.csv") { |_row| } 22 | rescue => e 23 | assert e.message.include?("invalid utf-8") 24 | raise 25 | ensure 26 | begin 27 | File.delete("test/invalid_utf8.csv") 28 | rescue StandardError 29 | nil 30 | end 31 | end 32 | end 33 | 34 | def test_parse_csv_with_invalid_utf8_file_lossy 35 | File.write("test/invalid_utf8.csv", "id,name\n1,\xFF\xFF\n") 36 | actual = [] 37 | OSV.for_each("test/invalid_utf8.csv", lossy: true) { |row| actual << row } 38 | assert_equal [{ "id" => "1", "name" => "��" }], actual 39 | ensure 40 | begin 41 | File.delete("test/invalid_utf8.csv") 42 | rescue StandardError 43 | nil 44 | end 45 | end 46 | 47 | def test_parse_csv_with_invalid_utf8_headers_lossy 48 | File.write("test/invalid_utf8_headers.csv", "\xFF\xFF,name\n1,test\n") 49 | actual = [] 50 | OSV.for_each("test/invalid_utf8_headers.csv", lossy: true) { |row| actual << row } 51 | assert_equal [{ "��" => "1", "name" => "test" }], actual 52 | ensure 53 | begin 54 | File.delete("test/invalid_utf8_headers.csv") 55 | rescue StandardError 56 | nil 57 | end 58 | end 59 | 60 | def test_parse_csv_with_unicode 61 | csv_content = <<~CSV 62 | id,name,description 63 | 1,"José García","Señor developer 👨‍💻" 64 | 2,"Zoë Smith","⭐ Project lead" 65 | CSV 66 | 67 | expected = [ 68 | { "id" => "1", "name" => "José García", "description" => "Señor developer 👨‍💻" }, 69 | { "id" => "2", "name" => "Zoë Smith", "description" => "⭐ Project lead" } 70 | ] 71 | 72 | actual = [] 73 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 74 | assert_equal expected, actual 75 | end 76 | 77 | def test_parse_csv_with_bom 78 | csv_content = "\xEF\xBB\xBF" + <<~CSV 79 | id,name,age 80 | 1,John,25 81 | 2,Jane,30 82 | CSV 83 | 84 | expected = [{ "id" => "1", "name" => "John", "age" => "25" }, { "id" => "2", "name" => "Jane", "age" => "30" }] 85 | 86 | actual = [] 87 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 88 | assert_equal expected, actual 89 | end 90 | 91 | def test_parse_csv_with_null_bytes 92 | csv_content = <<~CSV 93 | id,na\0me,description 94 | 1,Jo\0hn,test 95 | 2,Jane,te\0st 96 | CSV 97 | 98 | expected = [ 99 | { "id" => "1", "name" => "John", "description" => "test" }, 100 | { "id" => "2", "name" => "Jane", "description" => "test" } 101 | ] 102 | 103 | actual = [] 104 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, ignore_null_bytes: true) { |row| actual << row } } 105 | assert_equal expected, actual 106 | 107 | actual = OSV.for_each(StringIO.new(csv_content), ignore_null_bytes: true).to_a 108 | assert_equal expected, actual 109 | 110 | # Without ignore_null_bytes, null bytes are preserved 111 | actual = [] 112 | StringIO.new(csv_content).tap { |io| 
OSV.for_each(io) { |row| actual << row } } 113 | assert_equal [ 114 | { "id" => "1", "na\0me" => "Jo\0hn", "description" => "test" }, 115 | { "id" => "2", "na\0me" => "Jane", "description" => "te\0st" } 116 | ], 117 | actual 118 | end 119 | end -------------------------------------------------------------------------------- /test/format_options_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on parsing options and formatting 7 | class FormatOptionsTest < Minitest::Test 8 | def test_parse_csv_with_headers_null 9 | expected = [ 10 | { "id" => "1", "age" => "25", "name" => "John" }, 11 | { "name" => nil, "id" => "2", "age" => "30" }, 12 | { "name" => "Jim", "age" => "35", "id" => "3" } 13 | ] 14 | actual = [] 15 | OSV.for_each("test/test.csv", nil_string: "Jane") { |row| actual << row } 16 | assert_equal expected, actual 17 | end 18 | 19 | def test_parse_csv_compat_with_headers_null 20 | expected = [%w[1 John 25], ["2", nil, "30"], %w[3 Jim 35]] 21 | actual = [] 22 | OSV.for_each("test/test.csv", has_headers: true, nil_string: "Jane", result_type: "array") { |row| actual << row } 23 | assert_equal expected, actual 24 | end 25 | 26 | def test_parse_csv_with_empty_field 27 | Tempfile.create(%w[test .csv]) do |tempfile| 28 | # Copy existing content and add a line with empty field 29 | content = File.read("test/test.csv") 30 | content += "4,,40\n" 31 | tempfile.write(content) 32 | tempfile.close 33 | 34 | expected = [ 35 | { "id" => "1", "age" => "25", "name" => "John" }, 36 | { "name" => "Jane", "id" => "2", "age" => "30" }, 37 | { "name" => "Jim", "age" => "35", "id" => "3" }, 38 | { "id" => "4", "name" => "", "age" => "40" } 39 | ] 40 | actual = [] 41 | OSV.for_each(tempfile.path) { |row| actual << row } 42 | assert_equal expected, actual 43 | end 44 | end 45 | 46 | def test_parse_csv_with_empty_field_as_nil_string 47 | Tempfile.create(%w[test .csv]) do |tempfile| 48 | # Copy existing content and add a line with empty field 49 | content = File.read("test/test.csv") 50 | content += "4,,40\n" 51 | tempfile.write(content) 52 | tempfile.close 53 | 54 | expected = [ 55 | { "id" => "1", "age" => "25", "name" => "John" }, 56 | { "name" => "Jane", "id" => "2", "age" => "30" }, 57 | { "name" => "Jim", "age" => "35", "id" => "3" }, 58 | { "id" => "4", "name" => nil, "age" => "40" } 59 | ] 60 | actual = [] 61 | OSV.for_each(tempfile.path, nil_string: "") { |row| actual << row } 62 | assert_equal expected, actual 63 | end 64 | end 65 | 66 | def test_parse_csv_with_missing_field_default_strict 67 | Tempfile.create(%w[test .csv]) do |tempfile| 68 | content = File.read("test/test.csv") 69 | content += "4,oops\n" 70 | tempfile.write(content) 71 | tempfile.close 72 | 73 | expected = [ 74 | { "id" => "1", "age" => "25", "name" => "John" }, 75 | { "name" => "Jane", "id" => "2", "age" => "30" }, 76 | { "name" => "Jim", "age" => "35", "id" => "3" } 77 | ] 78 | actual = [] 79 | 80 | assert_raises(RuntimeError) do 81 | OSV.for_each(tempfile.path) { |row| actual << row } 82 | rescue RuntimeError => e 83 | assert e.message.include?("found record with 2 fields, but the previous record has 3 fields") 84 | raise 85 | end 86 | 87 | assert_equal expected, actual 88 | end 89 | end 90 | 91 | def test_parse_csv_with_missing_field_flexible 92 | Tempfile.create(%w[test .csv]) do |tempfile| 93 | content = File.read("test/test.csv") 94 | content += "4,oops\n" 95 | 
tempfile.write(content) 96 | tempfile.close 97 | 98 | expected = [ 99 | { "id" => "1", "age" => "25", "name" => "John" }, 100 | { "name" => "Jane", "id" => "2", "age" => "30" }, 101 | { "name" => "Jim", "age" => "35", "id" => "3" }, 102 | { "id" => "4", "name" => "oops", "age" => nil } 103 | ] 104 | actual = [] 105 | OSV.for_each(tempfile.path, flexible: true) { |row| actual << row } 106 | assert_equal expected, actual 107 | end 108 | end 109 | 110 | def test_parse_csv_with_missing_field_flexible_without_headers 111 | Tempfile.create(%w[test .csv]) do |tempfile| 112 | content = File.read("test/test.csv") 113 | content += "4,oops\n" 114 | tempfile.write(content) 115 | tempfile.close 116 | 117 | expected = [ 118 | { "c2" => "age", "c0" => "id", "c1" => "name" }, 119 | { "c2" => "25", "c0" => "1", "c1" => "John" }, 120 | { "c1" => "Jane", "c2" => "30", "c0" => "2" }, 121 | { "c0" => "3", "c2" => "35", "c1" => "Jim" }, 122 | { "c1" => "oops", "c0" => "4", "c2" => nil } 123 | ] 124 | actual = [] 125 | OSV.for_each(tempfile.path, has_headers: false, flexible: true) { |row| actual << row } 126 | assert_equal expected, actual 127 | end 128 | end 129 | 130 | def test_parse_csv_with_missing_field_flexible_array 131 | Tempfile.create(%w[test .csv]) do |tempfile| 132 | content = File.read("test/test.csv") 133 | content += "4,oops\n" 134 | tempfile.write(content) 135 | tempfile.close 136 | 137 | expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35], %w[4 oops]] 138 | actual = [] 139 | OSV.for_each(tempfile.path, flexible: true, result_type: :array) { |row| actual << row } 140 | assert_equal expected, actual 141 | end 142 | end 143 | 144 | def test_for_each_trim_all 145 | csv_content = <<~CSV 146 | id , name , age 147 | 1 , John , 25 148 | 2 , Jane , 30 149 | 3 , Jim , 35 150 | CSV 151 | 152 | expected = [ 153 | { "id" => "1", "name" => "John", "age" => "25" }, 154 | { "id" => "2", "name" => "Jane", "age" => "30" }, 155 | { "id" => "3", "name" => "Jim", "age" => "35" } 156 | ] 157 | 158 | actual = [] 159 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, trim: "all") { |row| actual << row } } 160 | assert_equal expected, actual 161 | end 162 | 163 | def test_for_each_trim_headers 164 | csv_content = <<~CSV 165 | id , name , age 166 | 1, John, 25 167 | 2, Jane, 30 168 | 3, Jim, 35 169 | CSV 170 | 171 | expected = [ 172 | { "id" => "1", "name" => " John", "age" => " 25" }, 173 | { "id" => "2", "name" => " Jane", "age" => " 30" }, 174 | { "id" => "3", "name" => " Jim", "age" => " 35" } 175 | ] 176 | 177 | actual = [] 178 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, trim: :headers) { |row| actual << row } } 179 | assert_equal expected, actual 180 | end 181 | 182 | def test_for_each_trim_fields 183 | csv_content = <<~CSV 184 | id,name,age 185 | 1 , John , 25 186 | 2 , Jane , 30 187 | 3 , Jim , 35 188 | CSV 189 | 190 | expected = [ 191 | { "id" => "1", "name" => "John", "age" => "25" }, 192 | { "id" => "2", "name" => "Jane", "age" => "30" }, 193 | { "id" => "3", "name" => "Jim", "age" => "35" } 194 | ] 195 | 196 | actual = [] 197 | StringIO.new(csv_content).tap { |io| OSV.for_each(io, trim: "fields") { |row| actual << row } } 198 | assert_equal expected, actual 199 | end 200 | 201 | def test_parse_csv_with_quoted_commas 202 | csv_content = <<~CSV 203 | id,name,description 204 | 1,"Smith, John","Manager, Sales" 205 | 2,"Doe, Jane","Director, HR" 206 | CSV 207 | 208 | expected = [ 209 | { "id" => "1", "name" => "Smith, John", "description" => "Manager, Sales" }, 210 | { "id" => "2", "name" => 
"Doe, Jane", "description" => "Director, HR" } 211 | ] 212 | 213 | actual = [] 214 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 215 | assert_equal expected, actual 216 | end 217 | 218 | def test_parse_csv_with_escaped_quotes 219 | csv_content = <<~CSV 220 | id,name,quote 221 | 1,"John","He said ""Hello World""" 222 | 2,"Jane","She replied ""Hi there!""" 223 | CSV 224 | 225 | expected = [ 226 | { "id" => "1", "name" => "John", "quote" => 'He said "Hello World"' }, 227 | { "id" => "2", "name" => "Jane", "quote" => 'She replied "Hi there!"' } 228 | ] 229 | 230 | actual = [] 231 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 232 | assert_equal expected, actual 233 | end 234 | 235 | def test_parse_csv_with_newlines_in_quotes 236 | csv_content = <<~CSV 237 | id,name,address 238 | 1,"John Smith","123 Main St. 239 | Apt 4B 240 | New York, NY" 241 | 2,"Jane Doe","456 Park Ave. 242 | Suite 789" 243 | CSV 244 | 245 | expected = [ 246 | { "id" => "1", "name" => "John Smith", "address" => "123 Main St.\nApt 4B\nNew York, NY" }, 247 | { "id" => "2", "name" => "Jane Doe", "address" => "456 Park Ave.\nSuite 789" } 248 | ] 249 | 250 | actual = [] 251 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 252 | assert_equal expected, actual 253 | end 254 | 255 | def test_parse_csv_with_explicit_nil_kwargs 256 | csv_content = <<~CSV 257 | id,name,age 258 | 1,John,25 259 | 2,Jane,30 260 | CSV 261 | 262 | expected = [{ "id" => "1", "name" => "John", "age" => "25" }, { "id" => "2", "name" => "Jane", "age" => "30" }] 263 | 264 | actual = [] 265 | StringIO 266 | .new(csv_content) 267 | .tap do |io| 268 | OSV.for_each( 269 | io, 270 | has_headers: nil, 271 | col_sep: nil, 272 | quote_char: nil, 273 | nil_string: nil, 274 | result_type: nil, 275 | flexible: nil, 276 | trim: nil 277 | ) { |row| actual << row } 278 | end 279 | assert_equal expected, actual 280 | end 281 | end -------------------------------------------------------------------------------- /test/gc_stress_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | require "stringio" 6 | require "tempfile" 7 | 8 | class GCStressTest < Minitest::Test 9 | def setup 10 | # Create a CSV string to test with 11 | csv = String.new("id,header1,header2\n") 12 | 100.times do |i| 13 | csv << "#{i},value_#{i}_1,value_#{i}_2\n" 14 | end 15 | @csv_string = csv 16 | 17 | # Set GC to maximum stress level 18 | GC.stress = true 19 | end 20 | 21 | def teardown 22 | # Reset GC settings 23 | GC.stress = false 24 | end 25 | 26 | def test_parse_with_gc_stress 27 | # Parse the CSV with GC stress enabled 28 | results = [] 29 | 30 | # Use a StringIO to avoid filesystem operations 31 | io = StringIO.new(@csv_string) 32 | 33 | # Parse with OSV 34 | enum = OSV.for_each(io) 35 | 36 | # Read all rows with aggressive GC between each 37 | count = 0 38 | begin 39 | while count < 100 40 | row = enum.next 41 | results << row 42 | count += 1 43 | 44 | # Force garbage collection 45 | GC.start(full_mark: true, immediate_sweep: true) 46 | end 47 | rescue StopIteration 48 | # Expected at end of file 49 | end 50 | 51 | # Verify we read everything 52 | assert_equal 100, results.size 53 | 54 | # Verify some random values 55 | assert_equal "0", results[0]["id"] 56 | assert_equal "value_0_1", results[0]["header1"] 57 | assert_equal "50", results[50]["id"] 58 | assert_equal 
"value_50_1", results[50]["header1"] 59 | assert_equal "99", results[99]["id"] 60 | assert_equal "value_99_2", results[99]["header2"] 61 | end 62 | 63 | def test_file_handle_gc_safety 64 | # Test with file handles that might be garbage collected 65 | file = Tempfile.new(['gc_stress', '.csv']) 66 | begin 67 | # Write CSV data 68 | file.write(@csv_string) 69 | file.flush 70 | 71 | # Create parser from the file 72 | path = file.path 73 | enum = OSV.for_each(path) 74 | 75 | # Read some rows with GC pressure 76 | 10.times do 77 | row = enum.next 78 | assert_equal row["header1"], "value_#{row["id"]}_1" 79 | 80 | # Force GC 81 | GC.start(full_mark: true, immediate_sweep: true) 82 | end 83 | ensure 84 | file.close 85 | file.unlink 86 | end 87 | end 88 | end -------------------------------------------------------------------------------- /test/io_handling_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | # Tests focused on IO handling capabilities 8 | class IoHandlingTest < Minitest::Test 9 | def test_parse_csv_with_gzip 10 | expected = [ 11 | { "id" => "1", "age" => "25", "name" => "John" }, 12 | { "name" => "Jane", "id" => "2", "age" => "30" }, 13 | { "name" => "Jim", "age" => "35", "id" => "3" } 14 | ] 15 | actual = [] 16 | File.open("test/test.csv.gz", "wb") do |gz_file| 17 | gz = Zlib::GzipWriter.new(gz_file) 18 | gz.write(File.read("test/test.csv")) 19 | gz.close 20 | end 21 | OSV.for_each("test/test.csv.gz") { |row| actual << row } 22 | assert_equal expected, actual 23 | ensure 24 | FileUtils.rm_f("test/test.csv.gz") 25 | end 26 | 27 | def test_parse_csv_with_gzip_io 28 | expected = [ 29 | { "id" => "1", "age" => "25", "name" => "John" }, 30 | { "name" => "Jane", "id" => "2", "age" => "30" }, 31 | { "name" => "Jim", "age" => "35", "id" => "3" } 32 | ] 33 | actual = [] 34 | File.open("test/test2.csv.gz", "wb") do |gz_file| 35 | gz = Zlib::GzipWriter.new(gz_file) 36 | gz.write(File.read("test/test.csv")) 37 | gz.close 38 | end 39 | Zlib::GzipReader.open("test/test2.csv.gz") { |gz| OSV.for_each(gz) { |row| actual << row } } 40 | assert_equal expected, actual 41 | ensure 42 | FileUtils.rm_f("test/test2.csv.gz") 43 | end 44 | 45 | def test_parse_with_gzip_corrupted 46 | # Create a corrupted gzip file 47 | File.open("test/corrupted.csv.gz", "wb") do |file| 48 | file.write("This is not a valid gzip file but has .gz extension") 49 | end 50 | 51 | assert_raises(RuntimeError) { OSV.for_each("test/corrupted.csv.gz") { |row| } } 52 | ensure 53 | FileUtils.rm_f("test/corrupted.csv.gz") 54 | end 55 | 56 | def test_parse_input_modified_during_iteration 57 | temp_file = Tempfile.new(%w[dynamic .csv]) 58 | begin 59 | temp_file.write("id,name\n1,John\n2,Jane\n") 60 | temp_file.flush 61 | 62 | enum = OSV.for_each(temp_file.path) 63 | # Get first row 64 | enum.next 65 | 66 | # Modify file between iterations 67 | File.open(temp_file.path, "a") { |f| f.write("3,Modified\n") } 68 | 69 | # Continue iteration 70 | second = enum.next 71 | assert_equal({ "id" => "2", "name" => "Jane" }, second) 72 | 73 | # This might read the newly appended line or might not depending on buffering 74 | # Either way, it shouldn't crash 75 | begin 76 | third = enum.next 77 | assert_equal({ "id" => "3", "name" => "Modified" }, third) 78 | rescue StopIteration 79 | # This is also acceptable 80 | end 81 | ensure 82 | temp_file.close 83 | temp_file.unlink 84 | end 85 | end 86 | 87 | def 
test_parse_with_extremely_large_row 88 | Tempfile.create(%w[large .csv]) do |tempfile| 89 | tempfile.write("id,name,description\n") 90 | tempfile.write("1,test,#{"x" * 10_000_000}\n") # 10MB row 91 | tempfile.flush 92 | 93 | result = nil 94 | # This shouldn't crash, though it might use a lot of memory 95 | OSV.for_each(tempfile.path) do |row| 96 | result = row 97 | break # Only read the first row 98 | end 99 | 100 | assert_equal "1", result["id"] 101 | assert_equal "test", result["name"] 102 | assert_equal 10_000_000, result["description"].length 103 | end 104 | end 105 | 106 | def test_parse_csv_with_mixed_line_endings 107 | csv_content = "id,name,age\r\n1,John,25\n2,Jane,30\r\n3,Jim,35" 108 | 109 | expected = [ 110 | { "id" => "1", "name" => "John", "age" => "25" }, 111 | { "id" => "2", "name" => "Jane", "age" => "30" }, 112 | { "id" => "3", "name" => "Jim", "age" => "35" } 113 | ] 114 | 115 | actual = [] 116 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 117 | assert_equal expected, actual 118 | end 119 | 120 | def test_parse_csv_with_empty_lines 121 | csv_content = <<~CSV 122 | id,name,age 123 | 124 | 1,John,25 125 | 126 | 2,Jane,30 127 | 128 | 3,Jim,35 129 | 130 | CSV 131 | 132 | expected = [ 133 | { "id" => "1", "name" => "John", "age" => "25" }, 134 | { "id" => "2", "name" => "Jane", "age" => "30" }, 135 | { "id" => "3", "name" => "Jim", "age" => "35" } 136 | ] 137 | 138 | actual = [] 139 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 140 | assert_equal expected, actual 141 | end 142 | 143 | def test_parse_empty_file 144 | Tempfile.create(%w[empty .csv]) do |tempfile| 145 | # Empty file 146 | tempfile.flush 147 | 148 | count = 0 149 | OSV.for_each(tempfile.path) { |row| count += 1 } 150 | 151 | assert_equal 0, count 152 | 153 | # Also test with headers but no data 154 | tempfile.write("id,name\n") 155 | tempfile.flush 156 | 157 | count = 0 158 | OSV.for_each(tempfile.path) { |row| count += 1 } 159 | 160 | assert_equal 0, count 161 | end 162 | end 163 | 164 | def test_parse_csv_with_whitespace_and_quotes 165 | csv_content = <<~CSV 166 | id,name,description 167 | 1, John , unquoted spaces 168 | 2," Jane ", " quoted spaces " 169 | 3,"Jim"," mixed " 170 | CSV 171 | 172 | expected = [ 173 | { "id" => "1", "description" => " unquoted spaces", "name" => " John " }, 174 | { "id" => "2", "description" => " \" quoted spaces \"", "name" => " Jane " }, 175 | { "id" => "3", "description" => " mixed ", "name" => "Jim" } 176 | ] 177 | actual = [] 178 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 179 | assert_equal expected, actual 180 | end 181 | 182 | def test_parse_csv_with_empty_quoted_vs_unquoted 183 | csv_content = <<~CSV 184 | id,quoted,unquoted 185 | 1,"", 186 | 2,," " 187 | 3,, 188 | 4," ", 189 | CSV 190 | 191 | expected = [ 192 | { "id" => "1", "quoted" => "", "unquoted" => "" }, 193 | { "id" => "2", "quoted" => "", "unquoted" => " " }, 194 | { "id" => "3", "quoted" => "", "unquoted" => "" }, 195 | { "id" => "4", "quoted" => " ", "unquoted" => "" } 196 | ] 197 | 198 | actual = [] 199 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 200 | assert_equal expected, actual 201 | end 202 | 203 | def test_parse_csv_with_duplicate_headers 204 | csv_content = <<~CSV 205 | id,name,id,name 206 | 1,John,A,Johnny 207 | 2,Jane,B,Janet 208 | CSV 209 | 210 | expected = [%w[1 John A Johnny], %w[2 Jane B Janet]] 211 | 212 | actual = [] 213 | StringIO.new(csv_content).tap 
{ |io| OSV.for_each(io, result_type: :array) { |row| actual << row } } 214 | assert_equal expected, actual 215 | end 216 | end -------------------------------------------------------------------------------- /test/memory_safety_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "zlib" 5 | require "minitest/autorun" 6 | 7 | class MemorySafetyTest < Minitest::Test 8 | # Test to target potential issues with Ruby string slices in RubyReader 9 | # Focuses on the RubyReader::String variant that uses as_slice() 10 | def test_string_slice_gc_safety 11 | begin 12 | # Create a large string with CSV content 13 | csv_string = "id,name,description\n" 14 | 1000.times do |i| 15 | csv_string += "#{i},name#{i},desc#{i}\n" 16 | end 17 | 18 | # Create a StringIO with the string 19 | string_io = StringIO.new(csv_string) 20 | 21 | # Start parsing 22 | enum = OSV.for_each(string_io) 23 | 24 | # Read a few rows 25 | rows = [] 26 | 5.times { rows << enum.next } 27 | 28 | # Clear the original string reference and force GC 29 | csv_string = nil 30 | GC.start(full_mark: true, immediate_sweep: true) 31 | 32 | # Create memory pressure by allocating large objects 33 | large_objects = [] 34 | 10.times { large_objects << "x" * (1024 * 1024) } 35 | 36 | # Continue reading - this would segfault if RubyReader keeps unsafe references 37 | # to the original string after GC 38 | begin 39 | 20.times { rows << enum.next } 40 | rescue StopIteration 41 | # Expected at end of file 42 | end 43 | 44 | # Verify we read the expected data 45 | assert rows.size > 5 46 | assert_equal "5", rows[5]["id"] if rows.size > 5 47 | end 48 | end 49 | 50 | # Test for potential issues with IO object and its premature garbage collection 51 | def test_io_object_gc_safety 52 | begin 53 | # Create a custom IO-like object that we can control 54 | custom_io = Class.new do 55 | def initialize(data) 56 | @data = data 57 | @position = 0 58 | end 59 | 60 | def read(bytes) 61 | return nil if @position >= @data.length 62 | chunk = @data[@position, [bytes, 100].min] # Read in small chunks 63 | @position += chunk.length 64 | chunk 65 | end 66 | end 67 | 68 | # Create CSV data 69 | csv_data = "id,name,value\n" 70 | 100.times { |i| csv_data += "#{i},name#{i},value#{i}\n" } 71 | 72 | # Create our custom IO object 73 | io_obj = custom_io.new(csv_data) 74 | 75 | # Create an enumerator using the custom IO 76 | enum = OSV.for_each(io_obj) 77 | 78 | # Read a few rows 79 | rows = [] 80 | 5.times { rows << enum.next } 81 | 82 | # Release references to the IO object and force GC 83 | io_obj = nil 84 | GC.start(full_mark: true, immediate_sweep: true) 85 | 86 | # Allocate objects to increase memory pressure 87 | 10.times { "x" * (1024 * 1024) } 88 | 89 | # Try to continue reading after GC (should either work correctly or raise 90 | # a Ruby exception, but shouldn't segfault) 91 | begin 92 | 10.times { rows << enum.next } 93 | rescue => e 94 | # If Rust has unsafe references to Ruby objects that were GC'd, 95 | # this might segfault instead of a proper Ruby exception 96 | assert_match(/io|read|closed/i, e.message) 97 | end 98 | 99 | # Success if we got this far without segfault 100 | assert true 101 | end 102 | end 103 | 104 | # Test to exercise thread safety issues with unchecked Ruby VM access 105 | def test_thread_safety_ruby_vm_access 106 | # Create a CSV file to read 107 | file = Tempfile.new(['thread_safety', '.csv']) 108 | begin 109 | # Create a large CSV file 110 
| file.write("id,name,description,value,extra\n") 111 | 500.times { |i| file.write("#{i},name#{i},desc#{i},value#{i},extra#{i}\n") } 112 | file.flush 113 | 114 | # Create shared data structures 115 | results = Queue.new 116 | error_count = 0 117 | mutex = Mutex.new 118 | 119 | # Create multiple threads that will read from the same file concurrently 120 | # This can expose thread safety issues with Ruby VM access 121 | threads = 8.times.map do |thread_id| 122 | Thread.new do 123 | begin 124 | # Each thread creates its own enumerator 125 | enum = OSV.for_each(file.path) 126 | 127 | # Skip to a different starting point 128 | skip_count = thread_id * 10 129 | skip_count.times { enum.next rescue nil } 130 | 131 | # Read rows with aggressive GC in between 132 | 10.times do |i| 133 | begin 134 | row = enum.next 135 | results << [thread_id, row["id"]] 136 | 137 | # Force GC frequently 138 | GC.start if i % 2 == 0 139 | 140 | # Create temporary objects to increase memory pressure 141 | temp = "x" * (1024 * (thread_id + 1)) 142 | temp = nil 143 | rescue StopIteration 144 | break 145 | rescue => e 146 | mutex.synchronize { error_count += 1 } 147 | break 148 | end 149 | end 150 | rescue => e 151 | mutex.synchronize { error_count += 1 } 152 | end 153 | end 154 | end 155 | 156 | # Wait for all threads to complete 157 | threads.each(&:join) 158 | 159 | # Check that we got results without segfaults 160 | assert results.size > 0 161 | assert_equal 0, error_count, "Expected no errors during concurrent parsing" 162 | ensure 163 | file.close 164 | file.unlink 165 | end 166 | end 167 | 168 | # Test buffer boundary handling which can cause issues with memory safety 169 | def test_buffer_boundary_handling 170 | # Create a file with content designed to test buffer boundaries 171 | file = Tempfile.new(['buffer_boundary', '.csv']) 172 | begin 173 | # The READ_BUFFER_SIZE in the implementation is 16384 bytes 174 | buffer_size = 16384 175 | 176 | # Write CSV header 177 | file.write("id,name,description\n") 178 | 179 | # Row 1: Create a field that ends exactly at buffer boundary 180 | content_size = buffer_size - "1,name1,".length - 1 # -1 for newline 181 | file.write("1,name1,#{"x" * content_size}\n") 182 | 183 | # Row 2: Field that causes buffer boundary to occur right before a delimiter 184 | content_size = buffer_size - "2,".length - 1 # -1 for newline 185 | file.write("2,#{"y" * content_size},desc2\n") 186 | 187 | # Row 3: Field with quoted content that spans across buffer boundary 188 | content_size = buffer_size - 10 189 | file.write("3,\"#{"z" * content_size}\",desc3\n") 190 | 191 | # Row 4: Multiple quoted fields with escaped quotes near buffer boundary 192 | file.write("4,\"#{"a" * (buffer_size/2 - 10)}\"\"#{"b" * 10}\",\"multi\"\"quote\"\n") 193 | 194 | # Flush to ensure content is written 195 | file.flush 196 | 197 | # Try parsing with different options 198 | [ 199 | {}, 200 | { result_type: "array" }, 201 | { flexible: true }, 202 | { lossy: true } 203 | ].each do |opts| 204 | begin 205 | # Parse with each option set 206 | enum = OSV.for_each(file.path, **opts) 207 | 208 | # Read rows while doing aggressive GC 209 | rows = [] 210 | begin 211 | loop do 212 | rows << enum.next 213 | GC.start if rows.size % 2 == 0 214 | end 215 | rescue StopIteration 216 | # Expected at end of file 217 | end 218 | 219 | # Verify we read all rows 220 | assert_equal 4, rows.size, "Should have read 4 rows with options: #{opts}" 221 | rescue => e 222 | raise 223 | end 224 | end 225 | ensure 226 | file.close 227 | file.unlink 228 
| end 229 | end 230 | 231 | # Test with different Ruby string encodings to find encoding-related memory issues 232 | def test_string_encoding_safety 233 | begin 234 | # Create strings with different encodings 235 | utf8_string = "id,name,description\n1,John,Regular\n2,José,Café\n3,你好,世界\n" 236 | ascii_string = utf8_string.encode("ASCII-8BIT", invalid: :replace, undef: :replace) 237 | utf16_string = utf8_string.encode("UTF-16LE") 238 | 239 | # Test with each encoding 240 | [utf8_string, ascii_string, utf16_string].each do |str| 241 | string_io = StringIO.new(str) 242 | 243 | begin 244 | # Parse the string 245 | enum = OSV.for_each(string_io) 246 | 247 | # Read while forcing GC 248 | rows = [] 249 | begin 250 | while rows.size < 10 251 | row = enum.next 252 | rows << row 253 | GC.start if rows.size % 2 == 0 254 | end 255 | rescue StopIteration 256 | # Expected at end 257 | rescue => e 258 | # Some encodings may cause valid errors 259 | if str == utf16_string 260 | assert_match(/invalid|encoding/i, e.message) 261 | else 262 | raise 263 | end 264 | end 265 | rescue => e 266 | # Only UTF-16 is expected to have encoding issues 267 | if str == utf16_string 268 | assert_match(/invalid|encoding/i, e.message) 269 | else 270 | raise 271 | end 272 | end 273 | end 274 | end 275 | end 276 | end -------------------------------------------------------------------------------- /test/performance_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on performance aspects 7 | class PerformanceTest < Minitest::Test 8 | def test_parse_csv_with_many_rows 9 | # Generate test data with 2000 rows 10 | Tempfile.create(%w[test_many_rows .csv]) do |test_file| 11 | test_file.write "id,name,age\n" 12 | 2000.times { |i| test_file.write "#{i},Person#{i},#{20 + i % 50}\n" } 13 | test_file.close 14 | 15 | # Parse and verify 16 | actual = [] 17 | OSV.for_each(test_file.path) { |row| actual << row } 18 | 19 | assert_equal 2000, actual.size 20 | end 21 | end 22 | 23 | def test_parse_csv_with_many_rows_stringio 24 | # Generate test data with 2000 rows 25 | io = StringIO.new 26 | io.write "id,name,age\n" 27 | 2000.times { |i| io.write "#{i},Person#{i},#{20 + i % 50}\n" } 28 | io.rewind 29 | 30 | # Parse and verify 31 | actual = [] 32 | OSV.for_each(io) { |row| actual << row } 33 | 34 | assert_equal 2000, actual.size 35 | end 36 | end -------------------------------------------------------------------------------- /test/stress_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "osv" 4 | require "minitest/autorun" 5 | 6 | # Tests focused on stress-testing and edge cases 7 | class StressTest < Minitest::Test 8 | def test_segfault_stress_csv_parser_with_many_instances 9 | # This test creates many parser instances simultaneously, 10 | # which can stress the memory management and potentially trigger segfaults 11 | 12 | files = [] 13 | enumerators = [] 14 | 15 | begin 16 | # Create several moderate-sized CSV files 17 | 5.times do |file_idx| 18 | file = Tempfile.new(["stress_#{file_idx}", '.csv']) 19 | files << file 20 | 21 | file.write("id,name,value\n") 22 | 500.times { |i| file.write("#{i},name#{i},value#{i}\n") } 23 | file.flush 24 | end 25 | 26 | # Create many parser instances for each file 27 | files.each do |file| 28 | 10.times do 29 | enumerators << OSV.for_each(file.path) 30 | end 31 | end 32 | 
33 | # Force memory pressure with large temporary objects 34 | temp_strings = [] 35 | 10.times { temp_strings << "x" * (1024 * 1024) } 36 | GC.start 37 | 38 | # Read partially from random enumerators 39 | 100.times do 40 | enum = enumerators.sample 41 | begin 42 | # Read a random number of records, but not too many 43 | rand(1..5).times { enum.next } 44 | rescue StopIteration 45 | # Expected for some enumerators 46 | end 47 | end 48 | 49 | # Force more GC pressure 50 | temp_strings = nil 51 | GC.start(full_mark: true, immediate_sweep: true) 52 | 53 | # Success if we get here without a segfault 54 | assert true 55 | ensure 56 | # Clean up 57 | files.each do |file| 58 | begin 59 | file.close 60 | file.unlink 61 | rescue 62 | # Ignore cleanup errors 63 | end 64 | end 65 | end 66 | end 67 | 68 | def test_parse_csv_with_long_line 69 | long_text = "x" * 1_000_000 70 | csv_content = <<~CSV 71 | id,name,description 72 | 1,John,#{long_text} 73 | 2,Jane,Short description 74 | CSV 75 | 76 | expected = [ 77 | { "id" => "1", "name" => "John", "description" => long_text }, 78 | { "id" => "2", "name" => "Jane", "description" => "Short description" } 79 | ] 80 | 81 | actual = [] 82 | StringIO.new(csv_content).tap { |io| OSV.for_each(io) { |row| actual << row } } 83 | assert_equal expected, actual 84 | end 85 | 86 | def test_parse_with_garbage_collection_stress 87 | # Create a medium-sized file 88 | Tempfile.create(%w[gc_stress .csv]) do |tempfile| 89 | # Write a decent amount of data 90 | tempfile.write("id,name,value\n") 91 | 1000.times { |i| tempfile.write("#{i},name#{i},value#{i}\n") } 92 | tempfile.flush 93 | 94 | # Enable GC stress mode during parsing 95 | GC.stress = true 96 | begin 97 | count = 0 98 | OSV.for_each(tempfile.path) do |row| 99 | count += 1 100 | # Force some allocations 101 | row.transform_values(&:dup) 102 | # Occasionally force GC 103 | GC.start if count % 100 == 0 104 | end 105 | assert_equal 1000, count 106 | ensure 107 | GC.stress = false 108 | end 109 | end 110 | end 111 | 112 | def test_parse_csv_with_large_data 113 | skip "Skipping large data test in normal test runs" unless ENV["RUN_LARGE_TESTS"] 114 | 115 | # Only run during specific stress test sessions 116 | # Create a large file 117 | Tempfile.create(%w[large_data .csv]) do |tempfile| 118 | tempfile.write("id,name,value\n") 119 | 120 | # Write about 1GB of data 121 | 100_000.times do |i| 122 | # ~10KB per line × 100K = ~1GB 123 | value = "value_#{i}_" + ("x" * 10_000) 124 | tempfile.write("#{i},name#{i},#{value}\n") 125 | end 126 | tempfile.flush 127 | 128 | # Parse the file 129 | count = 0 130 | OSV.for_each(tempfile.path) do |row| 131 | count += 1 132 | # Verify some values to ensure proper parsing 133 | assert_equal count - 1, row["id"].to_i 134 | assert_equal "name#{count - 1}", row["name"] 135 | assert row["value"].start_with?("value_#{count - 1}_") 136 | 137 | # Only read a portion to keep test runtime reasonable 138 | break if count >= 10_000 139 | end 140 | 141 | assert count > 0, "Should have read some rows" 142 | end 143 | end 144 | end -------------------------------------------------------------------------------- /test/test.csv: -------------------------------------------------------------------------------- 1 | id,name,age 2 | 1,John,25 3 | 2,Jane,30 4 | 3,Jim,35 5 | -------------------------------------------------------------------------------- /test/test.tsv: -------------------------------------------------------------------------------- 1 | id name age 2 | 1 John 25 3 | 2 Jane 30 4 | 3 Jim 35 5 | 
--------------------------------------------------------------------------------