├── .devcontainer └── devcontainer.json ├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── .rspec ├── .rubocop.yml ├── .ruby-version ├── Dockerfile ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── docker-compose.yml ├── json-streamer.gemspec ├── lib └── json │ ├── streamer.rb │ └── streamer │ ├── aggregator.rb │ ├── callbacks.rb │ ├── conditions.rb │ ├── json_streamer.rb │ ├── parser.rb │ └── version.rb └── spec ├── json ├── streamer │ ├── conditions_spec.rb │ └── json_streamer_spec.rb ├── streamer_memory_spec.rb └── streamer_spec.rb └── spec_helper.rb /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "json-streamer", 3 | "dockerComposeFile": "../docker-compose.yml", 4 | "service": "ruby", 5 | "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}", 6 | 7 | "features": { 8 | "ghcr.io/devcontainers/features/github-cli:1": {} 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: thisismydesign 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | timeout-minutes: 5 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - uses: ruby/setup-ruby@v1 16 | with: 17 | ruby-version: 3.4.3 18 | bundler-cache: true 19 | 20 | - name: Test 21 | run: bundle exec rspec 22 | 23 | - name: Memory test 24 | run: bundle exec rspec --tag speed:slow 25 | 26 | lint: 27 | runs-on: ubuntu-latest 28 | timeout-minutes: 5 29 | 30 | steps: 31 | - uses: actions/checkout@v4 32 | 33 | - uses: ruby/setup-ruby@v1 34 | with: 
35 | ruby-version: 3.4.3 36 | bundler-cache: true 37 | 38 | - name: Lint 39 | run: bundle exec rubocop 40 | 41 | build: 42 | runs-on: ubuntu-latest 43 | timeout-minutes: 5 44 | 45 | steps: 46 | - uses: actions/checkout@v4 47 | 48 | - uses: ruby/setup-ruby@v1 49 | with: 50 | ruby-version: 3.4.3 51 | bundler-cache: true 52 | 53 | - name: Build 54 | run: bundle exec rake build 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | 10 | # rspec failure tracking 11 | .rspec_status 12 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --require spec_helper 4 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | plugins: 2 | - rubocop-rake 3 | - rubocop-rspec 4 | 5 | AllCops: 6 | TargetRubyVersion: 3.1 7 | NewCops: enable 8 | 9 | # Named keyword arguments are used in many cases and names have to match 10 | Lint/UnusedBlockArgument: 11 | Enabled: false 12 | 13 | # Do not enforce documentation 14 | Style/Documentation: 15 | Enabled: false 16 | 17 | # Disabled due to complex use cases 18 | RSpec/NestedGroups: 19 | Enabled: false 20 | RSpec/MultipleMemoizedHelpers: 21 | Enabled: false 22 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | ruby-3.4.3 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | 
# syntax=docker/dockerfile:1 2 | # check=error=true 3 | 4 | ARG RUBY_VERSION=3.4.3 5 | FROM docker.io/library/ruby:$RUBY_VERSION-slim 6 | 7 | # Gem sources are bind-mounted here (see docker-compose.yml) 8 | WORKDIR /workspaces/json-streamer 9 | 10 | RUN apt-get update -qq && \ 11 | apt-get install --no-install-recommends -y \ 12 | # Install packages needed to build gems 13 | build-essential git libyaml-dev pkg-config \ 14 | # Install gem dependencies 15 | libyajl-dev && \ 16 | rm -rf /var/lib/apt/lists /var/cache/apt/archives 17 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org' 4 | 5 | # Specify your gem's dependencies in json-streamer.gemspec 6 | gemspec 7 | 8 | # Development 9 | gem 'get_process_mem' 10 | gem 'irb' 11 | gem 'ndhash' 12 | gem 'rake', '~> 13.0' 13 | gem 'rspec', '~> 3.0' 14 | gem 'rubocop', '~> 1.75', require: false 15 | gem 'rubocop-rake', require: false 16 | gem 'rubocop-rspec', require: false 17 | gem 'simplecov', require: false 18 | gem 'yajl-ffi' 19 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: .
3 | specs: 4 | json-streamer (2.1.0) 5 | json-stream 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | ast (2.4.3) 11 | bigdecimal (3.1.9) 12 | date (3.4.1) 13 | diff-lcs (1.6.2) 14 | docile (1.4.1) 15 | ffi (1.17.2) 16 | ffi (1.17.2-aarch64-linux-gnu) 17 | ffi (1.17.2-aarch64-linux-musl) 18 | ffi (1.17.2-arm-linux-gnu) 19 | ffi (1.17.2-arm-linux-musl) 20 | ffi (1.17.2-arm64-darwin) 21 | ffi (1.17.2-x86-linux-gnu) 22 | ffi (1.17.2-x86-linux-musl) 23 | ffi (1.17.2-x86_64-darwin) 24 | ffi (1.17.2-x86_64-linux-gnu) 25 | ffi (1.17.2-x86_64-linux-musl) 26 | get_process_mem (1.0.0) 27 | bigdecimal (>= 2.0) 28 | ffi (~> 1.0) 29 | io-console (0.8.0) 30 | irb (1.15.2) 31 | pp (>= 0.6.0) 32 | rdoc (>= 4.0.0) 33 | reline (>= 0.4.2) 34 | json (2.12.0) 35 | json-stream (1.0.0) 36 | language_server-protocol (3.17.0.5) 37 | lint_roller (1.1.0) 38 | ndhash (0.4.0) 39 | parallel (1.27.0) 40 | parser (3.3.8.0) 41 | ast (~> 2.4.1) 42 | racc 43 | pp (0.6.2) 44 | prettyprint 45 | prettyprint (0.2.0) 46 | prism (1.4.0) 47 | psych (5.2.6) 48 | date 49 | stringio 50 | racc (1.8.1) 51 | rainbow (3.1.1) 52 | rake (13.2.1) 53 | rdoc (6.13.1) 54 | psych (>= 4.0.0) 55 | regexp_parser (2.10.0) 56 | reline (0.6.1) 57 | io-console (~> 0.5) 58 | rspec (3.13.0) 59 | rspec-core (~> 3.13.0) 60 | rspec-expectations (~> 3.13.0) 61 | rspec-mocks (~> 3.13.0) 62 | rspec-core (3.13.3) 63 | rspec-support (~> 3.13.0) 64 | rspec-expectations (3.13.4) 65 | diff-lcs (>= 1.2.0, < 2.0) 66 | rspec-support (~> 3.13.0) 67 | rspec-mocks (3.13.4) 68 | diff-lcs (>= 1.2.0, < 2.0) 69 | rspec-support (~> 3.13.0) 70 | rspec-support (3.13.3) 71 | rubocop (1.75.6) 72 | json (~> 2.3) 73 | language_server-protocol (~> 3.17.0.2) 74 | lint_roller (~> 1.1.0) 75 | parallel (~> 1.10) 76 | parser (>= 3.3.0.2) 77 | rainbow (>= 2.2.2, < 4.0) 78 | regexp_parser (>= 2.9.3, < 3.0) 79 | rubocop-ast (>= 1.44.0, < 2.0) 80 | ruby-progressbar (~> 1.7) 81 | unicode-display_width (>= 2.4.0, < 4.0) 82 | rubocop-ast (1.44.1) 83 | 
parser (>= 3.3.7.2) 84 | prism (~> 1.4) 85 | rubocop-rake (0.7.1) 86 | lint_roller (~> 1.1) 87 | rubocop (>= 1.72.1) 88 | rubocop-rspec (3.6.0) 89 | lint_roller (~> 1.1) 90 | rubocop (~> 1.72, >= 1.72.1) 91 | ruby-progressbar (1.13.0) 92 | simplecov (0.22.0) 93 | docile (~> 1.1) 94 | simplecov-html (~> 0.11) 95 | simplecov_json_formatter (~> 0.1) 96 | simplecov-html (0.13.1) 97 | simplecov_json_formatter (0.1.4) 98 | stringio (3.1.7) 99 | unicode-display_width (3.1.4) 100 | unicode-emoji (~> 4.0, >= 4.0.4) 101 | unicode-emoji (4.0.4) 102 | yajl-ffi (1.0.0) 103 | ffi (~> 1.16) 104 | 105 | PLATFORMS 106 | aarch64-linux-gnu 107 | aarch64-linux-musl 108 | arm-linux-gnu 109 | arm-linux-musl 110 | arm64-darwin 111 | ruby 112 | x86-linux-gnu 113 | x86-linux-musl 114 | x86_64-darwin 115 | x86_64-linux-gnu 116 | x86_64-linux-musl 117 | 118 | DEPENDENCIES 119 | get_process_mem 120 | irb 121 | json-streamer! 122 | ndhash 123 | rake (~> 13.0) 124 | rspec (~> 3.0) 125 | rubocop (~> 1.75) 126 | rubocop-rake 127 | rubocop-rspec 128 | simplecov 129 | yajl-ffi 130 | 131 | BUNDLED WITH 132 | 2.6.7 133 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Csaba Apagyi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Json::Streamer 2 | 3 | #### Ruby gem for getting data from JSON streams based on various criteria (key, nesting level, etc). 4 | 5 | Status and support 6 | 7 | - ✔ stable 8 | - ✔ supported 9 | - ✖ no ongoing development 10 | 11 | 12 | *You are viewing the README of the default branch. You can find releases [here](https://github.com/thisismydesign/json-streamer/releases).* 13 | 14 | 15 | --- 16 | 17 | #### If you've tried JSON streaming with other Ruby libraries before (e.g. [JSON::Stream](https://github.com/dgraham/json-stream), [Yajl::FFI](https://github.com/dgraham/yajl-ffi)) 18 | 19 | This gem will basically spare you the need to define your own callbacks (i.e. implement an actual JSON parser using `start_object`, `end_object`, `key`, `value`, etc.). 20 | 21 | #### If you're new to this 22 | 23 | Streaming is useful for 24 | - big files that do not fit in the memory (or you'd rather avoid the risk) 25 | - files read in chunks (e.g. arriving over network) 26 | - cases where you expect some issue with the file (e.g. losing connection to source, invalid data at some point) but would like to get as much data as possible anyway 27 | 28 | This gem is aimed at making streaming as easy and convenient as possible. 29 | 30 | #### Performance 31 | 32 | Highly depends on the event generator. 
Out of the box the gem uses [JSON::Stream](https://github.com/dgraham/json-stream). It was chosen because it's a pure Ruby parser with no runtime dependencies. You can use any custom event generator, such as [Yajl::FFI](https://github.com/dgraham/yajl-ffi) which is dependent on the native YAJL library and is [~10 times faster](https://github.com/dgraham/yajl-ffi#performance). See the [Custom event generators](#custom-event-generators) chapter. 33 | 34 | I did not measure the performance of my implementation on top of these libraries. 35 | 36 | #### Dependencies 37 | 38 | The gem's single runtime dependency is [JSON::Stream](https://github.com/dgraham/json-stream). It is only loaded if the default event generator is used. 39 | 40 | ## Installation 41 | 42 | Add this line to your application's Gemfile: 43 | 44 | ```ruby 45 | gem 'json-streamer' 46 | ``` 47 | 48 | And then execute: 49 | 50 | $ bundle 51 | 52 | Or install it yourself as: 53 | 54 | $ gem install json-streamer 55 | 56 | ## Usage 57 | 58 | ```ruby 59 | require 'json/streamer' 60 | ``` 61 | 62 | #### Passing IO upfront 63 | 64 | Since [v1.2.0](https://github.com/thisismydesign/json-streamer/releases/tag/v1.2.0) 65 | 66 | ```ruby 67 | file_stream = File.open('data.json', 'r') 68 | chunk_size = 500 # defaults to 1000 69 | 70 | streamer = Json::Streamer.parser(file_io: file_stream, chunk_size: chunk_size) 71 | ``` 72 | 73 | #### Get objects based on nesting level 74 | 75 | ```ruby 76 | # Level zero yields the full JSON, first level yields data within the JSON 1-by-1, etc. 
77 | streamer.get(nesting_level:1) do |object| 78 | p object 79 | end 80 | ``` 81 | 82 | Input: 83 | ```json 84 | { 85 | "object1": "first_level_value", 86 | "object2": {} 87 | } 88 | ``` 89 | 90 | Output: 91 | ```ruby 92 | "first_level_value" 93 | {} 94 | ``` 95 | 96 | #### Get data based on key 97 | 98 | ```ruby 99 | streamer.get(key:'desired_key') do |object| 100 | p object 101 | end 102 | ``` 103 | 104 | Input: 105 | ```json 106 | { 107 | "obj1" : { 108 | "desired_key" : "value1" 109 | }, 110 | "desired_key" : "value2", 111 | "obj2" : { 112 | "desired_key" : { 113 | "desired_key" : "value3" 114 | } 115 | } 116 | } 117 | ``` 118 | 119 | Output: 120 | ```ruby 121 | "value1" 122 | "value2" 123 | "value3" 124 | {"desired_key" => "value3"} 125 | ``` 126 | 127 | #### Skip values 128 | 129 | ```ruby 130 | streamer.get(nesting_level:1, yield_values:false) do |object| 131 | p object 132 | end 133 | ``` 134 | 135 | Input: 136 | ```json 137 | { 138 | "obj1" : {}, 139 | "key" : "value" 140 | } 141 | ``` 142 | 143 | Output: 144 | ```json 145 | {} 146 | ``` 147 | 148 | #### Symbolize keys 149 | 150 | Since [v1.3.0](https://github.com/thisismydesign/json-streamer/releases/tag/v1.3.0) 151 | 152 | ```ruby 153 | streamer.get(nesting_level:0, symbolize_keys: true) do |object| 154 | p object 155 | end 156 | ``` 157 | 158 | Input: 159 | ```json 160 | { 161 | "obj1" : {"key" : "value"} 162 | } 163 | ``` 164 | 165 | Output: 166 | ```ruby 167 | {:obj1=>{:key=>"value"}} 168 | ``` 169 | 170 | #### Passing IO later (EventMachine-style) 171 | 172 | ```ruby 173 | # Get a JsonStreamer object that provides access to the parser 174 | # but does not start processing immediately 175 | streamer = Json::Streamer.parser 176 | 177 | streamer.get(nesting_level:1) do |object| 178 | p object 179 | end 180 | ``` 181 | 182 | Then later in your EventMachine handler: 183 | 184 | ```ruby 185 | def receive_data(data) 186 | streamer << data 187 | end 188 | ``` 189 | 190 | #### Custom event generators 191 | 
192 | Since [v2.1.0](https://github.com/thisismydesign/json-streamer/releases/tag/v2.1.0) 193 | 194 | ```ruby 195 | require "yajl/ffi" 196 | 197 | Json::Streamer.parser(event_generator: Yajl::FFI::Parser.new) 198 | ``` 199 | 200 | Any parser can be used that provides the right events. The gem is tested with [Yajl::FFI](https://github.com/dgraham/yajl-ffi) and [JSON::Stream](https://github.com/dgraham/json-stream). 201 | 202 | #### Custom yield conditions 203 | 204 | Since [v2.0.0](https://github.com/thisismydesign/json-streamer/releases/tag/v2.0.0) 205 | 206 | Custom conditions provide ultimate control over what to yield. 207 | 208 | The Conditions API exposes 3 callbacks: 209 | - `yield_value` 210 | - `yield_array` 211 | - `yield_object` 212 | 213 | Each of them may be redefined. They are called once the corresponding data (value, array or object) is available. They should return whether the data should be yielded for the outside. They receive the data and the `aggregator` as parameters. 
214 | 215 | The `aggregator` exposes data about the current state of the partly parsed JSON such as: 216 | - `level` - current level 217 | - `key` - current key 218 | - `value` - current value 219 | - `key_for_level(level)` - key for custom level 220 | - `value_for_level(level)` - value for custom level 221 | - `get` - the raw data (in a custom format) 222 | 223 | Example usage (inspired by [this issue](https://github.com/thisismydesign/json-streamer/issues/7#issuecomment-330232484)): 224 | 225 | ```ruby 226 | conditions = Json::Streamer::Conditions.new 227 | conditions.yield_value = ->(aggregator:, value:) { false } 228 | conditions.yield_array = ->(aggregator:, array:) { false } 229 | conditions.yield_object = lambda do |aggregator:, object:| 230 | aggregator.level.eql?(2) && aggregator.key_for_level(1).eql?('items1') 231 | end 232 | 233 | streamer.get_with_conditions(conditions) do |object| 234 | p object 235 | end 236 | ``` 237 | 238 | Input: 239 | 240 | ```json 241 | { 242 | "other": "stuff", 243 | "items1": [ 244 | { 245 | "key1": "value" 246 | }, 247 | { 248 | "key2": "value" 249 | } 250 | ], 251 | "items2": [ 252 | { 253 | "key3": "value" 254 | }, 255 | { 256 | "key4": "value" 257 | } 258 | ] 259 | } 260 | ``` 261 | 262 | Output: 263 | 264 | ```ruby 265 | {"key1"=>"value"} 266 | {"key2"=>"value"} 267 | ``` 268 | 269 | #### Get an Enumerable when not passing a block 270 | 271 | Since [v2.1.0](https://github.com/thisismydesign/json-streamer/releases/tag/v2.1.0) 272 | 273 | When _not_ passed a block both `get` and `get_with_conditions` return an array of the requested objects. When passed a block they return an empty array. This means that **when _not_ passed a block the requested objects will accumulate in memory**.
274 | 275 | Without block 276 | 277 | ```ruby 278 | objects = streamer.get(nesting_level:1) 279 | p objects 280 | ``` 281 | 282 | Input: 283 | ```json 284 | { 285 | "object1": "first_level_value", 286 | "object2": {} 287 | } 288 | ``` 289 | 290 | Output: 291 | ```ruby 292 | ["first_level_value", {}] 293 | ``` 294 | 295 | With block 296 | 297 | ```ruby 298 | unyielded_objects = streamer.get(nesting_level:1) { |object| do_something(object) } 299 | p unyielded_objects 300 | ``` 301 | 302 | Input: 303 | ```json 304 | { 305 | "object1": "first_level_value", 306 | "object2": {} 307 | } 308 | ``` 309 | 310 | Output: 311 | ```ruby 312 | [] 313 | ``` 314 | 315 | #### Other usage information 316 | 317 | Check the unit tests for more examples ([spec/json/streamer/json_streamer_spec.rb](spec/json/streamer/json_streamer_spec.rb)). 318 | 319 | One `streamer` object handles one set of conditions. For multiple conditions create multiple streamers. For more details see [this discussion](https://github.com/thisismydesign/json-streamer/issues/9). 320 | 321 | #### Deprecated API 322 | 323 | Pre [v1.2.0](https://github.com/thisismydesign/json-streamer/releases/tag/v1.2.0) 324 | 325 | This functionality is deprecated but kept for compatibility reasons. 326 | 327 | ```ruby 328 | # Same as Json::Streamer.parser 329 | streamer = Json::Streamer::JsonStreamer.new 330 | ``` 331 | 332 | ```ruby 333 | # Same as streamer << data 334 | streamer.parser << data 335 | ``` 336 | 337 | ## Development 338 | 339 | Devcontainer / Codespaces / Native 340 | 341 | ```sh 342 | bin/setup 343 | ``` 344 | 345 | Docker 346 | 347 | ```sh 348 | docker compose up -d 349 | docker compose exec ruby bin/setup 350 | ``` 351 | 352 | Then, run `bundle exec rspec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 353 | 354 | To install this gem onto your local machine, run `bundle exec rake install`.
To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org). 355 | 356 | ## Contributing 357 | 358 | Bug reports and pull requests are welcome on GitHub at https://github.com/thisismydesign/json-streamer. 359 | 360 | ## License 361 | 362 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). 363 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'bundler/gem_tasks' 4 | require 'rspec/core/rake_task' 5 | 6 | RSpec::Core::RakeTask.new(:spec) 7 | 8 | desc 'Check if source can be required locally' 9 | task :require do 10 | sh "ruby -e \"require '#{File.dirname __FILE__}/lib/json/streamer'\"" 11 | end 12 | 13 | task default: %i[require spec] 14 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'bundler/setup' 5 | require 'json/streamer' 6 | 7 | # You can add fixtures and/or initialization code here to make experimenting 8 | # with your gem easier. You can also use a different console, if you like. 9 | 10 | # (If you use this, don't forget to add pry to your Gemfile!) 
11 | # require "pry" 12 | # Pry.start 13 | 14 | require 'irb' 15 | IRB.start 16 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | ruby: 3 | build: . 4 | image: thisismydesign/json-streamer:local 5 | command: sleep infinity 6 | volumes: 7 | - bundle_cache:/usr/local/bundle 8 | - .:/workspaces/json-streamer:cached 9 | 10 | volumes: 11 | bundle_cache: 12 | -------------------------------------------------------------------------------- /json-streamer.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative 'lib/json/streamer/version' 4 | 5 | Gem::Specification.new do |spec| 6 | spec.name = 'json-streamer' 7 | spec.version = Json::Streamer::VERSION 8 | spec.authors = ['thisismydesign'] 9 | spec.email = ['thisismydesign@users.noreply.github.com'] 10 | 11 | spec.summary = 'Stream JSON data based on various criteria (key, nesting level, etc).' 12 | spec.homepage = 'https://github.com/thisismydesign/json-streamer' 13 | spec.license = 'MIT' 14 | spec.required_ruby_version = '>= 3.1.0' 15 | 16 | spec.metadata['homepage_uri'] = spec.homepage 17 | spec.metadata['source_code_uri'] = 'https://github.com/thisismydesign/json-streamer' 18 | spec.metadata['rubygems_mfa_required'] = 'true' 19 | 20 | # Specify which files should be added to the gem when it is released. 21 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 
22 | gemspec = File.basename(__FILE__) 23 | spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__, err: IO::NULL) do |ls| 24 | ls.readlines("\x0", chomp: true).reject do |f| 25 | (f == gemspec) || 26 | f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile]) 27 | end 28 | end 29 | spec.bindir = 'exe' 30 | spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) } 31 | spec.require_paths = ['lib'] 32 | 33 | spec.add_dependency 'json-stream' 34 | end 35 | -------------------------------------------------------------------------------- /lib/json/streamer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative 'streamer/json_streamer' 4 | 5 | module Json 6 | module Streamer 7 | def self.parser(file_io: nil, chunk_size: 1000, event_generator: :default) 8 | JsonStreamer.new(file_io, chunk_size, event_generator) 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/json/streamer/aggregator.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'forwardable' 4 | 5 | module Json 6 | module Streamer 7 | class Aggregator 8 | extend Forwardable 9 | def_delegators :@aggregator, :pop, :push, :empty? 10 | 11 | def initialize 12 | @aggregator = [] 13 | end 14 | 15 | def get 16 | @aggregator 17 | end 18 | 19 | def level 20 | @aggregator.size 21 | end 22 | 23 | def key 24 | @aggregator.last[:key] unless @aggregator.last.nil? 25 | end 26 | 27 | def key=(param) 28 | @aggregator.last[:key] = param 29 | end 30 | 31 | def value 32 | @aggregator.last[:value] 33 | end 34 | 35 | def value=(param) 36 | if array_level? 37 | value << param 38 | else 39 | value[key] = param 40 | end 41 | end 42 | 43 | def key_for_level(level) 44 | @aggregator[level - 1][:key] unless @aggregator[level - 1].nil? 
45 | end 46 | 47 | def value_for_level(level) 48 | @aggregator[level - 1][:key] unless @aggregator[level - 1].nil? 49 | end 50 | 51 | private 52 | 53 | def array_level? 54 | value.is_a?(Array) 55 | end 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /lib/json/streamer/callbacks.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Json 4 | module Streamer 5 | class Callbacks 6 | attr_reader :aggregator 7 | 8 | def initialize(aggregator) 9 | @aggregator = aggregator 10 | end 11 | 12 | def start_object 13 | new_level({}) 14 | end 15 | 16 | def start_array 17 | new_level([]) 18 | end 19 | 20 | def key(param, symbolize_keys) 21 | @aggregator.key = symbolize_keys ? param.to_sym : param 22 | end 23 | 24 | def value(value) 25 | used = yield value 26 | add_value(value) unless used 27 | end 28 | 29 | def end_object(&) 30 | end_level(&) 31 | end 32 | 33 | def end_array(&) 34 | end_level(&) 35 | end 36 | 37 | private 38 | 39 | def end_level 40 | data = @aggregator.value.clone 41 | 42 | @aggregator.pop 43 | 44 | used = yield data 45 | add_value(data) unless used || @aggregator.empty? 
46 | end 47 | 48 | def add_value(value) 49 | @aggregator.value = value 50 | end 51 | 52 | def new_level(type) 53 | @aggregator.push(value: type) 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/json/streamer/conditions.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Json 4 | module Streamer 5 | class Conditions 6 | attr_accessor :yield_value, :yield_object, :yield_array 7 | 8 | def initialize(yield_level: -1, yield_key: nil) 9 | @yield_level = yield_level 10 | @yield_key = yield_key 11 | 12 | @yield_value = ->(aggregator:, value: nil) { yield?(aggregator) } 13 | @yield_object = ->(aggregator:, object: nil) { yield?(aggregator) } 14 | @yield_array = ->(aggregator:, array: nil) { yield?(aggregator) } 15 | end 16 | 17 | private 18 | 19 | def yield?(aggregator) 20 | aggregator.level.eql?(@yield_level) or (!@yield_key.nil? and @yield_key == aggregator.key) 21 | end 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /lib/json/streamer/json_streamer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative 'conditions' 4 | require_relative 'parser' 5 | 6 | module Json 7 | module Streamer 8 | class JsonStreamer 9 | attr_reader :parser 10 | 11 | def initialize(file_io = nil, chunk_size = 1000, event_generator = :default) 12 | @event_generator = make_event_generator(event_generator) 13 | 14 | @file_io = file_io 15 | @chunk_size = chunk_size 16 | end 17 | 18 | def <<(data) 19 | parser << data 20 | end 21 | 22 | # rubocop:disable Metrics/MethodLength 23 | def get(nesting_level: -1, key: nil, yield_values: true, symbolize_keys: false) 24 | conditions = Conditions.new(yield_level: nesting_level, yield_key: key) 25 | conditions.yield_value = ->(aggregator:, value:) { false 
module Json
  module Streamer
    # Connects a streaming event generator to the aggregation callbacks.
    #
    # Every event emitted by the generator is forwarded to a Callbacks
    # instance built around a fresh Aggregator. Whatever the callbacks hand
    # back for a value, object or array is yielded to the block given to #get
    # when the matching condition lambda returns a truthy result.
    class Parser
      # event_generator - object responding to start_object, start_array, key,
      #                   value, end_object and end_array (each taking a
      #                   block) and to <<.
      # options         - Hash; options[:symbolize_keys] is forwarded with
      #                   every key event.
      def initialize(event_generator, options = {})
        @event_generator = event_generator
        @symbolize_keys = options[:symbolize_keys]

        @aggregator = Aggregator.new
        @event_consumer = Callbacks.new(@aggregator)
      end

      # Registers the event handlers on the generator.
      #
      # conditions - object exposing yield_value, yield_object and yield_array
      #              callables; each is called with the aggregator and the
      #              candidate element as keywords.
      #
      # Yields each element whose condition evaluated truthy.
      # rubocop:disable Metrics/MethodLength
      # rubocop:disable Metrics/AbcSize
      def get(conditions)
        @event_generator.start_object { @event_consumer.start_object }
        @event_generator.start_array { @event_consumer.start_array }
        @event_generator.key { |name| @event_consumer.key(name, @symbolize_keys) }

        @event_generator.value do |raw|
          @event_consumer.value(raw) do |value|
            wanted = conditions.yield_value.call(aggregator: @aggregator, value: value)
            yield value if wanted
          end
        end

        @event_generator.end_object do
          @event_consumer.end_object do |object|
            wanted = conditions.yield_object.call(aggregator: @aggregator, object: object)
            yield object if wanted
          end
        end

        @event_generator.end_array do
          @event_consumer.end_array do |array|
            wanted = conditions.yield_array.call(aggregator: @aggregator, array: array)
            yield array if wanted
          end
        end
      end
      # rubocop:enable Metrics/MethodLength
      # rubocop:enable Metrics/AbcSize

      # Feeds a chunk of raw input to the event generator.
      def <<(data)
        @event_generator << data
      end

      # Returns the aggregator's current contents (the result of Aggregator#get).
      def aggregator
        @aggregator.get
      end
    end
  end
end
'when true' do 18 | let(:level) { 1 } 19 | let(:yield_level) { 1 } 20 | 21 | it 'returns whether provided level equals yield_level' do 22 | expect(conditions.send(method).call(aggregator: aggregator)).to be_truthy 23 | end 24 | end 25 | 26 | context 'when false' do 27 | let(:level) { 2 } 28 | let(:yield_level) { 1 } 29 | 30 | it 'returns whether provided level equals yield_level' do 31 | expect(conditions.send(method).call(aggregator: aggregator)).to be_falsey 32 | end 33 | end 34 | end 35 | 36 | context 'with key' do 37 | context 'when true' do 38 | let(:key) { 'key' } 39 | let(:yield_key) { 'key' } 40 | 41 | it 'returns whether provided key equals yield_key' do 42 | expect(conditions.send(method).call(aggregator: aggregator)).to be_truthy 43 | end 44 | end 45 | 46 | context 'when false' do 47 | let(:key) { 'else' } 48 | let(:yield_key) { 'key' } 49 | 50 | it 'returns whether provided key equals yield_key' do 51 | expect(conditions.send(method).call(aggregator: aggregator)).to be_falsey 52 | end 53 | end 54 | end 55 | end 56 | 57 | describe '#yield_value' do 58 | it_behaves_like 'yield', :yield_value 59 | end 60 | 61 | describe '#yield_object' do 62 | it_behaves_like 'yield', :yield_object 63 | end 64 | 65 | describe '#yield_array' do 66 | it_behaves_like 'yield', :yield_array 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /spec/json/streamer/json_streamer_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.shared_examples 'Json::Streamer::JsonStreamer' do 4 | let(:example_key) { 'key' } 5 | let(:example_value) { 'value' } 6 | let(:example_hash) { { example_key => example_value } } 7 | let(:example_multi_level_hash) { { object1: example_hash, object2: example_hash, object3: example_hash } } 8 | let(:chunk_size) { 10 } 9 | let(:json) { JSON.generate(hash) } 10 | let(:json_file_mock) { StringIO.new(json) } 11 | 
let(:yielded_objects) { [] } 12 | let(:streamer) { described_class.new(json_file_mock, chunk_size, event_generator) } 13 | 14 | before do 15 | if DEBUG 16 | highlight('INPUT') do 17 | puts JSON.pretty_generate(hash) if defined?(hash) 18 | end 19 | end 20 | end 21 | 22 | after do 23 | if DEBUG 24 | highlight('OUTPUT') do 25 | puts JSON.pretty_generate(yielded_objects) if defined?(yielded_objects) 26 | end 27 | end 28 | end 29 | 30 | describe '#<<' do 31 | it 'forwards data to parser' do 32 | streamer = Json::Streamer.parser 33 | allow(streamer).to receive(:<<) 34 | 35 | streamer << {} 36 | 37 | expect(streamer).to have_received(:<<).with({}) 38 | end 39 | end 40 | 41 | RSpec.shared_examples 'an iterable object' do 42 | let(:hash) { example_multi_level_hash } 43 | 44 | context 'when no block is passed' do 45 | subject(:send) { streamer.send(method, **params) } 46 | 47 | it 'returns an Enumerable' do 48 | expect(send).to be_a(Enumerable) 49 | end 50 | 51 | it 'returns array of items that would have been yielded' do 52 | expect(send).to eq(Array.new(3) { example_hash }) 53 | end 54 | end 55 | 56 | context 'when a block is passed' do 57 | it 'yields' do 58 | expect do |block| 59 | streamer.send(method, **params, &block) 60 | end.to yield_control 61 | end 62 | end 63 | 64 | context 'when an empty block is passed' do 65 | it 'returns an empty Enumerable' do 66 | # rubocop:disable Lint/EmptyBlock 67 | unyielded_objects = streamer.send(method, **params) {} 68 | # rubocop:enable Lint/EmptyBlock 69 | 70 | expect(unyielded_objects).to eq([]) 71 | end 72 | end 73 | end 74 | 75 | describe '#get' do 76 | describe 'API interaction' do 77 | let(:params) { { nesting_level: 1 } } 78 | let(:method) { :get } 79 | 80 | it_behaves_like 'an iterable object' 81 | end 82 | 83 | context 'when block is passed' do 84 | before do 85 | streamer.get(**params) do |object| 86 | yielded_objects << object 87 | end 88 | end 89 | 90 | describe 'nesting_level option' do 91 | context 'with JSON objects' 
do 92 | context 'when at 0th level of empty' do 93 | let(:hash) { {} } 94 | let(:params) { { nesting_level: 0 } } 95 | 96 | it 'yields empty JSON object' do 97 | expect(yielded_objects).to eq([{}]) 98 | end 99 | end 100 | 101 | context 'when at 0th level' do 102 | let(:hash) { { example_key => example_hash } } 103 | let(:params) { { nesting_level: 0 } } 104 | 105 | it 'yields whole JSON' do 106 | expect(yielded_objects).to eq([{ example_key => example_hash }]) 107 | end 108 | end 109 | 110 | context 'when at 1st level' do 111 | let(:hash) { example_multi_level_hash } 112 | let(:params) { { nesting_level: 1 } } 113 | 114 | it 'yields objects within JSON object' do 115 | expect(yielded_objects).to eq([example_hash, example_hash, example_hash]) 116 | end 117 | end 118 | end 119 | 120 | context 'with JSON arrays' do 121 | context 'when at 0th level of flat' do 122 | let(:hash) { [example_value, example_value] } 123 | let(:params) { { nesting_level: 0 } } 124 | 125 | it 'yields whole array' do 126 | expect(yielded_objects).to eq([[example_value, example_value]]) 127 | end 128 | end 129 | 130 | context 'when at 1st level of flat' do 131 | let(:hash) { Array.new(10) { example_hash } } 132 | let(:params) { { nesting_level: 1 } } 133 | 134 | it 'yields objects in array' do 135 | expect(yielded_objects).to eq(hash) 136 | end 137 | end 138 | 139 | context 'when at 1st level of multi-level' do 140 | let(:hash) { [[example_hash, example_hash, example_hash]] } 141 | let(:params) { { nesting_level: 1 } } 142 | 143 | it 'yields nested array' do 144 | expect(yielded_objects).to eq([[example_hash, example_hash, example_hash]]) 145 | end 146 | end 147 | 148 | context 'when at 2nd level of multi-level' do 149 | let(:hash) { [[example_hash, example_hash, example_hash]] } 150 | let(:params) { { nesting_level: 2 } } 151 | 152 | it 'yields nested array elements' do 153 | expect(yielded_objects).to eq([example_hash, example_hash, example_hash]) 154 | end 155 | end 156 | end 157 | end 158 | 
159 | describe 'key option' do 160 | context 'with JSON objects' do 161 | context 'when flat, key pointing to value' do 162 | let(:hash) { example_hash } 163 | let(:params) { { key: example_key } } 164 | 165 | it 'yields value' do 166 | expect(yielded_objects).to eq([example_value]) 167 | end 168 | end 169 | 170 | context 'with multi level, key pointing to values' do 171 | let(:hash) { example_multi_level_hash } 172 | let(:params) { { key: example_key } } 173 | 174 | it 'yields values' do 175 | expect(yielded_objects).to eq([example_value, example_value, example_value]) 176 | end 177 | end 178 | 179 | context 'with multi level, key pointing to values and objects' do 180 | let(:hash) { example_multi_level_hash } 181 | let(:params) { { key: example_key } } 182 | 183 | it 'yields values and objects from all levels' do 184 | expect(yielded_objects).to eq([example_value, example_value, example_value]) 185 | end 186 | end 187 | end 188 | 189 | context 'with JSON arrays' do 190 | context 'when key pointing to nested array' do 191 | let(:hash) { { items: [[[example_hash, example_hash, example_hash]]] } } 192 | let(:params) { { nesting_level: 1 } } 193 | 194 | it 'does not yield trailing empty arrays' do 195 | expect(yielded_objects.length).to eq(1) 196 | end 197 | 198 | it 'yields nested arrays with the correct nesting' do 199 | expect(yielded_objects).to eq([[[[example_hash, example_hash, example_hash]]]]) 200 | end 201 | end 202 | 203 | context 'with keys pointing to array' do 204 | let(:hash) { { items: [example_hash, example_value, example_hash] } } 205 | let(:params) { { key: 'items' } } 206 | 207 | it 'yields array' do 208 | expect(yielded_objects).to eq([[example_hash, example_value, example_hash]]) 209 | end 210 | end 211 | 212 | context 'with nested keys pointing to array' do 213 | let(:hash) { { items: { nested_items: [example_hash, example_value, example_hash] } } } 214 | let(:params) { { key: 'items' } } 215 | 216 | it 'keeps key pointing to arrays' do 217 | 
expect(yielded_objects).to eq([{ 'nested_items' => [example_hash, example_value, example_hash] }]) 218 | end 219 | end 220 | end 221 | 222 | context 'when parsing by both JSON arrays and objects' do 223 | context 'with nested keys pointing to array and object' do 224 | let(:hash) do 225 | { items: { nested_items: [example_hash, example_value, example_hash] }, nested_items: example_hash } 226 | end 227 | let(:params) { { key: 'nested_items' } } 228 | 229 | it 'yields both array and object' do 230 | expect(yielded_objects).to eq([[example_hash, example_value, example_hash], example_hash]) 231 | end 232 | end 233 | end 234 | end 235 | 236 | describe 'yield_values option' do 237 | let(:hash) { { obj: example_hash, obj2: { nested_obj: example_hash } } } 238 | 239 | context 'when enabled' do 240 | let(:params) { { nesting_level: 2 } } 241 | 242 | it 'yields values from given level' do 243 | expect(yielded_objects).to eq([example_value, example_hash]) 244 | end 245 | end 246 | 247 | context 'when disabled' do 248 | let(:params) { { nesting_level: 2, yield_values: false } } 249 | 250 | it 'does not yield values from given level' do 251 | expect(yielded_objects).to eq([example_hash]) 252 | end 253 | end 254 | end 255 | 256 | describe 'EventMachine style input' do 257 | let(:streamer) { Json::Streamer::JsonStreamer.new } 258 | let(:hash) { example_multi_level_hash } 259 | let(:params) { { nesting_level: 1 } } 260 | 261 | context 'with input piped to parser' do 262 | it 'yields objects within JSON object' do 263 | streamer.parser << json 264 | 265 | expect(yielded_objects).to eq([example_hash, example_hash, example_hash]) 266 | end 267 | end 268 | 269 | context 'with chunked input piped to parser' do 270 | it 'yields objects within JSON object' do 271 | json_file_mock.each(chunk_size) do |chunk| 272 | streamer.parser << chunk 273 | end 274 | 275 | expect(yielded_objects).to eq([example_hash, example_hash, example_hash]) 276 | end 277 | end 278 | end 279 | 280 | describe 
'finished parsing' do 281 | let(:hash) { { obj: example_hash } } 282 | let(:params) { { nesting_level: 0 } } 283 | 284 | it 'removes object from local store' do 285 | expect(streamer.aggregator).to be_empty 286 | end 287 | end 288 | 289 | describe 'edge cases' do 290 | context 'when conditions overlap' do 291 | let(:hash) { { example_key => { example_key => example_hash } } } 292 | let(:params) { { key: example_key } } 293 | 294 | it 'consumes object on first occurrence' do 295 | expect(yielded_objects).to eq([example_value, {}, {}]) 296 | end 297 | end 298 | 299 | context 'when nesting_level and key both point to the same object' do 300 | let(:hash) { { items: { nested_items: [example_value, example_value, example_value] } } } 301 | let(:params) { { key: 'nested_items', nesting_level: 2 } } 302 | 303 | it 'yields the object once' do 304 | expect(yielded_objects).to eq([[example_value, example_value, example_value]]) 305 | end 306 | end 307 | end 308 | 309 | describe 'symbolize_keys option' do 310 | let(:hash) { { 'object' => example_hash } } 311 | let(:params) { { nesting_level: 0, symbolize_keys: true } } 312 | 313 | it 'symbolizes keys' do 314 | expect(yielded_objects).to eq([{ object: { key: 'value' } }]) 315 | end 316 | end 317 | end 318 | end 319 | 320 | describe '#get_with_conditions' do 321 | let(:conditions) { Json::Streamer::Conditions.new(yield_key: 'nested_items') } 322 | 323 | describe 'API interaction' do 324 | let(:params) do 325 | conditions = Json::Streamer::Conditions.new 326 | conditions.yield_object = ->(aggregator:, object:) { aggregator.level.eql?(1) } 327 | conditions 328 | end 329 | let(:method) { :get_with_conditions } 330 | 331 | # Same as shared context but without keyword arguments 332 | describe 'it_behaves_like an iterable object' do 333 | let(:hash) { example_multi_level_hash } 334 | 335 | context 'when no block is passed' do 336 | subject(:send) { streamer.send(method, params) } 337 | 338 | it 'returns an Enumerable' do 339 |
expect(send).to be_a(Enumerable) 340 | end 341 | 342 | it 'returns array of items that would have been yielded' do 343 | expect(send).to eq(Array.new(3) { example_hash }) 344 | end 345 | end 346 | 347 | context 'when a block is passed' do 348 | it 'yields' do 349 | expect do |block| 350 | streamer.send(method, params, &block) 351 | end.to yield_control 352 | end 353 | end 354 | 355 | context 'when an empty block is passed' do 356 | it 'returns an empty Enumerable' do 357 | # rubocop:disable Lint/EmptyBlock 358 | unyielded_objects = streamer.send(method, params) {} 359 | # rubocop:enable Lint/EmptyBlock 360 | 361 | expect(unyielded_objects).to eq([]) 362 | end 363 | end 364 | end 365 | end 366 | 367 | context 'when block is passed' do 368 | before do 369 | streamer.get_with_conditions(conditions) do |object| 370 | yielded_objects << object 371 | end 372 | end 373 | 374 | context 'when there are both JSON arrays and objects' do 375 | context 'when nested keys point to array and object' do 376 | let(:hash) do 377 | { items: { nested_items: [example_hash, example_value, example_hash] }, nested_items: example_hash } 378 | end 379 | 380 | it 'yields both array and object' do 381 | expect(yielded_objects).to eq([[example_hash, example_value, example_hash], example_hash]) 382 | end 383 | end 384 | end 385 | 386 | context 'when cannot be solved via regular get' do 387 | let(:conditions) do 388 | conditions = Json::Streamer::Conditions.new 389 | conditions.yield_value = ->(aggregator:, value:) { false } 390 | conditions.yield_array = ->(aggregator:, array:) { false } 391 | conditions.yield_object = lambda do |aggregator:, object:| 392 | aggregator.level.eql?(2) && aggregator.key_for_level(1).eql?('items1') 393 | end 394 | conditions 395 | end 396 | 397 | let(:hash) do 398 | { 399 | other: 'stuff', 400 | items1: [ 401 | { 402 | key1: 'value' 403 | }, 404 | { 405 | key2: 'value' 406 | } 407 | ], 408 | items2: [ 409 | { 410 | key3: 'value' 411 | }, 412 | { 413 | key4: 'value' 
414 | } 415 | ] 416 | } 417 | end 418 | 419 | it 'solves it ^^' do 420 | expect(yielded_objects).to eq([{ 'key1' => 'value' }, { 'key2' => 'value' }]) 421 | end 422 | end 423 | end 424 | end 425 | 426 | describe '#get (generated)' do 427 | context 'with JSONs with various nesting level and number of objects per level' do 428 | # rubocop:disable RSpec/ExampleLength 429 | it 'yields all objects on desired level (checking number of yielded objects)' do 430 | # Setting these options to high can cause the test to run longer 431 | entries_per_level = 2 432 | max_levels = 10 433 | 434 | (1..max_levels).each do |max_level| 435 | hash = NDHash.generate(levels: max_level, values_per_level: 0, hashes_per_level: entries_per_level) 436 | json_file_mock = StringIO.new(JSON.generate(hash)) 437 | streamer = Json::Streamer::JsonStreamer.new(json_file_mock) 438 | 439 | yielded_objects = [] 440 | streamer.get(nesting_level: max_level - 1) do |object| 441 | yielded_objects << object 442 | end 443 | 444 | expect(yielded_objects.length).to eq(entries_per_level**(max_level - 1)) 445 | end 446 | end 447 | # rubocop:enable RSpec/ExampleLength 448 | end 449 | end 450 | end 451 | 452 | RSpec.describe Json::Streamer::JsonStreamer do 453 | context 'when using default event generator' do 454 | let(:event_generator) { :default } 455 | 456 | it_behaves_like 'Json::Streamer::JsonStreamer' 457 | end 458 | 459 | context 'when using custom yajl/ffi event generator' do 460 | require 'yajl/ffi' 461 | let(:event_generator) { Yajl::FFI::Parser.new } 462 | 463 | it_behaves_like 'Json::Streamer::JsonStreamer' 464 | end 465 | end 466 | -------------------------------------------------------------------------------- /spec/json/streamer_memory_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe Json::Streamer do 4 | describe 'memory usage', speed: 'slow' do 5 | before do 6 | GC.start 7 | highlight('MEMORY USAGE 
TEST') 8 | end 9 | 10 | let(:example_hash) { { 'key' => rand } } 11 | let(:size) { 2**18 } 12 | let(:hash) { Array.new(size) { content } } 13 | let!(:json_file_mock) { StringIO.new(JSON.generate(hash)) } 14 | 15 | RSpec.shared_examples 'does not consumne memory' do 16 | # rubocop:disable RSpec/ExampleLength 17 | # rubocop:disable RSpec/MultipleExpectations 18 | it 'does not increase memory consumption' do 19 | p "Number of elements: #{size}" 20 | memory_usage_before_parsing = current_memory_usage 21 | p "Memory consumption before parsing: #{memory_usage_before_parsing} MB" 22 | 23 | streamer = described_class.parser(file_io: json_file_mock) 24 | object_count = 0 25 | streamer.get(nesting_level: 1) do 26 | object_count += 1 27 | end 28 | expect(object_count).to eq(size) 29 | 30 | memory_usage_after_parsing = current_memory_usage 31 | p "Memory consumption after parsing: #{memory_usage_after_parsing.round} MB" 32 | 33 | expect(memory_usage_after_parsing).to be < 1.1 * memory_usage_before_parsing 34 | p 'With JSON::Streamer memory consumption did not increase by more than 10% during processing.' 
35 | end 36 | # rubocop:enable RSpec/ExampleLength 37 | # rubocop:enable RSpec/MultipleExpectations 38 | end 39 | 40 | context 'with streaming' do 41 | context 'with array of objects parsed with JSON::Streamer' do 42 | let(:content) { example_hash } 43 | 44 | it_behaves_like 'does not consumne memory' 45 | end 46 | 47 | context 'with array of values parsed with JSON::Streamer' do 48 | let(:content) { rand } 49 | 50 | it_behaves_like 'does not consumne memory' 51 | end 52 | 53 | context 'with array of arrays parsed with JSON::Streamer' do 54 | let(:content) { [rand] } 55 | 56 | it_behaves_like 'does not consumne memory' 57 | end 58 | end 59 | 60 | context 'without streaming' do 61 | context 'with array of objects parsed with JSON::Stream' do 62 | let(:content) { example_hash } 63 | 64 | # rubocop:disable RSpec/MultipleExpectations 65 | # rubocop:disable RSpec/ExampleLength 66 | it 'increases memory consumption' do 67 | p "Number of elements: #{size}" 68 | memory_usage_before_parsing = current_memory_usage 69 | p "Memory consumption before parsing: #{memory_usage_before_parsing} MB" 70 | 71 | object = JSON::Stream::Parser.parse(json_file_mock) 72 | expect(object.length).to eq(size) 73 | 74 | memory_usage_after_parsing = current_memory_usage 75 | p "Memory consumption after parsing: #{memory_usage_after_parsing.round} MB" 76 | 77 | expect(memory_usage_after_parsing).to be > 1.5 * memory_usage_before_parsing 78 | p 'With JSON::Stream memory consumption increased by at least 50% during processing.' 
# Prints a banner containing +msg+ to standard output.
#
# When a block is given, the block runs after the opening banner and a
# matching "END" banner is printed once it returns. Without a block only the
# opening banner is printed. Always returns nil.
def highlight(msg)
  opening_bar = '#' * 10
  puts("\n#{opening_bar} #{msg} #{opening_bar}\n\n")

  if block_given?
    yield
    closing_bar = '#' * 8
    puts("\n#{closing_bar} #{msg} END #{closing_bar}\n\n")
  end
end