├── .ruby-version ├── .ruby-gemset ├── .rspec ├── Gemfile ├── spec ├── support │ └── streaming_api │ │ ├── streaming_api.rb │ │ ├── config.ru │ │ ├── app.rb │ │ └── helpers.rb ├── stream_lines_spec.rb ├── spec_helper.rb └── reading │ ├── json_lines_spec.rb │ ├── csv_spec.rb │ └── stream_spec.rb ├── lib ├── stream_lines │ ├── version.rb │ ├── reading.rb │ ├── error.rb │ └── reading │ │ ├── json_lines.rb │ │ ├── csv.rb │ │ └── stream.rb └── stream_lines.rb ├── bin ├── setup └── console ├── .gitignore ├── Rakefile ├── .github ├── dependabot.yml └── workflows │ ├── ruby-tests.yml │ └── style-check.yml ├── .rubocop.yml ├── LICENSE.txt ├── stream_lines.gemspec ├── CODE_OF_CONDUCT.md ├── Gemfile.lock └── README.md /.ruby-version: -------------------------------------------------------------------------------- 1 | 3.3.0 2 | -------------------------------------------------------------------------------- /.ruby-gemset: -------------------------------------------------------------------------------- 1 | stream_lines 2 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --require spec_helper 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org' 4 | 5 | gemspec 6 | -------------------------------------------------------------------------------- /spec/support/streaming_api/streaming_api.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module StreamingApi 4 | end 5 | -------------------------------------------------------------------------------- /lib/stream_lines/version.rb: 
-------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module StreamLines 4 | VERSION = '0.4.1' 5 | end 6 | -------------------------------------------------------------------------------- /lib/stream_lines/reading.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module StreamLines 4 | module Reading 5 | end 6 | end 7 | -------------------------------------------------------------------------------- /spec/support/streaming_api/config.ru: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative 'app' 4 | 5 | run App.run! 6 | -------------------------------------------------------------------------------- /lib/stream_lines/error.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module StreamLines 4 | class Error < RuntimeError; end 5 | end 6 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /spec/stream_lines_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe StreamLines do 4 | it 'has a version number' do 5 | expect(StreamLines::VERSION).not_to be nil 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 
6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | *.swp 10 | .byebug_history 11 | 12 | # rspec failure tracking 13 | .rspec_status 14 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'bundler/gem_tasks' 4 | require 'bundler/gem_version_tasks' 5 | require 'rspec/core/rake_task' 6 | 7 | RSpec::Core::RakeTask.new(:spec) 8 | 9 | task default: :spec 10 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: bundler 4 | directory: "/" 5 | schedule: 6 | interval: weekly 7 | time: "10:00" 8 | open-pull-requests-limit: 10 9 | ignore: 10 | - dependency-name: rubocop 11 | versions: 12 | - 1.12.0 13 | -------------------------------------------------------------------------------- /lib/stream_lines.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'stream_lines/error' 4 | require 'stream_lines/version' 5 | require 'stream_lines/reading/csv' 6 | require 'stream_lines/reading/json_lines' 7 | require 'stream_lines/reading/stream' 8 | 9 | module StreamLines 10 | end 11 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | NewCops: 'enable' 3 | SuggestExtensions: false 4 | TargetRubyVersion: 2.5 5 | Exclude: 6 | - '*.gemspec' 7 | - 'vendor/bundle/**/*' 8 | 9 | Metrics/BlockLength: 10 | Exclude: 11 | - 'spec/**/*.rb' 12 | 13 | Layout/LineLength: 14 | Max: 99 15 | 16 | Style/Documentation: 17 | Enabled: false 18 | 19 | Style/MapIntoArray: 20 | Exclude: 21 | - 'spec/reading/**/*.rb' # most specs call each 
intentionally since a Stream is an Enumerable 22 | -------------------------------------------------------------------------------- /spec/support/streaming_api/app.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'sinatra/base' 4 | require 'sinatra/streaming' 5 | 6 | module StreamingApi 7 | class App < Sinatra::Base 8 | DATA_FILE = File.join(__dir__, 'data.txt') 9 | 10 | helpers Sinatra::Streaming 11 | 12 | get '/stream_data' do 13 | stream do |out| 14 | File.foreach(DATA_FILE) do |line| 15 | out << line 16 | end 17 | 18 | out.flush 19 | end 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # frozen_string_literal: true 4 | 5 | require 'bundler/setup' 6 | require 'byebug' 7 | require 'stream_lines' 8 | require 'charlock_holmes/string' 9 | 10 | # You can add fixtures and/or initialization code here to make experimenting 11 | # with your gem easier. You can also use a different console, if you like. 12 | 13 | # (If you use this, don't forget to add pry to your Gemfile!) 
module StreamLines
  module Reading
    # Enumerates a remote JSON Lines document, yielding one parsed JSON value
    # per streamed line.
    #
    # Lines are read through Stream, so rows are parsed and yielded while the
    # download is still in progress instead of after the whole body has been
    # fetched.
    class JSONLines
      include Enumerable

      # @param url [String] location of the JSON Lines document
      # @param encoding [Encoding, String] passed through to Stream
      # @param json_options [Hash] any remaining keywords are handed to
      #   JSON.parse for every line (e.g. +symbolize_names: true+)
      def initialize(url, encoding: Encoding.default_external, **json_options)
        @url = url
        @json_options = json_options
        @stream = Stream.new(url, encoding: encoding)
      end

      # Yields the parsed value of every line of the stream.
      #
      # @yieldparam row [Object] result of JSON.parse for one line
      # @return [Enumerator] when called without a block
      # @raise [JSON::ParserError] when a line is not valid JSON
      def each(&block)
        # Follow the Enumerable convention instead of failing with a
        # NoMethodError on nil when no block is supplied.
        return enum_for(:each) unless block

        @stream.each { |line| block.call(parse_line(line)) }
      end

      private

      attr_reader :url

      # Parses a single line using the options captured at construction.
      def parse_line(line)
        JSON.parse(line, **@json_options)
      end
    end
  end
end
StreamingApi 6 | module Helpers 7 | def run_streaming_api 8 | WebMock.disable! 9 | 10 | pid = Process.fork { start_server } 11 | sleep 1 12 | 13 | yield 14 | 15 | Process.kill('TERM', pid) 16 | WebMock.enable! 17 | end 18 | 19 | def start_server 20 | options = { 21 | Host: '127.0.0.1', 22 | Port: '4567' 23 | } 24 | 25 | Rackup::Handler::WEBrick.run(StreamingApi::App, **options) do |server| 26 | %i[INT TERM].each { |sig| trap(sig) { server.stop } } 27 | end 28 | end 29 | 30 | def stream_data_url 31 | 'http://localhost:4567/stream_data' 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'bundler/setup' 4 | require 'byebug' 5 | require 'simplecov' 6 | SimpleCov.start 7 | 8 | require 'stream_lines' 9 | 10 | require 'awesome_print' 11 | require 'get_process_mem' 12 | require 'memory_profiler' 13 | require 'webmock/rspec' 14 | 15 | Dir[File.join(__dir__, 'support', '**', '*.rb')].sort.each { |f| require f } 16 | 17 | WebMock.disable_net_connect!(allow_localhost: true) 18 | 19 | RSpec.configure do |config| 20 | # Enable flags like --only-failures and --next-failure 21 | config.example_status_persistence_file_path = '.rspec_status' 22 | 23 | # Disable RSpec exposing methods globally on `Module` and `main` 24 | config.disable_monkey_patching! 25 | 26 | config.expect_with :rspec do |c| 27 | c.syntax = :expect 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /.github/workflows/style-check.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 
5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby 7 | 8 | name: Style Check 9 | 10 | on: 11 | push: 12 | branches: 13 | - main 14 | pull_request: 15 | branches: 16 | - '*' 17 | 18 | jobs: 19 | rubocop: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Ruby 24 | uses: ruby/setup-ruby@v1 25 | with: 26 | ruby-version: 3.3 27 | - name: Install dependencies 28 | run: bundle install 29 | - name: Run Rubocop 30 | run: bundle exec rubocop -D 31 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Joel Lubrano 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
module StreamLines
  module Reading
    # Enumerates a remote CSV document row by row while it is being downloaded.
    #
    # Every line produced by Stream is parsed on its own with ::CSV.parse_line,
    # so a quoted field that contains a line break is not supported.
    class CSV
      # NOTE: (jdlubrano)
      # I suspect that these options are not used terribly frequently, and each
      # would require additional logic in the #each method. Rather than
      # attempting to implement sensible solutions for these options, I am
      # choosing to explicitly ignore them until there is enough outcry to
      # support them.
      IGNORED_CSV_OPTIONS = %i[
        return_headers
        header_converters
        skip_lines
      ].freeze

      # Values of the +headers+ option that mean "the first line of the
      # document holds the headers".
      FIRST_ROW_HEADERS = [true, :first_row].freeze

      # Options that must not influence how the header line itself is split:
      # +headers+ would make the line consume itself, and field converters
      # apply to data fields rather than to header names.
      HEADER_ROW_EXCLUDED_OPTIONS = %i[headers converters unconverted_fields].freeze

      include Enumerable

      # @param url [String] location of the CSV document
      # @param csv_options [Hash] options for ::CSV.parse_line; keys may be
      #   Strings or Symbols, and the options listed in IGNORED_CSV_OPTIONS
      #   are dropped. +encoding+ is also used for the underlying Stream.
      def initialize(url, **csv_options)
        @url = url
        @csv_options = accepted_csv_options(csv_options)
        @first_row_headers = FIRST_ROW_HEADERS.include?(@csv_options[:headers])

        encoding = @csv_options[:encoding] || Encoding.default_external
        @stream = Stream.new(url, encoding: encoding)
      end

      # Yields every row of the document.
      #
      # Rows are Arrays unless headers are in use, in which case they are
      # CSV::Row objects. When the first line holds the headers it is consumed
      # and not yielded.
      #
      # @yieldparam row [Array, ::CSV::Row, nil] result of ::CSV.parse_line
      # @return [Enumerator] when called without a block
      def each(&block)
        return enum_for(:each) unless block

        # Kept local so that iterating never mutates the options captured at
        # construction; each enumeration re-reads the headers from the stream.
        row_options = @csv_options

        @stream.each_with_index do |line, index|
          if index.zero? && first_row_headers?
            row_options = @csv_options.merge(headers: parse_header_row(line))
            next
          end

          block.call(::CSV.parse_line(line, **row_options))
        end
      end

      private

      attr_reader :url

      def first_row_headers?
        @first_row_headers
      end

      # Splits the header line with the same dialect options (col_sep,
      # quote_char, ...) that are used for the data rows, so that documents
      # with a non-default separator get correctly split header names.
      def parse_header_row(first_line)
        dialect = @csv_options.reject { |key, _value| HEADER_ROW_EXCLUDED_OPTIONS.include?(key) }
        ::CSV.parse_line(first_line, **dialect)
      end

      # Normalizes option keys to Symbols and removes unsupported options.
      def accepted_csv_options(csv_options)
        csv_options.transform_keys(&:to_sym)
                   .reject { |key, _value| IGNORED_CSV_OPTIONS.include?(key) }
      end
    end
  end
end
streamed_rows }.to raise_error(JSON::ParserError) 55 | end 56 | end 57 | end 58 | 59 | context 'when the request to fetch the JSON lines fails' do 60 | before do 61 | WebMock.stub_request(:get, url).to_return(status: 404) 62 | end 63 | 64 | it 'raises an error' do 65 | expect { streamed_rows }.to raise_error(StreamLines::Error) 66 | end 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /stream_lines.gemspec: -------------------------------------------------------------------------------- 1 | lib = File.expand_path("lib", __dir__) 2 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 3 | require "stream_lines/version" 4 | 5 | Gem::Specification.new do |spec| 6 | spec.name = "stream_lines" 7 | spec.version = StreamLines::VERSION 8 | spec.authors = ["Joel Lubrano"] 9 | spec.email = ["joel.lubrano@gmail.com"] 10 | 11 | spec.summary = 'A utility to stream lines of a file over HTTP' 12 | spec.homepage = 'https://github.com/jdlubrano/stream_lines' 13 | spec.license = 'MIT' 14 | 15 | spec.metadata["homepage_uri"] = spec.homepage 16 | spec.metadata["source_code_uri"] = 'https://github.com/jdlubrano/stream_lines' 17 | spec.metadata["changelog_uri"] = 'https://github.com/jdlubrano/stream_lines/releases' 18 | 19 | # Specify which files should be added to the gem when it is released. 20 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 
# Defines $INPUT_RECORD_SEPARATOR; the global is nil unless this is loaded.
require 'English'

module StreamLines
  module Reading
    # Enumerates the lines of an HTTP response body while it is downloading.
    #
    # The body is requested with HTTParty's +stream_body+ option and arrives
    # in chunks. Complete lines are yielded as soon as they are available; the
    # trailing, possibly incomplete, line of a chunk is buffered and prepended
    # to the next chunk. Lines are yielded without their separator.
    class Stream
      include Enumerable
      include HTTParty

      # Have HTTParty raise for 4xx and 5xx responses; #each converts those
      # into StreamLines::Error.
      raise_on 400..599

      # @param url [String] location to GET
      # @param encoding [Encoding, String] encoding forced onto every chunk
      def initialize(url, encoding: Encoding.default_external)
        @url = url
        @encoding = encoding
        @buffer = String.new(encoding: @encoding)
      end

      # Yields every line of the response body.
      #
      # @yieldparam line [String] one line, without its separator
      # @return [Enumerator] when called without a block
      # @raise [StreamLines::Error] when HTTParty reports a failure
      def each(&block)
        return enum_for(:each) unless block

        # Text left over from an earlier enumeration (a final line without a
        # trailing separator, or a partial line after an early break) must not
        # be glued onto the first line of this download.
        reset_buffer
        stream_lines(&block)
      rescue HTTParty::Error => e
        raise Error, failure_message(e)
      end

      private

      attr_reader :url

      def reset_buffer
        @buffer = String.new(encoding: @encoding)
      end

      # Builds the message for StreamLines::Error. Not every HTTParty error
      # carries a response, so the status code is only mentioned when one is
      # available.
      def failure_message(error)
        response = error.response if error.respond_to?(:response)
        return "Failed to download #{url}: #{error.message}" if response.nil?

        "Failed to download #{url} with code: #{response.code}"
      end

      def stream_lines(&block)
        self.class.get(url, stream_body: true) do |chunk|
          extract_lines(chunk).each(&block)
        end

        # Whatever is still buffered is the last line of a body that did not
        # end with a separator.
        block.call(@buffer) if @buffer.size.positive?
      end

      # Returns the complete lines contained in the buffer plus +chunk+ and
      # keeps the trailing remainder buffered for the next chunk.
      def extract_lines(chunk)
        pending = @buffer + chunk.to_s.dup.force_encoding(@encoding)
        lines = split_lines(pending)
        reset_buffer
        @buffer << lines.pop.to_s

        lines
      end

      def split_lines(encoded_chunk)
        # A limit of -1 keeps a trailing empty string, which is how a chunk
        # ending in a separator is told apart from one ending mid-line.
        encoded_chunk.split($INPUT_RECORD_SEPARATOR, -1)
      rescue ArgumentError => e
        raise e unless /invalid byte sequence/.match?(e.message)

        # NOTE: (jdlubrano)
        # The last byte in the chunk is most likely a part of a multibyte
        # character that, on its own, is an invalid byte sequence. So, we
        # want to split the lines containing all valid bytes and make the
        # trailing bytes the last line. The last line eventually gets added
        # to the buffer, prepended to the next chunk, and, hopefully, restores
        # a valid byte sequence.
        last_newline_index = encoded_chunk.rindex($INPUT_RECORD_SEPARATOR)
        return [encoded_chunk] if last_newline_index.nil?

        valid_lines = encoded_chunk[0...last_newline_index].split($INPUT_RECORD_SEPARATOR, -1)
        valid_lines + [encoded_chunk[(last_newline_index + 1)..-1]].compact
      end
    end
  end
end
38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at joel.lubrano@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /spec/reading/csv_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'stream_lines' 4 | 5 | RSpec.describe StreamLines::Reading::CSV do 6 | let(:url) { 'https://test.stream_lines.com' } 7 | let(:csv) { described_class.new(url) } 8 | 9 | it { expect(csv).to be_an(Enumerable) } 10 | 11 | describe '#each' do 12 | let(:csv_content) do 13 | <<~CSV 14 | foo,bar 15 | 1,2 16 | 3,4 17 | CSV 18 | end 19 | 20 | subject(:streamed_rows) do 21 | [].tap do |rows| 22 | csv.each { |row| rows << row } 23 | end 24 | end 25 | 26 | context 'when the request to fetch the CSV succeeds' do 27 | let(:csv) { described_class.new(url) } 28 | 29 | before do 30 | WebMock.stub_request(:get, url) 31 | .to_return(status: 200, body: csv_content) 32 | end 33 | 34 | context 'when the headers option is false' do 35 | it 'yields Arrays' do 36 | expect(streamed_rows).to all be_an(Array) 37 | end 38 | 39 | it 'returns the headers as the first row' do 40 | expect(streamed_rows.first).to eq(%w[foo bar]) 41 | end 42 | 43 | it 'correctly yields the all of the data' do 44 | expect(streamed_rows).to eq([%w[foo bar], 45 | %w[1 2], 46 | %w[3 4]]) 47 | end 48 | end 49 | 50 | context 'when the headers option is true' do 51 | let(:csv) { described_class.new(url, headers: true) } 52 | 53 | it 'yields CSV::Rows' do 54 | expect(streamed_rows).to all be_a(CSV::Row) 55 | end 56 | 57 | it 'uses the first row as the headers' do 58 | expect(streamed_rows.first.headers).to eq(%w[foo bar]) 59 | end 60 | 61 | it 'correctly yields all of the data' do 
62 | expect(streamed_rows.map(&:to_h)).to eq([{ 'foo' => '1', 'bar' => '2' }, 63 | { 'foo' => '3', 'bar' => '4' }]) 64 | end 65 | 66 | it 'correctly yields all of the data' do 67 | stream = described_class.new(url, headers: true) 68 | 69 | cloud = [] 70 | cloud << stream.first.headers.to_csv 71 | stream.each do |row| 72 | cloud << row.fields.to_csv 73 | end 74 | expect(cloud).to eq ["foo,bar\n", "1,2\n", "3,4\n"] 75 | end 76 | end 77 | 78 | context 'when the headers are provided as an array' do 79 | let(:csv) { described_class.new(url, headers: headers) } 80 | let(:headers) { %w[column_1 column_2] } 81 | 82 | it 'yields CSV::Rows' do 83 | expect(streamed_rows).to all be_a(CSV::Row) 84 | end 85 | 86 | it 'yields the first row with the given headers' do 87 | expect(streamed_rows.first.to_h).to eq('column_1' => 'foo', 'column_2' => 'bar') 88 | end 89 | 90 | it 'correctly yields all of the data' do 91 | expect(streamed_rows.map(&:to_h)).to eq([{ 'column_1' => 'foo', 'column_2' => 'bar' }, 92 | { 'column_1' => '1', 'column_2' => '2' }, 93 | { 'column_1' => '3', 'column_2' => '4' }]) 94 | end 95 | end 96 | 97 | context 'when converters are provided' do 98 | let(:csv) { described_class.new(url, converters: [:integer]) } 99 | 100 | it 'converts all of the data' do 101 | expect(streamed_rows).to eq([%w[foo bar], 102 | [1, 2], 103 | [3, 4]]) 104 | end 105 | end 106 | end 107 | 108 | context 'when the request to fetch the CSV fails' do 109 | before do 110 | WebMock.stub_request(:get, url).to_return(status: 404) 111 | end 112 | 113 | it 'raises an error' do 114 | expect { streamed_rows }.to raise_error(StreamLines::Error) 115 | end 116 | end 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 
3 | specs: 4 | stream_lines (0.4.1) 5 | httparty (~> 0.14) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | addressable (2.8.7) 11 | public_suffix (>= 2.0.2, < 7.0) 12 | ast (2.4.3) 13 | awesome_print (1.9.2) 14 | base64 (0.3.0) 15 | bigdecimal (3.3.1) 16 | bundler-audit (0.9.3) 17 | bundler (>= 1.2.0) 18 | thor (~> 1.0) 19 | bundler-gem_version_tasks (0.2.1) 20 | byebug (12.0.0) 21 | charlock_holmes (0.7.9) 22 | crack (1.0.1) 23 | bigdecimal 24 | rexml 25 | csv (3.3.5) 26 | diff-lcs (1.6.2) 27 | docile (1.4.1) 28 | ffi (1.17.2) 29 | get_process_mem (1.0.0) 30 | bigdecimal (>= 2.0) 31 | ffi (~> 1.0) 32 | hashdiff (1.2.1) 33 | httparty (0.23.2) 34 | csv 35 | mini_mime (>= 1.0.0) 36 | multi_xml (>= 0.5.2) 37 | json (2.15.2) 38 | language_server-protocol (3.17.0.5) 39 | lint_roller (1.1.0) 40 | logger (1.7.0) 41 | memory_profiler (1.1.0) 42 | mini_mime (1.1.5) 43 | multi_json (1.17.0) 44 | multi_xml (0.7.2) 45 | bigdecimal (~> 3.1) 46 | mustermann (3.0.4) 47 | ruby2_keywords (~> 0.0.1) 48 | parallel (1.27.0) 49 | parser (3.3.10.0) 50 | ast (~> 2.4.1) 51 | racc 52 | prism (1.6.0) 53 | public_suffix (6.0.2) 54 | racc (1.8.1) 55 | rack (3.2.4) 56 | rack-protection (4.2.1) 57 | base64 (>= 0.1.0) 58 | logger (>= 1.6.0) 59 | rack (>= 3.0.0, < 4) 60 | rack-session (2.1.1) 61 | base64 (>= 0.1.0) 62 | rack (>= 3.0.0) 63 | rackup (2.3.1) 64 | rack (>= 3) 65 | rainbow (3.1.1) 66 | rake (13.3.1) 67 | regexp_parser (2.11.3) 68 | rexml (3.4.4) 69 | rspec (3.13.2) 70 | rspec-core (~> 3.13.0) 71 | rspec-expectations (~> 3.13.0) 72 | rspec-mocks (~> 3.13.0) 73 | rspec-core (3.13.6) 74 | rspec-support (~> 3.13.0) 75 | rspec-expectations (3.13.5) 76 | diff-lcs (>= 1.2.0, < 2.0) 77 | rspec-support (~> 3.13.0) 78 | rspec-mocks (3.13.7) 79 | diff-lcs (>= 1.2.0, < 2.0) 80 | rspec-support (~> 3.13.0) 81 | rspec-support (3.13.6) 82 | rubocop (1.81.7) 83 | json (~> 2.3) 84 | language_server-protocol (~> 3.17.0.2) 85 | lint_roller (~> 1.1.0) 86 | parallel (~> 1.10) 87 | parser (>= 
3.3.0.2) 88 | rainbow (>= 2.2.2, < 4.0) 89 | regexp_parser (>= 2.9.3, < 3.0) 90 | rubocop-ast (>= 1.47.1, < 2.0) 91 | ruby-progressbar (~> 1.7) 92 | unicode-display_width (>= 2.4.0, < 4.0) 93 | rubocop-ast (1.47.1) 94 | parser (>= 3.3.7.2) 95 | prism (~> 1.4) 96 | ruby-progressbar (1.13.0) 97 | ruby2_keywords (0.0.5) 98 | simplecov (0.22.0) 99 | docile (~> 1.1) 100 | simplecov-html (~> 0.11) 101 | simplecov_json_formatter (~> 0.1) 102 | simplecov-html (0.13.2) 103 | simplecov_json_formatter (0.1.4) 104 | sinatra (4.2.1) 105 | logger (>= 1.6.0) 106 | mustermann (~> 3.0) 107 | rack (>= 3.0.0, < 4) 108 | rack-protection (= 4.2.1) 109 | rack-session (>= 2.0.0, < 3) 110 | tilt (~> 2.0) 111 | sinatra-contrib (4.2.1) 112 | multi_json (>= 0.0.2) 113 | mustermann (~> 3.0) 114 | rack-protection (= 4.2.1) 115 | sinatra (= 4.2.1) 116 | tilt (~> 2.0) 117 | thor (1.4.0) 118 | tilt (2.6.1) 119 | unicode-display_width (3.2.0) 120 | unicode-emoji (~> 4.1) 121 | unicode-emoji (4.1.0) 122 | webmock (3.26.1) 123 | addressable (>= 2.8.0) 124 | crack (>= 0.3.2) 125 | hashdiff (>= 0.4.0, < 2.0.0) 126 | webrick (1.9.2) 127 | 128 | PLATFORMS 129 | ruby 130 | 131 | DEPENDENCIES 132 | awesome_print (~> 1.8) 133 | bundler (~> 2.0) 134 | bundler-audit 135 | bundler-gem_version_tasks 136 | byebug 137 | charlock_holmes 138 | get_process_mem 139 | memory_profiler 140 | rackup 141 | rake (~> 13.0) 142 | rspec (~> 3.0) 143 | rubocop (~> 1.81.1) 144 | simplecov (~> 0.17) 145 | sinatra (~> 4.0) 146 | sinatra-contrib (~> 4.0) 147 | stream_lines! 
148 | webmock (~> 3.0) 149 | webrick (~> 1.7) 150 | 151 | BUNDLED WITH 152 | 2.5.6 153 | -------------------------------------------------------------------------------- /spec/reading/stream_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'rack' 4 | require 'stream_lines/reading/stream' 5 | 6 | RSpec.describe StreamLines::Reading::Stream do 7 | let(:url) { 'https://test.stream_lines.com' } 8 | let(:stream) { described_class.new(url) } 9 | 10 | it { expect(stream).to be_an(Enumerable) } 11 | 12 | describe '#each' do 13 | subject(:streamed_lines) do 14 | [].tap do |lines| 15 | stream.each { |line| lines << line } 16 | end 17 | end 18 | 19 | context 'when the content is multiple lines less than the chunk size' do 20 | before do 21 | allow(described_class).to receive(:get).and_yield("foo\nbar") 22 | end 23 | 24 | it 'calls the block with each line' do 25 | expect(streamed_lines).to eq(%w[foo bar]) 26 | end 27 | end 28 | 29 | context 'when the content is all 1 line, but multiple chunks' do 30 | before do 31 | allow(described_class).to receive(:get).and_yield('a' * 100).and_yield('a' * 100) 32 | end 33 | 34 | it 'calls the block with the 1 line' do 35 | expect(streamed_lines).to eq(['a' * 200]) 36 | end 37 | end 38 | 39 | context 'when a chunk ends with a newline' do 40 | before do 41 | allow(described_class) 42 | .to receive(:get) 43 | .and_yield("foo\nbar\n") 44 | .and_yield('baz') 45 | end 46 | 47 | it 'correctly considers the trailing newline to create a separate, empty chunk' do 48 | expect(streamed_lines).to eq(%w[foo bar baz]) 49 | end 50 | end 51 | 52 | context 'when the content ends with a newline' do 53 | before do 54 | allow(described_class) 55 | .to receive(:get) 56 | .and_yield('foobar') 57 | .and_yield("baz\n") 58 | end 59 | 60 | it 'calls the block with the content in the correct order' do 61 | expect(streamed_lines).to eq(['foobarbaz']) 62 | end 63 | end 64 | 65 | 
context 'when a chunk starts with a newline' do 66 | before do 67 | allow(described_class).to receive(:get).and_yield("\nfoo") 68 | end 69 | 70 | it 'calls the block with the empty string from the leading newline' do 71 | expect(streamed_lines).to eq(['', 'foo']) 72 | end 73 | end 74 | 75 | context 'when a chunk contains consecutive newline characters' do 76 | before do 77 | allow(described_class).to receive(:get).and_yield("foo\n\nbar") 78 | end 79 | 80 | it 'calls the block with the empty string from the leading newline' do 81 | expect(streamed_lines).to eq(['foo', '', 'bar']) 82 | end 83 | end 84 | 85 | context 'when the chunk splits a UTF-8 string such that an invalid byte sequence is created' do 86 | context 'no newlines' do 87 | before do 88 | content = 'Hello™ World' 89 | bytes = content.bytes 90 | 91 | allow(described_class) 92 | .to receive(:get) 93 | .and_yield(bytes[0..6].pack('c*')) 94 | .and_yield(bytes[7..-1].pack('c*')) 95 | end 96 | 97 | it 'reassembles valid byte sequences' do 98 | expect(streamed_lines).to eq(['Hello™ World']) 99 | end 100 | end 101 | 102 | context 'with newlines' do 103 | before do 104 | content = <<~CONTENT 105 | Hello™, World 106 | Hello™ again, World 107 | Hello™ one last time, World 108 | CONTENT 109 | 110 | bytes = content.bytes 111 | 112 | allow(described_class) 113 | .to receive(:get) 114 | .and_yield(bytes[0..22].pack('c*')) 115 | .and_yield(bytes[23..-1].pack('c*')) 116 | end 117 | 118 | it 'reassembles valid byte sequences' do 119 | expect(streamed_lines).to eq(['Hello™, World', 120 | 'Hello™ again, World', 121 | 'Hello™ one last time, World']) 122 | end 123 | end 124 | end 125 | 126 | context 'when the GET request fails' do 127 | let(:url) { 'https://test.stream_lines.com/fail' } 128 | 129 | before { stub_request(:get, url).to_return(status: 403) } 130 | 131 | it 'raises a StreamLines::Error' do 132 | expect { stream.each.to_a } 133 | .to raise_error StreamLines::Error, "Failed to download #{url} with code: 403" 134 | 
end 135 | end 136 | 137 | context 'memory efficiency' do 138 | include StreamingApi::Helpers 139 | 140 | let(:url) { stream_data_url } 141 | 142 | around do |ex| 143 | run_streaming_api { ex.run } 144 | end 145 | 146 | it 'can stream large files without using too much memory' do 147 | max_memory_usage = baseline_memory_usage = GetProcessMem.new.mb 148 | 149 | stream.each do |_line| 150 | max_memory_usage = [max_memory_usage, GetProcessMem.new.mb].max 151 | end 152 | 153 | expect(max_memory_usage - baseline_memory_usage).to be <= 20 154 | end 155 | end 156 | end 157 | end 158 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StreamLines 2 | 3 | [![Gem Version](https://badge.fury.io/rb/stream_lines.svg)](https://badge.fury.io/rb/stream_lines) 4 | [![Tests (Ruby)](https://github.com/jdlubrano/stream_lines/actions/workflows/ruby-tests.yml/badge.svg)](https://github.com/jdlubrano/stream_lines/actions/workflows/ruby-tests.yml) 5 | 6 | An API for streaming files from remote locations one line at a time. 7 | 8 | ## Background 9 | 10 | Some applications run in production environments without a writable file system; 11 | usually this is a security measure. Furthermore, with the proliferation of 12 | container-based production environments, containers may not have access to 13 | tremendous amounts of memory. Thus, it can be impossible to read large files 14 | unless you read the file into memory in small doses. A common pattern is to 15 | use a line-delimited file like [JSON Lines](http://jsonlines.org) or a CSV 16 | and to read the file one line at a time in order to iterate over a dataset. 17 | This gem aims to provide an [Enumerable](https://ruby-doc.org/core-2.7.0/Enumerable.html) 18 | interface for iterating over remote, line-delimited datasets. 
19 | 20 | ## Installation 21 | 22 | Add this line to your application's Gemfile: 23 | 24 | ```ruby 25 | gem 'stream_lines' 26 | ``` 27 | 28 | And then execute: 29 | 30 | $ bundle 31 | 32 | Or install it yourself as: 33 | 34 | $ gem install stream_lines 35 | 36 | ## Usage 37 | 38 | ### Reading 39 | 40 | #### From a URL 41 | 42 | ```ruby 43 | url = 'https://my.remote.file/file.txt' 44 | stream = StreamLines::Reading::Stream.new(url) 45 | 46 | stream.each do |line| 47 | # Do something with the line of data (the line will be a String) 48 | end 49 | 50 | # A StreamLines::Reading::Stream object is Enumerable, so you can also use 51 | # any Enumerable methods. 52 | 53 | stream.each_slice(100) do |lines| 54 | # Do something with the 100 lines of data 55 | end 56 | ``` 57 | 58 | ##### Caveats 59 | 60 | This library strives to provide streamed data via an `Enumerable` interface. 61 | In order to be memory-efficient, however, each time the stream is iterated over, 62 | a new GET request is made to fetch the data from its remote URL. For example, 63 | 64 | ```ruby 65 | url = 'https://my.remote.file/file.txt' 66 | stream = StreamLines::Reading::Stream.new(url) 67 | do_something_with_first_row(stream.first) # GET request made 68 | 69 | stream.each do |line| # same GET request made 70 | # Do something with the line of data (the line will be a String) 71 | end 72 | ``` 73 | 74 | makes two GET requests. The call to `first` makes a GET request to fetch 75 | the first row of data. The subsequent call to `each` makes the same GET 76 | request. To avoid unnecessary requests, I recommend a slightly different 77 | approach, which may not be intuitive but does make only one network request: 78 | 79 | ```ruby 80 | url = 'https://my.remote.file/file.txt' 81 | stream = StreamLines::Reading::Stream.new(url) 82 | 83 | stream.each_with_index do |line, i| 84 | do_something_with_first_row(line) if i.zero? 
85 | # Do something with the line of data (the line will be a String) 86 | end 87 | ``` 88 | 89 | ##### CSVs 90 | 91 | This gem provides first-class support for streaming CSVs from a remote URL. 92 | 93 | ```ruby 94 | url = 'https://my.remote.file/file.csv' 95 | stream = StreamLines::Reading::CSV.new(url) 96 | 97 | stream.each do |row| 98 | # each row will be an Array 99 | end 100 | 101 | # Supports most Ruby CSV options (see ignored options below) 102 | stream = StreamLines::Reading::CSV.new(url, headers: true) 103 | 104 | stream.each do |row| 105 | # each row will be a CSV::Row object that you can access like row['column_name'] 106 | end 107 | ``` 108 | 109 | Most options that you can pass to 110 | [Ruby's CSV library](https://ruby-doc.org/stdlib-2.6.1/libdoc/csv/rdoc/CSV.html#method-c-new) 111 | are supported; however, the following options are explicitly ignored: 112 | 113 | * `return_headers` 114 | * `header_converters` 115 | * `skip_lines` 116 | 117 | I suspect that these options are not used terribly frequently, and each would 118 | require additional logic in the `StreamLines::Reading::CSV#each` method. 119 | Rather than attempting to implement sensible solutions for these options, I am 120 | choosing to explicitly ignore them until there is enough outcry to support them. 121 | 122 | ##### JSON Lines/Streaming JSON 123 | 124 | This gem provides first-class support for streaming 125 | [JSON lines](http://jsonlines.org) from a remote URL. 126 | 127 | ```ruby 128 | url = 'https://my.remote.file/file.jsonl' 129 | stream = StreamLines::Reading::JSONLines.new(url) 130 | 131 | stream.each do |row| 132 | # each row will be a Hash 133 | end 134 | 135 | # Supports all Ruby JSON::parse options 136 | stream = StreamLines::Reading::JSONLines.new(url, symbolize_names: true) 137 | 138 | stream.each do |row| 139 | # each row will be a Hash with Symbol keys 140 | end 141 | ``` 142 | 143 | ## Development 144 | 145 | After checking out the repo, run `bin/setup` to install dependencies. 
146 | Then, run `rake spec` to run the tests. You can also run `bin/console` for an 147 | interactive prompt that will allow you to experiment. 148 | 149 | To install this gem onto your local machine, run `bundle exec rake install`. 150 | 151 | ## Releasing 152 | 153 | After merging in the new functionality to the main branch: 154 | 155 | ``` 156 | git checkout main 157 | git pull --prune 158 | bundle exec rake version:bump:<major|minor|patch> 159 | bundle exec rubocop -a 160 | git commit -a --amend 161 | bundle exec rake release 162 | ``` 163 | 164 | ## Contributing 165 | 166 | Bug reports and pull requests are welcome on GitHub at 167 | https://github.com/jdlubrano/stream_lines. This project is intended to be a 168 | safe, welcoming space for collaboration, and contributors are expected to 169 | adhere to the [code of conduct](https://github.com/jdlubrano/stream_lines/blob/main/CODE_OF_CONDUCT.md). 170 | 171 | ## License 172 | 173 | The gem is available as open source under the terms of the 174 | [MIT License](https://opensource.org/licenses/MIT). 175 | --------------------------------------------------------------------------------