├── .gitignore ├── .rspec ├── .travis.yml ├── Gemfile ├── README.md ├── Rakefile ├── archive_io.gemspec ├── bin └── console ├── lib ├── archive_io.rb └── archive_io │ ├── archive_reader.rb │ ├── file_reader.rb │ ├── header.rb │ ├── lib_archive.rb │ ├── version.rb │ └── wildcard_pattern.rb └── spec ├── acceptance ├── basic_spec.rb └── nokogiri_spec.rb ├── fixtures ├── archive.zip └── unarchived_file.txt ├── spec_helper.rb └── unit ├── archive_reader_spec.rb ├── lib_archive_spec.rb └── wildcard_pattern_spec.rb /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | /spec/examples.txt 11 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --require spec_helper 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.3.1 4 | before_install: 5 | - sudo apt-get -qq update 6 | - sudo apt-get install -y libarchive-dev 7 | cache: bundler 8 | script: bundle exec rspec 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in archive_io.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ArchiveIO 2 | [![Gem Version](https://badge.fury.io/rb/archive_io.svg)](https://badge.fury.io/rb/archive_io) 3 | [![Build Status](https://travis-ci.org/AMekss/archive_io.svg?branch=master)](https://travis-ci.org/AMekss/archive_io) 4 | 5 | Library which can traverse archived file (using [libarchive](http://www.libarchive.org/) under the hood) and yields IO like object on each file entry inside it for further streamline processing. Stress-tested via processing 500 MB big archive (~10GB big xml file when uncompressed) and it worked quite good. No memory peaks during the process were observed. 6 | 7 | **Note:** [libarchive](http://www.libarchive.org/) have to be pre-installed and available on the host system 8 | 9 | ## Installation 10 | 11 | Add this line to your application's Gemfile: 12 | 13 | ```ruby 14 | gem 'archive_io' 15 | ``` 16 | 17 | And then execute: 18 | 19 | $ bundle 20 | 21 | Or install it yourself as: 22 | 23 | $ gem install archive_io 24 | 25 | ## Usage 26 | 27 | Simple usage: 28 | 29 | ```ruby 30 | archive = ArchiveIO.open("archive.7z") 31 | archive.each do |cursor| 32 | puts cursor.pathname # prints out pathname inside archive 33 | puts cursor.read(10) # prints out beginning of each file 34 | end 35 | archive.close 36 | ``` 37 | 38 | This library can come in handy if you want to process huge xml files reading straight from the archive without uncompressing it and works nicely together with `Nokogiri::XML::Reader` and can be used as follows: 39 | 40 | ```ruby 41 | archive = ArchiveIO.open("archive.7z") 42 | archive.select("*.xml") do |cursor| 43 | Nokogiri::XML::Reader(cursor).each do |xml_node| 44 | # your custom xml processing logic goes here 45 | end 46 | end 47 | archive.close 48 | ``` 49 | 50 | ## Development 51 | 52 | After checking out the repo, run `bundle install` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 53 | 54 | To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). 55 | 56 | ## Contributing 57 | 58 | Bug reports and pull requests are welcome on GitHub at https://github.com/AMekss/archive_io. 59 | 60 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | task :default => :spec 3 | -------------------------------------------------------------------------------- /archive_io.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'archive_io/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "archive_io" 8 | spec.version = ArchiveIO::VERSION 9 | spec.authors = ["Artūrs Mekšs"] 10 | spec.email = ["arturs.mekss@gmail.com"] 11 | 12 | spec.summary = %q{Library which can traverse archived file and yields io like object on each file in it.} 13 | spec.description = %q{Library which can traverse archived file (using `libarchive` under the hood) and yield io like object on each file entry inside it for further processing.} 14 | spec.homepage = "https://github.com/AMekss/archive_io" 15 | 16 | spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 17 | spec.bindir = "exe" 18 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_dependency "ffi", "~> 1.9" 22 | 23 | spec.add_development_dependency "bundler", "~> 1.11" 24 | spec.add_development_dependency "rake", "~> 10.0" 25 | spec.add_development_dependency "rspec", "~> 3.5" 26 | spec.add_development_dependency "pry" 27 | spec.add_development_dependency "nokogiri" 28 | end 29 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "archive_io" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | require "pry" 10 | Pry.start 11 | -------------------------------------------------------------------------------- /lib/archive_io.rb: -------------------------------------------------------------------------------- 1 | require "archive_io/version" 2 | require "archive_io/archive_reader" 3 | 4 | module ArchiveIO 5 | def self.open(filename) 6 | reader = ArchiveIO::ArchiveReader.new 7 | reader.open(filename) 8 | reader 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/archive_io/archive_reader.rb: -------------------------------------------------------------------------------- 1 | require "archive_io/lib_archive" 2 | require "archive_io/header" 3 | require "archive_io/file_reader" 4 | require "archive_io/wildcard_pattern" 5 | 6 | module ArchiveIO 7 | class ArchiveReader 8 | HEADER_HANDLING_STRATEGIES = { 9 | LibArchive::OK => -> (pointer) { Header.new(pointer) }, 10 | LibArchive::EOF => -> (_) { nil }, 11 | }.freeze 12 | 13 | def initialize 14 | ObjectSpace.define_finalizer(self, &method(:close)) 15 | end 16 | 17 | def open(filename) 18 | close 19 | 20 | @archive = LibArchive::archive_read_new 21 | @filename = filename 22 | 23 | assert_operation_valid(LibArchive::archive_read_support_compression_all(archive)) 24 | assert_operation_valid(LibArchive::archive_read_support_format_all(archive)) 25 | assert_operation_valid(LibArchive::archive_read_open_filename(archive, filename, 10240)) 26 | end 27 | 28 | def each 29 | raise LibArchive::NoArchiveError, "Archive is not open, call open with filename on archive before this operation" unless archive 30 | 31 | while header = next_header 32 | yield FileReader.new(header.pathname, archive) if header.file? 33 | end 34 | 35 | open(@filename) #reopen file for the next call 36 | end 37 | 38 | def select(pattern) 39 | wildcard = WildcardPattern.new(pattern) 40 | 41 | each do |file| 42 | yield file if wildcard.match?(file.pathname) 43 | end 44 | end 45 | 46 | def close 47 | LibArchive::archive_read_finish(archive) if archive 48 | ensure 49 | @archive = nil 50 | @filename = nil 51 | end 52 | 53 | private 54 | 55 | attr_reader :archive 56 | 57 | def header_handler(code) 58 | HEADER_HANDLING_STRATEGIES.fetch(code) { raise LibArchive::Error, archive } 59 | end 60 | 61 | def next_header 62 | header = LibArchive::archive_entry_new 63 | header_handler(LibArchive::archive_read_next_header(archive, header)).call(header.read_pointer) 64 | end 65 | 66 | def assert_operation_valid(operation_result_code) 67 | raise LibArchive::Error, archive if operation_result_code != LibArchive::OK 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/archive_io/file_reader.rb: -------------------------------------------------------------------------------- 1 | require "archive_io/lib_archive" 2 | 3 | module ArchiveIO 4 | class FileReader 5 | attr_reader :pathname 6 | 7 | def initialize(pathname, archive) 8 | @pathname = pathname 9 | @archive = archive 10 | end 11 | 12 | def read(size) 13 | buffer = FFI::MemoryPointer.new(:char, size) 14 | return nil if LibArchive::archive_read_data(@archive, buffer, size) < 0 15 | buffer.get_string(0) 16 | end 17 | 18 | def close 19 | # does nothing, purpose of having it is to support IO like interface 20 | end 21 | end 22 | private_constant :FileReader 23 | end 24 | -------------------------------------------------------------------------------- /lib/archive_io/header.rb: -------------------------------------------------------------------------------- 1 | require "archive_io/lib_archive" 2 | 3 | module ArchiveIO 4 | class Header 5 | S_IFMT = 0170000 6 | S_IFREG = 0100000 # regular file 7 | 8 | def initialize(pointer) 9 | @pointer = pointer 10 | end 11 | 12 | def file? 13 | LibArchive::archive_entry_filetype(@pointer) & S_IFMT == S_IFREG 14 | end 15 | 16 | def pathname 17 | LibArchive::archive_entry_pathname(@pointer) 18 | end 19 | end 20 | private_constant :Header 21 | end 22 | -------------------------------------------------------------------------------- /lib/archive_io/lib_archive.rb: -------------------------------------------------------------------------------- 1 | require "ffi" 2 | 3 | module ArchiveIO 4 | module LibArchive 5 | extend FFI::Library 6 | 7 | OK = 0 8 | EOF = 1 9 | 10 | ffi_lib ["archive", "libarchive.so.2"] 11 | 12 | attach_function :archive_version_string, [], :string 13 | attach_function :archive_read_open_filename, [:pointer, :string, :size_t], :int 14 | attach_function :archive_read_support_compression_all, [:pointer], :int 15 | attach_function :archive_read_support_format_all, [:pointer], :int 16 | attach_function :archive_error_string, [:pointer], :string 17 | attach_function :archive_read_new, [], :pointer 18 | attach_function :archive_read_finish, [:pointer], :int 19 | attach_function :archive_read_header_position, [:pointer], :int 20 | attach_function :archive_read_next_header, [:pointer, :pointer], :int 21 | attach_function :archive_read_data, [:pointer, :pointer, :size_t], :size_t 22 | 23 | # entry 24 | attach_function :archive_entry_new, [], :pointer 25 | attach_function :archive_entry_filetype, [:pointer], :mode_t 26 | attach_function :archive_entry_pathname, [:pointer], :string 27 | 28 | class NoArchiveError < StandardError; end 29 | class Error < StandardError 30 | def initialize(archive) 31 | super "#{LibArchive::archive_error_string(archive)}" 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/archive_io/version.rb: -------------------------------------------------------------------------------- 1 | module ArchiveIO 2 | VERSION = "0.1.0" 3 | end 4 | -------------------------------------------------------------------------------- /lib/archive_io/wildcard_pattern.rb: -------------------------------------------------------------------------------- 1 | module ArchiveIO 2 | class WildcardPattern 3 | def initialize(expression) 4 | escaped = Regexp.escape(expression).gsub('\*','.*?') 5 | @regex = Regexp.new "^#{escaped}$", Regexp::IGNORECASE 6 | end 7 | 8 | def match?(str) 9 | !!(str =~ @regex) 10 | end 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /spec/acceptance/basic_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe ArchiveIO do 2 | context "with archived file" do 3 | it "reads archived content" do 4 | results = [] 5 | 6 | archive = ArchiveIO.open("spec/fixtures/archive.zip") 7 | archive.each do |file| 8 | results << file.pathname 9 | results << file.read(10) 10 | end 11 | archive.close 12 | 13 | expect(results).to eq([ 14 | "archive/data_file.txt", 15 | "Archived t", 16 | "archive/xml/customers.xml", 17 | " "be bigger than 2 and smaller than 4" 31 | # ...rather than: 32 | # # => "be bigger than 2" 33 | expectations.include_chain_clauses_in_custom_matcher_descriptions = true 34 | end 35 | 36 | # rspec-mocks config goes here. You can use an alternate test double 37 | # library (such as bogus or mocha) by changing the `mock_with` option here. 38 | config.mock_with :rspec do |mocks| 39 | # Prevents you from mocking or stubbing a method that does not exist on 40 | # a real object. This is generally recommended, and will default to 41 | # `true` in RSpec 4. 42 | mocks.verify_partial_doubles = true 43 | end 44 | 45 | # This option will default to `:apply_to_host_groups` in RSpec 4 (and will 46 | # have no way to turn it off -- the option exists only for backwards 47 | # compatibility in RSpec 3). It causes shared context metadata to be 48 | # inherited by the metadata hash of host groups and examples, rather than 49 | # triggering implicit auto-inclusion in groups with matching metadata. 50 | config.shared_context_metadata_behavior = :apply_to_host_groups 51 | 52 | # This allows you to limit a spec run to individual examples or groups 53 | # you care about by tagging them with `:focus` metadata. When nothing 54 | # is tagged with `:focus`, all examples get run. RSpec also provides 55 | # aliases for `it`, `describe`, and `context` that include `:focus` 56 | # metadata: `fit`, `fdescribe` and `fcontext`, respectively. 57 | config.filter_run_when_matching :focus 58 | 59 | # Allows RSpec to persist some state between runs in order to support 60 | # the `--only-failures` and `--next-failure` CLI options. We recommend 61 | # you configure your source control system to ignore this file. 62 | config.example_status_persistence_file_path = "spec/examples.txt" 63 | 64 | # Limits the available syntax to the non-monkey patched syntax that is 65 | # recommended. For more details, see: 66 | # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ 67 | # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ 68 | # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode 69 | config.disable_monkey_patching! 70 | 71 | # This setting enables warnings. It's recommended, but in some cases may 72 | # be too noisy due to issues in dependencies. 73 | config.warnings = false 74 | 75 | # Many RSpec users commonly either run the entire suite or an individual 76 | # file, and it's useful to allow more verbose output when running an 77 | # individual spec file. 78 | if config.files_to_run.one? 79 | # Use the documentation formatter for detailed output, 80 | # unless a formatter has already been configured 81 | # (e.g. via a command-line flag). 82 | config.default_formatter = 'doc' 83 | end 84 | 85 | # Print the 10 slowest examples and example groups at the 86 | # end of the spec run, to help surface which specs are running 87 | # particularly slow. 88 | config.profile_examples = 10 89 | 90 | # Run specs in random order to surface order dependencies. If you find an 91 | # order dependency and want to debug it, you can fix the order by providing 92 | # the seed, which is printed after each run. 93 | # --seed 1234 94 | config.order = :random 95 | 96 | # Seed global randomization in this process using the `--seed` CLI option. 97 | # Setting this allows you to use `--seed` to deterministically reproduce 98 | # test failures related to randomization by passing the same `--seed` value 99 | # as the one that triggered the failure. 100 | Kernel.srand config.seed 101 | end 102 | -------------------------------------------------------------------------------- /spec/unit/archive_reader_spec.rb: -------------------------------------------------------------------------------- 1 | module ArchiveIO 2 | RSpec.describe ArchiveReader do 3 | subject(:reader) { described_class.new } 4 | let(:results) { [] } 5 | before { reader.open("spec/fixtures/archive.zip") } 6 | 7 | describe "#each" do 8 | it "iterates through the archive multiple times" do 9 | reader.each { |file| results << file.pathname } 10 | reader.each { |file| results << file.pathname } 11 | 12 | expect(results).to eq [ 13 | "archive/data_file.txt", 14 | "archive/xml/customers.xml", 15 | "archive/data_file.txt", 16 | "archive/xml/customers.xml", 17 | ] 18 | end 19 | end 20 | 21 | describe "#select" do 22 | it "selects files by pattern" do 23 | reader.select("*.xml") { |file| results << file.pathname } 24 | expect(results).to eq [ "archive/xml/customers.xml" ] 25 | end 26 | end 27 | 28 | describe "#close" do 29 | it "not possible to operate on closed archive" do 30 | reader.close 31 | expect { reader.each { |f| } } 32 | .to raise_error(LibArchive::NoArchiveError, "Archive is not open, call open with filename on archive before this operation") 33 | end 34 | end 35 | 36 | describe "yielded FileReader" do 37 | it "responds to #read" do 38 | reader.select("*.xml") do |file_reader| 39 | expect(file_reader.read(4)).to eq "\n" 41 | end 42 | end 43 | 44 | it "return empty string when EOF reached" do 45 | reader.select("*.xml") do |file_reader| 46 | file_reader.read(1000) 47 | expect(file_reader.read(5)).to eq "" 48 | end 49 | end 50 | 51 | it "responds to #close" do 52 | reader.select("*.xml") do |file_reader| 53 | expect(file_reader).to respond_to(:close) 54 | end 55 | end 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /spec/unit/lib_archive_spec.rb: -------------------------------------------------------------------------------- 1 | module ArchiveIO 2 | RSpec.describe LibArchive do 3 | describe ".archive_version_string" do 4 | it "returns non-empty string thus bindings are correct" do 5 | expect(LibArchive.archive_version_string).to be_a String 6 | expect(LibArchive.archive_version_string).not_to be_empty 7 | end 8 | end 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /spec/unit/wildcard_pattern_spec.rb: -------------------------------------------------------------------------------- 1 | module ArchiveIO 2 | RSpec.describe WildcardPattern do 3 | TEST_EXPRESSIONS = { 4 | "*hn" => { "john" => true, "johnny" => false, "hanna" => false }, 5 | "*hn*" => { "john" => true, "johnny" => true, "hanna" => false }, 6 | "hn" => { "john" => false, "johnny" => false, "hanna" => false }, 7 | "*h*n*" => { "john" => true, "johnny" => true, "hanna" => true }, 8 | } 9 | 10 | TEST_EXPRESSIONS.each do |expression, expected_results| 11 | expected_results.each do |test_string, expected_result| 12 | it "#{expression} returns #{expected_result} for #{test_string}" do 13 | wildcard = described_class.new(expression) 14 | expect(wildcard.match?(test_string)).to eq expected_result 15 | end 16 | end 17 | end 18 | end 19 | end 20 | --------------------------------------------------------------------------------