├── .github └── workflows │ └── ruby.yml ├── .gitignore ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin └── console ├── lib ├── saxlsx.rb └── saxlsx │ ├── boolean_parser.rb │ ├── column_name_generator.rb │ ├── file_system.rb │ ├── rows_collection.rb │ ├── rows_collection_count_parser.rb │ ├── rows_collection_parser.rb │ ├── sax_parser.rb │ ├── shared_string_collection.rb │ ├── shared_string_collection_parser.rb │ ├── sheet.rb │ ├── sheet_collection.rb │ ├── sheet_collection_parser.rb │ ├── style_collection.rb │ ├── style_collection_parser.rb │ ├── version.rb │ └── workbook.rb ├── saxlsx.gemspec └── spec ├── benchmarks.rb ├── column_name_generator_spec.rb ├── data ├── Spec.xlsx ├── Spec1904.xlsx ├── SpecInlineStrings.xlsx ├── SpecMultiline10.xlsx ├── SpecMultilineN.xlsx ├── SpecNumberFormat.xlsx └── SpecSloppy.xlsx ├── sheet_spec.rb ├── spec_helper.rb └── workbook_spec.rb /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | name: Ruby 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | ruby-version: ['2.5', '2.6', '2.7', '3.0'] 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Ruby 15 | uses: ruby/setup-ruby@v1 16 | with: 17 | ruby-version: ${{ matrix.ruby-version }} 18 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 19 | - name: Run tests 20 | run: bundle exec rake 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | .idea 19 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in saxlsx.gemspec 4 | gemspec 5 | 6 | gem "rubyzip", "~> 2.0" 7 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 MAK IT 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Saxlsx 2 | 3 | [![Join the chat at https://gitter.im/mak-it/saxlsx](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/mak-it/saxlsx?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 4 | 5 | **Fast** and memory efficient XLSX reader on top of Ox SAX parser. 6 | 7 | It reads row by row and doesn't store the whole sheet in memory, so this 8 | approach is more suitable when parsing big files. 9 | 10 | ## Installation 11 | 12 | Add this line to your application's Gemfile: 13 | 14 | ```ruby 15 | gem 'saxlsx' 16 | ``` 17 | 18 | And then execute: 19 | 20 | ```bash 21 | $ bundle 22 | ``` 23 | 24 | Or install it yourself as: 25 | 26 | ```bash 27 | $ gem install saxlsx 28 | ``` 29 | 30 | ## Usage 31 | 32 | ```ruby 33 | Saxlsx::Workbook.open filename, auto_format: true do |w| 34 | w.sheets.each do |s| 35 | puts s.rows.count 36 | s.rows.each do |r| 37 | puts r.inspect 38 | end 39 | end 40 | end 41 | ``` 42 | 43 | By default `saxlsx` will try to convert `General` type cells that look like 44 | numbers to ruby floats or integers. You can disable this feature 45 | using `auto_format: false`. 46 | 47 | ## How fast is it? 48 | 49 | ```bash 50 | $ rake bench 51 | ``` 52 | 53 | ruby 2.7 on OS X 54 | 55 | ``` 56 | Shared Strings 57 | 58 | user system total real 59 | creek 1.296539 0.029374 1.325913 ( 1.340820) 60 | dullard 1.178981 0.025073 1.204054 ( 1.221381) 61 | oxcelix 0.985258 0.025028 1.010286 ( 1.023730) 62 | roo 0.971155 0.029964 1.001119 ( 1.016452) 63 | rubyXL 2.979334 0.055708 3.035042 ( 3.079301) 64 | saxlsx 0.473398 0.011342 0.484740 ( 0.490247) 65 | simple_xlsx_reader 1.209074 0.024579 1.233653 ( 1.249957) 66 | 67 | Inline Strings 68 | 69 | user system total real 70 | creek 1.471115 0.075182 1.546297 ( 1.567045) 71 | dullard 1.338499 0.085116 1.423615 ( 1.443386) 72 | oxcelix ERROR 73 | roo 1.133878 0.052834 1.186712 ( 1.208369) 74 | rubyXL 3.213630 0.070255 3.283885 ( 3.324428) 75 | saxlsx 0.667601 0.024265 0.691866 ( 0.696603) 76 | simple_xlsx_reader 1.350298 0.028411 1.378709 ( 1.396583) 77 | ``` 78 | 79 | ## Contributing 80 | 81 | 1. Fork it 82 | 2. Create your feature branch (`git checkout -b my-new-feature`) 83 | 3. Commit your changes (`git commit -am 'Add some feature'`) 84 | 4. Push to the branch (`git push origin my-new-feature`) 85 | 5. Create new Pull Request 86 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | Bundler::GemHelper.install_tasks 5 | RSpec::Core::RakeTask.new(:spec) 6 | task :default => :spec 7 | 8 | task :bench do 9 | require './spec/benchmarks.rb' 10 | Saxlsx::Benchmarks.new.run 11 | end 12 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "saxlsx" 5 | 6 | require "irb" 7 | IRB.start 8 | -------------------------------------------------------------------------------- /lib/saxlsx.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | require 'bigdecimal' 3 | require 'rational' 4 | require 'zip' 5 | require 'ox' 6 | require 'cgi' 7 | 8 | Dir["#{File.dirname(__FILE__)}/saxlsx/**/*.rb"].each { |f| require f } 9 | -------------------------------------------------------------------------------- /lib/saxlsx/boolean_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class BooleanParser 4 | 5 | def self.parse(string) 6 | return true if string == true || string =~ (/(true|t|yes|y|1)$/i) 7 | return false if string == false || string.nil? || string =~ (/(false|f|no|n|0)$/i) 8 | raise ArgumentError.new("Invalid value for Boolean: \"#{string}\"") 9 | end 10 | 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/saxlsx/column_name_generator.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class ColumnNameGenerator 4 | FIRST = 'A' 5 | LAST = 'Z' 6 | 7 | def self.next_to(previous) 8 | char = previous ? previous[-1] : nil 9 | if char.nil? 10 | FIRST 11 | elsif char < LAST 12 | previous[0..-2] + char.next 13 | else 14 | next_to(previous[0..-2]) + FIRST 15 | end 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/saxlsx/file_system.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class FileSystem 4 | IO_METHODS = [:tell, :seek, :read, :close] 5 | 6 | def self.open(filename) 7 | begin 8 | file_system = self.new(filename) 9 | yield file_system 10 | ensure 11 | file_system.close 12 | end 13 | end 14 | 15 | def initialize(filename) 16 | if IO_METHODS.map { |method| filename.respond_to?(method) }.all? 17 | @zip = Zip::File.open_buffer filename 18 | @io = true 19 | else 20 | @zip = Zip::File.open filename 21 | end 22 | end 23 | 24 | def close 25 | @zip.close unless @io 26 | end 27 | 28 | def workbook 29 | @zip.get_input_stream('xl/workbook.xml') 30 | end 31 | 32 | def shared_strings 33 | file = @zip.glob('xl/shared[Ss]trings.xml').first 34 | @zip.get_input_stream(file) if file 35 | end 36 | 37 | def styles 38 | @zip.get_input_stream('xl/styles.xml') 39 | end 40 | 41 | def sheet(i) 42 | @zip.get_input_stream("xl/worksheets/sheet#{i+1}.xml") 43 | end 44 | 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /lib/saxlsx/rows_collection.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class RowsCollection 4 | 5 | include Enumerable 6 | 7 | def initialize(index, file_system, workbook) 8 | @index = index 9 | @file_system = file_system 10 | @workbook = workbook 11 | @sheet = file_system.sheet(index) 12 | end 13 | 14 | def each(&block) 15 | RowsCollectionParser.parse @index, @sheet, @workbook, &block 16 | end 17 | 18 | def count 19 | @count ||= RowsCollectionCountParser.count @sheet 20 | end 21 | 22 | alias :size :count 23 | 24 | def [](value) 25 | to_a[value] 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/saxlsx/rows_collection_count_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class RowsCollectionCountParser < Ox::Sax 4 | def self.count(data, &block) 5 | parser = new 6 | catch :abort do 7 | SaxParser.parse parser, data 8 | end 9 | parser.count 10 | end 11 | 12 | attr_reader :count 13 | 14 | def initialize 15 | @count = 0 16 | end 17 | 18 | def start_element(name) 19 | @current_element = name 20 | if name == :row 21 | @count += 1 22 | end 23 | end 24 | 25 | def attr(name, value) 26 | if @current_element == :dimension 27 | if name == :ref && value 28 | matches = value.match(/[^:]+:[A-Z]*(\d+)/) 29 | if matches 30 | @count = matches[1].to_i 31 | throw :abort 32 | end 33 | end 34 | end 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /lib/saxlsx/rows_collection_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class RowsCollectionParser < Ox::Sax 4 | SECONDS_IN_DAY = 86400 5 | NUM_FORMATS = { 6 | 0 => :string, # General 7 | 1 => :fixnum, # 0 8 | 2 => :float, # 0.00 9 | 3 => :fixnum, # #,##0 10 | 4 => :float, # #,##0.00 11 | 5 => :unsupported, # $#,##0_);($#,##0) 12 | 6 => :unsupported, # $#,##0_);[Red]($#,##0) 13 | 7 => :unsupported, # $#,##0.00_);($#,##0.00) 14 | 8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00) 15 | 9 => :percentage, # 0% 16 | 10 => :percentage, # 0.00% 17 | 11 => :bignum, # 0.00E+00 18 | 12 => :rational, # # ?/? 19 | 13 => :rational, # # ??/?? 20 | 14 => :date, # mm-dd-yy 21 | 15 => :date, # d-mmm-yy 22 | 16 => :date, # d-mmm 23 | 17 => :date, # mmm-yy 24 | 18 => :time, # h:mm AM/PM 25 | 19 => :time, # h:mm:ss AM/PM 26 | 20 => :time, # h:mm 27 | 21 => :time, # h:mm:ss 28 | 22 => :date_time, # m/d/yy h:mm 29 | 37 => :unsupported, # #,##0 ;(#,##0) 30 | 38 => :unsupported, # #,##0 ;[Red](#,##0) 31 | 39 => :unsupported, # #,##0.00;(#,##0.00) 32 | 40 => :unsupported, # #,##0.00;[Red](#,##0.00) 33 | 45 => :time, # mm:ss 34 | 46 => :time, # [h]:mm:ss 35 | 47 => :time, # mmss.0 36 | 48 => :bignum, # ##0.0E+0 37 | 49 => :unsupported # @ 38 | } 39 | 40 | def self.parse(index, data, workbook, &block) 41 | SaxParser.parse self.new(workbook, &block), data 42 | end 43 | 44 | def initialize(workbook, &block) 45 | @base_date = workbook.base_date 46 | @auto_format = workbook.auto_format 47 | @shared_strings = workbook.shared_strings 48 | @number_formats = workbook.number_formats 49 | @block = block 50 | end 51 | 52 | def start_element(name) 53 | @current_element = name 54 | case name 55 | when :row 56 | @current_row = [] 57 | @next_column = 'A' 58 | when :c 59 | @current_type = nil 60 | @current_number_format = nil 61 | end 62 | end 63 | 64 | def end_element(name) 65 | if name == :row 66 | @block.call @current_row 67 | @current_row = nil 68 | end 69 | end 70 | 71 | def attr(name, value) 72 | if @current_element == :c 73 | case name 74 | when :t 75 | @current_type = value 76 | when :r 77 | @current_column = value.gsub(/\d/, '') 78 | when :s 79 | @current_number_format = detect_format_type(value.to_i) 80 | end 81 | end 82 | end 83 | 84 | def text(value) 85 | if @current_row && (@current_element == :v || @current_element == :t) 86 | while @next_column != @current_column 87 | @current_row << nil 88 | @next_column = ColumnNameGenerator.next_to(@next_column) 89 | end 90 | @current_row << value_of(value) 91 | @next_column = ColumnNameGenerator.next_to(@next_column) 92 | end 93 | end 94 | 95 | private 96 | 97 | def value_of(text) 98 | case @current_type 99 | when 's' 100 | @shared_strings[text.to_i] 101 | when 'inlineStr' 102 | CGI.unescapeHTML(text) 103 | when 'b' 104 | BooleanParser.parse text 105 | else 106 | case @current_number_format 107 | when :date 108 | @base_date + Float(text) 109 | when :date_time 110 | # Round time to seconds 111 | date = @base_date + Rational((Float(text) * SECONDS_IN_DAY).round, SECONDS_IN_DAY) 112 | DateTime.new(date.year, date.month, date.day, date.hour, date.minute, date.second) 113 | when :fixnum 114 | Integer(text, 10) 115 | when :float, :percentage 116 | Float(text) 117 | when :rational 118 | Rational(text) 119 | when :bignum 120 | Float(text) # raises ArgumentError if text is not a number 121 | BigDecimal(text) # doesn't raise ArgumentError 122 | else 123 | if @current_type == 'n' 124 | Float(text) 125 | elsif @auto_format && text =~ /\A-?\d+(\.\d+(?:e[+-]\d+)?)?\Z/i 126 | # Auto convert numbers 127 | $1 ? Float(text) : Integer(text, 10) 128 | else 129 | CGI.unescapeHTML(text) 130 | end 131 | end 132 | end 133 | rescue ArgumentError 134 | CGI.unescapeHTML(text) 135 | end 136 | 137 | def detect_format_type(index) 138 | format = @number_formats[index] 139 | NUM_FORMATS[format] || detect_custom_format_type(format) 140 | end 141 | 142 | # This is the least deterministic part of reading xlsx files. Due to 143 | # custom styles, you can't know for sure when a date is a date other than 144 | # looking at its format and gessing. It's not impossible to guess right, 145 | # though. 146 | # 147 | # http://stackoverflow.com/questions/4948998/determining-if-an-xlsx-cell-is-date-formatted-for-excel-2007-spreadsheets 148 | def detect_custom_format_type(code) 149 | code = code.gsub(/\[[^\]]+\]/, '') # Strip meta - [...] 150 | if code =~ /0/ 151 | :float 152 | elsif code =~ /[ymdhis]/i 153 | :date_time 154 | else 155 | :unsupported 156 | end 157 | end 158 | end 159 | end 160 | -------------------------------------------------------------------------------- /lib/saxlsx/sax_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class SaxParser 4 | 5 | def self.parse(handler, xml) 6 | Ox.sax_parse(handler, xml, skip: :skip_return) 7 | ensure 8 | xml.rewind 9 | end 10 | 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/saxlsx/shared_string_collection.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class SharedStringCollection 4 | 5 | include Enumerable 6 | 7 | def initialize(file_system) 8 | @file_system = file_system 9 | end 10 | 11 | def each(&block) 12 | SharedStringCollectionParser.parse @file_system, &block 13 | end 14 | 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/saxlsx/shared_string_collection_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class SharedStringCollectionParser < Ox::Sax 4 | 5 | def self.parse(file_system, &block) 6 | shared_strings = file_system.shared_strings 7 | if shared_strings 8 | SaxParser.parse self.new(&block), shared_strings 9 | else 10 | [] 11 | end 12 | end 13 | 14 | def initialize(&block) 15 | @block = block 16 | end 17 | 18 | def start_element(name) 19 | @current_string = String.new if name == :si 20 | end 21 | 22 | def end_element(name) 23 | if name == :si 24 | @block.call @current_string 25 | @current_string = nil 26 | end 27 | end 28 | 29 | def text(value) 30 | @current_string << CGI.unescapeHTML(value) if @current_string 31 | end 32 | 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/saxlsx/sheet.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class Sheet 4 | 5 | attr_reader :name 6 | 7 | def initialize(name, index, file_system, workbook) 8 | @name = name 9 | @index = index 10 | @file_system = file_system 11 | @workbook = workbook 12 | end 13 | 14 | def rows 15 | @rows ||= RowsCollection.new(@index, @file_system, @workbook) 16 | end 17 | 18 | def to_csv(path) 19 | FileUtils.mkpath path unless Dir.exists? path 20 | File.open("#{path}/#{name}.csv", 'w') do |f| 21 | rows.each do |row| 22 | f.puts row.map{|c| "\"#{c}\""}.join(',') 23 | end 24 | end 25 | end 26 | 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/saxlsx/sheet_collection.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class SheetCollection 4 | 5 | include Enumerable 6 | 7 | def initialize(file_system, workbook) 8 | @file_system = file_system 9 | @workbook = workbook 10 | end 11 | 12 | def each(&block) 13 | SheetCollectionParser.parse @file_system, @workbook, &block 14 | end 15 | 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/saxlsx/sheet_collection_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class SheetCollectionParser < Ox::Sax 4 | 5 | CurrentSheet = Struct.new :index, :name 6 | 7 | def self.parse(file_system, workbook, &block) 8 | SaxParser.parse( 9 | self.new(file_system, workbook, &block), 10 | file_system.workbook 11 | ) 12 | end 13 | 14 | def initialize(file_system, workbook, &block) 15 | @file_system = file_system 16 | @workbook = workbook 17 | @block = block 18 | @index = -1 19 | @workbook_pr = false 20 | end 21 | 22 | def start_element(name) 23 | case name 24 | when :sheet 25 | @current_sheet = CurrentSheet.new(@index += 1) 26 | when :workbookPr 27 | @workbook_pr = true 28 | end 29 | end 30 | 31 | def end_element(name) 32 | case name 33 | when :sheet 34 | @block.call Sheet.new( 35 | @current_sheet.name, 36 | @current_sheet.index, 37 | @file_system, 38 | @workbook 39 | ) 40 | @current_sheet = nil 41 | when :workbookPr 42 | @workbook_pr = false 43 | end 44 | end 45 | 46 | def attr(name, value) 47 | if @current_sheet 48 | if name == :name 49 | @current_sheet.name = value 50 | end 51 | elsif @workbook_pr 52 | if name == :date1904 && value =~ /true|1/i 53 | @workbook.date1904 = true 54 | end 55 | end 56 | end 57 | 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/saxlsx/style_collection.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class StyleCollection 4 | 5 | include Enumerable 6 | 7 | def initialize(file_system) 8 | @file_system = file_system 9 | end 10 | 11 | def each(&block) 12 | StyleCollectionParser.parse @file_system, &block 13 | end 14 | 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/saxlsx/style_collection_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class StyleCollectionParser < Ox::Sax 4 | def self.parse(file_system, &block) 5 | SaxParser.parse self.new(&block), file_system.styles 6 | end 7 | 8 | def initialize(&block) 9 | @block = block 10 | @cell_styles = false 11 | @custom_num_fmts = {} 12 | end 13 | 14 | def start_element(name) 15 | case name 16 | when :cellXfs 17 | @cell_styles = true 18 | when :xf 19 | @num_fmt_id = nil 20 | when :numFmt 21 | @num_fmt_id = nil 22 | @num_fmt_code = nil 23 | end 24 | end 25 | 26 | def end_element(name) 27 | case name 28 | when :cellXfs 29 | @cell_styles = false 30 | when :xf 31 | if @cell_styles 32 | custom_num_fmt_code = @custom_num_fmts[@num_fmt_id] 33 | if custom_num_fmt_code 34 | @block.call custom_num_fmt_code 35 | else 36 | @block.call @num_fmt_id.to_i 37 | end 38 | end 39 | when :numFmt 40 | @custom_num_fmts[@num_fmt_id] = @num_fmt_code 41 | end 42 | end 43 | 44 | def attr(name, value) 45 | case name 46 | when :numFmtId 47 | @num_fmt_id = value.to_i 48 | when :formatCode 49 | @num_fmt_code = value 50 | end 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/saxlsx/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | VERSION = '1.11.0' 4 | end 5 | -------------------------------------------------------------------------------- /lib/saxlsx/workbook.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Saxlsx 3 | class Workbook 4 | DATE_SYSTEM_1900 = DateTime.new(1899, 12, 30) 5 | DATE_SYSTEM_1904 = DateTime.new(1904, 1, 1) 6 | 7 | attr_accessor :date1904 8 | attr_reader :auto_format 9 | 10 | def self.open(filename, **kw_args) 11 | begin 12 | workbook = new(filename, **kw_args) 13 | yield workbook 14 | ensure 15 | workbook.close if workbook 16 | end 17 | end 18 | 19 | def initialize(filename, auto_format: true) 20 | @file_system = FileSystem.new filename 21 | @auto_format = auto_format 22 | end 23 | 24 | def close 25 | @file_system.close 26 | end 27 | 28 | def sheets(name=nil) 29 | @sheets ||= SheetCollection.new(@file_system, self).to_a 30 | name.nil? ? @sheets : @sheets.detect { |s| s.name == name } 31 | end 32 | 33 | def sheet_names 34 | sheets.map(&:name) 35 | end 36 | 37 | def shared_strings 38 | @shared_strings ||= SharedStringCollection.new(@file_system).to_a 39 | end 40 | 41 | def number_formats 42 | @number_formats ||= StyleCollection.new(@file_system).to_a 43 | end 44 | 45 | def base_date 46 | @base_date ||= date1904 ? DATE_SYSTEM_1904 : DATE_SYSTEM_1900 47 | end 48 | 49 | def to_csv(path) 50 | sheets.each { |s| s.to_csv path } 51 | end 52 | 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /saxlsx.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'saxlsx/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "saxlsx" 8 | spec.version = Saxlsx::VERSION 9 | spec.authors = ["Edgars Beigarts"] 10 | spec.email = ["edgars.beigarts@makit.lv"] 11 | spec.description = 'Fast xlsx reader on top of Ox SAX parser' 12 | spec.summary = 'Fast xlsx reader on top of Ox SAX parser' 13 | spec.homepage = "https://github.com/mak-it/saxlsx" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files`.split($/) 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^spec/}) 19 | spec.require_paths = ["lib"] 20 | 21 | spec.required_ruby_version = '>= 2.5.0' 22 | 23 | spec.add_dependency 'rubyzip', '>= 1.0' 24 | spec.add_dependency 'ox', '~> 2.1' 25 | 26 | spec.add_development_dependency 'bundler', ">= 1.5" 27 | spec.add_development_dependency 'rake', '~> 13.0' 28 | spec.add_development_dependency 'rspec', '~> 3.10' 29 | spec.add_development_dependency 'rspec-collection_matchers' 30 | spec.add_development_dependency 'simplecov', '~> 0.8' 31 | end 32 | -------------------------------------------------------------------------------- /spec/benchmarks.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | require 'benchmark' 3 | require 'bundler/inline' 4 | 5 | gemfile do 6 | source 'https://rubygems.org' 7 | gem 'caxlsx' 8 | gem 'rubyXL' 9 | gem 'simple_xlsx_reader' 10 | gem 'creek' 11 | gem 'oxcelix' 12 | gem 'roo' 13 | gem 'dullard' 14 | gem "saxlsx", path: File.expand_path("../..", __FILE__ ) 15 | end 16 | 17 | module Saxlsx 18 | class Benchmarks 19 | def run 20 | path = "tmp/bench_shared_strings.xlsx" 21 | generate path, true 22 | benchmark "Shared Strings", path 23 | 24 | path = "tmp/bench_inline_strings.xlsx" 25 | generate path, false 26 | benchmark "Inline Strings", path 27 | end 28 | 29 | private 30 | 31 | def generate(path, shared_strings) 32 | unless File.exists?(path) 33 | puts "* Generating #{path}" 34 | FileUtils.mkdir_p File.dirname(path) 35 | Axlsx::Package.new do |p| 36 | money_style = p.workbook.styles.add_style( 37 | num_fmt: 5, format_code: "€0.000" 38 | ) 39 | p.workbook.add_worksheet(:name => "Sheet 1") do |sheet| 40 | 10000.times do 41 | sheet.add_row( 42 | [Date.today, Time.now, 1000, 3.14, "Long" * 100], 43 | types: [:date, :time, :integer, :float, :string], 44 | style: [nil, nil, nil, money_style, nil] 45 | ) 46 | end 47 | end 48 | p.use_shared_strings = shared_strings 49 | p.serialize(path) 50 | end 51 | end 52 | end 53 | 54 | def benchmark(title, path) 55 | puts 56 | puts title 57 | puts 58 | Benchmark.bmbm(20) do |x| 59 | x.report "creek" do 60 | run_creek(path) 61 | end 62 | x.report "dullard" do 63 | run_dullard(path) 64 | end 65 | x.report "oxcelix" do 66 | run_oxcelix(path) 67 | end 68 | x.report "roo" do 69 | run_roo(path) 70 | end 71 | x.report "rubyXL" do 72 | run_rubyxl(path) 73 | end 74 | x.report "saxlsx" do 75 | run_saxlsx(path) 76 | end 77 | x.report "simple_xlsx_reader" do 78 | run_simple_xlsx_reader(path) 79 | end 80 | end 81 | end 82 | 83 | def run_creek(path) 84 | w = Creek::Book.new path 85 | w.sheets.each do |s| 86 | s.rows.each do |r| 87 | r.values.inspect 88 | end 89 | end 90 | end 91 | 92 | def run_oxcelix(path) 93 | w = Oxcelix::Workbook.new(path) 94 | w.sheets.each do |s| 95 | s.to_ru.to_a.each do |r| 96 | r.inspect 97 | end 98 | end 99 | rescue 100 | puts "ERROR" 101 | end 102 | 103 | def run_rubyxl(path) 104 | w = RubyXL::Parser.parse path 105 | w.worksheets.each do |s| 106 | s.each do |r| 107 | r.cells.map(&:value).inspect 108 | end 109 | end 110 | end 111 | 112 | def run_saxlsx(path) 113 | Saxlsx::Workbook.open path do |w| 114 | w.sheets.each do |s| 115 | s.rows.each do |r| 116 | r.to_a.inspect 117 | end 118 | end 119 | end 120 | end 121 | 122 | def run_simple_xlsx_reader(path) 123 | w = SimpleXlsxReader.open path 124 | w.sheets.each do |s| 125 | s.rows.each do |r| 126 | r.to_a.inspect 127 | end 128 | end 129 | end 130 | 131 | def run_roo(path) 132 | w = Roo::Excelx.new path 133 | w.each_with_pagename do |_, s| 134 | s.each do |r| 135 | r.to_a.inspect 136 | end 137 | end 138 | end 139 | 140 | def run_dullard(path) 141 | w = Dullard::Workbook.new path 142 | w.sheets.each do |s| 143 | s.rows.each do |r| 144 | r.to_a.inspect 145 | end 146 | end 147 | end 148 | end 149 | end 150 | -------------------------------------------------------------------------------- /spec/column_name_generator_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | require 'spec_helper' 3 | 4 | describe ColumnNameGenerator do 5 | 6 | it 'First char' do 7 | ColumnNameGenerator::FIRST.should eq 'A' 8 | end 9 | 10 | it 'Last char' do 11 | ColumnNameGenerator::LAST.should eq 'Z' 12 | end 13 | 14 | it 'Next value' do 15 | ColumnNameGenerator.tap do |g| 16 | g.next_to(nil).should eq 'A' 17 | g.next_to('F').should eq 'G' 18 | g.next_to('DM').should eq 'DN' 19 | g.next_to('RZ').should eq 'SA' 20 | g.next_to('ZZ').should eq 'AAA' 21 | g.next_to('EDT').should eq 'EDU' 22 | end 23 | end 24 | 25 | end 26 | -------------------------------------------------------------------------------- /spec/data/Spec.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitigate-dev/saxlsx/a4640faecf01a25b737bf1a8cf308acc5ac4685e/spec/data/Spec.xlsx -------------------------------------------------------------------------------- /spec/data/Spec1904.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitigate-dev/saxlsx/a4640faecf01a25b737bf1a8cf308acc5ac4685e/spec/data/Spec1904.xlsx -------------------------------------------------------------------------------- /spec/data/SpecInlineStrings.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitigate-dev/saxlsx/a4640faecf01a25b737bf1a8cf308acc5ac4685e/spec/data/SpecInlineStrings.xlsx -------------------------------------------------------------------------------- /spec/data/SpecMultiline10.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitigate-dev/saxlsx/a4640faecf01a25b737bf1a8cf308acc5ac4685e/spec/data/SpecMultiline10.xlsx -------------------------------------------------------------------------------- /spec/data/SpecMultilineN.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitigate-dev/saxlsx/a4640faecf01a25b737bf1a8cf308acc5ac4685e/spec/data/SpecMultilineN.xlsx -------------------------------------------------------------------------------- /spec/data/SpecNumberFormat.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitigate-dev/saxlsx/a4640faecf01a25b737bf1a8cf308acc5ac4685e/spec/data/SpecNumberFormat.xlsx -------------------------------------------------------------------------------- /spec/data/SpecSloppy.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitigate-dev/saxlsx/a4640faecf01a25b737bf1a8cf308acc5ac4685e/spec/data/SpecSloppy.xlsx -------------------------------------------------------------------------------- /spec/sheet_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | # encoding: UTF-8 3 | require 'spec_helper' 4 | 5 | describe Sheet do 6 | 7 | let(:filename) { "#{File.dirname(__FILE__)}/data/Spec.xlsx" } 8 | let(:tmp_path) { "#{File.dirname(__FILE__)}/../tmp" } 9 | 10 | before :each do 11 | FileUtils.rm_rf tmp_path if Dir.exists? tmp_path 12 | end 13 | 14 | it 'Rows count' do 15 | Workbook.open filename do |w| 16 | w.sheets[0].rows.count.should eq 7 17 | w.sheets[1].rows.count.should eq 9 18 | w.sheets[2].rows.count.should eq 3 19 | w.sheets[3].rows.count.should eq 2 20 | w.sheets[4].rows.count.should eq 3 21 | end 22 | end 23 | 24 | it 'Rows collection' do 25 | Workbook.open filename do |w| 26 | w.sheets[0].rows.should be_an_instance_of RowsCollection 27 | end 28 | end 29 | 30 | it 'Rows content' do 31 | Workbook.open filename do |w| 32 | w.sheets[0].tap do |s| 33 | s.rows[0].should eq [ 34 | 'LevenshteinDistance', 35 | 3.14, 36 | 3, 37 | DateTime.new(2013, 12, 13, 8, 0, 58), 38 | DateTime.new(1970, 1, 1), 39 | BigDecimal('3.4028236692093801E+38'), 40 | DateTime.new(2015, 2, 13, 12, 40, 5) 41 | ] 42 | s.rows[1].should eq [ 43 | 'Case sensitive', 44 | false, 45 | 3.0, 46 | DateTime.new(1970, 1, 1, 1, 0, 0) 47 | ] 48 | s.rows[2].should eq ['Fields', 'Type', 'URL Mining'] 49 | s.rows[3].should eq ['autor', 'text', false] 50 | s.rows[4].should eq ['texto', 'text', false] 51 | s.rows[5].should eq ['url', 'text', false] 52 | s.rows[6].should eq ['comentario', 'text', false] 53 | end 54 | end 55 | end 56 | 57 | it 'Rows content skipping cells' do 58 | Workbook.open filename do |w| 59 | w.sheets[3].tap do |s| 60 | s.rows[0].should eq [nil, 'en', 'es', 'pt', 'un'] 61 | s.rows[1].should eq ['default', 30, 50, 15, 5] 62 | end 63 | end 64 | end 65 | 66 | it 'Rows content with tag separators (>)' do 67 | Workbook.open filename do |w| 68 | w.sheets[4].tap do |s| 69 | s.rows[0].should eq ['Especificacion', 'Concepto/RegExp/Pair', 'ClienteTexto_Campos', 'ClienteTexto_Especificacion'] 70 | s.rows[1].should eq ['Discriminación > Sexual | Insulto', 'puto', 'texto', 'TST_RechAuto_Insulto_SE_Normal'] 71 | s.rows[2].should eq ['Insulto', 'boludo', 'texto', 'TST_ModMan_Insulto_SU_Normal'] 72 | end 73 | end 74 | end 75 | 76 | it 'Export to CSV' do 77 | Workbook.open filename do |w| 78 | csv_file = "#{tmp_path}/#{w.sheets[0].name}.csv" 79 | 80 | File.should_not be_exists csv_file 81 | 82 | w.sheets[0].to_csv tmp_path 83 | 84 | csv = File.open(csv_file, 'r') { |f| f.readlines } 85 | # TODO: newer rubies use lowercase "e" in scientific numbers 86 | # csv[0].should eq %{"LevenshteinDistance","3.14","3","2013-12-13T08:00:58+00:00","1970-01-01T00:00:00+00:00","0.34028236692093801E39","2015-02-13T12:40:05+00:00"\n} 87 | csv[1].should eq %{"Case sensitive","false","3.0","1970-01-01T01:00:00+00:00"\n} 88 | csv[2].should eq "\"Fields\",\"Type\",\"URL Mining\"\n" 89 | csv[3].should eq "\"autor\",\"text\",\"false\"\n" 90 | csv[4].should eq "\"texto\",\"text\",\"false\"\n" 91 | csv[5].should eq "\"url\",\"text\",\"false\"\n" 92 | csv[6].should eq "\"comentario\",\"text\",\"false\"\n" 93 | end 94 | end 95 | 96 | it 'Handle missing fonts and dimension tags' do 97 | filename = "#{File.dirname(__FILE__)}/data/SpecSloppy.xlsx" 98 | 99 | Workbook.open filename do |w| 100 | w.sheets[0].rows.count.should eq 85 101 | headers = w.sheets[0].rows.first 102 | headers.count.should eq 52 103 | headers.each do |str| 104 | str.should eq "X" 105 | end 106 | end 107 | end 108 | 109 | context 'with 1904 date system' do 110 | let(:filename) { "#{File.dirname(__FILE__)}/data/Spec1904.xlsx" } 111 | 112 | it 'should use 1904 date system when converting dates' do 113 | Workbook.open filename do |w| 114 | w.sheets[0].tap do |s| 115 | s.rows[0].should eq [ 116 | DateTime.new(1970, 1, 1, 1, 0, 0), 117 | DateTime.new(1970, 1, 1) 118 | ] 119 | end 120 | end 121 | end 122 | end 123 | 124 | context 'with mutliline strings ( )' do 125 | let(:filename) { "#{File.dirname(__FILE__)}/data/SpecMultiline10.xlsx" } 126 | 127 | it 'should return multiline cells' do 128 | Workbook.open filename do |w| 129 | w.sheets[0].tap do |s| 130 | s.rows[0].should eq [ 131 | "Test\nTest1\nTest3" 132 | ] 133 | end 134 | end 135 | end 136 | end 137 | 138 | context 'with mutliline strings (\n)' do 139 | let(:filename) { "#{File.dirname(__FILE__)}/data/SpecMultilineN.xlsx" } 140 | 141 | it 'should return multiline cells' do 142 | Workbook.open filename do |w| 143 | w.sheets[0].tap do |s| 144 | s.rows[0].should eq [ 145 | "Test\nTest1\nTest3" 146 | ] 147 | end 148 | end 149 | end 150 | end 151 | 152 | context 'with inline strings' do 153 | let(:filename) { "#{File.dirname(__FILE__)}/data/SpecInlineStrings.xlsx" } 154 | 155 | it 'should read inline strings' do 156 | Workbook.open filename do |w| 157 | w.sheets[0].tap do |s| 158 | s.rows[0].should eq [ 159 | 'Test' 160 | ] 161 | end 162 | end 163 | end 164 | end 165 | 166 | context 'with number formats and auto format' do 167 | let(:filename) { "#{File.dirname(__FILE__)}/data/SpecNumberFormat.xlsx" } 168 | 169 | [ ["General", "Test"], 170 | ["General", 123], 171 | ["General", 123.5], 172 | ["Fixnum", 123], 173 | ["Currency", 123.0], 174 | ["Date", DateTime.new(1970, 1, 1)], 175 | ["Time", DateTime.new(2015, 2, 13, 12, 40, 5)], 176 | ["Percentage", 0.9999], 177 | ["Fraction", 0.5], 178 | ["Scientific", BigDecimal('3.4028236692093801E+38')], 179 | ["Custom", 123.0], 180 | ].each.with_index do |row, i| 181 | name, value = row 182 | 183 | it "should typecast #{name}" do 184 | Workbook.open filename do |w| 185 | w.sheets[0].tap do |s| 186 | expect(s.rows[i+1]).to eq([name, value, "Test"]) 187 | end 188 | end 189 | end 190 | end 191 | end 192 | 193 | context 'with number formats and without auto format' do 194 | let(:filename) { "#{File.dirname(__FILE__)}/data/SpecNumberFormat.xlsx" } 195 | 196 | [ ["General", "Test"], 197 | ["General", "0123"], 198 | ["General", "0123.50"], 199 | ["Fixnum", 123], 200 | ["Currency", 123.0], 201 | ["Date", DateTime.new(1970, 1, 1)], 202 | ["Time", DateTime.new(2015, 2, 13, 12, 40, 5)], 203 | ["Percentage", 0.9999], 204 | ["Fraction", 0.5], 205 | ["Scientific", BigDecimal('3.4028236692093801E+38')], 206 | ["Custom", 123.0], 207 | ].each.with_index do |row, i| 208 | name, value = row 209 | 210 | it "should typecast #{name}" do 211 | Workbook.open filename, auto_format: false do |w| 212 | w.sheets[0].tap do |s| 213 | expect(s.rows[i+1]).to eq([name, value, "Test"]) 214 | end 215 | end 216 | end 217 | end 218 | end 219 | end 220 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'rspec/collection_matchers' 4 | require 'saxlsx' 5 | 6 | include Saxlsx 7 | 8 | RSpec.configure do |config| 9 | config.color = true 10 | end 11 | -------------------------------------------------------------------------------- /spec/workbook_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | require 'spec_helper' 3 | 4 | describe Workbook do 5 | 6 | let(:filename) { "#{File.dirname(__FILE__)}/data/Spec.xlsx" } 7 | 8 | it 'Reads from StringIO' do 9 | io = StringIO.new File.read(filename) 10 | Workbook.open io do |w| 11 | w.should have(5).sheets 12 | end 13 | end 14 | 15 | it 'Sheets count' do 16 | Workbook.open filename do |w| 17 | w.should have(5).sheets 18 | end 19 | end 20 | 21 | it 'Sheet names' do 22 | Workbook.open filename do |w| 23 | w.sheet_names.should eq %w(test_otros test_spec test_param Lenguajes ont_demo) 24 | end 25 | end 26 | 27 | it 'Find sheet by index' do 28 | Workbook.open filename do |w| 29 | w.sheets[0].name.should eq 'test_otros' 30 | w.sheets[1].name.should eq 'test_spec' 31 | w.sheets[2].name.should eq 'test_param' 32 | w.sheets[3].name.should eq 'Lenguajes' 33 | w.sheets[4].name.should eq 'ont_demo' 34 | end 35 | end 36 | 37 | it 'Find sheet by name' do 38 | Workbook.open filename do |w| 39 | w.sheets('test_otros').name.should eq 'test_otros' 40 | w.sheets('test_spec').name.should eq 'test_spec' 41 | w.sheets('test_param').name.should eq 'test_param' 42 | w.sheets('Lenguajes').name.should eq 'Lenguajes' 43 | w.sheets('ont_demo').name.should eq 'ont_demo' 44 | end 45 | end 46 | 47 | it 'Shared strings' do 48 | Workbook.open filename do |w| 49 | w.should have(56).shared_strings 50 | w.shared_strings.should include 'LevenshteinDistance' 51 | w.shared_strings.should include 'TST_ModMan_Insulto_SU_Normal' 52 | end 53 | end 54 | 55 | it 'Export to CSV' do 56 | Workbook.open filename do |w| 57 | w.sheets.each { |s| s.should_receive(:to_csv).with(Dir.pwd) } 58 | w.to_csv Dir.pwd 59 | end 60 | end 61 | end 62 | --------------------------------------------------------------------------------