├── .deepsource.toml ├── .document ├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── .hound.yml ├── .rspec ├── .rubocop.yml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── lib ├── rtesseract.rb └── rtesseract │ ├── base.rb │ ├── box.rb │ ├── check.rb │ ├── command.rb │ ├── configuration.rb │ ├── pdf.rb │ ├── text.rb │ ├── tsv.rb │ └── version.rb ├── rtesseract.gemspec └── spec ├── resources ├── README.pdf ├── blank.tif ├── eng.user-words.txt ├── image_with_error.png ├── mixed.tif ├── orientation_reverse.png ├── test with spaces.tif ├── test-pdf.png ├── test.bmp ├── test.jpg ├── test.png ├── test.tif ├── test1.tif └── test_words.png ├── rtesseract ├── box_spec.rb ├── configuration_spec.rb ├── pdf_spec.rb ├── text_spec.rb └── tsv_spec.rb ├── rtesseract_spec.rb └── spec_helper.rb /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [[analyzers]] 4 | name = "shell" 5 | enabled = true 6 | 7 | [[analyzers]] 8 | name = "ruby" 9 | enabled = true -------------------------------------------------------------------------------- /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | LICENSE.txt 5 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: dannnylo 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push] 3 | jobs: 4 | test: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | ruby: 9 | - '3.2.0' 10 | - '3.4.2' 11 | repository: 12 | - 'ppa:alex-p/tesseract-ocr5' 13 | - 
'ppa:alex-p/tesseract-ocr-devel' 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Install tesseract-ocr 17 | run: | 18 | sudo add-apt-repository ${{ matrix.repository }} -y 19 | sudo apt-get update -q 20 | sudo apt-get install tesseract-ocr tesseract-ocr-eng ghostscript -y 21 | tesseract --version 22 | - name: Setup Ruby 23 | uses: ruby/setup-ruby@v1 24 | with: 25 | ruby-version: ${{ matrix.ruby }} 26 | - name: Bundle 27 | run: | 28 | gem uninstall -aIx bundler 29 | gem install bundler 30 | bundle install --jobs 4 --retry 3 31 | - name: Test 32 | run: bundle exec rake 33 | - name: Coverage 34 | env: 35 | CODACY_PROJECT_TOKEN: ${{ secrets.CODACY_PROJECT_TOKEN }} 36 | run: bash <(curl -Ls https://coverage.codacy.com/get.sh) report -l Ruby -r coverage/lcov/* 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /coverage.data 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | 11 | # rspec failure tracking 12 | .rspec_status 13 | *.gem -------------------------------------------------------------------------------- /.hound.yml: -------------------------------------------------------------------------------- 1 | rubocop: 2 | config_file: .rubocop.yml 3 | version: 0.80.0 4 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --order rand 4 | --require spec_helper 5 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | NewCops: enable 3 | SuggestExtensions: false 4 | 5 | Layout/LineLength: 6 | Max: 150 7 | 8 | Metrics/BlockLength: 9 | Max: 50 10 | 11 | Metrics/AbcSize: 12 | Max: 
30 13 | 14 | Style/Documentation: 15 | Enabled: false 16 | Style/HashEachMethods: 17 | Enabled: true 18 | 19 | Style/HashTransformKeys: 20 | Enabled: true 21 | 22 | Style/HashTransformValues: 23 | Enabled: true 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | ## v3.1.4 4 | #### Changed 5 | * Temporary hocr file is deleted after the file is processed. 6 | 7 | ## v3.1.3 8 | * Fixed a configuration error that wouldn't allow you to do different kinds of calls on the same object, for example calling .to_box and then .to_s would result in unexpected behavior. 9 | 10 | ## v3.1.2 11 | 12 | #### Added 13 | 14 | * Added confidence for each word in box mode 15 | 16 | ## v3.1.1 17 | 18 | #### Changed 19 | 20 | * Changed RTesseract::Command to receive a block that is called when the run succeeds 21 | 22 | ## v3.1.0 23 | 24 | #### Changed 25 | 26 | * Removed nokogiri dependency. 27 | 28 | ## v3.0.4 29 | 30 | #### Changed 31 | 32 | * Updated dependencies in response to security alerts. 33 | 34 | ## v3.0.5 35 | 36 | #### Changed 37 | 38 | * Updated dependencies in response to security alerts. 39 | 40 | ## v3.0.4 41 | 42 | #### Changed 43 | 44 | * Updated dependencies in response to security alerts. 45 | 46 | ## v3.0.3 47 | 48 | #### Changed 49 | 50 | * Fix some problems with commanders gem 51 | 52 | ## v3.0.0 53 | 54 | #### Changed 55 | 56 | * Refactored the whole gem to work with tesseract version 4 or above 57 | 58 | ## v2.1.0 59 | 60 | #### Added 61 | 62 | * Support for generating searchable PDFs 63 | 64 | ## v2.0.1 65 | 66 | #### Changed 67 | 68 | * Refactoring of some small classes 69 | 70 | ## v2.0.0 71 | 72 | #### Added 73 | 74 | * Support for the options --tessdata-dir, --user-words and --user-patterns 75 | * Ruby 2.3.0 to the Travis tests. 76 | 77 | #### Changed 78 | 79 | * Refactoring of some classes 80 | * Crop options is a hash with x,y,w,h keys. 
81 | * Areas of RTesseract::Mixed now use :w and :h instead of :width and :height. 82 | 83 | #### Removed 84 | 85 | * Support for the quick_magick gem. 86 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at daniloj.dasilva@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org' 4 | 5 | git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } 6 | 7 | # Specify your gem's dependencies in rtesseract.gemspec 8 | gemspec 9 | 10 | group :development, :test do 11 | gem 'bundler', '~> 2' 12 | gem 'rake' 13 | gem 'rspec' 14 | 15 | gem 'simplecov' 16 | gem 'simplecov-cobertura' 17 | gem 'simplecov-lcov' 18 | end 19 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 
3 | specs: 4 | rtesseract (3.1.4) 5 | 6 | GEM 7 | remote: https://rubygems.org/ 8 | specs: 9 | diff-lcs (1.4.4) 10 | docile (1.4.0) 11 | rake (13.0.6) 12 | rspec (3.10.0) 13 | rspec-core (~> 3.10.0) 14 | rspec-expectations (~> 3.10.0) 15 | rspec-mocks (~> 3.10.0) 16 | rspec-core (3.10.1) 17 | rspec-support (~> 3.10.0) 18 | rspec-expectations (3.10.1) 19 | diff-lcs (>= 1.2.0, < 2.0) 20 | rspec-support (~> 3.10.0) 21 | rspec-mocks (3.10.2) 22 | diff-lcs (>= 1.2.0, < 2.0) 23 | rspec-support (~> 3.10.0) 24 | rspec-support (3.10.2) 25 | simplecov (0.21.2) 26 | docile (~> 1.1) 27 | simplecov-html (~> 0.11) 28 | simplecov_json_formatter (~> 0.1) 29 | simplecov-cobertura (1.4.2) 30 | simplecov (~> 0.8) 31 | simplecov-html (0.12.3) 32 | simplecov-lcov (0.8.0) 33 | simplecov_json_formatter (0.1.3) 34 | 35 | PLATFORMS 36 | ruby 37 | 38 | DEPENDENCIES 39 | bundler (~> 2) 40 | rake 41 | rspec 42 | rtesseract! 43 | simplecov 44 | simplecov-cobertura 45 | simplecov-lcov 46 | 47 | BUNDLED WITH 48 | 2.4.20 49 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Danilo Jeremias da Silva 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RTesseract 2 | 3 | 4 | Gem Version 5 | 6 | 7 | Build Status 8 | 9 | 10 | Coverage Status 11 | 12 | 13 | Coverage 14 | 15 | 16 | 17 | 18 | 19 | Ruby library for working with the Tesseract OCR engine. 20 | 21 | ## Installation 22 | 23 | Check whether the Tesseract OCR programs are installed: 24 | 25 | $ tesseract --version 26 | 27 | If not, you can install them with a command like: 28 | 29 | $ apt install tesseract-ocr 30 | 31 | or 32 | 33 | $ brew install tesseract 34 | 35 | or, for Heroku 22, add the buildpack https://github.com/pathwaysmedical/heroku-buildpack-tesseract 36 | 37 | 38 | Add this line to your application's Gemfile: 39 | 40 | ```ruby 41 | gem 'rtesseract' 42 | ``` 43 | 44 | And then execute: 45 | 46 | $ bundle 47 | 48 | Or install it yourself as: 49 | 50 | $ gem install rtesseract 51 | 52 | ## Usage 53 | 54 | It's very simple to use rtesseract. 
55 | 56 | ### Convert image to string 57 | 58 | ```ruby 59 | image = RTesseract.new("my_image.jpg") 60 | image.to_s # Getting the value 61 | ``` 62 | 63 | ### Convert image to searchable PDF 64 | 65 | ```ruby 66 | image = RTesseract.new("my_image.jpg") 67 | image.to_pdf # Getting open file of pdf 68 | ``` 69 | 70 | ### Convert image to TSV 71 | 72 | ```ruby 73 | image = RTesseract.new("my_image.jpg") 74 | image.to_tsv # Getting open file of tsv 75 | ``` 76 | 77 | This will preserve the image colors, pictures and structure in the generated pdf. 78 | 79 | ## Options 80 | 81 | ### Language 82 | 83 | ```ruby 84 | RTesseract.new('test.jpg', lang: 'deu') 85 | ``` 86 | 87 | * eng - English 88 | * deu - German 89 | * deu-f - German fraktur 90 | * fra - French 91 | * ita - Italian 92 | * nld - Dutch 93 | * por - Portuguese 94 | * spa - Spanish 95 | * vie - Vietnamese 96 | * or any other supported by tesseract. 97 | 98 | Note: Make sure you have installed the language to tesseract 99 | 100 | ### Other options 101 | 102 | ```ruby 103 | RTesseract.new('test.jpg', config_file: :digits) # Only digit recognition 104 | ``` 105 | 106 | OR 107 | 108 | ```ruby 109 | RTesseract.new('test.jpg', config_file: 'digits quiet') 110 | ``` 111 | 112 | ### BOUNDING BOX: TO GET WORDS WITH THEIR POSITIONS 113 | 114 | ```ruby 115 | RTesseract.new('test_words.png').to_box 116 | => [ 117 | { :word => 'If', :confidence=>89, :x_start=>52, :y_start=>13, :x_end=>63, :y_end=>27}, 118 | { :word => 'you', :confidence=>96, :x_start=>69, :y_start=>17, :x_end=>100, :y_end=>31}, 119 | { :word => 'are', :confidence=>92, :x_start=>108, :y_start=>17, :x_end=>136, :y_end=>27}, 120 | { :word => 'a', :confidence=>92, :x_start=>133, :y_start=>8, :x_end=>147, :y_end=>35}, 121 | { :word => 'friend,', :confidence=>95, :x_start=>158, :y_start=>13, :x_end=>214, :y_end=>29}, 122 | { :word => 'you', :confidence=>96, :x_start=>51, :y_start=>39, :x_end=>82, :y_end=>53}, 123 | { :word => 'speak', :confidence=>96, 
:x_start=>90, :y_start=>35, :x_end=>140, :y_end=>53}, 124 | { :word => 'the', :confidence=>96, :x_start=>146, :y_start=>35, :x_end=>174, :y_end=>49}, 125 | { :word => 'password,', :confidence=>96, :x_start=>182, :y_start=>35, :x_end=>267, :y_end=>53}, 126 | { :word => 'and', :confidence=>96, :x_start=>51, :y_start=>57, :x_end=>81, :y_end=>71}, 127 | { :word => 'the', :confidence=>96, :x_start=>89, :y_start=>57, :x_end=>117, :y_end=>71}, 128 | { :word => 'doors', :confidence=>96, :x_start=>124, :y_start=>57, :x_end=>172, :y_end=>71}, 129 | { :word => 'will', :confidence=>96, :x_start=>180, :y_start=>57, :x_end=>208, :y_end=>71}, 130 | { :word => 'open.', :confidence=>96, :x_start=>216, :y_start=>61, :x_end=>263, :y_end=>75} 131 | ] 132 | ``` 133 | 134 | ## Development 135 | 136 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 137 | 138 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). 139 | 140 | ## Contributing 141 | 142 | Bug reports and pull requests are welcome on GitHub at https://github.com/dannnylo/rtesseract. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct. 143 | 144 | ## License 145 | 146 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). 
147 | 148 | ## Code of Conduct 149 | 150 | Everyone interacting in the Rtesseract project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/dannnylo/rtesseract/blob/master/CODE_OF_CONDUCT.md). 151 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'bundler/gem_tasks' 4 | require 'rspec/core/rake_task' 5 | 6 | RSpec::Core::RakeTask.new(:spec) 7 | 8 | task default: :spec 9 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'bundler/setup' 5 | require 'rtesseract' 6 | 7 | # You can add fixtures and/or initialization code here to make experimenting 8 | # with your gem easier. You can also use a different console, if you like. 9 | 10 | # (If you use this, don't forget to add pry to your Gemfile!) 
11 | # require "pry" 12 | # Pry.start 13 | 14 | require 'irb' 15 | IRB.start(__FILE__) 16 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /lib/rtesseract.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'rtesseract/check' 4 | require 'rtesseract/configuration' 5 | require 'rtesseract/command' 6 | require 'rtesseract/base' 7 | require 'rtesseract/text' 8 | require 'rtesseract/pdf' 9 | require 'rtesseract/box' 10 | require 'rtesseract/tsv' 11 | 12 | class RTesseract 13 | class Error < StandardError; end 14 | 15 | attr_reader :config, :source, :errors 16 | 17 | def initialize(src = '', options = {}) 18 | @source = src 19 | @config = RTesseract.config.merge(options) 20 | @errors = [] 21 | end 22 | 23 | def to_box 24 | Box.run(@source, @errors, @config) 25 | end 26 | 27 | def words 28 | to_box.map { |word| word[:word] } 29 | end 30 | 31 | def to_pdf 32 | Pdf.run(@source, @errors, @config) 33 | end 34 | 35 | def to_tsv 36 | Tsv.run(@source, @errors, @config) 37 | end 38 | 39 | # Output value 40 | def to_s 41 | Text.run(@source, @errors, @config) 42 | end 43 | 44 | # Remove spaces and break-lines 45 | def to_s_without_spaces 46 | to_s.gsub(/\s/, '') 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /lib/rtesseract/base.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'tmpdir' 4 | require 'securerandom' 5 | require 'pathname' 6 | 7 | class RTesseract 8 | module Base 9 | def 
temp_file_path 10 | Pathname.new(Dir.tmpdir).join("rtesseract_#{SecureRandom.uuid}").to_s 11 | end 12 | 13 | def remove_tmp_file(absolute_file_path) 14 | File.delete(absolute_file_path) if File.file?(absolute_file_path) 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/rtesseract/box.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | class RTesseract 4 | module Box 5 | extend RTesseract::Base 6 | 7 | class << self 8 | def run(source, errors, options) 9 | options = options.merge({ tessedit_create_hocr: 1 }) 10 | 11 | RTesseract::Command.new(source, temp_file_path, errors, options).run do |output_path| 12 | filename = "#{output_path}.hocr" 13 | content = File.read(filename) 14 | remove_tmp_file(filename) 15 | parse(content) 16 | end 17 | end 18 | 19 | def parse(content) 20 | content.lines.map { |line| parse_line(line) }.compact 21 | end 22 | 23 | def parse_line(line) 24 | return unless line.match?(/oc(rx|r)_word/) 25 | 26 | word = line.to_s.scan(/>(.*)= 2.7' 24 | spec.bindir = 'exe' 25 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 26 | spec.require_paths = ['lib'] 27 | 28 | spec.metadata['rubygems_mfa_required'] = 'true' 29 | end 30 | -------------------------------------------------------------------------------- /spec/resources/README.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/README.pdf -------------------------------------------------------------------------------- /spec/resources/blank.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/blank.tif 
-------------------------------------------------------------------------------- /spec/resources/eng.user-words.txt: -------------------------------------------------------------------------------- 1 | you 2 | are 3 | a 4 | friend 5 | you 6 | speak 7 | the 8 | password 9 | and 10 | the 11 | doors 12 | will 13 | open -------------------------------------------------------------------------------- /spec/resources/image_with_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/image_with_error.png -------------------------------------------------------------------------------- /spec/resources/mixed.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/mixed.tif -------------------------------------------------------------------------------- /spec/resources/orientation_reverse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/orientation_reverse.png -------------------------------------------------------------------------------- /spec/resources/test with spaces.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test with spaces.tif -------------------------------------------------------------------------------- /spec/resources/test-pdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test-pdf.png 
-------------------------------------------------------------------------------- /spec/resources/test.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.bmp -------------------------------------------------------------------------------- /spec/resources/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.jpg -------------------------------------------------------------------------------- /spec/resources/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.png -------------------------------------------------------------------------------- /spec/resources/test.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.tif -------------------------------------------------------------------------------- /spec/resources/test1.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test1.tif -------------------------------------------------------------------------------- /spec/resources/test_words.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test_words.png -------------------------------------------------------------------------------- /spec/rtesseract/box_spec.rb: 
-------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe RTesseract::Box do 4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } 5 | let(:words_image) { path.join('resources', 'test_words.png').to_s } 6 | let(:words) { ['If', 'you', 'are', 'a', 'friend,', 'you', 'speak', 'the', 'password,', 'and', 'the', 'doors', 'will', 'open.'] } 7 | let(:instance) { RTesseract.new(words_image) } 8 | 9 | it 'returns the list of words' do 10 | expect(instance.words).to eql(words) 11 | end 12 | 13 | it 'bounding box' do 14 | expect(instance.to_box).to include(word: 'you', x_start: 69, y_start: 17, x_end: 100, y_end: 31, confidence: 96) 15 | end 16 | 17 | it 'removes the temp hocr file' do 18 | initial_count = Dir["#{Dir.tmpdir}/*"].length 19 | instance.to_box 20 | expect(initial_count).to eql(Dir["#{Dir.tmpdir}/*"].length) 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /spec/rtesseract/configuration_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe RTesseract do 4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } 5 | 6 | context 'with global spm' do 7 | before { described_class.configure { |config| config.psm = 7 } } 8 | 9 | it 'gets the global psm value' do 10 | expect(described_class.config.psm).to be(7) 11 | end 12 | 13 | it 'gets instance psm value' do 14 | expect(described_class.new(path, psm: 2).config.psm).to be(2) 15 | end 16 | end 17 | 18 | context 'with default command' do 19 | it 'gets the global psm value' do 20 | expect(described_class.config.command).to eql('tesseract') 21 | end 22 | 23 | it 'gets instance command value' do 24 | expect(described_class.new(path, command: '/usr/bin/tesseract4').config.command).to eql('/usr/bin/tesseract4') 25 | end 26 | end 27 | 28 | context 'with other options' do 29 | it 
'allows to setup oem' do 30 | expect(described_class.new(path, oem: 1).config.oem).to be(1) 31 | end 32 | 33 | it 'allows to setup lang' do 34 | expect(described_class.new(path, lang: 'eng').config.lang).to eql('eng') 35 | end 36 | 37 | it 'allows to setup multiple langs' do 38 | expect(described_class.new(path, lang: 'eng+por').config.lang).to eql('eng+por') 39 | end 40 | end 41 | 42 | context 'when block not given' do 43 | it { expect(described_class.configure).to eq(nil) } 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/rtesseract/pdf_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe RTesseract::Pdf do 4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } 5 | let(:words_image) { path.join('resources', 'test-pdf.png').to_s } 6 | let(:file) { RTesseract.new(words_image).to_pdf } 7 | 8 | after do 9 | file.close 10 | File.unlink(file) 11 | end 12 | 13 | it 'returns a file with extension .pdf' do 14 | expect(File.extname(file.path)).to eql('.pdf') 15 | end 16 | 17 | it 'checks if file pdf exisits' do 18 | expect(File).to exist(file.path) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /spec/rtesseract/text_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe RTesseract::Text do 4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } 5 | let(:image_path) { path.join('resources', 'test.tif').to_s } 6 | let(:pdf_path) { path.join('resources', 'test.tif').to_s } 7 | 8 | let(:words_image) { path.join('resources', 'test_words.png').to_s } 9 | 10 | it 'translate image to text' do 11 | expect(RTesseract.new(image_path).to_s_without_spaces).to eql('43XF') 12 | end 13 | 14 | it 'translate tif image to text' do 15 | 
expect(RTesseract.new(path.join('resources', 'test1.tif').to_s).to_s_without_spaces).to eql('V2V4') 16 | end 17 | 18 | it 'translate tif image with spaces to text' do 19 | expect(RTesseract.new(path.join('resources', 'test with spaces.tif').to_s).to_s_without_spaces).to eql('V2V4') 20 | end 21 | 22 | it 'translate png image with spaces to text' do 23 | expect(RTesseract.new(path.join('resources', 'test.png').to_s).to_s_without_spaces).to eql('HW9W') 24 | end 25 | 26 | it 'translate jpg image with spaces to text' do 27 | expect(RTesseract.new(path.join('resources', 'test.jpg').to_s).to_s_without_spaces).to eql('3R8F') 28 | end 29 | 30 | it 'translate image to text with options' do 31 | expect(RTesseract.new(image_path, psm: 7, oem: 1).to_s_without_spaces).to eql('43XF') 32 | end 33 | 34 | it 'tests output text' do 35 | expect(RTesseract.new(words_image).to_s).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n") 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /spec/rtesseract/tsv_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe RTesseract::Tsv do 4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } 5 | let(:words_image) { path.join('resources', 'test_words.png').to_s } 6 | let(:file) { RTesseract.new(words_image).to_tsv } 7 | 8 | after do 9 | file.close 10 | File.unlink(file) 11 | end 12 | 13 | it 'returns a file with extension .tsv' do 14 | expect(File.extname(file.path)).to eql('.tsv') 15 | end 16 | 17 | it ' support tsv output mode' do 18 | expect(file.read).to include('level page_num block_num par_num line_num word_num left top width height conf text') 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /spec/rtesseract_spec.rb: -------------------------------------------------------------------------------- 1 | # 
frozen_string_literal: true 2 | 3 | RSpec.describe RTesseract do 4 | let(:path) { Pathname.new(__dir__) } 5 | let(:image_path) { path.join('resources', 'test.tif').to_s } 6 | let(:pdf_path) { path.join('resources', 'test.tif').to_s } 7 | let(:words_image) { path.join('resources', 'test_words.png').to_s } 8 | 9 | it 'returns the tesseract version' do 10 | expect(described_class.tesseract_version).to be > 3.05 11 | end 12 | 13 | it 'be instantiable without path' do 14 | expect(described_class.new.class).to eql(described_class) 15 | end 16 | 17 | it 'be instantiable with blank string' do 18 | expect(described_class.new('').class).to eql(described_class) 19 | end 20 | 21 | it 'be instantiable with a path' do 22 | expect(described_class.new(image_path).class).to eql(described_class) 23 | end 24 | 25 | context 'when tesseract not installed' do 26 | before do 27 | described_class.configure { |config| config.command = 'tesseract_not_installed' } 28 | end 29 | 30 | it 'returns zero on #tesseract_version' do 31 | expect(described_class.tesseract_version).to be(0) 32 | end 33 | 34 | it 'raise a error if tesseract version < 3.05' do 35 | expect { described_class.check_version! }.to raise_error(RTesseract::Error) 36 | end 37 | end 38 | 39 | context 'when tesseract installed version is less then 3' do 40 | before do 41 | allow(RTesseract).to receive(:tesseract_version).and_return(2) 42 | end 43 | 44 | it 'raise a error if tesseract version < 3.05' do 45 | expect { described_class.check_version! 
}.to raise_error(RTesseract::Error) 46 | end 47 | end 48 | 49 | context 'when tesseract installed version is greather then 3' do 50 | before do 51 | allow(RTesseract).to receive(:tesseract_version).and_return(4) 52 | end 53 | 54 | it 'raise a error if tesseract version < 3.05' do 55 | expect(described_class.check_version!).to eq(nil) 56 | end 57 | end 58 | 59 | context 'without source' do 60 | let(:instance) { described_class.new } 61 | 62 | it 'raise an exception' do 63 | aggregate_failures 'raise an exception' do 64 | expect { instance.to_s }.to raise_error(RTesseract::Error) 65 | expect(instance.errors.first).to include('Error during processing') 66 | end 67 | end 68 | end 69 | 70 | context 'with errors on image' do 71 | let(:error_intance) do 72 | described_class.new(path.join('resources', 'image_with_error.png').to_s) 73 | end 74 | 75 | it 'stores the error on a variable to debug' do 76 | aggregate_failures 'stores the error on a variable to debug' do 77 | expect(error_intance.to_s_without_spaces).to eql('RTX-0003-03-02-01PRE') 78 | expect(error_intance.errors).to eql(["Error in boxClipToRectangle: box outside rectangle\nError in pixScanForForeground: invalid box\n"]) 79 | end 80 | end 81 | end 82 | 83 | it 'runs multiple types' do 84 | tesseract = RTesseract.new(words_image) 85 | # Check that none of the other options affects the config, making text error out. 
86 | box = tesseract.to_box 87 | pdf = tesseract.to_pdf 88 | tsv = tesseract.to_tsv 89 | 90 | result = tesseract.to_s 91 | expect(result).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n") 92 | expect(box).to be_a(Array) 93 | 94 | expect(pdf).to be_a(File) 95 | expect(File.extname(pdf.path)).to eq('.pdf') 96 | 97 | expect(tsv).to be_a(File) 98 | expect(File.extname(tsv.path)).to eq('.tsv') 99 | 100 | pdf.close 101 | tsv.close 102 | end 103 | end 104 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'bundler/setup' 4 | require 'simplecov' 5 | require 'simplecov-lcov' 6 | 7 | SimpleCov.formatters = SimpleCov::Formatter::MultiFormatter.new([ 8 | SimpleCov::Formatter::HTMLFormatter, 9 | SimpleCov::Formatter::LcovFormatter 10 | ]) 11 | 12 | SimpleCov.start :test_frameworks do 13 | enable_coverage :branch 14 | 15 | minimum_coverage line: 100, branch: 85 16 | end 17 | 18 | require 'rtesseract' 19 | 20 | RSpec.configure do |config| 21 | # Enable flags like --only-failures and --next-failure 22 | config.example_status_persistence_file_path = '.rspec_status' 23 | 24 | # Disable RSpec exposing methods globally on `Module` and `main` 25 | config.disable_monkey_patching! 26 | 27 | config.expect_with :rspec do |c| 28 | c.syntax = :expect 29 | end 30 | 31 | config.before do 32 | RTesseract.reset_config! 33 | end 34 | end 35 | --------------------------------------------------------------------------------