├── .deepsource.toml
├── .document
├── .github
├── FUNDING.yml
└── workflows
│ └── ci.yml
├── .gitignore
├── .hound.yml
├── .rspec
├── .rubocop.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── Gemfile
├── Gemfile.lock
├── LICENSE.txt
├── README.md
├── Rakefile
├── bin
├── console
└── setup
├── lib
├── rtesseract.rb
└── rtesseract
│ ├── base.rb
│ ├── box.rb
│ ├── check.rb
│ ├── command.rb
│ ├── configuration.rb
│ ├── pdf.rb
│ ├── text.rb
│ ├── tsv.rb
│ └── version.rb
├── rtesseract.gemspec
└── spec
├── resources
├── README.pdf
├── blank.tif
├── eng.user-words.txt
├── image_with_error.png
├── mixed.tif
├── orientation_reverse.png
├── test with spaces.tif
├── test-pdf.png
├── test.bmp
├── test.jpg
├── test.png
├── test.tif
├── test1.tif
└── test_words.png
├── rtesseract
├── box_spec.rb
├── configuration_spec.rb
├── pdf_spec.rb
├── text_spec.rb
└── tsv_spec.rb
├── rtesseract_spec.rb
└── spec_helper.rb
/.deepsource.toml:
--------------------------------------------------------------------------------
1 | version = 1
2 |
3 | [[analyzers]]
4 | name = "shell"
5 | enabled = true
6 |
7 | [[analyzers]]
8 | name = "ruby"
9 | enabled = true
--------------------------------------------------------------------------------
/.document:
--------------------------------------------------------------------------------
1 | lib/**/*.rb
2 | bin/*
3 | -
4 | LICENSE.txt
5 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: dannnylo
2 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push]
3 | jobs:
4 | test:
5 | runs-on: ubuntu-latest
6 | strategy:
7 | matrix:
8 | ruby:
9 | - '3.2.0'
10 | - '3.4.2'
11 | repository:
12 | - 'ppa:alex-p/tesseract-ocr5'
13 | - 'ppa:alex-p/tesseract-ocr-devel'
14 | steps:
15 | - uses: actions/checkout@v2
16 | - name: Install tesseract-ocr
17 | run: |
18 | sudo add-apt-repository ${{ matrix.repository }} -y
19 | sudo apt-get update -q
20 | sudo apt-get install tesseract-ocr tesseract-ocr-eng ghostscript -y
21 | tesseract --version
22 | - name: Setup Ruby
23 | uses: ruby/setup-ruby@v1
24 | with:
25 | ruby-version: ${{ matrix.ruby }}
26 | - name: Bundle
27 | run: |
28 | gem uninstall -aIx bundler
29 | gem install bundler
30 | bundle install --jobs 4 --retry 3
31 | - name: Test
32 | run: bundle exec rake
33 | - name: Coverage
34 | env:
35 | CODACY_PROJECT_TOKEN: ${{ secrets.CODACY_PROJECT_TOKEN }}
36 | run: bash <(curl -Ls https://coverage.codacy.com/get.sh) report -l Ruby -r coverage/lcov/*
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /_yardoc/
4 | /coverage/
5 | /coverage.data
6 | /doc/
7 | /pkg/
8 | /spec/reports/
9 | /tmp/
10 |
11 | # rspec failure tracking
12 | .rspec_status
13 | *.gem
--------------------------------------------------------------------------------
/.hound.yml:
--------------------------------------------------------------------------------
1 | rubocop:
2 | config_file: .rubocop.yml
3 | version: 0.80.0
4 |
--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --format documentation
2 | --color
3 | --order rand
4 | --require spec_helper
5 |
--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
1 | AllCops:
2 | NewCops: enable
3 | SuggestExtensions: false
4 |
5 | Layout/LineLength:
6 | Max: 150
7 |
8 | Metrics/BlockLength:
9 | Max: 50
10 |
11 | Metrics/AbcSize:
12 | Max: 30
13 |
14 | Style/Documentation:
15 | Enabled: false
16 | Style/HashEachMethods:
17 | Enabled: true
18 |
19 | Style/HashTransformKeys:
20 | Enabled: true
21 |
22 | Style/HashTransformValues:
23 | Enabled: true
24 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changes
2 |
3 | ## v3.1.4
4 | #### Changed
5 | * Temporary hocr file is deleted after the file is processed.
6 |
7 | ## v3.1.3
8 | * Fixed a configuration error that prevented different kinds of calls on the same object; for example, calling `.to_box` and then `.to_s` would result in unexpected behavior.
9 |
10 | ## v3.1.2
11 |
12 | #### Added
13 |
14 | * Added confidence for each word in box mode
15 |
16 | ## v3.1.1
17 |
18 | #### Changed
19 |
20 | * Changed RTesseract::Command to accept a block that is called when the command runs successfully
21 |
22 | ## v3.1.0
23 |
24 | #### Changed
25 |
26 | * Removed nokogiri dependency.
27 |
28 | ## v3.0.4
29 |
30 | #### Changed
31 |
32 | * Updated dependencies by security alerts.
33 |
34 | ## v3.0.5
35 |
36 | #### Changed
37 |
38 | * Updated dependencies by security alerts.
39 |
40 | ## v3.0.4
41 |
42 | #### Changed
43 |
44 | * Updated dependencies by security alerts.
45 |
46 | ## v3.0.3
47 |
48 | #### Changed
49 |
50 | * Fix some problems with commanders gem
51 |
52 | ## v3.0.0
53 |
54 | #### Changed
55 |
56 | * Refactored the whole gem to work with Tesseract version 4 or above
57 |
58 | ## v2.1.0
59 |
60 | #### Added
61 |
62 | * Support for generating searchable PDFs
63 |
64 | ## v2.0.1
65 |
66 | #### Changed
67 |
68 | * Refactoring of some small classes
69 |
70 | ## v2.0.0
71 |
72 | #### Added
73 |
74 | * Support for the options --tessdata-dir, --user-words and --user-patterns
75 | * Added Ruby 2.3.0 to the Travis tests.
76 |
77 | #### Changed
78 |
79 | * Refactoring of some classes
80 | * The crop option is now a hash with x, y, w, h keys.
81 | * Areas of RTesseract::Mixed now use :w instead of :width and :h instead of :height.
82 |
83 | #### Removed
84 |
85 | * Support for the quick_magick gem.
86 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at daniloj.dasilva@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 |
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | source 'https://rubygems.org'
4 |
5 | git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
6 |
7 | # Specify your gem's dependencies in rtesseract.gemspec
8 | gemspec
9 |
10 | group :development, :test do
11 | gem 'bundler', '~> 2'
12 | gem 'rake'
13 | gem 'rspec'
14 |
15 | gem 'simplecov'
16 | gem 'simplecov-cobertura'
17 | gem 'simplecov-lcov'
18 | end
19 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | PATH
2 | remote: .
3 | specs:
4 | rtesseract (3.1.4)
5 |
6 | GEM
7 | remote: https://rubygems.org/
8 | specs:
9 | diff-lcs (1.4.4)
10 | docile (1.4.0)
11 | rake (13.0.6)
12 | rspec (3.10.0)
13 | rspec-core (~> 3.10.0)
14 | rspec-expectations (~> 3.10.0)
15 | rspec-mocks (~> 3.10.0)
16 | rspec-core (3.10.1)
17 | rspec-support (~> 3.10.0)
18 | rspec-expectations (3.10.1)
19 | diff-lcs (>= 1.2.0, < 2.0)
20 | rspec-support (~> 3.10.0)
21 | rspec-mocks (3.10.2)
22 | diff-lcs (>= 1.2.0, < 2.0)
23 | rspec-support (~> 3.10.0)
24 | rspec-support (3.10.2)
25 | simplecov (0.21.2)
26 | docile (~> 1.1)
27 | simplecov-html (~> 0.11)
28 | simplecov_json_formatter (~> 0.1)
29 | simplecov-cobertura (1.4.2)
30 | simplecov (~> 0.8)
31 | simplecov-html (0.12.3)
32 | simplecov-lcov (0.8.0)
33 | simplecov_json_formatter (0.1.3)
34 |
35 | PLATFORMS
36 | ruby
37 |
38 | DEPENDENCIES
39 | bundler (~> 2)
40 | rake
41 | rspec
42 | rtesseract!
43 | simplecov
44 | simplecov-cobertura
45 | simplecov-lcov
46 |
47 | BUNDLED WITH
48 | 2.4.20
49 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2018 Danilo Jeremias da Silva
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RTesseract
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | Ruby library for working with the Tesseract OCR.
20 |
21 | ## Installation
22 |
23 | Check if tesseract ocr programs are installed:
24 |
25 | $ tesseract --version
26 |
27 | If not, you can install them with a command like:
28 |
29 | $ apt install tesseract-ocr
30 |
31 | or
32 |
33 | $ brew install tesseract
34 |
35 | or, on Heroku-22, add the buildpack https://github.com/pathwaysmedical/heroku-buildpack-tesseract
36 |
37 |
38 | Add this line to your application's Gemfile:
39 |
40 | ```ruby
41 | gem 'rtesseract'
42 | ```
43 |
44 | And then execute:
45 |
46 | $ bundle
47 |
48 | Or install it yourself as:
49 |
50 | $ gem install rtesseract
51 |
52 | ## Usage
53 |
54 | It's very simple to use rtesseract.
55 |
56 | ### Convert image to string
57 |
58 | ```ruby
59 | image = RTesseract.new("my_image.jpg")
60 | image.to_s # Getting the value
61 | ```
62 |
63 | ### Convert image to searchable PDF
64 |
65 | ```ruby
66 | image = RTesseract.new("my_image.jpg")
67 | image.to_pdf # Getting open file of pdf
68 | ```
69 |
70 | ### Convert image to TSV
71 |
72 | ```ruby
73 | image = RTesseract.new("my_image.jpg")
74 | image.to_tsv # Getting open file of tsv
75 | ```
76 |
77 | Converting to PDF preserves the image colors, pictures and structure in the generated PDF.
78 |
79 | ## Options
80 |
81 | ### Language
82 |
83 | ```ruby
84 | RTesseract.new('test.jpg', lang: 'deu')
85 | ```
86 |
87 | * eng - English
88 | * deu - German
89 | * deu-f - German fraktur
90 | * fra - French
91 | * ita - Italian
92 | * nld - Dutch
93 | * por - Portuguese
94 | * spa - Spanish
95 | * vie - Vietnamese
96 | * or any other supported by tesseract.
97 |
98 | Note: Make sure the language data you request is installed for Tesseract.
99 |
100 | ### Other options
101 |
102 | ```ruby
103 | RTesseract.new('test.jpg', config_file: :digits) # Only digit recognition
104 | ```
105 |
106 | OR
107 |
108 | ```ruby
109 | RTesseract.new('test.jpg', config_file: 'digits quiet')
110 | ```
111 |
112 | ### BOUNDING BOX: TO GET WORDS WITH THEIR POSITIONS
113 |
114 | ```ruby
115 | RTesseract.new('test_words.png').to_box
116 | => [
117 | { :word => 'If', :confidence=>89, :x_start=>52, :y_start=>13, :x_end=>63, :y_end=>27},
118 | { :word => 'you', :confidence=>96, :x_start=>69, :y_start=>17, :x_end=>100, :y_end=>31},
119 | { :word => 'are', :confidence=>92, :x_start=>108, :y_start=>17, :x_end=>136, :y_end=>27},
120 | { :word => 'a', :confidence=>92, :x_start=>133, :y_start=>8, :x_end=>147, :y_end=>35},
121 | { :word => 'friend,', :confidence=>95, :x_start=>158, :y_start=>13, :x_end=>214, :y_end=>29},
122 | { :word => 'you', :confidence=>96, :x_start=>51, :y_start=>39, :x_end=>82, :y_end=>53},
123 | { :word => 'speak', :confidence=>96, :x_start=>90, :y_start=>35, :x_end=>140, :y_end=>53},
124 | { :word => 'the', :confidence=>96, :x_start=>146, :y_start=>35, :x_end=>174, :y_end=>49},
125 | { :word => 'password,', :confidence=>96, :x_start=>182, :y_start=>35, :x_end=>267, :y_end=>53},
126 | { :word => 'and', :confidence=>96, :x_start=>51, :y_start=>57, :x_end=>81, :y_end=>71},
127 | { :word => 'the', :confidence=>96, :x_start=>89, :y_start=>57, :x_end=>117, :y_end=>71},
128 | { :word => 'doors', :confidence=>96, :x_start=>124, :y_start=>57, :x_end=>172, :y_end=>71},
129 | { :word => 'will', :confidence=>96, :x_start=>180, :y_start=>57, :x_end=>208, :y_end=>71},
130 | { :word => 'open.', :confidence=>96, :x_start=>216, :y_start=>61, :x_end=>263, :y_end=>75}
131 | ]
132 | ```
133 |
134 | ## Development
135 |
136 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
137 |
138 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
139 |
140 | ## Contributing
141 |
142 | Bug reports and pull requests are welcome on GitHub at https://github.com/dannnylo/rtesseract. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
143 |
144 | ## License
145 |
146 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
147 |
148 | ## Code of Conduct
149 |
150 | Everyone interacting in the Rtesseract project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/dannnylo/rtesseract/blob/master/CODE_OF_CONDUCT.md).
151 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'bundler/gem_tasks'
4 | require 'rspec/core/rake_task'
5 |
6 | RSpec::Core::RakeTask.new(:spec)
7 |
8 | task default: :spec
9 |
--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'bundler/setup'
5 | require 'rtesseract'
6 |
7 | # You can add fixtures and/or initialization code here to make experimenting
8 | # with your gem easier. You can also use a different console, if you like.
9 |
10 | # (If you use this, don't forget to add pry to your Gemfile!)
11 | # require "pry"
12 | # Pry.start
13 |
14 | require 'irb'
15 | IRB.start(__FILE__)
16 |
--------------------------------------------------------------------------------
/bin/setup:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euo pipefail
3 | IFS=$'\n\t'
4 | set -vx
5 |
6 | bundle install
7 |
8 | # Do any other automated setup that you need to do here
9 |
--------------------------------------------------------------------------------
/lib/rtesseract.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'rtesseract/check'
4 | require 'rtesseract/configuration'
5 | require 'rtesseract/command'
6 | require 'rtesseract/base'
7 | require 'rtesseract/text'
8 | require 'rtesseract/pdf'
9 | require 'rtesseract/box'
10 | require 'rtesseract/tsv'
11 |
# Entry point of the gem: wraps one source image plus the options used
# for every Tesseract invocation made through this object.
class RTesseract
  # Raised when the Tesseract command exits unsuccessfully.
  class Error < StandardError; end

  attr_reader :config, :source, :errors

  # @param src [String] path of the image handed to Tesseract
  # @param options [Hash] per-instance overrides merged over RTesseract.config
  def initialize(src = '', options = {})
    @errors = []
    @config = RTesseract.config.merge(options)
    @source = src
  end

  # Output value
  def to_s
    Text.run(source, errors, config)
  end

  # Remove spaces and break-lines
  def to_s_without_spaces
    to_s.gsub(/\s/, '')
  end

  # @return [Array<Hash>] one hash per recognized word (see Box.word_info)
  def to_box
    Box.run(source, errors, config)
  end

  # @return [Array<String>] only the :word entry of every box
  def words
    to_box.map { |entry| entry[:word] }
  end

  # @return [File] open handle on the generated PDF
  def to_pdf
    Pdf.run(source, errors, config)
  end

  # @return [File] open handle on the generated TSV
  def to_tsv
    Tsv.run(source, errors, config)
  end
end
49 |
--------------------------------------------------------------------------------
/lib/rtesseract/base.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'tmpdir'
4 | require 'securerandom'
5 | require 'pathname'
6 |
class RTesseract
  # Temp-file helpers shared by the output modules that extend this module.
  module Base
    # Builds a fresh, unique path (without extension) inside the system
    # temporary directory. Nothing is created on disk.
    #
    # @return [String]
    def temp_file_path
      Pathname.new(Dir.tmpdir).join("rtesseract_#{SecureRandom.uuid}").to_s
    end

    # Deletes +absolute_file_path+ when it is a regular file; anything else
    # (missing path, directory) is left untouched.
    #
    # @param absolute_file_path [String]
    # @return [Integer, nil] result of File.delete, or nil when nothing was removed
    def remove_tmp_file(absolute_file_path)
      File.delete(absolute_file_path) if File.file?(absolute_file_path)
    rescue Errno::ENOENT
      # The file vanished between the check and the delete; the goal
      # (file is gone) is already met, so this is not an error.
      nil
    end
  end
end
18 |
--------------------------------------------------------------------------------
/lib/rtesseract/box.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
class RTesseract
  # Runs Tesseract in hOCR mode and turns the output into a list of words
  # with their bounding boxes and confidence.
  module Box
    extend RTesseract::Base

    class << self
      # @param source [String] image path passed to Tesseract
      # @param errors [Array] collector that receives the command's stderr
      # @param options [#merge] configuration; tessedit_create_hocr is forced to 1
      # @return [Array<Hash>] see {word_info}
      # @raise [RTesseract::Error] when the command fails (raised by Command#run)
      def run(source, errors, options)
        options = options.merge({ tessedit_create_hocr: 1 })

        RTesseract::Command.new(source, temp_file_path, errors, options).run do |output_path|
          filename = "#{output_path}.hocr"
          content = File.read(filename)
          # The hOCR file is only an intermediate artifact; drop it once read.
          remove_tmp_file(filename)
          parse(content)
        end
      end

      # Parses hOCR content line by line, keeping only lines that yield a word.
      def parse(content)
        content.lines.filter_map { |line| parse_line(line) }
      end

      # @return [Hash, nil] word info, or nil when the line is not a word
      #   element or the word is blank
      def parse_line(line)
        return unless line.match?(/oc(rx|r)_word/)

        # The word is the text between the first '>' and the last '<' on the line.
        word = line.to_s.scan(/>(.*)</).flatten.first.to_s

        return if word.strip == ''

        word_info(word, parse_position(line), parse_confidence(line))
      end

      # positions[0] is the leading title/bbox token, so the coordinates
      # start at index 1; the confidence is the last token of its segment.
      def word_info(word, positions, confidence)
        {
          word: word,
          confidence: confidence[-1].to_i,
          x_start: positions[1].to_i,
          y_start: positions[2].to_i,
          x_end: positions[3].to_i,
          y_end: positions[4].to_i
        }
      end

      # Tokens between "title" and the first ';' on the line.
      def parse_position(line)
        line.match(/(?<=title)(.*?)(?=;)/).to_s.split
      end

      # Tokens between the first ';' and the following single quote.
      def parse_confidence(line)
        line.match(/(?<=;)(.*?)(?=')/).to_s.split
      end
    end
  end
end
54 |
--------------------------------------------------------------------------------
/lib/rtesseract/check.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
# Open3 is used below; require it here so this file does not depend on
# another file having loaded it first.
require 'open3'

class RTesseract
  class << self
    # Runs the configured command with --version and returns the first
    # "major.minor" number found in its output.
    #
    # @return [Numeric] the version as a Float; 0 when the command cannot be
    #   found, 0.0 when its output contains no version number
    def tesseract_version
      output, = Open3.capture2e(RTesseract.config.command, '--version')
      # Escaped dot: only a literal "." separates major and minor.
      # String#[] yields nil on no match, and nil.to_f is 0.0.
      output.to_s[/\d+\.\d+/].to_f
    rescue Errno::ENOENT
      0
    end

    # @raise [RTesseract::Error] when the detected version is below 3.05
    def check_version!
      raise RTesseract::Error, 'Tesseract OCR 3.5 or later not installed' if RTesseract.tesseract_version < 3.05
    end
  end
end
16 |
--------------------------------------------------------------------------------
/lib/rtesseract/command.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
# Open3 is used by #run; require it here so this file does not depend on
# another file having loaded it first.
require 'open3'

class RTesseract
  # Builds and executes one Tesseract command line.
  class Command
    # Option keys with a dedicated command-line flag; every other option
    # is passed through as "-c key=value".
    FIXED = %i[command psm oem lang tessdata_dir user_words user_patterns config_file].freeze

    attr_reader :options

    # @param source [String] input image path
    # @param output_path [String] output base path (or 'stdout')
    # @param errors [Array] collector that receives the command's stderr
    # @param options [#command, #to_h] configuration object
    def initialize(source, output_path, errors, options)
      @source = source
      @output_path = output_path
      @options = options
      @errors = errors
      @full_command = base_command
    end

    # Assembles the argument list. It is rebuilt from scratch on every call,
    # so calling this method more than once no longer duplicates the flags.
    #
    # @return [Array] command, source, output path, then one nested array per option
    def full_command
      @full_command = base_command

      add_option('--psm', options.psm)
      add_option('--oem', options.oem)
      add_option('-l', options.lang)
      add_option('--tessdata-dir', options.tessdata_dir)
      add_option('--user-words', options.user_words)
      add_option('--user-patterns', options.user_patterns)

      other_configs

      # The config file name goes last, without a flag.
      add_option(options.config_file)

      @full_command
    end

    # Appends the stringified arguments as one group, unless the last one
    # (the value) is nil or false.
    def add_option(*args)
      return unless args.last

      @full_command << args.map(&:to_s)
    end

    # Adds every option that has no dedicated flag as "-c key=value".
    def other_configs
      @options.to_h.each do |key, value|
        next if FIXED.include?(key)

        add_option('-c', "#{key}=#{value}")
      end
    end

    # Executes the command. stderr is always appended to the errors collector.
    #
    # @yield [output_path] on success, when a block is given
    # @return [Object, String] the block's result, or stdout when no block is given
    # @raise [RTesseract::Error] with stderr as message when the command fails
    def run
      output, error, status = Open3.capture3(*full_command.flatten)

      @errors.push(error)

      if status.success?
        return yield(@output_path) if block_given?

        return output
      end

      raise RTesseract::Error, error
    end

    private

    # The fixed leading part of every command line.
    def base_command
      [options.command, @source, @output_path]
    end
  end
end
61 |
--------------------------------------------------------------------------------
/lib/rtesseract/configuration.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'ostruct'
4 |
class RTesseract
  # Open-ended option bag: any key can be set and is later turned into
  # a Tesseract command-line argument by RTesseract::Command.
  class Configuration < OpenStruct
    # Returns a new Configuration with +options+ layered over this one;
    # the receiver is not modified.
    #
    # @param options [Hash]
    # @return [RTesseract::Configuration]
    def merge(options)
      RTesseract::Configuration.new(to_h.merge(options))
    end

    # Reads the :command entry straight from the underlying table.
    def command
      @table[:command]
    end
  end

  class << self
    # Global defaults, created on first use.
    #
    # @return [RTesseract::Configuration]
    def config
      @config ||= RTesseract::Configuration.new(
        command: 'tesseract',
        # File::NULL is '/dev/null' on Unix and the platform's null device
        # elsewhere, so the default is not tied to one operating system.
        debug_file: File::NULL
      )
    end

    # Yields the global configuration for in-place changes.
    def configure
      yield(config) if block_given?
    end

    # Drops the global configuration; the next call to config rebuilds the defaults.
    def reset_config!
      @config = nil
    end
  end
end
33 |
--------------------------------------------------------------------------------
/lib/rtesseract/pdf.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
class RTesseract
  # Runs Tesseract with PDF output enabled.
  module Pdf
    extend Base

    class << self
      # @param source [String] image path passed to Tesseract
      # @param errors [Array] collector that receives the command's stderr
      # @param options [#merge] configuration; tessedit_create_pdf is forced to 1
      # @return [File] read-only handle on the generated ".pdf" file; it is
      #   returned open, so closing it is up to the caller
      # @raise [RTesseract::Error] when the command fails (raised by Command#run)
      def run(source, errors, options)
        pdf_options = options.merge({ tessedit_create_pdf: 1 })
        command = RTesseract::Command.new(source, temp_file_path, errors, pdf_options)

        command.run { |base_path| File.open("#{base_path}.pdf", 'r') }
      end
    end
  end
end
16 |
--------------------------------------------------------------------------------
/lib/rtesseract/text.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'open3'
4 |
class RTesseract
  # Runs Tesseract with 'stdout' as the output target and returns the text.
  module Text
    class << self
      # @param source [String] image path passed to Tesseract
      # @param errors [Array] collector that receives the command's stderr
      # @param options [Object] configuration handed to RTesseract::Command
      # @return [String, Object] the command output with form feeds removed
      #   when it is a String; any other value is returned as is
      # @raise [RTesseract::Error] when the command fails (raised by Command#run)
      def run(source, errors, options)
        result = RTesseract::Command.new(source, 'stdout', errors, options).run
        return result unless result.is_a?(String)

        result.delete("\f")
      end
    end
  end
end
14 |
--------------------------------------------------------------------------------
/lib/rtesseract/tsv.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
class RTesseract
  # Runs Tesseract with TSV output enabled.
  module Tsv
    extend Base

    class << self
      # @param source [String] image path passed to Tesseract
      # @param errors [Array] collector that receives the command's stderr
      # @param options [#merge] configuration; tessedit_create_tsv is forced to 1
      # @return [File] read-only handle on the generated ".tsv" file; it is
      #   returned open, so closing it is up to the caller
      # @raise [RTesseract::Error] when the command fails (raised by Command#run)
      def run(source, errors, options)
        tsv_options = options.merge({ tessedit_create_tsv: 1 })
        command = RTesseract::Command.new(source, temp_file_path, errors, tsv_options)

        command.run { |base_path| File.open("#{base_path}.tsv", 'r') }
      end
    end
  end
end
21 |
--------------------------------------------------------------------------------
/lib/rtesseract/version.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | class RTesseract
4 | VERSION = '3.1.4'
5 | end
6 |
--------------------------------------------------------------------------------
/rtesseract.gemspec:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | lib = File.expand_path('lib', __dir__)
4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5 | require 'rtesseract/version'
6 |
7 | Gem::Specification.new do |spec|
8 | spec.name = 'rtesseract'
9 | spec.version = RTesseract::VERSION
10 | spec.authors = ['Danilo Jeremias da Silva']
11 | spec.email = ['dannnylo@gmail.com']
12 |
13 | spec.summary = 'Ruby library for working with the Tesseract OCR.'
14 | spec.description = 'Ruby library for working with the Tesseract OCR.'
15 | spec.homepage = 'http://github.com/dannnylo/rtesseract'
16 | spec.license = 'MIT'
17 |
18 | # Specify which files should be added to the gem when it is released.
19 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
20 | spec.files = Dir.chdir(File.expand_path(__dir__)) do
21 | `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
22 | end
23 | spec.required_ruby_version = '>= 2.7'
24 | spec.bindir = 'exe'
25 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
26 | spec.require_paths = ['lib']
27 |
28 | spec.metadata['rubygems_mfa_required'] = 'true'
29 | end
30 |
--------------------------------------------------------------------------------
/spec/resources/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/README.pdf
--------------------------------------------------------------------------------
/spec/resources/blank.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/blank.tif
--------------------------------------------------------------------------------
/spec/resources/eng.user-words.txt:
--------------------------------------------------------------------------------
1 | you
2 | are
3 | a
4 | friend
5 | you
6 | speak
7 | the
8 | password
9 | and
10 | the
11 | doors
12 | will
13 | open
--------------------------------------------------------------------------------
/spec/resources/image_with_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/image_with_error.png
--------------------------------------------------------------------------------
/spec/resources/mixed.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/mixed.tif
--------------------------------------------------------------------------------
/spec/resources/orientation_reverse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/orientation_reverse.png
--------------------------------------------------------------------------------
/spec/resources/test with spaces.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test with spaces.tif
--------------------------------------------------------------------------------
/spec/resources/test-pdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test-pdf.png
--------------------------------------------------------------------------------
/spec/resources/test.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.bmp
--------------------------------------------------------------------------------
/spec/resources/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.jpg
--------------------------------------------------------------------------------
/spec/resources/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.png
--------------------------------------------------------------------------------
/spec/resources/test.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test.tif
--------------------------------------------------------------------------------
/spec/resources/test1.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test1.tif
--------------------------------------------------------------------------------
/spec/resources/test_words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dannnylo/rtesseract/1f51d6fc5f4d0733a79359939f1a11592d12aa8b/spec/resources/test_words.png
--------------------------------------------------------------------------------
/spec/rtesseract/box_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | RSpec.describe RTesseract::Box do
4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
5 | let(:words_image) { path.join('resources', 'test_words.png').to_s }
6 | let(:words) { ['If', 'you', 'are', 'a', 'friend,', 'you', 'speak', 'the', 'password,', 'and', 'the', 'doors', 'will', 'open.'] }
7 | let(:instance) { RTesseract.new(words_image) }
8 |
9 | it 'returns the list of words' do
10 | expect(instance.words).to eql(words)
11 | end
12 |
13 | it 'bounding box' do
14 | expect(instance.to_box).to include(word: 'you', x_start: 69, y_start: 17, x_end: 100, y_end: 31, confidence: 96)
15 | end
16 |
17 | it 'removes the temp hocr file' do
18 | initial_count = Dir["#{Dir.tmpdir}/*"].length
19 | instance.to_box
20 | expect(initial_count).to eql(Dir["#{Dir.tmpdir}/*"].length)
21 | end
22 | end
23 |
--------------------------------------------------------------------------------
/spec/rtesseract/configuration_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | RSpec.describe RTesseract do
4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
5 |
6 | context 'with global spm' do
7 | before { described_class.configure { |config| config.psm = 7 } }
8 |
9 | it 'gets the global psm value' do
10 | expect(described_class.config.psm).to be(7)
11 | end
12 |
13 | it 'gets instance psm value' do
14 | expect(described_class.new(path, psm: 2).config.psm).to be(2)
15 | end
16 | end
17 |
18 | context 'with default command' do
19 | it 'gets the global psm value' do
20 | expect(described_class.config.command).to eql('tesseract')
21 | end
22 |
23 | it 'gets instance command value' do
24 | expect(described_class.new(path, command: '/usr/bin/tesseract4').config.command).to eql('/usr/bin/tesseract4')
25 | end
26 | end
27 |
28 | context 'with other options' do
29 | it 'allows to setup oem' do
30 | expect(described_class.new(path, oem: 1).config.oem).to be(1)
31 | end
32 |
33 | it 'allows to setup lang' do
34 | expect(described_class.new(path, lang: 'eng').config.lang).to eql('eng')
35 | end
36 |
37 | it 'allows to setup multiple langs' do
38 | expect(described_class.new(path, lang: 'eng+por').config.lang).to eql('eng+por')
39 | end
40 | end
41 |
42 | context 'when block not given' do
43 | it { expect(described_class.configure).to eq(nil) }
44 | end
45 | end
46 |
--------------------------------------------------------------------------------
/spec/rtesseract/pdf_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | RSpec.describe RTesseract::Pdf do
4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
5 | let(:words_image) { path.join('resources', 'test-pdf.png').to_s }
6 | let(:file) { RTesseract.new(words_image).to_pdf }
7 |
8 | after do
9 | file.close
10 | File.unlink(file)
11 | end
12 |
13 | it 'returns a file with extension .pdf' do
14 | expect(File.extname(file.path)).to eql('.pdf')
15 | end
16 |
17 | it 'checks if file pdf exisits' do
18 | expect(File).to exist(file.path)
19 | end
20 | end
21 |
--------------------------------------------------------------------------------
/spec/rtesseract/text_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | RSpec.describe RTesseract::Text do
4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
5 | let(:image_path) { path.join('resources', 'test.tif').to_s }
6 | let(:pdf_path) { path.join('resources', 'test.tif').to_s }
7 |
8 | let(:words_image) { path.join('resources', 'test_words.png').to_s }
9 |
10 | it 'translate image to text' do
11 | expect(RTesseract.new(image_path).to_s_without_spaces).to eql('43XF')
12 | end
13 |
14 | it 'translate tif image to text' do
15 | expect(RTesseract.new(path.join('resources', 'test1.tif').to_s).to_s_without_spaces).to eql('V2V4')
16 | end
17 |
18 | it 'translate tif image with spaces to text' do
19 | expect(RTesseract.new(path.join('resources', 'test with spaces.tif').to_s).to_s_without_spaces).to eql('V2V4')
20 | end
21 |
22 | it 'translate png image with spaces to text' do
23 | expect(RTesseract.new(path.join('resources', 'test.png').to_s).to_s_without_spaces).to eql('HW9W')
24 | end
25 |
26 | it 'translate jpg image with spaces to text' do
27 | expect(RTesseract.new(path.join('resources', 'test.jpg').to_s).to_s_without_spaces).to eql('3R8F')
28 | end
29 |
30 | it 'translate image to text with options' do
31 | expect(RTesseract.new(image_path, psm: 7, oem: 1).to_s_without_spaces).to eql('43XF')
32 | end
33 |
34 | it 'tests output text' do
35 | expect(RTesseract.new(words_image).to_s).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n")
36 | end
37 | end
38 |
--------------------------------------------------------------------------------
/spec/rtesseract/tsv_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | RSpec.describe RTesseract::Tsv do
4 | let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
5 | let(:words_image) { path.join('resources', 'test_words.png').to_s }
6 | let(:file) { RTesseract.new(words_image).to_tsv }
7 |
8 | after do
9 | file.close
10 | File.unlink(file)
11 | end
12 |
13 | it 'returns a file with extension .tsv' do
14 | expect(File.extname(file.path)).to eql('.tsv')
15 | end
16 |
17 | it ' support tsv output mode' do
18 | expect(file.read).to include('level page_num block_num par_num line_num word_num left top width height conf text')
19 | end
20 | end
21 |
--------------------------------------------------------------------------------
/spec/rtesseract_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | RSpec.describe RTesseract do
4 | let(:path) { Pathname.new(__dir__) }
5 | let(:image_path) { path.join('resources', 'test.tif').to_s }
6 | let(:pdf_path) { path.join('resources', 'test.tif').to_s }
7 | let(:words_image) { path.join('resources', 'test_words.png').to_s }
8 |
9 | it 'returns the tesseract version' do
10 | expect(described_class.tesseract_version).to be > 3.05
11 | end
12 |
13 | it 'be instantiable without path' do
14 | expect(described_class.new.class).to eql(described_class)
15 | end
16 |
17 | it 'be instantiable with blank string' do
18 | expect(described_class.new('').class).to eql(described_class)
19 | end
20 |
21 | it 'be instantiable with a path' do
22 | expect(described_class.new(image_path).class).to eql(described_class)
23 | end
24 |
25 | context 'when tesseract not installed' do
26 | before do
27 | described_class.configure { |config| config.command = 'tesseract_not_installed' }
28 | end
29 |
30 | it 'returns zero on #tesseract_version' do
31 | expect(described_class.tesseract_version).to be(0)
32 | end
33 |
34 | it 'raise a error if tesseract version < 3.05' do
35 | expect { described_class.check_version! }.to raise_error(RTesseract::Error)
36 | end
37 | end
38 |
39 | context 'when tesseract installed version is less then 3' do
40 | before do
41 | allow(RTesseract).to receive(:tesseract_version).and_return(2)
42 | end
43 |
44 | it 'raise a error if tesseract version < 3.05' do
45 | expect { described_class.check_version! }.to raise_error(RTesseract::Error)
46 | end
47 | end
48 |
49 | context 'when tesseract installed version is greather then 3' do
50 | before do
51 | allow(RTesseract).to receive(:tesseract_version).and_return(4)
52 | end
53 |
54 | it 'raise a error if tesseract version < 3.05' do
55 | expect(described_class.check_version!).to eq(nil)
56 | end
57 | end
58 |
59 | context 'without source' do
60 | let(:instance) { described_class.new }
61 |
62 | it 'raise an exception' do
63 | aggregate_failures 'raise an exception' do
64 | expect { instance.to_s }.to raise_error(RTesseract::Error)
65 | expect(instance.errors.first).to include('Error during processing')
66 | end
67 | end
68 | end
69 |
70 | context 'with errors on image' do
71 | let(:error_intance) do
72 | described_class.new(path.join('resources', 'image_with_error.png').to_s)
73 | end
74 |
75 | it 'stores the error on a variable to debug' do
76 | aggregate_failures 'stores the error on a variable to debug' do
77 | expect(error_intance.to_s_without_spaces).to eql('RTX-0003-03-02-01PRE')
78 | expect(error_intance.errors).to eql(["Error in boxClipToRectangle: box outside rectangle\nError in pixScanForForeground: invalid box\n"])
79 | end
80 | end
81 | end
82 |
83 | it 'runs multiple types' do
84 | tesseract = RTesseract.new(words_image)
85 | # Check that none of the other options affects the config, making text error out.
86 | box = tesseract.to_box
87 | pdf = tesseract.to_pdf
88 | tsv = tesseract.to_tsv
89 |
90 | result = tesseract.to_s
91 | expect(result).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n")
92 | expect(box).to be_a(Array)
93 |
94 | expect(pdf).to be_a(File)
95 | expect(File.extname(pdf.path)).to eq('.pdf')
96 |
97 | expect(tsv).to be_a(File)
98 | expect(File.extname(tsv.path)).to eq('.tsv')
99 |
100 | pdf.close
101 | tsv.close
102 | end
103 | end
104 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'bundler/setup'
4 | require 'simplecov'
5 | require 'simplecov-lcov'
6 |
7 | SimpleCov.formatters = SimpleCov::Formatter::MultiFormatter.new([ # combine the two formatters listed below
8 | SimpleCov::Formatter::HTMLFormatter,
9 | SimpleCov::Formatter::LcovFormatter
10 | ])
11 |
12 | SimpleCov.start :test_frameworks do # start coverage with the :test_frameworks profile
13 | enable_coverage :branch # measure branch coverage in addition to line coverage
14 |
15 | minimum_coverage line: 100, branch: 85 # minimum thresholds: 100% line, 85% branch
16 | end
17 |
18 | require 'rtesseract' # kept after SimpleCov.start; presumably so the library itself is measured (do not reorder)
19 |
20 | RSpec.configure do |config|
21 | # Enable flags like --only-failures and --next-failure
22 | config.example_status_persistence_file_path = '.rspec_status'
23 |
24 | # Disable RSpec exposing methods globally on `Module` and `main`
25 | config.disable_monkey_patching!
26 |
27 | config.expect_with :rspec do |c|
28 | c.syntax = :expect # enable only the expect(...) syntax
29 | end
30 | # Specs change settings through RTesseract.configure; reset_config! presumably restores the defaults (confirm in lib).
31 | config.before do
32 | RTesseract.reset_config!
33 | end
34 | end
35 |
--------------------------------------------------------------------------------