├── .codeclimate.yml ├── .formatter.exs ├── .github └── FUNDING.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── config └── config.exs ├── lib ├── tesseract_ocr.ex └── tesseract_ocr │ ├── pdf.ex │ ├── tsv.ex │ ├── utils.ex │ └── words.ex ├── mix.exs ├── mix.lock └── test ├── resources ├── blank.tif ├── test.png ├── test.tif └── world.png ├── tesseract_ocr ├── pdf_test.exs ├── tsv_test.exs ├── utils_test.exs └── words_test.exs ├── tesseract_ocr_test.exs ├── test.tsv └── test_helper.exs /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | plugins: 2 | credo: 3 | enabled: true 4 | -------------------------------------------------------------------------------- /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["mix.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: dannnylo 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 
23 | tesseract_ocr-*.tar 24 | 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: bionic 2 | language: elixir 3 | 4 | matrix: 5 | include: 6 | - elixir: 1.7.0 7 | otp_release: 22.0 8 | 9 | - elixir: 1.9.0 10 | otp_release: 22.0 11 | 12 | script: 13 | - mix test 14 | 15 | addons: 16 | apt: 17 | packages: 18 | - tesseract-ocr 19 | env: 20 | global: 21 | - CC_TEST_REPORTER_ID=d263af6633d496aebee2d97ddc12eccf98ab39e72de1734b55023d587d928ed1 22 | 23 | before_install: 24 | - sudo add-apt-repository ppa:alex-p/tesseract-ocr -y 25 | - sudo apt-get update -q 26 | - sudo apt-get install tesseract-ocr tesseract-ocr-eng ghostscript -y 27 | - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter 28 | - chmod +x ./cc-test-reporter 29 | - ./cc-test-reporter before-build 30 | 31 | after_script: 32 | - mix deps.get --only docs 33 | - MIX_ENV=test mix credo --strict 34 | - MIX_ENV=test mix coveralls --verbose 35 | - MIX_ENV=test mix coveralls.json --verbose 36 | - MIX_ENV=test mix coveralls.travis 37 | - ./cc-test-reporter after-build -r "$CC_TEST_REPORTER_ID" -t excoveralls --exit-code $TRAVIS_TEST_RESULT 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Danilo Jeremias da Silva 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TesseractOcr 2 | 3 | [![travis-ci.org](https://api.travis-ci.org/dannnylo/tesseract-ocr-elixir.svg)](https://travis-ci.org/dannnylo/tesseract-ocr-elixir) 4 | [![hex.pm](https://img.shields.io/hexpm/v/tesseract_ocr.svg)](https://hex.pm/packages/tesseract_ocr) 5 | [![hex.pm](https://img.shields.io/badge/docs-hexpm-blue.svg)](https://hexdocs.pm/tesseract_ocr) 6 | [![hex.pm](https://img.shields.io/hexpm/dt/tesseract_ocr.svg)](https://hex.pm/packages/tesseract_ocr) 7 | [![hex.pm](https://img.shields.io/hexpm/l/tesseract_ocr.svg)](https://hex.pm/packages/tesseract_ocr) 8 | [![github.com](https://img.shields.io/github/last-commit/dannnylo/tesseract-ocr-elixir.svg)](https://github.com/dannnylo/tesseract-ocr-elixir/commits/master) 9 | 10 | Elixir wrapper for [Tesseract OCR](https://github.com/tesseract-ocr), an open 11 | source text recognition (OCR) Engine. 
12 | 13 | ## Requirements 14 | 15 | - Elixir 1.6+ / Erlang OTP 19+ 16 | - [Tesseract OCR binary](https://github.com/tesseract-ocr/tesseract/wiki) 17 | 18 | ## Installation 19 | 20 | Add `tesseract_ocr` to your list of dependencies in `mix.exs`: 21 | 22 | ```elixir 23 | def deps do 24 | [ 25 | {:tesseract_ocr, "~> 0.1.5"} 26 | ] 27 | end 28 | ``` 29 | 30 | ## Usage 31 | 32 | Reading an image file. 33 | 34 | ```elixir 35 | iex> TesseractOcr.read("test/resources/world.png") 36 | "world" 37 | ``` 38 | 39 | With additional options. 40 | 41 | ```elixir 42 | iex> TesseractOcr.read("test/resources/world.png", %{lang: "por", psm: 7, oem: 1}) 43 | "world" 44 | ``` 45 | 46 | Get words positions. 47 | 48 | ```elixir 49 | iex> TesseractOcr.Words.read("test/resources/world.png") 50 | [%{confidence: 95, word: "world", x_end: 185, x_start: 2, y_end: 56, y_start: 2}] 51 | ``` 52 | 53 | Convert image into PDF with text. 54 | 55 | ```elixir 56 | iex> TesseractOcr.PDF.read("test/resources/world.png", "/tmp/test") 57 | "/tmp/test.pdf" 58 | ``` 59 | Convert image into TSV with text. 60 | 61 | ```elixir 62 | iex> TesseractOcr.TSV.read("test/resources/world.png", "/tmp/test") 63 | "/tmp/test.tsv" 64 | ``` 65 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for 9 | # 3rd-party users, it should be done in your "mix.exs" file. 
defmodule TesseractOcr do
  @moduledoc """
  Elixir wrapper for the `tesseract` command line OCR engine.

  `read/2` returns the recognised text of an image. `TesseractOcr.Words`,
  `TesseractOcr.PDF` and `TesseractOcr.TSV` expose the other output formats.
  """

  alias TesseractOcr.Utils

  @doc """
  Reads the characters of the image at `path` by OCR.

  `options` holds the Tesseract settings understood by
  `TesseractOcr.Utils.command_options/3`. The text Tesseract prints to stdout
  is returned with leading and trailing whitespace removed.

  Raises `FunctionClauseError` when `path` is not a binary.

  ## Examples

      iex> TesseractOcr.read("test/resources/world.png")
      "world"

  """
  @spec read(String.t(), map()) :: String.t()
  def read(path, options \\ %{}) when is_binary(path) do
    # Only the captured stdout is used; the exit status is not inspected.
    {text, _exit_status} = Utils.command(path, "stdout", options)

    String.trim(text)
  end
end

defmodule TesseractOcr.PDF do
  @moduledoc """
  Converts an image into a PDF with text through Tesseract.
  """

  alias TesseractOcr.Utils

  @doc """
  Reads the words of the image at `path` by OCR and returns the path of the
  generated PDF file.

  `output` is the output base name handed to Tesseract; the returned path is
  `output` with the `.pdf` extension appended. Config variables passed by the
  caller under `:c` are kept and `tessedit_create_pdf=1` is added after them.

  The result of the `tesseract` invocation is not inspected, so the returned
  path is not a guarantee that the file was written.

  ## Examples

      iex> TesseractOcr.PDF.read("test/resources/world.png", "/tmp/test")
      "/tmp/test.pdf"

  """
  @spec read(String.t(), String.t(), map()) :: String.t()
  def read(path, output, options \\ %{}) when is_binary(path) do
    # Append rather than overwrite so that user supplied `-c` settings survive.
    options = Utils.add_config_variable(options, "tessedit_create_pdf=1")

    Utils.command(path, output, options)

    "#{output}.pdf"
  end
end

defmodule TesseractOcr.TSV do
  @moduledoc """
  Converts an image into a TSV file with text through Tesseract.
  """

  alias TesseractOcr.Utils

  @doc """
  Reads the words of the image at `path` by OCR and returns the path of the
  generated TSV file.

  `output` is the output base name handed to Tesseract; the returned path is
  `output` with the `.tsv` extension appended. Config variables passed by the
  caller under `:c` are kept and `tessedit_create_tsv=1` is added after them.

  The result of the `tesseract` invocation is not inspected, so the returned
  path is not a guarantee that the file was written.

  ## Examples

      iex> TesseractOcr.TSV.read("test/resources/world.png", "/tmp/test")
      "/tmp/test.tsv"

  """
  @spec read(String.t(), String.t(), map()) :: String.t()
  def read(path, output, options \\ %{}) when is_binary(path) do
    # Append rather than overwrite so that user supplied `-c` settings survive.
    options = Utils.add_config_variable(options, "tessedit_create_tsv=1")

    Utils.command(path, output, options)

    "#{output}.tsv"
  end
end

defmodule TesseractOcr.Utils do
  @moduledoc """
  Utilities to run the `tesseract` binary.
  """

  @doc """
  Executes `tesseract` on the system and returns the result of `System.cmd/3`,
  i.e. `{collected_output, exit_status}`.

  The arguments are built by `command_options/3`. They are passed as a list,
  so no shell is involved.
  """
  @spec command(String.t(), String.t(), map() | keyword()) ::
          {Collectable.t(), non_neg_integer()}
  def command(path, output, options) do
    System.cmd("tesseract", command_options(path, output, options))
  end

  @doc """
  Builds the argument list handed to the `tesseract` binary.

  Recognised keys in `options`:

    * `:l` or `:lang` - emitted as `-l` (`:l` wins when both are set)
    * `:oem`, `:dpi`, `:psm` - emitted as `--oem`, `--dpi`, `--psm`
    * `:tessdata_dir`, `:user_patterns`, `:user_words` - emitted as
      `--tessdata-dir`, `--user-patterns`, `--user-words`
    * `:c` - a single value or a list of values, one `-c` per value

  Integer values are converted to strings; keys that are absent or `nil` are
  left out.

  ## Examples

      iex> TesseractOcr.Utils.command_options("test/resources/world.png", "stdout", %{l: "por", oem: "1"})
      ["test/resources/world.png", "stdout", "-l", "por", "--oem","1"]

      iex> TesseractOcr.Utils.command_options("test/resources/world.png", "stdout", %{l: "por", psm: 1})
      ["test/resources/world.png", "stdout", "-l", "por", "--psm", "1"]

      iex> TesseractOcr.Utils.command_options("test/resources/world.png", "stdout", %{c: "var=b"})
      ["test/resources/world.png", "stdout", "-c", "var=b"]

  """
  @spec command_options(String.t(), String.t(), map() | keyword()) :: [String.t()]
  def command_options(path, output, options) do
    [
      path,
      output,
      make_short_option(:l, options[:l] || options[:lang]),
      make_option(:oem, options[:oem]),
      make_option(:dpi, options[:dpi]),
      make_option(:psm, options[:psm]),
      make_option("tessdata-dir", options[:tessdata_dir]),
      make_option("user-patterns", options[:user_patterns]),
      make_option("user-words", options[:user_words]),
      make_short_option(:c, options[:c])
    ]
    |> List.flatten()
    |> Enum.reject(&is_nil/1)
  end

  @doc """
  Adds the config variable `variable` (a `"name=value"` string) to the `:c`
  entry of `options` without dropping the values that are already there.

  A missing or `nil` entry becomes `variable`; a single value or a list gets
  `variable` appended, so it comes last on the command line.

  ## Examples

      iex> TesseractOcr.Utils.add_config_variable(%{}, "tessedit_create_pdf=1")
      %{c: "tessedit_create_pdf=1"}

      iex> TesseractOcr.Utils.add_config_variable(%{c: "var=b"}, "tessedit_create_pdf=1")
      %{c: ["var=b", "tessedit_create_pdf=1"]}

  """
  @spec add_config_variable(map(), String.t()) :: map()
  def add_config_variable(options, variable) do
    Map.update(options, :c, variable, fn
      nil -> variable
      existing -> List.wrap(existing) ++ [variable]
    end)
  end

  @doc """
  Reads the file at `path`, deletes it and returns the result of the read,
  `{:ok, content}` or `{:error, reason}`.

  The result of the deletion is ignored.
  """
  @spec read_and_remove(Path.t()) :: {:ok, binary()} | {:error, File.posix()}
  def read_and_remove(path) do
    content = File.read(path)

    File.rm(path)

    content
  end

  # Long options (`--name value`). Absent values yield `nil`, which
  # `command_options/3` filters out.
  defp make_option(_name, nil), do: nil

  defp make_option(name, value) when is_integer(value) do
    make_option(name, Integer.to_string(value))
  end

  defp make_option(name, value), do: ["--#{name}", value]

  # Short options (`-n value`). A list repeats the flag once per value.
  defp make_short_option(_name, nil), do: nil

  defp make_short_option(name, values) when is_list(values) do
    Enum.flat_map(values, fn value -> ["-#{name}", value] end)
  end

  defp make_short_option(name, value), do: ["-#{name}", value]
end
defmodule TesseractOcr.Words do
  @moduledoc """
  Reads the words of an image together with their position and confidence.

  The image is processed by Tesseract in hOCR mode; the generated markup is
  scanned line by line for word elements.
  """

  alias TesseractOcr.Utils

  @hocr_setting "tessedit_create_hocr=1"

  @type word :: %{
          word: String.t(),
          confidence: integer(),
          x_start: integer() | nil,
          y_start: integer() | nil,
          x_end: integer() | nil,
          y_end: integer() | nil
        }

  @doc """
  Reads the words of the image at `path` by OCR and returns them with their
  positions.

  Tesseract writes a temporary hOCR file into the system temporary directory;
  the file is removed after it has been read. Config variables passed by the
  caller under `:c` are kept and `tessedit_create_hocr=1` is added after them.

  Raises `File.Error` when the hOCR file cannot be read (for example because
  `tesseract` failed) and `FunctionClauseError` when `path` is not a binary.

  ## Examples

      iex> TesseractOcr.Words.read("test/resources/world.png")
      [%{confidence: 95, word: "world", x_end: 185, x_start: 2, y_end: 56, y_start: 2}]

  """
  @spec read(String.t(), map()) :: [word()]
  def read(path, options \\ %{}) when is_binary(path) do
    # Write the intermediate file to the temp dir instead of the current
    # working directory, which may be read-only or under version control.
    output = Path.join(System.tmp_dir!(), SecureRandom.uuid())

    # Append rather than overwrite so that user supplied `-c` settings survive.
    options =
      Map.update(options, :c, @hocr_setting, fn
        nil -> @hocr_setting
        existing -> List.wrap(existing) ++ [@hocr_setting]
      end)

    Utils.command(path, output, options)

    hocr_path = "#{output}.hocr"

    case Utils.read_and_remove(hocr_path) do
      {:ok, content} ->
        parse(content)

      {:error, reason} ->
        raise File.Error, reason: reason, action: "read hOCR output", path: hocr_path
    end
  end

  @doc """
  Builds the map describing one word.

  `positions` is the bounding box as `[x_start, y_start, x_end, y_end]`;
  missing entries become `nil`.
  """
  @spec word_info(String.t(), [integer()], integer()) :: word()
  def word_info(word, positions, confidence) do
    %{
      word: word,
      confidence: confidence,
      x_start: Enum.at(positions, 0),
      y_start: Enum.at(positions, 1),
      x_end: Enum.at(positions, 2),
      y_end: Enum.at(positions, 3)
    }
  end

  @doc """
  Returns the first match of `re` in `string`, or `""` when nothing matches.
  """
  @spec content_match(String.t(), Regex.t()) :: String.t()
  def content_match(string, re) do
    case Regex.scan(re, string) do
      [[match | _] | _] -> match
      _ -> ""
    end
  end

  # One entry per hOCR line that holds a non-empty word element.
  defp parse(content) do
    content
    |> String.split("\n")
    |> Enum.map(&parse_line/1)
    |> Enum.reject(&is_nil/1)
  end

  # Returns the word map for a word line, `nil` for every other line.
  defp parse_line(line) do
    if String.match?(line, ~r/oc(rx|r)_word/) do
      case word_text(line) do
        nil -> nil
        word -> word_info(word, parse_position(line), parse_confidence(line))
      end
    end
  end

  # The word is the first non-empty text found between a `>` and the next `<`.
  # Skipping empty matches keeps words that are wrapped in inline markup
  # (e.g. `<strong>` / `<em>`), where the first match is the empty string
  # between the opening tags.
  defp word_text(line) do
    ~r/(?<=>)(.*?)(?=<)/
    |> Regex.scan(line, capture: :first)
    |> Enum.find_value(fn [text] -> if text != "", do: text end)
  end

  # Bounding box: the integers between `bbox` and the following `;`.
  defp parse_position(line) do
    line
    |> content_match(~r/(?<=bbox)(.*?)(?=;)/)
    |> String.split()
    |> Enum.map(&String.to_integer/1)
  end

  # Confidence: the integer following `; x_wconf ` inside the title attribute.
  defp parse_confidence(line) do
    line
    |> content_match(~r/(?<=; x_wconf )(.*?)(?=')/)
    |> String.to_integer()
  end
end
defmodule TesseractOcr.MixProject do
  # Mix project definition of the :tesseract_ocr package.
  use Mix.Project

  @source_url "https://github.com/dannnylo/tesseract-ocr-elixir"

  # Project settings read by Mix: identity, dependencies, Hex package
  # metadata, documentation options and the coverage tool.
  def project do
    [
      app: :tesseract_ocr,
      version: "0.1.5",
      elixir: "~> 1.6",
      description: description(),
      start_permanent: Mix.env() == :prod,
      deps: deps(),
      package: package(),
      docs: docs(),
      test_coverage: [tool: ExCoveralls]
    ]
  end

  # OTP application settings.
  def application, do: [extra_applications: [:logger]]

  # Short package description shown on Hex.
  defp description do
    "Elixir wrapper for Tesseract OCR, an open source text recognition engine.\n"
  end

  # Runtime and development dependencies.
  defp deps do
    [
      {:secure_random, ">= 0.0.0"},
      {:ex_doc, ">= 0.0.0", only: :dev},
      {:credo, "~> 1.2.0", only: [:dev, :test], runtime: false},
      {:excoveralls, "~> 0.12", only: :test}
    ]
  end

  # Hex package metadata.
  defp package do
    [
      maintainers: ["Danilo Jeremias da Silva"],
      licenses: ["MIT"],
      links: %{"GitHub" => @source_url}
    ]
  end

  # ExDoc options; the README is the landing page.
  defp docs do
    [
      main: "readme",
      source_url: @source_url,
      extras: ["README.md"]
    ]
  end
end
"805abd97539caf89ec6d4732c91e62ba9da0cda51ac462380bbd28ee697a8c42"}, 4 | "credo": {:hex, :credo, "1.2.2", "f57faf60e0a12b0ba9fd4bad07966057fde162b33496c509b95b027993494aab", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "8f2623cd8c895a6f4a55ef10f3fdf6a55a9ca7bef09676bd835551687bf8a740"}, 5 | "earmark": {:hex, :earmark, "1.4.3", "364ca2e9710f6bff494117dbbd53880d84bebb692dafc3a78eb50aa3183f2bfd", [:mix], [], "hexpm", "8cf8a291ebf1c7b9539e3cddb19e9cef066c2441b1640f13c34c1d3cfc825fec"}, 6 | "ex_doc": {:hex, :ex_doc, "0.21.3", "857ec876b35a587c5d9148a2512e952e24c24345552259464b98bfbb883c7b42", [:mix], [{:earmark, "~> 1.4", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "0db1ee8d1547ab4877c5b5dffc6604ef9454e189928d5ba8967d4a58a801f161"}, 7 | "excoveralls": {:hex, :excoveralls, "0.12.3", "2142be7cb978a3ae78385487edda6d1aff0e482ffc6123877bb7270a8ffbcfe0", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "568a3e616c264283f5dea5b020783ae40eef3f7ee2163f7a67cbd7b35bcadada"}, 8 | "hackney": {:hex, :hackney, "1.15.2", "07e33c794f8f8964ee86cebec1a8ed88db5070e52e904b8f12209773c1036085", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.5", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm", "e0100f8ef7d1124222c11ad362c857d3df7cb5f4204054f9f0f4a728666591fc"}, 9 | "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: 
:unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "4bdd305eb64e18b0273864920695cb18d7a2021f31a11b9c5fbcd9a253f936e2"}, 10 | "jason": {:hex, :jason, "1.1.2", "b03dedea67a99223a2eaf9f1264ce37154564de899fd3d8b9a21b1a6fd64afe7", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fdf843bca858203ae1de16da2ee206f53416bbda5dc8c9e78f43243de4bc3afe"}, 11 | "makeup": {:hex, :makeup, "1.0.0", "671df94cf5a594b739ce03b0d0316aa64312cee2574b6a44becb83cd90fb05dc", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "a10c6eb62cca416019663129699769f0c2ccf39428b3bb3c0cb38c718a0c186d"}, 12 | "makeup_elixir": {:hex, :makeup_elixir, "0.14.0", "cf8b7c66ad1cff4c14679698d532f0b5d45a3968ffbcbfd590339cb57742f1ae", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "d4b316c7222a85bbaa2fd7c6e90e37e953257ad196dc229505137c5e505e9eff"}, 13 | "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, 14 | "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, 15 | "nimble_parsec": {:hex, :nimble_parsec, "0.5.3", "def21c10a9ed70ce22754fdeea0810dafd53c2db3219a0cd54cf5526377af1c6", [:mix], [], "hexpm", "589b5af56f4afca65217a1f3eb3fee7e79b09c40c742fddc1c312b3ac0b3399f"}, 16 | "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm", "17ef63abde837ad30680ea7f857dd9e7ced9476cdd7b0394432af4bfc241b960"}, 17 | "secure_random": {:hex, :secure_random, "0.5.1", "c5532b37c89d175c328f5196a0c2a5680b15ebce3e654da37129a9fe40ebf51b", [:mix], [], "hexpm", "1b9754f15e3940a143baafd19da12293f100044df69ea12db5d72878312ae6ab"}, 
18 | "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.5", "6eaf7ad16cb568bb01753dbbd7a95ff8b91c7979482b95f38443fe2c8852a79b", [:make, :mix, :rebar3], [], "hexpm", "13104d7897e38ed7f044c4de953a6c28597d1c952075eb2e328bc6d6f2bfc496"}, 19 | "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"}, 20 | } 21 | -------------------------------------------------------------------------------- /test/resources/blank.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/tesseract-ocr-elixir/159e53806f6517a24079d262054cfd34e371ed4b/test/resources/blank.tif -------------------------------------------------------------------------------- /test/resources/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/tesseract-ocr-elixir/159e53806f6517a24079d262054cfd34e371ed4b/test/resources/test.png -------------------------------------------------------------------------------- /test/resources/test.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/tesseract-ocr-elixir/159e53806f6517a24079d262054cfd34e371ed4b/test/resources/test.tif -------------------------------------------------------------------------------- /test/resources/world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dannnylo/tesseract-ocr-elixir/159e53806f6517a24079d262054cfd34e371ed4b/test/resources/world.png -------------------------------------------------------------------------------- /test/tesseract_ocr/pdf_test.exs: -------------------------------------------------------------------------------- 1 | defmodule TesseractOcr.PDFTest do 2 | use ExUnit.Case 3 | 
defmodule TesseractOcr.PDFTest do
  use ExUnit.Case
  doctest TesseractOcr.PDF

  test "read image and saves on a PDF" do
    # Write to the temp dir so nothing is left inside the repository, and
    # clean up even when an assertion fails.
    output = Path.join(System.tmp_dir!(), "tesseract_ocr_pdf_#{System.unique_integer([:positive])}")
    on_exit(fn -> File.rm("#{output}.pdf") end)

    pdf_path =
      TesseractOcr.PDF.read("test/resources/world.png", output, %{lang: "eng", psm: 7, oem: 1})

    assert pdf_path === "#{output}.pdf"
    assert File.exists?(pdf_path)
  end
end

defmodule TesseractOcr.TSVTest do
  use ExUnit.Case
  doctest TesseractOcr.TSV

  test "read image and saves on a TSV" do
    # Write to the temp dir so nothing is left inside the repository, and
    # clean up even when an assertion fails.
    output = Path.join(System.tmp_dir!(), "tesseract_ocr_tsv_#{System.unique_integer([:positive])}")
    on_exit(fn -> File.rm("#{output}.tsv") end)

    tsv_path =
      TesseractOcr.TSV.read("test/resources/world.png", output, %{lang: "eng", psm: 7, oem: 1})

    assert tsv_path === "#{output}.tsv"
    assert File.exists?(tsv_path)
  end
end

defmodule TesseractOcr.UtilsTest do
  # Only pure argument building is exercised here, so the module can run
  # concurrently with the others.
  use ExUnit.Case, async: true
  doctest TesseractOcr.Utils

  alias TesseractOcr.Utils

  @image "test/resources/world.png"

  test "command options generate the options to shell command" do
    assert Utils.command_options(@image, "stdout", %{lang: "por", oem: "1"}) ===
             [@image, "stdout", "-l", "por", "--oem", "1"]

    assert Utils.command_options(@image, "stdout", %{psm: "1"}) ===
             [@image, "stdout", "--psm", "1"]

    assert Utils.command_options(@image, "stdout", %{}) === [@image, "stdout"]
  end

  test "allow multiple -c options" do
    options = %{
      lang: "por",
      oem: "1",
      c: ["tessedit_char_whitelist=A", "tessedit_do_invert=0"]
    }

    assert Utils.command_options(@image, "stdout", options) === [
             @image,
             "stdout",
             "-l",
             "por",
             "--oem",
             "1",
             "-c",
             "tessedit_char_whitelist=A",
             "-c",
             "tessedit_do_invert=0"
           ]
  end

  test "support a single -c option (support the 'old' behaviour)" do
    options = %{lang: "por", oem: "1", c: "tessedit_char_whitelist=A"}

    assert Utils.command_options(@image, "stdout", options) === [
             @image,
             "stdout",
             "-l",
             "por",
             "--oem",
             "1",
             "-c",
             "tessedit_char_whitelist=A"
           ]
  end

  test "support all default command line options" do
    options = %{
      lang: "eng",
      oem: "1",
      c: ["tessedit_char_whitelist=A10", "tessedit_do_invert=1"],
      psm: 3,
      dpi: 72,
      tessdata_dir: "/home/tesseract",
      user_patterns: "/home/tesseract-dir/patterns",
      user_words: "/home/tesseract-dir/words"
    }

    assert Utils.command_options(@image, "stdout", options) === [
             @image,
             "stdout",
             "-l",
             "eng",
             "--oem",
             "1",
             "--dpi",
             "72",
             "--psm",
             "3",
             "--tessdata-dir",
             "/home/tesseract",
             "--user-patterns",
             "/home/tesseract-dir/patterns",
             "--user-words",
             "/home/tesseract-dir/words",
             "-c",
             "tessedit_char_whitelist=A10",
             "-c",
             "tessedit_do_invert=1"
           ]
  end
end

defmodule TesseractOcr.WordsTest do
  use ExUnit.Case
  doctest TesseractOcr.Words

  test "read words of an image " do
    words = TesseractOcr.Words.read("test/resources/world.png", %{lang: "eng", psm: 7, oem: 1})

    assert words === [
             %{confidence: 95, word: "world", x_end: 185, x_start: 2, y_end: 56, y_start: 2}
           ]
  end
end
defmodule TesseractOcrTest do
  use ExUnit.Case
  doctest TesseractOcr

  # One test per fixture so that a failure on one image does not hide the
  # results for the others.
  test "read a blank image" do
    assert TesseractOcr.read("./test/resources/blank.tif") == ""
  end

  test "read a tif image" do
    assert TesseractOcr.read("./test/resources/test.tif") == "43XF"
  end

  test "read a png image" do
    assert TesseractOcr.read("./test/resources/test.png") == "HW9W"
  end

  test "read an image with options" do
    assert TesseractOcr.read("test/resources/world.png", %{lang: "eng", psm: 7, oem: 1}) ==
             "world"
  end

  test "command options generate the options to shell command" do
    image = "test/resources/world.png"

    assert TesseractOcr.Utils.command_options(image, "stdout", %{lang: "por", oem: "1"}) ===
             [image, "stdout", "-l", "por", "--oem", "1"]

    assert TesseractOcr.Utils.command_options(image, "stdout", %{psm: "1"}) ===
             [image, "stdout", "--psm", "1"]

    assert TesseractOcr.Utils.command_options(image, "stdout", %{}) === [image, "stdout"]
  end

  test "raise exception when non binary used as path" do
    # `read/2` is guarded with `is_binary(path)`, so no clause matches a tuple.
    assert_raise FunctionClauseError, fn ->
      TesseractOcr.read({})
    end
  end
end
-------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | --------------------------------------------------------------------------------