├── .coveragerc ├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── COPYING ├── Makefile ├── README.rst ├── djvu2hocr ├── doc ├── Makefile ├── README ├── changelog ├── djvu2hocr.xml ├── hocr2djvused.xml ├── ocrodjvu.xml └── todo ├── examples └── scans2djvu+hocr ├── hocr2djvused ├── lib ├── __init__.py ├── cli │ ├── __init__.py │ ├── djvu2hocr.py │ ├── hocr2djvused.py │ └── ocrodjvu.py ├── engines │ ├── __init__.py │ ├── common.py │ ├── cuneiform.py │ ├── dummy.py │ ├── gocr.py │ ├── ocrad.py │ ├── ocropus.py │ └── tesseract.py ├── errors.py ├── hocr.py ├── html5_support.py ├── image_io.py ├── ipc.py ├── iso639.py ├── logger.py ├── temporary.py ├── text_zones.py ├── unicode_support.py ├── utils.py └── version.py ├── ocrodjvu ├── private ├── examine-hangs ├── make-release-tarball ├── run-pyflakes ├── update-coverage └── update-version └── tests ├── __init__.py ├── coverage ├── data ├── alice.djvu ├── bad-page-id.djvu ├── empty.djvu └── non-ascii.png ├── djvu2hocr ├── __init__.py ├── nesting.djvused ├── nesting.test1 ├── non-xml-characters.djvused ├── non-xml-characters.test1 ├── non-xml-characters.test2 ├── test.py ├── upside-down.djvused └── upside-down.test1 ├── engines ├── __init__.py ├── fake-cuneiform ├── fake-cuneiform-multilang └── test_cuneiform.py ├── hocr2djvused ├── __init__.py ├── alice_cuneiform0.8.html ├── alice_cuneiform0.8.test1 ├── alice_cuneiform0.8.test2 ├── alice_cuneiform0.8.test3 ├── alice_cuneiform0.8.test4 ├── alice_cuneiform0.8.test5 ├── alice_cuneiform0.9.html ├── alice_cuneiform0.9.test1 ├── alice_cuneiform0.9.test2 ├── alice_cuneiform0.9.test3 ├── alice_cuneiform0.9.test4 ├── alice_cuneiform0.9.test5 ├── alice_ocropus0.2.html ├── alice_ocropus0.2.test1 ├── alice_ocropus0.3.1+charboxes.html ├── alice_ocropus0.3.1+charboxes.test1 ├── alice_ocropus0.3.1+charboxes.test2 ├── alice_ocropus0.3.1+charboxes.test3 ├── alice_ocropus0.3.1+charboxes.test4 ├── alice_ocropus0.3.1+charboxes.test5 ├── alice_ocropus0.3.1.html ├── 
alice_ocropus0.3.1.test1 ├── alice_tesseract3.00+charboxes.html ├── alice_tesseract3.00+charboxes.test1 ├── alice_tesseract3.00+charboxes.test2 ├── alice_tesseract3.00+charboxes.test3 ├── alice_tesseract3.00+charboxes.test4 ├── alice_tesseract3.00+charboxes.test5 ├── alice_tesseract3.00.html ├── alice_tesseract3.00.test1 ├── alice_tesseract3.00.test2 ├── alice_tesseract3.00.test3 ├── alice_tesseract3.00.test4 ├── alice_tesseract3.00.test5 ├── alice_tesseract3.00svn622+charboxes.html ├── alice_tesseract3.00svn622+charboxes.test1 ├── alice_tesseract3.00svn622+charboxes.test2 ├── alice_tesseract3.00svn622+charboxes.test3 ├── alice_tesseract3.00svn622+charboxes.test4 ├── alice_tesseract3.00svn622+charboxes.test5 ├── alice_tesseract3.00svn622.html ├── alice_tesseract3.00svn622.test1 ├── alice_tesseract3.00svn622.test2 ├── alice_tesseract3.00svn622.test3 ├── alice_tesseract3.00svn622.test4 ├── alice_tesseract3.00svn622.test5 ├── alice_tesseract3.02+charboxes.html ├── alice_tesseract3.02+charboxes.test1 ├── alice_tesseract3.02+charboxes.test2 ├── alice_tesseract3.02+charboxes.test3 ├── alice_tesseract3.02+charboxes.test4 ├── alice_tesseract3.02+charboxes.test5 ├── alice_tesseract3.02.html ├── alice_tesseract3.02.test1 ├── alice_tesseract3.02.test2 ├── alice_tesseract3.02.test3 ├── alice_tesseract3.02.test4 ├── alice_tesseract3.02.test5 ├── empty_ocropus0.3.1.html ├── empty_ocropus0.3.1.test1 ├── empty_ocropus0.3.1.test2 ├── empty_ocropus0.3.1.test3 ├── empty_ocropus0.3.1.test4 ├── empty_ocropus0.3.1.test5 ├── empty_tesseract3.04+charboxes.html ├── empty_tesseract3.04+charboxes.test1 ├── empty_tesseract3.04+charboxes.test2 ├── empty_tesseract3.04+charboxes.test3 ├── empty_tesseract3.04+charboxes.test4 ├── empty_tesseract3.04+charboxes.test5 ├── multipage_ocropus0.3.1.html ├── multipage_ocropus0.3.1.test1 ├── non-ascii_cuneiform0.7.html ├── non-ascii_cuneiform1.0.html ├── non-ascii_ocropus0.3.1.html ├── non-ascii_tesseract3.00.html ├── test.py ├── 
text+images_cuneiform0.7.html ├── text+images_cuneiform0.7.test1 ├── text+images_cuneiform0.7.test2 ├── text+images_cuneiform0.7.test3 ├── text+images_cuneiform0.7.test4 ├── text+images_cuneiform0.7.test5 ├── text+images_cuneiform1.0.html ├── text+images_cuneiform1.0.test1 ├── text+images_cuneiform1.0.test2 ├── text+images_cuneiform1.0.test3 ├── text+images_cuneiform1.0.test4 └── text+images_cuneiform1.0.test5 ├── image_io ├── __init__.py ├── test.py ├── whirl.djvu ├── whirl_1bpp.bmp ├── whirl_1bpp.pbm ├── whirl_1bpp.tif ├── whirl_24bpp.bmp ├── whirl_24bpp.ppm └── whirl_24bpp.tif ├── ocrodjvu ├── __init__.py ├── test.py └── test_integration.py ├── test_ipc.py ├── test_text_zones.py ├── test_unicode_support.py ├── test_utils.py └── tools.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | 4 | [report] 5 | show_missing = true 6 | exclude_lines = # no coverage 7 | 8 | # vim:ft=dosini 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | /tests/**/*.html linguist-detectable=false 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | main: 7 | strategy: 8 | matrix: 9 | include: 10 | - cython: cython-0.22.1 11 | pydjvu: python-djvulibre-0.3.9 12 | lxml: lxml-2.0 13 | pyicu: pyicu-1.9.5 # earlier versions FTBFS in this env 14 | html5lib: html5lib-0.95 15 | pillow: pillow-1.0 16 | os: ubuntu-18.04 17 | - subprocess: subprocess32-3.2.7 18 | os: ubuntu-20.04 19 | - gamera: gamera-3.4.0 20 | subprocess: subprocess32 21 | os: ubuntu-20.04 22 | runs-on: ${{matrix.os}} 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: set up Python 2.7 26 | uses: 
actions/setup-python@v2 27 | with: 28 | python-version: 2.7 29 | - name: set up APT 30 | run: | 31 | printf '\n\nPackage: *\nPin: release o=Ubuntu\nPin-Priority: 9999\n' | sudo tee -a /etc/apt/preferences 32 | printf 'Apt::Install-Recommends "false";\n' | sudo tee -a /etc/apt/apt.conf 33 | sudo apt-get update 34 | - name: apt install deps 35 | run: 36 | sudo apt-get install -y 37 | djvulibre-bin 38 | docbook-xml 39 | docbook-xsl 40 | libdjvulibre-dev 41 | libicu-dev 42 | libxml2-utils 43 | libxslt1-dev 44 | xsltproc 45 | - name: set up pip cache 46 | uses: actions/cache@v2 47 | with: 48 | path: ~/.cache/pip 49 | key: pip-${{matrix.os}} 50 | - name: install wheel 51 | run: | 52 | pip install wheel 53 | - name: install Cython 54 | run: | 55 | v=${{matrix.cython}} 56 | v=${v#cython-} 57 | python -m pip install cython${v:+==$v} 58 | - name: install python-djvulibre 59 | run: | 60 | v=${{matrix.pydjvu}} 61 | v=${v#python-djvulibre-} 62 | python -m pip install python-djvulibre${v:+==$v} 63 | - name: install lxml 64 | run: | 65 | v=${{matrix.lxml}} 66 | v=${v#lxml-} 67 | python -m pip install lxml${v:+==$v} 68 | - name: install subprocess32 69 | if: ${{matrix.subprocess}} 70 | run: | 71 | dist=${{matrix.subprocess}} 72 | dist=${dist/-/==} 73 | python -m pip install $dist 74 | - name: install PyICU 75 | run: | 76 | v=${{matrix.pyicu}} 77 | v=${v#pyicu-} 78 | python -m pip install pyicu${v:+==$v} 79 | - name: install html5lib 80 | run: | 81 | v=${{matrix.html5lib}} 82 | v=${v#html5lib-} 83 | python -m pip install html5lib${v:+==$v} 84 | - name: install Pillow 85 | run: | 86 | v=${{matrix.pillow}} 87 | v=${v#pillow-} 88 | python -m pip install pillow${v:+==$v} 89 | - name: install nose 90 | run: | 91 | python -m pip install nose 92 | - name: run tests 93 | run: | 94 | make test 95 | - name: check docs 96 | run: | 97 | python -m pip install docutils pygments 98 | make -C doc check 99 | - name: build docs 100 | run: 101 | make -C doc all 102 | - name: install 103 | run: 104 | 
make install PREFIX=$HOME/.local 105 | - name: check whether the executables were installed correctly 106 | run: | 107 | cd / 108 | djvu2hocr --version 109 | hocr2djvused --version 110 | ocrodjvu --version 111 | - name: check whether the man pages were installed correctly 112 | env: 113 | MANPATH: /home/runner/.local/share/man 114 | MANWIDTH: 80 115 | run: | 116 | cd / 117 | man 1 ocrodjvu | grep -A 10 -w OCRODJVU 118 | man 1 djvu2hocr | grep -A 10 -w DJVU2HOCR 119 | man 1 hocr2djvused | grep -A 10 -w HOCR2DJVUSED 120 | - name: run pydiatra 121 | run: | 122 | python -m pip install pydiatra 123 | python -m pydiatra -v . 124 | - name: run pyflakes 125 | run: | 126 | python -m pip install pyflakes 127 | private/run-pyflakes 128 | pypi: 129 | runs-on: ubuntu-latest 130 | steps: 131 | - name: check for namesquatting 132 | run: | 133 | set +e 134 | curl -fsS https://pypi.org/simple/ocrodjvu/ 135 | [ $? -eq 22 ] || exit 1 136 | 137 | # vim:ts=2 sts=2 sw=2 et 138 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | /.coverage 3 | /dist 4 | /doc/*.1 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright © 2012-2019 Jakub Wilk 2 | # 3 | # This file is part of ocrodjvu. 4 | # 5 | # ocrodjvu is free software; you can redistribute it and/or modify it 6 | # under the terms of the GNU General Public License version 2 as 7 | # published by the Free Software Foundation. 8 | # 9 | # ocrodjvu is distributed in the hope that it will be useful, but WITHOUT 10 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 | # for more details. 
13 | 14 | PYTHON = python 15 | 16 | PREFIX = /usr/local 17 | DESTDIR = 18 | 19 | bindir = $(PREFIX)/bin 20 | basedir = $(PREFIX)/share/ocrodjvu 21 | mandir = $(PREFIX)/share/man 22 | 23 | .PHONY: all 24 | all: ; 25 | 26 | python-exe = $(shell $(PYTHON) -c 'import sys; print(sys.executable)') 27 | 28 | define install-script 29 | sed \ 30 | -e "1 s@^#!.*@#!$(python-exe)@" \ 31 | -e "s#^basedir = .*#basedir = '$(basedir)/'#" \ 32 | $(1) > $(1).tmp 33 | install -d $(DESTDIR)$(bindir) 34 | install $(1).tmp $(DESTDIR)$(bindir)/$(1) 35 | rm $(1).tmp 36 | endef 37 | 38 | define install-lib 39 | install -d $(DESTDIR)$(basedir)/lib/$(1) 40 | install -p -m644 lib/$(1)/*.py $(DESTDIR)$(basedir)/lib/$(1) 41 | endef 42 | 43 | .PHONY: install 44 | install: ocrodjvu 45 | $(PYTHON) - < lib/__init__.py # Python version check 46 | $(call install-script,ocrodjvu) 47 | $(call install-script,hocr2djvused) 48 | $(call install-script,djvu2hocr) 49 | $(call install-lib) 50 | $(call install-lib,cli) 51 | $(call install-lib,engines) 52 | ifeq "$(DESTDIR)" "" 53 | umask 022 && $(PYTHON) -m compileall -q $(basedir)/lib/ 54 | endif 55 | ifeq "$(wildcard doc/*.1)" "" 56 | # run "$(MAKE) -C doc" to build the manpages 57 | else 58 | install -d $(DESTDIR)$(mandir)/man1 59 | install -m644 doc/*.1 $(DESTDIR)$(mandir)/man1/ 60 | endif 61 | 62 | .PHONY: test 63 | test: ocrodjvu 64 | $(PYTHON) -c 'import nose; nose.main()' --verbose 65 | 66 | .PHONY: clean 67 | clean: 68 | find . -type f -name '*.py[co]' -delete 69 | find . 
-type d -name '__pycache__' -delete 70 | rm -f .coverage 71 | rm -f *.tmp 72 | 73 | .error = GNU make is required 74 | 75 | # vim:ts=4 sts=4 sw=4 noet 76 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | doc/README -------------------------------------------------------------------------------- /djvu2hocr: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=UTF-8 3 | 4 | # Copyright © 2009-2018 Jakub Wilk 5 | # 6 | # This file is part of ocrodjvu. 7 | # 8 | # ocrodjvu is free software; you can redistribute it and/or modify it 9 | # under the terms of the GNU General Public License version 2 as 10 | # published by the Free Software Foundation. 11 | # 12 | # ocrodjvu is distributed in the hope that it will be useful, but WITHOUT 13 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | # for more details. 16 | 17 | import sys 18 | 19 | basedir = None 20 | if basedir is not None: 21 | sys.path[:0] = [basedir] 22 | 23 | from lib.cli import djvu2hocr as cli 24 | 25 | cli.main(sys.argv) 26 | 27 | # vim:ts=4 sts=4 sw=4 et 28 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright © 2018-2019 Jakub Wilk 2 | # 3 | # This file is part of ocrodjvu. 4 | # 5 | # ocrodjvu is free software; you can redistribute it and/or modify it 6 | # under the terms of the GNU General Public License version 2 as 7 | # published by the Free Software Foundation. 8 | # 9 | # ocrodjvu is distributed in the hope that it will be useful, but WITHOUT 10 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | # FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License 12 | # for more details. 13 | 14 | rst2xml = $(notdir $(shell command -v rst2xml || echo rst2xml.py)) \ 15 | --input-encoding=UTF-8 \ 16 | --strict 17 | 18 | xsl = http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl 19 | xsltproc = xsltproc --nonet \ 20 | --param man.authors.section.enabled 0 \ 21 | --param man.charmap.use.subset 0 \ 22 | --param man.font.links '"I"' 23 | 24 | xml-files = $(wildcard *.xml) 25 | man-files = $(xml-files:.xml=.1) 26 | 27 | .PHONY: all 28 | all: $(man-files) 29 | 30 | %.1: %.xml 31 | $(xsltproc) $(xsl) $(<) 32 | perl -pi -e 's#^(\\%https?://.*)#\\m[blue]\\fI$$1\\fR\\m[]#' $(@) 33 | 34 | .PHONY: check 35 | check: check-changelog check-rst check-xml 36 | 37 | .PHONY: check-changelog 38 | check-changelog: changelog 39 | dpkg-parsechangelog -l$(<) --all 2>&1 >/dev/null | { ! grep .; } 40 | 41 | .PHONY: check-rst 42 | check-rst: 43 | grep -rwl 'ft[=]rst' | xargs -t -I{} $(rst2xml) {} /dev/null 44 | 45 | .PHONY: check-xml 46 | check-xml: $(xml-files) 47 | xmllint --nonet --noout --valid $(^) 48 | 49 | .PHONY: clean 50 | clean: 51 | rm -f *.1 52 | 53 | # vim:ts=4 sts=4 sw=4 noet 54 | -------------------------------------------------------------------------------- /doc/README: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | **ocrodjvu** is a wrapper for OCR systems 5 | that allows you to perform OCR on DjVu_ files. 6 | 7 | .. _DjVu: 8 | http://djvu.org/ 9 | 10 | Example 11 | ------- 12 | 13 | .. 
code:: console 14 | 15 | $ wget -q 'https://sources.debian.org/data/main/o/ocropus/0.3.1-3/data/pages/alice_1.png' 16 | $ gm convert -threshold 50% 'alice_1.png' 'alice.pbm' 17 | $ cjb2 'alice.pbm' 'alice.djvu' 18 | $ ocrodjvu --in-place 'alice.djvu' 19 | Processing 'alice.djvu': 20 | - Page #1 21 | $ djvused -e print-txt 'alice.djvu' 22 | (page 0 0 2488 3507 23 | (column 470 2922 1383 2978 24 | (para 470 2922 1383 2978 25 | (line 470 2922 1383 2978 26 | (word 470 2927 499 2976 "1") 27 | (word 588 2926 787 2978 "Down") 28 | (word 817 2925 927 2977 "the") 29 | (word 959 2922 1383 2976 "Rabbit-Hole")))) 30 | (column 451 707 2076 2856 31 | (para 463 2626 2076 2856 32 | (line 465 2803 2073 2856 33 | (word 465 2819 569 2856 "Alice") 34 | (word 592 2819 667 2841 "was") 35 | (word 690 2808 896 2854 "beginning") 36 | ⋮ 37 | 38 | Requisites 39 | ========== 40 | 41 | The following software is needed to run ocrodjvu: 42 | 43 | * Python 2.7 44 | 45 | * an OCR engine: 46 | 47 | + OCRopus_ 0.2, 0.3 or 0.3.1 48 | + Cuneiform_ ≥ 0.7 49 | + Ocrad_ ≥ 0.10 50 | + GOCR_ ≥ 0.40 51 | + Tesseract_ ≥ 2.00 52 | 53 | * DjVuLibre_ ≥ 3.5.21 54 | 55 | * python-djvulibre_ 56 | 57 | * subprocess32_ 58 | 59 | * lxml_ ≥ 2.0 60 | 61 | Additionally, some optional features require the following software: 62 | 63 | * PyICU_ ≥ 1.0.1 — 64 | required for the ``--word-segmentation=uax29`` option 65 | 66 | * html5lib_ — 67 | required for the ``--html5`` option 68 | 69 | The following software is needed to rebuild the manual pages from 70 | source: 71 | 72 | * xsltproc_ 73 | 74 | * `DocBook XSL stylesheets`_ 75 | 76 | 77 | .. _OCRopus: 78 | https://code.google.com/p/ocropus/ 79 | .. _Cuneiform: 80 | https://launchpad.net/cuneiform-linux 81 | .. _Ocrad: 82 | https://www.gnu.org/software/ocrad/ 83 | .. _GOCR: 84 | http://www-e.uni-magdeburg.de/jschulen/ocr/ 85 | .. _Tesseract: 86 | https://github.com/tesseract-ocr/tesseract 87 | .. _DjVuLibre: 88 | http://djvu.sourceforge.net/ 89 | .. 
_python-djvulibre: 90 | https://jwilk.net/software/python-djvulibre 91 | .. _lxml: 92 | https://lxml.de/ 93 | .. _subprocess32: 94 | https://pypi.org/project/subprocess32/ 95 | .. _PyICU: 96 | https://pypi.org/project/PyICU/ 97 | .. _html5lib: 98 | https://github.com/html5lib/html5lib-python 99 | .. _xsltproc: 100 | http://xmlsoft.org/XSLT/xsltproc2.html 101 | .. _DocBook XSL stylesheets: 102 | https://github.com/docbook/xslt10-stylesheets 103 | 104 | Acknowledgment 105 | ============== 106 | 107 | ocrodjvu development was supported by the Polish Ministry of Science 108 | and Higher Education's grant no. N N519 384036 (2009–2012, 109 | https://bitbucket.org/jsbien/ndt). 110 | 111 | .. vim:ft=rst ts=3 sts=3 sw=3 et tw=72 112 | -------------------------------------------------------------------------------- /doc/djvu2hocr.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | ]> 7 | 8 | 9 | 10 | 11 | &p; manual 12 | ocrodjvu 13 | 2020-08-06 14 | 15 | Jakub Wilk 16 | jwilk@jwilk.net 17 | 18 | 19 | 20 | 21 | &p; 22 | 1 23 | &version; 24 | 25 | 26 | 27 | &p; 28 | DjVu to hOCR converter 29 | 30 | 31 | 32 | 33 | &p; 34 | option 35 | djvu-file 36 | 37 | 38 | &p; 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | Description 49 | 50 | &p; converts hidden text from a DjVu file to the 51 | hOCR format. 52 | 53 | 54 | 55 | 56 | Options 57 | 58 | Input selection options 59 | 60 | 61 | 62 | 63 | 64 | 65 | Specifies pages to convert. page-range is a comma-separated list of 66 | sub-ranges. Each sub-range is either a single page (e.g. 17) or a contiguous 67 | range of pages (e.g. 37-42). Pages are numbered from 1. 68 | 69 | 70 | The default is to convert all pages. 71 | 72 | 73 | 74 | 75 | 76 | 77 | Text segmentation options 78 | 79 | 80 | 81 | 82 | 83 | Use the same word segmentation as found in the DjVu file. 84 | 85 | 86 | This is the default. 
87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | Use the Unicode Text Segmentation algorithm 95 | to break lines into words, possibly fixing word segmentation found in the DjVu file. 96 | 97 | 98 | 99 | 100 | 101 | 102 | HTML output options 103 | 104 | 105 | 106 | 107 | 108 | Specifies the document title. 109 | 110 | 111 | The default is DjVu hidden text layer. 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | Add the specified CSS style to the document. 120 | 121 | 122 | For example, can be used to visually 123 | preserve line breaks. 124 | 125 | 126 | 127 | 128 | 129 | 130 | Other options 131 | 132 | 133 | 134 | 135 | Output version information and exit. 136 | 137 | 138 | 139 | 140 | 141 | 142 | Display help and exit. 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | Portability 151 | 152 | &p; uses a custom extension to hOCR to retain characters which cannot be directly represented in an HTML/XML 153 | document. For example, control character BEL (^G, U+0007), is converted into the following HTML chunk: 154 | ]]> 155 | 156 | 157 | 158 | 159 | Bugs 160 | 161 | Please report bugs at: 162 | 163 | 164 | 165 | 166 | 167 | See also 168 | 169 | 170 | djvu 171 | 1 172 | , 173 | 174 | hocr2djvused 175 | 1 176 | , 177 | 178 | ocrodjvu 179 | 1 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /doc/hocr2djvused.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | ]> 7 | 8 | 9 | 10 | 11 | &p; manual 12 | ocrodjvu 13 | 2020-08-06 14 | 15 | Jakub Wilk 16 | jwilk@jwilk.net 17 | 18 | 19 | 20 | 21 | &p; 22 | 1 23 | &version; 24 | 25 | 26 | 27 | &p; 28 | hOCR to djvused script converter 29 | 30 | 31 | 32 | 33 | &p; 34 | option 35 | hocr-file 36 | 37 | 38 | 39 | 40 | Description 41 | 42 | &p; reads one or more hOCR files (as 43 | 44 | produced by 45 | OCRopus or 46 | Cuneiform or 47 | Tesseract) 48 | and converts them to a djvused script. 
49 | 50 | 51 | Unless a filename is explicitly provided on the command line, hOCR is read from the standard input. 52 | 53 | 54 | 55 | 56 | Options 57 | 58 | Text segmentation options 59 | 60 | 61 | 62 | 63 | 64 | 65 | Record location of every line. Don't record locations of particular words or characters. 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | Record location of every line and every word. Don't record locations of particular characters. 75 | 76 | 77 | This is the default. 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | Record location of every line, every word and every character. 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | Consider each non-empty sequence of non-whitespace characters a single word. 95 | 96 | 97 | This is the default, despite being linguistically incorrect. 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | Use the Unicode Text Segmentation algorithm 106 | to break lines into words. 107 | 108 | 109 | This option breaks assumptions of some DjVu tools that words are separated by spaces, 110 | and therefore it is not recommended. 111 | 112 | 113 | 114 | 115 | 116 | 117 | Other options 118 | 119 | 120 | 121 | 122 | 123 | Assume that DjVu pages are rotated by n degrees. 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | Specifies that page size is width pixels × 132 | height pixels. 133 | 134 | This option is required for hOCR generated by Cuneiform (< 0.8) and superfluous otherwise. 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | Use a HTML5 143 | parser, which is more robust but slower than the default parser. 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | Attempt to fix UTF-8 encoding issues and eliminate unwanted control characters. 152 | 153 | 154 | This option might be needed for hOCR generated by 155 | Cuneiform 156 | or 157 | Tesseract. 158 | 159 | 160 | 161 | 162 | 163 | 164 | Output version information and exit. 165 | 166 | 167 | 168 | 169 | 170 | 171 | Display help and exit. 
172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | Bugs 180 | 181 | Please report bugs at: 182 | 183 | 184 | 185 | 186 | 187 | See also 188 | 189 | 190 | djvu 191 | 1 192 | , 193 | 194 | ocrodjvu 195 | 1 196 | , 197 | 198 | djvu2hocr 199 | 1 200 | , 201 | 202 | djvused 203 | 1 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /doc/todo: -------------------------------------------------------------------------------- 1 | Missing tests 2 | ============= 3 | * for non-ASCII filenames 4 | * for Cuneiform hOCR with inline formatting 5 | * for Cuneiform hOCR with bounding boxes for whitespace characters 6 | * for Cuneiform hOCR with empty pages 7 | * for OCRad, in particular: 8 | - for non-ASCII characters 9 | - for text close to a page boundary 10 | - for empty pages 11 | - for characters with no interpretations 12 | * for GOCR, in particular 13 | - for non-ASCII characters 14 | - for empty pages 15 | * for https://bugs.debian.org/575484#35 16 | * for https://bugs.debian.org/671764 17 | * for https://github.com/jwilk/ocrodjvu/issues/4 18 | * for https://github.com/jwilk/ocrodjvu/issues/12 19 | 20 | Documentation 21 | ============= 22 | * write better documentation for -X 23 | 24 | Miscellanea 25 | =========== 26 | * ``--debug`` should enable debug logging 27 | * don't use `` 20 | 21 | 22 | -------------------------------------------------------------------------------- /tests/hocr2djvused/empty_tesseract3.04+charboxes.test1: -------------------------------------------------------------------------------- 1 | # --details=lines 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 "") 6 | . 
7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/empty_tesseract3.04+charboxes.test2: -------------------------------------------------------------------------------- 1 | # --details=words 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 "") 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/empty_tesseract3.04+charboxes.test3: -------------------------------------------------------------------------------- 1 | # --details=words --word-segmentation=uax29 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 "") 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/empty_tesseract3.04+charboxes.test4: -------------------------------------------------------------------------------- 1 | # --details=chars 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 "") 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/empty_tesseract3.04+charboxes.test5: -------------------------------------------------------------------------------- 1 | # --details=chars --word-segmentation=uax29 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 "") 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/multipage_ocropus0.3.1.html: -------------------------------------------------------------------------------- 1 | 4 | OCR Output 5 |
x 6 |
y 7 |
8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/multipage_ocropus0.3.1.test1: -------------------------------------------------------------------------------- 1 | # --details=lines 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 100 100 (line 10 60 50 90 "x")) 6 | . 7 | 8 | select 2 9 | remove-txt 10 | set-txt 11 | (page 0 0 100 100 (line 10 60 50 90 "y")) 12 | . 13 | 14 | -------------------------------------------------------------------------------- /tests/hocr2djvused/non-ascii_cuneiform0.7.html: -------------------------------------------------------------------------------- 1 | 2 |

czarów

-------------------------------------------------------------------------------- /tests/hocr2djvused/non-ascii_cuneiform1.0.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 |

czarów 8 |

9 |

10 |

11 |
12 | -------------------------------------------------------------------------------- /tests/hocr2djvused/non-ascii_ocropus0.3.1.html: -------------------------------------------------------------------------------- 1 | 4 | OCR Output 5 |
czarów 6 |
7 | -------------------------------------------------------------------------------- /tests/hocr2djvused/non-ascii_tesseract3.00.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 |
11 |

12 | czarów 13 |

14 |
15 |
16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/hocr2djvused/test.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2010-2017 Jakub Wilk 4 | # 5 | # This file is part of ocrodjvu. 6 | # 7 | # ocrodjvu is free software; you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License version 2 as 9 | # published by the Free Software Foundation. 10 | # 11 | # ocrodjvu is distributed in the hope that it will be useful, but WITHOUT 12 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | # for more details. 15 | 16 | import contextlib 17 | import io 18 | import os 19 | import re 20 | import shlex 21 | import sys 22 | 23 | import djvu.sexpr 24 | 25 | from lib import errors 26 | from lib.cli import hocr2djvused 27 | 28 | from tests.tools import ( 29 | assert_equal, 30 | assert_multi_line_equal, 31 | assert_not_equal, 32 | interim, 33 | sorted_glob, 34 | try_run, 35 | ) 36 | 37 | here = os.path.dirname(__file__) 38 | here = os.path.relpath(here) 39 | 40 | def test_help(): 41 | stdout = io.BytesIO() 42 | stderr = io.BytesIO() 43 | with interim(sys, stdout=stdout, stderr=stderr): 44 | rc = try_run(hocr2djvused.main, ['', '--help']) 45 | assert_equal(stderr.getvalue(), '') 46 | assert_equal(rc, 0) 47 | assert_not_equal(stdout.getvalue(), '') 48 | 49 | def test_version(): 50 | # https://bugs.debian.org/573496 51 | stdout = io.BytesIO() 52 | stderr = io.BytesIO() 53 | with interim(sys, stdout=stdout, stderr=stderr): 54 | rc = try_run(hocr2djvused.main, ['', '--version']) 55 | assert_equal(stderr.getvalue(), '') 56 | assert_equal(rc, 0) 57 | assert_not_equal(stdout.getvalue(), '') 58 | 59 | def test_bad_options(): 60 | stdout = io.BytesIO() 61 | stderr = io.BytesIO() 62 | with interim(sys, stdout=stdout, 
stderr=stderr): 63 | rc = try_run(hocr2djvused.main, ['', '--bad-option']) 64 | assert_equal(rc, errors.EXIT_FATAL) 65 | assert_not_equal(stderr.getvalue(), '') 66 | assert_equal(stdout.getvalue(), '') 67 | 68 | def normalize_sexpr(match): 69 | return djvu.sexpr.Expression.from_string(match.group(1)).as_string(width=80) 70 | 71 | _djvused_text_re = re.compile('^([(].*)(?=^[.]$)', flags=(re.MULTILINE | re.DOTALL)) 72 | def normalize_djvused(script): 73 | return _djvused_text_re.sub(normalize_sexpr, script) 74 | 75 | def _test_from_file(base_filename, index, extra_args): 76 | base_filename = os.path.join(here, base_filename) 77 | test_filename = '{base}.test{i}'.format(base=base_filename, i=index) 78 | html_filename = '{base}.html'.format(base=base_filename) 79 | with open(test_filename, 'rb') as file: 80 | commandline = file.readline() 81 | expected_output = file.read() 82 | args = shlex.split(commandline) + shlex.split(extra_args) 83 | assert_equal(args[0], '#') 84 | with contextlib.closing(io.BytesIO()) as output_file: 85 | with open(html_filename, 'rb') as html_file: 86 | with interim(sys, stdin=html_file, stdout=output_file): 87 | rc = try_run(hocr2djvused.main, args) 88 | assert_equal(rc, 0) 89 | output = output_file.getvalue() 90 | assert_multi_line_equal( 91 | normalize_djvused(expected_output), 92 | normalize_djvused(output) 93 | ) 94 | 95 | def _rough_test_from_file(base_filename, args): 96 | args = ['#'] + shlex.split(args) 97 | if base_filename.endswith(('cuneiform0.7', 'cuneiform0.8')): 98 | # Add dummy page-size information 99 | args += ['--page-size=1000x1000'] 100 | base_filename = os.path.join(here, base_filename) 101 | html_filename = '{base}.html'.format(base=base_filename) 102 | with contextlib.closing(io.BytesIO()) as output_file: 103 | with open(html_filename, 'rb') as html_file: 104 | with interim(sys, stdin=html_file, stdout=output_file): 105 | rc = try_run(hocr2djvused.main, args) 106 | assert_equal(rc, 0) 107 | output = 
output_file.getvalue() 108 | assert_not_equal(output, '') 109 | 110 | def test_from_file(): 111 | rough_test_args = ['--details=lines'] 112 | rough_test_args += [ 113 | '--details={0}'.format(details) + extra 114 | for details in ('words', 'chars') 115 | for extra in ('', ' --word-segmentation=uax29') 116 | ] 117 | known_bases = set() 118 | for test_filename in sorted_glob(os.path.join(here, '*.test[0-9]')): 119 | index = int(test_filename[-1]) 120 | base_filename = os.path.basename(test_filename[:-6]) 121 | known_bases.add(base_filename) 122 | for extra_args in '', '--html5': 123 | yield _test_from_file, base_filename, index, extra_args 124 | for html_filename in sorted_glob(os.path.join(here, '*.html')): 125 | # For HTML files that have no corresponding .test* files, we just check 126 | # if they won't trigger any exception. 127 | base_filename = os.path.basename(html_filename[:-5]) 128 | for args in rough_test_args: 129 | if base_filename not in known_bases: 130 | for extra_args in '', ' --html5': 131 | yield _rough_test_from_file, base_filename, args + extra_args 132 | 133 | # vim:ts=4 sts=4 sw=4 et 134 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform0.7.html: -------------------------------------------------------------------------------- 1 | 2 |

ocr_files/0.bmp

x

3 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform0.7.test1: -------------------------------------------------------------------------------- 1 | # --page-size=1000x1000 --details=lines 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 (para 100 800 200 900 "x")) 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform0.7.test2: -------------------------------------------------------------------------------- 1 | # --page-size=1000x1000 --details=words 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 (para 100 800 200 900 (word 100 800 200 900 "x"))) 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform0.7.test3: -------------------------------------------------------------------------------- 1 | # --page-size=1000x1000 --details=words --word-segmentation=uax29 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 (para 100 800 200 900 (word 100 800 200 900 "x"))) 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform0.7.test4: -------------------------------------------------------------------------------- 1 | # --page-size=1000x1000 --details=chars 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 6 | (para 100 800 200 900 (word 100 800 200 900 (char 100 800 200 900 "x"))) ) 7 | . 
8 | 9 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform0.7.test5: -------------------------------------------------------------------------------- 1 | # --page-size=1000x1000 --details=chars --word-segmentation=uax29 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 6 | (para 100 800 200 900 (word 100 800 200 900 (char 100 800 200 900 "x"))) ) 7 | . 8 | 9 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform1.0.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 |

8 |

9 |

ocr_files/0.bmp 10 |

11 |

12 |

13 |

x 14 |

15 |

16 |

17 |
18 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform1.0.test1: -------------------------------------------------------------------------------- 1 | # --details=lines 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 (para 100 800 200 900 (line 100 800 200 900 "x"))) 6 | . 7 | 8 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform1.0.test2: -------------------------------------------------------------------------------- 1 | # --details=words 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 6 | (para 100 800 200 900 (line 100 800 200 900 (word 100 800 200 900 "x"))) ) 7 | . 8 | 9 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform1.0.test3: -------------------------------------------------------------------------------- 1 | # --details=words --word-segmentation=uax29 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 6 | (para 100 800 200 900 (line 100 800 200 900 (word 100 800 200 900 "x"))) ) 7 | . 8 | 9 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform1.0.test4: -------------------------------------------------------------------------------- 1 | # --details=chars 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 6 | (para 100 800 200 900 7 | (line 100 800 200 900 (word 100 800 200 900 (char 100 800 200 900 "x"))) ) ) 8 | . 
9 | 10 | -------------------------------------------------------------------------------- /tests/hocr2djvused/text+images_cuneiform1.0.test5: -------------------------------------------------------------------------------- 1 | # --details=chars --word-segmentation=uax29 2 | select 1 3 | remove-txt 4 | set-txt 5 | (page 0 0 1000 1000 6 | (para 100 800 200 900 7 | (line 100 800 200 900 (word 100 800 200 900 (char 100 800 200 900 "x"))) ) ) 8 | . 9 | 10 | -------------------------------------------------------------------------------- /tests/image_io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/__init__.py -------------------------------------------------------------------------------- /tests/image_io/test.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2010-2015 Jakub Wilk 4 | # 5 | # This file is part of ocrodjvu. 6 | # 7 | # ocrodjvu is free software; you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License version 2 as 9 | # published by the Free Software Foundation. 10 | # 11 | # ocrodjvu is distributed in the hope that it will be useful, but WITHOUT 12 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | # for more details. 
import io
import os

from PIL import Image

import djvu.decode

from lib import image_io

from tests.tools import (
    assert_equal,
    assert_is_none,
    sorted_glob,
)

# Directory of this test module (relative to the current working directory);
# the *.djvu inputs and the reference images are looked up here.
here = os.path.dirname(__file__)
here = os.path.relpath(here)

# Image writers under test; each is instantiated with a bits-per-pixel value.
formats = image_io.PNM, image_io.BMP, image_io.TIFF

def _test_from_file(base_filename, format):
    # Render page 1 of <base>.djvu through the given writer and compare the
    # bytes with the stored reference file <base>_<bpp>bpp.<ext>.
    layers = (
        djvu.decode.RENDER_MASK_ONLY if format.bpp == 1
        else djvu.decode.RENDER_COLOR
    )
    base_path = os.path.join(here, base_filename)
    djvu_path = '{base}.djvu'.format(base=base_path)
    reference_path = '{base}_{bpp}bpp.{ext}'.format(
        base=base_path,
        bpp=format.bpp,
        ext=format.extension,
    )
    with open(reference_path, 'rb') as reference_file:
        expected_bytes = reference_file.read()
    context = djvu.decode.Context()
    document = context.new_document(djvu.decode.FileUri(djvu_path))
    page_job = document.pages[0].decode(wait=True)
    sink = io.BytesIO()
    format.write_image(page_job, layers, sink)
    actual_bytes = sink.getvalue()
    assert_equal(len(actual_bytes), len(expected_bytes))
    if actual_bytes == expected_bytes:
        # The easy part:
        return
    # The result might be still correct, even if the strings are different.
    # Think of BMP format and its padding bytes.
    expected_image = Image.open(reference_path)
    actual_image = Image.open(io.BytesIO(actual_bytes))
    assert_equal(actual_image.format, expected_image.format)
    assert_equal(actual_image.size, expected_image.size)
    assert_equal(actual_image.mode, expected_image.mode)
    if actual_image.palette is None:
        assert_is_none(expected_image.palette)
    else:
        assert_equal(
            list(actual_image.palette.getdata()),
            list(expected_image.palette.getdata()),
        )
    assert_equal(
        list(actual_image.getdata()),
        list(expected_image.getdata()),
    )

def test_from_file():
    # Test generator: one case per (*.djvu file, writer class, bpp) combination.
    pattern = os.path.join(here, '*.djvu')
    for djvu_filename in sorted_glob(pattern):
        # Strip the 5-character '.djvu' suffix, then the directory part.
        base_filename = os.path.basename(djvu_filename[:-5])
        for format_class in formats:
            for bpp in 1, 24:
                yield _test_from_file, base_filename, format_class(bpp)

# vim:ts=4 sts=4 sw=4 et

# --------------------------------------------------------------------------------
# /tests/image_io/whirl.djvu:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/whirl.djvu
# --------------------------------------------------------------------------------
# /tests/image_io/whirl_1bpp.bmp:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/whirl_1bpp.bmp
# --------------------------------------------------------------------------------
# /tests/image_io/whirl_1bpp.pbm:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/whirl_1bpp.pbm
# --------------------------------------------------------------------------------
# /tests/image_io/whirl_1bpp.tif:
# --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/whirl_1bpp.tif -------------------------------------------------------------------------------- /tests/image_io/whirl_24bpp.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/whirl_24bpp.bmp -------------------------------------------------------------------------------- /tests/image_io/whirl_24bpp.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/whirl_24bpp.ppm -------------------------------------------------------------------------------- /tests/image_io/whirl_24bpp.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/image_io/whirl_24bpp.tif -------------------------------------------------------------------------------- /tests/ocrodjvu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwilk-archive/ocrodjvu/9b317314c90217432b4cd9cf6b13d19769e740dd/tests/ocrodjvu/__init__.py -------------------------------------------------------------------------------- /tests/ocrodjvu/test.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2010-2020 Jakub Wilk 4 | # 5 | # This file is part of ocrodjvu. 6 | # 7 | # ocrodjvu is free software; you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License version 2 as 9 | # published by the Free Software Foundation. 
#
# ocrodjvu is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

import io
import os
import shutil
import sys

from lib import errors
from lib import temporary
from lib.cli import ocrodjvu

from tests.tools import (
    assert_equal,
    assert_is_not_none,
    assert_not_equal,
    interim,
    remove_logging_handlers,
    require_locale_encoding,
    try_run,
)

# Engine names printed by --list-engines. Set by test_list_engines() and
# read by test_list_languages(), which therefore has to run after it.
engines = None

def _run_main(argv):
    # Call ocrodjvu.main(argv) with sys.stdout / sys.stderr replaced by
    # in-memory buffers; return (exit code, stdout bytes, stderr bytes).
    captured_out = io.BytesIO()
    captured_err = io.BytesIO()
    with interim(sys, stdout=captured_out, stderr=captured_err):
        rc = try_run(ocrodjvu.main, argv)
    return rc, captured_out.getvalue(), captured_err.getvalue()

def test_help():
    rc, out, err = _run_main(['', '--help'])
    assert_equal(err, '')
    assert_equal(rc, 0)
    assert_not_equal(out, '')

def test_version():
    # https://bugs.debian.org/573496
    rc, out, err = _run_main(['', '--version'])
    assert_equal(rc, 0)
    assert_equal(err, '')
    assert_not_equal(out, '')

def test_bad_options():
    # No arguments at all: a fatal usage error reported on stderr only.
    rc, out, err = _run_main([''])
    assert_equal(rc, errors.EXIT_FATAL)
    assert_not_equal(err, '')
    assert_equal(out, '')

def test_list_engines():
    global engines
    rc, out, err = _run_main(['', '--list-engines'])
    assert_equal(err, '')
    assert_equal(rc, 0)
    engines = out.splitlines()

def _test_list_languages(engine):
    rc, out, err = _run_main(['', '--engine', engine, '--list-languages'])
    assert_equal(err, '')
    assert_equal(rc, 0)
    assert_not_equal(out, '')

def test_list_languages():
    # Test generator: one case per engine discovered by test_list_engines().
    assert_is_not_none(engines)
    for engine in engines:
        yield _test_list_languages, engine

def test_nonascii_path():
    require_locale_encoding('UTF-8')  # djvused breaks otherwise
    remove_logging_handlers('ocrodjvu.')
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    source_path = os.path.join(tests_dir, '..', 'data', 'empty.djvu')
    with temporary.directory() as tmpdir:
        tmp_path = os.path.join(tmpdir, 'тмп.djvu')
        shutil.copy(source_path, tmp_path)
        rc, out, err = _run_main(['', '--engine', '_dummy', '--in-place', tmp_path])
    assert_equal(err, '')
    assert_equal(rc, 0)
    assert_equal(out, '')

def test_bad_page_id():
    remove_logging_handlers('ocrodjvu.')
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    input_path = os.path.join(tests_dir, '..', 'data', 'bad-page-id.djvu')
    captured_out = io.BytesIO()
    captured_err = io.BytesIO()
    with temporary.directory() as tmpdir:
        out_path = os.path.join(tmpdir, 'tmp.djvu')
        argv = ['', '--engine', '_dummy', '--save-bundled', out_path, input_path]
        with interim(sys, stdout=captured_out, stderr=captured_err):
            # Run with ocrodjvu.system_encoding temporarily set to 'ASCII'.
            with interim(ocrodjvu, system_encoding='ASCII'):
                rc = try_run(ocrodjvu.main, argv)
    assert_equal(captured_err.getvalue(), '')
    assert_equal(rc, 0)
    assert_equal(captured_out.getvalue(), '')

# vim:ts=4 sts=4 sw=4 et

# --------------------------------------------------------------------------------
# /tests/ocrodjvu/test_integration.py:
# --------------------------------------------------------------------------------
# encoding=UTF-8

# Copyright © 2018 Jakub Wilk
#
# This file is part of ocrodjvu.
#
# ocrodjvu is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# ocrodjvu is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

import distutils.spawn
import io
import os
import sys

from lib import temporary
from lib.cli import ocrodjvu

from tests.tools import (
    assert_multi_line_equal,
    assert_equal,
    interim,
    remove_logging_handlers,
    try_run,
    SkipTest,
)

# Executable names of the OCR engines exercised by test_ocr().
engines = 'tesseract', 'cuneiform', 'gocr', 'ocrad'

def _test_ocr(engine, layers):
    # Skip (rather than fail) when the engine's executable is not on PATH.
    if not distutils.spawn.find_executable(engine):
        raise SkipTest('{cmd} not found'.format(cmd=engine))
    remove_logging_handlers('ocrodjvu.')
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    input_path = os.path.join(tests_dir, '..', 'data', 'alice.djvu')
    captured_out = io.BytesIO()
    captured_err = io.BytesIO()
    with temporary.directory() as tmpdir:
        output_path = os.path.join(tmpdir, 'tmp.djvu')
        argv = [
            '',
            '--engine', engine,
            '--render', layers,
            '--save-bundled', output_path,
            input_path,
        ]
        with interim(sys, stdout=captured_out, stderr=captured_err):
            rc = try_run(ocrodjvu.main, argv)
    # A clean run prints nothing on either stream and exits with 0.
    assert_multi_line_equal(captured_err.getvalue(), '')
    assert_equal(rc, 0)
    assert_multi_line_equal(captured_out.getvalue(), '')

def test_ocr():
    # Test generator: every engine is run with both --render settings.
    for engine in engines:
        for layers in 'mask', 'all':
            yield _test_ocr, engine, layers

# vim:ts=4 sts=4 sw=4 et
-------------------------------------------------------------------------------- /tests/test_ipc.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2010-2022 Jakub Wilk 4 | # 5 | # This file is part of ocrodjvu. 6 | # 7 | # ocrodjvu is free software; you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License version 2 as 9 | # published by the Free Software Foundation. 10 | # 11 | # ocrodjvu is distributed in the hope that it will be useful, but WITHOUT 12 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | # for more details. 15 | 16 | from __future__ import print_function 17 | 18 | import errno 19 | import os 20 | import signal 21 | 22 | from tests.tools import ( 23 | assert_equal, 24 | assert_false, 25 | assert_raises, 26 | assert_true, 27 | interim_environ, 28 | ) 29 | 30 | from lib import ipc 31 | from lib import temporary 32 | 33 | class test_exceptions(): 34 | 35 | def test_sigint(self): 36 | ex = ipc.CalledProcessInterrupted(signal.SIGINT, 'eggs') 37 | assert_equal(str(ex), "Command 'eggs' was interrupted by signal SIGINT") 38 | assert_true(ex.by_user) 39 | 40 | def test_sigabrt(self): 41 | ex = ipc.CalledProcessInterrupted(signal.SIGABRT, 'eggs') 42 | assert_equal(str(ex), "Command 'eggs' was interrupted by signal SIGABRT") 43 | assert_false(ex.by_user) 44 | 45 | def test_sigsegv(self): 46 | ex = ipc.CalledProcessInterrupted(signal.SIGSEGV, 'eggs') 47 | assert_equal(str(ex), "Command 'eggs' was interrupted by signal SIGSEGV") 48 | assert_false(ex.by_user) 49 | 50 | def test_invalid_signo(self): 51 | # signal.NSIG is guaranteed not be a correct signal number 52 | ex = ipc.CalledProcessInterrupted(signal.NSIG, 'eggs') 53 | assert_equal(str(ex), "Command 'eggs' was interrupted by signal {0}".format(signal.NSIG)) 54 | assert_false(ex.by_user) 55 
| 56 | def test_init_exc(): 57 | # https://bugs.python.org/issue32490 58 | prog = 'ocrodjvu-nonexistent' 59 | with assert_raises(EnvironmentError) as ecm: 60 | ipc.Subprocess([prog]) 61 | msg = '[Errno {err}] {strerr}: {prog!r}'.format( 62 | err=errno.ENOENT, 63 | strerr=os.strerror(errno.ENOENT), 64 | prog=prog 65 | ) 66 | assert_equal(str(ecm.exception), msg) 67 | 68 | class test_wait(): 69 | 70 | def test0(self): 71 | child = ipc.Subprocess(['true']) 72 | child.wait() 73 | 74 | def test1(self): 75 | child = ipc.Subprocess(['false']) 76 | with assert_raises(ipc.CalledProcessError) as ecm: 77 | child.wait() 78 | message = str(ecm.exception) 79 | if message[-1] == '.': # subprocess32 >= 3.5 80 | message = message[:-1] 81 | assert_equal(message, "Command 'false' returned non-zero exit status 1") 82 | 83 | def _test_signal(self, name): 84 | child = ipc.Subprocess(['cat'], stdin=ipc.PIPE) # Any long-standing process would do. 85 | os.kill(child.pid, getattr(signal, name)) 86 | with assert_raises(ipc.CalledProcessInterrupted) as ecm: 87 | child.wait() 88 | assert_equal(str(ecm.exception), "Command 'cat' was interrupted by signal " + name) 89 | 90 | def test_wait_signal(self): 91 | for name in 'SIGINT', 'SIGABRT', 'SIGSEGV': 92 | yield self._test_signal, name 93 | 94 | class test_environment(): 95 | 96 | # https://bugs.debian.org/594385 97 | 98 | def test1(self): 99 | with interim_environ(ocrodjvu='42'): 100 | child = ipc.Subprocess( 101 | ['sh', '-c', 'printf $ocrodjvu'], 102 | stdout=ipc.PIPE, stderr=ipc.PIPE, 103 | ) 104 | stdout, stderr = child.communicate() 105 | assert_equal(stdout, '42') 106 | assert_equal(stderr, '') 107 | 108 | def test2(self): 109 | with interim_environ(ocrodjvu='42'): 110 | child = ipc.Subprocess( 111 | ['sh', '-c', 'printf $ocrodjvu'], 112 | stdout=ipc.PIPE, stderr=ipc.PIPE, 113 | env={}, 114 | ) 115 | stdout, stderr = child.communicate() 116 | assert_equal(stdout, '42') 117 | assert_equal(stderr, '') 118 | 119 | def test3(self): 120 | with 
interim_environ(ocrodjvu='42'): 121 | child = ipc.Subprocess( 122 | ['sh', '-c', 'printf $ocrodjvu'], 123 | stdout=ipc.PIPE, stderr=ipc.PIPE, 124 | env=dict(ocrodjvu='24'), 125 | ) 126 | stdout, stderr = child.communicate() 127 | assert_equal(stdout, '24') 128 | assert_equal(stderr, '') 129 | 130 | def test_path(self): 131 | path = os.getenv('PATH') 132 | with temporary.directory() as tmpdir: 133 | command_name = temporary.name(dir=tmpdir) 134 | command_path = os.path.join(tmpdir, command_name) 135 | with open(command_path, 'wt') as file: 136 | print('#!/bin/sh', file=file) 137 | print('printf 42', file=file) 138 | os.chmod(command_path, 0o700) 139 | path = str.join(os.pathsep, [tmpdir, path]) 140 | with interim_environ(PATH=path): 141 | child = ipc.Subprocess([command_name], 142 | stdout=ipc.PIPE, stderr=ipc.PIPE, 143 | ) 144 | stdout, stderr = child.communicate() 145 | assert_equal(stdout, '42') 146 | assert_equal(stderr, '') 147 | 148 | def _test_locale(self): 149 | child = ipc.Subprocess(['locale'], 150 | stdout=ipc.PIPE, stderr=ipc.PIPE 151 | ) 152 | stdout, stderr = child.communicate() 153 | stdout = stdout.splitlines() 154 | stderr = stderr.splitlines() 155 | assert_equal(stderr, []) 156 | data = dict(line.split('=', 1) for line in stdout) 157 | has_lc_all = has_lc_ctype = has_lang = 0 158 | for key, value in data.iteritems(): 159 | if key == 'LC_ALL': 160 | has_lc_all = 1 161 | assert_equal(value, '') 162 | elif key == 'LC_CTYPE': 163 | has_lc_ctype = 1 164 | assert_equal(value, 'en_US.UTF-8') 165 | elif key == 'LANG': 166 | has_lang = 1 167 | assert_equal(value, '') 168 | elif key == 'LANGUAGE': 169 | assert_equal(value, '') 170 | else: 171 | assert_equal(value, '"POSIX"') 172 | assert_true(has_lc_all) 173 | assert_true(has_lc_ctype) 174 | assert_true(has_lang) 175 | 176 | def test_locale_lc_all(self): 177 | with interim_environ(LC_ALL='en_US.UTF-8'): 178 | self._test_locale() 179 | 180 | def test_locale_lc_ctype(self): 181 | with 
interim_environ(LC_ALL=None, LC_CTYPE='en_US.UTF-8'): 182 | self._test_locale() 183 | 184 | def test_locale_lang(self): 185 | with interim_environ(LC_ALL=None, LC_CTYPE=None, LANG='en_US.UTF-8'): 186 | self._test_locale() 187 | 188 | class test_require(): 189 | 190 | def test_ok(self): 191 | ipc.require('cat') 192 | 193 | def test_fail(self): 194 | prog = 'ocrodjvu-nonexistent' 195 | with assert_raises(OSError) as ecm: 196 | ipc.require(prog) 197 | exc_message = "[Errno {errno.ENOENT}] command not found: {cmd!r}".format( 198 | errno=errno, 199 | cmd=prog, 200 | ) 201 | assert_equal(str(ecm.exception), exc_message) 202 | 203 | # vim:ts=4 sts=4 sw=4 et 204 | -------------------------------------------------------------------------------- /tests/test_text_zones.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2015 Jakub Wilk 4 | # 5 | # This file is part of ocrodjvu. 6 | # 7 | # ocrodjvu is free software; you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License version 2 as 9 | # published by the Free Software Foundation. 10 | # 11 | # ocrodjvu is distributed in the hope that it will be useful, but WITHOUT 12 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | # for more details. 
import io
import distutils.version

from tests.tools import (
    assert_equal,
)

from lib import text_zones

V = distutils.version.LooseVersion
python_djvulibre_version = V(text_zones.decode.__version__)

def test_print_sexpr():
    inp = 'jeż'
    # python-djvulibre older than 0.4 is expected to print the non-ASCII
    # bytes as octal escapes; newer versions print them as-is.
    legacy_output = python_djvulibre_version < V('0.4')
    out = r'"je\305\274"' if legacy_output else '"jeż"'
    sink = io.BytesIO()
    text_zones.print_sexpr(text_zones.sexpr.Expression(inp), sink)
    sink.seek(0)
    assert_equal(sink.getvalue(), out)

# vim:ts=4 sts=4 sw=4 et

# --------------------------------------------------------------------------------
# /tests/test_unicode_support.py:
# --------------------------------------------------------------------------------
# encoding=UTF-8

# Copyright © 2010-2019 Jakub Wilk
#
# This file is part of ocrodjvu.
#
# ocrodjvu is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# ocrodjvu is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
from tests.tools import (
    assert_equal,
    assert_not_equal,
)

from lib.unicode_support import (
    get_icu,
    simple_word_break_iterator,
    word_break_iterator,
)

text = u'\u201CJekyll,\u201D cried Utterson, with a\xa0loud voice, \u201CI demand to see you.\u201D'

# Break offsets expected for `text` from simple_word_break_iterator(), and
# from word_break_iterator() when no locale is passed.
_simple_breaks = [
    9, 10, 15, 16, 25, 26, 30, 31, 32, 33, 37, 38,
    44, 45, 47, 48, 54, 55, 57, 58, 61, 62, 67,
]

# Break offsets expected for `text` from word_break_iterator() with the
# ICU 'en' locale.
_icu_en_breaks = [
    1, 7, 8, 9, 10, 15, 16, 24, 25, 26, 30, 31, 32, 33, 37, 38,
    43, 44, 45, 46, 47, 48, 54, 55, 57, 58, 61, 62, 65, 66, 67,
]

def _require_icu():
    # Fetch the ICU module through get_icu(); the test fails if it is None.
    icu = get_icu()
    assert_not_equal(icu, None)
    return icu

class test_simple_word_break_iterator():

    def test_nonempty(self):
        breaks = list(simple_word_break_iterator(text))
        assert_equal(breaks, _simple_breaks)
        # The last break sits at the very end of the text.
        assert_equal(_simple_breaks[-1], len(text))

    def test_empty(self):
        breaks = list(simple_word_break_iterator(''))
        assert_equal(breaks, [])

class test_word_break_iterator():

    def test_nolocale(self):
        breaks = list(word_break_iterator(text))
        assert_equal(breaks, _simple_breaks)
        assert_equal(_simple_breaks[-1], len(text))

    def test_nolocale_empty(self):
        breaks = list(word_break_iterator(''))
        assert_equal(breaks, [])

    def test_en(self):
        icu = _require_icu()
        breaks = list(word_break_iterator(text, icu.Locale('en')))
        assert_equal(breaks, _icu_en_breaks)
        assert_equal(_icu_en_breaks[-1], len(text))

    def test_en_simple(self):
        # Trigger reference-counting bug that was fixed in PyICU 1.0.1:
        # https://github.com/ovalhub/pyicu/commit/515e076682e29d806aeb5f6b1016b799d03d92a9
        icu = _require_icu()
        breaks = list(word_break_iterator('eggs', icu.Locale('en')))
        assert_equal(breaks, [4])

    def test_en_empty(self):
        icu = _require_icu()
        breaks = list(word_break_iterator('', icu.Locale('en')))
        assert_equal(breaks, [])

# vim:ts=4 sts=4 sw=4 et
# --------------------------------------------------------------------------------
# /tests/tools.py:
# --------------------------------------------------------------------------------
# encoding=UTF-8

# Copyright © 2010-2021 Jakub Wilk
#
# This file is part of ocrodjvu.
#
# ocrodjvu is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# ocrodjvu is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

import codecs
import contextlib
import glob
import locale
import logging
import os

from nose import SkipTest

from nose.tools import (
    assert_equal,
    assert_false,
    assert_greater,
    assert_greater_equal,
    assert_in,
    assert_is,
    assert_is_instance,
    assert_is_none,
    assert_is_not_none,
    assert_less,
    assert_less_equal,
    assert_multi_line_equal,
    assert_not_equal,
    assert_raises,
    assert_raises_regexp as assert_raises_regex,
    assert_regexp_matches as assert_regex,
    assert_true,
)

# Lift the diff-size limit on the TestCase class behind nose's assert_*
# helpers, so failing comparisons print their complete diff.
type(assert_multi_line_equal.__self__).maxDiff = None

@contextlib.contextmanager
def interim(obj, **override):
    '''
    Temporarily set attributes of obj.

    Each keyword names an attribute of obj and gives its interim value.
    The attributes must already exist: their current values are read up
    front and written back when the with-block exits, even on error.
    '''
    copy = dict(
        (key, getattr(obj, key))
        for key in override
    )
    for key, value in override.iteritems():
        setattr(obj, key, value)
    try:
        yield
    finally:
        for key, value in copy.iteritems():
            setattr(obj, key, value)

@contextlib.contextmanager
def interim_environ(**override):
    '''
    Temporarily change environment variables.

    Each keyword names a variable; a value of None removes the variable
    for the duration of the with-block, any other value is assigned to it.
    On exit every named variable is removed again and the values that
    existed beforehand are put back.
    '''
    keys = set(override)
    # Only variables that exist right now have a value worth saving.
    copy_keys = keys & set(os.environ)
    copy = dict(
        (key, value)
        for key, value in os.environ.iteritems()
        if key in copy_keys
    )
    for key, value in override.iteritems():
        if value is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = value
    try:
        yield
    finally:
        for key in keys:
            os.environ.pop(key, None)
        os.environ.update(copy)

def try_run(f, *args, **kwargs):
    '''
    Catch SystemExit etc.

    Call f(*args, **kwargs). Return the exit code carried by SystemExit
    if f raises it, or 0 if f returns normally. Other exceptions
    propagate.
    '''
    try:
        f(*args, **kwargs)
    except SystemExit as ex:
        return ex.code
    else:
        return 0

def sorted_glob(*args, **kwargs):
    '''Return the matches of glob.iglob(*args, **kwargs) as a sorted list.'''
    return sorted(glob.iglob(*args, **kwargs))

def remove_logging_handlers(prefix):
    '''
    Detach all handlers from every registered logger whose name starts
    with prefix.

    Entries of the logging manager's registry that have no "handlers"
    attribute are skipped.
    '''
    loggers = logging.Logger.manager.loggerDict.values()
    for logger in loggers:
        try:
            handlers = logger.handlers
        except AttributeError:
            continue
        if not logger.name.startswith(prefix):
            continue
        # Iterate over a copy: removeHandler() removes the handler from
        # logger.handlers itself, and looping over that list while it
        # shrinks skips every second handler.
        for handler in list(handlers):
            logger.removeHandler(handler)

def require_locale_encoding(encoding):
    '''
    Raise SkipTest unless the preferred locale encoding is the given one.

    Both names are normalized through codecs.lookup() before comparison,
    so aliases of the same codec are treated as equal.
    '''
    req_encoding = codecs.lookup(encoding).name
    locale_encoding = locale.getpreferredencoding()
    locale_encoding = codecs.lookup(locale_encoding).name
    if req_encoding != locale_encoding:
        raise SkipTest('locale encoding {enc} is required'.format(enc=encoding))

__all__ = [
    'assert_equal',
    'assert_false',
    'assert_greater',
    'assert_greater_equal',
    'assert_in',
    'assert_is',
    'assert_is_instance',
    'assert_is_none',
    'assert_is_not_none',
    'assert_less',
    'assert_less_equal',
    'assert_multi_line_equal',
    'assert_not_equal',
    'assert_raises',
    'assert_raises_regex',
    'assert_regex',
    'assert_true',
    'interim',
    'interim_environ',
    'remove_logging_handlers',
    'require_locale_encoding',
    'sorted_glob',
    'try_run',
]

# vim:ts=4 sts=4 sw=4 et

# --------------------------------------------------------------------------------