├── .coveragerc ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .pyup.yml ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── Vagrantfile ├── bin └── textract ├── docs ├── Makefile ├── changelog.rst ├── command_line_interface.rst ├── conf.py ├── contributing.rst ├── index.rst ├── installation.rst └── python_package.rst ├── provision ├── debian.sh ├── development.sh ├── python2.sh ├── python3.sh └── travis-mock.sh ├── requirements ├── debian ├── freebsd ├── python ├── python-dev2 ├── python-dev3 └── python-doc ├── setup.cfg ├── setup.py ├── tests ├── Dockerfile ├── Makefile ├── __init__.py ├── base.py ├── csv │ ├── raw_text.csv │ ├── raw_text.txt │ └── standardized_text.csv ├── doc │ ├── raw_text.doc │ ├── raw_text.txt │ ├── standardized_text.doc │ └── standardized_text_1.odt ├── docker_entry.sh ├── docx │ ├── paragraphs_and_tables.docx │ ├── paragraphs_and_tables.txt │ ├── raw_text.docx │ ├── raw_text.txt │ └── standardized_text.docx ├── eml │ ├── raw_text.eml │ ├── raw_text.txt │ └── standardized_text.eml ├── epub │ ├── raw_text.epub │ ├── raw_text.txt │ └── standardized_text.epub ├── gif │ ├── raw_text.gif │ └── standardized_text.gif ├── html │ ├── raw_text.html │ ├── raw_text.txt │ ├── standardized_text.html │ ├── tables.html │ └── tables.txt ├── jpg │ ├── raw_text.jpg │ └── standardized_text.jpg ├── json │ ├── raw_text.json │ ├── raw_text.txt │ └── standardized_text.json ├── mp3 │ ├── raw_text-m=google.txt │ ├── raw_text-m=sphinx.txt │ ├── raw_text.mp3 │ ├── raw_text.txt │ └── standardized_text.mp3 ├── msg │ ├── raw_text.msg │ ├── raw_text.txt │ └── standardized_text.msg ├── no_ext │ ├── docx_paragraphs_and_tables │ ├── msg_standardized_text │ └── pdf_standardized_text ├── odt │ ├── raw_text.odt │ ├── raw_text.txt │ └── standardized_text.odt ├── ogg │ ├── raw_text.ogg │ ├── raw_text.txt │ └── standardized_text.ogg ├── pdf │ ├── ocr_text.pdf │ ├── raw_text-m=pdfminer.txt │ ├── 
raw_text.pdf │ ├── raw_text.txt │ ├── standardized_text.pdf │ ├── two_column.pdf │ └── two_column.txt ├── png │ ├── raw_text.png │ └── standardized_text.png ├── pptx │ ├── raw_text.pptx │ ├── raw_text.txt │ └── standardized_text.pptx ├── ps │ ├── raw_text.ps │ ├── raw_text.txt │ └── standardized_text.ps ├── psv │ ├── raw_text.psv │ ├── raw_text.txt │ └── standardized_text.psv ├── rtf │ ├── raw_text.rtf │ ├── raw_text.txt │ └── standardized_text.rtf ├── run.py ├── run_docker_tests.sh ├── test_csv.py ├── test_doc.py ├── test_docx.py ├── test_eml.py ├── test_epub.py ├── test_exceptions.py ├── test_gif.py ├── test_html.py ├── test_jpg.py ├── test_json.py ├── test_mp3.py ├── test_msg.py ├── test_no_ext.py ├── test_odt.py ├── test_ogg.py ├── test_pdf.py ├── test_png.py ├── test_pptx.py ├── test_ps.py ├── test_psv.py ├── test_rtf.py ├── test_tiff.py ├── test_tsv.py ├── test_txt.py ├── test_wav.py ├── test_xls.py ├── test_xlsx.py ├── tiff │ ├── raw_text.tiff │ └── standardized_text.tiff ├── tsv │ ├── raw_text.tsv │ ├── raw_text.txt │ └── standardized_text.tsv ├── txt │ ├── raw_text.txt │ └── standardized_text.txt ├── wav │ ├── raw_text.txt │ ├── raw_text.wav │ └── standardized_text.wav ├── xls │ ├── raw_text.txt │ ├── raw_text.xls │ └── standardized_text.xls └── xlsx │ ├── raw_text.txt │ ├── raw_text.xlsx │ └── standardized_text.xlsx └── textract ├── __init__.py ├── cli.py ├── colors.py ├── exceptions.py └── parsers ├── __init__.py ├── audio.py ├── csv_parser.py ├── doc_parser.py ├── docx_parser.py ├── eml_parser.py ├── epub_parser.py ├── gif_parser.py ├── html_parser.py ├── image.py ├── jpg_parser.py ├── json_parser.py ├── mp3_parser.py ├── msg_parser.py ├── odt_parser.py ├── ogg_parser.py ├── pdf_parser.py ├── png_parser.py ├── pptx_parser.py ├── ps_parser.py ├── psv_parser.py ├── rtf_parser.py ├── tiff_parser.py ├── tsv_parser.py ├── txt_parser.py ├── utils.py ├── wav_parser.py ├── xls_parser.py └── xlsx_parser.py /.coveragerc: 
-------------------------------------------------------------------------------- 1 | [report] 2 | omit = 3 | */python?.?/* 4 | */site-packages/nose/* 5 | textract/cli.py 6 | textract/colors.py 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. Windows 10] 28 | - Textract version [e.g. 1.6.3] 29 | - Python version [e.g. 3.7] 30 | - Virtual environment (yes/no) 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Which filetype should textract support?** 14 | A clear and concise description of file types you think textract should be able to process. 
15 | 16 | **Which external software (python or command line tool), can parse the requested file type** 17 | A clear and concise description of tools that can parse the desired filetype. 18 | 19 | **Describe alternatives you've considered** 20 | A clear and concise description of any alternative solutions or features you've considered. 21 | 22 | **Additional context** 23 | Add any other context or screenshots about the feature request here. 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | 24 | # Installer logs 25 | pip-log.txt 26 | pip-delete-this-directory.txt 27 | 28 | # Virtual environments 29 | **/venv* 30 | 31 | # Unit test / coverage reports 32 | htmlcov/ 33 | .tox/ 34 | .coverage 35 | .cache 36 | nosetests.xml 37 | coverage.xml 38 | 39 | # Translations 40 | *.mo 41 | 42 | # Mr Developer 43 | .mr.developer.cfg 44 | .project 45 | .pydevproject 46 | 47 | # Rope 48 | .ropeproject 49 | 50 | # Django stuff: 51 | *.log 52 | *.pot 53 | 54 | # Sphinx documentation 55 | docs/build/ 56 | docs/textract/ 57 | 58 | # vagrant 59 | .vagrant 60 | 61 | # ignore big testing files that are dynamically downloaded 62 | tests/pdf/large.pdf 63 | 64 | # ignore raw_text.txt files that are dynamically generated during testing 65 | tests/png/raw_text.txt 66 | tests/gif/raw_text.txt 67 | tests/jpg/raw_text.txt 68 | tests/tiff/raw_text.txt 69 | tests/png/standardized_text.txt 70 | tests/gif/standardized_text.txt 71 | tests/jpg/standardized_text.txt 72 | tests/tiff/standardized_text.txt 73 | tests/pdf/ocr_text.txt 74 | 
-------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | update: all 2 | branch: master 3 | schedule: "every two weeks" 4 | pin: False 5 | requirements: 6 | - requirements/python: 7 | updates: all 8 | - requirements/python-dev: 9 | updates: all 10 | - requirements/python-doc: 11 | updates: all 12 | assignees: 13 | - deanmalmgren 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: focal 2 | os: linux 3 | 4 | language: python 5 | python: 6 | - "2.7" 7 | - "3.7" 8 | 9 | # install system dependencies here with apt-get. 10 | before_install: 11 | - sudo ./provision/debian.sh 12 | - python -m pip install --upgrade pip 13 | 14 | # install python dependencies including this package in the travis 15 | # virtualenv 16 | install: 17 | 18 | - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; 19 | then ./provision/python3.sh; 20 | fi 21 | - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; 22 | then ./provision/python2.sh; 23 | fi 24 | - pip install .[pocketsphinx] 25 | 26 | # commands to run the testing suite. 
if any of these fail, travis lets us know 27 | script: 28 | - cd tests && make && cd - 29 | - nosetests --with-coverage --cover-package=textract 30 | - cd tests && pytest && cd - 31 | # - pycodestyle textract/ bin/textract 32 | - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; 33 | then cd docs && make html && cd -; 34 | fi 35 | 36 | # commands to run after the tests successfully complete 37 | after_success: 38 | - coveralls 39 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | [![Jazzband](https://jazzband.co/static/img/jazzband.svg)](https://jazzband.co/) 2 | 3 | This is a [Jazzband](https://jazzband.co/) project. By contributing you agree to 4 | abide by the [Contributor Code of Conduct](https://jazzband.co/about/conduct) 5 | and follow the [guidelines](https://jazzband.co/about/guidelines). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Dean Malmgren 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements/* 2 | include MANIFEST.in 3 | include README.rst 4 | include LICENSE 5 | recursive-exclude * *.py[co] 6 | recursive-exclude * *~ 7 | recursive-exclude * *.orig 8 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. NOTES FOR CREATING A RELEASE: 2 | .. 3 | .. * bumpversion {major|minor|patch} 4 | .. * git push && git push --tags 5 | .. * twine upload -r textract dist/* 6 | .. * convert into release https://github.com/deanmalmgren/textract/releases 7 | 8 | textract 9 | ======== 10 | 11 | Extract text from any document. No muss. No fuss. 12 | 13 | `Full documentation <https://textract.readthedocs.io/>`__. 14 | 15 | Originally written by @deanmalmgren. Maintained by the good people at 16 | @jazzband |Jazz Band| 17 | 18 | |Build Status| |Version| |Downloads| |Test Coverage| |Documentation Status| 19 | |Updates| |Stars| |Forks| 20 | 21 | .. |Jazz Band| image:: https://jazzband.co/static/img/badge.svg 22 | :target: https://jazzband.co/ 23 | :alt: Jazzband 24 | 25 | .. |Build Status| image:: https://travis-ci.org/deanmalmgren/textract.svg?branch=master 26 | :target: https://travis-ci.org/deanmalmgren/textract 27 | 28 | .. |Version| image:: https://img.shields.io/pypi/v/textract.svg 29 | :target: https://warehouse.python.org/project/textract/ 30 | 31 | .. |Downloads| image:: https://img.shields.io/pypi/dm/textract.svg 32 | :target: https://warehouse.python.org/project/textract/ 33 | 34 | .. 
|Test Coverage| image:: https://coveralls.io/repos/github/deanmalmgren/textract/badge.svg?branch=master 35 | :target: https://coveralls.io/github/deanmalmgren/textract?branch=master 36 | 37 | .. |Documentation Status| image:: https://readthedocs.org/projects/textract/badge/?version=latest 38 | :target: https://readthedocs.org/projects/textract/?badge=latest 39 | 40 | .. |Updates| image:: https://pyup.io/repos/github/deanmalmgren/textract/shield.svg 41 | :target: https://pyup.io/repos/github/deanmalmgren/textract/ 42 | 43 | .. |Stars| image:: https://img.shields.io/github/stars/deanmalmgren/textract.svg 44 | :target: https://github.com/deanmalmgren/textract/stargazers 45 | 46 | .. |Forks| image:: https://img.shields.io/github/forks/deanmalmgren/textract.svg 47 | :target: https://github.com/deanmalmgren/textract/network 48 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # If there are any problems with the required gems, vagrant 5 | # has its own ruby environment. To install the gems (iniparse, 6 | # for example), you need to run: 7 | # 8 | # $ vagrant plugin install iniparse 9 | # 10 | # For more details, check out: 11 | # https://docs.vagrantup.com/v2/cli/plugin.html 12 | 13 | require 'iniparse' 14 | 15 | Vagrant.configure("2") do |config| 16 | 17 | # preliminaries 18 | root_dir = File.dirname(__FILE__) 19 | 20 | #################################################### VIRTUALBOX PROVIDER SETUP 21 | # global configuration on the virtualbox provider. 
for all available 22 | # options, see http://www.virtualbox.org/manual/ch08.html 23 | virtualbox_server_name = "dev" 24 | config.vm.provider :virtualbox do |vb, override_config| 25 | vb.gui = false 26 | # http://stackoverflow.com/a/17126363/892506 27 | vb.customize ["modifyvm", :id, "--ioapic", "on"] 28 | vb.customize ["modifyvm", :id, "--cpus", "2"] 29 | vb.customize ["modifyvm", :id, "--memory", "2048"] 30 | override_config.vm.box = "trusty64" 31 | override_config.vm.box_url = "https://cloud-images.ubuntu.com/vagrant/trusty/current/trusty-server-cloudimg-amd64-vagrant-disk1.box" 32 | end 33 | 34 | # steps for provisioning so that these provisioning steps are 35 | # properly executed in this virtual machine and also on travis-ci 36 | def provision_script(config, script_path) 37 | config.vm.provision "shell" do |s| 38 | s.path = script_path 39 | s.args = "/vagrant" 40 | end 41 | end 42 | 43 | 44 | ################################################################# LOCAL SERVER 45 | config.vm.define virtualbox_server_name do |server_config| 46 | server_config.vm.hostname = virtualbox_server_name 47 | 48 | # NOTE: this is a tentative hack. 
the way to properly do this 49 | # would be to use the official ci-environments 50 | # http://docs.travis-ci.com/user/ci-environment/, which are built 51 | # using chef recipes from here 52 | # https://github.com/travis-ci/travis-cookbooks/ 53 | provision_script(server_config, "provision/travis-mock.sh") 54 | 55 | # these are the same provisioning steps that are done on travis-ci 56 | # as on the virtual machine 57 | provision_script(server_config, "provision/debian.sh") 58 | provision_script(server_config, "provision/python.sh") 59 | 60 | # these provisioning steps are only done locally as a convenience 61 | # for setting up a useful development environment 62 | provision_script(server_config, "provision/development.sh") 63 | end 64 | 65 | end 66 | -------------------------------------------------------------------------------- /bin/textract: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python -*- 3 | # PYTHON_ARGCOMPLETE_OK 4 | 5 | """ 6 | Command-line application. 7 | """ 8 | 9 | import sys 10 | 11 | from textract.cli import get_parser 12 | from textract import process 13 | from textract.exceptions import CommandLineError 14 | from textract.colors import red 15 | 16 | 17 | # extract text 18 | def main(): 19 | """Interpret the command-line arguments, process the document and 20 | raise errors accordingly (with traceback suppressed). 21 | """ 22 | parser = get_parser() 23 | args = parser.parse_args() 24 | try: 25 | output = process(**vars(args)) 26 | except CommandLineError as ex: 27 | sys.stderr.write(red(ex) + '\n') 28 | sys.exit(1) 29 | else: 30 | args.output.write(output) 31 | 32 | 33 | main() 34 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | APIDOC = sphinx-apidoc 10 | TEXTRACT = 11 | APIDOC_IGNORE = ../textract/colors.py 12 | 13 | # User-friendly check for sphinx-build 14 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 15 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 16 | endif 17 | 18 | # Internal variables. 19 | PAPEROPT_a4 = -D latex_paper_size=a4 20 | PAPEROPT_letter = -D latex_paper_size=letter 21 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 22 | # the i18n builder cannot share the environment and doctrees with the others 23 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 24 | 25 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 26 | 27 | help: 28 | @echo "Please use \`make ' where is one of" 29 | @echo " html to make standalone HTML files" 30 | @echo " dirhtml to make HTML files named index.html in directories" 31 | @echo " singlehtml to make a single large HTML file" 32 | @echo " pickle to make pickle files" 33 | @echo " json to make JSON files" 34 | @echo " htmlhelp to make HTML files and a HTML help project" 35 | @echo " qthelp to make HTML files and a qthelp project" 36 | @echo " devhelp to make HTML files and a Devhelp project" 37 | @echo " epub to make an epub" 38 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 39 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 40 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 41 | @echo " text to make text files" 42 | @echo " man to make manual 
pages" 43 | @echo " texinfo to make Texinfo files" 44 | @echo " info to make Texinfo files and run them through makeinfo" 45 | @echo " gettext to make PO message catalogs" 46 | @echo " changes to make an overview of all changed/added/deprecated items" 47 | @echo " xml to make Docutils-native XML files" 48 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 49 | @echo " linkcheck to check all external links for integrity" 50 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 51 | 52 | clean: 53 | rm -rf $(BUILDDIR)/* 54 | 55 | apidoc: 56 | # $(APIDOC) --force --no-toc ../textract --output-dir=./textract $(APIDOC_IGNORE) 57 | # @cat python_package > python_package.rst 58 | # @grep -A8 " module" textract/textract.parsers.rst >> python_package.rst 59 | # @echo "" >> python_package.rst 60 | # @grep -A8 " module" textract/textract.rst >> python_package.rst 61 | # @rm -rf ./textract 62 | 63 | html: apidoc 64 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 65 | @echo 66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 67 | 68 | dirhtml: apidoc 69 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 70 | @echo 71 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 72 | 73 | singlehtml: apidoc 74 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 75 | @echo 76 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 77 | 78 | pickle: apidoc 79 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 80 | @echo 81 | @echo "Build finished; now you can process the pickle files." 82 | 83 | json: apidoc 84 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 85 | @echo 86 | @echo "Build finished; now you can process the JSON files." 
87 | 88 | htmlhelp: apidoc 89 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 90 | @echo 91 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 92 | ".hhp project file in $(BUILDDIR)/htmlhelp." 93 | 94 | qthelp: apidoc 95 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 96 | @echo 97 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 98 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 99 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/textract.qhcp" 100 | @echo "To view the help file:" 101 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/textract.qhc" 102 | 103 | devhelp: apidoc 104 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 105 | @echo 106 | @echo "Build finished." 107 | @echo "To view the help file:" 108 | @echo "# mkdir -p $$HOME/.local/share/devhelp/textract" 109 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/textract" 110 | @echo "# devhelp" 111 | 112 | epub: apidoc 113 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 114 | @echo 115 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 116 | 117 | latex: apidoc 118 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 119 | @echo 120 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 121 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 122 | "(use \`make latexpdf' here to do that automatically)." 123 | 124 | latexpdf: apidoc 125 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 126 | @echo "Running LaTeX files through pdflatex..." 127 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 128 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 129 | 130 | latexpdfja: apidoc 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo "Running LaTeX files through platex and dvipdfmx..." 
133 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 134 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 135 | 136 | text: apidoc 137 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 138 | @echo 139 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 140 | 141 | man: apidoc 142 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 143 | @echo 144 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 145 | 146 | texinfo: apidoc 147 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 148 | @echo 149 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 150 | @echo "Run \`make' in that directory to run these through makeinfo" \ 151 | "(use \`make info' here to do that automatically)." 152 | 153 | info: apidoc 154 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 155 | @echo "Running Texinfo files through makeinfo..." 156 | make -C $(BUILDDIR)/texinfo info 157 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 158 | 159 | gettext: apidoc 160 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 161 | @echo 162 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 163 | 164 | changes: apidoc 165 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 166 | @echo 167 | @echo "The overview file is in $(BUILDDIR)/changes." 168 | 169 | linkcheck: apidoc 170 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 171 | @echo 172 | @echo "Link check complete; look for any errors in the above output " \ 173 | "or in $(BUILDDIR)/linkcheck/output.txt." 174 | 175 | doctest: apidoc 176 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 177 | @echo "Testing of doctests in the sources finished, look at the " \ 178 | "results in $(BUILDDIR)/doctest/output.txt." 179 | 180 | xml: apidoc 181 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 182 | @echo 183 | @echo "Build finished. 
The XML files are in $(BUILDDIR)/xml." 184 | 185 | pseudoxml: apidoc 186 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 187 | @echo 188 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 189 | -------------------------------------------------------------------------------- /docs/command_line_interface.rst: -------------------------------------------------------------------------------- 1 | .. _command-line-interface: 2 | 3 | Command line interface 4 | ====================== 5 | 6 | textract 7 | -------- 8 | 9 | .. argparse:: 10 | :module: textract.cli 11 | :func: get_parser 12 | :prog: textract 13 | 14 | .. note:: 15 | 16 | To make the command line interface as usable as possible, 17 | autocompletion of available options with textract is enabled by 18 | @kislyuk's amazing `argcomplete 19 | `_ package. Follow 20 | instructions to `enable global autocomplete 21 | `_ 22 | and you should be all set. As an example, this is also configured 23 | in the `virtual machine provisioning for this project 24 | `_. 25 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # textract documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Jul 4 11:09:09 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. 
If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | project_root = os.path.abspath(os.path.join(os.path.abspath('.'), '..')) 22 | sys.path.insert(0, project_root) 23 | import textract 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | #needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.todo', 36 | 'sphinx.ext.viewcode', 37 | 'sphinxarg.ext', 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ['.templates'] 42 | 43 | # The suffix of source filenames. 44 | source_suffix = '.rst' 45 | 46 | # The encoding of source files. 47 | #source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = 'index' 51 | 52 | # General information about the project. 53 | project = u'textract' 54 | copyright = u'2014, Dean Malmgren' 55 | 56 | # The version info for the project you're documenting, acts as replacement for 57 | # |version| and |release|, also used in various other places throughout the 58 | # built documents. 59 | # 60 | # The short X.Y version. 61 | release = version = "1.6.5" 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | #language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | #today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | #today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 
75 | exclude_patterns = [] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all 78 | # documents. 79 | #default_role = None 80 | 81 | # If true, '()' will be appended to :func: etc. cross-reference text. 82 | #add_function_parentheses = True 83 | 84 | # If true, the current module name will be prepended to all description 85 | # unit titles (such as .. function::). 86 | #add_module_names = True 87 | 88 | # If true, sectionauthor and moduleauthor directives will be shown in the 89 | # output. They are ignored by default. 90 | #show_authors = False 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # A list of ignored prefixes for module index sorting. 96 | #modindex_common_prefix = [] 97 | 98 | # If true, keep warnings as "system message" paragraphs in the built documents. 99 | #keep_warnings = False 100 | 101 | 102 | # -- Options for HTML output ---------------------------------------------- 103 | 104 | # The theme to use for HTML and HTML Help pages. See the documentation for 105 | # a list of builtin themes. 106 | html_theme = 'default' 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 111 | #html_theme_options = {} 112 | 113 | # Add any paths that contain custom themes here, relative to this directory. 114 | #html_theme_path = [] 115 | 116 | # The name for this set of Sphinx documents. If None, it defaults to 117 | # " v documentation". 118 | #html_title = None 119 | 120 | # A shorter title for the navigation bar. Default is the same as html_title. 121 | #html_short_title = None 122 | 123 | # The name of an image file (relative to this directory) to place at the top 124 | # of the sidebar. 125 | #html_logo = None 126 | 127 | # The name of an image file (within the static path) to use as favicon of the 128 | # docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 129 | # pixels large. 130 | #html_favicon = None 131 | 132 | # Add any paths that contain custom static files (such as style sheets) here, 133 | # relative to this directory. They are copied after the builtin static files, 134 | # so a file named "default.css" will overwrite the builtin "default.css". 135 | html_static_path = [] 136 | 137 | # Add any extra paths that contain custom files (such as robots.txt or 138 | # .htaccess) here, relative to this directory. These files are copied 139 | # directly to the root of the documentation. 140 | #html_extra_path = [] 141 | 142 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 143 | # using the given strftime format. 144 | #html_last_updated_fmt = '%b %d, %Y' 145 | 146 | # If true, SmartyPants will be used to convert quotes and dashes to 147 | # typographically correct entities. 148 | #html_use_smartypants = True 149 | 150 | # Custom sidebar templates, maps document names to template names. 151 | #html_sidebars = {} 152 | 153 | # Additional templates that should be rendered to pages, maps page names to 154 | # template names. 155 | #html_additional_pages = {} 156 | 157 | # If false, no module index is generated. 158 | #html_domain_indices = True 159 | 160 | # If false, no index is generated. 161 | #html_use_index = True 162 | 163 | # If true, the index is split into individual pages for each letter. 164 | #html_split_index = False 165 | 166 | # If true, links to the reST sources are added to the pages. 167 | #html_show_sourcelink = True 168 | 169 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 170 | #html_show_sphinx = True 171 | 172 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 173 | #html_show_copyright = True 174 | 175 | # If true, an OpenSearch description file will be output, and all pages will 176 | # contain a tag referring to it. 
The value of this option must be the 177 | # base URL from which the finished HTML is served. 178 | #html_use_opensearch = '' 179 | 180 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 181 | #html_file_suffix = None 182 | 183 | # Output file base name for HTML help builder. 184 | htmlhelp_basename = 'textractdoc' 185 | 186 | 187 | # -- Options for LaTeX output --------------------------------------------- 188 | 189 | latex_elements = { 190 | # The paper size ('letterpaper' or 'a4paper'). 191 | #'papersize': 'letterpaper', 192 | 193 | # The font size ('10pt', '11pt' or '12pt'). 194 | #'pointsize': '10pt', 195 | 196 | # Additional stuff for the LaTeX preamble. 197 | #'preamble': '', 198 | } 199 | 200 | # Grouping the document tree into LaTeX files. List of tuples 201 | # (source start file, target name, title, 202 | # author, documentclass [howto, manual, or own class]). 203 | latex_documents = [ 204 | ('index', 'textract.tex', u'textract Documentation', 205 | u'Dean Malmgren', 'manual'), 206 | ] 207 | 208 | # The name of an image file (relative to this directory) to place at the top of 209 | # the title page. 210 | #latex_logo = None 211 | 212 | # For "manual" documents, if this is true, then toplevel headings are parts, 213 | # not chapters. 214 | #latex_use_parts = False 215 | 216 | # If true, show page references after internal links. 217 | #latex_show_pagerefs = False 218 | 219 | # If true, show URL addresses after external links. 220 | #latex_show_urls = False 221 | 222 | # Documents to append as an appendix to all manuals. 223 | #latex_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | #latex_domain_indices = True 227 | 228 | 229 | # -- Options for manual page output --------------------------------------- 230 | 231 | # One entry per manual page. List of tuples 232 | # (source start file, name, description, authors, manual section). 
233 | man_pages = [ 234 | ('index', 'textract', u'textract Documentation', 235 | [u'Dean Malmgren'], 1) 236 | ] 237 | 238 | # If true, show URL addresses after external links. 239 | #man_show_urls = False 240 | 241 | 242 | # -- Options for Texinfo output ------------------------------------------- 243 | 244 | # Grouping the document tree into Texinfo files. List of tuples 245 | # (source start file, target name, title, author, 246 | # dir menu entry, description, category) 247 | texinfo_documents = [ 248 | ('index', 'textract', u'textract Documentation', 249 | u'Dean Malmgren', 'textract', 'One line description of project.', 250 | 'Miscellaneous'), 251 | ] 252 | 253 | # Documents to append as an appendix to all manuals. 254 | #texinfo_appendices = [] 255 | 256 | # If false, no module index is generated. 257 | #texinfo_domain_indices = True 258 | 259 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 260 | #texinfo_show_urls = 'footnote' 261 | 262 | # If true, do not generate a @detailmenu in the "Top" node's menu. 263 | #texinfo_no_detailmenu = False 264 | 265 | # on_rtd is whether we are on readthedocs.org 266 | # http://read-the-docs.readthedocs.org/en/latest/theme.html#how-do-i-use-this-locally-and-on-read-the-docs 267 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 268 | if not on_rtd: # only import and set the theme if we're building docs locally 269 | import sphinx_rtd_theme 270 | html_theme = 'sphinx_rtd_theme' 271 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 272 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | 3 | Contributing 4 | ============ 5 | 6 | The overarching goal of this project is to make it as easy as possible 7 | to extract raw text from any document for the purposes of most natural 8 | language processing tasks. 
In practice, this means that this project 9 | should preferentially provide tools that correctly produce output that 10 | has words in the correct order but that whitespace between words, 11 | formatting, etc. is totally irrelevant. As the various parsers mature, 12 | I fully expect the output to become more readable to support 13 | additional use cases, like `extracting text to appear in web pages 14 | `_. 15 | 16 | Importantly, this project is committed to being as agnostic about how 17 | the content is extracted as it is about the means in which the text is 18 | analyzed downstream. This means that ``textract`` should support 19 | multiple modes of extracting text from any document and provide 20 | reasonably good defaults (defaulting to tools that tend to produce the 21 | correct word sequence). 22 | 23 | Another important aspect of this project is that we want to have 24 | extremely good documentation. If you notice a typo, error, confusing 25 | statement, etc., please fix it! 26 | 27 | 28 | .. _contributing-quick-start: 29 | 30 | Quick start 31 | ----------- 32 | 33 | 1. `Fork `_ and clone the 34 | project: 35 | 36 | .. code-block:: bash 37 | 38 | git clone https://github.com/YOUR-USERNAME/textract.git 39 | 40 | 2. Contribute! There are several `open issues 41 | `_ that provide 42 | good places to dig in. Check out the `contribution guidelines 43 | `_ 44 | and send pull requests; your help is greatly appreciated! 45 | 46 | Depending on your development preferences, there are lots of ways to 47 | get started developing with textract: 48 | 49 | Developing in a native Ubuntu environment 50 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 51 | 52 | 3. Install all the necessary system packages: 53 | 54 | ..
code-block:: bash 55 | 56 | ./provision/travis-mock.sh 57 | ./provision/debian.sh 58 | 59 | # optionally run some of the steps in these scripts, but you 60 | # may want to be selective about what you do as they alter global 61 | # environment states 62 | ./provision/python.sh 63 | ./provision/development.sh 64 | 65 | .. _run-ubuntu-tests: 66 | 67 | 4. On the virtual machine, make sure everything is working by running 68 | the suite of functional tests: 69 | 70 | .. code-block:: bash 71 | 72 | nosetests 73 | 74 | These functional tests are designed to be run on an Ubuntu 12.04 75 | LTS server, just like the virtual machine and the server that runs 76 | the travis-ci test suite. There are some other tests that have been 77 | added along the way in the `Travis configuration 78 | `_. For 79 | your convenience, you can run all of these tests with: 80 | 81 | .. code-block:: bash 82 | 83 | ./tests/run.py 84 | 85 | Current build status: |Build Status| 86 | 87 | 88 | Developing with Vagrant virtual machine 89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 90 | 91 | 3. Install `Vagrant `_ and 92 | `Virtualbox `_ and launch 93 | the development virtual machine: 94 | 95 | .. code-block:: bash 96 | 97 | vagrant plugin install iniparse 98 | vagrant up && vagrant provision 99 | 100 | On ``vagrant ssh``\ ing to the virtual machine, note that the 101 | ``PYTHONPATH`` and ``PATH`` `environment variables have been 102 | altered in this virtual machine 103 | `_ 104 | so that any changes you make to textract in development are 105 | automatically incorporated into the command. 106 | 107 | 4. See :ref:`step 4 ` in the Ubuntu development environment. 108 | Current build status: |Build Status| 109 | 110 | 111 | 112 | Developing with Docker container 113 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 114 | 115 | 3. Go to the `Docker 116 | documentation `_ 117 | and follow the instructions under "If you'd like to try the latest 118 | version of Docker" to install Docker. 119 | 120 | 4. 
Just run ``tests/run_docker_tests.sh`` to run the full test suite. 121 | Current build status: |Build Status| 122 | 123 | 124 | .. |Build Status| image:: https://travis-ci.org/deanmalmgren/textract.png 125 | :target: https://travis-ci.org/deanmalmgren/textract 126 | 127 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. textract documentation master file, created by 2 | sphinx-quickstart on Fri Jul 4 11:09:09 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | textract 7 | ================================ 8 | 9 | As undesirable as it might be, more often than not there is extremely 10 | useful information embedded in Word documents, PowerPoint 11 | presentations, PDFs, etc---so-called "dark data"---that would be 12 | valuable for further textual analysis and visualization. While 13 | :ref:`several packages ` exist for extracting content from 14 | each of these formats on their own, this package provides a single 15 | interface for extracting content from any type of file, without any 16 | irrelevant markup. 17 | 18 | This package provides two primary facilities for doing this, the 19 | :ref:`command line interface ` 20 | 21 | .. code-block:: bash 22 | 23 | textract path/to/file.extension 24 | 25 | or the :ref:`python package ` 26 | 27 | .. code-block:: python 28 | 29 | # some python file 30 | import textract 31 | text = textract.process("path/to/file.extension") 32 | 33 | .. _supporting: 34 | 35 | Currently supporting 36 | -------------------- 37 | 38 | textract supports a growing list of file types for text extraction. If 39 | you don't see your favorite file type here, Please recommend other 40 | file types by either mentioning them on the `issue tracker 41 | `_ or by 42 | :ref:`contributing a pull request `. 
43 | 44 | 45 | * ``.csv`` via python builtins 46 | 47 | * ``.tsv`` and ``.tab`` via python builtins 48 | 49 | * ``.doc`` via `antiword`_ 50 | 51 | * ``.docx`` via `python-docx2txt`_ 52 | 53 | * ``.eml`` via python builtins 54 | 55 | * ``.epub`` via `ebooklib`_ 56 | 57 | * ``.gif`` via `tesseract-ocr`_ 58 | 59 | * ``.jpg`` and ``.jpeg`` via `tesseract-ocr`_ 60 | 61 | * ``.json`` via python builtins 62 | 63 | * ``.html`` and ``.htm`` via `beautifulsoup4`_ 64 | 65 | * ``.mp3`` via `sox`_, `SpeechRecognition`_, and `pocketsphinx`_ 66 | 67 | * ``.msg`` via `msg-extractor`_ 68 | 69 | * ``.odt`` via python builtins 70 | 71 | * ``.ogg`` via `sox`_, `SpeechRecognition`_, and `pocketsphinx`_ 72 | 73 | * ``.pdf`` via `pdftotext`_ (default) or `pdfminer.six`_ 74 | 75 | * ``.png`` via `tesseract-ocr`_ 76 | 77 | * ``.pptx`` via `python-pptx`_ 78 | 79 | * ``.ps`` via `ps2ascii`_ 80 | 81 | * ``.rtf`` via `unrtf`_ 82 | 83 | * ``.tiff`` and ``.tif`` via `tesseract-ocr`_ 84 | 85 | * ``.txt`` via python builtins 86 | 87 | * ``.wav`` via `SpeechRecognition`_ and `pocketsphinx`_ 88 | 89 | * ``.xlsx`` via `xlrd `_ 90 | 91 | * ``.xls`` via `xlrd `_ 92 | 93 | .. this is a list of all the packages that textract uses for extraction 94 | .. _antiword: http://www.winfield.demon.nl/ 95 | .. _beautifulsoup4: http://beautiful-soup-4.readthedocs.org/en/latest/ 96 | .. _ebooklib: https://github.com/aerkalov/ebooklib 97 | .. _msg-extractor: https://github.com/mattgwwalker/msg-extractor 98 | .. _pdfminer.six: https://github.com/goulu/pdfminer 99 | .. _pdftotext: http://poppler.freedesktop.org/ 100 | .. _pocketsphinx: https://github.com/cmusphinx/pocketsphinx/ 101 | .. _ps2ascii: https://www.ghostscript.com/doc/current/Use.htm 102 | .. _python-docx2txt: https://github.com/ankushshah89/python-docx2txt 103 | .. _python-pptx: https://python-pptx.readthedocs.org/en/latest/ 104 | .. _SpeechRecognition: https://pypi.python.org/pypi/SpeechRecognition/ 105 | .. _sox: http://sox.sourceforge.net/ 106 | .. 
_tesseract-ocr: https://code.google.com/p/tesseract-ocr/ 107 | .. _unrtf: http://www.gnu.org/software/unrtf/ 108 | 109 | .. _related-projects: 110 | 111 | Related projects 112 | ---------------- 113 | 114 | Of course, textract isn't the first project with the aim to provide a 115 | simple interface for extracting text from any document. But this is, 116 | to the best of my knowledge, the only project that is written in 117 | python (a language commonly chosen by the natural language processing 118 | community) and is :ref:`method agnostic about how content is extracted 119 | `. I'm sure that there are other similar projects out 120 | there, but here is a small sample of similar projects: 121 | 122 | * `Apache Tika `_ has `very similar, if not 123 | identical, aims as textract 124 | `_ and has 125 | impressive coverage of a wide range of file formats. It is written 126 | in java. 127 | 128 | * `textract (node.js) `_ has 129 | similar aims as this textract package (including an identical name! 130 | great minds...). It is written in node.js. 131 | 132 | * `pandoc `_ is intended to be a 133 | document conversion tool (a much more difficult task!), but it does have 134 | `the ability to convert to plain text 135 | `_. It is written in 136 | Haskell. 137 | 138 | 139 | Contents: 140 | 141 | .. toctree:: 142 | :maxdepth: 2 143 | 144 | command_line_interface 145 | python_package 146 | installation 147 | contributing 148 | changelog 149 | 150 | 151 | Indices and tables 152 | ================== 153 | 154 | * :ref:`genindex` 155 | * :ref:`modindex` 156 | * :ref:`search` 157 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. 
_installation: 2 | 3 | Installation 4 | ============ 5 | 6 | One of the main goals of textract is to make it as easy as possible to 7 | start using textract (meaning that installation should be as quick and 8 | painless as possible). This package is built on top of several python 9 | packages and other source libraries. Assuming you are using ``pip`` or 10 | ``easy_install`` to install textract, the `python packages 11 | `_ 12 | are all installed by default with textract. The source libraries are a 13 | separate matter though and largely depend on your operating system. 14 | 15 | Ubuntu / Debian 16 | --------------- 17 | 18 | There are two steps required to run this package on 19 | Ubuntu/Debian. First you must install some system packages using the 20 | `apt-get `_ 21 | package manager before installing textract from pypi. 22 | 23 | .. code-block:: bash 24 | 25 | apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ 26 | flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig libpulse-dev 27 | pip install textract 28 | 29 | .. note:: 30 | 31 | It may also be necessary to install ``zlib1g-dev`` on Docker 32 | instances of Ubuntu. See `issue #19 33 | `_ for details 34 | 35 | OSX 36 | --- 37 | 38 | These steps rely on you having `homebrew `_ installed 39 | as well as the `cask `_ plugin (``brew tap caskroom/cask``). The basic idea is to first install 40 | `XQuartz `_ before 41 | installing a bunch of system packages before installing textract from 42 | pypi. 43 | 44 | .. code-block:: bash 45 | 46 | brew install --cask xquartz 47 | brew install poppler antiword unrtf tesseract swig 48 | pip install textract 49 | 50 | .. brew install libxml2 libxslt antiword poppler tesseract 51 | .. brew link libxml2 libxslt 52 | 53 | .. note:: 54 | 55 | `pstotext `_ is 56 | not currently a part of homebrew so ``.ps`` extraction must be 57 | enabled by manually installing from source. 58 | 59 | .. 
note:: 60 | 61 | Depending on how you have python configured on your system with 62 | homebrew, you may also need to install the python 63 | development header files for textract to properly install. 64 | 65 | FreeBSD 66 | ------- 67 | 68 | Setting up this package on FreeBSD pretty much follows the steps for 69 | Ubuntu / Debian while using ``pkg`` as package manager. 70 | 71 | .. code-block:: bash 72 | 73 | pkg install lang/python38 devel/py-pip textproc/libxml2 textproc/libxslt textproc/antiword textproc/unrtf \ 74 | graphics/poppler print/pstotext graphics/tesseract audio/flac multimedia/ffmpeg audio/lame audio/sox \ 75 | graphics/jpeg-turbo 76 | pip install textract 77 | 78 | Don't see your operating system installation instructions here? 79 | --------------------------------------------------------------- 80 | 81 | My apologies! Installing system packages is a bit of a drag and it's 82 | hard to anticipate all of the different environments that need to be 83 | accommodated (wouldn't it be awesome if there were a system-agnostic 84 | package manager or, better yet, if python could install these system 85 | dependencies for you?!?!). If your operating system doesn't have 86 | documentation about how to install the textract dependencies, please 87 | :ref:`contribute a pull request ` with: 88 | 89 | 1. A new section in here with the appropriate details about how to 90 | install things. In particular, please give instructions for how to 91 | install the following libraries before running ``pip install 92 | textract``: 93 | 94 | - `libxml2 2.6.21 or later `_ 95 | is required by the ``.docx`` parser which uses `lxml 96 | `_ via 97 | python-docx. 98 | 99 | - `libxslt 1.1.15 or later 100 | `_ is required by the 101 | ``.docx`` parser which uses `lxml 102 | `_ via 103 | python-docx. 104 | 105 | - python header files are required for building lxml. 106 | 107 | - `antiword `_ is required by the 108 | ``.doc`` parser.
109 | 110 | - `pdftotext `_ is *optionally* 111 | required by the ``.pdf`` parser (there is a pure python fallback 112 | that works if pdftotext isn't installed). 113 | 114 | - `pstotext `_ 115 | is required by the ``.ps`` parser. 116 | 117 | - `tesseract-ocr `_ 118 | is required by the ``.jpg``, ``.png`` and ``.gif`` parser. 119 | 120 | - `sox `_ 121 | is required by the ``.mp3`` and ``.ogg`` parser. 122 | You need to install ffmpeg, lame, libmad0 and libsox-fmt-mp3, 123 | before building sox, for these filetypes to work. 124 | 125 | 2. Add a requirements file to the `requirements directory 126 | `_ 127 | of the project with the lower-cased name of your operating system 128 | (e.g. ``requirements/windows``) so we can try to keep these things 129 | up to date in the future. 130 | -------------------------------------------------------------------------------- /docs/python_package.rst: -------------------------------------------------------------------------------- 1 | .. _python-package: 2 | 3 | Python package 4 | ============== 5 | 6 | This package is organized to make it as easy as possible to add new 7 | extensions and support the continued growth and coverage of 8 | textract. For almost all applications, you will just have to do 9 | something like this:: 10 | 11 | import textract 12 | text = textract.process('path/to/file.extension') 13 | 14 | to obtain text from a document. 
You can also pass keyword arguments to 15 | ``textract.process``, for example, to use a particular method for 16 | parsing a pdf like this:: 17 | 18 | import textract 19 | text = textract.process('path/to/a.pdf', method='pdfminer') 20 | 21 | or to specify a particular output encoding (input encodings are 22 | inferred using `chardet `_):: 23 | 24 | import textract 25 | text = textract.process('path/to/file.extension', encoding='ascii') 26 | 27 | When the file name has no extension, you specify the file's extension as an argument 28 | to ``textract.process`` like this:: 29 | 30 | import textract 31 | text = textract.process('path/to/file', extension='docx') 32 | 33 | .. _additional-options: 34 | 35 | Additional options 36 | ------------------ 37 | 38 | Some parsers also enable additional options which can be passed in as keyword 39 | arguments to the ``textract.process`` function. Here is a quick table of 40 | available options that are available to the different types of parsers: 41 | 42 | ====== ========= =========================================================== 43 | parser option description 44 | ====== ========= =========================================================== 45 | gif language Specify `the language`_ for OCR-ing text with tesseract 46 | jpg language Specify `the language`_ for OCR-ing text with tesseract 47 | pdf language For use when ``method='tesseract'``, specify `the language`_ 48 | pdf layout With ``method='pdftotext'`` (default), preserve the layout 49 | png language Specify `the language`_ for OCR-ing text with tesseract 50 | tiff language Specify `the language`_ for OCR-ing text with tesseract 51 | ====== ========= =========================================================== 52 | 53 | As an example of using these additional options, you can extract text from a 54 | Norwegian PDF using Tesseract OCR like this:: 55 | 56 | text = textract.process( 57 | 'path/to/norwegian.pdf', 58 | method='tesseract', 59 | language='nor', 60 | ) 61 | 62 | 63 | 
A look under the hood 64 | --------------------- 65 | 66 | When ``textract.process('path/to/file.extension')`` is called, 67 | ``textract.process`` looks for a module called 68 | ``textract.parsers.extension_parser`` that also contains a ``Parser``. 69 | 70 | 71 | .. autofunction:: textract.parsers.process 72 | 73 | Importantly, the ``textract.parsers.extension_parser.Parser`` class 74 | must inherit from ``textract.parsers.utils.BaseParser``. 75 | 76 | .. autoclass:: textract.parsers.utils.BaseParser 77 | :members: 78 | :undoc-members: 79 | :show-inheritance: 80 | 81 | Many of the parsers rely on command line utilities to do some of the 82 | parsing. For convenience, the ``textract.parsers.utils.ShellParser`` 83 | class includes some convenience methods for streamlining access to the 84 | command line. 85 | 86 | .. autoclass:: textract.parsers.utils.ShellParser 87 | :members: 88 | :undoc-members: 89 | :show-inheritance: 90 | 91 | 92 | A few specific examples 93 | ----------------------- 94 | 95 | There are quite a few parsers included with ``textract``. Rather than 96 | elaborating all of them, here are a few that demonstrate how parsers 97 | work. 98 | 99 | .. autoclass:: textract.parsers.epub_parser.Parser 100 | :members: 101 | :undoc-members: 102 | :show-inheritance: 103 | 104 | .. autoclass:: textract.parsers.doc_parser.Parser 105 | :members: 106 | :undoc-members: 107 | :show-inheritance: 108 | 109 | 110 | .. _the language: https://code.google.com/p/tesseract-ocr/downloads/list 111 | -------------------------------------------------------------------------------- /provision/debian.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This needs to work both for Vagrant provisioning and for Travis 4 | # builds in a Python virtualenv, each of which have different current 5 | # working directories when this script is called. 
When run in Vagrant, the 6 | # script is copied to /tmp and executed from there, passing the original 7 | # path as the first argument. So deal with that. 8 | if [ "$1" == "" ]; then 9 | # normal 10 | cd $(dirname $0)/.. 11 | else 12 | # run from /tmp by Vagrant. 13 | cd $1 14 | fi 15 | base=$(pwd) 16 | 17 | # Install all of the dependencies required in the examples. 18 | # http://docs.travis-ci.com/user/installing-dependencies/#Installing-Ubuntu-packages 19 | apt-get update -qq 20 | sed 's/\(.*\)\#.*/\1/' < $base/requirements/debian | xargs apt-get install -y --fix-missing 21 | -------------------------------------------------------------------------------- /provision/development.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this script sets up some additional configurations that are 4 | # convenient during development only 5 | 6 | # make sure the PYTHONPATH and PATH variables are properly configured 7 | # for the vagrant user. Always change into the /vagrant directory on 8 | # the virtual machine to make it easy to start developing 9 | cat << EOF > /home/vagrant/.bash_profile 10 | export PATH=/vagrant/bin:$PATH 11 | export PYTHONPATH=/vagrant:$PYTHONPATH 12 | cd /vagrant 13 | EOF 14 | 15 | # setup global tab completion on the textract command 16 | # https://github.com/kislyuk/argcomplete#activating-global-completion 17 | activate-global-python-argcomplete --dest /etc/bash_completion.d/ 18 | -------------------------------------------------------------------------------- /provision/python2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This needs to work for vagrant, Travis builds, and Docker builds. 4 | # in a python virtualenv. in the virtual machine provisioning, 5 | # we're passing the directory this should be run from. in travis-ci, 6 | # its run from the root of the repository.
7 | if [ "$#" -eq 1 ]; then 8 | cd $1 9 | fi 10 | 11 | # upgrade pip so we can use wheel downloads 12 | pip install -U pip 13 | 14 | # Install the requirements for this package as well as this module. 15 | pip install -r requirements/python-dev2 16 | -------------------------------------------------------------------------------- /provision/python3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This needs to work for vagrant, Travis builds, and Docker builds. 4 | # in a python virtualenv. in the virtual machine provisioning, 5 | # we're passing the directory this should be run from. in travis-ci, 6 | # its run from the root of the repository. 7 | if [ "$#" -eq 1 ]; then 8 | cd $1 9 | fi 10 | 11 | # upgrade pip so we can use wheel downloads 12 | pip install -U pip 13 | 14 | # Install the requirements for this package as well as this module. 15 | pip install -r requirements/python-dev3 16 | pip install -r requirements/python-doc 17 | -------------------------------------------------------------------------------- /provision/travis-mock.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # these additional packages are required to make the virtual machine 4 | # have a similar environment to travis-ci before we install anything 5 | # else. See Vagrantfile for details on how this could be done better 6 | # if its a problem. 
7 | # http://docs.travis-ci.com/user/languages/python/#Travis-CI-Uses-Isolated-virtualenvs 8 | sudo apt-get update -qq 9 | sudo apt-get install -y python-pip python-dev build-essential 10 | 11 | # install pep8 and nose for testing 12 | sudo pip install pep8 nose 13 | -------------------------------------------------------------------------------- /requirements/debian: -------------------------------------------------------------------------------- 1 | # required packages 2 | gcc 3 | libpulse-dev 4 | libasound2-dev 5 | libjpeg-dev 6 | build-essential 7 | git 8 | make 9 | 10 | # these packages are required by python-docx, which depends on lxml 11 | # and requires these things 12 | python-dev 13 | libxml2-dev 14 | libxslt1-dev 15 | 16 | # parse word documents 17 | antiword 18 | 19 | # parse rtf documents 20 | unrtf 21 | 22 | # parse image files 23 | tesseract-ocr 24 | libjpeg-dev 25 | 26 | # parse pdfs 27 | poppler-utils 28 | 29 | # parse postscript files 30 | pstotext 31 | 32 | # parse audio files, with SpeechRecognition 33 | flac 34 | 35 | # filetype conversion libs 36 | ffmpeg 37 | lame 38 | libmad0 39 | libsox-fmt-mp3 40 | 41 | # convert audio files 42 | sox 43 | 44 | # Sphinx Speech Recognition 45 | swig 46 | 47 | # ubuntu 14.04 requires this in addition to libxml2-dev and 48 | # libxslt1-dev for compiling lxml. 
49 | # https://github.com/deanmalmgren/textract/issues/19 50 | zlib1g-dev 51 | -------------------------------------------------------------------------------- /requirements/freebsd: -------------------------------------------------------------------------------- 1 | # required packages 2 | audio/pulseaudio 3 | devel/git 4 | 5 | # these packages are required by python-docx, which depends on lxml 6 | # and requires these things 7 | lang/python38 8 | devel/py-pip 9 | textproc/libxml2 10 | textproc/libxslt 11 | 12 | # parse word documents 13 | textproc/antiword 14 | 15 | # parse rtf documents 16 | textproc/unrtf 17 | 18 | # parse image files 19 | graphics/tesseract 20 | graphics/jpeg-turbo 21 | 22 | # parse pdfs 23 | graphics/poppler 24 | 25 | # parse postscript files 26 | print/pstotext 27 | 28 | # parse audio files, with SpeechRecognition 29 | audio/flac 30 | 31 | # filetype conversion libs 32 | multimedia/ffmpeg 33 | audio/lame 34 | 35 | # convert audio files 36 | audio/sox 37 | -------------------------------------------------------------------------------- /requirements/python: -------------------------------------------------------------------------------- 1 | # This file contains all python dependencies that are required by the textract 2 | # package in order for it to properly work.
3 | 4 | argcomplete~=1.10.0 5 | beautifulsoup4~=4.8.0 6 | chardet==3.* 7 | docx2txt~=0.8 8 | extract-msg<=0.29.* #Last with python2 support 9 | pdfminer.six==20191110 #Last with python2 support 10 | python-pptx~=0.6.18 11 | six~=1.12.0 12 | SpeechRecognition~=3.8.1 13 | xlrd~=1.2.0 14 | -------------------------------------------------------------------------------- /requirements/python-dev2: -------------------------------------------------------------------------------- 1 | # This includes all packages that are used in development, including all 2 | # packages that are required by textract itself (python), packages for 3 | # documentation builds (python-doc) 4 | 5 | -r python 6 | 7 | # needed for tests/run.py script to read .travis.yml file 8 | coveralls==1.8.2 9 | nose==1.3.7 10 | pycodestyle==2.5.0 11 | PyYAML==5.1.1 12 | requests==2.22.0 13 | pytest==4.6 14 | 15 | # needed for managing versions 16 | bumpversion==0.5.3 17 | -------------------------------------------------------------------------------- /requirements/python-dev3: -------------------------------------------------------------------------------- 1 | # This includes all packages that are used in development, including all 2 | # packages that are required by textract itself (python), packages for 3 | # documentation builds (python-doc) 4 | 5 | -r python 6 | 7 | # needed for tests/run.py script to read .travis.yml file 8 | coveralls==1.8.2 9 | nose==1.3.7 10 | pycodestyle==2.5.0 11 | PyYAML==5.1.1 12 | pytest==5.0.1 13 | requests==2.22.0 14 | 15 | # needed for managing versions 16 | bumpversion==0.5.3 17 | -------------------------------------------------------------------------------- /requirements/python-doc: -------------------------------------------------------------------------------- 1 | # this only includes packages that are needed for documentation build. 
def parse_requirements(requirements_filename):
    """Read the dependencies listed in a pip-style requirements file.

    Each line of the file is handled as follows:

    * ``-r <file>`` recursively includes another requirements file, resolved
      relative to the directory of ``requirements_filename``.
    * a line starting with ``http`` is collected verbatim as a dependency
      link.
    * anything else is treated as a package specifier; text after a ``#`` is
      discarded as a comment, and blank or comment-only lines are skipped.

    Returns a ``(dependencies, dependency_links)`` tuple of lists, in the
    order the entries were encountered.
    """
    dependencies, dependency_links = [], []
    requirements_dir = os.path.dirname(requirements_filename)
    with open(requirements_filename, 'r') as stream:
        for line in stream:
            line = line.strip()
            if line.startswith("-r"):
                # drop a trailing comment so it does not become part of the
                # included filename
                included = line[2:].split('#')[0].strip()
                filename = os.path.join(requirements_dir, included)
                _dependencies, _dependency_links = parse_requirements(filename)
                dependencies.extend(_dependencies)
                dependency_links.extend(_dependency_links)
            elif line.startswith("http"):
                # kept verbatim: a '#' in a URL may be a fragment such as
                # '#egg=name' rather than a comment
                dependency_links.append(line)
            else:
                # strip again after removing the comment, otherwise
                # "pkg==1.0 # note" would be recorded as "pkg==1.0 "
                package = line.split('#')[0].strip()
                if package:
                    dependencies.append(package)
    return dependencies, dependency_links
/src 8 | WORKDIR /src 9 | RUN /bin/bash /src/provision/debian.sh 10 | RUN /bin/bash /src/provision/python.sh 11 | RUN adduser --disabled-password --gecos "" --home=/home/textract textract 12 | VOLUME ["/home/textract/src"] 13 | ENV PATH $PATH:/home/textract/src/bin 14 | ENV PYTHONPATH /home/textract/src 15 | USER textract 16 | ENTRYPOINT ["/home/textract/src/tests/run.py"] 17 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | # This file is used to create standardized text for any textract method that 2 | # uses any command line utility or third party service (as opposed to a pure 3 | # python package) for text extraction. These types of extraction methods have 4 | # made it particularly difficult to maintain reasonably stable testing 5 | # environments, so this provides a useful workaround to validate that the 6 | # extraction methods are working correctly across all development environments 7 | # 8 | # https://github.com/deanmalmgren/textract/issues/78 9 | 10 | TARGETS = pdf/ocr_text.txt \ 11 | png/raw_text.txt png/standardized_text.txt \ 12 | gif/raw_text.txt gif/standardized_text.txt \ 13 | jpg/raw_text.txt jpg/standardized_text.txt \ 14 | tiff/raw_text.txt tiff/standardized_text.txt \ 15 | ps/raw_text.txt 16 | 17 | all: $(TARGETS) 18 | 19 | clean: 20 | rm -f $(TARGETS) 21 | 22 | # create OCR output for the multi-page pdf test 23 | pdf/ocr_text.txt: pdf/ocr_text.pdf 24 | pdftoppm $< /tmp/pdf-ocr-text 25 | for x in /tmp/pdf-ocr-text*; do \ 26 | tesseract $$x $$(basename $$x .ppm) > /dev/null; \ 27 | done 28 | cat pdf-ocr-text*.txt > $@ 29 | rm -f pdf-ocr-text* 30 | 31 | ps/raw_text.txt: ps/raw_text.ps 32 | ps2ascii $< > $@ 33 | 34 | # simple pattern rule for creating standard issue tesseract files for different 35 | # fileypes. the `g` shell variable is the path to the file without the 36 | # extension (e.g. 
class GenericUtilities(object):
    """Small helpers shared by the parser test cases."""

    @staticmethod
    def _shell_quote(value):
        """Wrap ``value`` in single quotes for use in a POSIX shell command.

        Embedded single quotes are escaped, so filenames containing spaces
        or quotes survive ``subprocess.call(..., shell=True)`` intact.
        """
        return "'" + value.replace("'", "'\"'\"'") + "'"

    def get_temp_filename(self, extension=None):
        """Create an empty temporary file and return its path.

        When ``extension`` is given, the returned filename ends with
        ``'.' + extension``. The file is left on disk; the caller is
        responsible for removing it.
        """
        suffix = '' if extension is None else '.' + extension
        # create the file with its final name directly rather than creating
        # it and renaming it afterwards
        stream = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        stream.close()
        return stream.name

    def clean_text(self, text):
        """Return the bytes ``text`` with blank lines removed.

        Lines are re-joined with a single ``\\n``. Dropping the empty lines
        fixes an epub issue.
        """
        lines = [line for line in text.splitlines() if line.strip()]
        return b'\n'.join(lines)


class BaseParserTestCase(GenericUtilities):
    """This BaseParserTestCase object is used to collect a bunch of
    standardized tests that should be run for every BaseParser.
    """

    # 'txt', for example. this is mandatory and potentially the only thing that
    # has to be specified to subclass this unittest
    extension = ''

    # User can specify a particular filename root (without
    # extension!), but these have good defaults that are specified by
    # the @property methods below
    raw_text_filename_root = ''
    standardized_text_filename_root = ''
    unicode_text_filename_root = ''

    def __init__(self, *args, **kwargs):
        super(BaseParserTestCase, self).__init__(*args, **kwargs)
        if self.extension == '':
            raise NotImplementedError(
                'need to specify `extension` class attribute on test case'
            )

    def get_extension_directory(self):
        """Return the fixture directory for this extension, which lives next
        to this file and is named after ``self.extension``.
        """
        return os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            self.extension,
        )

    def get_filename(self, filename_root, default_filename_root):
        """Return the path of the fixture ``<root>.<extension>``.

        ``filename_root`` is used when it is non-empty, otherwise
        ``default_filename_root``. Raises ``Exception`` when the resulting
        file does not exist.
        """
        # resolved without recursion so that two empty roots produce the
        # "does not exist" error instead of recursing forever
        root = filename_root or default_filename_root
        filename = os.path.join(
            self.get_extension_directory(),
            root + '.' + self.extension,
        )
        if not os.path.exists(filename):
            raise Exception((
                'expected filename "%(filename)s" to exist for testing '
                'purposes but it doesnt'
            ) % {'filename': filename})
        return filename

    @property
    def raw_text_filename(self):
        return self.get_filename(self.raw_text_filename_root,
                                 "raw_text")

    @property
    def standardized_text_filename(self):
        return self.get_filename(self.standardized_text_filename_root,
                                 "standardized_text")

    @property
    def unicode_text_filename(self):
        return self.get_filename(self.unicode_text_filename_root,
                                 "unicode_text")

    def test_raw_text_cli(self):
        """Make sure raw text matches from the command line"""
        self.compare_cli_output(self.raw_text_filename)

    def test_raw_text_python(self):
        """Make sure raw text matches from python"""
        self.compare_python_output(self.raw_text_filename)

    def test_standardized_text_cli(self):
        """Make sure standardized text matches from the command line"""
        temp_filename = self.assertSuccessfulTextract(
            self.standardized_text_filename,
            cleanup=False,
        )
        try:
            with open(temp_filename, 'rb') as stream:
                self.assertEqual(
                    b''.join(stream.read().split()),
                    self.get_standardized_text(),
                    "standardized text fails for %s" % self.extension,
                )
        finally:
            # remove the output even when the comparison fails
            os.remove(temp_filename)

    def test_standardized_text_python(self):
        """Make sure standardized text matches from python"""
        import textract
        result = textract.process(self.standardized_text_filename)
        self.assertEqual(
            b''.join(result.split()),
            self.get_standardized_text(),
            "standardized text fails for %s" % self.extension,
        )

    # def test_unicode_text_cli(self):
    #     """Make sure unicode text matches from the command line"""
    #     self.compare_cli_output(self.unicode_text_filename)

    # def test_unicode_text_python(self):
    #     """Make sure unicode text matches from python"""
    #     self.compare_python_output(self.unicode_text_filename)

    def get_expected_filename(self, filename, **kwargs):
        """Return the ``.txt`` file holding the expected output for
        ``filename``; a ``method`` keyword adds a ``-m=<method>`` suffix to
        the basename.
        """
        basename = os.path.splitext(filename)[0]
        method = kwargs.get('method')
        if method:
            basename += '-m=' + method
        return basename + '.txt'

    def get_cli_options(self, **kwargs):
        """Render ``kwargs`` as ``--key=value`` command line options, each
        followed by a single space.
        """
        return ''.join(
            '--%s=%s ' % (key, val) for key, val in kwargs.items()
        )

    def get_standardized_text(self):
        """Return the expected standardized text with all whitespace removed.

        The text is read from ``standardized_text.txt`` in the extension
        directory when that file exists, and otherwise falls back to the
        default pangram.
        """
        filename = os.path.join(
            self.get_extension_directory(),
            "standardized_text.txt"
        )
        if os.path.exists(filename):
            with open(filename, 'rb') as stream:
                standardized_text = stream.read()
        else:
            standardized_text = b"the quick brown fox jumps over the lazy dog"
        return b''.join(standardized_text.split())

    def assertSuccessfulCommand(self, command):
        """Run ``command`` through the shell and assert it exits with 0."""
        self.assertEqual(
            0, subprocess.call(command, shell=True),
            "COMMAND FAILED: %(command)s" % {'command': command}
        )

    def assertSuccessfulTextract(self, filename, cleanup=True, **kwargs):
        """Run the ``textract`` command on ``filename`` and assert success.

        The output is redirected to a temporary file. With ``cleanup=True``
        that file is removed and ``None`` is returned; otherwise its path is
        returned and the caller must remove it. ``kwargs`` are passed on as
        command line options.
        """
        # construct the option string
        option = self.get_cli_options(**kwargs)

        # run the command and make sure everything worked correctly
        temp_filename = self.get_temp_filename()
        command = "textract %s %s > %s" % (
            option,
            self._shell_quote(filename),
            self._shell_quote(temp_filename),
        )
        try:
            self.assertSuccessfulCommand(command)
        except Exception:
            # do not leave the temporary file behind when the command fails
            os.remove(temp_filename)
            raise
        if cleanup:
            os.remove(temp_filename)
            return None
        return temp_filename

    def compare_cli_output(self, filename, expected_filename=None, **kwargs):
        """Assert that the ``textract`` command line output for ``filename``
        matches ``expected_filename``, ignoring blank lines.
        """
        if expected_filename is None:
            expected_filename = self.get_expected_filename(filename, **kwargs)

        # run the command and make sure everything worked correctly
        temp_filename = self.assertSuccessfulTextract(
            filename,
            cleanup=False,
            **kwargs
        )
        try:
            self.assertSuccessfulCommand(
                "diff --ignore-blank-lines %s %s" % (
                    self._shell_quote(temp_filename),
                    self._shell_quote(expected_filename),
                )
            )
        finally:
            # remove the output even when the diff reports a mismatch
            os.remove(temp_filename)

    def compare_python_output(self, filename, expected_filename=None, **kwargs):
        """Assert that ``textract.process(filename, **kwargs)`` matches the
        contents of ``expected_filename`` once blank lines are removed.
        """
        if expected_filename is None:
            expected_filename = self.get_expected_filename(filename, **kwargs)

        import textract
        result = self.clean_text(textract.process(filename, **kwargs))
        with open(expected_filename, 'rb') as stream:
            expected = self.clean_text(stream.read())
        self.assertEqual(result, expected)


class ShellParserTestCase(BaseParserTestCase):
    """This ShellParserTestCase object is used to collect a bunch of
    standardized tests that should be run for every ShellParser.
    """

    def test_filename_spaces(self):
        """Make sure filenames with spaces work on the command line"""
        temp_filename = self.get_temp_filename()
        spaced_filename = (
            temp_filename + " a filename with spaces." + self.extension
        )
        try:
            shutil.copyfile(self.raw_text_filename, spaced_filename)
            self.compare_cli_output(
                spaced_filename,
                self.get_expected_filename(self.raw_text_filename),
            )
        finally:
            # clean up both files even when the comparison fails
            os.remove(temp_filename)
            if os.path.exists(spaced_filename):
                os.remove(spaced_filename)
Outcome Pothole Patched 5 300 S MICHIGAN AVE 60604 1177339.54271824 1899032.66368812 2 1 32 41.87821734531058 -87.62445584029679 (41.87821734531058, -87.62445584029679) 7 | 08/27/2014 Completed 08/27/2014 14-01424212 Pothole in Street Final Outcome Pothole Patched 15 400 S MICHIGAN AVE 60605 1177347.93260788 1898568.04117246 2 1 32 41.87693801152278 -87.6244366179451 (41.87693801152278, -87.6244366179451) 8 | 08/25/2014 Completed 08/25/2014 14-01404817 Pothole in Street Final Outcome Pothole Patched 10 740 N WABASH AVE 60611 1176600.11893119 1905427.14160096 42 18 8 41.896091479927165 -87.6269829451334 (41.896091479927165, -87.6269829451334) 9 | 08/22/2014 Completed 08/22/2014 14-01395263 Pothole in Street Final Outcome Pothole Patched 1 1137 W CHICAGO AVE 60642 1168573.58997114 1905510.8499928 27 12 24 41.89613260416578 -87.65667886202922 (41.89613260416578, -87.65667886202922) 10 | 08/22/2014 Completed 08/22/2014 14-01390538 Pothole in Street Final Outcome Pothole Patched 5 615 N FRANKLIN ST 60654 1174219.86039744 1904291.9608037 42 18 8 41.89291371668045 -87.63546140541867 (41.89291371668045, -87.63546140541867) 11 | 08/21/2014 Completed 08/28/2014 14-01383161 Pothole in Street Final Outcome Pothole Patched 5 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 12 | 08/12/2014 Completed 08/25/2014 14-01323742 Pothole in Street Final Outcome Pothole Patched 2 2744 W EVERGREEN AVE 60622 1157855.84623339 1908863.62028457 26 14 24 41.905779104276164 -87.69629062504274 (41.905779104276164, -87.69629062504274) 13 | 08/11/2014 Completed 08/25/2014 14-01317376 Pothole in Street Final Outcome Pothole Patched 4 2701 W HIRSCH ST 60622 1158141.76231382 1909202.67761659 26 14 24 41.9064904995143 -87.69453948271031 (41.9064904995143, -87.69453948271031) 14 | 08/07/2014 Completed 08/27/2014 14-01294373 Pothole in Street Final Outcome Pothole Patched 1 159 N DEARBORN ST 60601 
1175915.29501332 1901390.11610965 42 1 32 41.884767887865465 -87.62932139824203 (41.884767887865465, -87.62932139824203) 15 | 08/06/2014 Completed 08/26/2014 14-01284763 Pothole in Street Final Outcome Pothole Patched 17 100 S WABASH AVE 60603 1176824.63149672 1899942.95872303 42 1 32 41.88072398164076 -87.62631635116725 (41.88072398164076, -87.62631635116725) 16 | 08/04/2014 Completed 08/25/2014 14-01264182 Pothole in Street Final Outcome Pothole Patched 3 201 E SUPERIOR ST 60611 1177708.98999045 1905405.56110885 42 18 8 41.8956500849415 -87.62265814875258 (41.8956500849415, -87.62265814875258) 17 | 08/04/2014 Completed 08/22/2014 14-01266469 Pothole in Street Final Outcome Pothole Patched 8 220 W SUPERIOR ST 60654 1174518.27808694 1905317.80798624 42 18 8 41.89569109218158 -87.63509877442043 (41.89569109218158, -87.63509877442043) 18 | 08/03/2014 Completed 08/22/2014 14-01258054 Pothole in Street Final Outcome Pothole Patched 8 600 N WABASH AVE 60611 1176632.67288564 1904207.6496666 42 18 8 41.892531148204995 -87.62689596447262 (41.892531148204995, -87.62689596447262) 19 | 07/29/2014 Completed 08/28/2014 14-01228358 Pothole in Street Final Outcome Pothole Patched 7 10 W ELM ST 60610 1176002.41378993 1908093.80450502 42 18 8 41.90328038833401 -87.62904275288018 (41.90328038833401, -87.62904275288018) 20 | 07/29/2014 Completed 08/25/2014 14-01224309 Pothole in Street Final Outcome Pothole Patched 15 201 E HURON ST 60611 1177717.86567145 1905118.00892518 42 18 8 41.89486671392807 -87.62262561802254 (41.89486671392807, -87.62262561802254) 21 | 07/28/2014 Completed 08/27/2014 14-01208070 Pothole in Street Final Outcome Pothole Patched 2 177 N WELLS ST 60606 1174702.03147832 1901552.59999636 42 1 32 41.88525719405359 -87.63376979601067 (41.88525719405359, -87.63376979601067) 22 | 07/26/2014 Completed 08/28/2014 14-01203517 Pothole in Street Final Outcome Pothole Patched 1 1 W MAPLE ST 60610 1176169.35729479 1907624.99998895 42 18 8 41.90177327913275 -87.62842192883082 
(41.90177327913275, -87.62842192883082) 23 | 07/24/2014 Completed 08/22/2014 14-01190319 Pothole in Street Final Outcome Pothole Patched 3 730 N FRANKLIN ST 60654 1174191.71119404 1905308.4805198 42 18 8 41.89562283550215 -87.63582499692767 (41.89562283550215, -87.63582499692767) 24 | 07/23/2014 Completed 08/28/2014 14-01178461 Pothole in Street Final Outcome Pothole Patched 3 1155 N DEARBORN ST 60610 1175723.69360823 1908131.92908715 42 18 8 41.90344694644998 -87.62982489848619 (41.90344694644998, -87.62982489848619) 25 | 07/21/2014 Completed 08/22/2014 14-01161819 Pothole in Street Final Outcome Pothole Patched 3 600 E GRAND AVE 60611 1180764.02691294 1904055.10759636 42 18 8 41.892094136861786 -87.61156988394656 (41.892094136861786, -87.61156988394656) 26 | 07/18/2014 Completed 08/27/2014 14-01149914 Pothole in Street Final Outcome Pothole Patched 4 150 N FRANKLIN ST 60606 1174306.13841283 1901261.81180327 42 1 32 41.88449312034977 -87.63552722968024 (41.88449312034977, -87.63552722968024) 27 | 07/17/2014 Completed 08/22/2014 14-01134572 Pothole in Street Final Outcome Pothole Patched 2 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 28 | 07/14/2014 Completed 08/28/2014 14-01112602 Pothole in Street Final Outcome Pothole Patched 15 170 W OAK ST 60610 1174950.17410849 1907124.5048948 42 18 8 41.90065230620216 -87.63419240464901 (41.90065230620216, -87.63419240464901) 29 | 07/14/2014 Completed 08/22/2014 14-01114977 Pothole in Street Final Outcome Pothole Patched 3 461 N CITYFRONT PLAZA DR 60611 1177942.61391555 1903427.32649215 42 18 8 41.89012269030228 -87.62144215679457 (41.89012269030228, -87.62144215679457) 30 | 07/14/2014 Completed 08/27/2014 14-01114104 Pothole in Street Final Outcome Pothole Patched 13 462 N LAKE SHORE DR 60611 1180059.25771678 1903489.31249721 42 18 8 41.89043391381039 -87.61432635149913 (41.89043391381039, -87.61432635149913) 31 | 07/13/2014 
Completed 08/22/2014 14-01106120 Pothole in Street Final Outcome Pothole Patched 2 37 E OHIO ST 60611 1176632.67288564 1904207.6496666 42 18 8 41.892389760220446 -87.6266700643506 (41.892389760220446, -87.6266700643506) 32 | 07/11/2014 Completed 08/27/2014 14-01092992 Pothole in Street Final Outcome Pothole Patched 0 517 N LAKE SHORE DR 60611 1180023.3999851 1903912.87000671 42 18 8 41.891747134984456 -87.61415723810663 (41.891747134984456, -87.61415723810663) 33 | 07/11/2014 Completed 08/28/2014 14-01092312 Pothole in Street Final Outcome Pothole Patched 2 1165 N LA SALLE DR 60610 1174923.84946801 1908163.01737071 42 18 8 41.90367107146938 -87.63276634924583 (41.90367107146938, -87.63276634924583) 34 | 07/09/2014 Completed 08/22/2014 14-01072126 Pothole in Street Final Outcome Pothole Patched 1 200 W OHIO ST 60654 1174634.07810009 1904152.08572945 42 18 8 41.892497422329804 -87.63416622539611 (41.892497422329804, -87.63416622539611) 35 | 07/08/2014 Completed 08/26/2014 14-01064944 Pothole in Street Final Outcome Pothole Patched 4 34 S STATE ST 60603 1176384.09122814 1900236.40989122 42 1 32 41.881194280111245 -87.62788715387323 (41.881194280111245, -87.62788715387323) 36 | 07/06/2014 Completed 08/22/2014 14-01050617 Pothole in Street Final Outcome Pothole Patched 5 430 N FRANKLIN ST 60654 1174247.46842821 1903265.17618927 42 18 8 41.890001134206734 -87.63568286112702 (41.890001134206734, -87.63568286112702) 37 | 06/30/2014 Completed 08/26/2014 14-01013381 Pothole in Street Final Outcome Pothole Patched 7 50 E JACKSON BLVD 60604 1176916.76437779 1899018.37548671 42 1 32 41.87836259235806 -87.62578989491375 (41.87836259235806, -87.62578989491375) 38 | 06/26/2014 Completed 08/27/2014 14-00992410 Pothole in Street Final Outcome Pothole Patched 6 201 N WELLS ST 60606 1174696.68801277 1901736.11738759 42 1 32 41.885758730147366 -87.63378404961261 (41.885758730147366, -87.63378404961261) 39 | 06/23/2014 Completed 08/22/2014 14-00961579 Pothole in Street Final Outcome 
Pothole Patched 1 340 W ERIE ST 60654 1173808.62199403 1904714.70549298 42 18 8 41.89405989499057 -87.63717643739275 (41.89405989499057, -87.63717643739275) 40 | 06/23/2014 Completed 08/26/2014 14-00967688 Pothole in Street Final Outcome Pothole Patched 12 140 S DEARBORN ST 60603 1175959.03010572 1899623.29000633 42 1 32 41.87967604011696 -87.62950228266759 (41.87967604011696, -87.62950228266759) 41 | 06/18/2014 Completed 08/27/2014 14-00933275 Pothole in Street Final Outcome Pothole Patched 15 211 N DEARBORN ST 60601 1175900.7237733 1901846.43477009 42 1 32 41.88604316092842 -87.62936052143365 (41.88604316092842, -87.62936052143365) 42 | 06/12/2014 Completed 08/27/2014 14-00897156 Pothole in Street Final Outcome Pothole Patched 42 200 S LA SALLE ST 60604 1175161.95537691 1899423.25992154 42 1 32 41.87936625147701 -87.63243763414764 (41.87936625147701, -87.63243763414764) 43 | 06/12/2014 Completed 08/22/2014 14-00890682 Pothole in Street Final Outcome Pothole Patched 1 306 W OHIO ST 60654 1174162.61920993 1904141.19087966 42 18 8 41.89247410656883 -87.63593436430777 (41.89247410656883, -87.63593436430777) 44 | 06/11/2014 Completed 08/28/2014 14-00887552 Pothole in Street Final Outcome Pothole Patched 4 1020 N STATE ST 60610 1176176.20887962 1907374.89001835 42 18 8 41.901200484161656 -87.62847652035359 (41.901200484161656, -87.62847652035359) 45 | 06/10/2014 Completed 08/27/2014 14-00880864 Pothole in Street Final Outcome Pothole Patched 0 245 W WASHINGTON ST 60606 1174540.03218467 1900805.68709955 42 1 32 41.883096426444126 -87.63515649580224 (41.883096426444126, -87.63515649580224) 46 | 06/04/2014 Completed 08/22/2014 14-00844247 Pothole in Street Final Outcome Pothole Patched 2 525 W SUPERIOR ST 60654 1172505.49257168 1905270.97268573 42 18 8 41.895379256528365 -87.64251783750353 (41.895379256528365, -87.64251783750353) 47 | 06/03/2014 Completed 08/26/2014 14-00829362 Pothole in Street Final Outcome Pothole Patched 2 100 S STATE ST 60603 1176400.89309128 
1899922.58000955 42 1 32 41.88071513784339 -87.62787607841581 (41.88071513784339, -87.62787607841581) 48 | 06/01/2014 Completed 08/26/2014 14-00819596 Pothole in Street Final Outcome Pothole Patched 1 21 S DEARBORN ST 60603 1175943.73762726 1900281.08996842 42 1 32 41.88163394997373 -87.62924355913282 (41.88163394997373, -87.62924355913282) 49 | 05/31/2014 Completed 08/28/2014 14-00816655 Pothole in Street Final Outcome Pothole Patched 2 1250 N DEARBORN ST 60610 1175716.58493337 1908364.4279693 42 18 8 41.905379013702905 -87.63017455348202 (41.905379013702905, -87.63017455348202) 50 | 05/29/2014 Completed 08/22/2014 14-00805945 Pothole in Street Final Outcome Pothole Patched 10 738 N LARRABEE ST 60654 1172208.49919151 1905332.36197604 42 18 8 41.89589422031982 -87.64313907101959 (41.89589422031982, -87.64313907101959) 51 | 05/29/2014 Completed 08/26/2014 14-00802449 Pothole in Street Final Outcome Pothole Patched 6 33 S STATE ST 60603 1176383.89122814 1900241.30989122 42 1 32 41.8812023900134 -87.62759359231444 (41.8812023900134, -87.62759359231444) 52 | 05/28/2014 Completed 08/22/2014 14-00797786 Pothole in Street Final Outcome Pothole Patched 2 720 N FRANKLIN ST 60654 1174193.91119404 1905217.9805198 42 18 8 41.89553241941632 -87.63582582624329 (41.89553241941632, -87.63582582624329) 53 | 05/26/2014 Completed 08/28/2014 14-00780187 Pothole in Street Final Outcome Pothole Patched 15 12 W ELM ST 60610 1175983.31378993 1908093.10450502 42 18 8 41.90327840141334 -87.62912853335304 (41.90327840141334, -87.62912853335304) 54 | 05/21/2014 Completed 08/28/2014 14-00753021 Pothole in Street Final Outcome Pothole Patched 10 30 E ELM ST 60611 1176399.91378993 1908106.10450502 42 18 8 41.903312908929266 -87.6274232482672 (41.903312908929266, -87.6274232482672) 55 | 05/19/2014 Completed 08/27/2014 14-00736478 Pothole in Street Final Outcome Pothole Patched 10 520 E ILLINOIS ST 60611 1180045.15771678 1903729.01249721 42 18 8 41.891214491225654 -87.61421696505855 
(41.891214491225654, -87.61421696505855) 56 | 04/08/2014 Completed 08/25/2014 14-00504762 Pothole in Street Final Outcome Pothole Patched 2 1300 N CALIFORNIA AVE 60622 1157492.02101534 1908524.16997455 26 14 24 41.904795555353125 -87.69707304441381 (41.904795555353125, -87.69707304441381) -------------------------------------------------------------------------------- /tests/csv/standardized_text.csv: -------------------------------------------------------------------------------- 1 | the,quick 2 | brown,fox 3 | jumps over,the 4 | lazy,dog -------------------------------------------------------------------------------- /tests/doc/raw_text.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/doc/raw_text.doc -------------------------------------------------------------------------------- /tests/doc/raw_text.txt: -------------------------------------------------------------------------------- 1 | 2 | I love word documents. They are lovely. They make me so happy I could 3 | smile. And that is why I wrote this package. 4 | 5 | Sample text is hard. That is where http://hipsum.co comes in handy. 6 | 7 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin pop- 8 | up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer PBR&B. 9 | Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh ethnic Marfa 90's 10 | kogi American Apparel. Shabby chic distillery church-key locavore beard, 11 | food truck chillwave sartorial deep v flannel authentic Tumblr narwhal kogi 12 | organic. Cred vegan jean shorts Banksy forage Neutra dreamcatcher, hashtag 13 | Bushwick polaroid pork belly flannel keytar Portland post-ironic. Cred 14 | hoodie vegan, food truck leggings Austin pour-over banjo trust fund before 15 | they sold out cray Intelligentsia plaid typewriter. Williamsburg XOXO plaid 16 | Carles Austin tofu. 
17 | 18 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn 19 | biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, 20 | viral asymmetrical artisan bicycle rights bitters master cleanse 21 | Kickstarter YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS 22 | hashtag meh. Thundercats semiotics shabby chic forage single-origin coffee 23 | retro, 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard 24 | gluten-free seitan, VHS sartorial pork belly gastropub meh whatever 25 | authentic synth. Beard single-origin coffee irony fixie, before they sold 26 | out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan 27 | hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. 28 | 29 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice Banksy 30 | roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo booth, Wes 31 | Anderson skateboard Odd Future. Etsy art party Bushwick keffiyeh. Pork 32 | belly 3 wolf moon butcher mustache. YOLO raw denim lo-fi, hoodie gentrify 33 | Schlitz 8-bit sriracha Shoreditch retro brunch. Williamsburg farm-to-table 34 | beard, mlkshk Banksy fap kogi Etsy art party squid semiotics. XOXO church- 35 | key Pitchfork mlkshk irony tote bag. 36 | 37 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic 38 | trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt 39 | chambray, leggings shabby chic gastropub YOLO plaid hoodie Williamsburg 40 | Godard mixtape. Retro Godard keytar biodiesel, freegan paleo Etsy you 41 | probably haven't heard of them Pitchfork Schlitz readymade small batch 42 | cred. Pug trust fund paleo, 90's fixie typewriter next level banjo. Banksy 43 | occupy authentic master cleanse Bushwick fingerstache selfies, direct trade 44 | craft beer cliche +1 cray. Locavore four loko biodiesel Neutra chia mlkshk. 
45 | Fanny pack YOLO Portland, mlkshk PBR&B single-origin coffee drinking 46 | vinegar 8-bit flannel gentrify stumptown pop-up. 47 | Oh. You need a little dummy text for your mockup? How quaint. 48 | 49 | I bet you are still using Bootstrap too 50 | 51 | 52 | -------------------------------------------------------------------------------- /tests/doc/standardized_text.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/doc/standardized_text.doc -------------------------------------------------------------------------------- /tests/doc/standardized_text_1.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/doc/standardized_text_1.odt -------------------------------------------------------------------------------- /tests/docker_entry.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script gets called from within the 4 | # Docker container. 5 | 6 | ./tests/run.py 7 | -------------------------------------------------------------------------------- /tests/docx/paragraphs_and_tables.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/docx/paragraphs_and_tables.docx -------------------------------------------------------------------------------- /tests/docx/paragraphs_and_tables.txt: -------------------------------------------------------------------------------- 1 | This is the paragraph before the table. It should appear first. 
2 | 3 | 4 | 5 | Row 1, Column 1 6 | 7 | Row 1, Column 2 8 | 9 | Row 1, Column 3 10 | 11 | Row 2, Column 1 12 | 13 | Row 2, Column 2 14 | 15 | Row 2, Column 3 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | INNER TABLE! 26 | 27 | 28 | 29 | Inner 1,1 30 | 31 | Inner 1,2 32 | 33 | Inner 2,1 34 | 35 | Inner 1,2 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | This is the paragraph after the table. It should appear last. -------------------------------------------------------------------------------- /tests/docx/raw_text.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/docx/raw_text.docx -------------------------------------------------------------------------------- /tests/docx/raw_text.txt: -------------------------------------------------------------------------------- 1 | I love word documents. They are lovely. They make me so happy I could smile. And that’s why I wrote this package. 2 | 3 | 4 | 5 | Sample text is hard. That’s where http://hipsum.co comes in handy. 6 | 7 | 8 | 9 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh ethnic Marfa 90's kogi American Apparel. Shabby chic distillery church-key locavore beard, food truck chillwave sartorial deep v flannel authentic Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel keytar Portland post-ironic. Cred hoodie vegan, food truck leggings Austin pour-over banjo trust fund before they sold out cray Intelligentsia plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu. 10 | 11 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn biodiesel tousled bespoke kitsch. 
Vinyl Tonx art party Thundercats retro, viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag meh. Thundercats semiotics shabby chic forage single-origin coffee retro, 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard gluten-free seitan, VHS sartorial pork belly gastropub meh whatever authentic synth. Beard single-origin coffee irony fixie, before they sold out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. 12 | 13 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lo-fi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag. 14 | 15 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt chambray, leggings shabby chic gastropub YOLO plaid hoodie Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan paleo Etsy you probably haven't heard of them Pitchfork Schlitz readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter next level banjo. Banksy occupy authentic master cleanse Bushwick fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify stumptown pop-up. 16 | 17 | Oh. You need a little dummy text for your mockup? How quaint. 
18 | 19 | I bet you’re still using Bootstrap too… -------------------------------------------------------------------------------- /tests/docx/standardized_text.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/docx/standardized_text.docx -------------------------------------------------------------------------------- /tests/eml/raw_text.eml: -------------------------------------------------------------------------------- 1 | Return-path: 2 | Envelope-to: to@example.com 3 | Delivery-date: Sat, 27 Mar 2010 12:11:32 +0000 4 | Received: from mail by mail.example.com with local-bsmtp (Exim 4.69) 5 | (envelope-from ) 6 | id 1NvUrP-0002O1-C7 7 | for to@example.com; Sat, 27 Mar 2010 12:11:32 +0000 8 | X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on 9 | mail.example.com 10 | X-Spam-Level: 11 | X-Spam-Status: No, score=-11.3 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, 12 | HTML_MESSAGE,MPART_ALT_DIFF autolearn=no version=3.2.5 13 | Received: from cpc1-mort1-0-0-cust399.croy.cable.virginmedia.com ([82.44.61.144] helo=spike) 14 | by mail.example.com with esmtpsa (TLS1.0:RSA_AES_128_CBC_SHA1:16) 15 | (Exim 4.69) 16 | (envelope-from ) 17 | id 1NvUrP-0002Nw-6s 18 | for to@example.com; Sat, 27 Mar 2010 12:11:27 +0000 19 | From: "Example" 20 | To: 21 | Subject: test 22 | Date: Sat, 27 Mar 2010 12:11:21 -0000 23 | Message-ID: <118CE886A86E491FBC9BF5A1474EACD4@spike> 24 | MIME-Version: 1.0 25 | Content-Type: multipart/mixed; 26 | boundary="----=_NextPart_000_006B_01CACDA6.A0401560" 27 | X-Mailer: Microsoft Office Outlook 11 28 | X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7600.16385 29 | Thread-Index: AcrNpp0Z+/nuAPz4TiOf1ZIdke/vzQ== 30 | 31 | This is a multi-part message in MIME format. 
32 | 33 | ------=_NextPart_000_006B_01CACDA6.A0401560 34 | Content-Type: multipart/alternative; 35 | boundary="----=_NextPart_001_006C_01CACDA6.A0401560" 36 | 37 | 38 | ------=_NextPart_001_006C_01CACDA6.A0401560 39 | Content-Type: text/plain; 40 | charset="us-ascii" 41 | Content-Transfer-Encoding: 7bit 42 | 43 | test 44 | 45 | 46 | ------=_NextPart_001_006C_01CACDA6.A0401560 47 | Content-Type: text/html; 48 | charset="us-ascii" 49 | Content-Transfer-Encoding: quoted-printable 50 | 51 | 55 | 56 | 57 | 59 | 60 | 85 | 91 | 92 | 93 | 94 | 95 |
96 | 97 |

test

99 | 100 |
101 | 102 | 103 | 104 | 105 | 106 | ------=_NextPart_001_006C_01CACDA6.A0401560-- 107 | 108 | ------=_NextPart_000_006B_01CACDA6.A0401560 109 | Content-Type: message/rfc822 110 | Content-Transfer-Encoding: 7bit 111 | Content-Disposition: attachment 112 | 113 | Received: from mail by mail.example.com with local-bsmtp (Exim 4.69) 114 | (envelope-from ) 115 | id 1NvUpr-0002NA-Sp 116 | for from@example.com; Sat, 27 Mar 2010 12:09:54 +0000 117 | Received: from cpc1-mort1-0-0-cust399.croy.cable.virginmedia.com ([82.44.61.144] helo=spike) 118 | by mail.example.com with esmtpsa (TLS1.0:RSA_AES_128_CBC_SHA1:16) 119 | (Exim 4.69) 120 | (envelope-from ) 121 | id 1NvUpr-0002N7-Ob 122 | for from@example.com; Sat, 27 Mar 2010 12:09:51 +0000 123 | Return-Path: 124 | From: "Example" 125 | To: 126 | Subject: test 127 | Date: Sat, 27 Mar 2010 12:09:46 -0000 128 | Message-ID: <63850E3677964844B666AE60AFB35F6C@spike> 129 | MIME-Version: 1.0 130 | Content-Type: multipart/alternative; 131 | boundary="----=_NextPart_000_0067_01CACDA6.A0401560" 132 | X-Mailer: Microsoft Office Outlook 11 133 | X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) onmail.example.com 134 | X-Spam-Level: 135 | X-Spam-Status: No, score=-10.5 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00,TVD_SPACE_RATIO autolearn=no version=3.2.5 136 | X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7600.16385 137 | Thread-Index: AcrNpjn33LNJQ9EnTDiluk/gD2wIxw== 138 | 139 | This is a multi-part message in MIME format. 140 | 141 | ------=_NextPart_000_0067_01CACDA6.A0401560 142 | Content-Type: text/plain; 143 | charset="us-ascii" 144 | Content-Transfer-Encoding: 7bit 145 | 146 | test 147 | 148 | ------=_NextPart_000_0067_01CACDA6.A0401560 149 | Content-Type: text/html; 150 | charset="us-ascii" 151 | Content-Transfer-Encoding: quoted-printable 152 | 153 | 154 | 155 | 156 | 158 | 160 | test 161 | 162 | 163 | 164 | 165 |

test 166 |

167 | 168 | 169 | 170 | ------=_NextPart_000_0067_01CACDA6.A0401560-- 171 | 172 | ------=_NextPart_000_006B_01CACDA6.A0401560-- 173 | 174 | -------------------------------------------------------------------------------- /tests/eml/raw_text.txt: -------------------------------------------------------------------------------- 1 | test 2 | 3 | 4 | 5 | test 6 | -------------------------------------------------------------------------------- /tests/eml/standardized_text.eml: -------------------------------------------------------------------------------- 1 | From nobody Mon Aug 25 13:47:30 2014 2 | Content-Type: text/plain; charset="us-ascii" 3 | MIME-Version: 1.0 4 | Content-Transfer-Encoding: 7bit 5 | Subject: test 6 | From: me@example.com 7 | To: you@example.com 8 | 9 | the quick brown fox 10 | jumps over the lazy dog -------------------------------------------------------------------------------- /tests/epub/raw_text.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/epub/raw_text.epub -------------------------------------------------------------------------------- /tests/epub/raw_text.txt: -------------------------------------------------------------------------------- 1 | Epub testing 2 | With subtitle... 3 | Introduction 4 | Welcome here! All the text have ben generate with the Samuel L lorem ipsum. 5 | We happy? 6 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 7 | We happy? 8 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. 
Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die. 9 | Oh... what I'm gon' do? 10 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 11 | No man, I don't eat pork 12 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 13 | Is she dead, yes or no? 14 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 15 | We happy? 16 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. 
She starred in one of the ones that became nothing. 17 | We happy? 18 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die. 19 | Oh... what I'm gon' do? 20 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 21 | No man, I don't eat pork 22 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 23 | Is she dead, yes or no? 24 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 25 | We happy? 26 | Well, the way they make shows is, they make one show. That show's called a pilot. 
Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 27 | We happy? 28 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die. 29 | Oh... what I'm gon' do? 30 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 31 | No man, I don't eat pork 32 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 33 | Is she dead, yes or no? 34 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. 
And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 35 | We happy? 36 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 37 | We happy? 38 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die. 39 | Oh... what I'm gon' do? 40 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 41 | No man, I don't eat pork 42 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 43 | Is she dead, yes or no? 
44 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 45 | We happy? 46 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 47 | We happy? 48 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die. 49 | Oh... what I'm gon' do? 50 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 51 | No man, I don't eat pork 52 | Well, the way they make shows is, they make one show. That show's called a pilot. 
Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing. 53 | Is she dead, yes or no? 54 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee. 55 | -------------------------------------------------------------------------------- /tests/epub/standardized_text.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/epub/standardized_text.epub -------------------------------------------------------------------------------- /tests/gif/raw_text.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/gif/raw_text.gif -------------------------------------------------------------------------------- /tests/gif/standardized_text.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/gif/standardized_text.gif -------------------------------------------------------------------------------- /tests/html/standardized_text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

the quick

4 |

5 | brown fox jumps over the lazy dog 6 |

7 | 8 | 9 | -------------------------------------------------------------------------------- /tests/jpg/raw_text.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/jpg/raw_text.jpg -------------------------------------------------------------------------------- /tests/jpg/standardized_text.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/jpg/standardized_text.jpg -------------------------------------------------------------------------------- /tests/json/raw_text.json: -------------------------------------------------------------------------------- 1 | { 2 | "hey":"hello world this is some text from a json document", 3 | "should_ignore_this_number": 42, 4 | "should_extract_one_two_three": { 5 | "1": "one", 6 | "2": "two", 7 | "3": "three", 8 | "should_be_abcdef": ["a", "b", "c", ["d","e"], {"dict":"f"}] 9 | }, 10 | "finished": "fine" 11 | } -------------------------------------------------------------------------------- /tests/json/raw_text.txt: -------------------------------------------------------------------------------- 1 | fine hello world this is some text from a json document one two three a b c d e f -------------------------------------------------------------------------------- /tests/json/standardized_text.json: -------------------------------------------------------------------------------- 1 | { 2 | "a":"the quick brown fox", 3 | "aa": "jumps", 4 | "b": "over", 5 | "z": "the lazy dog" 6 | } 7 | -------------------------------------------------------------------------------- /tests/mp3/raw_text-m=google.txt: -------------------------------------------------------------------------------- 1 | Everything Is Awesome 2 | 
-------------------------------------------------------------------------------- /tests/mp3/raw_text-m=sphinx.txt: -------------------------------------------------------------------------------- 1 | everything is awesome 2 | -------------------------------------------------------------------------------- /tests/mp3/raw_text.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/mp3/raw_text.mp3 -------------------------------------------------------------------------------- /tests/mp3/raw_text.txt: -------------------------------------------------------------------------------- 1 | Everything Is Awesome 2 | -------------------------------------------------------------------------------- /tests/mp3/standardized_text.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/mp3/standardized_text.mp3 -------------------------------------------------------------------------------- /tests/msg/raw_text.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/msg/raw_text.msg -------------------------------------------------------------------------------- /tests/msg/raw_text.txt: -------------------------------------------------------------------------------- 1 | Test for TIF files 2 | 3 | This is a test email to experiment with the MS Outlook MSG Extractor 4 | 5 | 6 | -- 7 | 8 | 9 | Kind regards 10 | 11 | 12 | 13 | 14 | Brian Zhou 15 | 16 | -------------------------------------------------------------------------------- /tests/msg/standardized_text.msg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/msg/standardized_text.msg -------------------------------------------------------------------------------- /tests/no_ext/docx_paragraphs_and_tables: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/no_ext/docx_paragraphs_and_tables -------------------------------------------------------------------------------- /tests/no_ext/msg_standardized_text: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/no_ext/msg_standardized_text -------------------------------------------------------------------------------- /tests/no_ext/pdf_standardized_text: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/no_ext/pdf_standardized_text -------------------------------------------------------------------------------- /tests/odt/raw_text.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/odt/raw_text.odt -------------------------------------------------------------------------------- /tests/odt/raw_text.txt: -------------------------------------------------------------------------------- 1 | Sample OppenOffice Writer file with tabs and multiple spaces 2 | -------------------------------------------------------------------------------- /tests/odt/standardized_text.odt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/odt/standardized_text.odt -------------------------------------------------------------------------------- /tests/ogg/raw_text.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/ogg/raw_text.ogg -------------------------------------------------------------------------------- /tests/ogg/raw_text.txt: -------------------------------------------------------------------------------- 1 | Everything Is Awesome 2 | -------------------------------------------------------------------------------- /tests/ogg/standardized_text.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/ogg/standardized_text.ogg -------------------------------------------------------------------------------- /tests/pdf/ocr_text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/ocr_text.pdf -------------------------------------------------------------------------------- /tests/pdf/raw_text-m=pdfminer.txt: -------------------------------------------------------------------------------- 1 | I  love  word  documents.  They  are  lovely.  They  make  me  so  happy  I  could  smile.  And   2 | that’s  why  I  wrote  this  package.   3 |   4 | 5 | Sample text is hard. That’s 6 | where http://hipsum.co comes 7 | in handy. 8 | 9 |   10 | 11 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin 12 | pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer 13 | PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh 14 | ethnic Marfa 90's kogi American Apparel. 
Shabby chic distillery church- 15 | key locavore beard, food truck chillwave sartorial deep v flannel authentic 16 | Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage 17 | Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel 18 | keytar Portland post-ironic. Cred hoodie vegan, food truck leggings 19 | Austin pour-over banjo trust fund before they sold out cray Intelligentsia 20 | plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu. 21 | 22 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn 23 | biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, 24 | viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter 25 | YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag 26 | meh. Thundercats semiotics shabby chic forage single-origin coffee retro, 27 | 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard 28 | gluten-free seitan, VHS sartorial pork belly gastropub meh whatever 29 | authentic synth. Beard single-origin coffee irony fixie, before they sold 30 | 31 | out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan 32 | hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. 33 | 34 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice 35 | Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo 36 | booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick 37 | keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lo- 38 | fi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. 39 | Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party 40 | squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag. 41 | 42 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic 43 | trust fund deep v Thundercats Kickstarter narwhal locavore. 
Swag disrupt 44 | chambray, leggings shabby chic gastropub YOLO plaid hoodie 45 | Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan 46 | paleo Etsy you probably haven't heard of them Pitchfork Schlitz 47 | readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter 48 | next level banjo. Banksy occupy authentic master cleanse Bushwick 49 | fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four 50 | loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk 51 | PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify 52 | stumptown pop-up. 53 | Oh. You need a little dummy text for your mockup? How quaint. 54 | 55 | I bet you’re still using Bootstrap too… 56 | 57 |   58 | 59 | -------------------------------------------------------------------------------- /tests/pdf/raw_text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/raw_text.pdf -------------------------------------------------------------------------------- /tests/pdf/raw_text.txt: -------------------------------------------------------------------------------- 1 | I  love  word  documents.  They  are  lovely.  They  make  me  so  happy  I  could  smile.  And   2 | that’s  why  I  wrote  this  package.   3 |   4 | 5 | Sample text is hard. That’s 6 | where http://hipsum.co comes 7 | in handy. 8 |   9 | 10 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin 11 | pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer 12 | PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh 13 | ethnic Marfa 90's kogi American Apparel. Shabby chic distillery churchkey locavore beard, food truck chillwave sartorial deep v flannel authentic 14 | Tumblr narwhal kogi organic. 
Cred vegan jean shorts Banksy forage 15 | Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel 16 | keytar Portland post-ironic. Cred hoodie vegan, food truck leggings 17 | Austin pour-over banjo trust fund before they sold out cray Intelligentsia 18 | plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu. 19 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn 20 | biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, 21 | viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter 22 | YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag 23 | meh. Thundercats semiotics shabby chic forage single-origin coffee retro, 24 | 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard 25 | gluten-free seitan, VHS sartorial pork belly gastropub meh whatever 26 | authentic synth. Beard single-origin coffee irony fixie, before they sold 27 | 28 | out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan 29 | hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. 30 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice 31 | Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo 32 | booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick 33 | keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lofi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. 34 | Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party 35 | squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag. 36 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic 37 | trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt 38 | chambray, leggings shabby chic gastropub YOLO plaid hoodie 39 | Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan 40 | paleo Etsy you probably haven't heard of them Pitchfork Schlitz 41 | readymade small batch cred. 
Pug trust fund paleo, 90's fixie typewriter 42 | next level banjo. Banksy occupy authentic master cleanse Bushwick 43 | fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four 44 | loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk 45 | PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify 46 | stumptown pop-up. 47 | Oh. You need a little dummy text for your mockup? How quaint. 48 | I bet you’re still using Bootstrap too… 49 |   50 | 51 | -------------------------------------------------------------------------------- /tests/pdf/standardized_text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/standardized_text.pdf -------------------------------------------------------------------------------- /tests/pdf/two_column.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/two_column.pdf -------------------------------------------------------------------------------- /tests/png/raw_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/png/raw_text.png -------------------------------------------------------------------------------- /tests/png/standardized_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/png/standardized_text.png -------------------------------------------------------------------------------- /tests/pptx/raw_text.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pptx/raw_text.pptx -------------------------------------------------------------------------------- /tests/pptx/raw_text.txt: -------------------------------------------------------------------------------- 1 | I love 2 | 3 | PowerPoint. A lot. 4 | 5 | It is 6 | 7 | lovely. It makes me so happy I could smile. And that’s why I wrote this package. 8 | 9 | hipsum 10 | 11 | Viral Godard typewriter Pitchfork Blue Bottle asymmetrical gluten-free, forage hoodie. High Life ethnic biodiesel banjo you probably haven't heard of them skateboard. Swag forage art party, Marfa 12 | 13 | yr 14 | 15 | 16 | 17 | sriracha 18 | 19 | before they sold out PBR drinking vinegar Blue Bottle gluten-free. Lo-fi single-origin coffee bicycle rights, ennui 20 | 21 | selfies 22 | 23 | 24 | 25 | fap 26 | 27 | 28 | 29 | paleo 30 | 31 | 32 | 33 | Etsy 34 | 35 | 36 | 37 | Bushwick 38 | 39 | ethical. Vice authentic vinyl 3 wolf moon, pour-over tousled Marfa 40 | 41 | fingerstache 42 | 43 | readymade blog ethical squid. Echo Park mustache beard, crucifix 44 | 45 | fingerstache 46 | 47 | bitters 48 | 49 | normcore 50 | 51 | Blue Bottle +1. Skateboard bicycle rights XOXO, literally Vice put a bird on it VHS craft beer. 52 | 53 | Hipsum 54 | 55 | II 56 | 57 | Lo-fi fashion axe Godard food truck 3 wolf moon Echo Park, 58 | 59 | normcore 60 | 61 | raw denim kitsch narwhal 62 | 63 | mixtape 64 | 65 | . Brunch 66 | 67 | cray 68 | 69 | 70 | 71 | scenester 72 | 73 | selvage 74 | 75 | chillwave 76 | 77 | , pickled ennui irony ugh salvia beard Schlitz wayfarers kitsch 78 | 79 | Etsy 80 | 81 | . Brooklyn beard messenger bag tote bag Wes Anderson. 82 | 83 | Keffiyeh 84 | 85 | cardigan Blue Bottle Schlitz 86 | 87 | chillwave 88 | 89 | . 
Twee 90 | 91 | Kickstarter 92 | 93 | 94 | 95 | hella 96 | 97 | 98 | 99 | banh 100 | 101 | mi beard, 102 | 103 | Carles 104 | 105 | 106 | 107 | keytar 108 | 109 | 110 | 111 | mumblecore 112 | 113 | fanny pack 114 | 115 | selfies 116 | 117 | 118 | 119 | Tonx 120 | 121 | bitters 122 | 123 | mixtape 124 | 125 | 126 | 127 | gastropub 128 | 129 | . Vegan 130 | 131 | Neutra 132 | 133 | small batch 134 | 135 | keffiyeh 136 | 137 | , 138 | 139 | normcore 140 | 141 | single-origin coffee meh pickled High Life 142 | 143 | meggings 144 | 145 | hoodie next level 146 | 147 | fingerstache 148 | 149 | church-key. Jean shorts wayfarers messenger bag kitsch banjo art party single-origin coffee. -------------------------------------------------------------------------------- /tests/pptx/standardized_text.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pptx/standardized_text.pptx -------------------------------------------------------------------------------- /tests/ps/raw_text.ps: -------------------------------------------------------------------------------- 1 | %!PS-Adobe-1.0 2 | 3 | newpath 4 | 50 50 moveto 5 | 50 300 lineto % absolute coordinates 6 | 400 300 lineto 7 | 400 50 lineto 8 | closepath 9 | stroke 10 | 11 | newpath 12 | 0.5 setgray 13 | 100 100 moveto 14 | 100 0 rlineto % relative coordinates 15 | 0 100 rlineto 16 | -100 0 rlineto 17 | closepath 18 | fill 19 | 20 | /Times-Roman findfont 21 | 20 scalefont 22 | setfont 23 | 24 | newpath 25 | 0.0 setgray 26 | 72 72 moveto 27 | (How exciting!) 
show 28 | 29 | gsave 30 | 200 200 moveto 31 | 0.5 1 scale 32 | (Narrow Text) show 33 | grestore 34 | 35 | 0 2 360 { 36 | newpath 37 | gsave 38 | rotate 39 | 300 300 moveto 40 | 100 0 rlineto 41 | stroke 42 | grestore 43 | } for 44 | 45 | gsave 46 | /Times-Roman findfont 60 scalefont setfont 47 | 72 72 moveto (Clipping) true charpath 48 | clip 49 | 174 72 translate 50 | 0 2 360 { % every 2. degree of 360 degrees 51 | newpath 52 | gsave 53 | rotate 54 | 0 0 moveto 55 | 300 0 rlineto 56 | stroke 57 | grestore 58 | } for 59 | grestore 60 | 61 | 62 | showpage 63 | -------------------------------------------------------------------------------- /tests/ps/raw_text.txt: -------------------------------------------------------------------------------- 1 | Narrow Text 2 | How exciting! 3 | -------------------------------------------------------------------------------- /tests/psv/raw_text.txt: -------------------------------------------------------------------------------- 1 | CREATION DATE STATUS COMPLETION DATE SERVICE REQUEST NUMBER TYPE OF SERVICE REQUEST CURRENT ACTIVITY MOST RECENT ACTION NUMBER OF POTHOLES FILLED ON BLOCK STREET ADDRESS ZIP X COORDINATE Y COORDINATE Ward Police District Community Area LATITUDE LONGITUDE LOCATION 2 | 08/28/2014 Completed 08/28/2014 14-01433654 Pothole in Street Final Outcome Pothole Patched 5 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 3 | 08/27/2014 Completed 08/27/2014 14-01424541 Pothole in Street Final Outcome Pothole Patched 1 100 N MICHIGAN AVE 60602 1177299.6140023 1900836.66107586 42 1 32 41.88328915138877 -87.62454939639862 (41.88328915138877, -87.62454939639862) 4 | 08/27/2014 Completed 08/27/2014 14-01424527 Pothole in Street Final Outcome Pothole Patched 5 100 S MICHIGAN AVE 60603 1177324.39101512 1899960.94636756 42 1 32 41.88077066474826 -87.62448301713603 (41.88077066474826, -87.62448301713603) 5 | 08/27/2014 Completed 
08/27/2014 14-01424501 Pothole in Street Final Outcome Pothole Patched 5 200 S MICHIGAN AVE 60604 1177323.63491104 1899497.03462832 42 1 32 41.87948932869769 -87.6244981249339 (41.87948932869769, -87.6244981249339) 6 | 08/27/2014 Completed 08/27/2014 14-01424389 Pothole in Street Final Outcome Pothole Patched 5 300 S MICHIGAN AVE 60604 1177339.54271824 1899032.66368812 2 1 32 41.87821734531058 -87.62445584029679 (41.87821734531058, -87.62445584029679) 7 | 08/27/2014 Completed 08/27/2014 14-01424212 Pothole in Street Final Outcome Pothole Patched 15 400 S MICHIGAN AVE 60605 1177347.93260788 1898568.04117246 2 1 32 41.87693801152278 -87.6244366179451 (41.87693801152278, -87.6244366179451) 8 | 08/25/2014 Completed 08/25/2014 14-01404817 Pothole in Street Final Outcome Pothole Patched 10 740 N WABASH AVE 60611 1176600.11893119 1905427.14160096 42 18 8 41.896091479927165 -87.6269829451334 (41.896091479927165, -87.6269829451334) 9 | 08/22/2014 Completed 08/22/2014 14-01395263 Pothole in Street Final Outcome Pothole Patched 1 1137 W CHICAGO AVE 60642 1168573.58997114 1905510.8499928 27 12 24 41.89613260416578 -87.65667886202922 (41.89613260416578, -87.65667886202922) 10 | 08/22/2014 Completed 08/22/2014 14-01390538 Pothole in Street Final Outcome Pothole Patched 5 615 N FRANKLIN ST 60654 1174219.86039744 1904291.9608037 42 18 8 41.89291371668045 -87.63546140541867 (41.89291371668045, -87.63546140541867) 11 | 08/21/2014 Completed 08/28/2014 14-01383161 Pothole in Street Final Outcome Pothole Patched 5 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 12 | 08/12/2014 Completed 08/25/2014 14-01323742 Pothole in Street Final Outcome Pothole Patched 2 2744 W EVERGREEN AVE 60622 1157855.84623339 1908863.62028457 26 14 24 41.905779104276164 -87.69629062504274 (41.905779104276164, -87.69629062504274) 13 | 08/11/2014 Completed 08/25/2014 14-01317376 Pothole in Street Final Outcome Pothole 
Patched 4 2701 W HIRSCH ST 60622 1158141.76231382 1909202.67761659 26 14 24 41.9064904995143 -87.69453948271031 (41.9064904995143, -87.69453948271031) 14 | 08/07/2014 Completed 08/27/2014 14-01294373 Pothole in Street Final Outcome Pothole Patched 1 159 N DEARBORN ST 60601 1175915.29501332 1901390.11610965 42 1 32 41.884767887865465 -87.62932139824203 (41.884767887865465, -87.62932139824203) 15 | 08/06/2014 Completed 08/26/2014 14-01284763 Pothole in Street Final Outcome Pothole Patched 17 100 S WABASH AVE 60603 1176824.63149672 1899942.95872303 42 1 32 41.88072398164076 -87.62631635116725 (41.88072398164076, -87.62631635116725) 16 | 08/04/2014 Completed 08/25/2014 14-01264182 Pothole in Street Final Outcome Pothole Patched 3 201 E SUPERIOR ST 60611 1177708.98999045 1905405.56110885 42 18 8 41.8956500849415 -87.62265814875258 (41.8956500849415, -87.62265814875258) 17 | 08/04/2014 Completed 08/22/2014 14-01266469 Pothole in Street Final Outcome Pothole Patched 8 220 W SUPERIOR ST 60654 1174518.27808694 1905317.80798624 42 18 8 41.89569109218158 -87.63509877442043 (41.89569109218158, -87.63509877442043) 18 | 08/03/2014 Completed 08/22/2014 14-01258054 Pothole in Street Final Outcome Pothole Patched 8 600 N WABASH AVE 60611 1176632.67288564 1904207.6496666 42 18 8 41.892531148204995 -87.62689596447262 (41.892531148204995, -87.62689596447262) 19 | 07/29/2014 Completed 08/28/2014 14-01228358 Pothole in Street Final Outcome Pothole Patched 7 10 W ELM ST 60610 1176002.41378993 1908093.80450502 42 18 8 41.90328038833401 -87.62904275288018 (41.90328038833401, -87.62904275288018) 20 | 07/29/2014 Completed 08/25/2014 14-01224309 Pothole in Street Final Outcome Pothole Patched 15 201 E HURON ST 60611 1177717.86567145 1905118.00892518 42 18 8 41.89486671392807 -87.62262561802254 (41.89486671392807, -87.62262561802254) 21 | 07/28/2014 Completed 08/27/2014 14-01208070 Pothole in Street Final Outcome Pothole Patched 2 177 N WELLS ST 60606 1174702.03147832 1901552.59999636 42 1 32 
41.88525719405359 -87.63376979601067 (41.88525719405359, -87.63376979601067) 22 | 07/26/2014 Completed 08/28/2014 14-01203517 Pothole in Street Final Outcome Pothole Patched 1 1 W MAPLE ST 60610 1176169.35729479 1907624.99998895 42 18 8 41.90177327913275 -87.62842192883082 (41.90177327913275, -87.62842192883082) 23 | 07/24/2014 Completed 08/22/2014 14-01190319 Pothole in Street Final Outcome Pothole Patched 3 730 N FRANKLIN ST 60654 1174191.71119404 1905308.4805198 42 18 8 41.89562283550215 -87.63582499692767 (41.89562283550215, -87.63582499692767) 24 | 07/23/2014 Completed 08/28/2014 14-01178461 Pothole in Street Final Outcome Pothole Patched 3 1155 N DEARBORN ST 60610 1175723.69360823 1908131.92908715 42 18 8 41.90344694644998 -87.62982489848619 (41.90344694644998, -87.62982489848619) 25 | 07/21/2014 Completed 08/22/2014 14-01161819 Pothole in Street Final Outcome Pothole Patched 3 600 E GRAND AVE 60611 1180764.02691294 1904055.10759636 42 18 8 41.892094136861786 -87.61156988394656 (41.892094136861786, -87.61156988394656) 26 | 07/18/2014 Completed 08/27/2014 14-01149914 Pothole in Street Final Outcome Pothole Patched 4 150 N FRANKLIN ST 60606 1174306.13841283 1901261.81180327 42 1 32 41.88449312034977 -87.63552722968024 (41.88449312034977, -87.63552722968024) 27 | 07/17/2014 Completed 08/22/2014 14-01134572 Pothole in Street Final Outcome Pothole Patched 2 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 28 | 07/14/2014 Completed 08/28/2014 14-01112602 Pothole in Street Final Outcome Pothole Patched 15 170 W OAK ST 60610 1174950.17410849 1907124.5048948 42 18 8 41.90065230620216 -87.63419240464901 (41.90065230620216, -87.63419240464901) 29 | 07/14/2014 Completed 08/22/2014 14-01114977 Pothole in Street Final Outcome Pothole Patched 3 461 N CITYFRONT PLAZA DR 60611 1177942.61391555 1903427.32649215 42 18 8 41.89012269030228 -87.62144215679457 (41.89012269030228, 
-87.62144215679457) 30 | 07/14/2014 Completed 08/27/2014 14-01114104 Pothole in Street Final Outcome Pothole Patched 13 462 N LAKE SHORE DR 60611 1180059.25771678 1903489.31249721 42 18 8 41.89043391381039 -87.61432635149913 (41.89043391381039, -87.61432635149913) 31 | 07/13/2014 Completed 08/22/2014 14-01106120 Pothole in Street Final Outcome Pothole Patched 2 37 E OHIO ST 60611 1176632.67288564 1904207.6496666 42 18 8 41.892389760220446 -87.6266700643506 (41.892389760220446, -87.6266700643506) 32 | 07/11/2014 Completed 08/27/2014 14-01092992 Pothole in Street Final Outcome Pothole Patched 0 517 N LAKE SHORE DR 60611 1180023.3999851 1903912.87000671 42 18 8 41.891747134984456 -87.61415723810663 (41.891747134984456, -87.61415723810663) 33 | 07/11/2014 Completed 08/28/2014 14-01092312 Pothole in Street Final Outcome Pothole Patched 2 1165 N LA SALLE DR 60610 1174923.84946801 1908163.01737071 42 18 8 41.90367107146938 -87.63276634924583 (41.90367107146938, -87.63276634924583) 34 | 07/09/2014 Completed 08/22/2014 14-01072126 Pothole in Street Final Outcome Pothole Patched 1 200 W OHIO ST 60654 1174634.07810009 1904152.08572945 42 18 8 41.892497422329804 -87.63416622539611 (41.892497422329804, -87.63416622539611) 35 | 07/08/2014 Completed 08/26/2014 14-01064944 Pothole in Street Final Outcome Pothole Patched 4 34 S STATE ST 60603 1176384.09122814 1900236.40989122 42 1 32 41.881194280111245 -87.62788715387323 (41.881194280111245, -87.62788715387323) 36 | 07/06/2014 Completed 08/22/2014 14-01050617 Pothole in Street Final Outcome Pothole Patched 5 430 N FRANKLIN ST 60654 1174247.46842821 1903265.17618927 42 18 8 41.890001134206734 -87.63568286112702 (41.890001134206734, -87.63568286112702) 37 | 06/30/2014 Completed 08/26/2014 14-01013381 Pothole in Street Final Outcome Pothole Patched 7 50 E JACKSON BLVD 60604 1176916.76437779 1899018.37548671 42 1 32 41.87836259235806 -87.62578989491375 (41.87836259235806, -87.62578989491375) 38 | 06/26/2014 Completed 08/27/2014 
14-00992410 Pothole in Street Final Outcome Pothole Patched 6 201 N WELLS ST 60606 1174696.68801277 1901736.11738759 42 1 32 41.885758730147366 -87.63378404961261 (41.885758730147366, -87.63378404961261) 39 | 06/23/2014 Completed 08/22/2014 14-00961579 Pothole in Street Final Outcome Pothole Patched 1 340 W ERIE ST 60654 1173808.62199403 1904714.70549298 42 18 8 41.89405989499057 -87.63717643739275 (41.89405989499057, -87.63717643739275) 40 | 06/23/2014 Completed 08/26/2014 14-00967688 Pothole in Street Final Outcome Pothole Patched 12 140 S DEARBORN ST 60603 1175959.03010572 1899623.29000633 42 1 32 41.87967604011696 -87.62950228266759 (41.87967604011696, -87.62950228266759) 41 | 06/18/2014 Completed 08/27/2014 14-00933275 Pothole in Street Final Outcome Pothole Patched 15 211 N DEARBORN ST 60601 1175900.7237733 1901846.43477009 42 1 32 41.88604316092842 -87.62936052143365 (41.88604316092842, -87.62936052143365) 42 | 06/12/2014 Completed 08/27/2014 14-00897156 Pothole in Street Final Outcome Pothole Patched 42 200 S LA SALLE ST 60604 1175161.95537691 1899423.25992154 42 1 32 41.87936625147701 -87.63243763414764 (41.87936625147701, -87.63243763414764) 43 | 06/12/2014 Completed 08/22/2014 14-00890682 Pothole in Street Final Outcome Pothole Patched 1 306 W OHIO ST 60654 1174162.61920993 1904141.19087966 42 18 8 41.89247410656883 -87.63593436430777 (41.89247410656883, -87.63593436430777) 44 | 06/11/2014 Completed 08/28/2014 14-00887552 Pothole in Street Final Outcome Pothole Patched 4 1020 N STATE ST 60610 1176176.20887962 1907374.89001835 42 18 8 41.901200484161656 -87.62847652035359 (41.901200484161656, -87.62847652035359) 45 | 06/10/2014 Completed 08/27/2014 14-00880864 Pothole in Street Final Outcome Pothole Patched 0 245 W WASHINGTON ST 60606 1174540.03218467 1900805.68709955 42 1 32 41.883096426444126 -87.63515649580224 (41.883096426444126, -87.63515649580224) 46 | 06/04/2014 Completed 08/22/2014 14-00844247 Pothole in Street Final Outcome Pothole Patched 2 525 
W SUPERIOR ST 60654 1172505.49257168 1905270.97268573 42 18 8 41.895379256528365 -87.64251783750353 (41.895379256528365, -87.64251783750353) 47 | 06/03/2014 Completed 08/26/2014 14-00829362 Pothole in Street Final Outcome Pothole Patched 2 100 S STATE ST 60603 1176400.89309128 1899922.58000955 42 1 32 41.88071513784339 -87.62787607841581 (41.88071513784339, -87.62787607841581) 48 | 06/01/2014 Completed 08/26/2014 14-00819596 Pothole in Street Final Outcome Pothole Patched 1 21 S DEARBORN ST 60603 1175943.73762726 1900281.08996842 42 1 32 41.88163394997373 -87.62924355913282 (41.88163394997373, -87.62924355913282) 49 | 05/31/2014 Completed 08/28/2014 14-00816655 Pothole in Street Final Outcome Pothole Patched 2 1250 N DEARBORN ST 60610 1175716.58493337 1908364.4279693 42 18 8 41.905379013702905 -87.63017455348202 (41.905379013702905, -87.63017455348202) 50 | 05/29/2014 Completed 08/22/2014 14-00805945 Pothole in Street Final Outcome Pothole Patched 10 738 N LARRABEE ST 60654 1172208.49919151 1905332.36197604 42 18 8 41.89589422031982 -87.64313907101959 (41.89589422031982, -87.64313907101959) 51 | 05/29/2014 Completed 08/26/2014 14-00802449 Pothole in Street Final Outcome Pothole Patched 6 33 S STATE ST 60603 1176383.89122814 1900241.30989122 42 1 32 41.8812023900134 -87.62759359231444 (41.8812023900134, -87.62759359231444) 52 | 05/28/2014 Completed 08/22/2014 14-00797786 Pothole in Street Final Outcome Pothole Patched 2 720 N FRANKLIN ST 60654 1174193.91119404 1905217.9805198 42 18 8 41.89553241941632 -87.63582582624329 (41.89553241941632, -87.63582582624329) 53 | 05/26/2014 Completed 08/28/2014 14-00780187 Pothole in Street Final Outcome Pothole Patched 15 12 W ELM ST 60610 1175983.31378993 1908093.10450502 42 18 8 41.90327840141334 -87.62912853335304 (41.90327840141334, -87.62912853335304) 54 | 05/21/2014 Completed 08/28/2014 14-00753021 Pothole in Street Final Outcome Pothole Patched 10 30 E ELM ST 60611 1176399.91378993 1908106.10450502 42 18 8 
41.903312908929266 -87.6274232482672 (41.903312908929266, -87.6274232482672) 55 | 05/19/2014 Completed 08/27/2014 14-00736478 Pothole in Street Final Outcome Pothole Patched 10 520 E ILLINOIS ST 60611 1180045.15771678 1903729.01249721 42 18 8 41.891214491225654 -87.61421696505855 (41.891214491225654, -87.61421696505855) 56 | 04/08/2014 Completed 08/25/2014 14-00504762 Pothole in Street Final Outcome Pothole Patched 2 1300 N CALIFORNIA AVE 60622 1157492.02101534 1908524.16997455 26 14 24 41.904795555353125 -87.69707304441381 (41.904795555353125, -87.69707304441381) -------------------------------------------------------------------------------- /tests/psv/standardized_text.psv: -------------------------------------------------------------------------------- 1 | the|quick 2 | brown|fox 3 | jumps over|the 4 | lazy|dog -------------------------------------------------------------------------------- /tests/rtf/raw_text.txt: -------------------------------------------------------------------------------- 1 | I love word documents. They are lovely. They make me so happy I could smile. And that is why I wrote this package. 2 | 3 | Sample text is hard. That is where http://hipsum.co comes in handy. 4 | 5 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh ethnic Marfa 90's kogi American Apparel. Shabby chic distillery church-key locavore beard, food truck chillwave sartorial deep v flannel authentic Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel keytar Portland post-ironic. Cred hoodie vegan, food truck leggings Austin pour-over banjo trust fund before they sold out cray Intelligentsia plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu. 
6 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag meh. Thundercats semiotics shabby chic forage single-origin coffee retro, 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard gluten-free seitan, VHS sartorial pork belly gastropub meh whatever authentic synth. Beard single-origin coffee irony fixie, before they sold out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. 7 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lo-fi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag. 8 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt chambray, leggings shabby chic gastropub YOLO plaid hoodie Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan paleo Etsy you probably haven't heard of them Pitchfork Schlitz readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter next level banjo. Banksy occupy authentic master cleanse Bushwick fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify stumptown pop-up. 9 | Oh. You need a little dummy text for your mockup? How quaint. 
10 | I bet you are still using Bootstrap too 11 | 12 | -------------------------------------------------------------------------------- /tests/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Run the test suite that is specified in the .travis.yml file 4 | """ 5 | 6 | import os 7 | import subprocess 8 | 9 | import yaml 10 | 11 | from textract.colors import green, red 12 | 13 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 14 | def run_test(command): 15 | wrapped_command = "cd %s && %s" % (root_dir, command) 16 | pipe = subprocess.Popen( 17 | wrapped_command, shell=True, 18 | ) 19 | pipe.wait() 20 | if pipe.returncode == 0: 21 | print(green("TEST PASSED")) 22 | else: 23 | print(red("TEST FAILED")) 24 | return pipe.returncode 25 | 26 | # load the script tests from the .travis.yml file 27 | with open(os.path.join(root_dir, '.travis.yml')) as stream: 28 | travis_yml = yaml.load_all(stream.read()) 29 | config = travis_yml.next() 30 | tests = config['script'] 31 | 32 | # run the tests 33 | if isinstance(tests, (str, unicode)): 34 | returncode = run_test(tests) 35 | elif isinstance(tests, (list, tuple)): 36 | returncode = 0 37 | for test in tests: 38 | returncode += run_test(test) 39 | 40 | if returncode == 0: 41 | print(green("ALL TESTS PASSED")) 42 | else: 43 | print(red("SOME TESTS FAILED, SEE ABOVE")) 44 | -------------------------------------------------------------------------------- /tests/run_docker_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run this to create an up-to-date Docker container and run tests. 4 | 5 | cd $(dirname $0)/.. 6 | base=$(pwd) 7 | 8 | image="textract/ubuntu12.04" 9 | 10 | cp tests/Dockerfile ./Dockerfile 11 | 12 | # Note: For speed, the image won't be automatically rebuilt. 
If the dependencies 13 | # change and the existing image is outdated, just delete it with: 14 | # docker rmi 15 | docker images | grep $image || docker build -t $image . 16 | docker run --rm -v $base:/home/textract/src $image 17 | 18 | rm ./Dockerfile 19 | 20 | -------------------------------------------------------------------------------- /tests/test_csv.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . import base 4 | 5 | 6 | class CsvTestCase(base.BaseParserTestCase, unittest.TestCase): 7 | extension = 'csv' 8 | -------------------------------------------------------------------------------- /tests/test_doc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . import base 4 | 5 | 6 | class DocTestCase(base.ShellParserTestCase, unittest.TestCase): 7 | extension = 'doc' 8 | -------------------------------------------------------------------------------- /tests/test_docx.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | from . import base 5 | 6 | 7 | class DocxTestCase(base.BaseParserTestCase, unittest.TestCase): 8 | extension = 'docx' 9 | 10 | def test_tables(self): 11 | """make sure table output is correct""" 12 | d = self.get_extension_directory() 13 | self.compare_cli_output(os.path.join(d, "paragraphs_and_tables.docx")) 14 | -------------------------------------------------------------------------------- /tests/test_eml.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . 
import base 4 | 5 | 6 | class EmlTestCase(base.BaseParserTestCase, unittest.TestCase): 7 | extension = 'eml' 8 | -------------------------------------------------------------------------------- /tests/test_epub.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . import base 4 | 5 | 6 | class EpubTestCase(base.BaseParserTestCase, unittest.TestCase): 7 | extension = 'epub' 8 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import subprocess 4 | import uuid 5 | 6 | from . import base 7 | 8 | 9 | class ExceptionTestCase(base.GenericUtilities, unittest.TestCase): 10 | """This class contains a bunch of tests to make sure that textract 11 | fails in expected ways. 12 | """ 13 | 14 | def test_unsupported_extension_cli(self): 15 | """Make sure unsupported extension exits with non-zero status""" 16 | filename = self.get_temp_filename(extension="extension") 17 | command = "textract %(filename)s 2> /dev/null" % locals() 18 | self.assertEqual(1, subprocess.call(command, shell=True)) 19 | os.remove(filename) 20 | 21 | def test_unsupported_extension_python(self): 22 | """Make sure unsupported extension raises the correct error""" 23 | filename = self.get_temp_filename(extension="extension") 24 | import textract 25 | from textract.exceptions import ExtensionNotSupported 26 | with self.assertRaises(ExtensionNotSupported): 27 | textract.process(filename) 28 | os.remove(filename) 29 | 30 | def test_missing_filename_cli(self): 31 | """Make sure missing files exits with non-zero status""" 32 | filename = self.get_temp_filename() 33 | os.remove(filename) 34 | command = "textract %(filename)s 2> /dev/null" % locals() 35 | self.assertEqual(1, subprocess.call(command, shell=True)) 36 | 37 | def test_missing_filename_python(self): 38 | """Make 
sure missing files raise the correct error""" 39 | filename = self.get_temp_filename() 40 | os.remove(filename) 41 | import textract 42 | from textract.exceptions import MissingFileError 43 | with self.assertRaises(MissingFileError): 44 | textract.process(filename) 45 | 46 | def test_shell_parser_run(self): 47 | """get a useful error message when a dependency is missing""" 48 | from textract.parsers import utils 49 | from textract.parsers import exceptions 50 | parser = utils.ShellParser() 51 | try: 52 | # There shouldn't be a command on the path matching a random uuid 53 | parser.run([str(uuid.uuid4())]) 54 | except exceptions.ShellError as e: 55 | self.assertTrue(e.is_not_installed()) 56 | else: 57 | self.assertTrue(False, "Expected ShellError") 58 | -------------------------------------------------------------------------------- /tests/test_gif.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . import base 4 | 5 | 6 | class GifTestCase(base.ShellParserTestCase, unittest.TestCase): 7 | extension = 'gif' 8 | -------------------------------------------------------------------------------- /tests/test_html.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | from . 
import base 5 | 6 | 7 | class HtmlTestCase(base.BaseParserTestCase, unittest.TestCase): 8 | extension = 'html' 9 | 10 | def test_table_text_python(self): 11 | """Make sure tables in html look pretty through python""" 12 | d = self.get_extension_directory() 13 | self.compare_python_output(os.path.join(d, "tables.html")) 14 | 15 | def test_table_text_cli(self): 16 | """Make sure tables in html look pretty through cli""" 17 | d = self.get_extension_directory() 18 | self.compare_cli_output(os.path.join(d, "tables.html")) 19 | -------------------------------------------------------------------------------- /tests/test_jpg.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import shutil 3 | import os 4 | 5 | from . import base 6 | 7 | 8 | class JpgTestCase(base.ShellParserTestCase, unittest.TestCase): 9 | extension = 'jpg' 10 | 11 | def get_jpeg_filename(self, contents_filename): 12 | temp_filename = self.get_temp_filename() 13 | jpeg_filename = temp_filename + ".jpeg" 14 | os.remove(temp_filename) 15 | shutil.copyfile(contents_filename, jpeg_filename) 16 | return jpeg_filename 17 | 18 | def test_jpeg_synonym_cli(self): 19 | """Make sure .jpeg synonym works in cli""" 20 | jpeg_filename = self.get_jpeg_filename(self.raw_text_filename) 21 | self.compare_cli_output( 22 | jpeg_filename, 23 | self.get_expected_filename(self.raw_text_filename), 24 | ) 25 | os.remove(jpeg_filename) 26 | 27 | def test_jpeg_synonym_python(self): 28 | """Make sure .jpeg synonym works in python""" 29 | jpeg_filename = self.get_jpeg_filename(self.raw_text_filename) 30 | self.compare_python_output( 31 | jpeg_filename, 32 | self.get_expected_filename(self.raw_text_filename), 33 | ) 34 | os.remove(jpeg_filename) 35 | 36 | -------------------------------------------------------------------------------- /tests/test_json.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from . 
class Mp3TestCase(base.ShellParserTestCase, unittest.TestCase):
    extension = 'mp3'

    def _check_python_output(self, **parser_options):
        # every mp3 check compares the python API output for the raw
        # text fixture; only the keyword options differ between tests
        self.compare_python_output(self.raw_text_filename, **parser_options)

    def test_mp3(self):
        """make sure default audio method output is correct"""
        self._check_python_output()

    def test_mp3_google(self):
        """make sure google api python output is correct"""
        self._check_python_output(method='google')

    def test_mp3_sphinx(self):
        """make sure sphinx python output is correct"""
        self._check_python_output(method='sphinx')
class No_Ext_TestCase(unittest.TestCase):
    """Extraction from files that have no filename extension, where the
    extension is supplied through the ``extension`` keyword instead.
    """

    def _extract(self, fixture_name, extension):
        """Run textract on ``tests/no_ext/<fixture_name>`` with an explicit
        ``extension`` and return whatever ``textract.process`` returns.
        """
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        fixture = os.path.join(tests_dir, "no_ext", fixture_name)
        # pass the file without extension and provide the extension as a
        # parameter
        return textract.process(fixture, extension=extension)

    def _assert_has_text(self, text):
        # Printing the result proved nothing; require that some
        # non-whitespace text was actually extracted.
        self.assertTrue(text.strip(), "no text was extracted")

    def test_docx(self):
        self._assert_has_text(
            self._extract("docx_paragraphs_and_tables", 'docx')
        )

    def test_msg(self):
        self._assert_has_text(
            self._extract("msg_standardized_text", 'msg')
        )

    def test_pdf(self):
        # deliberately uses the dotted form of the extension
        self._assert_has_text(
            self._extract("pdf_standardized_text", '.pdf')
        )
class PdfTestCase(base.ShellParserTestCase, unittest.TestCase):
    extension = 'pdf'

    def test_pdfminer_python(self):
        """make sure pdfminer python output is correct"""
        source_pdf = self.raw_text_filename
        self.compare_python_output(source_pdf, method='pdfminer')

    def test_pdfminer_cli(self):
        """make sure pdfminer command line output is correct"""
        source_pdf = self.raw_text_filename
        self.compare_cli_output(source_pdf, method='pdfminer')

    def test_tesseract_cli(self):
        """confirm pdf extraction with tesseract"""
        fixture_dir = self.get_extension_directory()
        scanned_pdf = os.path.join(fixture_dir, "ocr_text.pdf")
        expected_text = os.path.join(fixture_dir, "ocr_text.txt")
        self.compare_cli_output(
            scanned_pdf,
            expected_filename=expected_text,
            method='tesseract',
        )

    def test_two_column(self):
        """Preserve two column layout in extraction"""
        fixture_dir = self.get_extension_directory()
        two_column_pdf = os.path.join(fixture_dir, 'two_column.pdf')
        self.compare_python_output(two_column_pdf, layout=True)
# NOTE(review): this class is defined in tests/test_tiff.py and targets the
# 'tiff' extension, yet it is named PngTestCase -- looks like a copy-paste
# from tests/test_png.py. Consider renaming it to TiffTestCase.
class PngTestCase(base.ShellParserTestCase, unittest.TestCase):
    # selects the 'tiff' fixtures/parser for the checks inherited from
    # base.ShellParserTestCase (presumably defined in tests/base.py --
    # TODO confirm what those checks cover)
    extension = 'tiff'
class TxtTestCase(base.BaseParserTestCase, unittest.TestCase):
    """Parser tests for the 'txt' extension, including files whose name
    has no extension at all.
    """
    extension = 'txt'

    def test_extensionless_filenames(self):
        """make sure that text from extensionless files is treated as txt"""
        temp_filename = self.get_temp_filename()
        try:
            shutil.copyfile(self.raw_text_filename, temp_filename)
            self.compare_python_output(temp_filename, self.raw_text_filename)
        finally:
            # remove the temporary copy even when the comparison fails
            os.remove(temp_filename)
import base 4 | 5 | 6 | class XlsxTestCase(base.BaseParserTestCase, unittest.TestCase): 7 | extension = 'xlsx' 8 | -------------------------------------------------------------------------------- /tests/tiff/raw_text.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/tiff/raw_text.tiff -------------------------------------------------------------------------------- /tests/tiff/standardized_text.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/tiff/standardized_text.tiff -------------------------------------------------------------------------------- /tests/tsv/raw_text.txt: -------------------------------------------------------------------------------- 1 | CREATION DATE STATUS COMPLETION DATE SERVICE REQUEST NUMBER TYPE OF SERVICE REQUEST CURRENT ACTIVITY MOST RECENT ACTION NUMBER OF POTHOLES FILLED ON BLOCK STREET ADDRESS ZIP X COORDINATE Y COORDINATE Ward Police District Community Area LATITUDE LONGITUDE LOCATION 2 | 08/28/2014 Completed 08/28/2014 14-01433654 Pothole in Street Final Outcome Pothole Patched 5 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 3 | 08/27/2014 Completed 08/27/2014 14-01424541 Pothole in Street Final Outcome Pothole Patched 1 100 N MICHIGAN AVE 60602 1177299.6140023 1900836.66107586 42 1 32 41.88328915138877 -87.62454939639862 (41.88328915138877, -87.62454939639862) 4 | 08/27/2014 Completed 08/27/2014 14-01424527 Pothole in Street Final Outcome Pothole Patched 5 100 S MICHIGAN AVE 60603 1177324.39101512 1899960.94636756 42 1 32 41.88077066474826 -87.62448301713603 (41.88077066474826, -87.62448301713603) 5 | 08/27/2014 Completed 08/27/2014 14-01424501 Pothole in Street 
Final Outcome Pothole Patched 5 200 S MICHIGAN AVE 60604 1177323.63491104 1899497.03462832 42 1 32 41.87948932869769 -87.6244981249339 (41.87948932869769, -87.6244981249339) 6 | 08/27/2014 Completed 08/27/2014 14-01424389 Pothole in Street Final Outcome Pothole Patched 5 300 S MICHIGAN AVE 60604 1177339.54271824 1899032.66368812 2 1 32 41.87821734531058 -87.62445584029679 (41.87821734531058, -87.62445584029679) 7 | 08/27/2014 Completed 08/27/2014 14-01424212 Pothole in Street Final Outcome Pothole Patched 15 400 S MICHIGAN AVE 60605 1177347.93260788 1898568.04117246 2 1 32 41.87693801152278 -87.6244366179451 (41.87693801152278, -87.6244366179451) 8 | 08/25/2014 Completed 08/25/2014 14-01404817 Pothole in Street Final Outcome Pothole Patched 10 740 N WABASH AVE 60611 1176600.11893119 1905427.14160096 42 18 8 41.896091479927165 -87.6269829451334 (41.896091479927165, -87.6269829451334) 9 | 08/22/2014 Completed 08/22/2014 14-01395263 Pothole in Street Final Outcome Pothole Patched 1 1137 W CHICAGO AVE 60642 1168573.58997114 1905510.8499928 27 12 24 41.89613260416578 -87.65667886202922 (41.89613260416578, -87.65667886202922) 10 | 08/22/2014 Completed 08/22/2014 14-01390538 Pothole in Street Final Outcome Pothole Patched 5 615 N FRANKLIN ST 60654 1174219.86039744 1904291.9608037 42 18 8 41.89291371668045 -87.63546140541867 (41.89291371668045, -87.63546140541867) 11 | 08/21/2014 Completed 08/28/2014 14-01383161 Pothole in Street Final Outcome Pothole Patched 5 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 12 | 08/12/2014 Completed 08/25/2014 14-01323742 Pothole in Street Final Outcome Pothole Patched 2 2744 W EVERGREEN AVE 60622 1157855.84623339 1908863.62028457 26 14 24 41.905779104276164 -87.69629062504274 (41.905779104276164, -87.69629062504274) 13 | 08/11/2014 Completed 08/25/2014 14-01317376 Pothole in Street Final Outcome Pothole Patched 4 2701 W HIRSCH ST 60622 
1158141.76231382 1909202.67761659 26 14 24 41.9064904995143 -87.69453948271031 (41.9064904995143, -87.69453948271031) 14 | 08/07/2014 Completed 08/27/2014 14-01294373 Pothole in Street Final Outcome Pothole Patched 1 159 N DEARBORN ST 60601 1175915.29501332 1901390.11610965 42 1 32 41.884767887865465 -87.62932139824203 (41.884767887865465, -87.62932139824203) 15 | 08/06/2014 Completed 08/26/2014 14-01284763 Pothole in Street Final Outcome Pothole Patched 17 100 S WABASH AVE 60603 1176824.63149672 1899942.95872303 42 1 32 41.88072398164076 -87.62631635116725 (41.88072398164076, -87.62631635116725) 16 | 08/04/2014 Completed 08/25/2014 14-01264182 Pothole in Street Final Outcome Pothole Patched 3 201 E SUPERIOR ST 60611 1177708.98999045 1905405.56110885 42 18 8 41.8956500849415 -87.62265814875258 (41.8956500849415, -87.62265814875258) 17 | 08/04/2014 Completed 08/22/2014 14-01266469 Pothole in Street Final Outcome Pothole Patched 8 220 W SUPERIOR ST 60654 1174518.27808694 1905317.80798624 42 18 8 41.89569109218158 -87.63509877442043 (41.89569109218158, -87.63509877442043) 18 | 08/03/2014 Completed 08/22/2014 14-01258054 Pothole in Street Final Outcome Pothole Patched 8 600 N WABASH AVE 60611 1176632.67288564 1904207.6496666 42 18 8 41.892531148204995 -87.62689596447262 (41.892531148204995, -87.62689596447262) 19 | 07/29/2014 Completed 08/28/2014 14-01228358 Pothole in Street Final Outcome Pothole Patched 7 10 W ELM ST 60610 1176002.41378993 1908093.80450502 42 18 8 41.90328038833401 -87.62904275288018 (41.90328038833401, -87.62904275288018) 20 | 07/29/2014 Completed 08/25/2014 14-01224309 Pothole in Street Final Outcome Pothole Patched 15 201 E HURON ST 60611 1177717.86567145 1905118.00892518 42 18 8 41.89486671392807 -87.62262561802254 (41.89486671392807, -87.62262561802254) 21 | 07/28/2014 Completed 08/27/2014 14-01208070 Pothole in Street Final Outcome Pothole Patched 2 177 N WELLS ST 60606 1174702.03147832 1901552.59999636 42 1 32 41.88525719405359 
-87.63376979601067 (41.88525719405359, -87.63376979601067) 22 | 07/26/2014 Completed 08/28/2014 14-01203517 Pothole in Street Final Outcome Pothole Patched 1 1 W MAPLE ST 60610 1176169.35729479 1907624.99998895 42 18 8 41.90177327913275 -87.62842192883082 (41.90177327913275, -87.62842192883082) 23 | 07/24/2014 Completed 08/22/2014 14-01190319 Pothole in Street Final Outcome Pothole Patched 3 730 N FRANKLIN ST 60654 1174191.71119404 1905308.4805198 42 18 8 41.89562283550215 -87.63582499692767 (41.89562283550215, -87.63582499692767) 24 | 07/23/2014 Completed 08/28/2014 14-01178461 Pothole in Street Final Outcome Pothole Patched 3 1155 N DEARBORN ST 60610 1175723.69360823 1908131.92908715 42 18 8 41.90344694644998 -87.62982489848619 (41.90344694644998, -87.62982489848619) 25 | 07/21/2014 Completed 08/22/2014 14-01161819 Pothole in Street Final Outcome Pothole Patched 3 600 E GRAND AVE 60611 1180764.02691294 1904055.10759636 42 18 8 41.892094136861786 -87.61156988394656 (41.892094136861786, -87.61156988394656) 26 | 07/18/2014 Completed 08/27/2014 14-01149914 Pothole in Street Final Outcome Pothole Patched 4 150 N FRANKLIN ST 60606 1174306.13841283 1901261.81180327 42 1 32 41.88449312034977 -87.63552722968024 (41.88449312034977, -87.63552722968024) 27 | 07/17/2014 Completed 08/22/2014 14-01134572 Pothole in Street Final Outcome Pothole Patched 2 600 N MICHIGAN AVE 60611 1177329.77956558 1904235.62916875 42 18 8 41.89259322553828 -87.6243340479495 (41.89259322553828, -87.6243340479495) 28 | 07/14/2014 Completed 08/28/2014 14-01112602 Pothole in Street Final Outcome Pothole Patched 15 170 W OAK ST 60610 1174950.17410849 1907124.5048948 42 18 8 41.90065230620216 -87.63419240464901 (41.90065230620216, -87.63419240464901) 29 | 07/14/2014 Completed 08/22/2014 14-01114977 Pothole in Street Final Outcome Pothole Patched 3 461 N CITYFRONT PLAZA DR 60611 1177942.61391555 1903427.32649215 42 18 8 41.89012269030228 -87.62144215679457 (41.89012269030228, -87.62144215679457) 30 | 
07/14/2014 Completed 08/27/2014 14-01114104 Pothole in Street Final Outcome Pothole Patched 13 462 N LAKE SHORE DR 60611 1180059.25771678 1903489.31249721 42 18 8 41.89043391381039 -87.61432635149913 (41.89043391381039, -87.61432635149913) 31 | 07/13/2014 Completed 08/22/2014 14-01106120 Pothole in Street Final Outcome Pothole Patched 2 37 E OHIO ST 60611 1176632.67288564 1904207.6496666 42 18 8 41.892389760220446 -87.6266700643506 (41.892389760220446, -87.6266700643506) 32 | 07/11/2014 Completed 08/27/2014 14-01092992 Pothole in Street Final Outcome Pothole Patched 0 517 N LAKE SHORE DR 60611 1180023.3999851 1903912.87000671 42 18 8 41.891747134984456 -87.61415723810663 (41.891747134984456, -87.61415723810663) 33 | 07/11/2014 Completed 08/28/2014 14-01092312 Pothole in Street Final Outcome Pothole Patched 2 1165 N LA SALLE DR 60610 1174923.84946801 1908163.01737071 42 18 8 41.90367107146938 -87.63276634924583 (41.90367107146938, -87.63276634924583) 34 | 07/09/2014 Completed 08/22/2014 14-01072126 Pothole in Street Final Outcome Pothole Patched 1 200 W OHIO ST 60654 1174634.07810009 1904152.08572945 42 18 8 41.892497422329804 -87.63416622539611 (41.892497422329804, -87.63416622539611) 35 | 07/08/2014 Completed 08/26/2014 14-01064944 Pothole in Street Final Outcome Pothole Patched 4 34 S STATE ST 60603 1176384.09122814 1900236.40989122 42 1 32 41.881194280111245 -87.62788715387323 (41.881194280111245, -87.62788715387323) 36 | 07/06/2014 Completed 08/22/2014 14-01050617 Pothole in Street Final Outcome Pothole Patched 5 430 N FRANKLIN ST 60654 1174247.46842821 1903265.17618927 42 18 8 41.890001134206734 -87.63568286112702 (41.890001134206734, -87.63568286112702) 37 | 06/30/2014 Completed 08/26/2014 14-01013381 Pothole in Street Final Outcome Pothole Patched 7 50 E JACKSON BLVD 60604 1176916.76437779 1899018.37548671 42 1 32 41.87836259235806 -87.62578989491375 (41.87836259235806, -87.62578989491375) 38 | 06/26/2014 Completed 08/27/2014 14-00992410 Pothole in Street 
Final Outcome Pothole Patched 6 201 N WELLS ST 60606 1174696.68801277 1901736.11738759 42 1 32 41.885758730147366 -87.63378404961261 (41.885758730147366, -87.63378404961261) 39 | 06/23/2014 Completed 08/22/2014 14-00961579 Pothole in Street Final Outcome Pothole Patched 1 340 W ERIE ST 60654 1173808.62199403 1904714.70549298 42 18 8 41.89405989499057 -87.63717643739275 (41.89405989499057, -87.63717643739275) 40 | 06/23/2014 Completed 08/26/2014 14-00967688 Pothole in Street Final Outcome Pothole Patched 12 140 S DEARBORN ST 60603 1175959.03010572 1899623.29000633 42 1 32 41.87967604011696 -87.62950228266759 (41.87967604011696, -87.62950228266759) 41 | 06/18/2014 Completed 08/27/2014 14-00933275 Pothole in Street Final Outcome Pothole Patched 15 211 N DEARBORN ST 60601 1175900.7237733 1901846.43477009 42 1 32 41.88604316092842 -87.62936052143365 (41.88604316092842, -87.62936052143365) 42 | 06/12/2014 Completed 08/27/2014 14-00897156 Pothole in Street Final Outcome Pothole Patched 42 200 S LA SALLE ST 60604 1175161.95537691 1899423.25992154 42 1 32 41.87936625147701 -87.63243763414764 (41.87936625147701, -87.63243763414764) 43 | 06/12/2014 Completed 08/22/2014 14-00890682 Pothole in Street Final Outcome Pothole Patched 1 306 W OHIO ST 60654 1174162.61920993 1904141.19087966 42 18 8 41.89247410656883 -87.63593436430777 (41.89247410656883, -87.63593436430777) 44 | 06/11/2014 Completed 08/28/2014 14-00887552 Pothole in Street Final Outcome Pothole Patched 4 1020 N STATE ST 60610 1176176.20887962 1907374.89001835 42 18 8 41.901200484161656 -87.62847652035359 (41.901200484161656, -87.62847652035359) 45 | 06/10/2014 Completed 08/27/2014 14-00880864 Pothole in Street Final Outcome Pothole Patched 0 245 W WASHINGTON ST 60606 1174540.03218467 1900805.68709955 42 1 32 41.883096426444126 -87.63515649580224 (41.883096426444126, -87.63515649580224) 46 | 06/04/2014 Completed 08/22/2014 14-00844247 Pothole in Street Final Outcome Pothole Patched 2 525 W SUPERIOR ST 60654 
1172505.49257168 1905270.97268573 42 18 8 41.895379256528365 -87.64251783750353 (41.895379256528365, -87.64251783750353) 47 | 06/03/2014 Completed 08/26/2014 14-00829362 Pothole in Street Final Outcome Pothole Patched 2 100 S STATE ST 60603 1176400.89309128 1899922.58000955 42 1 32 41.88071513784339 -87.62787607841581 (41.88071513784339, -87.62787607841581) 48 | 06/01/2014 Completed 08/26/2014 14-00819596 Pothole in Street Final Outcome Pothole Patched 1 21 S DEARBORN ST 60603 1175943.73762726 1900281.08996842 42 1 32 41.88163394997373 -87.62924355913282 (41.88163394997373, -87.62924355913282) 49 | 05/31/2014 Completed 08/28/2014 14-00816655 Pothole in Street Final Outcome Pothole Patched 2 1250 N DEARBORN ST 60610 1175716.58493337 1908364.4279693 42 18 8 41.905379013702905 -87.63017455348202 (41.905379013702905, -87.63017455348202) 50 | 05/29/2014 Completed 08/22/2014 14-00805945 Pothole in Street Final Outcome Pothole Patched 10 738 N LARRABEE ST 60654 1172208.49919151 1905332.36197604 42 18 8 41.89589422031982 -87.64313907101959 (41.89589422031982, -87.64313907101959) 51 | 05/29/2014 Completed 08/26/2014 14-00802449 Pothole in Street Final Outcome Pothole Patched 6 33 S STATE ST 60603 1176383.89122814 1900241.30989122 42 1 32 41.8812023900134 -87.62759359231444 (41.8812023900134, -87.62759359231444) 52 | 05/28/2014 Completed 08/22/2014 14-00797786 Pothole in Street Final Outcome Pothole Patched 2 720 N FRANKLIN ST 60654 1174193.91119404 1905217.9805198 42 18 8 41.89553241941632 -87.63582582624329 (41.89553241941632, -87.63582582624329) 53 | 05/26/2014 Completed 08/28/2014 14-00780187 Pothole in Street Final Outcome Pothole Patched 15 12 W ELM ST 60610 1175983.31378993 1908093.10450502 42 18 8 41.90327840141334 -87.62912853335304 (41.90327840141334, -87.62912853335304) 54 | 05/21/2014 Completed 08/28/2014 14-00753021 Pothole in Street Final Outcome Pothole Patched 10 30 E ELM ST 60611 1176399.91378993 1908106.10450502 42 18 8 41.903312908929266 -87.6274232482672 
(41.903312908929266, -87.6274232482672) 55 | 05/19/2014 Completed 08/27/2014 14-00736478 Pothole in Street Final Outcome Pothole Patched 10 520 E ILLINOIS ST 60611 1180045.15771678 1903729.01249721 42 18 8 41.891214491225654 -87.61421696505855 (41.891214491225654, -87.61421696505855) 56 | 04/08/2014 Completed 08/25/2014 14-00504762 Pothole in Street Final Outcome Pothole Patched 2 1300 N CALIFORNIA AVE 60622 1157492.02101534 1908524.16997455 26 14 24 41.904795555353125 -87.69707304441381 (41.904795555353125, -87.69707304441381) -------------------------------------------------------------------------------- /tests/tsv/standardized_text.tsv: -------------------------------------------------------------------------------- 1 | the quick 2 | brown fox 3 | jumps over the 4 | lazy dog -------------------------------------------------------------------------------- /tests/txt/raw_text.txt: -------------------------------------------------------------------------------- 1 | Little Bo peep has lost her sheep 2 | And doesn't know where to find them. 3 | Leave them alone and they'll come home, 4 | Bringing their tails behind them. 5 | Little Bo peep fell fast asleep 6 | And dreamt she heard them bleating, 7 | But when she awoke, she found it a joke, 8 | For they were all still fleeting. 9 | Then up she took her little crook 10 | Determined for to find them. 11 | She found them indeed, but it made her heart bleed, 12 | For they left their tails behind them. 13 | It happened one day, as Bo peep did stray 14 | Into a meadow hard by, 15 | There she espied their tails side by side 16 | All hung on a tree to dry. 17 | She heaved a sigh, and wiped her eye, 18 | And over the hillocks went rambling, 19 | And tried what she could, 20 | As a shepherdess should, 21 | To tack again each to its lambkin. 
22 | -------------------------------------------------------------------------------- /tests/txt/standardized_text.txt: -------------------------------------------------------------------------------- 1 | the quick brown fox jumps over the lazy dog 2 | -------------------------------------------------------------------------------- /tests/wav/raw_text.txt: -------------------------------------------------------------------------------- 1 | Everything Is Awesome 2 | -------------------------------------------------------------------------------- /tests/wav/raw_text.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/wav/raw_text.wav -------------------------------------------------------------------------------- /tests/wav/standardized_text.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/wav/standardized_text.wav -------------------------------------------------------------------------------- /tests/xls/raw_text.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/xls/raw_text.xls -------------------------------------------------------------------------------- /tests/xls/standardized_text.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/xls/standardized_text.xls -------------------------------------------------------------------------------- /tests/xlsx/raw_text.xlsx: -------------------------------------------------------------------------------- 
class AddToNamespaceAction(argparse.Action):
    """This adds KEY,VALUE arbitrary pairs to the argparse.Namespace object

    The option value must have the form ``KEY=VALUE``. Only the first
    ``=`` separates the key from the value, so the value itself may
    contain ``=`` characters.
    """
    def __call__(self, parser, namespace, values, option_string=None):
        """Store VALUE as attribute KEY on ``namespace``.

        Reports a usage error through ``parser.error`` (which exits) when
        the value has no ``=`` or when KEY is already set on the namespace.
        """
        # partition never raises: a plain split('=') blew up with a
        # ValueError traceback on "a=b=c" and on values without any "="
        key, separator, val = values.strip().partition('=')
        if not separator:
            parser.error(
                'Expected KEYWORD=VALUE with --option, got "%s".' % values
            )
        if hasattr(namespace, key):
            parser.error((
                'Duplicate specification of the key "%(key)s" with --option.'
            ) % {'key': key})
        setattr(namespace, key, val)
A full list of available KEYWORD options is ' 83 | 'available at http://bit.ly/textract-options' 84 | ), 85 | ) 86 | parser.add_argument( 87 | '-v', '--version', action='version', version='%(prog)s '+VERSION, 88 | ) 89 | 90 | # enable autocompletion with argcomplete 91 | argcomplete.autocomplete(parser) 92 | 93 | return parser 94 | 95 | 96 | def _get_available_encodings(): 97 | """Get a list of the available encodings to make it easy to 98 | tab-complete the command line interface. 99 | 100 | Inspiration from http://stackoverflow.com/a/3824405/564709 101 | """ 102 | available_encodings = set(encodings.aliases.aliases.values()) 103 | paths = [os.path.dirname(encodings.__file__)] 104 | for importer, modname, ispkg in pkgutil.walk_packages(path=paths): 105 | available_encodings.add(modname) 106 | available_encodings = list(available_encodings) 107 | available_encodings.sort() 108 | return available_encodings 109 | -------------------------------------------------------------------------------- /textract/colors.py: -------------------------------------------------------------------------------- 1 | """Inspiration from 2 | https://github.com/fabric/fabric/blob/master/fabric/colors.py 3 | """ 4 | import re 5 | 6 | 7 | def _wrap_with(code, bold=False): 8 | def inner(text): 9 | c = code 10 | if bold: 11 | c = "1;%s" % c 12 | return "\033[%sm%s\033[0m" % (c, text) 13 | return inner 14 | 15 | 16 | red = _wrap_with('31') 17 | green = _wrap_with('32') 18 | yellow = _wrap_with('33') 19 | blue = _wrap_with('34') 20 | magenta = _wrap_with('35') 21 | cyan = _wrap_with('36') 22 | white = _wrap_with('37') 23 | 24 | bold_red = _wrap_with('31', True) 25 | bold_green = _wrap_with('32', True) 26 | bold_yellow = _wrap_with('33', True) 27 | bold_blue = _wrap_with('34', True) 28 | bold_magenta = _wrap_with('35', True) 29 | bold_cyan = _wrap_with('36', True) 30 | bold_white = _wrap_with('37', True) 31 | 32 | 33 | # regular expression to omit colorcodes 34 | def colorless(text): 35 | 
class CommandLineError(Exception):
    """Base class for errors whose traceback is suppressed when they
    occur on the command line, so that the command line interface shows
    a useful message instead.
    """
    def render(self, msg):
        """Fill the ``%(name)s`` placeholders of ``msg`` from this
        instance's attributes and return the resulting string.
        """
        attributes = vars(self)
        return msg % attributes
class ShellError(CommandLineError):
    """This error is raised when a shell.run returns a non-zero exit code
    (meaning the command failed).
    """
    def __init__(self, command, exit_code, stdout, stderr):
        """Record the failed ``command`` string, its ``exit_code`` and its
        captured ``stdout`` / ``stderr``.
        """
        self.command = command
        self.exit_code = exit_code
        self.stdout = stdout
        self.stderr = stderr
        # The executable is the first whitespace-separated word of the
        # command. An empty or blank command has no such word; fall back
        # to '' rather than raising IndexError while building the error.
        words = self.command.split()
        self.executable = words[0] if words else ''

    def is_not_installed(self):
        """Return True on posix systems when the exit code is 127."""
        return os.name == 'posix' and self.exit_code == 127

    def not_installed_message(self):
        """Return the message shown when the executable is missing."""
        return (
            "The command `%(command)s` failed because the executable\n"
            "`%(executable)s` is not installed on your system. Please make\n"
            "sure the appropriate dependencies are installed before using\n"
            "textract:\n\n"
            "    http://textract.readthedocs.org/en/latest/installation.html\n"
        ) % vars(self)

    def failed_message(self):
        """Return the message with the exit code and the captured output."""
        return (
            "The command `%(command)s` failed with exit code %(exit_code)d\n"
            "------------- stdout -------------\n"
            "%(stdout)s"
            "------------- stderr -------------\n"
            "%(stderr)s"
        ) % vars(self)

    def __str__(self):
        if self.is_not_installed():
            return self.not_installed_message()
        else:
            return self.failed_message()
# Dictionary structure for synonymous file extension types
EXTENSION_SYNONYMS = {
    ".jpeg": ".jpg",
    ".tff": ".tiff",
    ".tif": ".tiff",
    ".htm": ".html",
    "": ".txt",
    ".log": ".txt",
    ".tab": ".tsv",
}

# default encoding that is returned by the process method. specify it
# here so the default is used on both the process function and also by
# the command line interface
DEFAULT_OUTPUT_ENCODING = 'utf_8'
DEFAULT_ENCODING = 'utf_8'

# filename format
_FILENAME_SUFFIX = '_parser'


def process(filename, input_encoding=None,
            output_encoding=DEFAULT_OUTPUT_ENCODING, extension=None,
            **kwargs):
    """This is the core function used for extracting text. It routes the
    ``filename`` to the appropriate parser and returns the extracted
    text as a byte-string encoded with ``output_encoding``.

    ``extension`` overrides the extension taken from ``filename`` (with
    or without the leading dot); ``input_encoding`` and any extra keyword
    arguments are handed to the selected parser's ``process`` method.

    Raises ``exceptions.MissingFileError`` when ``filename`` does not
    exist and ``exceptions.ExtensionNotSupported`` when the parser module
    for the extension cannot be imported.
    """

    # make sure the filename exists
    if not os.path.exists(filename):
        raise exceptions.MissingFileError(filename)

    # get the filename extension, which is something like .docx for
    # example. normally it is extracted from the file name; if the file
    # name has no extension, the user can pass it as an argument
    if extension:
        ext = extension
        # check if the extension has the leading .
        if not ext.startswith('.'):
            ext = '.' + ext
    else:
        _, ext = os.path.splitext(filename)
    ext = ext.lower()

    # check the EXTENSION_SYNONYMS dictionary
    ext = EXTENSION_SYNONYMS.get(ext, ext)

    # to avoid conflicts with packages that are installed globally
    # (e.g. python's json module), all extension parser modules have
    # the _parser extension. the leading dot of ``ext`` makes this a
    # relative import, so the name of the package is necessary
    rel_module = ext + _FILENAME_SUFFIX

    # If we can't import the module, the file extension isn't currently
    # supported
    try:
        filetype_module = importlib.import_module(
            rel_module, 'textract.parsers'
        )
    except ImportError:
        raise exceptions.ExtensionNotSupported(ext)

    # do the extraction
    parser = filetype_module.Parser()
    return parser.process(filename, input_encoding, output_encoding, **kwargs)


def _get_available_extensions():
    """Get a list of available file extensions to make it easy for
    tab-completion and exception handling.

    Returns a sorted list holding every extension both with and without
    the leading dot, collected from the ``*_parser.py`` modules that sit
    next to this file and from the non-empty keys of
    ``EXTENSION_SYNONYMS``.
    """
    extensions = []

    # from filenames. match on the basename only, so characters in the
    # directory path can never be interpreted as regex syntax
    parsers_dir = os.path.join(os.path.dirname(__file__))
    glob_filename = os.path.join(parsers_dir, "*" + _FILENAME_SUFFIX + ".py")
    ext_re = re.compile(
        r"^(?P<ext>\w+)" + re.escape(_FILENAME_SUFFIX) + r"\.py$"
    )
    for filename in glob.glob(glob_filename):
        ext_match = ext_re.match(os.path.basename(filename))
        if ext_match is None:
            # not a ``<ext>_parser.py`` module name; ignore it
            continue
        ext = ext_match.group('ext')
        extensions.append(ext)
        extensions.append('.' + ext)

    # from relevant synonyms (don't use the '' synonym)
    for ext in EXTENSION_SYNONYMS:
        if ext:
            extensions.append(ext)
            extensions.append(ext.replace('.', '', 1))
    extensions.sort()
    return extensions
class Parser(BaseParser):
    """Extract text from comma separated values files (.csv).

    Subclasses only need to override ``delimiter`` to handle other
    single-character separators.
    """

    delimiter = ','

    def extract(self, filename, **kwargs):
        """Return the file's rows joined by newlines, with the cells of
        each row joined by tabs.
        """
        # quick 'n dirty solution for the time being
        lines = []
        with open(filename) as handle:
            for cells in csv.reader(handle, delimiter=self.delimiter):
                lines.append('\t'.join(cells))
        return '\n'.join(lines)
class Parser(BaseParser):
    """Extract text from epub"""

    def extract(self, filename, **kwargs):
        """Return the text of every title, paragraph and h1-h4 heading
        found in the book's html sections, one per line, following the
        order in which the sections are listed in the spine.
        """
        html_content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4']
        pieces = []
        # the context manager guarantees the archive is closed even when
        # parsing fails part-way through
        with zipfile.ZipFile(filename) as book:
            for text_name in self.__epub_sections(book):
                if not text_name.endswith("html"):
                    continue
                with book.open(text_name) as section:
                    soup = BeautifulSoup(section, features='lxml')
                for child in soup.find_all(html_content_tags):
                    inner_text = child.text.strip() if child.text else ""
                    if inner_text:
                        pieces.append(inner_text + '\n')
        return ''.join(pieces)

    def __epub_sections(self, book):
        """Return the archive paths of the spine items, in spine order."""
        opf_paths = self.__get_opf_paths(book)
        item_paths = self.__get_item_paths(book, opf_paths)
        return item_paths

    def __get_opf_paths(self, book):
        """Return the ``full-path`` of every rootfile declared in
        ``META-INF/container.xml``.
        """
        with book.open("META-INF/container.xml") as meta_inf:
            meta_soup = BeautifulSoup(meta_inf, features='lxml')
        return [f["full-path"] for f in meta_soup.rootfiles.find_all("rootfile")]

    def __get_item_paths(self, book, opf_paths):
        """Resolve every spine ``itemref`` of every OPF file to a path
        inside the archive.

        References whose manifest item or archive member cannot be found
        are skipped, so a single dangling reference does not abort the
        whole extraction.
        """
        item_paths = []
        for opf_path in opf_paths:
            with book.open(opf_path) as opf_file:
                opf_soup = BeautifulSoup(opf_file, "lxml")
            for epub_item in opf_soup.spine.find_all("itemref"):
                item = self.__get_item(opf_soup, epub_item["idref"])
                if item is None:
                    continue
                full_path = self.__get_full_item_path(book, item["href"])
                if full_path is not None:
                    item_paths.append(full_path)
        return item_paths

    def __get_item(self, opf_soup, item_id):
        """Return the manifest ``item`` whose id is ``item_id``, or None."""
        for item in opf_soup.manifest.find_all("item"):
            if item["id"] == item_id:
                return item
        return None

    def __get_full_item_path(self, book, partial_path):
        """Return the first archive member whose name ends with
        ``partial_path``, or None when there is no such member.
        """
        for filename in book.namelist():
            if filename.endswith(partial_path):
                return filename
        return None
29 | """ 30 | if element.name in self._disallowed_names: 31 | return False 32 | elif re.match(u'', six.text_type(element.extract())): 33 | return False 34 | return True 35 | 36 | def _inline(self, element): 37 | """Used to check whether given element can be treated as inline 38 | element (without new line after). 39 | """ 40 | if element.name in self._inline_tags: 41 | return True 42 | return False 43 | 44 | def _find_any_text(self, tag): 45 | """Looks for any possible text within given tag. 46 | """ 47 | text = '' 48 | if tag is not None: 49 | text = six.text_type(tag) 50 | text = re.sub(r'(<[^>]+>)', '', text) 51 | text = re.sub(r'\s', ' ', text) 52 | text = text.strip() 53 | return text 54 | 55 | def _parse_tables(self, soup): 56 | """Returns array containing basic informations about tables for ASCII 57 | replacement (look: _replace_tables()). 58 | """ 59 | tables = [] 60 | for t in soup.find_all('table'): 61 | t_dict = {'width': 0, 'table': t, 'trs': [], 'col_width': {}} 62 | trs = t.find_all('tr') 63 | if len(trs) > 0: 64 | for tr in trs: 65 | tr_dict = [] 66 | tds = tr.find_all('th') + tr.find_all('td') 67 | if len(tds) > 0: 68 | for i, td in enumerate(tds): 69 | td_text = self._find_any_text(td) 70 | length = len(td_text) 71 | if i in t_dict['col_width']: 72 | t_dict['col_width'][i] = max( 73 | length, 74 | t_dict['col_width'][i] 75 | ) 76 | else: 77 | t_dict['col_width'][i] = length 78 | tr_dict.append({ 79 | 'text': td_text, 80 | 'colspan': int(td.get('colspan', 1)), 81 | }) 82 | t_dict['trs'].append(tr_dict) 83 | for col in t_dict['col_width']: 84 | t_dict['width'] += t_dict['col_width'][col] 85 | tables.append(t_dict) 86 | return tables 87 | 88 | def _replace_tables(self, soup, v_separator=' | ', h_separator='-'): 89 | """Replaces elements with its ASCII equivalent. 
class Parser(ShellParser):
    """Extract text from various image file formats using tesseract-ocr"""

    def extract(self, filename, **kwargs):
        """Run tesseract on ``filename`` and return what it writes to
        stdout.
        """
        command = ['tesseract', filename, 'stdout']

        # if language given as argument, specify language for tesseract to use
        if 'language' in kwargs:
            command.extend(['-l', kwargs['language']])

        output, _ = self.run(command)
        return output
def ensure_bytes(string):
    """Normalize string to bytes.

    `ExtractMsg.Message._getStringStream` can return unicode or bytes depending
    on what is originally stored in message file.

    This helper function makes sure that a bytes type is returned:
    unicode text is encoded as UTF-8, byte strings are passed through
    untouched, and ``None`` becomes an empty byte string so the result
    can always be concatenated.
    """
    if string is None:
        return six.b('')
    # test for text only: on Python 2 ``six.string_types`` also matches
    # byte strings, and calling ``.encode`` on those forces an implicit
    # ASCII decode that fails on non-ASCII content
    if isinstance(string, six.text_type):
        return string.encode('utf-8')
    return string


class Parser(BaseParser):
    """Extract text from Microsoft Outlook files (.msg)
    """

    def extract(self, filename, **kwargs):
        """Return the message subject and body as bytes, separated by a
        blank line.
        """
        m = extract_msg.Message(filename)
        return ensure_bytes(m.subject) + six.b('\n\n') + ensure_bytes(m.body)
class Parser(ShellParser):
    """Extract text from pdf files using either the ``pdftotext`` method
    (default), the ``pdfminer`` method or the ``tesseract`` (OCR) method.
    """

    def extract(self, filename, method='', **kwargs):
        """Dispatch to the extraction routine selected by ``method``.

        An empty ``method`` tries ``pdftotext`` first and falls back to
        pdfminer when the executable is not installed. Raises
        ``UnknownMethod`` for any other unrecognized value.
        """
        if method == '' or method == 'pdftotext':
            try:
                return self.extract_pdftotext(filename, **kwargs)
            except ShellError as ex:
                # If pdftotext isn't installed and the pdftotext method
                # wasn't specified, then gracefully fallback to using
                # pdfminer instead.
                if method == '' and ex.is_not_installed():
                    return self.extract_pdfminer(filename, **kwargs)
                # bare raise keeps the original traceback
                raise

        elif method == 'pdfminer':
            return self.extract_pdfminer(filename, **kwargs)
        elif method == 'tesseract':
            return self.extract_tesseract(filename, **kwargs)
        else:
            raise UnknownMethod(method)

    def extract_pdftotext(self, filename, **kwargs):
        """Extract text from pdfs using the pdftotext command line utility."""
        # the mere presence of the ``layout`` keyword enables -layout
        if 'layout' in kwargs:
            args = ['pdftotext', '-layout', filename, '-']
        else:
            args = ['pdftotext', filename, '-']
        stdout, _ = self.run(args)
        return stdout

    def extract_pdfminer(self, filename, **kwargs):
        """Extract text from pdfs using pdfminer.

        Tries to execute ``pdf2txt.py`` directly. If the operating system
        refuses to launch it (an ``OSError`` other than "not found",
        which ``run`` reports as ``ShellError``), the script is located
        on the PATH and run through ``python3``, then ``python2``.
        """
        try:
            stdout, _ = self.run(['pdf2txt.py', filename])
        except OSError:
            pdf2txt_path = find_executable('pdf2txt.py')
            if pdf2txt_path is None:
                # nothing to hand to an interpreter; surface the
                # original failure instead of passing None to the
                # subprocess
                raise
            try:
                stdout, _ = self.run(['python3', pdf2txt_path, filename])
            except ShellError:
                stdout, _ = self.run(['python2', pdf2txt_path, filename])
        return stdout

    def extract_tesseract(self, filename, **kwargs):
        """Extract text from pdfs using tesseract (per-page OCR)."""
        temp_dir = mkdtemp()
        base = os.path.join(temp_dir, 'conv')
        contents = []
        try:
            # render every page to an image named after ``base``
            self.run(['pdftoppm', filename, base])

            for page in sorted(os.listdir(temp_dir)):
                page_path = os.path.join(temp_dir, page)
                page_content = TesseractParser().extract(page_path, **kwargs)
                contents.append(page_content)
            return six.b('').join(contents)
        finally:
            # always remove the rendered pages, even on failure
            shutil.rmtree(temp_dir)
class Parser(ShellParser):
    """Extract text from postscript files using ps2ascii command.
    """

    def extract(self, filename, **kwargs):
        """Return the stdout of ``ps2ascii`` run on ``filename``."""
        output = self.run(['ps2ascii', filename])
        return output[0]
class BaseParser(object):
    """The :class:`.BaseParser` abstracts out some common functionality
    that is used across all document Parsers. In particular, it has
    the responsibility of handling all unicode and byte-encoding.
    """

    def extract(self, filename, **kwargs):
        """This method must be overwritten by child classes to extract raw
        text from a filename. This method can return either a
        byte-encoded string or unicode.
        """
        raise NotImplementedError('must be overwritten by child classes')

    def encode(self, text, encoding):
        """Encode the ``text`` in ``encoding`` byte-encoding. This ignores
        code points that can't be encoded in byte-strings.
        """
        return text.encode(encoding, 'ignore')

    def process(self, filename, input_encoding, output_encoding="utf8", **kwargs):
        """Process ``filename`` and return its text as a byte-string
        encoded with ``output_encoding``. This method is called by
        :func:`textract.parsers.process` and wraps the
        :meth:`.BaseParser.extract` method in a "unicode sandwich".
        """
        # make a "unicode sandwich" to handle dealing with unknown
        # input byte strings and converting them to a predictable
        # output encoding
        # http://nedbatchelder.com/text/unipain/unipain.html#35
        byte_string = self.extract(filename, **kwargs)
        unicode_string = self.decode(byte_string, input_encoding)
        return self.encode(unicode_string, output_encoding)

    def decode(self, text, input_encoding=None):
        """Decode ``text`` to unicode.

        Unicode input is returned as is and empty input yields ``u''``.
        Otherwise ``input_encoding`` is used when given (strictly, so a
        wrong encoding raises); without it the encoding is guessed with
        the chardet package, falling back to UTF-8 when the guess is
        missing, not confident enough or not a codec Python knows.
        """
        # only decode byte strings into unicode if it hasn't already
        # been done by a subclass
        if isinstance(text, six.text_type):
            return text

        # empty text? nothing to decode
        if not text:
            return u''

        # use the provided encoding
        if input_encoding:
            return text.decode(input_encoding)

        # use chardet to automatically detect the encoding text if no
        # encoding is provided
        result = chardet.detect(text)
        encoding = result.get('encoding')
        confidence = result.get('confidence') or 0.0
        if not encoding or not confidence > 0.80:
            encoding = 'utf8'
        try:
            return text.decode(encoding, errors="replace")
        except LookupError:
            # chardet named a codec that Python does not provide
            return text.decode('utf8', errors="replace")
class Parser(BaseParser):
    """Extract text from Excel files (.xls/xlsx).
    """

    def extract(self, filename, **kwargs):
        """Return the cell values of every sheet as text.

        Each row with at least one truthy cell becomes one line, its
        cells separated by single spaces; falsy cells (empty strings,
        zeros) are left out. The result starts with a newline.
        """
        workbook = xlrd.open_workbook(filename)
        lines = []
        for sheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(sheet_name)
            num_rows = worksheet.nrows
            num_cells = worksheet.ncols

            for curr_row in xrange(num_rows):
                new_output = []
                for index_col in xrange(num_cells):
                    value = worksheet.cell_value(curr_row, index_col)
                    if value:
                        # numbers are stored as int/float; make them text
                        if isinstance(value, (int, float)):
                            value = six.text_type(value)
                        new_output.append(value)
                if new_output:
                    lines.append(u' '.join(new_output) + u'\n')

        output = "\n"
        if lines:
            output += u''.join(lines)
        return output