├── .coveragerc
├── .github
    └── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
├── .gitignore
├── .pyup.yml
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.rst
├── Vagrantfile
├── bin
    └── textract
├── docs
    ├── Makefile
    ├── changelog.rst
    ├── command_line_interface.rst
    ├── conf.py
    ├── contributing.rst
    ├── index.rst
    ├── installation.rst
    └── python_package.rst
├── provision
    ├── debian.sh
    ├── development.sh
    ├── python2.sh
    ├── python3.sh
    └── travis-mock.sh
├── requirements
    ├── debian
    ├── freebsd
    ├── python
    ├── python-dev2
    ├── python-dev3
    └── python-doc
├── setup.cfg
├── setup.py
├── tests
    ├── Dockerfile
    ├── Makefile
    ├── __init__.py
    ├── base.py
    ├── csv
    │   ├── raw_text.csv
    │   ├── raw_text.txt
    │   └── standardized_text.csv
    ├── doc
    │   ├── raw_text.doc
    │   ├── raw_text.txt
    │   ├── standardized_text.doc
    │   └── standardized_text_1.odt
    ├── docker_entry.sh
    ├── docx
    │   ├── paragraphs_and_tables.docx
    │   ├── paragraphs_and_tables.txt
    │   ├── raw_text.docx
    │   ├── raw_text.txt
    │   └── standardized_text.docx
    ├── eml
    │   ├── raw_text.eml
    │   ├── raw_text.txt
    │   └── standardized_text.eml
    ├── epub
    │   ├── raw_text.epub
    │   ├── raw_text.txt
    │   └── standardized_text.epub
    ├── gif
    │   ├── raw_text.gif
    │   └── standardized_text.gif
    ├── html
    │   ├── raw_text.html
    │   ├── raw_text.txt
    │   ├── standardized_text.html
    │   ├── tables.html
    │   └── tables.txt
    ├── jpg
    │   ├── raw_text.jpg
    │   └── standardized_text.jpg
    ├── json
    │   ├── raw_text.json
    │   ├── raw_text.txt
    │   └── standardized_text.json
    ├── mp3
    │   ├── raw_text-m=google.txt
    │   ├── raw_text-m=sphinx.txt
    │   ├── raw_text.mp3
    │   ├── raw_text.txt
    │   └── standardized_text.mp3
    ├── msg
    │   ├── raw_text.msg
    │   ├── raw_text.txt
    │   └── standardized_text.msg
    ├── no_ext
    │   ├── docx_paragraphs_and_tables
    │   ├── msg_standardized_text
    │   └── pdf_standardized_text
    ├── odt
    │   ├── raw_text.odt
    │   ├── raw_text.txt
    │   └── standardized_text.odt
    ├── ogg
    │   ├── raw_text.ogg
    │   ├── raw_text.txt
    │   └── standardized_text.ogg
    ├── pdf
    │   ├── ocr_text.pdf
    │   ├── raw_text-m=pdfminer.txt
    │   ├── raw_text.pdf
    │   ├── raw_text.txt
    │   ├── standardized_text.pdf
    │   ├── two_column.pdf
    │   └── two_column.txt
    ├── png
    │   ├── raw_text.png
    │   └── standardized_text.png
    ├── pptx
    │   ├── raw_text.pptx
    │   ├── raw_text.txt
    │   └── standardized_text.pptx
    ├── ps
    │   ├── raw_text.ps
    │   ├── raw_text.txt
    │   └── standardized_text.ps
    ├── psv
    │   ├── raw_text.psv
    │   ├── raw_text.txt
    │   └── standardized_text.psv
    ├── rtf
    │   ├── raw_text.rtf
    │   ├── raw_text.txt
    │   └── standardized_text.rtf
    ├── run.py
    ├── run_docker_tests.sh
    ├── test_csv.py
    ├── test_doc.py
    ├── test_docx.py
    ├── test_eml.py
    ├── test_epub.py
    ├── test_exceptions.py
    ├── test_gif.py
    ├── test_html.py
    ├── test_jpg.py
    ├── test_json.py
    ├── test_mp3.py
    ├── test_msg.py
    ├── test_no_ext.py
    ├── test_odt.py
    ├── test_ogg.py
    ├── test_pdf.py
    ├── test_png.py
    ├── test_pptx.py
    ├── test_ps.py
    ├── test_psv.py
    ├── test_rtf.py
    ├── test_tiff.py
    ├── test_tsv.py
    ├── test_txt.py
    ├── test_wav.py
    ├── test_xls.py
    ├── test_xlsx.py
    ├── tiff
    │   ├── raw_text.tiff
    │   └── standardized_text.tiff
    ├── tsv
    │   ├── raw_text.tsv
    │   ├── raw_text.txt
    │   └── standardized_text.tsv
    ├── txt
    │   ├── raw_text.txt
    │   └── standardized_text.txt
    ├── wav
    │   ├── raw_text.txt
    │   ├── raw_text.wav
    │   └── standardized_text.wav
    ├── xls
    │   ├── raw_text.txt
    │   ├── raw_text.xls
    │   └── standardized_text.xls
    └── xlsx
    │   ├── raw_text.txt
    │   ├── raw_text.xlsx
    │   └── standardized_text.xlsx
└── textract
    ├── __init__.py
    ├── cli.py
    ├── colors.py
    ├── exceptions.py
    └── parsers
        ├── __init__.py
        ├── audio.py
        ├── csv_parser.py
        ├── doc_parser.py
        ├── docx_parser.py
        ├── eml_parser.py
        ├── epub_parser.py
        ├── gif_parser.py
        ├── html_parser.py
        ├── image.py
        ├── jpg_parser.py
        ├── json_parser.py
        ├── mp3_parser.py
        ├── msg_parser.py
        ├── odt_parser.py
        ├── ogg_parser.py
        ├── pdf_parser.py
        ├── png_parser.py
        ├── pptx_parser.py
        ├── ps_parser.py
        ├── psv_parser.py
        ├── rtf_parser.py
        ├── tiff_parser.py
        ├── tsv_parser.py
        ├── txt_parser.py
        ├── utils.py
        ├── wav_parser.py
        ├── xls_parser.py
        └── xlsx_parser.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | omit =
3 |     */python?.?/*
4 |     */site-packages/nose/*
5 |     textract/cli.py
6 |     textract/colors.py
7 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. Windows 10]
28 |  - Textract version [e.g. 1.6.3]
29 |  - Python version [e.g. 3.7]
30 |  - Virtual environment (yes/no)
31 | 
32 | **Additional context**
33 | Add any other context about the problem here.
34 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Which file types should textract support?**
14 | A clear and concise description of file types you think textract should be able to process.
15 | 
16 | **Which external software (Python package or command-line tool) can parse the requested file type?**
17 | A clear and concise description of tools that can parse the desired file type.
18 | 
19 | **Describe alternatives you've considered**
20 | A clear and concise description of any alternative solutions or features you've considered.
21 | 
22 | **Additional context**
23 | Add any other context or screenshots about the feature request here.
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | eggs/
15 | lib/
16 | lib64/
17 | parts/
18 | sdist/
19 | var/
20 | *.egg-info/
21 | .installed.cfg
22 | *.egg
23 | 
24 | # Installer logs
25 | pip-log.txt
26 | pip-delete-this-directory.txt
27 | 
28 | # Virtual environments
29 | **/venv*
30 | 
31 | # Unit test / coverage reports
32 | htmlcov/
33 | .tox/
34 | .coverage
35 | .cache
36 | nosetests.xml
37 | coverage.xml
38 | 
39 | # Translations
40 | *.mo
41 | 
42 | # Mr Developer
43 | .mr.developer.cfg
44 | .project
45 | .pydevproject
46 | 
47 | # Rope
48 | .ropeproject
49 | 
50 | # Django stuff:
51 | *.log
52 | *.pot
53 | 
54 | # Sphinx documentation
55 | docs/build/
56 | docs/textract/
57 | 
58 | # vagrant
59 | .vagrant
60 | 
61 | # ignore big testing files that are dynamically downloaded
62 | tests/pdf/large.pdf
63 | 
64 | # ignore text files (raw_text, standardized_text, ocr_text) that are dynamically generated during testing
65 | tests/png/raw_text.txt
66 | tests/gif/raw_text.txt
67 | tests/jpg/raw_text.txt
68 | tests/tiff/raw_text.txt
69 | tests/png/standardized_text.txt
70 | tests/gif/standardized_text.txt
71 | tests/jpg/standardized_text.txt
72 | tests/tiff/standardized_text.txt
73 | tests/pdf/ocr_text.txt
74 | 


--------------------------------------------------------------------------------
/.pyup.yml:
--------------------------------------------------------------------------------
 1 | update: all
 2 | branch: master
 3 | schedule: "every two weeks"
 4 | pin: False
 5 | requirements:
 6 |   - requirements/python:
 7 |       updates: all
 8 |   - requirements/python-dev:
 9 |       updates: all
10 |   - requirements/python-doc:
11 |       updates: all
12 | assignees:
13 |   - deanmalmgren
14 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | dist: focal
 2 | os: linux
 3 | 
 4 | language: python
 5 | python:
 6 |   - "2.7"
 7 |   - "3.7"
 8 | 
 9 | # install system dependencies here with apt-get.
10 | before_install:
11 |   - sudo ./provision/debian.sh
12 |   - python -m pip install --upgrade pip
13 | 
14 | # install python dependencies including this package in the travis
15 | # virtualenv
16 | install:
17 | 
18 |   - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]];
19 |         then ./provision/python3.sh;
20 |         fi
21 |   - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]];
22 |         then ./provision/python2.sh;
23 |         fi
24 |   - pip install .[pocketsphinx]
25 | 
26 | # commands to run the testing suite. if any of these fail, travis lets us know
27 | script:
28 |   - cd tests && make && cd -
29 |   - nosetests --with-coverage --cover-package=textract
30 |   - cd tests && pytest && cd -
31 | #  - pycodestyle textract/ bin/textract
32 |   - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]];
33 |         then cd docs && make html && cd -;
34 |         fi
35 | 
36 | # commands to run after the tests successfully complete
37 | after_success:
38 |   - coveralls
39 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | [![Jazzband](https://jazzband.co/static/img/jazzband.svg)](https://jazzband.co/)
2 | 
3 | This is a [Jazzband](https://jazzband.co/) project. By contributing you agree to
4 | abide by the [Contributor Code of Conduct](https://jazzband.co/about/conduct)
5 | and follow the [guidelines](https://jazzband.co/about/guidelines).


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Dean Malmgren
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements/*
2 | include MANIFEST.in
3 | include README.rst
4 | include LICENSE
5 | recursive-exclude * *.py[co]
6 | recursive-exclude * *~ 
7 | recursive-exclude * *.orig
8 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | .. NOTES FOR CREATING A RELEASE:
 2 | ..
 3 | ..   * bumpversion {major|minor|patch}
 4 | ..   * git push && git push --tags
 5 | ..   * twine upload -r textract dist/*
 6 | ..   * convert into release https://github.com/deanmalmgren/textract/releases
 7 | 
 8 | textract
 9 | ========
10 | 
11 | Extract text from any document. No muss. No fuss.
12 | 
13 | `Full documentation <http://textract.readthedocs.org>`__.
14 | 
15 | Originally written by @deanmalmgren. Maintained by the good people at
16 | @jazzband |Jazz Band|
17 | 
18 | |Build Status| |Version| |Downloads| |Test Coverage| |Documentation Status|
19 | |Updates| |Stars| |Forks|
20 | 
21 | .. |Jazz Band| image:: https://jazzband.co/static/img/badge.svg
22 |    :target: https://jazzband.co/
23 |    :alt: Jazzband
24 | 
25 | .. |Build Status| image:: https://travis-ci.org/deanmalmgren/textract.svg?branch=master
26 |    :target: https://travis-ci.org/deanmalmgren/textract
27 | 
28 | .. |Version| image:: https://img.shields.io/pypi/v/textract.svg
29 |    :target: https://warehouse.python.org/project/textract/
30 | 
31 | .. |Downloads| image:: https://img.shields.io/pypi/dm/textract.svg
32 |    :target: https://warehouse.python.org/project/textract/
33 | 
34 | .. |Test Coverage| image:: https://coveralls.io/repos/github/deanmalmgren/textract/badge.svg?branch=master
35 |     :target: https://coveralls.io/github/deanmalmgren/textract?branch=master
36 | 
37 | .. |Documentation Status| image:: https://readthedocs.org/projects/textract/badge/?version=latest
38 |    :target: https://readthedocs.org/projects/textract/?badge=latest
39 | 
40 | .. |Updates| image:: https://pyup.io/repos/github/deanmalmgren/textract/shield.svg
41 |     :target: https://pyup.io/repos/github/deanmalmgren/textract/
42 | 
43 | .. |Stars| image:: https://img.shields.io/github/stars/deanmalmgren/textract.svg
44 |     :target: https://github.com/deanmalmgren/textract/stargazers
45 | 
46 | .. |Forks| image:: https://img.shields.io/github/forks/deanmalmgren/textract.svg
47 |     :target: https://github.com/deanmalmgren/textract/network
48 | 


--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
 1 | # -*- mode: ruby -*-
 2 | # vi: set ft=ruby :
 3 | 
 4 | # Vagrant ships its own ruby environment, so gems required by this
 5 | # file (iniparse, for example) must be installed as vagrant plugins.
 6 | # To install one, run:
 7 | #
 8 | # $ vagrant plugin install iniparse
 9 | #
10 | # For more details, check out:
11 | # https://docs.vagrantup.com/v2/cli/plugin.html
12 | 
13 | require 'iniparse'
14 | 
15 | Vagrant.configure("2") do |config|
16 | 
17 |   # preliminaries
18 |   root_dir = File.dirname(__FILE__)
19 | 
20 |   #################################################### VIRTUALBOX PROVIDER SETUP
21 |   # global configuration on the virtualbox provider. for all available
22 |   # options, see http://www.virtualbox.org/manual/ch08.html
23 |   virtualbox_server_name = "dev"
24 |   config.vm.provider :virtualbox do |vb, override_config|
25 |     vb.gui = false
26 |     # http://stackoverflow.com/a/17126363/892506
27 |     vb.customize ["modifyvm", :id, "--ioapic", "on"]
28 |     vb.customize ["modifyvm", :id, "--cpus", "2"]
29 |     vb.customize ["modifyvm", :id, "--memory", "2048"]
30 |     override_config.vm.box = "trusty64"
31 |     override_config.vm.box_url = "https://cloud-images.ubuntu.com/vagrant/trusty/current/trusty-server-cloudimg-amd64-vagrant-disk1.box"
32 |   end
33 | 
34 |   # steps for provisioning so that these provisioning steps are
35 |   # properly executed in this virtual machine and also on travis-ci
36 |   def provision_script(config, script_path)
37 |     config.vm.provision "shell" do |s|
38 |       s.path = script_path
39 |       s.args = "/vagrant"
40 |     end
41 |   end
42 | 
43 | 
44 |   ################################################################# LOCAL SERVER
45 |   config.vm.define virtualbox_server_name do |server_config|
46 |     server_config.vm.hostname = virtualbox_server_name
47 | 
48 |     # NOTE: this is a tentative hack. the way to properly do this
49 |     # would be to use the official ci-environments
50 |     # http://docs.travis-ci.com/user/ci-environment/, which are built
51 |     # using chef recipes from here
52 |     # https://github.com/travis-ci/travis-cookbooks/
53 |     provision_script(server_config, "provision/travis-mock.sh")
54 | 
55 |     # these are the same provisioning steps that are done on travis-ci
56 |     # as on the virtual machine
57 |     provision_script(server_config, "provision/debian.sh")
58 |     provision_script(server_config, "provision/python.sh")
59 | 
60 |     # these provisioning steps are only done locally as a convenience
61 |     # for setting up a useful development environment
62 |     provision_script(server_config, "provision/development.sh")
63 |   end
64 | 
65 | end
66 | 


--------------------------------------------------------------------------------
/bin/textract:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- mode: python -*-
 3 | # PYTHON_ARGCOMPLETE_OK
 4 | 
 5 | """
 6 | Command-line application.
 7 | """
 8 | 
 9 | import sys
10 | 
11 | from textract.cli import get_parser
12 | from textract import process
13 | from textract.exceptions import CommandLineError
14 | from textract.colors import red
15 | 
16 | 
17 | # extract text
18 | def main():
19 |     """Interpret the command-line arguments, process the document and
20 |     raise errors accordingly (with traceback surpressed).
21 |     """
22 |     parser = get_parser()
23 |     args = parser.parse_args()
24 |     try:
25 |         output = process(**vars(args))
26 |     except CommandLineError as ex:
27 |         sys.stderr.write(red(ex) + '\n')
28 |         sys.exit(1)
29 |     else:
30 |         args.output.write(output)
31 | 
32 | 
33 | main()
34 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = build
  9 | APIDOC        = sphinx-apidoc
 10 | TEXTRACT      = 
 11 | APIDOC_IGNORE = ../textract/colors.py
 12 | 
 13 | # User-friendly check for sphinx-build
 14 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 15 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 16 | endif
 17 | 
 18 | # Internal variables.
 19 | PAPEROPT_a4     = -D latex_paper_size=a4
 20 | PAPEROPT_letter = -D latex_paper_size=letter
 21 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 22 | # the i18n builder cannot share the environment and doctrees with the others
 23 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 24 | 
 25 | .PHONY: help clean apidoc html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest gettext xml pseudoxml
 26 | 
 27 | help:
 28 | 	@echo "Please use \`make <target>' where <target> is one of"
 29 | 	@echo "  html       to make standalone HTML files"
 30 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 31 | 	@echo "  singlehtml to make a single large HTML file"
 32 | 	@echo "  pickle     to make pickle files"
 33 | 	@echo "  json       to make JSON files"
 34 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 35 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 36 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 37 | 	@echo "  epub       to make an epub"
 38 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 39 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 40 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 41 | 	@echo "  text       to make text files"
 42 | 	@echo "  man        to make manual pages"
 43 | 	@echo "  texinfo    to make Texinfo files"
 44 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 45 | 	@echo "  gettext    to make PO message catalogs"
 46 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 47 | 	@echo "  xml        to make Docutils-native XML files"
 48 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 49 | 	@echo "  linkcheck  to check all external links for integrity"
 50 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 51 | 
 52 | clean:
 53 | 	rm -rf $(BUILDDIR)/*
 54 | 
 55 | apidoc:
 56 | #	$(APIDOC) --force --no-toc ../textract --output-dir=./textract $(APIDOC_IGNORE)
 57 | #	@cat python_package > python_package.rst
 58 | #	@grep -A8 " module" textract/textract.parsers.rst >> python_package.rst
 59 | #	@echo "" >> python_package.rst
 60 | #	@grep -A8 " module" textract/textract.rst >> python_package.rst
 61 | #	@rm -rf ./textract
 62 | 
 63 | html: apidoc
 64 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 65 | 	@echo
 66 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 67 | 
 68 | dirhtml: apidoc
 69 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 70 | 	@echo
 71 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 72 | 
 73 | singlehtml: apidoc
 74 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 75 | 	@echo
 76 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 77 | 
 78 | pickle: apidoc
 79 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 80 | 	@echo
 81 | 	@echo "Build finished; now you can process the pickle files."
 82 | 
 83 | json: apidoc
 84 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 85 | 	@echo
 86 | 	@echo "Build finished; now you can process the JSON files."
 87 | 
 88 | htmlhelp: apidoc
 89 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 90 | 	@echo
 91 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 92 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 93 | 
 94 | qthelp: apidoc
 95 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 96 | 	@echo
 97 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 98 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 99 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/textract.qhcp"
100 | 	@echo "To view the help file:"
101 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/textract.qhc"
102 | 
103 | devhelp: apidoc
104 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
105 | 	@echo
106 | 	@echo "Build finished."
107 | 	@echo "To view the help file:"
108 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/textract"
109 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/textract"
110 | 	@echo "# devhelp"
111 | 
112 | epub: apidoc
113 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
114 | 	@echo
115 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
116 | 
117 | latex: apidoc
118 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
119 | 	@echo
120 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
121 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
122 | 	      "(use \`make latexpdf' here to do that automatically)."
123 | 
124 | latexpdf: apidoc
125 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
126 | 	@echo "Running LaTeX files through pdflatex..."
127 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
128 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
129 | 
130 | latexpdfja: apidoc
131 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
132 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
133 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
134 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
135 | 
136 | text: apidoc
137 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
138 | 	@echo
139 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
140 | 
141 | man: apidoc
142 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
143 | 	@echo
144 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
145 | 
146 | texinfo: apidoc
147 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
148 | 	@echo
149 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
150 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
151 | 	      "(use \`make info' here to do that automatically)."
152 | 
153 | info: apidoc  # recurse with $(MAKE), like latexpdf, so make flags reach the sub-make
154 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
155 | 	@echo "Running Texinfo files through makeinfo..."
156 | 	$(MAKE) -C $(BUILDDIR)/texinfo info
157 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
158 | 
159 | gettext: apidoc
160 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
161 | 	@echo
162 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
163 | 
164 | changes: apidoc
165 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
166 | 	@echo
167 | 	@echo "The overview file is in $(BUILDDIR)/changes."
168 | 
169 | linkcheck: apidoc
170 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
171 | 	@echo
172 | 	@echo "Link check complete; look for any errors in the above output " \
173 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
174 | 
175 | doctest: apidoc
176 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
177 | 	@echo "Testing of doctests in the sources finished, look at the " \
178 | 	      "results in $(BUILDDIR)/doctest/output.txt."
179 | 
180 | xml: apidoc
181 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
182 | 	@echo
183 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
184 | 
185 | pseudoxml: apidoc
186 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
187 | 	@echo
188 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
189 | 


--------------------------------------------------------------------------------
/docs/command_line_interface.rst:
--------------------------------------------------------------------------------
 1 | .. _command-line-interface:
 2 | 
 3 | Command line interface
 4 | ======================
 5 | 
 6 | textract
 7 | --------
 8 | 
 9 | .. argparse::
10 |    :module: textract.cli
11 |    :func: get_parser
12 |    :prog: textract
13 | 
14 | .. note:: 
15 | 
16 |     To make the command line interface as usable as possible,
17 |     autocompletion of available options with textract is enabled by
18 |     @kislyuk's amazing `argcomplete
19 |     <https://github.com/kislyuk/argcomplete>`_ package.  Follow
20 |     instructions to `enable global autocomplete
21 |     <https://github.com/kislyuk/argcomplete#activating-global-completion>`_
22 |     and you should be all set. As an example, this is also configured
23 |     in the `virtual machine provisioning for this project
24 |     <http://github.com/deanmalmgren/textract/blob/master/provision/development.sh#L17>`_.
25 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # textract documentation build configuration file, created by
  4 | # sphinx-quickstart on Fri Jul  4 11:09:09 2014.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | import sys
 16 | import os
 17 | 
 18 | # If extensions (or modules to document with autodoc) are in another directory,
 19 | # add these directories to sys.path here. If the directory is relative to the
 20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 21 | project_root = os.path.abspath(os.path.join(os.path.abspath('.'), '..'))
 22 | sys.path.insert(0, project_root)
 23 | import textract
 24 | 
 25 | # -- General configuration ------------------------------------------------
 26 | 
 27 | # If your documentation needs a minimal Sphinx version, state it here.
 28 | #needs_sphinx = '1.0'
 29 | 
 30 | # Add any Sphinx extension module names here, as strings. They can be
 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 32 | # ones.
 33 | extensions = [
 34 |     'sphinx.ext.autodoc',
 35 |     'sphinx.ext.todo',
 36 |     'sphinx.ext.viewcode',
 37 |     'sphinxarg.ext',
 38 | ]
 39 | 
 40 | # Add any paths that contain templates here, relative to this directory.
 41 | templates_path = ['.templates']
 42 | 
 43 | # The suffix of source filenames.
 44 | source_suffix = '.rst'
 45 | 
 46 | # The encoding of source files.
 47 | #source_encoding = 'utf-8-sig'
 48 | 
 49 | # The master toctree document.
 50 | master_doc = 'index'
 51 | 
 52 | # General information about the project.
 53 | project = u'textract'
 54 | copyright = u'2014, Dean Malmgren'
 55 | 
 56 | # The version info for the project you're documenting, acts as replacement for
 57 | # |version| and |release|, also used in various other places throughout the
 58 | # built documents.
 59 | #
 60 | # The short X.Y version.
 61 | release = version = "1.6.5"
 62 | 
 63 | # The language for content autogenerated by Sphinx. Refer to documentation
 64 | # for a list of supported languages.
 65 | #language = None
 66 | 
 67 | # There are two options for replacing |today|: either, you set today to some
 68 | # non-false value, then it is used:
 69 | #today = ''
 70 | # Else, today_fmt is used as the format for a strftime call.
 71 | #today_fmt = '%B %d, %Y'
 72 | 
 73 | # List of patterns, relative to source directory, that match files and
 74 | # directories to ignore when looking for source files.
 75 | exclude_patterns = []
 76 | 
 77 | # The reST default role (used for this markup: `text`) to use for all
 78 | # documents.
 79 | #default_role = None
 80 | 
 81 | # If true, '()' will be appended to :func: etc. cross-reference text.
 82 | #add_function_parentheses = True
 83 | 
 84 | # If true, the current module name will be prepended to all description
 85 | # unit titles (such as .. function::).
 86 | #add_module_names = True
 87 | 
 88 | # If true, sectionauthor and moduleauthor directives will be shown in the
 89 | # output. They are ignored by default.
 90 | #show_authors = False
 91 | 
 92 | # The name of the Pygments (syntax highlighting) style to use.
 93 | pygments_style = 'sphinx'
 94 | 
 95 | # A list of ignored prefixes for module index sorting.
 96 | #modindex_common_prefix = []
 97 | 
 98 | # If true, keep warnings as "system message" paragraphs in the built documents.
 99 | #keep_warnings = False
100 | 
101 | 
102 | # -- Options for HTML output ----------------------------------------------
103 | 
104 | # The theme to use for HTML and HTML Help pages.  See the documentation for
105 | # a list of builtin themes.
106 | html_theme = 'default'
107 | 
108 | # Theme options are theme-specific and customize the look and feel of a theme
109 | # further.  For a list of options available for each theme, see the
110 | # documentation.
111 | #html_theme_options = {}
112 | 
113 | # Add any paths that contain custom themes here, relative to this directory.
114 | #html_theme_path = []
115 | 
116 | # The name for this set of Sphinx documents.  If None, it defaults to
117 | # "<project> v<release> documentation".
118 | #html_title = None
119 | 
120 | # A shorter title for the navigation bar.  Default is the same as html_title.
121 | #html_short_title = None
122 | 
123 | # The name of an image file (relative to this directory) to place at the top
124 | # of the sidebar.
125 | #html_logo = None
126 | 
127 | # The name of an image file (within the static path) to use as favicon of the
128 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
129 | # pixels large.
130 | #html_favicon = None
131 | 
132 | # Add any paths that contain custom static files (such as style sheets) here,
133 | # relative to this directory. They are copied after the builtin static files,
134 | # so a file named "default.css" will overwrite the builtin "default.css".
135 | html_static_path = []
136 | 
137 | # Add any extra paths that contain custom files (such as robots.txt or
138 | # .htaccess) here, relative to this directory. These files are copied
139 | # directly to the root of the documentation.
140 | #html_extra_path = []
141 | 
142 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
143 | # using the given strftime format.
144 | #html_last_updated_fmt = '%b %d, %Y'
145 | 
146 | # If true, SmartyPants will be used to convert quotes and dashes to
147 | # typographically correct entities.
148 | #html_use_smartypants = True
149 | 
150 | # Custom sidebar templates, maps document names to template names.
151 | #html_sidebars = {}
152 | 
153 | # Additional templates that should be rendered to pages, maps page names to
154 | # template names.
155 | #html_additional_pages = {}
156 | 
157 | # If false, no module index is generated.
158 | #html_domain_indices = True
159 | 
160 | # If false, no index is generated.
161 | #html_use_index = True
162 | 
163 | # If true, the index is split into individual pages for each letter.
164 | #html_split_index = False
165 | 
166 | # If true, links to the reST sources are added to the pages.
167 | #html_show_sourcelink = True
168 | 
169 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
170 | #html_show_sphinx = True
171 | 
172 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
173 | #html_show_copyright = True
174 | 
175 | # If true, an OpenSearch description file will be output, and all pages will
176 | # contain a <link> tag referring to it.  The value of this option must be the
177 | # base URL from which the finished HTML is served.
178 | #html_use_opensearch = ''
179 | 
180 | # This is the file name suffix for HTML files (e.g. ".xhtml").
181 | #html_file_suffix = None
182 | 
183 | # Output file base name for HTML help builder.
184 | htmlhelp_basename = 'textractdoc'
185 | 
186 | 
187 | # -- Options for LaTeX output ---------------------------------------------
188 | 
189 | latex_elements = {
190 | # The paper size ('letterpaper' or 'a4paper').
191 | #'papersize': 'letterpaper',
192 | 
193 | # The font size ('10pt', '11pt' or '12pt').
194 | #'pointsize': '10pt',
195 | 
196 | # Additional stuff for the LaTeX preamble.
197 | #'preamble': '',
198 | }
199 | 
200 | # Grouping the document tree into LaTeX files. List of tuples
201 | # (source start file, target name, title,
202 | #  author, documentclass [howto, manual, or own class]).
203 | latex_documents = [
204 |   ('index', 'textract.tex', u'textract Documentation',
205 |    u'Dean Malmgren', 'manual'),
206 | ]
207 | 
208 | # The name of an image file (relative to this directory) to place at the top of
209 | # the title page.
210 | #latex_logo = None
211 | 
212 | # For "manual" documents, if this is true, then toplevel headings are parts,
213 | # not chapters.
214 | #latex_use_parts = False
215 | 
216 | # If true, show page references after internal links.
217 | #latex_show_pagerefs = False
218 | 
219 | # If true, show URL addresses after external links.
220 | #latex_show_urls = False
221 | 
222 | # Documents to append as an appendix to all manuals.
223 | #latex_appendices = []
224 | 
225 | # If false, no module index is generated.
226 | #latex_domain_indices = True
227 | 
228 | 
229 | # -- Options for manual page output ---------------------------------------
230 | 
231 | # One entry per manual page. List of tuples
232 | # (source start file, name, description, authors, manual section).
233 | man_pages = [
234 |     ('index', 'textract', u'textract Documentation',
235 |      [u'Dean Malmgren'], 1)
236 | ]
237 | 
238 | # If true, show URL addresses after external links.
239 | #man_show_urls = False
240 | 
241 | 
242 | # -- Options for Texinfo output -------------------------------------------
243 | 
244 | # Grouping the document tree into Texinfo files. List of tuples
245 | # (source start file, target name, title, author,
246 | #  dir menu entry, description, category)
247 | texinfo_documents = [
248 |   ('index', 'textract', u'textract Documentation',
249 |    u'Dean Malmgren', 'textract', 'One line description of project.',
250 |    'Miscellaneous'),
251 | ]
252 | 
253 | # Documents to append as an appendix to all manuals.
254 | #texinfo_appendices = []
255 | 
256 | # If false, no module index is generated.
257 | #texinfo_domain_indices = True
258 | 
259 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
260 | #texinfo_show_urls = 'footnote'
261 | 
262 | # If true, do not generate a @detailmenu in the "Top" node's menu.
263 | #texinfo_no_detailmenu = False
264 | 
265 | # on_rtd is whether we are on readthedocs.org
266 | # http://read-the-docs.readthedocs.org/en/latest/theme.html#how-do-i-use-this-locally-and-on-read-the-docs
267 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
268 | if not on_rtd:  # only import and set the theme if we're building docs locally
269 |     import sphinx_rtd_theme
270 |     html_theme = 'sphinx_rtd_theme'
271 |     html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
272 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
  1 | .. _contributing:
  2 | 
  3 | Contributing
  4 | ============
  5 | 
  6 | The overarching goal of this project is to make it as easy as possible
  7 | to extract raw text from any document for the purposes of most natural
  8 | language processing tasks. In practice, this means that this project
  9 | should preferentially provide tools that correctly produce output that
 10 | has words in the correct order, while whitespace between words,
 11 | formatting, etc. is totally irrelevant. As the various parsers mature,
 12 | I fully expect the output to become more readable to support
 13 | additional use cases, like `extracting text to appear in web pages
 14 | <https://github.com/deanmalmgren/textract/pull/58#issuecomment-53697943>`_.
 15 | 
 16 | Importantly, this project is committed to being as agnostic about how
 17 | the content is extracted as it is about the means in which the text is
 18 | analyzed downstream. This means that ``textract`` should support
 19 | multiple modes of extracting text from any document and provide
 20 | reasonably good defaults (defaulting to tools that tend to produce the
 21 | correct word sequence).
 22 | 
 23 | Another important aspect of this project is that we want to have
 24 | extremely good documentation. If you notice a typo, error, confusing
 25 | statement, etc., please fix it!
 26 | 
 27 | 
 28 | .. _contributing-quick-start:
 29 | 
 30 | Quick start
 31 | -----------
 32 | 
 33 | 1. `Fork <https://github.com/deanmalmgren/textract/fork>`_ and clone the
 34 |    project:
 35 | 
 36 |    .. code-block:: bash
 37 | 
 38 |         git clone https://github.com/YOUR-USERNAME/textract.git
 39 | 
 40 | 2. Contribute! There are several `open issues
 41 |    <https://github.com/deanmalmgren/textract/issues>`_ that provide
 42 |    good places to dig in. Check out the `contribution guidelines
 43 |    <https://github.com/deanmalmgren/textract/blob/master/CONTRIBUTING.md>`_
 44 |    and send pull requests; your help is greatly appreciated!
 45 | 
 46 | Depending on your development preferences, there are lots of ways to
 47 | get started developing with textract:
 48 | 
 49 | Developing in a native Ubuntu environment
 50 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 51 | 
 52 | 3. Install all the necessary system packages:
 53 | 
 54 |    .. code-block:: bash
 55 | 
 56 |        ./provision/travis-mock.sh
 57 |        ./provision/debian.sh
 58 | 
 59 |        # optionally run some of the steps in these scripts, but you
 60 |        # may want to be selective about what you do as they alter global
 61 |        # environment states
 62 |        ./provision/python3.sh  # or ./provision/python2.sh for Python 2
 63 |        ./provision/development.sh
 64 | 
 65 | .. _run-ubuntu-tests:
 66 | 
 67 | 4. On the virtual machine, make sure everything is working by running
 68 |    the suite of functional tests:
 69 | 
 70 |    .. code-block:: bash
 71 | 
 72 |         nosetests
 73 | 
 74 |    These functional tests are designed to be run on an Ubuntu 12.04
 75 |    LTS server, just like the virtual machine and the server that runs
 76 |    the travis-ci test suite. There are some other tests that have been
 77 |    added along the way in the `Travis configuration
 78 |    <https://github.com/deanmalmgren/textract/blob/master/.travis.yml>`_. For
 79 |    your convenience, you can run all of these tests with:
 80 | 
 81 |    .. code-block:: bash
 82 | 
 83 |         ./tests/run.py
 84 | 
 85 |    Current build status: |Build Status|
 86 | 
 87 | 
 88 | Developing with Vagrant virtual machine
 89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 90 | 
 91 | 3. Install `Vagrant <http://vagrantup.com/downloads>`_ and
 92 |    `Virtualbox <https://www.virtualbox.org/wiki/Downloads>`_ and launch
 93 |    the development virtual machine:
 94 | 
 95 |    .. code-block:: bash
 96 | 
 97 |         vagrant plugin install iniparse
 98 |         vagrant up && vagrant provision
 99 | 
100 |    On ``vagrant ssh``\ ing to the virtual machine, note that the
101 |    ``PYTHONPATH`` and ``PATH`` `environment variables have been
102 |    altered in this virtual machine
103 |    <https://github.com/deanmalmgren/textract/blob/master/provision/development.sh>`_
104 |    so that any changes you make to textract in development are
105 |    automatically incorporated into the command.
106 | 
107 | 4. See :ref:`step 4 <run-ubuntu-tests>` in the Ubuntu development environment.
108 |    Current build status: |Build Status|
109 | 
110 | 
111 | 
112 | Developing with Docker container
113 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
114 | 
115 | 3. Go to the `Docker
116 |    documentation <http://docs.docker.com/installation/ubuntulinux/>`_
117 |    and follow the instructions under "If you'd like to try the latest
118 |    version of Docker" to install Docker.
119 | 
120 | 4. Just run ``tests/run_docker_tests.sh`` to run the full test suite.
121 |    Current build status: |Build Status|
122 | 
123 | 
124 | .. |Build Status| image:: https://travis-ci.org/deanmalmgren/textract.png
125 |    :target: https://travis-ci.org/deanmalmgren/textract
126 | 
127 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
  1 | .. textract documentation master file, created by
  2 |    sphinx-quickstart on Fri Jul  4 11:09:09 2014.
  3 |    You can adapt this file completely to your liking, but it should at least
  4 |    contain the root `toctree` directive.
  5 | 
  6 | textract
  7 | ================================
  8 | 
  9 | As undesirable as it might be, more often than not there is extremely
 10 | useful information embedded in Word documents, PowerPoint
 11 | presentations, PDFs, etc---so-called "dark data"---that would be
 12 | valuable for further textual analysis and visualization. While
 13 | :ref:`several packages <supporting>` exist for extracting content from
 14 | each of these formats on their own, this package provides a single
 15 | interface for extracting content from any type of file, without any
 16 | irrelevant markup.
 17 | 
 18 | This package provides two primary facilities for doing this, the
 19 | :ref:`command line interface <command-line-interface>`
 20 | 
 21 | .. code-block:: bash
 22 | 
 23 |     textract path/to/file.extension
 24 | 
 25 | or the :ref:`python package <python-package>`
 26 | 
 27 | .. code-block:: python
 28 | 
 29 |     # some python file
 30 |     import textract
 31 |     text = textract.process("path/to/file.extension")
 32 | 
 33 | .. _supporting:
 34 | 
 35 | Currently supporting
 36 | --------------------
 37 | 
 38 | textract supports a growing list of file types for text extraction. If
 39 | you don't see your favorite file type here, please recommend it
 40 | by either mentioning it on the `issue tracker
 41 | <https://github.com/deanmalmgren/textract/issues>`_ or by
 42 | :ref:`contributing a pull request <contributing>`.
 43 | 
 44 | 
 45 | * ``.csv`` via python builtins
 46 | 
 47 | * ``.tsv`` and ``.tab`` via python builtins
 48 | 
 49 | * ``.doc`` via `antiword`_
 50 | 
 51 | * ``.docx`` via `python-docx2txt`_
 52 | 
 53 | * ``.eml`` via python builtins
 54 | 
 55 | * ``.epub`` via `ebooklib`_
 56 | 
 57 | * ``.gif`` via `tesseract-ocr`_
 58 | 
 59 | * ``.jpg`` and ``.jpeg`` via `tesseract-ocr`_
 60 | 
 61 | * ``.json`` via python builtins
 62 | 
 63 | * ``.html`` and ``.htm`` via `beautifulsoup4`_
 64 | 
 65 | * ``.mp3`` via `sox`_, `SpeechRecognition`_, and `pocketsphinx`_
 66 | 
 67 | * ``.msg`` via `msg-extractor`_
 68 | 
 69 | * ``.odt`` via python builtins
 70 | 
 71 | * ``.ogg`` via `sox`_, `SpeechRecognition`_, and `pocketsphinx`_
 72 | 
 73 | * ``.pdf`` via `pdftotext`_ (default) or `pdfminer.six`_
 74 | 
 75 | * ``.png`` via `tesseract-ocr`_
 76 | 
 77 | * ``.pptx`` via `python-pptx`_
 78 | 
 79 | * ``.ps`` via `ps2ascii`_
 80 | 
 81 | * ``.rtf`` via `unrtf`_
 82 | 
 83 | * ``.tiff`` and ``.tif`` via `tesseract-ocr`_
 84 | 
 85 | * ``.txt`` via python builtins
 86 | 
 87 | * ``.wav`` via `SpeechRecognition`_ and `pocketsphinx`_
 88 | 
 89 | * ``.xlsx`` via `xlrd <https://pypi.python.org/pypi/xlrd>`_
 90 | 
 91 | * ``.xls`` via `xlrd <https://pypi.python.org/pypi/xlrd>`_
 92 | 
 93 | .. this is a list of all the packages that textract uses for extraction
 94 | .. _antiword: http://www.winfield.demon.nl/
 95 | .. _beautifulsoup4: http://beautiful-soup-4.readthedocs.org/en/latest/
 96 | .. _ebooklib: https://github.com/aerkalov/ebooklib
 97 | .. _msg-extractor: https://github.com/mattgwwalker/msg-extractor
 98 | .. _pdfminer.six: https://github.com/goulu/pdfminer
 99 | .. _pdftotext: http://poppler.freedesktop.org/
100 | .. _pocketsphinx: https://github.com/cmusphinx/pocketsphinx/
101 | .. _ps2ascii: https://www.ghostscript.com/doc/current/Use.htm
102 | .. _python-docx2txt: https://github.com/ankushshah89/python-docx2txt
103 | .. _python-pptx: https://python-pptx.readthedocs.org/en/latest/
104 | .. _SpeechRecognition: https://pypi.python.org/pypi/SpeechRecognition/
105 | .. _sox: http://sox.sourceforge.net/
106 | .. _tesseract-ocr: https://code.google.com/p/tesseract-ocr/
107 | .. _unrtf: http://www.gnu.org/software/unrtf/
108 | 
109 | .. _related-projects:
110 | 
111 | Related projects
112 | ----------------
113 | 
114 | Of course, textract isn't the first project with the aim to provide a
115 | simple interface for extracting text from any document. But this is,
116 | to the best of my knowledge, the only project that is written in
117 | python (a language commonly chosen by the natural language processing
118 | community) and is :ref:`method agnostic about how content is extracted
119 | <contributing>`. I'm sure that there are other similar projects out
120 | there, but here is a small sample of similar projects:
121 | 
122 | * `Apache Tika <http://tika.apache.org/>`_ has `very similar, if not
123 |   identical, aims as textract
124 |   <https://github.com/deanmalmgren/textract/issues/12>`_ and has
125 |   impressive coverage of a wide range of file formats. It is written
126 |   in java.
127 | 
128 | * `textract (node.js) <https://github.com/dbashford/textract>`_ has
129 |   similar aims as this textract package (including an identical name!
130 |   great minds...). It is written in node.js.
131 | 
132 | * `pandoc <http://johnmacfarlane.net/pandoc/>`_ is intended to be a
133 |   document conversion tool (a much more difficult task!), but it does have
134 |   `the ability to convert to plain text
135 |   <http://johnmacfarlane.net/pandoc/demos.html>`_. It is written in
136 |   Haskell.
137 | 
138 | 
139 | Contents:
140 | 
141 | .. toctree::
142 |    :maxdepth: 2
143 | 
144 |    command_line_interface
145 |    python_package
146 |    installation
147 |    contributing
148 |    changelog
149 | 
150 | 
151 | Indices and tables
152 | ==================
153 | 
154 | * :ref:`genindex`
155 | * :ref:`modindex`
156 | * :ref:`search`
157 | 


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
  1 | .. _installation:
  2 | 
  3 | Installation
  4 | ============
  5 | 
  6 | One of the main goals of textract is to make it as easy as possible to
  7 | start using textract (meaning that installation should be as quick and
  8 | painless as possible). This package is built on top of several python
  9 | packages and other source libraries. Assuming you are using ``pip`` or
 10 | ``easy_install`` to install textract, the `python packages
 11 | <https://github.com/deanmalmgren/textract/blob/master/requirements/python>`_
 12 | are all installed by default with textract. The source libraries are a
 13 | separate matter though and largely depend on your operating system.
 14 | 
 15 | Ubuntu / Debian
 16 | ---------------
 17 | 
 18 | There are two steps required to run this package on
 19 | Ubuntu/Debian. First you must install some system packages using the
 20 | `apt-get <https://help.ubuntu.com/14.04/serverguide/apt-get.html>`_
 21 | package manager before installing textract from pypi.
 22 | 
 23 | .. code-block:: bash
 24 | 
 25 |     apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \
 26 |     flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig libpulse-dev
 27 |     pip install textract
 28 | 
 29 | .. note::
 30 | 
 31 |     It may also be necessary to install ``zlib1g-dev`` on Docker
 32 |     instances of Ubuntu. See `issue #19
 33 |     <https://github.com/deanmalmgren/textract/pull/19>`_ for details
 34 | 
 35 | OSX
 36 | ---
 37 | 
 38 | These steps rely on you having `homebrew <http://brew.sh/>`_ installed
 39 | as well as the `cask <http://caskroom.io/>`_ plugin (``brew tap caskroom/cask``).
 40 | The basic idea is to first install
 41 | `XQuartz <https://xquartz.macosforge.org/landing/>`_, then install a
 42 | bunch of system packages, and finally install textract from pypi.
 43 | 
 44 | .. code-block:: bash
 45 | 
 46 |     brew install --cask xquartz
 47 |     brew install poppler antiword unrtf tesseract swig
 48 |     pip install textract
 49 | 
 50 | ..     brew install libxml2 libxslt antiword poppler tesseract
 51 | ..     brew link libxml2 libxslt
 52 | 
 53 | .. note::
 54 | 
 55 |     `pstotext <http://pages.cs.wisc.edu/~ghost/doc/pstotext.htm>`_ is
 56 |     not currently a part of homebrew so ``.ps`` extraction must be
 57 |     enabled by manually installing from source.
 58 | 
 59 | .. note::
 60 | 
 61 |     Depending on how you have python configured on your system with
 62 |     homebrew, you may also need to install the python
 63 |     development header files for textract to properly install.
 64 | 
 65 | FreeBSD
 66 | -------
 67 | 
 68 | Setting up this package on FreeBSD pretty much follows the steps for
 69 | Ubuntu / Debian while using ``pkg`` as package manager.
 70 | 
 71 | .. code-block:: bash
 72 | 
 73 |     pkg install lang/python38 devel/py-pip textproc/libxml2 textproc/libxslt textproc/antiword textproc/unrtf \
 74 |     graphics/poppler print/pstotext graphics/tesseract audio/flac multimedia/ffmpeg audio/lame audio/sox \
 75 |     graphics/jpeg-turbo
 76 |     pip install textract
 77 | 
 78 | Don't see your operating system installation instructions here?
 79 | ---------------------------------------------------------------
 80 | 
 81 | My apologies! Installing system packages is a bit of a drag and it's
 82 | hard to anticipate all of the different environments that need to be
 83 | accommodated (wouldn't it be awesome if there were a system-agnostic
 84 | package manager or, better yet, if python could install these system
 85 | dependencies for you?!?!). If your operating system doesn't have
 86 | documentation about how to install the textract dependencies, please
 87 | :ref:`contribute a pull request <contributing>` with:
 88 | 
 89 | 1. A new section in here with the appropriate details about how to
 90 |    install things. In particular, please give instructions for how to
 91 |    install the following libraries before running ``pip install
 92 |    textract``:
 93 | 
 94 |     - `libxml2 2.6.21 or later <http://xmlsoft.org/downloads.html>`_
 95 |       is required by the ``.docx`` parser which uses `lxml
 96 |       <http://lxml.de/installation.html#requirements>`_ via
 97 |       python-docx.
 98 | 
 99 |     - `libxslt 1.1.15 or later
100 |       <http://xmlsoft.org/XSLT/downloads.html>`_ is required by the
101 |       ``.docx`` parser which uses `lxml
102 |       <http://lxml.de/installation.html#requirements>`_ via
103 |       python-docx.
104 | 
105 |     - python header files are required for building lxml.
106 | 
107 |     - `antiword <http://www.winfield.demon.nl/>`_ is required by the
108 |       ``.doc`` parser.
109 | 
110 |     - `pdftotext <http://poppler.freedesktop.org/>`_ is *optionally*
111 |       required by the ``.pdf`` parser (there is a pure python fallback
112 |       that works if pdftotext isn't installed).
113 | 
114 |     - `pstotext <http://pages.cs.wisc.edu/~ghost/doc/pstotext.htm>`_
115 |       is required by the ``.ps`` parser.
116 | 
117 |     - `tesseract-ocr <https://code.google.com/p/tesseract-ocr/>`_
118 |       is required by the ``.jpg``, ``.png`` and ``.gif`` parser.
119 | 
120 |     - `sox <http://sox.sourceforge.net/>`_
121 |       is required by the ``.mp3`` and ``.ogg`` parser.
122 |       You need to install ffmpeg, lame, libmad0 and libsox-fmt-mp3,
123 |       before building sox, for these filetypes to work.
124 | 
125 | 2. Add a requirements file to the `requirements directory
126 |    <https://github.com/deanmalmgren/textract/tree/master/requirements>`_
127 |    of the project with the lower-cased name of your operating system
128 |    (e.g. ``requirements/windows``) so we can try to keep these things
129 |    up to date in the future.
130 | 


--------------------------------------------------------------------------------
/docs/python_package.rst:
--------------------------------------------------------------------------------
  1 | .. _python-package:
  2 | 
  3 | Python package
  4 | ==============
  5 | 
  6 | This package is organized to make it as easy as possible to add new
  7 | extensions and support the continued growth and coverage of
  8 | textract. For almost all applications, you will just have to do
  9 | something like this::
 10 | 
 11 |     import textract
 12 |     text = textract.process('path/to/file.extension')
 13 | 
 14 | to obtain text from a document. You can also pass keyword arguments to
 15 | ``textract.process``, for example, to use a particular method for
 16 | parsing a pdf like this::
 17 | 
 18 |     import textract
 19 |     text = textract.process('path/to/a.pdf', method='pdfminer')
 20 | 
 21 | or to specify a particular output encoding (input encodings are
 22 | inferred using `chardet <https://github.com/chardet/chardet>`_)::
 23 | 
 24 |     import textract
 25 |     text = textract.process('path/to/file.extension', encoding='ascii')
 26 | 
 27 | When the file name has no extension, you specify the file's extension as an argument
 28 | to ``textract.process`` like this::
 29 | 
 30 |     import textract
 31 |     text = textract.process('path/to/file', extension='docx')
 32 | 
 33 | .. _additional-options:
 34 | 
 35 | Additional options
 36 | ------------------
 37 | 
 38 | Some parsers also enable additional options which can be passed in as keyword
 39 | arguments to the ``textract.process`` function. Here is a quick table of
 40 | the options that are available to the different types of parsers:
 41 | 
 42 | ======  =========  ===========================================================
 43 | parser  option     description
 44 | ======  =========  ===========================================================
 45 | gif     language   Specify `the language`_ for OCR-ing text with tesseract
 46 | jpg     language   Specify `the language`_ for OCR-ing text with tesseract
 47 | pdf     language   For use when ``method='tesseract'``, specify `the language`_
 48 | pdf     layout     With ``method='pdftotext'`` (default), preserve the layout
 49 | png     language   Specify `the language`_ for OCR-ing text with tesseract
 50 | tiff    language   Specify `the language`_ for OCR-ing text with tesseract
 51 | ======  =========  ===========================================================
 52 | 
 53 | As an example of using these additional options, you can extract text from a
 54 | Norwegian PDF using Tesseract OCR like this::
 55 | 
 56 |     text = textract.process(
 57 |         'path/to/norwegian.pdf',
 58 |         method='tesseract',
 59 |         language='nor',
 60 |     )
 61 | 
 62 | 
 63 | A look under the hood
 64 | ---------------------
 65 | 
 66 | When ``textract.process('path/to/file.extension')`` is called,
 67 | ``textract.process`` looks for a module called
 68 | ``textract.parsers.extension_parser`` that contains a ``Parser`` class.
 69 | 
 70 | 
 71 | .. autofunction:: textract.parsers.process
 72 | 
 73 | Importantly, the ``textract.parsers.extension_parser.Parser`` class
 74 | must inherit from ``textract.parsers.utils.BaseParser``.
 75 | 
 76 | .. autoclass:: textract.parsers.utils.BaseParser
 77 |     :members:
 78 |     :undoc-members:
 79 |     :show-inheritance:
 80 | 
 81 | Many of the parsers rely on command line utilities to do some of the
 82 | parsing. For convenience, the ``textract.parsers.utils.ShellParser``
 83 | class includes some convenience methods for streamlining access to the
 84 | command line.
 85 | 
 86 | .. autoclass:: textract.parsers.utils.ShellParser
 87 |     :members:
 88 |     :undoc-members:
 89 |     :show-inheritance:
 90 | 
 91 | 
 92 | A few specific examples
 93 | -----------------------
 94 | 
 95 | There are quite a few parsers included with ``textract``. Rather than
 96 | elaborating all of them, here are a few that demonstrate how parsers
 97 | work.
 98 | 
 99 | .. autoclass:: textract.parsers.epub_parser.Parser
100 |     :members:
101 |     :undoc-members:
102 |     :show-inheritance:
103 | 
104 | .. autoclass:: textract.parsers.doc_parser.Parser
105 |     :members:
106 |     :undoc-members:
107 |     :show-inheritance:
108 | 
109 | 
110 | .. _the language: https://code.google.com/p/tesseract-ocr/downloads/list
111 | 


--------------------------------------------------------------------------------
/provision/debian.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This needs to work both for Vagrant provisioning and for Travis
 4 | # builds in a Python virtualenv, each of which has a different current
 5 | # working directory when this script is called. When run in Vagrant, the
 6 | # script is copied to /tmp and executed from there, passing the original
 7 | # path as the first argument. So deal with that.
 8 | if [ -z "$1" ]; then
 9 |     # normal: use the directory above this script; abort if cd fails
10 |     cd "$(dirname "$0")/.." || exit 1
11 | else
12 |     # run from /tmp by Vagrant: use the path given as first argument
13 |     cd "$1" || exit 1
14 | fi
15 | base=$(pwd)
16 | 
17 | # Install all of the packages listed in requirements/debian ("#" comments stripped).
18 | # http://docs.travis-ci.com/user/installing-dependencies/#Installing-Ubuntu-packages
19 | apt-get update -qq
20 | sed 's/\(.*\)\#.*/\1/' < "$base/requirements/debian" | xargs apt-get install -y --fix-missing
21 | 


--------------------------------------------------------------------------------
/provision/development.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this script sets up some additional configurations that are
 4 | # convenient during development only
 5 | 
 6 | # make sure PYTHONPATH and PATH are extended at login for the vagrant
 7 | # user (the quoted 'EOF' keeps $PATH/$PYTHONPATH literal in the file)
 8 | # and always change into /vagrant to make it easy to start developing
 9 | cat << 'EOF' > /home/vagrant/.bash_profile
10 | export PATH=/vagrant/bin:$PATH
11 | export PYTHONPATH=/vagrant:$PYTHONPATH
12 | cd /vagrant
13 | EOF
14 | 
15 | # setup global tab completion for the textract command
16 | # https://github.com/kislyuk/argcomplete#activating-global-completion
17 | activate-global-python-argcomplete --dest /etc/bash_completion.d/
18 | 


--------------------------------------------------------------------------------
/provision/python2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This needs to work for Vagrant, Travis builds, and Docker builds in
 4 | # a Python virtualenv. In the virtual machine provisioning, we're
 5 | # passing the directory this should be run from. In travis-ci, it's
 6 | # run from the root of the repository.
 7 | if [ "$#" -eq 1 ]; then
 8 |     cd "$1" || exit 1
 9 | fi
10 | 
11 | # upgrade pip so we can use wheel downloads
12 | pip install -U pip
13 | 
14 | # Install the development requirements listed in requirements/python-dev2.
15 | pip install -r requirements/python-dev2
16 | 


--------------------------------------------------------------------------------
/provision/python3.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This needs to work for Vagrant, Travis builds, and Docker builds in
 4 | # a Python virtualenv. In the virtual machine provisioning, we're
 5 | # passing the directory this should be run from. In travis-ci, it's
 6 | # run from the root of the repository.
 7 | if [ "$#" -eq 1 ]; then
 8 |     cd "$1" || exit 1
 9 | fi
10 | 
11 | # upgrade pip so we can use wheel downloads
12 | pip install -U pip
13 | 
14 | # Install the development and documentation requirements files.
15 | pip install -r requirements/python-dev3
16 | pip install -r requirements/python-doc
17 | 


--------------------------------------------------------------------------------
/provision/travis-mock.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # these additional packages are required to make the virtual machine
 4 | # have a similar environment to travis-ci before we install anything
 5 | # else.  See Vagrantfile for details on how this could be done better
 6 | # if it's a problem.
 7 | # http://docs.travis-ci.com/user/languages/python/#Travis-CI-Uses-Isolated-virtualenvs
 8 | sudo apt-get update -qq
 9 | sudo apt-get install -y python-pip python-dev build-essential
10 | 
11 | # install pep8 and nose for testing
12 | sudo pip install pep8 nose
13 | 


--------------------------------------------------------------------------------
/requirements/debian:
--------------------------------------------------------------------------------
 1 | # required packages
 2 | gcc
 3 | libpulse-dev
 4 | libasound2-dev
 5 | libjpeg-dev
 6 | build-essential
 7 | git
 8 | make
 9 | 
10 | # these packages are required by python-docx, which depends on lxml
11 | # and requires these things
12 | python-dev
13 | libxml2-dev
14 | libxslt1-dev
15 | 
16 | # parse word documents
17 | antiword
18 | 
19 | # parse rtf documents
20 | unrtf
21 | 
22 | # parse image files
23 | tesseract-ocr
24 | libjpeg-dev
25 | 
26 | # parse pdfs
27 | poppler-utils
28 | 
29 | # parse postscript files
30 | pstotext
31 | 
32 | # parse audio files, with SpeechRecognition
33 | flac
34 | 
35 | # filetype conversion libs
36 | ffmpeg
37 | lame
38 | libmad0
39 | libsox-fmt-mp3
40 | 
41 | # convert audio files
42 | sox
43 | 
44 | # Sphinx Speech Recognition
45 | swig
46 | 
47 | # ubuntu 14.04 requires this in addition to libxml2-dev and
48 | # libxslt1-dev for compiling lxml.
49 | # https://github.com/deanmalmgren/textract/issues/19
50 | zlib1g-dev
51 | 


--------------------------------------------------------------------------------
/requirements/freebsd:
--------------------------------------------------------------------------------
 1 | # required packages
 2 | audio/pulseaudio
 3 | devel/git
 4 | 
 5 | # these packages are required by python-docx, which depends on lxml
 6 | # and requires these things
 7 | lang/python38
 8 | devel/py-pip
 9 | textproc/libxml2
10 | textproc/libxslt
11 | 
12 | # parse word documents
13 | textproc/antiword
14 | 
15 | # parse rtf documents
16 | textproc/unrtf
17 | 
18 | # parse image files
19 | graphics/tesseract
20 | graphics/jpeg-turbo
21 | 
22 | # parse pdfs
23 | graphics/poppler
24 | 
25 | # parse postscript files
26 | print/pstotext
27 | 
28 | # parse audio files, with SpeechRecognition
29 | audio/flac 
30 | 
31 | # filetype conversion libs
32 | multimedia/ffmpeg
33 | audio/lame
34 | 
35 | # convert audio files
36 | audio/sox
37 | 


--------------------------------------------------------------------------------
/requirements/python:
--------------------------------------------------------------------------------
 1 | # This file contains all python dependencies that are required by the textract
 2 | # package in order for it to properly work.
 3 | 
 4 | argcomplete~=1.10.0
 5 | beautifulsoup4~=4.8.0
 6 | chardet==3.*
 7 | docx2txt~=0.8
 8 | extract-msg<0.30 #Last with python2 support; ".*" is only valid with == / !=
 9 | pdfminer.six==20191110 #Last with python2 support
10 | python-pptx~=0.6.18
11 | six~=1.12.0
12 | SpeechRecognition~=3.8.1
13 | xlrd~=1.2.0
14 | 


--------------------------------------------------------------------------------
/requirements/python-dev2:
--------------------------------------------------------------------------------
 1 | # This includes all packages that are used in development, including all
 2 | # packages that are required by textract itself (python), packages for
 3 | # documentation builds (python-doc)
 4 | 
 5 | -r python
 6 | 
 7 | # needed for tests/run.py script to read .travis.yml file
 8 | coveralls==1.8.2
 9 | nose==1.3.7
10 | pycodestyle==2.5.0
11 | PyYAML==5.1.1
12 | requests==2.22.0
13 | pytest==4.6
14 | 
15 | # needed for managing versions
16 | bumpversion==0.5.3
17 | 


--------------------------------------------------------------------------------
/requirements/python-dev3:
--------------------------------------------------------------------------------
 1 | # This includes all packages that are used in development, including all
 2 | # packages that are required by textract itself (python), packages for
 3 | # documentation builds (python-doc)
 4 | 
 5 | -r python
 6 | 
 7 | # needed for tests/run.py script to read .travis.yml file
 8 | coveralls==1.8.2
 9 | nose==1.3.7
10 | pycodestyle==2.5.0
11 | PyYAML==5.1.1
12 | pytest==5.0.1
13 | requests==2.22.0
14 | 
15 | # needed for managing versions
16 | bumpversion==0.5.3
17 | 


--------------------------------------------------------------------------------
/requirements/python-doc:
--------------------------------------------------------------------------------
1 | # this only includes packages that are needed for documentation build.
2 | 
3 | sphinx==2.1.2
4 | sphinx_rtd_theme==0.4.3
5 | sphinx-argparse==0.2.5
6 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 1.6.5
 3 | commit = True
 4 | tag = True
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version="{current_version}"
 8 | replace = version="{new_version}"
 9 | 
10 | [bumpversion:file:textract/__init__.py]
11 | search = VERSION = "{current_version}"
12 | replace = VERSION = "{new_version}"
13 | 
14 | [bumpversion:file:docs/conf.py]
15 | search = version = "{current_version}"
16 | replace = version = "{new_version}"
17 | 
18 | [bumpversion:file:docs/changelog.rst]
19 | search = THANKS FOR CONTRIBUTING; ADD YOUR UNRELEASED CHANGES HERE!
20 | replace = THANKS FOR CONTRIBUTING; ADD YOUR UNRELEASED CHANGES HERE!
21 | 	{new_version}
22 | 	-------------------
23 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | from setuptools import setup
 4 | 
 5 | import textract  # NOTE(review): used only for textract.__name__ in setup()
 6 | 
 7 | # get all of the scripts
 8 | scripts = glob.glob("bin/*")
 9 | 
10 | # read in the description from README
11 | with open("README.rst") as stream:
12 |     long_description = stream.read()
13 | 
14 | github_url = 'https://github.com/deanmalmgren/textract'
15 | 
16 | 
17 | def parse_requirements(requirements_filename):
18 |     """read in the dependencies from the requirements files
19 |     """
20 |     dependencies, dependency_links = [], []
21 |     requirements_dir = os.path.dirname(requirements_filename)
22 |     with open(requirements_filename, 'r') as stream:
23 |         for line in stream:
24 |             line = line.strip()
25 |             if line.startswith("-r"):
26 |                 filename = os.path.join(requirements_dir, line[2:].strip())
27 |                 _dependencies, _dependency_links = parse_requirements(filename)
28 |                 dependencies.extend(_dependencies)
29 |                 dependency_links.extend(_dependency_links)
30 |             elif line.startswith("http"):
31 |                 dependency_links.append(line)
32 |             else:
33 |                 package = line.split('#')[0]
34 |                 if package:
35 |                     dependencies.append(package)
36 |     return dependencies, dependency_links
37 | 
38 | 
39 | requirements_filename = os.path.join("requirements", "python")
40 | dependencies, dependency_links = parse_requirements(requirements_filename)
41 | 
42 | 
43 | setup(
44 |     name=textract.__name__,
45 |     version="1.6.5",
46 |     description="extract text from any document. no muss. no fuss.",
47 |     long_description=long_description,
48 |     url=github_url,
49 |     download_url="%s/archives/master" % github_url,
50 |     author='Dean Malmgren',
51 |     author_email='dean.malmgren@datascopeanalytics.com',
52 |     license='MIT',
53 |     scripts=scripts,
54 |     packages=[
55 |         'textract',
56 |         'textract.parsers',
57 |     ],
58 |     install_requires=dependencies,  # parsed from requirements/python
59 |     extras_require={
60 |         "pocketsphinx": ["pocketsphinx==0.1.15"]  # opt-in extra
61 |     },
62 |     dependency_links=dependency_links,
63 |     zip_safe=False,
64 | )
65 | 


--------------------------------------------------------------------------------
/tests/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:12.04
 2 | MAINTAINER Shawn Milochik <shawn@milochik.com>
 3 | ENV DEBIAN_FRONTEND noninteractive
 4 | ENV REFRESHED_AT 2014-08-12b
 5 | RUN apt-get update && apt-get install python-pip -y
 6 | ADD . /src
 7 | WORKDIR /src
 8 | RUN /bin/bash /src/provision/debian.sh
 9 | # provision/ has python2.sh and python3.sh but no python.sh (python-pip = py2)
10 | RUN /bin/bash /src/provision/python2.sh
11 | RUN adduser --disabled-password --gecos "" --home=/home/textract textract
12 | VOLUME ["/home/textract/src"]
13 | ENV PATH $PATH:/home/textract/src/bin
14 | ENV PYTHONPATH /home/textract/src
15 | USER textract
16 | ENTRYPOINT ["/home/textract/src/tests/run.py"]
17 | 


--------------------------------------------------------------------------------
/tests/Makefile:
--------------------------------------------------------------------------------
 1 | # This file is used to create standardized text for any textract method that
 2 | # uses any command line utility or third party service (as opposed to a pure
 3 | # python package) for text extraction. These types of extraction methods have
 4 | # made it particularly difficult to maintain reasonably stable testing
 5 | # environments, so this provides a useful workaround to validate that the
 6 | # extraction methods are working correctly across all development environments
 7 | #
 8 | # https://github.com/deanmalmgren/textract/issues/78
 9 | 
10 | TARGETS = pdf/ocr_text.txt \
11 | 	png/raw_text.txt png/standardized_text.txt \
12 | 	gif/raw_text.txt gif/standardized_text.txt \
13 |     jpg/raw_text.txt jpg/standardized_text.txt \
14 |     tiff/raw_text.txt tiff/standardized_text.txt \
15 | 	ps/raw_text.txt
16 | 
17 | all: $(TARGETS)
18 | 
19 | clean:
20 | 	rm -f $(TARGETS)
21 | 
22 | # create OCR output for the multi-page pdf test
23 | pdf/ocr_text.txt: pdf/ocr_text.pdf
24 | 	pdftoppm $< /tmp/pdf-ocr-text
25 | 	for x in /tmp/pdf-ocr-text*; do \
26 | 		tesseract $$x $$(basename $$x .ppm) > /dev/null; \
27 | 	done
28 | 	cat pdf-ocr-text*.txt > $@
29 | 	rm -f pdf-ocr-text*
30 | 
31 | ps/raw_text.txt: ps/raw_text.ps
32 | 	ps2ascii $< > $@
33 | 
34 | # simple pattern rule for creating standard issue tesseract files for different
35 | # filetypes. the `g` shell variable is the path to the file without the
36 | # extension (e.g. g=png/raw_text)
37 | %.txt: %.png
38 | 	f=$@; g=$${f%%.*}; tesseract $< $${g} > /dev/null
39 | 
40 | %.txt: %.gif
41 | 	f=$@; g=$${f%%.*}; tesseract $< $${g} > /dev/null
42 | 
43 | %.txt: %.tiff
44 | 	f=$@; g=$${f%%.*}; tesseract $< $${g} > /dev/null
45 | 
46 | %.txt: %.jpg
47 | 	f=$@; g=$${f%%.*}; tesseract $< $${g} > /dev/null
48 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/__init__.py


--------------------------------------------------------------------------------
/tests/base.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import subprocess
  3 | import tempfile
  4 | import shutil
  5 | import six
  6 | 
  7 | import requests
  8 | 
  9 | 
 10 | class GenericUtilities(object):
 11 | 
 12 |     def get_temp_filename(self, extension=None):
 13 |         stream = tempfile.NamedTemporaryFile(delete=False)
 14 |         stream.close()
 15 |         filename = stream.name
 16 |         if not extension is None:
 17 |             filename += '.' + extension
 18 |             shutil.move(stream.name, filename)
 19 |         return filename
 20 | 
 21 |     def clean_text(self, text):
 22 |         lines = text.splitlines()
 23 |         # Clean empty lines (fixes epub issue)
 24 |         lines = [line for line in lines if line.strip()]  # Clean empty lines
 25 |         return six.b('\n').join(lines)
 26 | 
 27 | 
 28 | class BaseParserTestCase(GenericUtilities):
 29 |     """This BaseParserTestCase object is used to collect a bunch of
 30 |     standardized tests that should be run for every BaseParser.
 31 |     """
 32 | 
 33 |     # 'txt', for example. this is mandatory and potentially the only thing that
 34 |     #  has to be specified to subclass this unittest
 35 |     extension = ''
 36 | 
 37 |     # User can specify a particular filename root (without
 38 |     # extension!), but these have good defaults that are specified by
 39 |     # the @property methods below
 40 |     raw_text_filename_root = ''
 41 |     standardized_text_filename_root = ''
 42 |     unicode_text_filename_root = ''
 43 | 
 44 |     def __init__(self, *args, **kwargs):
 45 |         super(BaseParserTestCase, self).__init__(*args, **kwargs)
 46 |         if self.extension == '':
 47 |             raise NotImplementedError(
 48 |                 'need to specify `extension` class attribute on test case'
 49 |             )
 50 | 
 51 |     def get_extension_directory(self):
 52 |         return os.path.join(
 53 |             os.path.dirname(os.path.abspath(__file__)),
 54 |             self.extension,
 55 |         )
 56 | 
 57 |     def get_filename(self, filename_root, default_filename_root):
 58 |         if filename_root:
 59 |             filename = os.path.join(
 60 |                 self.get_extension_directory(),
 61 |                 filename_root + '.' + self.extension,
 62 |             )
 63 |             if not os.path.exists(filename):
 64 |                 raise Exception((
 65 |                     'expected filename "%(filename)s" to exist for testing '
 66 |                     'purposes but it doesnt'
 67 |                 ) % locals())
 68 |             return filename
 69 |         return self.get_filename(default_filename_root, default_filename_root)
 70 | 
 71 |     @property
 72 |     def raw_text_filename(self):
 73 |         return self.get_filename(self.raw_text_filename_root,
 74 |                                  "raw_text")
 75 | 
 76 |     @property
 77 |     def standardized_text_filename(self):
 78 |         return self.get_filename(self.standardized_text_filename_root,
 79 |                                  "standardized_text")
 80 | 
 81 |     @property
 82 |     def unicode_text_filename(self):
 83 |         return self.get_filename(self.unicode_text_filename_root,
 84 |                                  "unicode_text")
 85 | 
 86 |     def test_raw_text_cli(self):
 87 |         """Make sure raw text matches from the command line"""
 88 |         self.compare_cli_output(self.raw_text_filename)
 89 | 
 90 |     def test_raw_text_python(self):
 91 |         """Make sure raw text matches from python"""
 92 |         self.compare_python_output(self.raw_text_filename)
 93 | 
 94 |     def test_standardized_text_cli(self):
 95 |         """Make sure standardized text matches from the command line"""
 96 |         temp_filename = self.assertSuccessfulTextract(
 97 |             self.standardized_text_filename,
 98 |             cleanup=False,
 99 |         )
100 |         with open(temp_filename, 'rb') as stream:
101 |             self.assertEqual(
102 |                 six.b('').join(stream.read().split()),
103 |                 self.get_standardized_text(),
104 |                 "standardized text fails for %s" % self.extension,
105 |             )
106 |         os.remove(temp_filename)
107 | 
108 |     def test_standardized_text_python(self):
109 |         """Make sure standardized text matches from python"""
110 |         import textract
111 |         result = textract.process(self.standardized_text_filename)
112 |         self.assertEqual(
113 |             six.b('').join(result.split()),
114 |             self.get_standardized_text(),
115 |             "standardized text fails for %s" % self.extension,
116 |         )
117 | 
118 |     # def test_unicode_text_cli(self):
119 |     #     """Make sure unicode text matches from the command line"""
120 |     #     self.compare_cli_output(self.unicode_text_filename)
121 | 
122 |     # def test_unicode_text_python(self):
123 |     #     """Make sure unicode text matches from python"""
124 |     #     self.compare_python_output(self.unicode_text_filename)
125 | 
126 |     def get_expected_filename(self, filename, **kwargs):
127 |         basename, extension = os.path.splitext(filename)
128 |         if kwargs.get('method'):
129 |             basename += '-m=' + kwargs.get('method')
130 |         return basename + '.txt'
131 | 
132 |     def get_cli_options(self, **kwargs):
133 |         option = ''
134 |         for key, val in six.iteritems(kwargs):
135 |             option += '--%s=%s ' % (key, val)
136 |         return option
137 | 
138 |     def get_standardized_text(self):
139 |         filename = os.path.join(
140 |             self.get_extension_directory(),
141 |             "standardized_text.txt"
142 |         )
143 |         if os.path.exists(filename):
144 |             with open(filename, 'rb') as stream:
145 |                 standardized_text = stream.read()
146 |         else:
147 |             standardized_text = six.b(
148 |                 "the quick brown fox jumps over the lazy dog"
149 |             )
150 |         return six.b('').join(standardized_text.split())
151 | 
152 |     def assertSuccessfulCommand(self, command):
153 |         self.assertEqual(
154 |             0, subprocess.call(command, shell=True),
155 |             "COMMAND FAILED: %(command)s" % locals()
156 |         )
157 | 
158 |     def assertSuccessfulTextract(self, filename, cleanup=True, **kwargs):
159 | 
160 |         # construct the option string
161 |         option = self.get_cli_options(**kwargs)
162 | 
163 |         # run the command and make sure everything worked correctly
164 |         temp_filename = self.get_temp_filename()
165 |         self.assertSuccessfulCommand(
166 |             "textract %(option)s '%(filename)s' > %(temp_filename)s" % locals()
167 |         )
168 |         if cleanup:
169 |             os.remove(temp_filename)
170 |             return None
171 |         else:
172 |             return temp_filename
173 | 
174 |     def compare_cli_output(self, filename, expected_filename=None, **kwargs):
175 |         if expected_filename is None:
176 |             expected_filename = self.get_expected_filename(filename, **kwargs)
177 | 
178 |         # run the command and make sure everything worked correctly
179 |         temp_filename = self.assertSuccessfulTextract(
180 |             filename,
181 |             cleanup=False,
182 |             **kwargs
183 |         )
184 | 
185 |         self.assertSuccessfulCommand(
186 |             "diff --ignore-blank-lines '%(temp_filename)s' '%(expected_filename)s'" % locals()
187 |         )
188 |         os.remove(temp_filename)
189 | 
190 |     def compare_python_output(self, filename, expected_filename=None, **kwargs):
191 |         if expected_filename is None:
192 |             expected_filename = self.get_expected_filename(filename, **kwargs)
193 | 
194 |         import textract
195 |         result = textract.process(filename, **kwargs)
196 |         with open(expected_filename, 'rb') as stream:
197 |             result = self.clean_text(result)
198 |             expected = self.clean_text(stream.read())
199 |             self.assertEqual(result, expected)
200 | 
201 | 
202 | class ShellParserTestCase(BaseParserTestCase):
203 |     """This ShellParserTestCase object is used to collect a bunch of
204 |     standardized tests that should be run for every ShellParser.
205 |     """
206 | 
207 |     def test_filename_spaces(self):
208 |         """Make sure filenames with spaces work on the command line"""
209 |         temp_filename = spaced_filename = self.get_temp_filename()
210 |         spaced_filename += " a filename with spaces." + self.extension
211 |         shutil.copyfile(self.raw_text_filename, spaced_filename)
212 |         self.compare_cli_output(
213 |             spaced_filename,
214 |             self.get_expected_filename(self.raw_text_filename),
215 |         )
216 |         os.remove(temp_filename)
217 |         os.remove(spaced_filename)
218 | 


--------------------------------------------------------------------------------
/tests/csv/raw_text.txt:
--------------------------------------------------------------------------------
 1 | CREATION DATE	STATUS	COMPLETION DATE	SERVICE REQUEST NUMBER	TYPE OF SERVICE REQUEST	CURRENT ACTIVITY	MOST RECENT ACTION	NUMBER OF POTHOLES FILLED ON BLOCK	STREET ADDRESS	ZIP	X COORDINATE	Y COORDINATE	Ward	Police District	Community Area	LATITUDE	LONGITUDE	LOCATION
 2 | 08/28/2014	Completed	08/28/2014	14-01433654	Pothole in Street	Final Outcome	Pothole Patched	5	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
 3 | 08/27/2014	Completed	08/27/2014	14-01424541	Pothole in Street	Final Outcome	Pothole Patched	1	100 N MICHIGAN AVE	60602	1177299.6140023	1900836.66107586	42	1	32	41.88328915138877	-87.62454939639862	(41.88328915138877, -87.62454939639862)
 4 | 08/27/2014	Completed	08/27/2014	14-01424527	Pothole in Street	Final Outcome	Pothole Patched	5	100 S MICHIGAN AVE	60603	1177324.39101512	1899960.94636756	42	1	32	41.88077066474826	-87.62448301713603	(41.88077066474826, -87.62448301713603)
 5 | 08/27/2014	Completed	08/27/2014	14-01424501	Pothole in Street	Final Outcome	Pothole Patched	5	200 S MICHIGAN AVE	60604	1177323.63491104	1899497.03462832	42	1	32	41.87948932869769	-87.6244981249339	(41.87948932869769, -87.6244981249339)
 6 | 08/27/2014	Completed	08/27/2014	14-01424389	Pothole in Street	Final Outcome	Pothole Patched	5	300 S MICHIGAN AVE	60604	1177339.54271824	1899032.66368812	2	1	32	41.87821734531058	-87.62445584029679	(41.87821734531058, -87.62445584029679)
 7 | 08/27/2014	Completed	08/27/2014	14-01424212	Pothole in Street	Final Outcome	Pothole Patched	15	400 S MICHIGAN AVE	60605	1177347.93260788	1898568.04117246	2	1	32	41.87693801152278	-87.6244366179451	(41.87693801152278, -87.6244366179451)
 8 | 08/25/2014	Completed	08/25/2014	14-01404817	Pothole in Street	Final Outcome	Pothole Patched	10	740 N WABASH AVE	60611	1176600.11893119	1905427.14160096	42	18	8	41.896091479927165	-87.6269829451334	(41.896091479927165, -87.6269829451334)
 9 | 08/22/2014	Completed	08/22/2014	14-01395263	Pothole in Street	Final Outcome	Pothole Patched	1	1137 W CHICAGO AVE	60642	1168573.58997114	1905510.8499928	27	12	24	41.89613260416578	-87.65667886202922	(41.89613260416578, -87.65667886202922)
10 | 08/22/2014	Completed	08/22/2014	14-01390538	Pothole in Street	Final Outcome	Pothole Patched	5	615 N FRANKLIN ST	60654	1174219.86039744	1904291.9608037	42	18	8	41.89291371668045	-87.63546140541867	(41.89291371668045, -87.63546140541867)
11 | 08/21/2014	Completed	08/28/2014	14-01383161	Pothole in Street	Final Outcome	Pothole Patched	5	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
12 | 08/12/2014	Completed	08/25/2014	14-01323742	Pothole in Street	Final Outcome	Pothole Patched	2	2744 W EVERGREEN AVE	60622	1157855.84623339	1908863.62028457	26	14	24	41.905779104276164	-87.69629062504274	(41.905779104276164, -87.69629062504274)
13 | 08/11/2014	Completed	08/25/2014	14-01317376	Pothole in Street	Final Outcome	Pothole Patched	4	2701 W HIRSCH ST	60622	1158141.76231382	1909202.67761659	26	14	24	41.9064904995143	-87.69453948271031	(41.9064904995143, -87.69453948271031)
14 | 08/07/2014	Completed	08/27/2014	14-01294373	Pothole in Street	Final Outcome	Pothole Patched	1	159 N DEARBORN ST	60601	1175915.29501332	1901390.11610965	42	1	32	41.884767887865465	-87.62932139824203	(41.884767887865465, -87.62932139824203)
15 | 08/06/2014	Completed	08/26/2014	14-01284763	Pothole in Street	Final Outcome	Pothole Patched	17	100 S WABASH AVE	60603	1176824.63149672	1899942.95872303	42	1	32	41.88072398164076	-87.62631635116725	(41.88072398164076, -87.62631635116725)
16 | 08/04/2014	Completed	08/25/2014	14-01264182	Pothole in Street	Final Outcome	Pothole Patched	3	201 E SUPERIOR ST	60611	1177708.98999045	1905405.56110885	42	18	8	41.8956500849415	-87.62265814875258	(41.8956500849415, -87.62265814875258)
17 | 08/04/2014	Completed	08/22/2014	14-01266469	Pothole in Street	Final Outcome	Pothole Patched	8	220 W SUPERIOR ST	60654	1174518.27808694	1905317.80798624	42	18	8	41.89569109218158	-87.63509877442043	(41.89569109218158, -87.63509877442043)
18 | 08/03/2014	Completed	08/22/2014	14-01258054	Pothole in Street	Final Outcome	Pothole Patched	8	600 N WABASH AVE	60611	1176632.67288564	1904207.6496666	42	18	8	41.892531148204995	-87.62689596447262	(41.892531148204995, -87.62689596447262)
19 | 07/29/2014	Completed	08/28/2014	14-01228358	Pothole in Street	Final Outcome	Pothole Patched	7	10 W ELM ST	60610	1176002.41378993	1908093.80450502	42	18	8	41.90328038833401	-87.62904275288018	(41.90328038833401, -87.62904275288018)
20 | 07/29/2014	Completed	08/25/2014	14-01224309	Pothole in Street	Final Outcome	Pothole Patched	15	201 E HURON ST	60611	1177717.86567145	1905118.00892518	42	18	8	41.89486671392807	-87.62262561802254	(41.89486671392807, -87.62262561802254)
21 | 07/28/2014	Completed	08/27/2014	14-01208070	Pothole in Street	Final Outcome	Pothole Patched	2	177 N WELLS ST	60606	1174702.03147832	1901552.59999636	42	1	32	41.88525719405359	-87.63376979601067	(41.88525719405359, -87.63376979601067)
22 | 07/26/2014	Completed	08/28/2014	14-01203517	Pothole in Street	Final Outcome	Pothole Patched	1	1 W MAPLE ST	60610	1176169.35729479	1907624.99998895	42	18	8	41.90177327913275	-87.62842192883082	(41.90177327913275, -87.62842192883082)
23 | 07/24/2014	Completed	08/22/2014	14-01190319	Pothole in Street	Final Outcome	Pothole Patched	3	730 N FRANKLIN ST	60654	1174191.71119404	1905308.4805198	42	18	8	41.89562283550215	-87.63582499692767	(41.89562283550215, -87.63582499692767)
24 | 07/23/2014	Completed	08/28/2014	14-01178461	Pothole in Street	Final Outcome	Pothole Patched	3	1155 N DEARBORN ST	60610	1175723.69360823	1908131.92908715	42	18	8	41.90344694644998	-87.62982489848619	(41.90344694644998, -87.62982489848619)
25 | 07/21/2014	Completed	08/22/2014	14-01161819	Pothole in Street	Final Outcome	Pothole Patched	3	600 E GRAND AVE	60611	1180764.02691294	1904055.10759636	42	18	8	41.892094136861786	-87.61156988394656	(41.892094136861786, -87.61156988394656)
26 | 07/18/2014	Completed	08/27/2014	14-01149914	Pothole in Street	Final Outcome	Pothole Patched	4	150 N FRANKLIN ST	60606	1174306.13841283	1901261.81180327	42	1	32	41.88449312034977	-87.63552722968024	(41.88449312034977, -87.63552722968024)
27 | 07/17/2014	Completed	08/22/2014	14-01134572	Pothole in Street	Final Outcome	Pothole Patched	2	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
28 | 07/14/2014	Completed	08/28/2014	14-01112602	Pothole in Street	Final Outcome	Pothole Patched	15	170 W OAK ST	60610	1174950.17410849	1907124.5048948	42	18	8	41.90065230620216	-87.63419240464901	(41.90065230620216, -87.63419240464901)
29 | 07/14/2014	Completed	08/22/2014	14-01114977	Pothole in Street	Final Outcome	Pothole Patched	3	461 N CITYFRONT PLAZA DR	60611	1177942.61391555	1903427.32649215	42	18	8	41.89012269030228	-87.62144215679457	(41.89012269030228, -87.62144215679457)
30 | 07/14/2014	Completed	08/27/2014	14-01114104	Pothole in Street	Final Outcome	Pothole Patched	13	462 N LAKE SHORE DR	60611	1180059.25771678	1903489.31249721	42	18	8	41.89043391381039	-87.61432635149913	(41.89043391381039, -87.61432635149913)
31 | 07/13/2014	Completed	08/22/2014	14-01106120	Pothole in Street	Final Outcome	Pothole Patched	2	37 E OHIO ST	60611	1176632.67288564	1904207.6496666	42	18	8	41.892389760220446	-87.6266700643506	(41.892389760220446, -87.6266700643506)
32 | 07/11/2014	Completed	08/27/2014	14-01092992	Pothole in Street	Final Outcome	Pothole Patched	0	517 N LAKE SHORE DR	60611	1180023.3999851	1903912.87000671	42	18	8	41.891747134984456	-87.61415723810663	(41.891747134984456, -87.61415723810663)
33 | 07/11/2014	Completed	08/28/2014	14-01092312	Pothole in Street	Final Outcome	Pothole Patched	2	1165 N LA SALLE DR	60610	1174923.84946801	1908163.01737071	42	18	8	41.90367107146938	-87.63276634924583	(41.90367107146938, -87.63276634924583)
34 | 07/09/2014	Completed	08/22/2014	14-01072126	Pothole in Street	Final Outcome	Pothole Patched	1	200 W OHIO ST	60654	1174634.07810009	1904152.08572945	42	18	8	41.892497422329804	-87.63416622539611	(41.892497422329804, -87.63416622539611)
35 | 07/08/2014	Completed	08/26/2014	14-01064944	Pothole in Street	Final Outcome	Pothole Patched	4	34 S STATE ST	60603	1176384.09122814	1900236.40989122	42	1	32	41.881194280111245	-87.62788715387323	(41.881194280111245, -87.62788715387323)
36 | 07/06/2014	Completed	08/22/2014	14-01050617	Pothole in Street	Final Outcome	Pothole Patched	5	430 N FRANKLIN ST	60654	1174247.46842821	1903265.17618927	42	18	8	41.890001134206734	-87.63568286112702	(41.890001134206734, -87.63568286112702)
37 | 06/30/2014	Completed	08/26/2014	14-01013381	Pothole in Street	Final Outcome	Pothole Patched	7	50 E JACKSON BLVD	60604	1176916.76437779	1899018.37548671	42	1	32	41.87836259235806	-87.62578989491375	(41.87836259235806, -87.62578989491375)
38 | 06/26/2014	Completed	08/27/2014	14-00992410	Pothole in Street	Final Outcome	Pothole Patched	6	201 N WELLS ST	60606	1174696.68801277	1901736.11738759	42	1	32	41.885758730147366	-87.63378404961261	(41.885758730147366, -87.63378404961261)
39 | 06/23/2014	Completed	08/22/2014	14-00961579	Pothole in Street	Final Outcome	Pothole Patched	1	340 W ERIE ST	60654	1173808.62199403	1904714.70549298	42	18	8	41.89405989499057	-87.63717643739275	(41.89405989499057, -87.63717643739275)
40 | 06/23/2014	Completed	08/26/2014	14-00967688	Pothole in Street	Final Outcome	Pothole Patched	12	140 S DEARBORN ST	60603	1175959.03010572	1899623.29000633	42	1	32	41.87967604011696	-87.62950228266759	(41.87967604011696, -87.62950228266759)
41 | 06/18/2014	Completed	08/27/2014	14-00933275	Pothole in Street	Final Outcome	Pothole Patched	15	211 N DEARBORN ST	60601	1175900.7237733	1901846.43477009	42	1	32	41.88604316092842	-87.62936052143365	(41.88604316092842, -87.62936052143365)
42 | 06/12/2014	Completed	08/27/2014	14-00897156	Pothole in Street	Final Outcome	Pothole Patched	42	200 S LA SALLE ST	60604	1175161.95537691	1899423.25992154	42	1	32	41.87936625147701	-87.63243763414764	(41.87936625147701, -87.63243763414764)
43 | 06/12/2014	Completed	08/22/2014	14-00890682	Pothole in Street	Final Outcome	Pothole Patched	1	306 W OHIO ST	60654	1174162.61920993	1904141.19087966	42	18	8	41.89247410656883	-87.63593436430777	(41.89247410656883, -87.63593436430777)
44 | 06/11/2014	Completed	08/28/2014	14-00887552	Pothole in Street	Final Outcome	Pothole Patched	4	1020 N STATE ST	60610	1176176.20887962	1907374.89001835	42	18	8	41.901200484161656	-87.62847652035359	(41.901200484161656, -87.62847652035359)
45 | 06/10/2014	Completed	08/27/2014	14-00880864	Pothole in Street	Final Outcome	Pothole Patched	0	245 W WASHINGTON ST	60606	1174540.03218467	1900805.68709955	42	1	32	41.883096426444126	-87.63515649580224	(41.883096426444126, -87.63515649580224)
46 | 06/04/2014	Completed	08/22/2014	14-00844247	Pothole in Street	Final Outcome	Pothole Patched	2	525 W SUPERIOR ST	60654	1172505.49257168	1905270.97268573	42	18	8	41.895379256528365	-87.64251783750353	(41.895379256528365, -87.64251783750353)
47 | 06/03/2014	Completed	08/26/2014	14-00829362	Pothole in Street	Final Outcome	Pothole Patched	2	100 S STATE ST	60603	1176400.89309128	1899922.58000955	42	1	32	41.88071513784339	-87.62787607841581	(41.88071513784339, -87.62787607841581)
48 | 06/01/2014	Completed	08/26/2014	14-00819596	Pothole in Street	Final Outcome	Pothole Patched	1	21 S DEARBORN ST	60603	1175943.73762726	1900281.08996842	42	1	32	41.88163394997373	-87.62924355913282	(41.88163394997373, -87.62924355913282)
49 | 05/31/2014	Completed	08/28/2014	14-00816655	Pothole in Street	Final Outcome	Pothole Patched	2	1250 N DEARBORN ST	60610	1175716.58493337	1908364.4279693	42	18	8	41.905379013702905	-87.63017455348202	(41.905379013702905, -87.63017455348202)
50 | 05/29/2014	Completed	08/22/2014	14-00805945	Pothole in Street	Final Outcome	Pothole Patched	10	738 N LARRABEE ST	60654	1172208.49919151	1905332.36197604	42	18	8	41.89589422031982	-87.64313907101959	(41.89589422031982, -87.64313907101959)
51 | 05/29/2014	Completed	08/26/2014	14-00802449	Pothole in Street	Final Outcome	Pothole Patched	6	33 S STATE ST	60603	1176383.89122814	1900241.30989122	42	1	32	41.8812023900134	-87.62759359231444	(41.8812023900134, -87.62759359231444)
52 | 05/28/2014	Completed	08/22/2014	14-00797786	Pothole in Street	Final Outcome	Pothole Patched	2	720 N FRANKLIN ST	60654	1174193.91119404	1905217.9805198	42	18	8	41.89553241941632	-87.63582582624329	(41.89553241941632, -87.63582582624329)
53 | 05/26/2014	Completed	08/28/2014	14-00780187	Pothole in Street	Final Outcome	Pothole Patched	15	12 W ELM ST	60610	1175983.31378993	1908093.10450502	42	18	8	41.90327840141334	-87.62912853335304	(41.90327840141334, -87.62912853335304)
54 | 05/21/2014	Completed	08/28/2014	14-00753021	Pothole in Street	Final Outcome	Pothole Patched	10	30 E ELM ST	60611	1176399.91378993	1908106.10450502	42	18	8	41.903312908929266	-87.6274232482672	(41.903312908929266, -87.6274232482672)
55 | 05/19/2014	Completed	08/27/2014	14-00736478	Pothole in Street	Final Outcome	Pothole Patched	10	520 E ILLINOIS ST	60611	1180045.15771678	1903729.01249721	42	18	8	41.891214491225654	-87.61421696505855	(41.891214491225654, -87.61421696505855)
56 | 04/08/2014	Completed	08/25/2014	14-00504762	Pothole in Street	Final Outcome	Pothole Patched	2	1300 N CALIFORNIA AVE	60622	1157492.02101534	1908524.16997455	26	14	24	41.904795555353125	-87.69707304441381	(41.904795555353125, -87.69707304441381)


--------------------------------------------------------------------------------
/tests/csv/standardized_text.csv:
--------------------------------------------------------------------------------
1 | the,quick
2 | brown,fox
3 | jumps over,the
4 | lazy,dog


--------------------------------------------------------------------------------
/tests/doc/raw_text.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/doc/raw_text.doc


--------------------------------------------------------------------------------
/tests/doc/raw_text.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | I love word documents. They are lovely. They make me so happy I could
 3 | smile. And that is why I wrote this package.
 4 | 
 5 | Sample text is hard. That is where http://hipsum.co comes in handy.
 6 | 
 7 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin pop-
 8 | up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer PBR&B.
 9 | Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh ethnic Marfa 90's
10 | kogi American Apparel. Shabby chic distillery church-key locavore beard,
11 | food truck chillwave sartorial deep v flannel authentic Tumblr narwhal kogi
12 | organic. Cred vegan jean shorts Banksy forage Neutra dreamcatcher, hashtag
13 | Bushwick polaroid pork belly flannel keytar Portland post-ironic. Cred
14 | hoodie vegan, food truck leggings Austin pour-over banjo trust fund before
15 | they sold out cray Intelligentsia plaid typewriter. Williamsburg XOXO plaid
16 | Carles Austin tofu.
17 | 
18 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn
19 | biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro,
20 | viral asymmetrical artisan bicycle rights bitters master cleanse
21 | Kickstarter YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS
22 | hashtag meh. Thundercats semiotics shabby chic forage single-origin coffee
23 | retro, 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard
24 | gluten-free seitan, VHS sartorial pork belly gastropub meh whatever
25 | authentic synth. Beard single-origin coffee irony fixie, before they sold
26 | out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan
27 | hashtag Brooklyn four loko fanny pack 90's mustache 8-bit.
28 | 
29 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice Banksy
30 | roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo booth, Wes
31 | Anderson skateboard Odd Future. Etsy art party Bushwick keffiyeh. Pork
32 | belly 3 wolf moon butcher mustache. YOLO raw denim lo-fi, hoodie gentrify
33 | Schlitz 8-bit sriracha Shoreditch retro brunch. Williamsburg farm-to-table
34 | beard, mlkshk Banksy fap kogi Etsy art party squid semiotics. XOXO church-
35 | key Pitchfork mlkshk irony tote bag.
36 | 
37 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic
38 | trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt
39 | chambray, leggings shabby chic gastropub YOLO plaid hoodie Williamsburg
40 | Godard mixtape. Retro Godard keytar biodiesel, freegan paleo Etsy you
41 | probably haven't heard of them Pitchfork Schlitz readymade small batch
42 | cred. Pug trust fund paleo, 90's fixie typewriter next level banjo. Banksy
43 | occupy authentic master cleanse Bushwick fingerstache selfies, direct trade
44 | craft beer cliche +1 cray. Locavore four loko biodiesel Neutra chia mlkshk.
45 | Fanny pack YOLO Portland, mlkshk PBR&B single-origin coffee drinking
46 | vinegar 8-bit flannel gentrify stumptown pop-up.
47 | Oh. You need a little dummy text for your mockup? How quaint.
48 | 
49 | I bet you are still using Bootstrap too
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/tests/doc/standardized_text.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/doc/standardized_text.doc


--------------------------------------------------------------------------------
/tests/doc/standardized_text_1.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/doc/standardized_text_1.odt


--------------------------------------------------------------------------------
/tests/docker_entry.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # Entry point called from within the Docker container. Its only action is
4 | # to execute ./tests/run.py, a path relative to the current directory.
5 | 
6 | ./tests/run.py
7 | 


--------------------------------------------------------------------------------
/tests/docx/paragraphs_and_tables.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/docx/paragraphs_and_tables.docx


--------------------------------------------------------------------------------
/tests/docx/paragraphs_and_tables.txt:
--------------------------------------------------------------------------------
 1 | This is the paragraph before the table. It should appear first.
 2 | 
 3 | 
 4 | 
 5 | Row 1, Column 1
 6 | 
 7 | Row 1, Column 2
 8 | 
 9 | Row 1, Column 3
10 | 
11 | Row 2, Column 1
12 | 
13 | Row 2, Column 2
14 | 
15 | Row 2, Column 3
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | INNER TABLE!
26 | 
27 | 
28 | 
29 | Inner 1,1
30 | 
31 | Inner 1,2
32 | 
33 | Inner 2,1
34 | 
35 | Inner 1,2
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | This is the paragraph after the table. It should appear last.


--------------------------------------------------------------------------------
/tests/docx/raw_text.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/docx/raw_text.docx


--------------------------------------------------------------------------------
/tests/docx/raw_text.txt:
--------------------------------------------------------------------------------
 1 | I love word documents. They are lovely. They make me so happy I could smile. And that’s why I wrote this package.
 2 | 
 3 | 
 4 | 
 5 | Sample text is hard. That’s where http://hipsum.co comes in handy.
 6 | 
 7 | 
 8 | 
 9 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh ethnic Marfa 90's kogi American Apparel. Shabby chic distillery church-key locavore beard, food truck chillwave sartorial deep v flannel authentic Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel keytar Portland post-ironic. Cred hoodie vegan, food truck leggings Austin pour-over banjo trust fund before they sold out cray Intelligentsia plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu.
10 | 
11 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag meh. Thundercats semiotics shabby chic forage single-origin coffee retro, 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard gluten-free seitan, VHS sartorial pork belly gastropub meh whatever authentic synth. Beard single-origin coffee irony fixie, before they sold out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan hashtag Brooklyn four loko fanny pack 90's mustache 8-bit.
12 | 
13 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lo-fi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag.
14 | 
15 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt chambray, leggings shabby chic gastropub YOLO plaid hoodie Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan paleo Etsy you probably haven't heard of them Pitchfork Schlitz readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter next level banjo. Banksy occupy authentic master cleanse Bushwick fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify stumptown pop-up.
16 | 
17 | Oh. You need a little dummy text for your mockup? How quaint.
18 | 
19 | I bet you’re still using Bootstrap too…


--------------------------------------------------------------------------------
/tests/docx/standardized_text.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/docx/standardized_text.docx


--------------------------------------------------------------------------------
/tests/eml/raw_text.eml:
--------------------------------------------------------------------------------
  1 | Return-path: <from@example.com>
  2 | Envelope-to: to@example.com
  3 | Delivery-date: Sat, 27 Mar 2010 12:11:32 +0000
  4 | Received: from mail by mail.example.com with local-bsmtp (Exim 4.69)
  5 | 	(envelope-from <from@example.com>)
  6 | 	id 1NvUrP-0002O1-C7
  7 | 	for to@example.com; Sat, 27 Mar 2010 12:11:32 +0000
  8 | X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on
  9 | 	mail.example.com
 10 | X-Spam-Level:
 11 | X-Spam-Status: No, score=-11.3 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00,
 12 | 	HTML_MESSAGE,MPART_ALT_DIFF autolearn=no version=3.2.5
 13 | Received: from cpc1-mort1-0-0-cust399.croy.cable.virginmedia.com ([82.44.61.144] helo=spike)
 14 | 	by mail.example.com with esmtpsa (TLS1.0:RSA_AES_128_CBC_SHA1:16)
 15 | 	(Exim 4.69)
 16 | 	(envelope-from <from@example.com>)
 17 | 	id 1NvUrP-0002Nw-6s
 18 | 	for to@example.com; Sat, 27 Mar 2010 12:11:27 +0000
 19 | From: "Example" <from@example.com>
 20 | To: <to@example.com>
 21 | Subject: test
 22 | Date: Sat, 27 Mar 2010 12:11:21 -0000
 23 | Message-ID: <118CE886A86E491FBC9BF5A1474EACD4@spike>
 24 | MIME-Version: 1.0
 25 | Content-Type: multipart/mixed;
 26 | 	boundary="----=_NextPart_000_006B_01CACDA6.A0401560"
 27 | X-Mailer: Microsoft Office Outlook 11
 28 | X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7600.16385
 29 | Thread-Index: AcrNpp0Z+/nuAPz4TiOf1ZIdke/vzQ==
 30 | 
 31 | This is a multi-part message in MIME format.
 32 | 
 33 | ------=_NextPart_000_006B_01CACDA6.A0401560
 34 | Content-Type: multipart/alternative;
 35 | 	boundary="----=_NextPart_001_006C_01CACDA6.A0401560"
 36 | 
 37 | 
 38 | ------=_NextPart_001_006C_01CACDA6.A0401560
 39 | Content-Type: text/plain;
 40 | 	charset="us-ascii"
 41 | Content-Transfer-Encoding: 7bit
 42 | 
 43 | test
 44 | 
 45 | 
 46 | ------=_NextPart_001_006C_01CACDA6.A0401560
 47 | Content-Type: text/html;
 48 | 	charset="us-ascii"
 49 | Content-Transfer-Encoding: quoted-printable
 50 | 
 51 | <html xmlns:v=3D"urn:schemas-microsoft-com:vml" =
 52 | xmlns:o=3D"urn:schemas-microsoft-com:office:office" =
 53 | xmlns:w=3D"urn:schemas-microsoft-com:office:word" =
 54 | xmlns=3D"http://www.w3.org/TR/REC-html40">
 55 | 
 56 | <head>
 57 | <META HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html; =
 58 | charset=3Dus-ascii">
 59 | <meta name=3DGenerator content=3D"Microsoft Word 11 (filtered medium)">
 60 | <style>
 61 | <!--
 62 |  /* Style Definitions */
 63 |  p.MsoNormal, li.MsoNormal, div.MsoNormal
 64 | 	{margin:0cm;
 65 | 	margin-bottom:.0001pt;
 66 | 	font-size:12.0pt;
 67 | 	font-family:Arial;}
 68 | a:link, span.MsoHyperlink
 69 | 	{color:blue;
 70 | 	text-decoration:underline;}
 71 | a:visited, span.MsoHyperlinkFollowed
 72 | 	{color:purple;
 73 | 	text-decoration:underline;}
 74 | span.EmailStyle17
 75 | 	{mso-style-type:personal-compose;
 76 | 	font-family:Arial;
 77 | 	color:windowtext;}
 78 | @page Section1
 79 | 	{size:595.3pt 841.9pt;
 80 | 	margin:72.0pt 90.0pt 72.0pt 90.0pt;}
 81 | div.Section1
 82 | 	{page:Section1;}
 83 | -->
 84 | </style>
 85 | <!--[if gte mso 9]><xml>
 86 |  <o:shapedefaults v:ext=3D"edit" spidmax=3D"1026" />
 87 | </xml><![endif]--><!--[if gte mso 9]><xml>
 88 |  <o:shapelayout v:ext=3D"edit">
 89 |   <o:idmap v:ext=3D"edit" data=3D"1" />
 90 |  </o:shapelayout></xml><![endif]-->
 91 | </head>
 92 | 
 93 | <body lang=3DEN-GB link=3Dblue vlink=3Dpurple>
 94 | 
 95 | <div class=3DSection1>
 96 | 
 97 | <p class=3DMsoNormal><font size=3D2 face=3DArial><span =
 98 | style=3D'font-size:10.0pt'>test<o:p></o:p></span></font></p>
 99 | 
100 | </div>
101 | 
102 | </body>
103 | 
104 | </html>
105 | 
106 | ------=_NextPart_001_006C_01CACDA6.A0401560--
107 | 
108 | ------=_NextPart_000_006B_01CACDA6.A0401560
109 | Content-Type: message/rfc822
110 | Content-Transfer-Encoding: 7bit
111 | Content-Disposition: attachment
112 | 
113 | Received: from mail by mail.example.com with local-bsmtp (Exim 4.69)
114 | 	(envelope-from <from@example.com>)
115 | 	id 1NvUpr-0002NA-Sp
116 | 	for from@example.com; Sat, 27 Mar 2010 12:09:54 +0000
117 | Received: from cpc1-mort1-0-0-cust399.croy.cable.virginmedia.com ([82.44.61.144] helo=spike)
118 | 	by mail.example.com with esmtpsa (TLS1.0:RSA_AES_128_CBC_SHA1:16)
119 | 	(Exim 4.69)
120 | 	(envelope-from <from@example.com>)
121 | 	id 1NvUpr-0002N7-Ob
122 | 	for from@example.com; Sat, 27 Mar 2010 12:09:51 +0000
123 | Return-Path: <from@example.com>
124 | From: "Example" <from@example.com>
125 | To: <to@example.com>
126 | Subject: test
127 | Date: Sat, 27 Mar 2010 12:09:46 -0000
128 | Message-ID: <63850E3677964844B666AE60AFB35F6C@spike>
129 | MIME-Version: 1.0
130 | Content-Type: multipart/alternative;
131 | 	boundary="----=_NextPart_000_0067_01CACDA6.A0401560"
132 | X-Mailer: Microsoft Office Outlook 11
133 | X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) onmail.example.com
134 | X-Spam-Level:
135 | X-Spam-Status: No, score=-10.5 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00,TVD_SPACE_RATIO autolearn=no version=3.2.5
136 | X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7600.16385
137 | Thread-Index: AcrNpjn33LNJQ9EnTDiluk/gD2wIxw==
138 | 
139 | This is a multi-part message in MIME format.
140 | 
141 | ------=_NextPart_000_0067_01CACDA6.A0401560
142 | Content-Type: text/plain;
143 | 	charset="us-ascii"
144 | Content-Transfer-Encoding: 7bit
145 | 
146 | test
147 | 
148 | ------=_NextPart_000_0067_01CACDA6.A0401560
149 | Content-Type: text/html;
150 | 	charset="us-ascii"
151 | Content-Transfer-Encoding: quoted-printable
152 | 
153 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
154 | <HTML>
155 | <HEAD>
156 | <META HTTP-EQUIV=3D"Content-Type" CONTENT=3D"text/html; =
157 | charset=3Dus-ascii">
158 | <META NAME=3D"Generator" CONTENT=3D"MS Exchange Server version =
159 | 6.5.7036.0">
160 | <TITLE>test</TITLE>
161 | </HEAD>
162 | <BODY>
163 | <!-- Converted from text/plain format -->
164 | 
165 | <P><FONT SIZE=3D2>test</FONT>
166 | </P>
167 | 
168 | </BODY>
169 | </HTML>
170 | ------=_NextPart_000_0067_01CACDA6.A0401560--
171 | 
172 | ------=_NextPart_000_006B_01CACDA6.A0401560--
173 | 
174 | 


--------------------------------------------------------------------------------
/tests/eml/raw_text.txt:
--------------------------------------------------------------------------------
1 | test
2 | 
3 | 
4 | 
5 | test
6 | 


--------------------------------------------------------------------------------
/tests/eml/standardized_text.eml:
--------------------------------------------------------------------------------
 1 | From nobody Mon Aug 25 13:47:30 2014
 2 | Content-Type: text/plain; charset="us-ascii"
 3 | MIME-Version: 1.0
 4 | Content-Transfer-Encoding: 7bit
 5 | Subject: test
 6 | From: me@example.com
 7 | To: you@example.com
 8 | 
 9 | the quick brown fox
10 | jumps over the lazy dog


--------------------------------------------------------------------------------
/tests/epub/raw_text.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/epub/raw_text.epub


--------------------------------------------------------------------------------
/tests/epub/raw_text.txt:
--------------------------------------------------------------------------------
 1 | Epub testing
 2 | With subtitle...
 3 | Introduction
 4 | Welcome here! All the text have ben generate with the Samuel L lorem ipsum.
 5 | We happy?
 6 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
 7 | We happy?
 8 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die.
 9 | Oh... what I'm gon' do?
10 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
11 | No man, I don't eat pork
12 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
13 | Is she dead, yes or no?
14 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
15 | We happy?
16 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
17 | We happy?
18 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die.
19 | Oh... what I'm gon' do?
20 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
21 | No man, I don't eat pork
22 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
23 | Is she dead, yes or no?
24 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
25 | We happy?
26 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
27 | We happy?
28 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die.
29 | Oh... what I'm gon' do?
30 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
31 | No man, I don't eat pork
32 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
33 | Is she dead, yes or no?
34 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
35 | We happy?
36 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
37 | We happy?
38 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die.
39 | Oh... what I'm gon' do?
40 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
41 | No man, I don't eat pork
42 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
43 | Is she dead, yes or no?
44 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
45 | We happy?
46 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
47 | We happy?
48 | The lysine contingency - it's intended to prevent the spread of the animals is case they ever got off the island. Dr. Wu inserted a gene that makes a single faulty enzyme in protein metabolism. The animals can't manufacture the amino acid lysine. Unless they're continually supplied with lysine by us, they'll slip into a coma and die.
49 | Oh... what I'm gon' do?
50 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
51 | No man, I don't eat pork
52 | Well, the way they make shows is, they make one show. That show's called a pilot. Then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. Some pilots get picked and become television programs. Some don't, become nothing. She starred in one of the ones that became nothing.
53 | Is she dead, yes or no?
54 | The path of the righteous man is beset on all sides by the iniquities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who would attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.
55 | 


--------------------------------------------------------------------------------
/tests/epub/standardized_text.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/epub/standardized_text.epub


--------------------------------------------------------------------------------
/tests/gif/raw_text.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/gif/raw_text.gif


--------------------------------------------------------------------------------
/tests/gif/standardized_text.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/gif/standardized_text.gif


--------------------------------------------------------------------------------
/tests/html/standardized_text.html:
--------------------------------------------------------------------------------
1 | <html>
2 |   <body>
3 |     <h1>the quick</h1>
4 |     <p>
5 |       brown <b>fox</b> jumps over the lazy dog
6 |     </p>
7 |   </body>
8 | </html>
9 | 


--------------------------------------------------------------------------------
/tests/jpg/raw_text.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/jpg/raw_text.jpg


--------------------------------------------------------------------------------
/tests/jpg/standardized_text.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/jpg/standardized_text.jpg


--------------------------------------------------------------------------------
/tests/json/raw_text.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "hey":"hello world this is some text from a json document", 
 3 |   "should_ignore_this_number": 42, 
 4 |   "should_extract_one_two_three": {
 5 |     "1": "one",
 6 |     "2": "two",
 7 |     "3": "three",
 8 |     "should_be_abcdef": ["a", "b", "c", ["d","e"], {"dict":"f"}]
 9 |   },
10 |   "finished": "fine"
11 | }


--------------------------------------------------------------------------------
/tests/json/raw_text.txt:
--------------------------------------------------------------------------------
1 | fine hello world this is some text from a json document one two three a b c d e  f     


--------------------------------------------------------------------------------
/tests/json/standardized_text.json:
--------------------------------------------------------------------------------
1 | {
2 |   "a":"the quick brown fox", 
3 |   "aa": "jumps",
4 |   "b": "over",
5 |   "z": "the lazy dog"
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/mp3/raw_text-m=google.txt:
--------------------------------------------------------------------------------
1 | Everything Is Awesome
2 | 


--------------------------------------------------------------------------------
/tests/mp3/raw_text-m=sphinx.txt:
--------------------------------------------------------------------------------
1 | everything is awesome
2 | 


--------------------------------------------------------------------------------
/tests/mp3/raw_text.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/mp3/raw_text.mp3


--------------------------------------------------------------------------------
/tests/mp3/raw_text.txt:
--------------------------------------------------------------------------------
1 | Everything Is Awesome
2 | 


--------------------------------------------------------------------------------
/tests/mp3/standardized_text.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/mp3/standardized_text.mp3


--------------------------------------------------------------------------------
/tests/msg/raw_text.msg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/msg/raw_text.msg


--------------------------------------------------------------------------------
/tests/msg/raw_text.txt:
--------------------------------------------------------------------------------
 1 | Test for TIF files
 2 | 
 3 | This is a test email to experiment with the MS Outlook MSG Extractor
 4 | 
 5 | 
 6 | -- 
 7 | 
 8 | 
 9 | Kind regards
10 | 
11 | 
12 | 
13 | 
14 | Brian Zhou
15 | 
16 | 


--------------------------------------------------------------------------------
/tests/msg/standardized_text.msg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/msg/standardized_text.msg


--------------------------------------------------------------------------------
/tests/no_ext/docx_paragraphs_and_tables:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/no_ext/docx_paragraphs_and_tables


--------------------------------------------------------------------------------
/tests/no_ext/msg_standardized_text:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/no_ext/msg_standardized_text


--------------------------------------------------------------------------------
/tests/no_ext/pdf_standardized_text:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/no_ext/pdf_standardized_text


--------------------------------------------------------------------------------
/tests/odt/raw_text.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/odt/raw_text.odt


--------------------------------------------------------------------------------
/tests/odt/raw_text.txt:
--------------------------------------------------------------------------------
1 | Sample OppenOffice Writer file with		tabs and     multiple spaces
2 | 


--------------------------------------------------------------------------------
/tests/odt/standardized_text.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/odt/standardized_text.odt


--------------------------------------------------------------------------------
/tests/ogg/raw_text.ogg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/ogg/raw_text.ogg


--------------------------------------------------------------------------------
/tests/ogg/raw_text.txt:
--------------------------------------------------------------------------------
1 | Everything Is Awesome
2 | 


--------------------------------------------------------------------------------
/tests/ogg/standardized_text.ogg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/ogg/standardized_text.ogg


--------------------------------------------------------------------------------
/tests/pdf/ocr_text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/ocr_text.pdf


--------------------------------------------------------------------------------
/tests/pdf/raw_text-m=pdfminer.txt:
--------------------------------------------------------------------------------
 1 | I	  love	  word	  documents.	  They	  are	  lovely.	  They	  make	  me	  so	  happy	  I	  could	  smile.	  And	  
 2 | that’s	  why	  I	  wrote	  this	  package.	  
 3 | 	  
 4 | 
 5 | Sample text is hard. That’s 
 6 | where http://hipsum.co comes 
 7 | in handy. 
 8 | 
 9 | 	  
10 | 
11 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin 
12 | pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer 
13 | PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh 
14 | ethnic Marfa 90's kogi American Apparel. Shabby chic distillery church-
15 | key locavore beard, food truck chillwave sartorial deep v flannel authentic 
16 | Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage 
17 | Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel 
18 | keytar Portland post-ironic. Cred hoodie vegan, food truck leggings 
19 | Austin pour-over banjo trust fund before they sold out cray Intelligentsia 
20 | plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu. 
21 | 
22 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn 
23 | biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, 
24 | viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter 
25 | YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag 
26 | meh. Thundercats semiotics shabby chic forage single-origin coffee retro, 
27 | 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard 
28 | gluten-free seitan, VHS sartorial pork belly gastropub meh whatever 
29 | authentic synth. Beard single-origin coffee irony fixie, before they sold 
30 | 
31 | out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan 
32 | hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. 
33 | 
34 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice 
35 | Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo 
36 | booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick 
37 | keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lo-
38 | fi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. 
39 | Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party 
40 | squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag. 
41 | 
42 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic 
43 | trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt 
44 | chambray, leggings shabby chic gastropub YOLO plaid hoodie 
45 | Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan 
46 | paleo Etsy you probably haven't heard of them Pitchfork Schlitz 
47 | readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter 
48 | next level banjo. Banksy occupy authentic master cleanse Bushwick 
49 | fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four 
50 | loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk 
51 | PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify 
52 | stumptown pop-up. 
53 | Oh. You need a little dummy text for your mockup? How quaint. 
54 | 
55 | I bet you’re still using Bootstrap too… 
56 | 
57 | 	  
58 | 
59 | 


--------------------------------------------------------------------------------
/tests/pdf/raw_text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/raw_text.pdf


--------------------------------------------------------------------------------
/tests/pdf/raw_text.txt:
--------------------------------------------------------------------------------
 1 | I	  love	  word	  documents.	  They	  are	  lovely.	  They	  make	  me	  so	  happy	  I	  could	  smile.	  And	  
 2 | that’s	  why	  I	  wrote	  this	  package.	  
 3 | 	  
 4 | 
 5 | Sample text is hard. That’s
 6 | where http://hipsum.co comes
 7 | in handy.
 8 | 	  
 9 | 
10 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin
11 | pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer
12 | PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh
13 | ethnic Marfa 90's kogi American Apparel. Shabby chic distillery churchkey locavore beard, food truck chillwave sartorial deep v flannel authentic
14 | Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage
15 | Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel
16 | keytar Portland post-ironic. Cred hoodie vegan, food truck leggings
17 | Austin pour-over banjo trust fund before they sold out cray Intelligentsia
18 | plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu.
19 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn
20 | biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro,
21 | viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter
22 | YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag
23 | meh. Thundercats semiotics shabby chic forage single-origin coffee retro,
24 | 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard
25 | gluten-free seitan, VHS sartorial pork belly gastropub meh whatever
26 | authentic synth. Beard single-origin coffee irony fixie, before they sold
27 | 
28 | out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan
29 | hashtag Brooklyn four loko fanny pack 90's mustache 8-bit.
30 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice
31 | Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo
32 | booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick
33 | keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lofi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch.
34 | Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party
35 | squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag.
36 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic
37 | trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt
38 | chambray, leggings shabby chic gastropub YOLO plaid hoodie
39 | Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan
40 | paleo Etsy you probably haven't heard of them Pitchfork Schlitz
41 | readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter
42 | next level banjo. Banksy occupy authentic master cleanse Bushwick
43 | fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four
44 | loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk
45 | PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify
46 | stumptown pop-up.
47 | Oh. You need a little dummy text for your mockup? How quaint.
48 | I bet you’re still using Bootstrap too…
49 | 	  
50 | 
51 | 


--------------------------------------------------------------------------------
/tests/pdf/standardized_text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/standardized_text.pdf


--------------------------------------------------------------------------------
/tests/pdf/two_column.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pdf/two_column.pdf


--------------------------------------------------------------------------------
/tests/png/raw_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/png/raw_text.png


--------------------------------------------------------------------------------
/tests/png/standardized_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/png/standardized_text.png


--------------------------------------------------------------------------------
/tests/pptx/raw_text.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pptx/raw_text.pptx


--------------------------------------------------------------------------------
/tests/pptx/raw_text.txt:
--------------------------------------------------------------------------------
  1 | I love 
  2 | 
  3 | PowerPoint. A lot.
  4 | 
  5 | It is 
  6 | 
  7 | lovely. It makes me so happy I could smile. And that’s why I wrote this package.
  8 | 
  9 | hipsum
 10 | 
 11 | Viral Godard typewriter Pitchfork Blue Bottle asymmetrical gluten-free, forage hoodie. High Life ethnic biodiesel banjo you probably haven't heard of them skateboard. Swag forage art party, Marfa 
 12 | 
 13 | yr
 14 | 
 15 |  
 16 | 
 17 | sriracha
 18 | 
 19 |  before they sold out PBR drinking vinegar Blue Bottle gluten-free. Lo-fi single-origin coffee bicycle rights, ennui 
 20 | 
 21 | selfies
 22 | 
 23 |  
 24 | 
 25 | fap
 26 | 
 27 |  
 28 | 
 29 | paleo
 30 | 
 31 |  
 32 | 
 33 | Etsy
 34 | 
 35 |  
 36 | 
 37 | Bushwick
 38 | 
 39 |  ethical. Vice authentic vinyl 3 wolf moon, pour-over tousled Marfa 
 40 | 
 41 | fingerstache
 42 | 
 43 |  readymade blog ethical squid. Echo Park mustache beard, crucifix 
 44 | 
 45 | fingerstache
 46 | 
 47 |  bitters 
 48 | 
 49 | normcore
 50 | 
 51 |  Blue Bottle +1. Skateboard bicycle rights XOXO, literally Vice put a bird on it VHS craft beer.
 52 | 
 53 | Hipsum
 54 | 
 55 |  II
 56 | 
 57 | Lo-fi fashion axe Godard food truck 3 wolf moon Echo Park, 
 58 | 
 59 | normcore
 60 | 
 61 |  raw denim kitsch narwhal 
 62 | 
 63 | mixtape
 64 | 
 65 | . Brunch 
 66 | 
 67 | cray
 68 | 
 69 |  
 70 | 
 71 | scenester
 72 | 
 73 |  selvage 
 74 | 
 75 | chillwave
 76 | 
 77 | , pickled ennui irony ugh salvia beard Schlitz wayfarers kitsch 
 78 | 
 79 | Etsy
 80 | 
 81 | . Brooklyn beard messenger bag tote bag Wes Anderson. 
 82 | 
 83 | Keffiyeh
 84 | 
 85 |  cardigan Blue Bottle Schlitz 
 86 | 
 87 | chillwave
 88 | 
 89 | . Twee 
 90 | 
 91 | Kickstarter
 92 | 
 93 |  
 94 | 
 95 | hella
 96 | 
 97 |  
 98 | 
 99 | banh
100 | 
101 |  mi beard, 
102 | 
103 | Carles
104 | 
105 |  
106 | 
107 | keytar
108 | 
109 |  
110 | 
111 | mumblecore
112 | 
113 |  fanny pack 
114 | 
115 | selfies
116 | 
117 |  
118 | 
119 | Tonx
120 | 
121 |  bitters 
122 | 
123 | mixtape
124 | 
125 |  
126 | 
127 | gastropub
128 | 
129 | . Vegan 
130 | 
131 | Neutra
132 | 
133 |  small batch 
134 | 
135 | keffiyeh
136 | 
137 | , 
138 | 
139 | normcore
140 | 
141 |  single-origin coffee meh pickled High Life 
142 | 
143 | meggings
144 | 
145 |  hoodie next level 
146 | 
147 | fingerstache
148 | 
149 |  church-key. Jean shorts wayfarers messenger bag kitsch banjo art party single-origin coffee.


--------------------------------------------------------------------------------
/tests/pptx/standardized_text.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/pptx/standardized_text.pptx


--------------------------------------------------------------------------------
/tests/ps/raw_text.ps:
--------------------------------------------------------------------------------
 1 | %!PS-Adobe-1.0
 2 | 
 3 | newpath  
 4 | 50  50      moveto
 5 | 50  300     lineto     % absolute coordinates
 6 | 400 300     lineto
 7 | 400  50     lineto  
 8 | closepath  
 9 | stroke     
10 | 
11 | newpath  
12 | 0.5 setgray
13 | 100 100 moveto  
14 | 100   0 rlineto         % relative coordinates 
15 | 0   100 rlineto 
16 | -100  0 rlineto 
17 | closepath       
18 | fill                 
19 | 
20 | /Times-Roman findfont  
21 | 20 scalefont     
22 | setfont         
23 | 
24 | newpath    
25 | 0.0 setgray
26 | 72 72 moveto     
27 | (How exciting!) show  
28 | 
29 | gsave
30 |   200 200 moveto
31 |   0.5 1 scale       
32 |   (Narrow Text) show  
33 | grestore
34 | 
35 | 0 2 360 {   
36 |     newpath
37 |     gsave
38 |       rotate
39 |       300 300 moveto
40 |       100 0   rlineto
41 |       stroke
42 |     grestore
43 | } for
44 | 
45 | gsave   
46 | /Times-Roman findfont 60 scalefont setfont
47 | 72 72 moveto (Clipping) true charpath 
48 | clip     
49 | 174 72 translate
50 | 0 2 360 {                 % every 2. degree of 360 degrees
51 |     newpath
52 |     gsave
53 |       rotate               
54 |       0 0 moveto   
55 |       300 0 rlineto
56 |       stroke 
57 |     grestore
58 |   } for
59 | grestore 
60 | 
61 | 
62 | showpage 
63 | 


--------------------------------------------------------------------------------
/tests/ps/raw_text.txt:
--------------------------------------------------------------------------------
1 |            Narrow Text
2 |     How exciting!
3 | 


--------------------------------------------------------------------------------
/tests/psv/raw_text.txt:
--------------------------------------------------------------------------------
 1 | CREATION DATE	STATUS	COMPLETION DATE	SERVICE REQUEST NUMBER	TYPE OF SERVICE REQUEST	CURRENT ACTIVITY	MOST RECENT ACTION	NUMBER OF POTHOLES FILLED ON BLOCK	STREET ADDRESS	ZIP	X COORDINATE	Y COORDINATE	Ward	Police District	Community Area	LATITUDE	LONGITUDE	LOCATION
 2 | 08/28/2014	Completed	08/28/2014	14-01433654	Pothole in Street	Final Outcome	Pothole Patched	5	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
 3 | 08/27/2014	Completed	08/27/2014	14-01424541	Pothole in Street	Final Outcome	Pothole Patched	1	100 N MICHIGAN AVE	60602	1177299.6140023	1900836.66107586	42	1	32	41.88328915138877	-87.62454939639862	(41.88328915138877, -87.62454939639862)
 4 | 08/27/2014	Completed	08/27/2014	14-01424527	Pothole in Street	Final Outcome	Pothole Patched	5	100 S MICHIGAN AVE	60603	1177324.39101512	1899960.94636756	42	1	32	41.88077066474826	-87.62448301713603	(41.88077066474826, -87.62448301713603)
 5 | 08/27/2014	Completed	08/27/2014	14-01424501	Pothole in Street	Final Outcome	Pothole Patched	5	200 S MICHIGAN AVE	60604	1177323.63491104	1899497.03462832	42	1	32	41.87948932869769	-87.6244981249339	(41.87948932869769, -87.6244981249339)
 6 | 08/27/2014	Completed	08/27/2014	14-01424389	Pothole in Street	Final Outcome	Pothole Patched	5	300 S MICHIGAN AVE	60604	1177339.54271824	1899032.66368812	2	1	32	41.87821734531058	-87.62445584029679	(41.87821734531058, -87.62445584029679)
 7 | 08/27/2014	Completed	08/27/2014	14-01424212	Pothole in Street	Final Outcome	Pothole Patched	15	400 S MICHIGAN AVE	60605	1177347.93260788	1898568.04117246	2	1	32	41.87693801152278	-87.6244366179451	(41.87693801152278, -87.6244366179451)
 8 | 08/25/2014	Completed	08/25/2014	14-01404817	Pothole in Street	Final Outcome	Pothole Patched	10	740 N WABASH AVE	60611	1176600.11893119	1905427.14160096	42	18	8	41.896091479927165	-87.6269829451334	(41.896091479927165, -87.6269829451334)
 9 | 08/22/2014	Completed	08/22/2014	14-01395263	Pothole in Street	Final Outcome	Pothole Patched	1	1137 W CHICAGO AVE	60642	1168573.58997114	1905510.8499928	27	12	24	41.89613260416578	-87.65667886202922	(41.89613260416578, -87.65667886202922)
10 | 08/22/2014	Completed	08/22/2014	14-01390538	Pothole in Street	Final Outcome	Pothole Patched	5	615 N FRANKLIN ST	60654	1174219.86039744	1904291.9608037	42	18	8	41.89291371668045	-87.63546140541867	(41.89291371668045, -87.63546140541867)
11 | 08/21/2014	Completed	08/28/2014	14-01383161	Pothole in Street	Final Outcome	Pothole Patched	5	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
12 | 08/12/2014	Completed	08/25/2014	14-01323742	Pothole in Street	Final Outcome	Pothole Patched	2	2744 W EVERGREEN AVE	60622	1157855.84623339	1908863.62028457	26	14	24	41.905779104276164	-87.69629062504274	(41.905779104276164, -87.69629062504274)
13 | 08/11/2014	Completed	08/25/2014	14-01317376	Pothole in Street	Final Outcome	Pothole Patched	4	2701 W HIRSCH ST	60622	1158141.76231382	1909202.67761659	26	14	24	41.9064904995143	-87.69453948271031	(41.9064904995143, -87.69453948271031)
14 | 08/07/2014	Completed	08/27/2014	14-01294373	Pothole in Street	Final Outcome	Pothole Patched	1	159 N DEARBORN ST	60601	1175915.29501332	1901390.11610965	42	1	32	41.884767887865465	-87.62932139824203	(41.884767887865465, -87.62932139824203)
15 | 08/06/2014	Completed	08/26/2014	14-01284763	Pothole in Street	Final Outcome	Pothole Patched	17	100 S WABASH AVE	60603	1176824.63149672	1899942.95872303	42	1	32	41.88072398164076	-87.62631635116725	(41.88072398164076, -87.62631635116725)
16 | 08/04/2014	Completed	08/25/2014	14-01264182	Pothole in Street	Final Outcome	Pothole Patched	3	201 E SUPERIOR ST	60611	1177708.98999045	1905405.56110885	42	18	8	41.8956500849415	-87.62265814875258	(41.8956500849415, -87.62265814875258)
17 | 08/04/2014	Completed	08/22/2014	14-01266469	Pothole in Street	Final Outcome	Pothole Patched	8	220 W SUPERIOR ST	60654	1174518.27808694	1905317.80798624	42	18	8	41.89569109218158	-87.63509877442043	(41.89569109218158, -87.63509877442043)
18 | 08/03/2014	Completed	08/22/2014	14-01258054	Pothole in Street	Final Outcome	Pothole Patched	8	600 N WABASH AVE	60611	1176632.67288564	1904207.6496666	42	18	8	41.892531148204995	-87.62689596447262	(41.892531148204995, -87.62689596447262)
19 | 07/29/2014	Completed	08/28/2014	14-01228358	Pothole in Street	Final Outcome	Pothole Patched	7	10 W ELM ST	60610	1176002.41378993	1908093.80450502	42	18	8	41.90328038833401	-87.62904275288018	(41.90328038833401, -87.62904275288018)
20 | 07/29/2014	Completed	08/25/2014	14-01224309	Pothole in Street	Final Outcome	Pothole Patched	15	201 E HURON ST	60611	1177717.86567145	1905118.00892518	42	18	8	41.89486671392807	-87.62262561802254	(41.89486671392807, -87.62262561802254)
21 | 07/28/2014	Completed	08/27/2014	14-01208070	Pothole in Street	Final Outcome	Pothole Patched	2	177 N WELLS ST	60606	1174702.03147832	1901552.59999636	42	1	32	41.88525719405359	-87.63376979601067	(41.88525719405359, -87.63376979601067)
22 | 07/26/2014	Completed	08/28/2014	14-01203517	Pothole in Street	Final Outcome	Pothole Patched	1	1 W MAPLE ST	60610	1176169.35729479	1907624.99998895	42	18	8	41.90177327913275	-87.62842192883082	(41.90177327913275, -87.62842192883082)
23 | 07/24/2014	Completed	08/22/2014	14-01190319	Pothole in Street	Final Outcome	Pothole Patched	3	730 N FRANKLIN ST	60654	1174191.71119404	1905308.4805198	42	18	8	41.89562283550215	-87.63582499692767	(41.89562283550215, -87.63582499692767)
24 | 07/23/2014	Completed	08/28/2014	14-01178461	Pothole in Street	Final Outcome	Pothole Patched	3	1155 N DEARBORN ST	60610	1175723.69360823	1908131.92908715	42	18	8	41.90344694644998	-87.62982489848619	(41.90344694644998, -87.62982489848619)
25 | 07/21/2014	Completed	08/22/2014	14-01161819	Pothole in Street	Final Outcome	Pothole Patched	3	600 E GRAND AVE	60611	1180764.02691294	1904055.10759636	42	18	8	41.892094136861786	-87.61156988394656	(41.892094136861786, -87.61156988394656)
26 | 07/18/2014	Completed	08/27/2014	14-01149914	Pothole in Street	Final Outcome	Pothole Patched	4	150 N FRANKLIN ST	60606	1174306.13841283	1901261.81180327	42	1	32	41.88449312034977	-87.63552722968024	(41.88449312034977, -87.63552722968024)
27 | 07/17/2014	Completed	08/22/2014	14-01134572	Pothole in Street	Final Outcome	Pothole Patched	2	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
28 | 07/14/2014	Completed	08/28/2014	14-01112602	Pothole in Street	Final Outcome	Pothole Patched	15	170 W OAK ST	60610	1174950.17410849	1907124.5048948	42	18	8	41.90065230620216	-87.63419240464901	(41.90065230620216, -87.63419240464901)
29 | 07/14/2014	Completed	08/22/2014	14-01114977	Pothole in Street	Final Outcome	Pothole Patched	3	461 N CITYFRONT PLAZA DR	60611	1177942.61391555	1903427.32649215	42	18	8	41.89012269030228	-87.62144215679457	(41.89012269030228, -87.62144215679457)
30 | 07/14/2014	Completed	08/27/2014	14-01114104	Pothole in Street	Final Outcome	Pothole Patched	13	462 N LAKE SHORE DR	60611	1180059.25771678	1903489.31249721	42	18	8	41.89043391381039	-87.61432635149913	(41.89043391381039, -87.61432635149913)
31 | 07/13/2014	Completed	08/22/2014	14-01106120	Pothole in Street	Final Outcome	Pothole Patched	2	37 E OHIO ST	60611	1176632.67288564	1904207.6496666	42	18	8	41.892389760220446	-87.6266700643506	(41.892389760220446, -87.6266700643506)
32 | 07/11/2014	Completed	08/27/2014	14-01092992	Pothole in Street	Final Outcome	Pothole Patched	0	517 N LAKE SHORE DR	60611	1180023.3999851	1903912.87000671	42	18	8	41.891747134984456	-87.61415723810663	(41.891747134984456, -87.61415723810663)
33 | 07/11/2014	Completed	08/28/2014	14-01092312	Pothole in Street	Final Outcome	Pothole Patched	2	1165 N LA SALLE DR	60610	1174923.84946801	1908163.01737071	42	18	8	41.90367107146938	-87.63276634924583	(41.90367107146938, -87.63276634924583)
34 | 07/09/2014	Completed	08/22/2014	14-01072126	Pothole in Street	Final Outcome	Pothole Patched	1	200 W OHIO ST	60654	1174634.07810009	1904152.08572945	42	18	8	41.892497422329804	-87.63416622539611	(41.892497422329804, -87.63416622539611)
35 | 07/08/2014	Completed	08/26/2014	14-01064944	Pothole in Street	Final Outcome	Pothole Patched	4	34 S STATE ST	60603	1176384.09122814	1900236.40989122	42	1	32	41.881194280111245	-87.62788715387323	(41.881194280111245, -87.62788715387323)
36 | 07/06/2014	Completed	08/22/2014	14-01050617	Pothole in Street	Final Outcome	Pothole Patched	5	430 N FRANKLIN ST	60654	1174247.46842821	1903265.17618927	42	18	8	41.890001134206734	-87.63568286112702	(41.890001134206734, -87.63568286112702)
37 | 06/30/2014	Completed	08/26/2014	14-01013381	Pothole in Street	Final Outcome	Pothole Patched	7	50 E JACKSON BLVD	60604	1176916.76437779	1899018.37548671	42	1	32	41.87836259235806	-87.62578989491375	(41.87836259235806, -87.62578989491375)
38 | 06/26/2014	Completed	08/27/2014	14-00992410	Pothole in Street	Final Outcome	Pothole Patched	6	201 N WELLS ST	60606	1174696.68801277	1901736.11738759	42	1	32	41.885758730147366	-87.63378404961261	(41.885758730147366, -87.63378404961261)
39 | 06/23/2014	Completed	08/22/2014	14-00961579	Pothole in Street	Final Outcome	Pothole Patched	1	340 W ERIE ST	60654	1173808.62199403	1904714.70549298	42	18	8	41.89405989499057	-87.63717643739275	(41.89405989499057, -87.63717643739275)
40 | 06/23/2014	Completed	08/26/2014	14-00967688	Pothole in Street	Final Outcome	Pothole Patched	12	140 S DEARBORN ST	60603	1175959.03010572	1899623.29000633	42	1	32	41.87967604011696	-87.62950228266759	(41.87967604011696, -87.62950228266759)
41 | 06/18/2014	Completed	08/27/2014	14-00933275	Pothole in Street	Final Outcome	Pothole Patched	15	211 N DEARBORN ST	60601	1175900.7237733	1901846.43477009	42	1	32	41.88604316092842	-87.62936052143365	(41.88604316092842, -87.62936052143365)
42 | 06/12/2014	Completed	08/27/2014	14-00897156	Pothole in Street	Final Outcome	Pothole Patched	42	200 S LA SALLE ST	60604	1175161.95537691	1899423.25992154	42	1	32	41.87936625147701	-87.63243763414764	(41.87936625147701, -87.63243763414764)
43 | 06/12/2014	Completed	08/22/2014	14-00890682	Pothole in Street	Final Outcome	Pothole Patched	1	306 W OHIO ST	60654	1174162.61920993	1904141.19087966	42	18	8	41.89247410656883	-87.63593436430777	(41.89247410656883, -87.63593436430777)
44 | 06/11/2014	Completed	08/28/2014	14-00887552	Pothole in Street	Final Outcome	Pothole Patched	4	1020 N STATE ST	60610	1176176.20887962	1907374.89001835	42	18	8	41.901200484161656	-87.62847652035359	(41.901200484161656, -87.62847652035359)
45 | 06/10/2014	Completed	08/27/2014	14-00880864	Pothole in Street	Final Outcome	Pothole Patched	0	245 W WASHINGTON ST	60606	1174540.03218467	1900805.68709955	42	1	32	41.883096426444126	-87.63515649580224	(41.883096426444126, -87.63515649580224)
46 | 06/04/2014	Completed	08/22/2014	14-00844247	Pothole in Street	Final Outcome	Pothole Patched	2	525 W SUPERIOR ST	60654	1172505.49257168	1905270.97268573	42	18	8	41.895379256528365	-87.64251783750353	(41.895379256528365, -87.64251783750353)
47 | 06/03/2014	Completed	08/26/2014	14-00829362	Pothole in Street	Final Outcome	Pothole Patched	2	100 S STATE ST	60603	1176400.89309128	1899922.58000955	42	1	32	41.88071513784339	-87.62787607841581	(41.88071513784339, -87.62787607841581)
48 | 06/01/2014	Completed	08/26/2014	14-00819596	Pothole in Street	Final Outcome	Pothole Patched	1	21 S DEARBORN ST	60603	1175943.73762726	1900281.08996842	42	1	32	41.88163394997373	-87.62924355913282	(41.88163394997373, -87.62924355913282)
49 | 05/31/2014	Completed	08/28/2014	14-00816655	Pothole in Street	Final Outcome	Pothole Patched	2	1250 N DEARBORN ST	60610	1175716.58493337	1908364.4279693	42	18	8	41.905379013702905	-87.63017455348202	(41.905379013702905, -87.63017455348202)
50 | 05/29/2014	Completed	08/22/2014	14-00805945	Pothole in Street	Final Outcome	Pothole Patched	10	738 N LARRABEE ST	60654	1172208.49919151	1905332.36197604	42	18	8	41.89589422031982	-87.64313907101959	(41.89589422031982, -87.64313907101959)
51 | 05/29/2014	Completed	08/26/2014	14-00802449	Pothole in Street	Final Outcome	Pothole Patched	6	33 S STATE ST	60603	1176383.89122814	1900241.30989122	42	1	32	41.8812023900134	-87.62759359231444	(41.8812023900134, -87.62759359231444)
52 | 05/28/2014	Completed	08/22/2014	14-00797786	Pothole in Street	Final Outcome	Pothole Patched	2	720 N FRANKLIN ST	60654	1174193.91119404	1905217.9805198	42	18	8	41.89553241941632	-87.63582582624329	(41.89553241941632, -87.63582582624329)
53 | 05/26/2014	Completed	08/28/2014	14-00780187	Pothole in Street	Final Outcome	Pothole Patched	15	12 W ELM ST	60610	1175983.31378993	1908093.10450502	42	18	8	41.90327840141334	-87.62912853335304	(41.90327840141334, -87.62912853335304)
54 | 05/21/2014	Completed	08/28/2014	14-00753021	Pothole in Street	Final Outcome	Pothole Patched	10	30 E ELM ST	60611	1176399.91378993	1908106.10450502	42	18	8	41.903312908929266	-87.6274232482672	(41.903312908929266, -87.6274232482672)
55 | 05/19/2014	Completed	08/27/2014	14-00736478	Pothole in Street	Final Outcome	Pothole Patched	10	520 E ILLINOIS ST	60611	1180045.15771678	1903729.01249721	42	18	8	41.891214491225654	-87.61421696505855	(41.891214491225654, -87.61421696505855)
56 | 04/08/2014	Completed	08/25/2014	14-00504762	Pothole in Street	Final Outcome	Pothole Patched	2	1300 N CALIFORNIA AVE	60622	1157492.02101534	1908524.16997455	26	14	24	41.904795555353125	-87.69707304441381	(41.904795555353125, -87.69707304441381)


--------------------------------------------------------------------------------
/tests/psv/standardized_text.psv:
--------------------------------------------------------------------------------
1 | the|quick
2 | brown|fox
3 | jumps over|the
4 | lazy|dog


--------------------------------------------------------------------------------
/tests/rtf/raw_text.txt:
--------------------------------------------------------------------------------
 1 | I love word documents. They are lovely. They make me so happy I could smile. And that is why I wrote this package.
 2 | 
 3 | Sample text is hard. That is where http://hipsum.co comes in handy.
 4 | 
 5 | Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh ethnic Marfa 90's kogi American Apparel. Shabby chic distillery church-key locavore beard, food truck chillwave sartorial deep v flannel authentic Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel keytar Portland post-ironic. Cred hoodie vegan, food truck leggings Austin pour-over banjo trust fund before they sold out cray Intelligentsia plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu.
 6 | Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag meh. Thundercats semiotics shabby chic forage single-origin coffee retro, 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard gluten-free seitan, VHS sartorial pork belly gastropub meh whatever authentic synth. Beard single-origin coffee irony fixie, before they sold out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan hashtag Brooklyn four loko fanny pack 90's mustache 8-bit.
 7 | Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lo-fi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag.
 8 | Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt chambray, leggings shabby chic gastropub YOLO plaid hoodie Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan paleo Etsy you probably haven't heard of them Pitchfork Schlitz readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter next level banjo. Banksy occupy authentic master cleanse Bushwick fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify stumptown pop-up.
 9 | Oh. You need a little dummy text for your mockup? How quaint.
10 | I bet you are still using Bootstrap too
11 | 
12 | 


--------------------------------------------------------------------------------
/tests/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Run the test suite that is specified in the .travis.yml file
 4 | """
 5 | 
 6 | import os
 7 | import subprocess
 8 | 
 9 | import yaml
10 | 
11 | from textract.colors import green, red
12 | 
13 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
14 | def run_test(command):
15 |     wrapped_command = "cd %s && %s" % (root_dir, command)
16 |     pipe = subprocess.Popen(
17 |         wrapped_command, shell=True,
18 |     )
19 |     pipe.wait()
20 |     if pipe.returncode == 0:
21 |         print(green("TEST PASSED"))
22 |     else:
23 |         print(red("TEST FAILED"))
24 |     return pipe.returncode
25 | 
26 | # load the script tests from the .travis.yml file
27 | with open(os.path.join(root_dir, '.travis.yml')) as stream:
28 |     travis_yml = yaml.load_all(stream.read())
29 | config = travis_yml.next()
30 | tests = config['script']
31 | 
32 | # run the tests
33 | if isinstance(tests, (str, unicode)):
34 |     returncode = run_test(tests)
35 | elif isinstance(tests, (list, tuple)):
36 |     returncode = 0
37 |     for test in tests:
38 |         returncode += run_test(test)
39 | 
40 | if returncode == 0:
41 |     print(green("ALL TESTS PASSED"))
42 | else:
43 |     print(red("SOME TESTS FAILED, SEE ABOVE"))
44 | 


--------------------------------------------------------------------------------
/tests/run_docker_tests.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run this to create an up-to-date Docker container and run tests.
 4 | 
 5 | # Work from the repository root (the parent of this script's directory).
 6 | # Abort if the cd fails so the cp/rm below never touch another directory.
 7 | cd "$(dirname "$0")/.." || exit 1
 8 | base=$(pwd)
 9 | 
10 | image="textract/ubuntu12.04"
11 | 
12 | # stage the test Dockerfile in the build context for the build below
13 | cp tests/Dockerfile ./Dockerfile
14 | 
15 | # Note: For speed, the image won't be automatically rebuilt. If the dependencies
16 | # change and the existing image is outdated, just delete it with:
17 | # docker rmi <image name>
18 | docker images | grep "$image" || docker build -t "$image" .
19 | # mount the checkout into the container (quoted: the path may contain spaces)
20 | docker run --rm -v "$base:/home/textract/src" "$image"
21 | 
22 | rm ./Dockerfile
23 | 
24 | 


--------------------------------------------------------------------------------
/tests/test_csv.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class CsvTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'csv' extension; all test methods are inherited."""
8 |     extension = 'csv'
9 | 


--------------------------------------------------------------------------------
/tests/test_doc.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class DocTestCase(base.ShellParserTestCase, unittest.TestCase):
7 |     """Tests for the 'doc' extension; all test methods are inherited."""
8 |     extension = 'doc'
9 | 


--------------------------------------------------------------------------------
/tests/test_docx.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | 
 4 | from . import base
 5 | 
 6 | 
 7 | class DocxTestCase(base.BaseParserTestCase, unittest.TestCase):
 8 |     extension = 'docx'
 9 | 
10 |     def test_tables(self):
11 |         """make sure table output is correct"""
12 |         d = self.get_extension_directory()
13 |         self.compare_cli_output(os.path.join(d, "paragraphs_and_tables.docx"))
14 | 


--------------------------------------------------------------------------------
/tests/test_eml.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class EmlTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'eml' extension; all test methods are inherited."""
8 |     extension = 'eml'
9 | 


--------------------------------------------------------------------------------
/tests/test_epub.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class EpubTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'epub' extension; all test methods are inherited."""
8 |     extension = 'epub'
9 | 


--------------------------------------------------------------------------------
/tests/test_exceptions.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | import subprocess
 4 | import uuid
 5 | 
 6 | from . import base
 7 | 
 8 | 
 9 | class ExceptionTestCase(base.GenericUtilities, unittest.TestCase):
10 |     """This class contains a bunch of tests to make sure that textract
11 |     fails in expected ways.
12 |     """
13 | 
14 |     def test_unsupported_extension_cli(self):
15 |         """Make sure unsupported extension exits with non-zero status"""
16 |         filename = self.get_temp_filename(extension="extension")
17 |         command = "textract %(filename)s 2> /dev/null" % locals()
18 |         self.assertEqual(1, subprocess.call(command, shell=True))
19 |         os.remove(filename)
20 | 
21 |     def test_unsupported_extension_python(self):
22 |         """Make sure unsupported extension raises the correct error"""
23 |         filename = self.get_temp_filename(extension="extension")
24 |         import textract
25 |         from textract.exceptions import ExtensionNotSupported
26 |         with self.assertRaises(ExtensionNotSupported):
27 |             textract.process(filename)
28 |         os.remove(filename)
29 | 
30 |     def test_missing_filename_cli(self):
31 |         """Make sure missing files exits with non-zero status"""
32 |         filename = self.get_temp_filename()
33 |         os.remove(filename)
34 |         command = "textract %(filename)s 2> /dev/null" % locals()
35 |         self.assertEqual(1, subprocess.call(command, shell=True))
36 | 
37 |     def test_missing_filename_python(self):
38 |         """Make sure missing files raise the correct error"""
39 |         filename = self.get_temp_filename()
40 |         os.remove(filename)
41 |         import textract
42 |         from textract.exceptions import MissingFileError
43 |         with self.assertRaises(MissingFileError):
44 |             textract.process(filename)
45 | 
46 |     def test_shell_parser_run(self):
47 |         """get a useful error message when a dependency is missing"""
48 |         from textract.parsers import utils
49 |         from textract.parsers import exceptions
50 |         parser = utils.ShellParser()
51 |         try:
52 |             # There shouldn't be a command on the path matching a random uuid
53 |             parser.run([str(uuid.uuid4())])
54 |         except exceptions.ShellError as e:
55 |             self.assertTrue(e.is_not_installed())
56 |         else:
57 |             self.assertTrue(False, "Expected ShellError")
58 | 


--------------------------------------------------------------------------------
/tests/test_gif.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class GifTestCase(base.ShellParserTestCase, unittest.TestCase):
7 |     """Tests for the 'gif' extension; all test methods are inherited."""
8 |     extension = 'gif'
9 | 


--------------------------------------------------------------------------------
/tests/test_html.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | 
 4 | from . import base
 5 | 
 6 | 
 7 | class HtmlTestCase(base.BaseParserTestCase, unittest.TestCase):
 8 |     extension = 'html'
 9 | 
10 |     def test_table_text_python(self):
11 |         """Make sure tables in html look pretty through python"""
12 |         d = self.get_extension_directory()
13 |         self.compare_python_output(os.path.join(d, "tables.html"))
14 | 
15 |     def test_table_text_cli(self):
16 |         """Make sure tables in html look pretty through cli"""
17 |         d = self.get_extension_directory()
18 |         self.compare_cli_output(os.path.join(d, "tables.html"))
19 | 


--------------------------------------------------------------------------------
/tests/test_jpg.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import shutil
 3 | import os
 4 | 
 5 | from . import base
 6 | 
 7 | 
 8 | class JpgTestCase(base.ShellParserTestCase, unittest.TestCase):
 9 |     extension = 'jpg'
10 | 
11 |     def get_jpeg_filename(self, contents_filename):
12 |         temp_filename = self.get_temp_filename()
13 |         jpeg_filename = temp_filename + ".jpeg"
14 |         os.remove(temp_filename)
15 |         shutil.copyfile(contents_filename, jpeg_filename)
16 |         return jpeg_filename
17 | 
18 |     def test_jpeg_synonym_cli(self):
19 |         """Make sure .jpeg synonym works in cli"""
20 |         jpeg_filename = self.get_jpeg_filename(self.raw_text_filename)
21 |         self.compare_cli_output(
22 |             jpeg_filename,
23 |             self.get_expected_filename(self.raw_text_filename),
24 |         )
25 |         os.remove(jpeg_filename)
26 | 
27 |     def test_jpeg_synonym_python(self):
28 |         """Make sure .jpeg synonym works in python"""
29 |         jpeg_filename = self.get_jpeg_filename(self.raw_text_filename)
30 |         self.compare_python_output(
31 |             jpeg_filename,
32 |             self.get_expected_filename(self.raw_text_filename),
33 |         )
34 |         os.remove(jpeg_filename)
35 | 
36 | 


--------------------------------------------------------------------------------
/tests/test_json.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class JsonTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'json' extension; all test methods are inherited."""
8 |     extension = 'json'
9 | 


--------------------------------------------------------------------------------
/tests/test_mp3.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from . import base
 4 | 
 5 | 
 6 | class Mp3TestCase(base.ShellParserTestCase, unittest.TestCase):
 7 |     """Tests for the 'mp3' extension: the default method plus the
 8 |     explicitly selected 'google' and 'sphinx' methods.
 9 |     """
10 |     extension = 'mp3'
11 | 
12 |     def test_mp3(self):
13 |         """make sure default audio method output is correct"""
14 |         # no `method` argument is passed, so the default is exercised
15 |         self.compare_python_output(self.raw_text_filename)
16 | 
17 |     def test_mp3_google(self):
18 |         """make sure google api python output is correct"""
19 |         self.compare_python_output(self.raw_text_filename, method='google')
20 | 
21 |     def test_mp3_sphinx(self):
22 |         """make sure sphinx python output is correct"""
23 |         self.compare_python_output(self.raw_text_filename, method='sphinx')
24 | 


--------------------------------------------------------------------------------
/tests/test_msg.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class MsgTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'msg' extension; all test methods are inherited."""
8 |     extension = 'msg'
9 | 


--------------------------------------------------------------------------------
/tests/test_no_ext.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | import textract
 4 | 
 5 | class No_Ext_TestCase(unittest.TestCase):
 6 | 
 7 |     def test_docx(self):
 8 |         current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 9 |         docx_file = os.path.join(current_dir, "tests/no_ext/docx_paragraphs_and_tables")
10 |         # pass the file without extension and provide the extension as a parameter
11 |         text = textract.process(docx_file, extension='docx')
12 |         print(text)
13 | 
14 |     def test_msg(self):
15 |         current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
16 |         msg_file = os.path.join(current_dir, "tests/no_ext/msg_standardized_text")
17 |         # pass the file without extension and provide the extension as a parameter
18 |         text = textract.process(msg_file, extension='msg')
19 |         print(text)
20 | 
21 |     def test_pdf(self):
22 |         current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
23 |         pdf_file = os.path.join(current_dir, "tests/no_ext/pdf_standardized_text")
24 |         # pass the file without extension and provide the extension as a parameter
25 |         text = textract.process(pdf_file, extension='.pdf')
26 |         print(text)
27 | 
28 | 


--------------------------------------------------------------------------------
/tests/test_odt.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class OdtTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'odt' extension; all test methods are inherited."""
8 |     extension = 'odt'
9 | 


--------------------------------------------------------------------------------
/tests/test_ogg.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class OggTestCase(base.ShellParserTestCase, unittest.TestCase):
7 |     """Tests for the 'ogg' extension; all test methods are inherited."""
8 |     extension = 'ogg'
9 | 


--------------------------------------------------------------------------------
/tests/test_pdf.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | import six
 4 | 
 5 | from . import base
 6 | 
 7 | 
 8 | class PdfTestCase(base.ShellParserTestCase, unittest.TestCase):
 9 |     extension = 'pdf'
10 | 
11 |     def test_pdfminer_python(self):
12 |         """make sure pdfminer python output is correct"""
13 |         self.compare_python_output(self.raw_text_filename, method='pdfminer')
14 | 
15 |     def test_pdfminer_cli(self):
16 |         """make sure pdfminer command line output is correct"""
17 |         self.compare_cli_output(self.raw_text_filename, method='pdfminer')
18 | 
19 |     def test_tesseract_cli(self):
20 |         """confirm pdf extraction with tesseract"""
21 |         d = self.get_extension_directory()
22 |         self.compare_cli_output(
23 |             os.path.join(d, "ocr_text.pdf"),
24 |             expected_filename=os.path.join(d, "ocr_text.txt"),
25 |             method='tesseract',
26 |         )
27 | 
28 |     def test_two_column(self):
29 |         """Preserve two column layout in extraction"""
30 |         filename = os.path.join(self.get_extension_directory(), 'two_column.pdf')
31 |         self.compare_python_output(filename, layout=True)
32 | 


--------------------------------------------------------------------------------
/tests/test_png.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class PngTestCase(base.ShellParserTestCase, unittest.TestCase):
7 |     """Tests for the 'png' extension; all test methods are inherited."""
8 |     extension = 'png'
9 | 


--------------------------------------------------------------------------------
/tests/test_pptx.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class PptxTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'pptx' extension; all test methods are inherited."""
8 |     extension = 'pptx'
9 | 


--------------------------------------------------------------------------------
/tests/test_ps.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class PsTestCase(base.ShellParserTestCase, unittest.TestCase):
7 |     """Tests for the 'ps' extension; all test methods are inherited."""
8 |     extension = 'ps'
9 | 


--------------------------------------------------------------------------------
/tests/test_psv.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class PsvTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'psv' extension; all test methods are inherited."""
8 |     extension = 'psv'
9 | 


--------------------------------------------------------------------------------
/tests/test_rtf.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class RtfTestCase(base.ShellParserTestCase, unittest.TestCase):
7 |     """Tests for the 'rtf' extension; all test methods are inherited."""
8 |     extension = 'rtf'
9 | 


--------------------------------------------------------------------------------
/tests/test_tiff.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class PngTestCase(base.ShellParserTestCase, unittest.TestCase):
7 |     """Tests for the 'tiff' extension. NOTE(review): name looks copied from test_png."""
8 |     extension = 'tiff'
9 | 


--------------------------------------------------------------------------------
/tests/test_tsv.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class TsvTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'tsv' extension; all test methods are inherited."""
8 |     extension = 'tsv'
9 | 


--------------------------------------------------------------------------------
/tests/test_txt.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import shutil
 3 | import os
 4 | 
 5 | from . import base
 6 | 
 7 | 
 8 | class TxtTestCase(base.BaseParserTestCase, unittest.TestCase):
 9 |     extension = 'txt'
10 | 
11 |     def test_extensionless_filenames(self):
12 |         """make sure that text from extensionless files is treated as txt"""
13 |         temp_filename = self.get_temp_filename()
14 |         shutil.copyfile(self.raw_text_filename, temp_filename)
15 |         self.compare_python_output(temp_filename, self.raw_text_filename)
16 |         os.remove(temp_filename)
17 | 


--------------------------------------------------------------------------------
/tests/test_wav.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class WavTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'wav' extension; all test methods are inherited."""
8 |     extension = 'wav'
9 | 


--------------------------------------------------------------------------------
/tests/test_xls.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class XlsTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'xls' extension; all test methods are inherited."""
8 |     extension = 'xls'
9 | 


--------------------------------------------------------------------------------
/tests/test_xlsx.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from . import base
4 | 
5 | 
6 | class XlsxTestCase(base.BaseParserTestCase, unittest.TestCase):
7 |     """Tests for the 'xlsx' extension; all test methods are inherited."""
8 |     extension = 'xlsx'
9 | 


--------------------------------------------------------------------------------
/tests/tiff/raw_text.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/tiff/raw_text.tiff


--------------------------------------------------------------------------------
/tests/tiff/standardized_text.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/tiff/standardized_text.tiff


--------------------------------------------------------------------------------
/tests/tsv/raw_text.txt:
--------------------------------------------------------------------------------
 1 | CREATION DATE	STATUS	COMPLETION DATE	SERVICE REQUEST NUMBER	TYPE OF SERVICE REQUEST	CURRENT ACTIVITY	MOST RECENT ACTION	NUMBER OF POTHOLES FILLED ON BLOCK	STREET ADDRESS	ZIP	X COORDINATE	Y COORDINATE	Ward	Police District	Community Area	LATITUDE	LONGITUDE	LOCATION
 2 | 08/28/2014	Completed	08/28/2014	14-01433654	Pothole in Street	Final Outcome	Pothole Patched	5	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
 3 | 08/27/2014	Completed	08/27/2014	14-01424541	Pothole in Street	Final Outcome	Pothole Patched	1	100 N MICHIGAN AVE	60602	1177299.6140023	1900836.66107586	42	1	32	41.88328915138877	-87.62454939639862	(41.88328915138877, -87.62454939639862)
 4 | 08/27/2014	Completed	08/27/2014	14-01424527	Pothole in Street	Final Outcome	Pothole Patched	5	100 S MICHIGAN AVE	60603	1177324.39101512	1899960.94636756	42	1	32	41.88077066474826	-87.62448301713603	(41.88077066474826, -87.62448301713603)
 5 | 08/27/2014	Completed	08/27/2014	14-01424501	Pothole in Street	Final Outcome	Pothole Patched	5	200 S MICHIGAN AVE	60604	1177323.63491104	1899497.03462832	42	1	32	41.87948932869769	-87.6244981249339	(41.87948932869769, -87.6244981249339)
 6 | 08/27/2014	Completed	08/27/2014	14-01424389	Pothole in Street	Final Outcome	Pothole Patched	5	300 S MICHIGAN AVE	60604	1177339.54271824	1899032.66368812	2	1	32	41.87821734531058	-87.62445584029679	(41.87821734531058, -87.62445584029679)
 7 | 08/27/2014	Completed	08/27/2014	14-01424212	Pothole in Street	Final Outcome	Pothole Patched	15	400 S MICHIGAN AVE	60605	1177347.93260788	1898568.04117246	2	1	32	41.87693801152278	-87.6244366179451	(41.87693801152278, -87.6244366179451)
 8 | 08/25/2014	Completed	08/25/2014	14-01404817	Pothole in Street	Final Outcome	Pothole Patched	10	740 N WABASH AVE	60611	1176600.11893119	1905427.14160096	42	18	8	41.896091479927165	-87.6269829451334	(41.896091479927165, -87.6269829451334)
 9 | 08/22/2014	Completed	08/22/2014	14-01395263	Pothole in Street	Final Outcome	Pothole Patched	1	1137 W CHICAGO AVE	60642	1168573.58997114	1905510.8499928	27	12	24	41.89613260416578	-87.65667886202922	(41.89613260416578, -87.65667886202922)
10 | 08/22/2014	Completed	08/22/2014	14-01390538	Pothole in Street	Final Outcome	Pothole Patched	5	615 N FRANKLIN ST	60654	1174219.86039744	1904291.9608037	42	18	8	41.89291371668045	-87.63546140541867	(41.89291371668045, -87.63546140541867)
11 | 08/21/2014	Completed	08/28/2014	14-01383161	Pothole in Street	Final Outcome	Pothole Patched	5	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
12 | 08/12/2014	Completed	08/25/2014	14-01323742	Pothole in Street	Final Outcome	Pothole Patched	2	2744 W EVERGREEN AVE	60622	1157855.84623339	1908863.62028457	26	14	24	41.905779104276164	-87.69629062504274	(41.905779104276164, -87.69629062504274)
13 | 08/11/2014	Completed	08/25/2014	14-01317376	Pothole in Street	Final Outcome	Pothole Patched	4	2701 W HIRSCH ST	60622	1158141.76231382	1909202.67761659	26	14	24	41.9064904995143	-87.69453948271031	(41.9064904995143, -87.69453948271031)
14 | 08/07/2014	Completed	08/27/2014	14-01294373	Pothole in Street	Final Outcome	Pothole Patched	1	159 N DEARBORN ST	60601	1175915.29501332	1901390.11610965	42	1	32	41.884767887865465	-87.62932139824203	(41.884767887865465, -87.62932139824203)
15 | 08/06/2014	Completed	08/26/2014	14-01284763	Pothole in Street	Final Outcome	Pothole Patched	17	100 S WABASH AVE	60603	1176824.63149672	1899942.95872303	42	1	32	41.88072398164076	-87.62631635116725	(41.88072398164076, -87.62631635116725)
16 | 08/04/2014	Completed	08/25/2014	14-01264182	Pothole in Street	Final Outcome	Pothole Patched	3	201 E SUPERIOR ST	60611	1177708.98999045	1905405.56110885	42	18	8	41.8956500849415	-87.62265814875258	(41.8956500849415, -87.62265814875258)
17 | 08/04/2014	Completed	08/22/2014	14-01266469	Pothole in Street	Final Outcome	Pothole Patched	8	220 W SUPERIOR ST	60654	1174518.27808694	1905317.80798624	42	18	8	41.89569109218158	-87.63509877442043	(41.89569109218158, -87.63509877442043)
18 | 08/03/2014	Completed	08/22/2014	14-01258054	Pothole in Street	Final Outcome	Pothole Patched	8	600 N WABASH AVE	60611	1176632.67288564	1904207.6496666	42	18	8	41.892531148204995	-87.62689596447262	(41.892531148204995, -87.62689596447262)
19 | 07/29/2014	Completed	08/28/2014	14-01228358	Pothole in Street	Final Outcome	Pothole Patched	7	10 W ELM ST	60610	1176002.41378993	1908093.80450502	42	18	8	41.90328038833401	-87.62904275288018	(41.90328038833401, -87.62904275288018)
20 | 07/29/2014	Completed	08/25/2014	14-01224309	Pothole in Street	Final Outcome	Pothole Patched	15	201 E HURON ST	60611	1177717.86567145	1905118.00892518	42	18	8	41.89486671392807	-87.62262561802254	(41.89486671392807, -87.62262561802254)
21 | 07/28/2014	Completed	08/27/2014	14-01208070	Pothole in Street	Final Outcome	Pothole Patched	2	177 N WELLS ST	60606	1174702.03147832	1901552.59999636	42	1	32	41.88525719405359	-87.63376979601067	(41.88525719405359, -87.63376979601067)
22 | 07/26/2014	Completed	08/28/2014	14-01203517	Pothole in Street	Final Outcome	Pothole Patched	1	1 W MAPLE ST	60610	1176169.35729479	1907624.99998895	42	18	8	41.90177327913275	-87.62842192883082	(41.90177327913275, -87.62842192883082)
23 | 07/24/2014	Completed	08/22/2014	14-01190319	Pothole in Street	Final Outcome	Pothole Patched	3	730 N FRANKLIN ST	60654	1174191.71119404	1905308.4805198	42	18	8	41.89562283550215	-87.63582499692767	(41.89562283550215, -87.63582499692767)
24 | 07/23/2014	Completed	08/28/2014	14-01178461	Pothole in Street	Final Outcome	Pothole Patched	3	1155 N DEARBORN ST	60610	1175723.69360823	1908131.92908715	42	18	8	41.90344694644998	-87.62982489848619	(41.90344694644998, -87.62982489848619)
25 | 07/21/2014	Completed	08/22/2014	14-01161819	Pothole in Street	Final Outcome	Pothole Patched	3	600 E GRAND AVE	60611	1180764.02691294	1904055.10759636	42	18	8	41.892094136861786	-87.61156988394656	(41.892094136861786, -87.61156988394656)
26 | 07/18/2014	Completed	08/27/2014	14-01149914	Pothole in Street	Final Outcome	Pothole Patched	4	150 N FRANKLIN ST	60606	1174306.13841283	1901261.81180327	42	1	32	41.88449312034977	-87.63552722968024	(41.88449312034977, -87.63552722968024)
27 | 07/17/2014	Completed	08/22/2014	14-01134572	Pothole in Street	Final Outcome	Pothole Patched	2	600 N MICHIGAN AVE	60611	1177329.77956558	1904235.62916875	42	18	8	41.89259322553828	-87.6243340479495	(41.89259322553828, -87.6243340479495)
28 | 07/14/2014	Completed	08/28/2014	14-01112602	Pothole in Street	Final Outcome	Pothole Patched	15	170 W OAK ST	60610	1174950.17410849	1907124.5048948	42	18	8	41.90065230620216	-87.63419240464901	(41.90065230620216, -87.63419240464901)
29 | 07/14/2014	Completed	08/22/2014	14-01114977	Pothole in Street	Final Outcome	Pothole Patched	3	461 N CITYFRONT PLAZA DR	60611	1177942.61391555	1903427.32649215	42	18	8	41.89012269030228	-87.62144215679457	(41.89012269030228, -87.62144215679457)
30 | 07/14/2014	Completed	08/27/2014	14-01114104	Pothole in Street	Final Outcome	Pothole Patched	13	462 N LAKE SHORE DR	60611	1180059.25771678	1903489.31249721	42	18	8	41.89043391381039	-87.61432635149913	(41.89043391381039, -87.61432635149913)
31 | 07/13/2014	Completed	08/22/2014	14-01106120	Pothole in Street	Final Outcome	Pothole Patched	2	37 E OHIO ST	60611	1176632.67288564	1904207.6496666	42	18	8	41.892389760220446	-87.6266700643506	(41.892389760220446, -87.6266700643506)
32 | 07/11/2014	Completed	08/27/2014	14-01092992	Pothole in Street	Final Outcome	Pothole Patched	0	517 N LAKE SHORE DR	60611	1180023.3999851	1903912.87000671	42	18	8	41.891747134984456	-87.61415723810663	(41.891747134984456, -87.61415723810663)
33 | 07/11/2014	Completed	08/28/2014	14-01092312	Pothole in Street	Final Outcome	Pothole Patched	2	1165 N LA SALLE DR	60610	1174923.84946801	1908163.01737071	42	18	8	41.90367107146938	-87.63276634924583	(41.90367107146938, -87.63276634924583)
34 | 07/09/2014	Completed	08/22/2014	14-01072126	Pothole in Street	Final Outcome	Pothole Patched	1	200 W OHIO ST	60654	1174634.07810009	1904152.08572945	42	18	8	41.892497422329804	-87.63416622539611	(41.892497422329804, -87.63416622539611)
35 | 07/08/2014	Completed	08/26/2014	14-01064944	Pothole in Street	Final Outcome	Pothole Patched	4	34 S STATE ST	60603	1176384.09122814	1900236.40989122	42	1	32	41.881194280111245	-87.62788715387323	(41.881194280111245, -87.62788715387323)
36 | 07/06/2014	Completed	08/22/2014	14-01050617	Pothole in Street	Final Outcome	Pothole Patched	5	430 N FRANKLIN ST	60654	1174247.46842821	1903265.17618927	42	18	8	41.890001134206734	-87.63568286112702	(41.890001134206734, -87.63568286112702)
37 | 06/30/2014	Completed	08/26/2014	14-01013381	Pothole in Street	Final Outcome	Pothole Patched	7	50 E JACKSON BLVD	60604	1176916.76437779	1899018.37548671	42	1	32	41.87836259235806	-87.62578989491375	(41.87836259235806, -87.62578989491375)
38 | 06/26/2014	Completed	08/27/2014	14-00992410	Pothole in Street	Final Outcome	Pothole Patched	6	201 N WELLS ST	60606	1174696.68801277	1901736.11738759	42	1	32	41.885758730147366	-87.63378404961261	(41.885758730147366, -87.63378404961261)
39 | 06/23/2014	Completed	08/22/2014	14-00961579	Pothole in Street	Final Outcome	Pothole Patched	1	340 W ERIE ST	60654	1173808.62199403	1904714.70549298	42	18	8	41.89405989499057	-87.63717643739275	(41.89405989499057, -87.63717643739275)
40 | 06/23/2014	Completed	08/26/2014	14-00967688	Pothole in Street	Final Outcome	Pothole Patched	12	140 S DEARBORN ST	60603	1175959.03010572	1899623.29000633	42	1	32	41.87967604011696	-87.62950228266759	(41.87967604011696, -87.62950228266759)
41 | 06/18/2014	Completed	08/27/2014	14-00933275	Pothole in Street	Final Outcome	Pothole Patched	15	211 N DEARBORN ST	60601	1175900.7237733	1901846.43477009	42	1	32	41.88604316092842	-87.62936052143365	(41.88604316092842, -87.62936052143365)
42 | 06/12/2014	Completed	08/27/2014	14-00897156	Pothole in Street	Final Outcome	Pothole Patched	42	200 S LA SALLE ST	60604	1175161.95537691	1899423.25992154	42	1	32	41.87936625147701	-87.63243763414764	(41.87936625147701, -87.63243763414764)
43 | 06/12/2014	Completed	08/22/2014	14-00890682	Pothole in Street	Final Outcome	Pothole Patched	1	306 W OHIO ST	60654	1174162.61920993	1904141.19087966	42	18	8	41.89247410656883	-87.63593436430777	(41.89247410656883, -87.63593436430777)
44 | 06/11/2014	Completed	08/28/2014	14-00887552	Pothole in Street	Final Outcome	Pothole Patched	4	1020 N STATE ST	60610	1176176.20887962	1907374.89001835	42	18	8	41.901200484161656	-87.62847652035359	(41.901200484161656, -87.62847652035359)
45 | 06/10/2014	Completed	08/27/2014	14-00880864	Pothole in Street	Final Outcome	Pothole Patched	0	245 W WASHINGTON ST	60606	1174540.03218467	1900805.68709955	42	1	32	41.883096426444126	-87.63515649580224	(41.883096426444126, -87.63515649580224)
46 | 06/04/2014	Completed	08/22/2014	14-00844247	Pothole in Street	Final Outcome	Pothole Patched	2	525 W SUPERIOR ST	60654	1172505.49257168	1905270.97268573	42	18	8	41.895379256528365	-87.64251783750353	(41.895379256528365, -87.64251783750353)
47 | 06/03/2014	Completed	08/26/2014	14-00829362	Pothole in Street	Final Outcome	Pothole Patched	2	100 S STATE ST	60603	1176400.89309128	1899922.58000955	42	1	32	41.88071513784339	-87.62787607841581	(41.88071513784339, -87.62787607841581)
48 | 06/01/2014	Completed	08/26/2014	14-00819596	Pothole in Street	Final Outcome	Pothole Patched	1	21 S DEARBORN ST	60603	1175943.73762726	1900281.08996842	42	1	32	41.88163394997373	-87.62924355913282	(41.88163394997373, -87.62924355913282)
49 | 05/31/2014	Completed	08/28/2014	14-00816655	Pothole in Street	Final Outcome	Pothole Patched	2	1250 N DEARBORN ST	60610	1175716.58493337	1908364.4279693	42	18	8	41.905379013702905	-87.63017455348202	(41.905379013702905, -87.63017455348202)
50 | 05/29/2014	Completed	08/22/2014	14-00805945	Pothole in Street	Final Outcome	Pothole Patched	10	738 N LARRABEE ST	60654	1172208.49919151	1905332.36197604	42	18	8	41.89589422031982	-87.64313907101959	(41.89589422031982, -87.64313907101959)
51 | 05/29/2014	Completed	08/26/2014	14-00802449	Pothole in Street	Final Outcome	Pothole Patched	6	33 S STATE ST	60603	1176383.89122814	1900241.30989122	42	1	32	41.8812023900134	-87.62759359231444	(41.8812023900134, -87.62759359231444)
52 | 05/28/2014	Completed	08/22/2014	14-00797786	Pothole in Street	Final Outcome	Pothole Patched	2	720 N FRANKLIN ST	60654	1174193.91119404	1905217.9805198	42	18	8	41.89553241941632	-87.63582582624329	(41.89553241941632, -87.63582582624329)
53 | 05/26/2014	Completed	08/28/2014	14-00780187	Pothole in Street	Final Outcome	Pothole Patched	15	12 W ELM ST	60610	1175983.31378993	1908093.10450502	42	18	8	41.90327840141334	-87.62912853335304	(41.90327840141334, -87.62912853335304)
54 | 05/21/2014	Completed	08/28/2014	14-00753021	Pothole in Street	Final Outcome	Pothole Patched	10	30 E ELM ST	60611	1176399.91378993	1908106.10450502	42	18	8	41.903312908929266	-87.6274232482672	(41.903312908929266, -87.6274232482672)
55 | 05/19/2014	Completed	08/27/2014	14-00736478	Pothole in Street	Final Outcome	Pothole Patched	10	520 E ILLINOIS ST	60611	1180045.15771678	1903729.01249721	42	18	8	41.891214491225654	-87.61421696505855	(41.891214491225654, -87.61421696505855)
56 | 04/08/2014	Completed	08/25/2014	14-00504762	Pothole in Street	Final Outcome	Pothole Patched	2	1300 N CALIFORNIA AVE	60622	1157492.02101534	1908524.16997455	26	14	24	41.904795555353125	-87.69707304441381	(41.904795555353125, -87.69707304441381)


--------------------------------------------------------------------------------
/tests/tsv/standardized_text.tsv:
--------------------------------------------------------------------------------
1 | the	quick
2 | brown	fox
3 | jumps over	the
4 | lazy  dog


--------------------------------------------------------------------------------
/tests/txt/raw_text.txt:
--------------------------------------------------------------------------------
 1 | Little Bo peep has lost her sheep
 2 | And doesn't know where to find them.
 3 | Leave them alone and they'll come home,
 4 | Bringing their tails behind them.
 5 | Little Bo peep fell fast asleep
 6 | And dreamt she heard them bleating,
 7 | But when she awoke, she found it a joke,
 8 | For they were all still fleeting.
 9 | Then up she took her little crook
10 | Determined for to find them.
11 | She found them indeed, but it made her heart bleed,
12 | For they left their tails behind them.
13 | It happened one day, as Bo peep did stray
14 | Into a meadow hard by, 
15 | There she espied their tails side by side
16 | All hung on a tree to dry.
17 | She heaved a sigh, and wiped her eye,
18 | And over the hillocks went rambling,
19 | And tried what she could,
20 | As a shepherdess should,
21 | To tack again each to its lambkin.
22 | 


--------------------------------------------------------------------------------
/tests/txt/standardized_text.txt:
--------------------------------------------------------------------------------
1 | the quick brown fox jumps over the lazy dog
2 | 


--------------------------------------------------------------------------------
/tests/wav/raw_text.txt:
--------------------------------------------------------------------------------
1 | Everything Is Awesome
2 | 


--------------------------------------------------------------------------------
/tests/wav/raw_text.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/wav/raw_text.wav


--------------------------------------------------------------------------------
/tests/wav/standardized_text.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/wav/standardized_text.wav


--------------------------------------------------------------------------------
/tests/xls/raw_text.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/xls/raw_text.xls


--------------------------------------------------------------------------------
/tests/xls/standardized_text.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/xls/standardized_text.xls


--------------------------------------------------------------------------------
/tests/xlsx/raw_text.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/xlsx/raw_text.xlsx


--------------------------------------------------------------------------------
/tests/xlsx/standardized_text.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deanmalmgren/textract/ec3c0c3c982078d22e51cc2753baeaf48cdf2e19/tests/xlsx/standardized_text.xlsx


--------------------------------------------------------------------------------
/textract/__init__.py:
--------------------------------------------------------------------------------
1 | from .parsers import process
2 | 
3 | VERSION = "1.6.5"
4 | 


--------------------------------------------------------------------------------
/textract/cli.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Use argparse to handle command-line arguments.
  3 | """
  4 | 
  5 | import argparse
  6 | import encodings
  7 | import os
  8 | import pkgutil
  9 | import sys
 10 | import six
 11 | import re
 12 | import glob
 13 | 
 14 | import argcomplete
 15 | 
 16 | from . import VERSION
 17 | from .parsers import DEFAULT_ENCODING, _get_available_extensions
 18 | 
 19 | 
 20 | class AddToNamespaceAction(argparse.Action):
 21 |     """This adds KEY,VALUE arbitrary pairs to the argparse.Namespace object
 22 |     """
 23 |     def __call__(self, parser, namespace, values, option_string=None):
 24 |         key, val = values.strip().split('=')
 25 |         if hasattr(namespace, key):
 26 |             parser.error((
 27 |                 'Duplicate specification of the key "%(key)s" with --option.'
 28 |             ) % locals())
 29 |         setattr(namespace, key, val)
 30 | 
 31 | 
 32 | # Fix FileType to honor 'b' flag, see: https://bugs.python.org/issue14156
 33 | class FileType(argparse.FileType):
 34 |     def __call__(self, string):
 35 |         if string == '-' and six.PY3:
 36 |             if 'r' in self._mode:
 37 |                 string = sys.stdin.fileno()
 38 |             elif 'w' in self._mode:
 39 |                 string = sys.stdout.fileno()
 40 |         return super(FileType, self).__call__(string)
 41 | 
 42 | 
 43 | # This function is necessary to enable autodocumentation of the script
 44 | # output
 45 | def get_parser():
 46 |     """Initialize the parser for the command line interface and bind the
 47 |     autocompletion functionality"""
 48 | 
 49 |     # initialize the parser
 50 |     parser = argparse.ArgumentParser(
 51 |         description=(
 52 |             'Command line tool for extracting text from any document. '
 53 |         ) % locals(),
 54 |     )
 55 | 
 56 |     # define the command line options here
 57 |     parser.add_argument(
 58 |         'filename', help='Filename to extract text.',
 59 |     ).completer = argcomplete.completers.FilesCompleter
 60 |     parser.add_argument(
 61 |         '-e', '--encoding', type=str, default=DEFAULT_ENCODING,
 62 |         choices=_get_available_encodings(),
 63 |         help='Specify the encoding of the output.',
 64 |     )
 65 |     parser.add_argument(
 66 |         '--extension', type=str, default=None,
 67 |         choices=_get_available_extensions(),
 68 |         help='Specify the extension of the file.',
 69 |     )
 70 |     parser.add_argument(
 71 |         '-m', '--method', default='',
 72 |         help='Specify a method of extraction for formats that support it',
 73 |     )
 74 |     parser.add_argument(
 75 |         '-o', '--output', type=FileType('wb'), default='-',
 76 |         help='Output raw text in this file',
 77 |     )
 78 |     parser.add_argument(
 79 |         '-O', '--option', type=str, action=AddToNamespaceAction,
 80 |         help=(
 81 |             'Add arbitrary options to various parsers of the form '
 82 |             'KEYWORD=VALUE. A full list of available KEYWORD options is '
 83 |             'available at http://bit.ly/textract-options'
 84 |         ),
 85 |     )
 86 |     parser.add_argument(
 87 |         '-v', '--version', action='version', version='%(prog)s '+VERSION,
 88 |     )
 89 | 
 90 |     # enable autocompletion with argcomplete
 91 |     argcomplete.autocomplete(parser)
 92 | 
 93 |     return parser
 94 | 
 95 | 
 96 | def _get_available_encodings():
 97 |     """Get a list of the available encodings to make it easy to
 98 |     tab-complete the command line interface.
 99 | 
100 |     Inspiration from http://stackoverflow.com/a/3824405/564709
101 |     """
102 |     available_encodings = set(encodings.aliases.aliases.values())
103 |     paths = [os.path.dirname(encodings.__file__)]
104 |     for importer, modname, ispkg in pkgutil.walk_packages(path=paths):
105 |         available_encodings.add(modname)
106 |     available_encodings = list(available_encodings)
107 |     available_encodings.sort()
108 |     return available_encodings
109 | 


--------------------------------------------------------------------------------
/textract/colors.py:
--------------------------------------------------------------------------------
 1 | """Inspiration from
 2 | https://github.com/fabric/fabric/blob/master/fabric/colors.py
 3 | """
 4 | import re
 5 | 
 6 | 
 7 | def _wrap_with(code, bold=False):
 8 |     def inner(text):
 9 |         c = code
10 |         if bold:
11 |             c = "1;%s" % c
12 |         return "\033[%sm%s\033[0m" % (c, text)
13 |     return inner
14 | 
15 | 
# Plain color helpers: each one is a function taking some text and
# returning it wrapped in the escape sequence for the given code.
red = _wrap_with('31')
green = _wrap_with('32')
yellow = _wrap_with('33')
blue = _wrap_with('34')
magenta = _wrap_with('35')
cyan = _wrap_with('36')
white = _wrap_with('37')

# Bold variants: same codes, with the "1;" prefix added by _wrap_with.
bold_red = _wrap_with('31', True)
bold_green = _wrap_with('32', True)
bold_yellow = _wrap_with('33', True)
bold_blue = _wrap_with('34', True)
bold_magenta = _wrap_with('35', True)
bold_cyan = _wrap_with('36', True)
bold_white = _wrap_with('37', True)
31 | 
32 | 
# regular expression to omit colorcodes
def colorless(text):
    """Return ``text`` with the color escape sequences stripped out."""
    color_code = re.compile(r"\033\[(1;)?[\d]+m")
    return color_code.sub('', text)
37 | 


--------------------------------------------------------------------------------
/textract/exceptions.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | 
  4 | # traceback from exceptions that inherit from this class are suppressed
  5 | class CommandLineError(Exception):
  6 |     """The traceback of all CommandLineError's is supressed when the
  7 |     errors occur on the command line to provide a useful command line
  8 |     interface.
  9 |     """
 10 |     def render(self, msg):
 11 |         return msg % vars(self)
 12 | 
 13 | 
 14 | class ExtensionNotSupported(CommandLineError):
 15 |     """This error is raised with unsupported extensions"""
 16 |     def __init__(self, ext):
 17 |         self.ext = ext
 18 | 
 19 |         from .parsers import _get_available_extensions
 20 |         available_extensions = []
 21 |         for e in _get_available_extensions():
 22 |             if e.startswith('.'):
 23 |                 available_extensions.append(e)
 24 |         self.available_extensions_str = ', '.join(available_extensions)
 25 | 
 26 |     def __str__(self):
 27 |         return self.render((
 28 |             'The filename extension %(ext)s is not yet supported by\n'
 29 |             'textract. Please suggest this filename extension here:\n\n'
 30 |             '    https://github.com/deanmalmgren/textract/issues\n\n'
 31 |             'Available extensions include: %(available_extensions_str)s\n'
 32 |         ))
 33 | 
 34 | 
 35 | class MissingFileError(CommandLineError):
 36 |     """This error is raised when the file can not be located at the
 37 |     specified path.
 38 |     """
 39 |     def __init__(self, filename):
 40 |         self.filename = filename
 41 |         self.root, self.ext = os.path.splitext(filename)
 42 | 
 43 |     def __str__(self):
 44 |         return self.render((
 45 |             'The file "%(filename)s" can not be found.\n'
 46 |             'Is this the right path/to/file/you/want/to/extract%(ext)s?'
 47 |         ))
 48 | 
 49 | 
 50 | class UnknownMethod(CommandLineError):
 51 |     """This error is raised when the specified --method on the command
 52 |     line is unknown.
 53 |     """
 54 |     def __init__(self, method):
 55 |         self.method = method
 56 | 
 57 |     def __str__(self):
 58 |         return self.render((
 59 |             'The method "%(method)s" can not be found for this filetype.'
 60 |         ))
 61 | 
 62 | 
 63 | class ShellError(CommandLineError):
 64 |     """This error is raised when a shell.run returns a non-zero exit code
 65 |     (meaning the command failed).
 66 |     """
 67 |     def __init__(self, command, exit_code, stdout, stderr):
 68 |         self.command = command
 69 |         self.exit_code = exit_code
 70 |         self.stdout = stdout
 71 |         self.stderr = stderr
 72 |         self.executable = self.command.split()[0]
 73 | 
 74 |     def is_not_installed(self):
 75 |         return os.name == 'posix' and self.exit_code == 127
 76 | 
 77 |     def not_installed_message(self):
 78 |         return (
 79 |             "The command `%(command)s` failed because the executable\n"
 80 |             "`%(executable)s` is not installed on your system. Please make\n"
 81 |             "sure the appropriate dependencies are installed before using\n"
 82 |             "textract:\n\n"
 83 |             "    http://textract.readthedocs.org/en/latest/installation.html\n"
 84 |         ) % vars(self)
 85 | 
 86 |     def failed_message(self):
 87 |         return (
 88 |             "The command `%(command)s` failed with exit code %(exit_code)d\n"
 89 |             "------------- stdout -------------\n"
 90 |             "%(stdout)s"
 91 |             "------------- stderr -------------\n"
 92 |             "%(stderr)s"
 93 |         ) % vars(self)
 94 | 
 95 |     def __str__(self):
 96 |         if self.is_not_installed():
 97 |             return self.not_installed_message()
 98 |         else:
 99 |             return self.failed_message()
100 | 


--------------------------------------------------------------------------------
/textract/parsers/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Route the request to the appropriate parser based on file type.
  3 | """
  4 | 
  5 | import os
  6 | import importlib
  7 | import glob
  8 | import re
  9 | 
 10 | from .. import exceptions
 11 | 
# Dictionary structure for synonymous file extension types. Keys are
# alternative extensions; values are the extension whose parser module
# handles them. The empty-string key covers files with no extension.
EXTENSION_SYNONYMS = {
    ".jpeg": ".jpg",
    ".tff": ".tiff",
    ".tif": ".tiff",
    ".htm": ".html",
    "": ".txt",
    ".log": ".txt",
    ".tab": ".tsv",
}

# default encoding that is returned by the process method. specify it
# here so the default is used on both the process function and also by
# the command line interface
# (process() defaults to DEFAULT_OUTPUT_ENCODING; the command line
# interface imports DEFAULT_ENCODING)
DEFAULT_OUTPUT_ENCODING = 'utf_8'
DEFAULT_ENCODING = 'utf_8'

# filename format: parser modules are named "<extension>" + this suffix
_FILENAME_SUFFIX = '_parser'
 31 | 
 32 | 
 33 | def process(filename, input_encoding=None, output_encoding=DEFAULT_OUTPUT_ENCODING, extension=None, **kwargs):
 34 |     """This is the core function used for extracting text. It routes the
 35 |     ``filename`` to the appropriate parser and returns the extracted
 36 |     text as a byte-string encoded with ``encoding``.
 37 |     """
 38 | 
 39 |     # make sure the filename exists
 40 |     if not os.path.exists(filename):
 41 |         raise exceptions.MissingFileError(filename)
 42 | 
 43 |     # get the filename extension, which is something like .docx for
 44 |     # example, and import the module dynamically using importlib. This
 45 |     # is a relative import so the name of the package is necessary
 46 |     # normally, file extension will be extracted from the file name
 47 |     # if the file name has no extension, then the user can pass the
 48 |     # extension as an argument
 49 |     if extension:
 50 |         ext = extension
 51 |         # check if the extension has the leading .
 52 |         if not ext.startswith('.'):
 53 |             ext = '.' + ext
 54 |         ext = ext.lower()
 55 |     else:
 56 |         _, ext = os.path.splitext(filename)
 57 |         ext = ext.lower()
 58 | 
 59 |     # check the EXTENSION_SYNONYMS dictionary
 60 |     ext = EXTENSION_SYNONYMS.get(ext, ext)
 61 | 
 62 |     # to avoid conflicts with packages that are installed globally
 63 |     # (e.g. python's json module), all extension parser modules have
 64 |     # the _parser extension
 65 |     rel_module = ext + _FILENAME_SUFFIX
 66 | 
 67 |     # If we can't import the module, the file extension isn't currently
 68 |     # supported
 69 |     try:
 70 |         filetype_module = importlib.import_module(
 71 |             rel_module, 'textract.parsers'
 72 |         )
 73 |     except ImportError:
 74 |         raise exceptions.ExtensionNotSupported(ext)
 75 | 
 76 |     # do the extraction
 77 | 
 78 |     parser = filetype_module.Parser()
 79 |     return parser.process(filename, input_encoding, output_encoding, **kwargs)
 80 | 
 81 | 
 82 | def _get_available_extensions():
 83 |     """Get a list of available file extensions to make it easy for
 84 |     tab-completion and exception handling.
 85 |     """
 86 |     extensions = []
 87 | 
 88 |     # from filenames
 89 |     parsers_dir = os.path.join(os.path.dirname(__file__))
 90 |     glob_filename = os.path.join(parsers_dir, "*" + _FILENAME_SUFFIX + ".py")
 91 |     # escape backslashes for python 3.6+
 92 |     glob_filename = glob_filename.replace("//", "////")
 93 |     ext_re = re.compile(glob_filename.replace('*', r"(?P<ext>\w+)"))
 94 |     for filename in glob.glob(glob_filename):
 95 |         ext_match = ext_re.match(filename)
 96 |         ext = ext_match.groups()[0]
 97 |         extensions.append(ext)
 98 |         extensions.append('.' + ext)
 99 | 
100 |     # from relevant synonyms (don't use the '' synonym)
101 |     for ext in EXTENSION_SYNONYMS.keys():
102 |         if ext:
103 |             extensions.append(ext)
104 |             extensions.append(ext.replace('.', '', 1))
105 |     extensions.sort()
106 |     return extensions
107 | 


--------------------------------------------------------------------------------
/textract/parsers/audio.py:
--------------------------------------------------------------------------------
 1 | import speech_recognition as sr
 2 | import os
 3 | 
 4 | from ..exceptions import UnknownMethod, ShellError
 5 | from .utils import ShellParser
 6 | 
 7 | 
 8 | class Parser(ShellParser):
 9 |     """
10 |     Extract text (i.e. speech) from an audio file, using SpeechRecognition.
11 | 
12 |     Since SpeechRecognition expects a .wav file, with 1 channel,
13 |     the audio file has to be converted, via sox, if not compliant
14 | 
15 |     Note: for testing, use -
16 |     http://www2.research.att.com/~ttsweb/tts/demo.php,
17 |     with Rich (US English) for best results
18 |     """
19 | 
20 |     def extract(self, filename, method='', **kwargs):
21 |         speech = ''
22 | 
23 |         # convert to wav, if not already .wav
24 |         base, ext = os.path.splitext(filename)
25 |         if ext != '.wav':
26 |             temp_filename = self.convert_to_wav(filename)
27 |             try:
28 |                 speech = self.extract(temp_filename, method, **kwargs)
29 |             finally:  # make sure temp_file is deleted
30 |                 os.remove(temp_filename)
31 |         else:
32 |             r = sr.Recognizer()
33 | 
34 |             with sr.WavFile(filename) as source:
35 |                 audio = r.record(source)
36 | 
37 |             try:
38 |                 if method == 'google' or method == '':
39 |                     speech = r.recognize_google(audio)
40 |                 elif method == 'sphinx':
41 |                     speech = r.recognize_sphinx(audio)
42 |                 else:
43 |                     raise UnknownMethod(method)
44 |             except LookupError:  # audio is not understandable
45 |                 speech = ''
46 |             except sr.UnknownValueError:
47 |                 speech = ''
48 | 
49 |             # add a newline, to make output cleaner
50 |             speech += '\n'
51 | 
52 |         return speech
53 | 
54 |     def convert_to_wav(self, filename):
55 |         """
56 |         Uses sox cmdline tool, to convert audio file to .wav
57 | 
58 |         Note: for testing, use -
59 |         http://www.text2speech.org/,
60 |         with American Male 2 for best results
61 |         """
62 |         temp_filename = '{0}.wav'.format(self.temp_filename())
63 |         self.run(['sox', '-G', '-c', '1', filename, temp_filename])
64 |         return temp_filename
65 | 


--------------------------------------------------------------------------------
/textract/parsers/csv_parser.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | 
 3 | from .utils import BaseParser
 4 | 
 5 | 
 6 | class Parser(BaseParser):
 7 |     """Extract text from comma separated values files (.csv).
 8 |     """
 9 | 
10 |     delimiter = ','
11 | 
12 |     def extract(self, filename, **kwargs):
13 | 
14 |         # quick 'n dirty solution for the time being
15 |         with open(filename) as stream:
16 |             reader = csv.reader(stream, delimiter=self.delimiter)
17 |             return '\n'.join(['\t'.join(row) for row in reader])
18 | 


--------------------------------------------------------------------------------
/textract/parsers/doc_parser.py:
--------------------------------------------------------------------------------
 1 | from .utils import ShellParser
 2 | 
 3 | 
 4 | class Parser(ShellParser):
 5 |     """Extract text from doc files using antiword.
 6 |     """
 7 | 
 8 |     def extract(self, filename, **kwargs):
 9 |         stdout, stderr = self.run(['antiword', filename])
10 |         return stdout
11 | 


--------------------------------------------------------------------------------
/textract/parsers/docx_parser.py:
--------------------------------------------------------------------------------
 1 | import docx2txt
 2 | 
 3 | from .utils import BaseParser
 4 | 
 5 | 
 6 | class Parser(BaseParser):
 7 |     """Extract text from docx file using python-docx.
 8 |     """
 9 | 
10 |     def extract(self, filename, **kwargs):
11 |         return docx2txt.process(filename)
12 | 


--------------------------------------------------------------------------------
/textract/parsers/eml_parser.py:
--------------------------------------------------------------------------------
 1 | from email.parser import Parser as EmailParser
 2 | 
 3 | from .utils import BaseParser
 4 | 
 5 | 
 6 | class Parser(BaseParser):
 7 |     """Extract text from email messages in .eml format. This gets the
 8 |     subject and all text from the contents.
 9 |     """
10 | 
11 |     def extract(self, filename, **kwargs):
12 |         # TODO: could make option here to omit all non-original content
13 |         # (forwarded content, quoted content in reply, signature, etc),
14 |         # perhaps using https://github.com/zapier/email-reply-parser
15 | 
16 |         # TODO: could also potentially grab text/html content instead of
17 |         # only grabbing text/plain content
18 | 
19 |         with open(filename) as stream:
20 |             parser = EmailParser()
21 |             message = parser.parse(stream)
22 | 
23 |         text_content = []
24 |         for part in message.walk():
25 |             if part.get_content_type().startswith('text/plain'):
26 |                 text_content.append(part.get_payload())
27 |         return '\n\n'.join(text_content)
28 | 


--------------------------------------------------------------------------------
/textract/parsers/epub_parser.py:
--------------------------------------------------------------------------------
 1 | import zipfile
 2 | from bs4 import BeautifulSoup
 3 | 
 4 | from .utils import BaseParser
 5 | 
 6 | 
 7 | class Parser(BaseParser):
 8 |     """Extract text from epub"""
 9 | 
10 |     def extract(self, filename, **kwargs):
11 |         book = zipfile.ZipFile(filename)
12 |         result = ''
13 |         for text_name in self.__epub_sections(book):
14 |             if not text_name.endswith("html"):
15 |                 continue
16 |             soup = BeautifulSoup(book.open(text_name), features='lxml')
17 |             html_content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4']
18 |             for child in soup.find_all(html_content_tags):
19 |                 inner_text = child.text.strip() if child.text else ""
20 |                 if inner_text:
21 |                     result += inner_text + '\n'
22 |         return result
23 | 
24 |     def __epub_sections(self, book):
25 |         opf_paths = self.__get_opf_paths(book)
26 |         item_paths = self.__get_item_paths(book, opf_paths)
27 |         return item_paths
28 | 
29 |     def __get_opf_paths(self, book):
30 |         meta_inf = book.open("META-INF/container.xml")
31 |         meta_soup = BeautifulSoup(meta_inf, features='lxml')
32 |         return [f["full-path"] for f in meta_soup.rootfiles.find_all("rootfile")]
33 |     
34 |     def __get_item_paths(self, book, opf_paths):
35 |         item_paths = []
36 |         for opf_path in opf_paths:
37 |             opf_soup = BeautifulSoup(book.open(opf_path), "lxml")
38 |             epub_items = opf_soup.spine.find_all("itemref")
39 |             for epub_item in epub_items:
40 |                 item = self.__get_item(opf_soup, epub_item["idref"])
41 |                 item_paths.append(self.__get_full_item_path(book, item["href"]))
42 |         return item_paths
43 | 
44 |     def __get_item(self, opf_soup, item_id):
45 |         for item in opf_soup.manifest.find_all("item"):
46 |             if item["id"] == item_id:
47 |                 return item
48 |         return None
49 |     
50 |     def __get_full_item_path(self, book, partial_path):
51 |         for filename in book.namelist():
52 |             if filename.endswith(partial_path):
53 |                 return filename
54 | 


--------------------------------------------------------------------------------
/textract/parsers/gif_parser.py:
--------------------------------------------------------------------------------
1 | from .image import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/html_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import six
  3 | 
  4 | from bs4 import BeautifulSoup
  5 | 
  6 | from .utils import BaseParser
  7 | 
  8 | 
  9 | class Parser(BaseParser):
 10 |     """Extract text from html file using beautifulsoup4. Filter text to
 11 |     only show the visible parts of the page. Insipration from `here
 12 |     <http://stackoverflow.com/a/1983219/564709>`_.
 13 |     """
 14 | 
 15 |     _disallowed_names = [
 16 |         'style', 'script', '[document]', 'head', 'title', 'html', 'meta',
 17 |         'link', 'body',
 18 |     ]
 19 | 
 20 |     _inline_tags = [
 21 |         'b', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'cite', 'code',
 22 |         'dfn', 'em', 'kbd', 'strong', 'samp', 'var', 'a', 'bdo', 'br', 'img',
 23 |         'map', 'object', 'q', 'script', 'span', 'sub', 'sup', 'button',
 24 |         'input', 'label', 'select', 'textarea',
 25 |     ]
 26 | 
 27 |     def _visible(self, element):
 28 |         """Used to filter text elements that have invisible text on the page.
 29 |         """
 30 |         if element.name in self._disallowed_names:
 31 |             return False
 32 |         elif re.match(u'<!--.*-->', six.text_type(element.extract())):
 33 |             return False
 34 |         return True
 35 | 
 36 |     def _inline(self, element):
 37 |         """Used to check whether given element can be treated as inline
 38 |         element (without new line after).
 39 |         """
 40 |         if element.name in self._inline_tags:
 41 |             return True
 42 |         return False
 43 | 
 44 |     def _find_any_text(self, tag):
 45 |         """Looks for any possible text within given tag.
 46 |         """
 47 |         text = ''
 48 |         if tag is not None:
 49 |             text = six.text_type(tag)
 50 |             text = re.sub(r'(<[^>]+>)', '', text)
 51 |             text = re.sub(r'\s', ' ', text)
 52 |             text = text.strip()
 53 |         return text
 54 | 
 55 |     def _parse_tables(self, soup):
 56 |         """Returns array containing basic informations about tables for ASCII
 57 |         replacement (look: _replace_tables()).
 58 |         """
 59 |         tables = []
 60 |         for t in soup.find_all('table'):
 61 |             t_dict = {'width': 0, 'table': t, 'trs': [], 'col_width': {}}
 62 |             trs = t.find_all('tr')
 63 |             if len(trs) > 0:
 64 |                 for tr in trs:
 65 |                     tr_dict = []
 66 |                     tds = tr.find_all('th') + tr.find_all('td')
 67 |                     if len(tds) > 0:
 68 |                         for i, td in enumerate(tds):
 69 |                             td_text = self._find_any_text(td)
 70 |                             length = len(td_text)
 71 |                             if i in t_dict['col_width']:
 72 |                                 t_dict['col_width'][i] = max(
 73 |                                     length,
 74 |                                     t_dict['col_width'][i]
 75 |                                 )
 76 |                             else:
 77 |                                 t_dict['col_width'][i] = length
 78 |                             tr_dict.append({
 79 |                                 'text': td_text,
 80 |                                 'colspan': int(td.get('colspan', 1)),
 81 |                             })
 82 |                         t_dict['trs'].append(tr_dict)
 83 |                 for col in t_dict['col_width']:
 84 |                     t_dict['width'] += t_dict['col_width'][col]
 85 |                 tables.append(t_dict)
 86 |         return tables
 87 | 
 88 |     def _replace_tables(self, soup, v_separator=' | ', h_separator='-'):
 89 |         """Replaces <table> elements with its ASCII equivalent.
 90 |         """
 91 |         tables = self._parse_tables(soup)
 92 |         v_sep_len = len(v_separator)
 93 |         v_left_sep = v_separator.lstrip()
 94 |         for t in tables:
 95 |             html = ''
 96 |             trs = t['trs']
 97 |             h_length = 1 + (v_sep_len * len(t['col_width'])) + t['width']
 98 |             head_foot = (h_separator * h_length) + "\n"
 99 |             html += head_foot
100 |             for tr in trs:
101 |                 html += v_left_sep
102 |                 for i, td in enumerate(tr):
103 |                     text = td['text']
104 |                     col_width = t['col_width'][i] + v_sep_len
105 |                     if td['colspan'] > 1:
106 |                         for j in range(td['colspan']-1):
107 |                             j = j + 1
108 |                             if (i+j) < len(t['col_width']):
109 |                                 col_width += t['col_width'][i+j] + v_sep_len
110 |                     html += ('%' + str(col_width) + 's') % (text + v_separator)
111 |                 html += "\n"
112 |             html += head_foot
113 |             new_table = soup.new_tag('div')
114 |             new_table.string = html
115 |             t['table'].replace_with(new_table)
116 |         return soup
117 | 
118 |     def _join_inlines(self, soup):
119 |         """Unwraps inline elements defined in self._inline_tags.
120 |         """
121 |         elements = soup.find_all(True)
122 |         for elem in elements:
123 |             if self._inline(elem):
124 |                 elem.unwrap()
125 |         return soup
126 | 
127 |     def extract(self, filename, **kwargs):
128 |         with open(filename, "rb") as stream:
129 |             soup = BeautifulSoup(stream, 'lxml')
130 | 
131 |         # Convert tables to ASCII ones
132 |         soup = self._replace_tables(soup)
133 | 
134 |         # Join inline elements
135 |         soup = self._join_inlines(soup)
136 | 
137 |         # Make HTML
138 |         html = ''
139 |         elements = soup.find_all(True)
140 |         elements = [el for el in filter(self._visible, elements)]
141 |         for elem in elements:
142 |             string = elem.string
143 |             if string is None:
144 |                 string = self._find_any_text(elem)
145 |             string = string.strip()
146 |             if len(string) > 0:
147 |                 html += "\n" + string + "\n"
148 |         return html
149 | 


--------------------------------------------------------------------------------
/textract/parsers/image.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Process an image file using tesseract.
 3 | """
 4 | import os
 5 | 
 6 | from .utils import ShellParser
 7 | 
 8 | 
 9 | class Parser(ShellParser):
10 |     """Extract text from various image file formats using tesseract-ocr"""
11 | 
12 |     def extract(self, filename, **kwargs):
13 | 
14 |         # if language given as argument, specify language for tesseract to use
15 |         if 'language' in kwargs:
16 |             args = ['tesseract', filename, 'stdout', '-l', kwargs['language']]
17 |         else:
18 |             args = ['tesseract', filename, 'stdout']
19 | 
20 |         stdout, _ = self.run(args)
21 |         return stdout
22 | 


--------------------------------------------------------------------------------
/textract/parsers/jpg_parser.py:
--------------------------------------------------------------------------------
1 | from .image import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/json_parser.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import six
 3 | 
 4 | from .utils import BaseParser
 5 | 
 6 | 
 7 | class Parser(BaseParser):
 8 |     """Extract all of the string values of a json file (no keys as those
 9 |     are, in some sense, markup). This is useful for parsing content
10 |     from mongodb dumps, for example.
11 |     """
12 | 
13 |     def extract(self, filename, **kwargs):
14 |         with open(filename, 'r') as raw:
15 |             deserialized_json = json.load(raw)
16 |         return self.get_text(deserialized_json)
17 | 
18 |     def get_text(self, deserialized_json):
19 |         """Recursively get text from subcomponents of a deserialized json. To
20 |         enforce the same order on the documents, make sure to read keys of
21 |         deserialized_json in a consistent (alphabetical) order.
22 |         """
23 |         if isinstance(deserialized_json, dict):
24 |             result = ''
25 |             for key in sorted(deserialized_json):
26 |                 result += self.get_text(deserialized_json[key]) + ' '
27 |             return result
28 | 
29 |         if isinstance(deserialized_json, list):
30 |             result = ''
31 |             for item in deserialized_json:
32 |                 result += self.get_text(item) + ' '
33 |             return result
34 | 
35 |         if isinstance(deserialized_json, six.string_types):
36 |             return deserialized_json
37 |         else:
38 |             return ''
39 | 


--------------------------------------------------------------------------------
/textract/parsers/mp3_parser.py:
--------------------------------------------------------------------------------
1 | from .audio import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/msg_parser.py:
--------------------------------------------------------------------------------
 1 | import six
 2 | 
 3 | import extract_msg
 4 | 
 5 | from .utils import BaseParser
 6 | 
 7 | 
 8 | def ensure_bytes(string):
 9 |     """Normalize string to bytes.
10 | 
11 |     `ExtractMsg.Message._getStringStream` can return unicode or bytes depending
12 |     on what is originally stored in message file.
13 | 
14 |     This helper functon makes sure, that bytes type is returned.
15 |     """
16 |     if isinstance(string, six.string_types):
17 |         return string.encode('utf-8')
18 |     return string
19 | 
20 | 
class Parser(BaseParser):
    """Pull the text out of Microsoft Outlook message files (.msg)."""

    def extract(self, filename, **kwargs):
        """Return the subject and the body as bytes, two newlines apart."""
        message = extract_msg.Message(filename)
        subject_and_gap = ensure_bytes(message.subject) + six.b('\n\n')
        return subject_and_gap + ensure_bytes(message.body)
28 | 


--------------------------------------------------------------------------------
/textract/parsers/odt_parser.py:
--------------------------------------------------------------------------------
 1 | import zipfile
 2 | import xml.etree.ElementTree as ET
 3 | 
 4 | from .utils import BaseParser
 5 | 
 6 | 
 7 | class Parser(BaseParser):
 8 |     """Extract text from open document files.
 9 |     """
10 | 
11 |     def extract(self, filename, **kwargs):
12 |         # Inspiration from
13 |         # https://github.com/odoo/odoo/blob/master/addons/document/odt2txt.py
14 |         with open(filename, 'rb') as stream:
15 |             zip_stream = zipfile.ZipFile(stream)
16 |             self.content = ET.fromstring(zip_stream.read("content.xml"))
17 |         return self.to_string()
18 | 
19 |     def to_string(self):
20 |         """ Converts the document to a string. """
21 |         buff = u""
22 |         for child in self.content.iter():
23 |             if child.tag in [self.qn('text:p'), self.qn('text:h')]:
24 |                 buff += self.text_to_string(child) + "\n"
25 |         # remove last newline char
26 |         if buff:
27 |             buff = buff[:-1]
28 |         return buff
29 | 
30 |     def text_to_string(self, element):
31 |         buff = u""
32 |         if element.text is not None:
33 |             buff += element.text
34 |         for child in element:
35 |             if child.tag == self.qn('text:tab'):
36 |                 buff += "\t"
37 |                 if child.tail is not None:
38 |                     buff += child.tail
39 |             elif child.tag == self.qn('text:s'):
40 |                 buff += u" "
41 |                 if child.get(self.qn('text:c')) is not None:
42 |                     buff += u" " * (int(child.get(self.qn('text:c'))) - 1)
43 |                 if child.tail is not None:
44 |                     buff += child.tail
45 |             else:
46 |                 buff += self.text_to_string(child)
47 |         if element.tail is not None:
48 |             buff += element.tail
49 |         return buff
50 | 
51 |     def qn(self, namespace):
52 |         """Connect tag prefix to longer namespace"""
53 |         nsmap = {
54 |             'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
55 |         }
56 |         spl = namespace.split(':')
57 |         return '{{{}}}{}'.format(nsmap[spl[0]], spl[1])
58 | 


--------------------------------------------------------------------------------
/textract/parsers/ogg_parser.py:
--------------------------------------------------------------------------------
1 | from .audio import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/pdf_parser.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import six
 4 | from tempfile import mkdtemp
 5 | 
 6 | from ..exceptions import UnknownMethod, ShellError
 7 | 
 8 | from .utils import ShellParser
 9 | from .image import Parser as TesseractParser
10 | 
11 | from distutils.spawn import find_executable
12 | 
class Parser(ShellParser):
    """Extract text from pdf files using the ``pdftotext`` method
    (default), the ``pdfminer`` method or the ``tesseract`` method.
    """

    def extract(self, filename, method='', **kwargs):
        """Extract the text of ``filename`` with the requested ``method``.

        ``method`` may be ``''`` (pdftotext, falling back to pdfminer when
        pdftotext is not installed), ``'pdftotext'``, ``'pdfminer'`` or
        ``'tesseract'``. Any other value raises ``UnknownMethod``.
        """
        if method == '' or method == 'pdftotext':
            try:
                return self.extract_pdftotext(filename, **kwargs)
            except ShellError as ex:
                # If pdftotext isn't installed and the pdftotext method
                # wasn't specified, then gracefully fallback to using
                # pdfminer instead.
                if method == '' and ex.is_not_installed():
                    return self.extract_pdfminer(filename, **kwargs)
                # bare raise keeps the original traceback on python 2 too
                raise

        elif method == 'pdfminer':
            return self.extract_pdfminer(filename, **kwargs)
        elif method == 'tesseract':
            return self.extract_tesseract(filename, **kwargs)
        else:
            raise UnknownMethod(method)

    def extract_pdftotext(self, filename, **kwargs):
        """Extract text from pdfs using the pdftotext command line utility.

        The ``-layout`` flag is added whenever a ``layout`` keyword is
        present, whatever its value.
        """
        if 'layout' in kwargs:
            args = ['pdftotext', '-layout', filename, '-']
        else:
            args = ['pdftotext', filename, '-']
        stdout, _ = self.run(args)
        return stdout

    def extract_pdfminer(self, filename, **kwargs):
        """Extract text from pdfs using pdfminer.

        ``pdf2txt.py`` is first run directly. If that fails with an
        ``OSError``, the script located on the PATH is run through
        ``python3`` and, should that fail with a ``ShellError``, through
        ``python2``.
        """
        pdf2txt_path = find_executable('pdf2txt.py')
        try:
            stdout, _ = self.run(['pdf2txt.py', filename])
        except OSError:
            if pdf2txt_path is None:
                # no script path to hand to an interpreter: surface the
                # original error instead of putting None into argv
                raise
            try:
                stdout, _ = self.run(['python3', pdf2txt_path, filename])
            except ShellError:
                stdout, _ = self.run(['python2', pdf2txt_path, filename])
        return stdout

    def extract_tesseract(self, filename, **kwargs):
        """Extract text from pdfs using tesseract (per-page OCR).

        ``pdftoppm`` renders the pages into a temporary directory, each
        rendered file is OCRed in sorted filename order (``kwargs`` are
        passed on to the image parser) and the results are concatenated.
        The temporary directory is always removed.
        """
        temp_dir = mkdtemp()
        base = os.path.join(temp_dir, 'conv')
        contents = []
        try:
            self.run(['pdftoppm', filename, base])

            for page in sorted(os.listdir(temp_dir)):
                page_path = os.path.join(temp_dir, page)
                page_content = TesseractParser().extract(page_path, **kwargs)
                contents.append(page_content)
            return six.b('').join(contents)
        finally:
            shutil.rmtree(temp_dir)
77 | 


--------------------------------------------------------------------------------
/textract/parsers/png_parser.py:
--------------------------------------------------------------------------------
1 | from .image import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/pptx_parser.py:
--------------------------------------------------------------------------------
 1 | import pptx
 2 | 
 3 | from .utils import BaseParser
 4 | 
 5 | 
 6 | class Parser(BaseParser):
 7 |     """Extract text from pptx file using python-pptx
 8 |     """
 9 | 
10 |     def extract(self, filename, **kwargs):
11 |         presentation = pptx.Presentation(filename)
12 |         text_runs = []
13 |         for slide in presentation.slides:
14 |             for shape in slide.shapes:
15 |                 if not shape.has_text_frame:
16 |                     continue
17 |                 for paragraph in shape.text_frame.paragraphs:
18 |                     for run in paragraph.runs:
19 |                         text_runs.append(run.text)
20 |         return '\n\n'.join(text_runs)
21 | 


--------------------------------------------------------------------------------
/textract/parsers/ps_parser.py:
--------------------------------------------------------------------------------
 1 | from .utils import ShellParser
 2 | 
 3 | 
 4 | class Parser(ShellParser):
 5 |     """Extract text from postscript files using ps2ascii command.
 6 |     """
 7 | 
 8 |     def extract(self, filename, **kwargs):
 9 |         stdout, _ = self.run(['ps2ascii', filename])
10 |         return stdout
11 | 


--------------------------------------------------------------------------------
/textract/parsers/psv_parser.py:
--------------------------------------------------------------------------------
1 | from .csv_parser import Parser as BaseParser
2 | 
3 | 
class Parser(BaseParser):
    """Parse pipe separated values files (``.psv``).

    Subclass of the csv parser that only overrides the ``delimiter``
    class attribute.
    """

    # fields are separated by a pipe character
    delimiter = '|'
9 | 


--------------------------------------------------------------------------------
/textract/parsers/rtf_parser.py:
--------------------------------------------------------------------------------
 1 | import six
 2 | 
 3 | from .utils import ShellParser
 4 | 
 5 | 
 6 | class Parser(ShellParser):
 7 |     """Extract text from rtf files using unrtf.
 8 |     """
 9 | 
10 |     def extract(self, filename, **kwargs):
11 |         # http://superuser.com/a/243089/126633
12 |         stdout, stderr = self.run(['unrtf', '--text', filename])
13 |         splitter = six.b('-') * 17 + six.b('\n')
14 |         text_conversion = stdout.split(splitter, 1)[-1]
15 |         return text_conversion
16 | 


--------------------------------------------------------------------------------
/textract/parsers/tiff_parser.py:
--------------------------------------------------------------------------------
1 | from .image import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/tsv_parser.py:
--------------------------------------------------------------------------------
1 | from .csv_parser import Parser as BaseParser
2 | 
3 | 
class Parser(BaseParser):
    """Parse tab separated values files (``.tsv``).

    Subclass of the csv parser that only overrides the ``delimiter``
    class attribute.
    """

    # fields are separated by a tab character
    delimiter = '\t'
9 | 


--------------------------------------------------------------------------------
/textract/parsers/txt_parser.py:
--------------------------------------------------------------------------------
 1 | from .utils import BaseParser
 2 | 
 3 | 
 4 | class Parser(BaseParser):
 5 |     """Parse ``.txt`` files"""
 6 | 
 7 |     def extract(self, filename, **kwargs):
 8 |         with open(filename) as stream:
 9 |             return stream.read()
10 | 


--------------------------------------------------------------------------------
/textract/parsers/utils.py:
--------------------------------------------------------------------------------
  1 | """This module includes a bunch of convenient base classes that are
  2 | reused in many of the other parser modules.
  3 | """
  4 | 
  5 | import subprocess
  6 | import tempfile
  7 | import os
  8 | import errno
  9 | 
 10 | import six
 11 | import chardet
 12 | 
 13 | from .. import exceptions
 14 | 
 15 | 
 16 | class BaseParser(object):
 17 |     """The :class:`.BaseParser` abstracts out some common functionality
 18 |     that is used across all document Parsers. In particular, it has
 19 |     the responsibility of handling all unicode and byte-encoding.
 20 |     """
 21 | 
 22 |     def extract(self, filename, **kwargs):
 23 |         """This method must be overwritten by child classes to extract raw
 24 |         text from a filename. This method can return either a
 25 |         byte-encoded string or unicode.
 26 |         """
 27 |         raise NotImplementedError('must be overwritten by child classes')
 28 | 
 29 |     def encode(self, text, encoding):
 30 |         """Encode the ``text`` in ``encoding`` byte-encoding. This ignores
 31 |         code points that can't be encoded in byte-strings.
 32 |         """
 33 |         return text.encode(encoding, 'ignore')
 34 | 
 35 |     def process(self, filename, input_encoding, output_encoding="utf8", **kwargs):
 36 |         """Process ``filename`` and encode byte-string with ``encoding``. This
 37 |         method is called by :func:`textract.parsers.process` and wraps
 38 |         the :meth:`.BaseParser.extract` method in `a delicious unicode
 39 |         sandwich <http://nedbatchelder.com/text/unipain.html>`_.
 40 | 
 41 |         """
 42 |         # make a "unicode sandwich" to handle dealing with unknown
 43 |         # input byte strings and converting them to a predictable
 44 |         # output encoding
 45 |         # http://nedbatchelder.com/text/unipain/unipain.html#35
 46 |         byte_string = self.extract(filename, **kwargs)
 47 |         unicode_string = self.decode(byte_string, input_encoding)
 48 |         return self.encode(unicode_string, output_encoding)
 49 | 
 50 |     def decode(self, text, input_encoding=None):
 51 |         """Decode ``text`` using the `chardet
 52 |         <https://github.com/chardet/chardet>`_ package.
 53 |         """
 54 |         # only decode byte strings into unicode if it hasn't already
 55 |         # been done by a subclass
 56 |         if isinstance(text, six.text_type):
 57 |             return text
 58 | 
 59 |         # empty text? nothing to decode
 60 |         if not text:
 61 |             return u''
 62 | 
 63 |         # use the provided encoding
 64 |         if input_encoding:
 65 |             return text.decode(input_encoding)
 66 | 
 67 |         # use chardet to automatically detect the encoding text if no encoding is provided
 68 |         result = chardet.detect(text)
 69 |         encoding = result['encoding'] if result['confidence'] > 0.80 else 'utf8'
 70 |         return text.decode(encoding, errors="replace")
 71 | 
 72 | 
 73 | class ShellParser(BaseParser):
 74 |     """The :class:`.ShellParser` extends the :class:`.BaseParser` to make
 75 |     it easy to run external programs from the command line with
 76 |     `Fabric <http://www.fabfile.org/>`_-like behavior.
 77 |     """
 78 | 
 79 |     def run(self, args):
 80 |         """Run ``command`` and return the subsequent ``stdout`` and ``stderr``
 81 |         as a tuple. If the command is not successful, this raises a
 82 |         :exc:`textract.exceptions.ShellError`.
 83 |         """
 84 | 
 85 |         # run a subprocess and put the stdout and stderr on the pipe object
 86 |         try:
 87 |             pipe = subprocess.Popen(
 88 |                 args,
 89 |                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
 90 |             )
 91 |         except OSError as e:
 92 |             if e.errno == errno.ENOENT:
 93 |                 # File not found.
 94 |                 # This is equivalent to getting exitcode 127 from sh
 95 |                 raise exceptions.ShellError(
 96 |                     ' '.join(args), 127, '', '',
 97 |                 )
 98 |             else: raise #Reraise the last exception unmodified
 99 | 
100 |         # pipe.wait() ends up hanging on large files. using
101 |         # pipe.communicate appears to avoid this issue
102 |         stdout, stderr = pipe.communicate()
103 | 
104 |         # if pipe is busted, raise an error (unlike Fabric)
105 |         if pipe.returncode != 0:
106 |             raise exceptions.ShellError(
107 |                 ' '.join(args), pipe.returncode, stdout, stderr,
108 |             )
109 | 
110 |         return stdout, stderr
111 | 
112 |     def temp_filename(self):
113 |         """Return a unique tempfile name.
114 |         """
115 |         # TODO: it would be nice to get this to behave more like a
116 |         # context so we can make sure these temporary files are
117 |         # removed, regardless of whether an error occurs or the
118 |         # program is terminated.
119 |         handle, filename = tempfile.mkstemp()
120 |         os.close(handle)
121 |         return filename
122 | 


--------------------------------------------------------------------------------
/textract/parsers/wav_parser.py:
--------------------------------------------------------------------------------
1 | from .audio import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/xls_parser.py:
--------------------------------------------------------------------------------
1 | from .xlsx_parser import Parser
2 | 


--------------------------------------------------------------------------------
/textract/parsers/xlsx_parser.py:
--------------------------------------------------------------------------------
 1 | import xlrd
 2 | import six
 3 | 
 4 | from six.moves import xrange
 5 | 
 6 | from .utils import BaseParser
 7 | 
 8 | 
 9 | class Parser(BaseParser):
10 |     """Extract text from Excel files (.xls/xlsx).
11 |     """
12 | 
13 |     def extract(self, filename, **kwargs):
14 |         workbook = xlrd.open_workbook(filename)
15 |         sheets_name = workbook.sheet_names()
16 |         output = "\n"
17 |         for names in sheets_name:
18 |             worksheet = workbook.sheet_by_name(names)
19 |             num_rows = worksheet.nrows
20 |             num_cells = worksheet.ncols
21 | 
22 |             for curr_row in range(num_rows):
23 |                 row = worksheet.row(curr_row)
24 |                 new_output = []
25 |                 for index_col in xrange(num_cells):
26 |                     value = worksheet.cell_value(curr_row, index_col)
27 |                     if value:
28 |                         if isinstance(value, (int, float)):
29 |                             value = six.text_type(value)
30 |                         new_output.append(value)
31 |                 if new_output:
32 |                     output += u' '.join(new_output) + u'\n'
33 |         return output
34 | 


--------------------------------------------------------------------------------