├── .gitignore
├── .travis.yml
├── CHANGES.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
├── screenshot_document_list_test_document.png
└── screenshot_document_search_correct.png
├── pytest.ini
├── setup.cfg
├── setup.py
├── src
└── wagtail_textract
│ ├── __init__.py
│ ├── apps.py
│ ├── handlers.py
│ ├── management
│ └── commands
│ │ ├── __init__.py
│ │ └── transcribe_documents.py
│ ├── migrations
│ ├── 0001_initial.py
│ ├── 0001_squashed_0004_auto_20180508_0942.py
│ ├── 0002_auto_20180502_1303.py
│ ├── 0002_auto_20180509_1259.py
│ ├── 0003_auto_20180503_1117.py
│ ├── 0004_auto_20180508_0942.py
│ └── __init__.py
│ ├── models.py
│ ├── settings.py
│ └── tests
│ ├── __init__.py
│ ├── settings.py
│ ├── test_document_class.py
│ ├── test_management_command.py
│ └── testfiles
│ └── test_document.pdf
├── tox.ini
└── travis-textract-requirements
├── debian
├── debian.sh
├── python-dev.txt
├── python-doc.txt
├── python.sh
└── python.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.egg-info/
3 | .pytest_cache/
4 | .tox/
5 | documents/
6 | env/
7 | tessdata/
8 | coverage_html_report/
9 | .coverage
10 | coverage.xml
11 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | dist: trusty
3 |
4 | language: python
5 | cache: pip
6 |
7 | # taken from Textract .travis.yml
8 | before_install:
9 | - sudo ./travis-textract-requirements/debian.sh
10 |
11 | # command to install dependencies
12 | install:
13 | - ./travis-textract-requirements/python.sh
14 | - pip install tox
15 |
16 | # command to run tests
17 | script:
18 | - tox -e $TOXENV
19 | # cache: pip
20 | matrix:
21 | include:
22 | - env: TOXENV=py34-dj20-wt20
23 | python: 3.4
24 | - env: TOXENV=py35-dj20-wt20
25 | python: 3.5
26 | - env: TOXENV=py36-dj20-wt20
27 | python: 3.6
28 |
--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
1 | CHANGES
2 | =======
3 |
4 | 1.3 (unreleased)
5 | ----------------
6 |
7 | - Nothing changed yet.
8 |
9 |
10 | 1.2 (2019-09-06)
11 | ----------------
12 |
13 | - Update Wagtail requirement: 2 through 2.5: https://github.com/fourdigits/wagtail_textract/pull/28/
14 |
15 |
16 | 1.1 (2019-04-15)
17 | ----------------
18 |
19 | - Updated dependency requirements.
20 |
21 |
22 | 1.0 (2018-09-05)
23 | ----------------
24 |
25 | - Updated version of python-pptx.
26 |
27 |
28 | 0.1b1 (2018-06-11)
29 | ------------------
30 |
31 | - Nothing changed yet.
32 |
33 |
34 | 0.1a2 (2018-05-08)
35 | ------------------
36 |
37 | - Fix README markdown rendering on PyPI.
38 |
39 |
40 | 0.1a1 (2018-05-08)
41 | ------------------
42 |
43 | - Fix README markdown rendering on PyPI.
44 |
45 |
46 | 0.1a0 (2018-05-08)
47 | ------------------
48 |
49 | - initial version
50 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2018, Four Digits and Wagtail Community
2 |
3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4 |
5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 |
7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 |
9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 |
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.rst
2 | include *.md
3 | include LICENSE
4 | recursive-include src *
5 | recursive-include docs *
6 | include pytest.ini
7 | include tox.ini
8 | recursive-include travis-textract-requirements *
9 | global-exclude __pycache__
10 | global-exclude *.py[co]
11 | exclude Makefile
12 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | env:
2 | virtualenv --python=`which python3` env
3 |
4 | tessdata:
5 | mkdir tessdata
6 | cd tessdata && curl -LJO https://github.com/tesseract-ocr/tessdata/raw/master/eng.traineddata
7 |
8 | test: tessdata env
9 | env/bin/pip install -e ".[test]"
10 | coverage run env/bin/pytest
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Build status](https://travis-ci.org/fourdigits/wagtail_textract)
2 | [Coverage report](http://codecov.io/github/fourdigits/wagtail_textract?branch=master)
3 |
4 | # ⚠️ Deprecation warning
5 |
6 | This package is unmaintained, and we have no plans to maintain it.
7 |
8 | We advise you to use it as an example, maybe copy the code into your own project, but don't install the package.
9 |
10 | # Text extraction for Wagtail document search
11 |
12 | This package is for replacing [Wagtail][1]'s Document class with one
13 | that allows searching in Document file contents using [textract][2].
14 |
15 | Textract can extract text from (among [others][6]) PDF, Excel and Word files.
16 |
17 | The package was inspired by the ["Search: Extract text from documents" issue][3] in Wagtail.
18 |
19 | Documents will work as before, except that Document search in Wagtail's admin interface
20 | will also find search terms in the files' contents.
21 |
22 | Some screenshots to illustrate.
23 |
24 | In our fresh Wagtail site with `wagtail_textract` installed,
25 | we uploaded a [file called `test_document.pdf`](./src/wagtail_textract/tests/testfiles/test_document.pdf) with handwritten text in it.
26 | It is listed in the admin interface under Documents:
27 |
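28 | ![Screenshot: test_document.pdf listed under Documents](./docs/screenshot_document_list_test_document.png)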
29 |
30 | If we now search in Documents for the word `correct`, which is one of the handwritten words,
31 | the live search finds it:
32 |
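33 | ![Screenshot: Document search for "correct" finds the file](./docs/screenshot_document_search_correct.png)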
34 |
35 | The assumption is that this search should not only be available in Wagtail's admin interface,
36 | but also in a public-facing search view, for which we provide a code example.
37 |
38 |
39 | ## Requirements
40 |
41 | - Wagtail 2 (see [tox.ini](./tox.ini))
42 | - The [Textract dependencies][8]
43 |
44 |
45 | ## Maturity
46 |
47 | We have been using this package in production since August 2018 on https://nuffic.nl.
48 |
49 |
50 | ## Installation
51 |
52 | - Install the [Textract dependencies][8]
53 | - Add `wagtail_textract` to your requirements and/or `pip install wagtail_textract`
54 | - Add to your Django `INSTALLED_APPS`.
55 | - Put `WAGTAILDOCS_DOCUMENT_MODEL = "wagtail_textract.document"` in your Django settings.
56 |
57 | Note: You'll get an incompatibility warning during installation of wagtail_textract (Wagtail 2.0.1 installed):
58 |
59 | ```
60 | requests 2.18.4 has requirement chardet<3.1.0,>=3.0.2, but you'll have chardet 2.3.0 which is incompatible.
61 | textract 1.6.1 has requirement beautifulsoup4==4.5.3, but you'll have beautifulsoup4 4.6.0 which is incompatible.
62 | ```
63 |
64 | We haven't seen this leading to problems, but it's something to keep in mind.
65 |
66 |
67 | ### Tesseract
68 |
69 | In order to make `textract` use [Tesseract][4], which happens if regular
70 | `textract` finds no text, you need to add the data files that Tesseract can
71 | base its word matching on.
72 |
73 | Create a `tessdata` directory in your project directory, and download the
74 | [languages][5] you want.
75 |
76 |
77 | ## Transcribing
78 |
79 | Transcription is done automatically after Document save,
80 | in an [`asyncio`][7] executor to prevent blocking the response during processing.
81 |
82 | To transcribe all existing Documents, run the management command:
83 |
84 | ./manage.py transcribe_documents
85 |
86 | This may take a long time, obviously.
87 |
88 |
89 | ## Usage in custom view
90 |
91 | Here is a code example for a search view (outside Wagtail's admin interface)
92 | that shows both Page and Document results.
93 |
94 | ```python
from itertools import chain

from django.shortcuts import render
from wagtail.core.models import Page
from wagtail.documents.models import get_document_model
from wagtail.search.models import Query


def search(request):
    """Render combined Page and Document results for the ``query`` GET parameter.

    When no query is given, an empty Page queryset is passed to the template.
    """
    # Search
    search_query = request.GET.get('query', None)
    if search_query:
        page_results = Page.objects.live().search(search_query)
        # Resolve the Document model at call time so the model configured in
        # WAGTAILDOCS_DOCUMENT_MODEL (the one with `transcription`) is searched.
        document_results = get_document_model().objects.search(search_query)
        search_results = list(chain(page_results, document_results))

        # Log the query so Wagtail can suggest promoted results
        Query.get(search_query).add_hit()
    else:
        search_results = Page.objects.none()

    # Render template
    return render(request, 'website/search_results.html', {
        'search_query': search_query,
        'search_results': search_results,
    })
119 | ```
120 |
121 | Your template should allow for handling Documents differently than Pages,
122 | because you can't do `pageurl result` on a Document:
123 |
124 | ```jinja2
125 | {% if result.file %}
126 | <a href="{{ result.url }}">{{ result }}</a>
127 | {% else %}
128 | <a href="{% pageurl result %}">{{ result }}</a>
129 | {% endif %}
130 | ```
131 |
132 |
133 | ## What if you already use a custom Document model?
134 |
135 | In order to use wagtail_textract, your `CustomizedDocument` model should do
136 | the same as [wagtail_textract's Document](./src/wagtail_textract/models.py):
137 |
138 | - subclass `TranscriptionMixin`
139 | - alter `search_fields`
140 |
141 | ```python
142 | from wagtail_textract.models import TranscriptionMixin
143 |
144 |
145 | class CustomizedDocument(TranscriptionMixin, ...):
146 | """Extra fields and methods for Document model."""
147 | search_fields = ... + [
148 | index.SearchField(
149 | 'transcription',
150 | partial_match=False,
151 | ),
152 | ]
153 | ```
154 |
155 | Note that the first class to subclass should be `TranscriptionMixin`,
156 | so its `save()` takes precedence over that of the other parent classes.
157 |
158 |
159 | ## Tests
160 |
161 | To run tests, check out this repository and:
162 |
163 | make test
164 |
165 |
166 | ### Coverage
167 |
168 | A coverage report will be generated in `./coverage_html_report/`.
169 |
170 |
171 | ## Contributors
172 |
173 | - Karl Hobley
174 | - Bertrand Bordage
175 | - Kees Hink
176 | - Tom Hendrikx
177 | - Coen van der Kamp
178 | - Mike Overkamp
179 | - Thibaud Colas
180 | - Dan Braghis
181 | - Dan Swain
182 |
183 |
184 | [1]: https://wagtail.io/
185 | [2]: https://github.com/deanmalmgren/textract
186 | [3]: https://github.com/wagtail/wagtail/issues/542
187 | [4]: https://github.com/tesseract-ocr
188 | [5]: https://github.com/tesseract-ocr/tessdata
189 | [6]: http://textract.readthedocs.io/en/stable/#currently-supporting
190 | [7]: https://docs.python.org/3/library/asyncio.html
191 | [8]: http://textract.readthedocs.io/en/latest/installation.html
192 |
--------------------------------------------------------------------------------
/docs/screenshot_document_list_test_document.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fourdigits/wagtail_textract/c1e67f6f26853c8fc20c20a879c00a32948368a3/docs/screenshot_document_list_test_document.png
--------------------------------------------------------------------------------
/docs/screenshot_document_search_correct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fourdigits/wagtail_textract/c1e67f6f26853c8fc20c20a879c00a32948368a3/docs/screenshot_document_search_correct.png
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | DJANGO_SETTINGS_MODULE = wagtail_textract.tests.settings
3 | testpaths = src/wagtail_textract
4 | python_files = test_*.py
5 | norecursedirs = migrations
6 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [wheel]
2 | universal=1
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Packaging script for wagtail-textract."""
import io
import os

# setuptools is required: find_packages, install_requires, extras_require and
# long_description_content_type are not available from plain distutils, so a
# distutils fallback could never have completed successfully.
from setuptools import setup, find_packages


# Hack to prevent "TypeError: 'NoneType' object is not callable" error
# in multiprocessing/util.py _exit_function when setup.py exits
# (see http://www.eby-sarna.com/pipermail/peak/2010-May/003357.html)
try:
    import multiprocessing  # NOQA
except ImportError:
    pass


def read_readme():
    """Return the README contents as text.

    The file is located relative to this script (not the current working
    directory) and decoded explicitly as UTF-8, because it contains
    non-ASCII characters that fail to decode under an ASCII default locale.
    """
    readme_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'README.md',
    )
    with io.open(readme_path, 'r', encoding='utf-8') as readme:
        return readme.read()


install_requires = [
    "wagtail>=2,<2.6",
    "textract",
]

tests_require = [
    'pytest',
    'pytest-django',
    'coverage',
    'codecov',
]

setup(
    name='wagtail-textract',
    version='1.3.dev0',
    description='Allow searching for text in Documents in the Wagtail content management system',
    author='Kees Hink',
    author_email='kees@fourdigits.nl',
    url='https://github.com/fourdigits/wagtail_textract',
    package_dir={'': 'src'},
    packages=find_packages('src'),
    include_package_data=True,
    license='BSD',
    long_description=read_readme(),
    long_description_content_type='text/markdown',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Web Environment',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Framework :: Django',
        'Framework :: Django :: 2.0',
        'Framework :: Django :: 2.1',
        'Framework :: Django :: 2.2',
        'Framework :: Wagtail',
        'Framework :: Wagtail :: 2',
        'Topic :: Internet :: WWW/HTTP :: Site Management',
    ],
    install_requires=install_requires,
    # `pip install -e ".[test]"` (used by the Makefile and tox) pulls these in.
    extras_require={
        'test': tests_require,
    },
    entry_points="""""",
    zip_safe=False,
    cmdclass={
    },
)
71 |
--------------------------------------------------------------------------------
/src/wagtail_textract/__init__.py:
--------------------------------------------------------------------------------
"""wagtail_textract package: version lookup and default Django app config."""
try:
    import pkg_resources
except ImportError:
    # setuptools is not guaranteed to be installed at runtime.
    pkg_resources = None


def _detect_version():
    """Return the installed distribution's version, or 'unknown'.

    The lookup only succeeds when the package was installed as a
    distribution. When the source is merely copied into a project there is
    no distribution metadata, and importing the package must not fail
    because of that.
    """
    if pkg_resources is None:
        return 'unknown'
    try:
        return pkg_resources.get_distribution("wagtail_textract").version
    except pkg_resources.DistributionNotFound:
        return 'unknown'


__version__ = _detect_version()

default_app_config = 'wagtail_textract.apps.WagtailTextractAppConfig'
6 |
--------------------------------------------------------------------------------
/src/wagtail_textract/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 |
3 |
class WagtailTextractAppConfig(AppConfig):
    """Django application configuration for the wagtail_textract app."""

    verbose_name = "Wagtail-Textract Search integration"
    label = 'wagtail_textract'
    name = 'wagtail_textract'
9 |
--------------------------------------------------------------------------------
/src/wagtail_textract/handlers.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import logging
3 | import textract
4 |
5 | logger = logging.getLogger(__name__)
6 | loop = asyncio.get_event_loop()
7 |
8 |
def transcribe_document(document):
    """Store the Document file's text in the transcription field.

    Text is first extracted with textract's default method for the file;
    when that yields nothing, extraction is retried with the ``tesseract``
    method. Any failure during extraction or decoding is logged, not raised,
    because this function runs in an executor where an exception would
    otherwise go unnoticed.

    :param document: object exposing ``file.path``, ``filename``,
        ``transcription`` and ``save(transcribe=False)``.
    """
    try:
        raw_text = textract.process(document.file.path).strip()
        if not raw_text:
            logger.debug('No text found, falling back to tesseract.')
            raw_text = textract.process(
                document.file.path,
                method='tesseract',
            ).strip()
        # Decode inside the try block so undecodable output is reported
        # through the same error path as an extraction failure.
        text = raw_text.decode()

    except Exception as err:
        text = None
        logger.error(
            'Text extraction error with file {file}: {message}'.format(
                file=document.filename,
                message=str(err),
            )
        )

    if text:
        document.transcription = text
        # transcribe=False prevents save() from scheduling another transcription.
        document.save(transcribe=False)
        logger.debug("Saved transcription: %s", text)
    else:
        logger.error('No text found.')
35 |
36 |
def async_transcribe_document(document):
    """Schedule ``transcribe_document(document)`` on the event loop's executor."""
    # Passing None selects the loop's default executor.
    default_executor = None
    loop.run_in_executor(default_executor, transcribe_document, document)
40 |
--------------------------------------------------------------------------------
/src/wagtail_textract/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fourdigits/wagtail_textract/c1e67f6f26853c8fc20c20a879c00a32948368a3/src/wagtail_textract/management/commands/__init__.py
--------------------------------------------------------------------------------
/src/wagtail_textract/management/commands/transcribe_documents.py:
--------------------------------------------------------------------------------
1 | from django.core.management.base import BaseCommand
2 |
3 | from wagtail.documents.models import get_document_model
4 |
5 | from wagtail_textract.handlers import async_transcribe_document
6 |
7 |
class Command(BaseCommand):
    """Management command that schedules transcription of every Document."""

    def handle(self, *args, **options):
        """Announce each Document on stdout and hand it to the transcriber."""
        document_model = get_document_model()
        all_documents = document_model.objects.all()
        for doc in all_documents:
            self.stdout.write("Transcribing %s" % doc)
            async_transcribe_document(doc)
16 |
--------------------------------------------------------------------------------
/src/wagtail_textract/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.4 on 2018-05-02 08:05
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
class Migration(migrations.Migration):
    """Create the ``Document`` model as a child of ``wagtaildocs.Document``."""

    initial = True

    dependencies = [
        ('wagtaildocs', '0007_merge'),
    ]

    operations = [
        migrations.CreateModel(
            name='Document',
            fields=[
                # Parent link: the primary key is the one-to-one pointer to wagtaildocs.Document.
                ('document_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='wagtaildocs.Document')),
                # Nullable in this first revision of the field.
                ('transcription', models.TextField(null=True)),
            ],
            options={
                'verbose_name': 'document',
                'abstract': False,
            },
            bases=('wagtaildocs.document',),
        ),
    ]
29 |
--------------------------------------------------------------------------------
/src/wagtail_textract/migrations/0001_squashed_0004_auto_20180508_0942.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.4 on 2018-05-08 09:45
2 |
3 | from django.db import migrations, models
4 | import django.db.models.deletion
5 |
6 |
class Migration(migrations.Migration):
    """Squashed replacement for migrations 0001 through 0004.

    Creates the ``Document`` model directly with ``transcription`` defined as
    a blank-able TextField defaulting to the empty string.
    """

    # The individual migrations this squashed migration stands in for.
    replaces = [('wagtail_textract', '0001_initial'), ('wagtail_textract', '0002_auto_20180502_1303'), ('wagtail_textract', '0003_auto_20180503_1117'), ('wagtail_textract', '0004_auto_20180508_0942')]

    initial = True

    dependencies = [
        ('wagtaildocs', '0007_merge'),
    ]

    operations = [
        migrations.CreateModel(
            name='Document',
            fields=[
                # Parent link: the primary key is the one-to-one pointer to wagtaildocs.Document.
                ('document_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='wagtaildocs.Document')),
                ('transcription', models.TextField(blank=True, default='')),
            ],
            options={
                'verbose_name': 'document',
                'abstract': False,
            },
            bases=('wagtaildocs.document',),
        ),
    ]
31 |
--------------------------------------------------------------------------------
/src/wagtail_textract/migrations/0002_auto_20180502_1303.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.4 on 2018-05-02 13:03
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Redefine ``Document.transcription`` as a TextField defaulting to ''."""

    dependencies = [
        ('wagtail_textract', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='document',
            name='transcription',
            field=models.TextField(default=''),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/src/wagtail_textract/migrations/0002_auto_20180509_1259.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.5 on 2018-05-09 12:59
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Reset the ``Document`` model options to an empty dict.

    Applies on top of the squashed 0001 migration.
    """

    dependencies = [
        ('wagtail_textract', '0001_squashed_0004_auto_20180508_0942'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='document',
            options={},
        ),
    ]
18 |
--------------------------------------------------------------------------------
/src/wagtail_textract/migrations/0003_auto_20180503_1117.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.4 on 2018-05-03 11:17
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Redefine ``Document.transcription`` with ``blank=True`` and default ''."""

    dependencies = [
        ('wagtail_textract', '0002_auto_20180502_1303'),
    ]

    operations = [
        migrations.AlterField(
            model_name='document',
            name='transcription',
            field=models.TextField(blank=True, default=''),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/src/wagtail_textract/migrations/0004_auto_20180508_0942.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.0.4 on 2018-05-08 09:42
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Reset the ``Document`` model options to an empty dict."""

    dependencies = [
        ('wagtail_textract', '0003_auto_20180503_1117'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='document',
            options={},
        ),
    ]
18 |
--------------------------------------------------------------------------------
/src/wagtail_textract/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fourdigits/wagtail_textract/c1e67f6f26853c8fc20c20a879c00a32948368a3/src/wagtail_textract/migrations/__init__.py
--------------------------------------------------------------------------------
/src/wagtail_textract/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 | from wagtail.documents.models import Document as WagtailDocument
3 | from wagtail.search import index
4 |
5 | from wagtail_textract.handlers import async_transcribe_document
6 |
7 |
class TranscriptionMixin(models.Model):
    """Abstract mixin adding a ``transcription`` field and a transcribing save."""
    transcription = models.TextField(default='', blank=True)

    class Meta:
        """Don't create a table, this model is only for subclassing."""
        abstract = True

    def save(self, *args, **kwargs):
        """Save the instance, then schedule transcription of its file.

        Positional arguments are forwarded untouched, so callers that pass
        them to the parent ``save()`` keep working.

        :param transcribe: keyword-only flag (default ``True``); pass
            ``False`` to save without scheduling a transcription, as the
            transcription handler does when it stores its result.
        """
        # Remove our own flag before delegating; the parent save() does not know it.
        transcribe = kwargs.pop('transcribe', True)
        super(TranscriptionMixin, self).save(*args, **kwargs)
        if transcribe:
            async_transcribe_document(self)
22 |
23 |
class Document(TranscriptionMixin, WagtailDocument):
    """Wagtail Document whose search index also covers ``transcription``."""
    search_fields = WagtailDocument.search_fields + [
        index.SearchField('transcription', partial_match=False),
    ]
32 |
--------------------------------------------------------------------------------
/src/wagtail_textract/settings.py:
--------------------------------------------------------------------------------
# Start from Wagtail's own test settings; the names used below come from this star import.
from wagtail.tests.settings import *

# Use wagtail_textract's Document as the Wagtail document model.
WAGTAILDOCS_DOCUMENT_MODEL = 'wagtail_textract.document'
# Register this app on top of the apps inherited from the imported settings.
INSTALLED_APPS = INSTALLED_APPS + ('wagtail_textract',)
5 |
--------------------------------------------------------------------------------
/src/wagtail_textract/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fourdigits/wagtail_textract/c1e67f6f26853c8fc20c20a879c00a32948368a3/src/wagtail_textract/tests/__init__.py
--------------------------------------------------------------------------------
/src/wagtail_textract/tests/settings.py:
--------------------------------------------------------------------------------
# Start from Wagtail's own test settings; the names used below come from this star import.
from wagtail.tests.settings import *

# Media root is set to the relative path '.'.
MEDIA_ROOT = '.'
# Use wagtail_textract's Document as the Wagtail document model.
WAGTAILDOCS_DOCUMENT_MODEL = 'wagtail_textract.document'
# Register this app on top of the apps inherited from the imported settings.
INSTALLED_APPS = INSTALLED_APPS + ('wagtail_textract',)
6 |
--------------------------------------------------------------------------------
/src/wagtail_textract/tests/test_document_class.py:
--------------------------------------------------------------------------------
1 | from wagtail.documents.models import get_document_model
2 |
3 |
def test_document_class():
    """Check that the configured Document model exposes ``transcription``.

    In practice this verifies that Wagtail's WAGTAILDOCS_DOCUMENT_MODEL
    setting still resolves to our model and that the test settings are right.
    """
    document_model = get_document_model()
    assert hasattr(document_model, 'transcription')

    indexed_names = []
    for search_field in document_model.search_fields:
        indexed_names.append(search_field.field_name)
    assert 'transcription' in indexed_names
13 |
--------------------------------------------------------------------------------
/src/wagtail_textract/tests/test_management_command.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import time
3 |
4 | from django.core.files import File
5 | from django.core.management import call_command
6 | from wagtail.documents.models import get_document_model
7 |
8 | Document = get_document_model()
9 |
10 |
@pytest.mark.django_db
def test_management_command():
    """Test the transcribe_documents management script.

    This creates a Document with the file `test_document.pdf`, which contains
    the hand-written words "CORRECT HORSE BATTERY STAPLE" on separate lines.
    Unfortunately, the handwriting is not clear enough so OCR recognizes
    'CORRECT H o R SE’ BATTE Ry STAPLE'. Only the two words that OCR reads
    reliably ("CORRECT" and "STAPLE") are asserted.
    """
    path = './src/wagtail_textract/tests/testfiles/'
    # The context manager closes the handle even if create() raises.
    with open('%s/test_document.pdf' % path, 'rb') as fhandle:
        document = Document.objects.create(
            title="Test file",
            file=File(fhandle),
        )
    document.save()
    call_command('transcribe_documents')

    # Transcription field is empty initially
    document.refresh_from_db()
    assert document.transcription == ''

    # After some time, transcription is complete
    time.sleep(10)
    document.refresh_from_db()
    assert 'CORRECT' in document.transcription
    assert 'STAPLE' in document.transcription
40 |
--------------------------------------------------------------------------------
/src/wagtail_textract/tests/testfiles/test_document.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fourdigits/wagtail_textract/c1e67f6f26853c8fc20c20a879c00a32948368a3/src/wagtail_textract/tests/testfiles/test_document.pdf
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist =
3 | py{34,35,36}-dj{20}-wt{20,21,22}
4 | py{35,36}-dj{21}-wt{23}
5 | py{35,36,37}-dj{21}-wt{24}
6 | py{35,36,37}-dj{22}-wt{25}
7 |
8 | [testenv]
9 | basepython =
10 | py34: python3.4
11 | py35: python3.5
12 | py36: python3.6
13 | py37: python3.7
14 |
15 | deps =
16 | pytest
17 | pytest-django
18 | coverage
19 | codecov
20 | dj20: Django>=2.0,<2.1
21 | dj21: Django>=2.1,<2.2
22 | dj22: Django>=2.2,<2.3
23 | wt20: wagtail>=2.0,<2.1
24 | wt21: wagtail>=2.1,<2.2
25 | wt22: wagtail>=2.2,<2.3
26 | wt23: wagtail>=2.3,<2.4
27 | wt24: wagtail>=2.4,<2.5
28 | wt25: wagtail>=2.5,<2.6
29 |
30 | whitelist_externals =
31 | make
32 |
33 | install_command = pip install -e ".[test]" --upgrade {opts} {packages}
34 |
35 | # include {envsitepackagesdir}/wagtail_textract b/c "import file mismatch":
36 | # http://tox.readthedocs.io/en/latest/example/pytest.html#known-issues-and-limitations
37 | commands =
38 | make tessdata
39 | coverage run {envbindir}/pytest {envsitepackagesdir}/wagtail_textract
40 | codecov --token=eff6f245-ba93-4858-8640-610f27103511
41 |
--------------------------------------------------------------------------------
/travis-textract-requirements/debian:
--------------------------------------------------------------------------------
1 | # required packages
2 | gcc
3 | libpulse-dev
4 | libjpeg-dev
5 | build-essential
6 | git
7 | make
8 |
9 | # these packages are required by python-docx, which depends on lxml
10 | # and requires these things
11 | python-dev
12 | libxml2-dev
13 | libxslt1-dev
14 |
15 | # parse word documents
16 | antiword
17 |
18 | # parse rtf documents
19 | unrtf
20 |
21 | # parse image files
22 | tesseract-ocr=3.03\*
23 | libjpeg-dev
24 |
25 | # parse pdfs
26 | poppler-utils
27 |
28 | # parse postscript files
29 | pstotext
30 |
31 | # parse audio files, with SpeechRecognition
32 | flac
33 |
34 | # filetype conversion libs
35 | ffmpeg
36 | lame
37 | libmad0
38 | libsox-fmt-mp3
39 |
40 | # convert audio files
41 | sox
42 |
43 | # Sphinx Speech Recognition
44 | swig
45 |
46 | # ubuntu 14.04 requires this in addition to libxml2-dev and
47 | # libxslt1-dev for compiling lxml.
48 | # https://github.com/deanmalmgren/textract/issues/19
49 | zlib1g-dev
50 |
--------------------------------------------------------------------------------
/travis-textract-requirements/debian.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Install the system packages listed in travis-textract-requirements/debian.
#
# This needs to work both for Vagrant provisioning and for Travis
# builds in a Python virtualenv, each of which have different current
# working directories when this script is called. When run in Vagrant, the
# script is copied to /tmp and executed from there, passing the original
# path as the first argument. So deal with that.
#
# Paths are quoted so directories containing spaces work, and a failed `cd`
# aborts instead of continuing from the wrong directory.
if [ "$1" == "" ]; then
    # normal
    cd "$(dirname "$0")/.." || exit 1
else
    # run from /tmp by Vagrant.
    cd "$1" || exit 1
fi
base=$(pwd)

# Install all of the dependencies required in the examples.
# http://docs.travis-ci.com/user/installing-dependencies/#Installing-Ubuntu-packages
add-apt-repository ppa:mc3man/trusty-media -y
apt-get update -qq
# The sed expression strips `#` comments from the package list before it is
# handed to apt-get.
sed 's/\(.*\)\#.*/\1/' < "$base/travis-textract-requirements/debian" | xargs apt-get install -y --fix-missing
22 |
--------------------------------------------------------------------------------
/travis-textract-requirements/python-dev.txt:
--------------------------------------------------------------------------------
1 | # This includes all packages that are used in development, including all
2 | # packages that are required by textract itself (python), packages for
3 | # documentation builds (python-doc)
4 |
5 | -r python.txt
6 | -r python-doc.txt
7 |
8 | # needed for tests/run.py script to read .travis.yml file
9 | PyYAML==3.12
10 | pep8==1.7.0
11 | coveralls==1.1
12 | requests==2.18.1
13 | nose==1.3.7
14 |
15 | # needed for managing versions
16 | bumpversion==0.5.3
17 |
--------------------------------------------------------------------------------
/travis-textract-requirements/python-doc.txt:
--------------------------------------------------------------------------------
1 | # this only includes packages that are needed for documentation build.
2 |
3 | sphinx==1.6.3
4 | sphinx_rtd_theme==0.2.4
5 | sphinx-argparse==0.2.1
6 |
--------------------------------------------------------------------------------
/travis-textract-requirements/python.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Install the Python requirements listed in travis-textract-requirements.
#
# This needs to work for vagrant, Travis builds, and Docker builds,
# in a python virtualenv. In the virtual machine provisioning,
# we're passing the directory this should be run from. In travis-ci,
# it's run from the root of the repository.
if [ "$#" -eq 1 ]; then
    # Quote the path and stop if it is not reachable, so pip never runs
    # against a requirements file resolved from the wrong directory.
    cd "$1" || exit 1
fi

# upgrade pip so we can use wheel downloads
pip install -U pip

# Install the requirements for this package as well as this module.
pip install -r travis-textract-requirements/python-dev.txt
16 |
--------------------------------------------------------------------------------
/travis-textract-requirements/python.txt:
--------------------------------------------------------------------------------
1 | # This file contains all python dependencies that are required by the textract
2 | # package in order for it to properly work.
3 |
4 | argcomplete==1.8.2
5 | chardet==3.0.4
6 | python-pptx==0.6.12
7 | #pdfminer.six <-- go back to this after the shebang fix is released (see https://github.com/goulu/pdfminer/issues/27)
8 | https://github.com/goulu/pdfminer/zipball/e6ad15af79a26c31f4e384d8427b375c93b03533#egg=pdfminer.six
9 | docx2txt==0.6
10 | beautifulsoup4==4.6.0
11 | xlrd==1.0.0
12 | EbookLib==0.16
13 | SpeechRecognition==3.7.1
14 | https://github.com/mattgwwalker/msg-extractor/zipball/master
15 | six==1.10.0
16 | pocketsphinx==0.1.3
17 |
--------------------------------------------------------------------------------