├── scans
    ├── documents
    │   └── README
    ├── archive
    │   └── README
    ├── ocr
    │   └── README
    ├── in
    │   └── README
    └── ocr-ed
    │   └── README
├── config
    └── paperless
    │   └── README
├── .env
├── ocrmypdf-inotify
    ├── Dockerfile
    └── main.sh
├── README.md
└── docker-compose.yml


/scans/documents/README:
--------------------------------------------------------------------------------
1 | Paperless will put documents and thumbnails here
2 | 


--------------------------------------------------------------------------------
/config/paperless/README:
--------------------------------------------------------------------------------
1 | This is where Paperless will keep its sqlite database
2 | 


--------------------------------------------------------------------------------
/scans/archive/README:
--------------------------------------------------------------------------------
1 | OCRmyPDF will move source PDF files here after they have been OCRed
2 | 


--------------------------------------------------------------------------------
/scans/ocr/README:
--------------------------------------------------------------------------------
1 | This is where OCRmyPDF will keep temporary files as it OCRs your PDFs
2 | 


--------------------------------------------------------------------------------
/scans/in/README:
--------------------------------------------------------------------------------
1 | This is where you will put PDFs that you want OCRed and consumed by Paperless
2 | 


--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | PUID=1026
2 | PGID=65537
3 | TZ=Europe/London
4 | CONFIG=/path/to/config
5 | SCANS=/path/to/scans
6 | 


--------------------------------------------------------------------------------
/scans/ocr-ed/README:
--------------------------------------------------------------------------------
1 | OCRmyPDF will put OCRed PDFs in here, and they will then be consumed by Paperless.
2 | 
3 | Also, this is where you put PDFs that you do not want/need to be
4 | OCRed, as OCRmyPDF will simply ignore files that do not need OCRing
5 | (and they will sit in scans/in).
6 | 


--------------------------------------------------------------------------------
/ocrmypdf-inotify/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jbarlow83/ocrmypdf:latest
2 | # One layer: install the watcher tool and extra language packs, then drop the apt index.
3 | RUN apt-get update && apt-get install -y --no-install-recommends inotify-tools tesseract-ocr-rus tesseract-ocr-ukr && rm -rf /var/lib/apt/lists/*
4 | WORKDIR /watcher
5 | VOLUME /in /work /out /archive
6 | COPY ./main.sh /watcher/main.sh
7 | RUN chmod 0755 /watcher/main.sh
8 | ENTRYPOINT ["/bin/bash","/watcher/main.sh"]
9 | 


--------------------------------------------------------------------------------
/ocrmypdf-inotify/main.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Watches /in with inotifywait and runs OCRmyPDF on every file that lands there.
 3 | # OCR output is written to /work, then moved to /out; the source file is moved
 4 | # to /archive. Files that fail to process are left in /in untouched.
 5 | #
 6 | # Environment:
 7 | #   OCRMYPDF_BINARY      path to the ocrmypdf executable (searched for if unset)
 8 | #   OCRMYPDF_PARAMETERS  extra ocrmypdf options; deliberately word-split
 9 | set -o pipefail
10 | 
11 | if [ -z "${OCRMYPDF_BINARY}" ] ; then
12 |     # -print -quit stops at the first match, so the variable can never end up
13 |     # holding several newline-separated paths.
14 |     OCRMYPDF_BINARY=$(find / -name ocrmypdf -type f -executable -print -quit 2>/dev/null)
15 |     if [ -z "${OCRMYPDF_BINARY}" ] ; then
16 |         echo "Failed to find ocrmypdf binary. Set env var OCRMYPDF_BINARY manually"
17 |         exit 1
18 |     else
19 |         echo "Found ocrmypdf binary $OCRMYPDF_BINARY"
20 |     fi
21 | fi
22 | 
23 | if [ ! -x "${OCRMYPDF_BINARY}" ] ; then
24 |     echo "ocrmypdf binary ${OCRMYPDF_BINARY} is not executable. If you set OCRMYPDF_BINARY manually, check your settings"
25 |     # Nothing below can work without the binary, so stop instead of failing per file.
26 |     exit 1
27 | fi
28 | 
29 | # inotifywait prints "<watched dir> <events> <file name>" per event; the event
30 | # list is not needed, and `read` leaves any spaces in the name in $file.
31 | inotifywait -m -e close_write -e moved_to /in |
32 |     while read -r path _ file; do
33 |         echo "Waiting for $file..."
34 |         sleep 10
35 |         echo "Processing $file..."
36 | 
37 |         # Strip only the last extension so "a.b.pdf" and "a.c.pdf" do not both
38 |         # map to "a.pdf".
39 |         out="${file%.*}.pdf"
40 | 
41 |         # The log is kept so the failure reason can be inspected below. With
42 |         # pipefail, a failing ocrmypdf is reported even though tee succeeds.
43 |         # shellcheck disable=SC2086  # OCRMYPDF_PARAMETERS must be word-split
44 |         "${OCRMYPDF_BINARY}" ${OCRMYPDF_PARAMETERS} "$path/$file" "/work/$out" 2>&1 | tee /tmp/log
45 |         rc=$?
46 |         if [ "$rc" -ne 0 ] ; then
47 |             echo "OCRmyPDF failed with code $rc"
48 |             if grep -q DpiError /tmp/log ; then
49 |                 echo "It was DpiError, retrying with img2pdf"
50 |                 # shellcheck disable=SC2086
51 |                 img2pdf --pagesize A4 "$path/$file" | "${OCRMYPDF_BINARY}" ${OCRMYPDF_PARAMETERS} - "/work/$out"
52 |                 rc=$?
53 |                 if [ "$rc" -ne 0 ] ; then
54 |                     echo "img2pdf + OCRmyPDF failed with code $rc"
55 |                 fi
56 |             fi
57 |         fi
58 | 
59 |         if [ "$rc" -eq 0 ] && [ -f "/work/$out" ] ; then
60 |             mv -n "/work/$out" "/out/$out"
61 |             mv -n "$path/$file" "/archive/$(date +%y%m%d-%H%M%S)_$file"
62 |             echo "File $file processed and archived"
63 |         else
64 |             echo "Failed to process $file, leaving as is"
65 |             # Remove any partial output; -f keeps this quiet when none exists.
66 |             rm -f "/work/$out"
67 |         fi
68 |     done
69 | 
70 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # paperless-ocrmypdf
 2 | 
 3 | Docker compose recipe for The Paperless Project + OCRmyPDF that uses inotify to detect new files and process them
 4 | 
 5 | I wanted to archive processed documents and retry OCR with img2pdf when OCRmyPDF fails with a DpiError, so I wrote my own script instead.
 6 | 
 7 | # Limitations
 8 | 
 9 | This image relies on inotify events from your host system propagating into the container. This works if your host system is Linux, but does not if your host system is Windows (for example, see http://blog.subjectify.us/miscellaneous/2017/04/24/docker-for-windows-watch-bindings.html)
10 | 
11 | Note that since late 2019 the OCRmyPDF Docker image includes watcher.py (based on the Python watchdog module), so you might consider using it instead, even though it depends on filesystem polling.
12 | 
13 | # How does it work
14 | 
15 | This is a file-based workflow, organized in a bunch of folders inside "scans"
16 | 
17 | - PDFs to be OCRed are put into "in"
18 | 
19 | - inotify-based script picks them up and passes them to OCRmyPDF
20 | 
21 | - OCRmyPDF does its job, temporarily creating files in "ocr"
22 | 
23 | - Once a file is processed, the original is moved from "in" to "archive", and the OCRed document is put into "ocr-ed"
24 | 
25 | - Paperless picks it up from "ocr-ed" and moves it into "documents"
26 | 
27 | If you have PDFs that do not need OCR, inject them in the middle of this pipeline by putting them in "ocr-ed"
28 | 
29 | # Configuration
30 | 
31 | Move "config" and "scans" folders somewhere on your filesystem.
32 | 
33 | Change paths in .env to point to the locations of "config" and "scans"
34 | 
35 | If you need extra languages, configure them in docker-compose.yml and modify the Dockerfile to install them into the ocrmypdf container. The Dockerfile is currently written to include the English, Russian and Ukrainian languages.
36 | 
37 | Run "docker-compose up -d" and navigate to http://localhost:8000 to configure Paperless.
38 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '2.1'
 2 | ##############
 3 | # Containers #
 4 | ##############
 5 | services:
 6 |   ################
 7 |   # Paperless    #
 8 |   ################
 9 |   paperless_web:
10 |     container_name: paperless_web
11 |     hostname: paperless
12 |     image: thepaperlessproject/paperless
13 |     ports:
14 |       - '8000:8000'
15 |     healthcheck:
16 |       test: ["CMD", "curl", "-f", "http://localhost:8000"]
17 |       interval: 30s
18 |       timeout: 10s
19 |       retries: 5
20 |     volumes:
21 |       - ${CONFIG}/paperless:/usr/src/paperless/data
22 |       - ${SCANS}:/usr/src/paperless/media
23 |       - ${SCANS}/ocr-ed:/consume
24 |     # The web server does no text recognition, so PAPERLESS_OCR_LANGUAGES is
25 |     # overridden with an empty value here: this keeps it from installing
26 |     # languages the user might have set in the env-file and that only the
27 |     # consumer needs.
28 |     environment:
29 |       - PAPERLESS_OCR_LANGUAGES=
30 |       - PAPERLESS_DISABLE_LOGIN=true
31 |       - PAPERLESS_INLINE_DOC=true
32 |       - USERMAP_UID=${PUID}
33 |       - USERMAP_GID=${PGID}
34 |       - PAPERLESS_TIME_ZONE=${TZ}
35 |     command: ["runserver", "--insecure", "--noreload", "0.0.0.0:8000"]
36 |   paperless_consumer:
37 |     container_name: paperless_consumer
38 |     image: thepaperlessproject/paperless
39 |     # restart: always
40 |     depends_on:
41 |       paperless_web:
42 |         condition: service_healthy
43 |     volumes:
44 |       - ${CONFIG}/paperless:/usr/src/paperless/data
45 |       - ${SCANS}:/usr/src/paperless/media
46 |       - ${SCANS}/ocr-ed:/consume
47 |       # Likewise, you can add a local path to mount a directory for
48 |       # exporting. This is not strictly needed for paperless to
49 |       # function, only if you're exporting your files: uncomment
50 |       # it and fill in a local path if you know you're going to
51 |       # want to export your documents.
52 |       # - /path/to/another/arbitrary/place:/export
53 |     environment:
54 |       - PAPERLESS_OCR_LANGUAGES=eng+rus+ukr
55 |       - USERMAP_UID=${PUID}
56 |       - USERMAP_GID=${PGID}
57 |     command: ["document_consumer"]
58 |   ############
59 |   # OCRmyPDF #
60 |   ############
61 |   ocrmypdf-inotify:
62 |     container_name: ocrmypdf-inotify
63 |     network_mode: none
64 |     build: './ocrmypdf-inotify'
65 |     restart: always
66 |     user: "${PUID}:${PGID}"
67 |     environment:
68 |       # Container should autodetect location of ocrmypdf binary,
69 |       # but if it fails you can set it manually:
70 |       # OCRMYPDF_BINARY: /usr/local/bin/ocrmypdf
71 |       # The job count is given once; "-j 2" used to precede "--jobs 4", and
72 |       # since the later option takes effect, 4 is the value kept.
73 |       OCRMYPDF_PARAMETERS: '-l eng+rus+ukr --clean --rotate-pages --deskew --jobs 4 --output-type pdfa'
74 |     volumes:
75 |       - ${SCANS}/in:/in
76 |       - ${SCANS}/ocr:/work
77 |       - ${SCANS}/ocr-ed:/out
78 |       - ${SCANS}/archive:/archive
79 | 


--------------------------------------------------------------------------------