├── scans ├── documents │ └── README ├── archive │ └── README ├── ocr │ └── README ├── in │ └── README └── ocr-ed │ └── README ├── config └── paperless │ └── README ├── .env ├── ocrmypdf-inotify ├── Dockerfile └── main.sh ├── README.md └── docker-compose.yml /scans/documents/README: -------------------------------------------------------------------------------- 1 | Paperless will put documents and thumbnails here 2 | -------------------------------------------------------------------------------- /config/paperless/README: -------------------------------------------------------------------------------- 1 | This is where Paperless will keep its sqlite database 2 | -------------------------------------------------------------------------------- /scans/archive/README: -------------------------------------------------------------------------------- 1 | OCRmyPDF will move source PDF files here after they have been OCRed 2 | -------------------------------------------------------------------------------- /scans/ocr/README: -------------------------------------------------------------------------------- 1 | This is where OCRmyPDF will keep temporary files as it OCRs your PDFS 2 | -------------------------------------------------------------------------------- /scans/in/README: -------------------------------------------------------------------------------- 1 | This is where you will put PDFs that you want OCRed and consumed by Paperless 2 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | PUID=1026 2 | PGID=65537 3 | TZ=Europe/London 4 | CONFIG=/path/to/config 5 | SCANS=/path/to/scans 6 | -------------------------------------------------------------------------------- /scans/ocr-ed/README: -------------------------------------------------------------------------------- 1 | OCRmyPDF will puts OCRed PDFs in here, and they would be consumed by Paperless. 2 | 3 | Also, this is where you put PDFs that you do not want/need to be 4 | OCRed, as OCRmyPDF will simply ignore files that do not need OCRing 5 | (and they will sit in scans/in). 6 | -------------------------------------------------------------------------------- /ocrmypdf-inotify/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jbarlow83/ocrmypdf:latest 2 | RUN apt-get update && apt-get install -y inotify-tools tesseract-ocr-rus tesseract-ocr-ukr 3 | RUN mkdir -p /watcher/ 4 | WORKDIR /watcher 5 | VOLUME /in /work /out /archive 6 | COPY ./main.sh /watcher/main.sh 7 | RUN chmod 0755 /watcher/main.sh 8 | ENTRYPOINT ["/bin/bash","/watcher/main.sh"] 9 | -------------------------------------------------------------------------------- /ocrmypdf-inotify/main.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | set -o pipefail 3 | 4 | if [ -z "${OCRMYPDF_BINARY}" ] ; then 5 | OCRMYPDF_BINARY=$(find / -name ocrmypdf -type f -executable 2>/dev/null) 6 | if [ -z "${OCRMYPDF_BINARY}" ] ; then 7 | echo "Failed to find ocrmypdf binary. Set env var OCRMYPDF_BINARY manually" 8 | exit 1 9 | else 10 | echo "Found ocrmypdf binary $OCRMYPDF_BINARY" 11 | fi 12 | fi 13 | 14 | if [ ! -x "${OCRMYPDF_BINARY}" ] ; then 15 | echo "ocrmypdf binary ${OCRMYPDF_BINARY} is not executable. If you set OCRMYPDF_BINARY manually, check your settings" 16 | fi 17 | 18 | inotifywait -m -e close_write -e moved_to /in | 19 | while read -r path action file; do 20 | echo "Waiting for $file..." 21 | sleep 10 22 | echo "Processing $file..." 23 | 24 | out="${file%%.*}.pdf" 25 | 26 | ${OCRMYPDF_BINARY} ${OCRMYPDF_PARAMETERS} "$path/$file" "/work/$out" 2>&1 | tee /tmp/log 27 | rc=$? 28 | if [ $rc -ne 0 ] ; then 29 | echo "OCRmyPDF failed with code $rc" 30 | if [ -n "$(grep DpiError /tmp/log)" ] ; then 31 | echo "It was DpiError, retrying with img2pdf" 32 | img2pdf --pagesize A4 "$path/$file" | ${OCRMYPDF_BINARY} ${OCRMYPDF_PARAMETERS} - "/work/$out" 33 | rc=$? 34 | if [ $rc -ne 0 ] ; then 35 | echo "img2pdf + OCRmyPDF failed with code $rc" 36 | fi 37 | fi 38 | fi 39 | 40 | if [ $rc -eq 0 -a -f "/work/$out" ] ; then 41 | mv -n "/work/$out" "/out/$out" 42 | mv -n "$path/$file" /archive/$(date +%y%m%d-%H%M%S)_"$file" 43 | echo "File $file processed and archived" 44 | else 45 | echo "Failed to process $file, leaving as is" 46 | [ -f "/work/$out" ] && rm "/work/$out" 47 | fi 48 | done 49 | 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # paperless-ocrmypdf 2 | 3 | Docker compose recipe for The Paperless Project + OCRmyPDF that uses inotify to detect new files and process them 4 | 5 | I wanted to archive processed documents and re-try OCR on DpiError using img2pdf, so I rolled out my own script instead. 6 | 7 | # Limitations 8 | 9 | This image relies on inotify events from your host system propagating into the container. This works if your host system is Linux, but does not if your host system is Windows (for example, see http://blog.subjectify.us/miscellaneous/2017/04/24/docker-for-windows-watch-bindings.html) 10 | 11 | Note that since recently (late 2019) OCRmyPDF docker image includes watcher.py (based on Python watchdog module), so you might consider using it instead, even though it depends on filesystem polling. 12 | 13 | # How does it work 14 | 15 | This is a file-based workflow, organized in a bunch of folders inside "scans" 16 | 17 | - PDFs to be OCRed are put into "in" 18 | 19 | - inotify-based script picks them up and passes them to OCRmyPDF 20 | 21 | - OCRmyPDF does its job, temporary creating files in "ocr" 22 | 23 | - Once file is processed, the original is moved from "in" to "archive", and OCRed document is put into "ocr-ed" 24 | 25 | - Paperless picks it up from "ocr-ed" and moves it into "documents" 26 | 27 | If you have PDFs that do not need OCR, inject them in the middle of this pipeline by putting them in "ocr-ed" 28 | 29 | # Configuration 30 | 31 | Move "config" and "scans" folders somewhere on your filesystem. 32 | 33 | Change paths in .env to point to the locations of "config" and "scans" 34 | 35 | If you need extra languages, configure them in docker-compose.yml and modify Dockerfile to install them into ocrmypdf container. Dockerfile currently is written to include English, Russian and Ukrainian languages. 36 | 37 | Run "docker-compose up -d" and navigate to http://localhost:8000 to configure Paperless. 38 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.1' 2 | ############## 3 | # Containers # 4 | ############## 5 | services: 6 | ################ 7 | # Paperless # 8 | ################ 9 | paperless_web: 10 | container_name: paperless_web 11 | hostname: paperless 12 | image: thepaperlessproject/paperless 13 | ports: 14 | - 8000:8000 15 | healthcheck: 16 | test: ["CMD", "curl" , "-f", "http://localhost:8000"] 17 | interval: 30s 18 | timeout: 10s 19 | retries: 5 20 | volumes: 21 | - ${CONFIG}/paperless:/usr/src/paperless/data 22 | - ${SCANS}:/usr/src/paperless/media 23 | - ${SCANS}/ocr-ed:/consume 24 | # The reason the line is here is so that the webserver that doesn't do 25 | # any text recognition and doesn't have to install unnecessary 26 | # languages the user might have set in the env-file by overwriting the 27 | # value with nothing. 28 | environment: 29 | - PAPERLESS_OCR_LANGUAGES= 30 | - PAPERLESS_DISABLE_LOGIN=true 31 | - PAPERLESS_INLINE_DOC=true 32 | - USERMAP_UID=${PUID} 33 | - USERMAP_GID=${PGID} 34 | - PAPERLESS_TIME_ZONE=${TZ} 35 | command: ["runserver", "--insecure", "--noreload", "0.0.0.0:8000"] 36 | paperless_consumer: 37 | container_name: paperless_consumer 38 | image: thepaperlessproject/paperless 39 | # restart: always 40 | depends_on: 41 | paperless_web: 42 | condition: service_healthy 43 | volumes: 44 | - ${CONFIG}/paperless:/usr/src/paperless/data 45 | - ${SCANS}:/usr/src/paperless/media 46 | - ${SCANS}/ocr-ed:/consume 47 | # Likewise, you can add a local path to mount a directory for 48 | # exporting. This is not strictly needed for paperless to 49 | # function, only if you're exporting your files: uncomment 50 | # it and fill in a local path if you know you're going to 51 | # want to export your documents. 52 | # - /path/to/another/arbitrary/place:/export 53 | environment: 54 | - PAPERLESS_OCR_LANGUAGES=eng+rus+ukr 55 | - USERMAP_UID=${PUID} 56 | - USERMAP_GID=${PGID} 57 | command: ["document_consumer"] 58 | ############ 59 | # OCRmyPDF # 60 | ############ 61 | ocrmypdf-inotify: 62 | container_name: ocrmypdf-inotify 63 | network_mode: none 64 | build: './ocrmypdf-inotify' 65 | restart: always 66 | user: "${PUID}:${PGID}" 67 | environment: 68 | # Container should autodetect location of ocrmypdf binary, 69 | # but if it fails you can set it manually: 70 | # OCRMYPDF_BINARY: /usr/local/bin/ocrmypdf 71 | OCRMYPDF_PARAMETERS: -j 2 -l eng+rus+ukr --clean --rotate-pages --deskew --jobs 4 --output-type pdfa 72 | volumes: 73 | - ${SCANS}/in:/in 74 | - ${SCANS}/ocr:/work 75 | - ${SCANS}/ocr-ed:/out 76 | - ${SCANS}/archive:/archive 77 | --------------------------------------------------------------------------------