├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── auto_makedoc.sh ├── django_ocr_server.png ├── django_ocr_server ├── __init__.py ├── admin.py ├── apiviews.py ├── apps.py ├── conf.py ├── converters.py ├── default_settings.py ├── exceptions.py ├── forms.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── create_user.py │ │ ├── db_ping.py │ │ └── ttl.py ├── migrations │ ├── 0001_initial.py │ ├── 0002_auto_20201230_0312.py │ ├── 0003_auto_20201230_0316.py │ └── __init__.py ├── models.py ├── serializers.py ├── static │ └── django_ocr_server │ │ ├── django-ocr-server.png │ │ ├── django-ocr-server.svg │ │ └── style.css ├── templates │ ├── admin │ │ └── django_ocr_server │ │ │ └── ocredfile │ │ │ └── submit_line.html │ └── django_ocr_server │ │ ├── base.html │ │ ├── forms │ │ └── widgets │ │ │ ├── file_link.html │ │ │ ├── pdf_info.html │ │ │ └── pdf_link.html │ │ └── main_page.html ├── tests │ ├── __init__.py │ ├── assets │ │ ├── deming.pdf │ │ ├── empty_file.txt │ │ ├── not_empty_file.txt │ │ ├── not_image.txt │ │ ├── shmakovpn.jpg │ │ ├── shmakovpn.pdf │ │ ├── some_dir │ │ │ └── .gitkeep │ │ ├── test_eng.pdf │ │ ├── test_eng.png │ │ ├── test_eng_notext.pdf │ │ ├── test_rus.png │ │ └── the_pdf_withtext.pdf │ ├── dependencies │ │ ├── __init__.py │ │ └── test_dependencies.py │ ├── old_tests │ │ ├── __init__.py │ │ └── test_old_tests.py │ ├── settings │ │ ├── __init__.py │ │ ├── test_default_settings.py │ │ └── test_ocr_settings.py │ └── utils │ │ ├── __init__.py │ │ └── test_utils.py ├── urls.py ├── utils.py ├── version.py ├── views.py └── widgets.py ├── doc ├── Makefile ├── make.bat ├── requirements.txt ├── reuirements.txt └── source │ ├── api_documentation.rst │ ├── code-block.types.txt │ ├── commands.rst │ ├── conf.py │ ├── configuration.rst │ ├── contents.rst │ ├── creation_package.rst │ ├── deploy.rst │ ├── developer_guide.rst │ ├── developer_guide │ ├── conf.py.rst │ └── 
default_settings.py.rst │ ├── django_ocr_server.png │ ├── index.rst │ ├── installation.rst │ ├── introduction.rst │ ├── running_tests.rst │ └── usage_examples.rst ├── install_ubuntu.sh ├── install_ubuntu └── install.sh ├── makedoc.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── static └── favicon.ico ├── upload_to_pypi.sh └── usage_examples ├── curl_example.sh ├── example.png ├── perl_example.pl ├── php_example.php └── python_example.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | venv/ 3 | # migrations/ 4 | *.pyc 5 | django_ocr_server/upload/ 6 | django_ocr_server/pdf/ 7 | 8 | # some temporary files 9 | temp.py 10 | temp.html 11 | notes 12 | 13 | # 14 | dist/ 15 | __pycache__/ 16 | *.egg-info/ 17 | .eggs/ 18 | 19 | # sphinx files 20 | doc/build/html/ 21 | doc/build/doctrees/ 22 | _build 23 | _static 24 | _templates 25 | 26 | # django project files 27 | db.sqlite3 28 | ocr_server/ 29 | manage.py 30 | 31 | # deployment files 32 | uwsgi.ini 33 | 34 | # Visual Studio code 35 | .vscode/ 36 | 37 | # codecov 38 | .coverage 39 | coverage/ 40 | 41 | # nodejs 42 | node_modules/ 43 | 44 | # pytest 45 | .pytest_cache/ 46 | 47 | # mypy 48 | .mypy_cache/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | addons: 4 | apt: 5 | packages: 6 | - libpoppler-cpp-dev 7 | jobs: 8 | include: 9 | - name: "Python 3.7 on Linux" 10 | python: '3.7' 11 | install: 12 | - echo PATH=$PATH 13 | - echo HOME=$HOME 14 | - echo whoami=$(whoami) 15 | - pip install -r requirements.txt 16 | - pip install codecov 17 | - pip install pytest-cov 18 | - django-admin start project ocr_server . 
19 | - echo "INSTALLED_APPS.append('django_ocr_server')" >> ocr_server/settings.py 20 | - python manage.py makemigrations django_ocr_server 21 | - python manage.py migrate 22 | script: coverage run manage.py test 23 | after_success: 24 | - codecov -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright Django-ocr-server shmakovpn 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune ocr_server 2 | recursive-include django_ocr_server/static *.png *.svg *.css *.js 3 | recursive-include django_ocr_server/templates *.html 4 | recursive-include django_ocr_server/tests *.pdf *.txt *.jpg *.png 5 | include LICENSE.txt -------------------------------------------------------------------------------- /auto_makedoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs `sphinx build` when files in ./doc/source are changed, uses inotifywait for watching file changes 4 | # Author: shmakovpn 5 | # Date: 2020-01-07 6 | 7 | # Requirements: 8 | # Ubuntu: inotify-tools 9 | # Centos 8: inotify-tools (from epel repository) 10 | 11 | SCRIPT_DIR="$(dirname $0)" 12 | DOCS_SOURCE_DIR="${SCRIPT_DIR}/doc/source" 13 | 14 | # Checking that a VIRTUALENV is activated, exit otherwise 15 | if ! test "${VIRTUAL_ENV}" ; then 16 | echo "A virtualenv is not activated. \$VIRTUAL_ENV is null" 17 | exit 1 18 | fi 19 | 20 | # Checking that *inotifywait* is installed 21 | if ! which inotifywait > /dev/null 2>&1 ; then 22 | echo "*inotifywait* is not installed. Install package *inotify-tools*." 23 | exit 1 24 | fi 25 | 26 | # `inotifywait -r -m -e modify -e move -e create -e delete watching_dir` generates multiple events 27 | # when a file is saved using vim or something else 28 | # but we want to run `sphinx build` only once when a file is changed. 29 | # Thus `while true` is used. 
"""
django_ocr_server/__init__.py
+++++++++++++++++++++++++++++

| Author: shmakovpn
| Date: 2019-10-22,2019-04-15,2019-10-11,2019-12-03,2021-01-22
"""
import pathlib
from django_ocr_server.conf import ocr_settings

# Create the storage folders at import time if they do not exist yet:
# one for uploaded files and one for recognized PDFs.
# ``exist_ok=True`` makes the call a no-op for a folder that is already there.
pathlib.Path(ocr_settings.OCR_FILES_UPLOAD_TO).mkdir(parents=True, exist_ok=True)
pathlib.Path(ocr_settings.OCR_PDF_UPLOAD_TO).mkdir(parents=True, exist_ok=True)
def remove_selected(modeladmin, request, queryset):
    """
    Admin action: calls delete() on every selected object
    :param modeladmin: a modeladmin instance, asked for the delete permission
    :param request: a current request
    :param queryset: a selected models query set
    :return: None
    """
    if modeladmin.has_delete_permission(request):
        for selected in queryset:
            selected.delete()
        return
    raise PermissionDenied


remove_selected.short_description = "Delete selected objects"


def remove_file_selected(modeladmin, request, queryset):
    """
    Admin action: calls remove_file() on every selected object
    :param modeladmin: a modeladmin instance, asked for the change permission
    :param request: a current request
    :param queryset: a selected models query set
    :return: None
    """
    if modeladmin.has_change_permission(request):
        for selected in queryset:
            selected.remove_file()
        return
    raise PermissionDenied


remove_file_selected.short_description = "Delete files from selected objects"


def remove_pdf_selected(modeladmin, request, queryset):
    """
    Admin action: calls remove_pdf() on every selected object
    :param modeladmin: a modeladmin instance, asked for the change permission
    :param request: a current request
    :param queryset: a selected models query set
    :return: None
    """
    if modeladmin.has_change_permission(request):
        for selected in queryset:
            selected.remove_pdf()
        return
    raise PermissionDenied


remove_pdf_selected.short_description = "Delete PDFs from selected objects"


def create_pdf_selected(modeladmin, request, queryset):
    """
    Admin action: calls create_pdf(modeladmin, request) on every selected object
    :param modeladmin: a modeladmin instance, asked for the change permission
        and passed through to create_pdf
    :param request: a current request, passed through to create_pdf
    :param queryset: a selected models query set
    :return: None
    """
    if modeladmin.has_change_permission(request):
        for selected in queryset:
            selected.create_pdf(modeladmin, request)
        return
    raise PermissionDenied


create_pdf_selected.short_description = "Create PDFs in selected objects"
81 | 82 | def pdffield_to_listdisplay(obj): 83 | """ 84 | Formats pdffield to show in the listdisplay of admin interface 85 | :param obj: a model instance 86 | :return: pdffield html 87 | """ 88 | out = '' 89 | if not obj.ocred_pdf: 90 | out = '' 91 | elif 'store_pdf_disabled' in obj.ocred_pdf.name: 92 | out = 'NO PDF' 93 | elif 'pdf_removed' in obj.ocred_pdf.name: 94 | out = 'REMOVED' 95 | else: 96 | filename = os.path.basename(obj.ocred_pdf.name) 97 | return format_html('{}Remove', 98 | reverse(f"{__package__}:download", kwargs={ 99 | 'download_target': 'pdf', 'filename': filename 100 | }), 101 | filename, 102 | reverse('admin:ocredfile-ocred_pdf-remove', args=[obj.pk]) 103 | ) 104 | if obj.can_create_pdf: 105 | return format_html('{}Create', 106 | out, 107 | reverse('admin:ocredfile-ocred_pdf-create', args=[obj.pk])) 108 | return out 109 | 110 | 111 | pdffield_to_listdisplay.short_description = "PDF" 112 | 113 | 114 | def pdfinfo_to_listdisplay(obj): 115 | html = '' 116 | if obj.pdf_num_pages: 117 | html += '
nPages: '+str(obj.pdf_num_pages)+'
' 118 | if obj.pdf_author: 119 | html += '
Author: '+str(obj.pdf_author)+'
' 120 | if obj.pdf_creation_date: 121 | html += '
Created: '+str(obj.pdf_creation_date)+'
' 122 | if obj.pdf_creator: 123 | html += '
Creator: '+str(obj.pdf_creator)+'
' 124 | if obj.pdf_mod_date: 125 | html += '
Modified: '+str(obj.pdf_mod_date)+'
' 126 | if obj.pdf_producer: 127 | html += '
Producer: '+str(obj.pdf_producer)+'
' 128 | if obj.pdf_title: 129 | html += '
Title: '+str(obj.pdf_title)+'
' 130 | return format_html(html) 131 | 132 | 133 | # Register your models here. 134 | class OCRedFileAdmin(admin.ModelAdmin): 135 | """ 136 | The ModelAdmin for the model OCRedFile 137 | """ 138 | actions = [remove_selected, remove_file_selected, remove_pdf_selected, create_pdf_selected] 139 | list_display = ('md5', 'uploaded', 'ocred', filefield_to_listdisplay, pdffield_to_listdisplay, pdfinfo_to_listdisplay, 'ocred_pdf_md5') 140 | 141 | def get_actions(self, request): 142 | """ 143 | Remove 'delete_selected' from actions. Returns the list of available actions 2019-04-22 144 | :param request: not used 145 | :return: the list of available actions 146 | """ 147 | actions = super(OCRedFileAdmin, self).get_actions(request) 148 | if 'delete_selected' in actions: 149 | del actions['delete_selected'] 150 | return actions 151 | 152 | def get_fieldsets(self, request, obj=None): 153 | """ 154 | This function returns fieldsets for the OCRFileForm, 155 | excludes 'uploaded' and 'ocred' fields from OCRedFileAdmin.fieldsets 2019-03-18 156 | :param request: does not use 157 | :param obj: the current instance of the OCRedFile model 158 | :return: fieldsets for the OCRFileForm, 159 | """ 160 | if not obj: 161 | return ( 162 | (None, { 163 | 'fields': ('file', 'file_type',) 164 | }), 165 | ) 166 | return ( 167 | (None, { 168 | 'fields': ('file', 'ocred_pdf',) 169 | }), 170 | (None, { 171 | 'fields': ('file_type', ) 172 | }), 173 | (None, { 174 | 'fields': (('md5', 'ocred_pdf_md5'), ) 175 | }), 176 | (None, { 177 | 'fields': (('uploaded', 'ocred',), 'pdf_info', ) 178 | }), 179 | (None, { 180 | 'fields': ('text', ) 181 | }) 182 | ) 183 | 184 | def get_readonly_fields(self, request, obj=None): 185 | """ 186 | This function tuple of readonly fields, 187 | excludes 'uploaded' and 'ocred' from readonly fields when adding 2019-03-18 188 | :param request: a current request 189 | :param obj: a model instance 190 | :return: a tuple of 'readonly_fields' 191 | """ 192 | if not obj: 193 | 
def process_file_remove(self, request, ocredfile_id, *args, **kwargs):
    """
    Removes the uploaded file of an OCRedFile, then redirects to the changelist
    :param request: a current request, used to show a message to the user
    :param ocredfile_id: a primary key of a model instance
    :param args: not used
    :param kwargs: not used
    :return: HttpResponseRedirect to the OCRedFile changelist
    """
    try:
        ocredfile = OCRedFile.objects.get(pk=ocredfile_id)
        # Remember the name before removal, the same way process_pdf_remove does,
        # so the message shows the file that was removed and not the
        # name the field has after remove_file().
        filename = ocredfile.file.name
        ocredfile.remove_file()
        self.message_user(request, 'File removed "'+filename+'" ')
    except Exception as e:
        # Best effort: any failure is reported to the user instead of raising
        self.message_user(request, 'An error has occurred: '+str(e))
    return HttpResponseRedirect(reverse('admin:{package}_ocredfile_changelist'.format(package=__package__)))


def process_pdf_remove(self, request, ocredfile_id, *args, **kwargs):
    """
    Removes the ocred_pdf of an OCRedFile, then redirects to the changelist
    :param request: a current request, used to show a message to the user
    :param ocredfile_id: a primary key of a model instance
    :param args: not used
    :param kwargs: not used
    :return: HttpResponseRedirect to the OCRedFile changelist
    """
    try:
        ocredfile = OCRedFile.objects.get(pk=ocredfile_id)
        filename = ocredfile.ocred_pdf.name  # taken before removal
        ocredfile.remove_pdf()
        self.message_user(request, 'PDF removed "'+filename+'" ')
    except Exception as e:
        # Best effort: any failure is reported to the user instead of raising
        self.message_user(request, 'An error has occurred: '+str(e))
    return HttpResponseRedirect(reverse('admin:{package}_ocredfile_changelist'.format(package=__package__)))


def process_pdf_create(self, request, ocredfile_id, *args, **kwargs):
    """
    Creates a searchable pdf if it does not exist and creation is possible
    :param request: a current request, passed to create_pdf and used for messages
    :param ocredfile_id: a primary key of a model instance
    :param args: not used
    :param kwargs: not used
    :return: HttpResponseRedirect to the OCRedFile changelist
    """
    try:
        ocredfile = OCRedFile.objects.get(pk=ocredfile_id)
        ocredfile.create_pdf(self, request)
    except Exception as e:
        # Best effort: any failure is reported to the user instead of raising.
        # The message now has the same 'occurred: ' spacing as the two methods above.
        self.message_user(request, 'An error has occurred: '+str(e))
    return HttpResponseRedirect(reverse('admin:{package}_ocredfile_changelist'.format(package=__package__)))
name='ocredfile-file-remove', 242 | ), 243 | path( 244 | '/pdf_remove/', 245 | self.admin_site.admin_view(self.process_pdf_remove), 246 | name='ocredfile-ocred_pdf-remove', 247 | ), 248 | path( 249 | '/pdf_create/', 250 | self.admin_site.admin_view(self.process_pdf_create), 251 | name='ocredfile-ocred_pdf-create', 252 | ), 253 | ] 254 | return custom_urls+urls 255 | 256 | def response_change(self, request, obj): 257 | if '_removefile' in request.POST: 258 | self.message_user(request, 'File removed') 259 | obj.remove_file() # remove file from filesystem and rename filefield to 'file_removed' 260 | return HttpResponseRedirect('.') 261 | if '_removepdf' in request.POST: 262 | self.message_user(request, 'PDF removed') 263 | obj.remove_pdf() # remove ocred_pdf from filesystem and rename filefield to 'pdf_removed' 264 | return HttpResponseRedirect('.') 265 | if '_createpdf' in request.POST: 266 | obj.create_pdf(self, request) # create ocred pdf (if it is possible) 267 | return HttpResponseRedirect('.') 268 | return super(OCRedFileAdmin, self).response_change(request, obj) 269 | 270 | def add_view(self, request, form_url='', extra_context=None): 271 | self.form = OCRedFileAddForm 272 | extra_context = extra_context or {} 273 | extra_context['show_save_and_continue'] = False 274 | extra_context['ocr_show_save_and_add_another'] = True 275 | extra_context['ocr_show_save_and_view'] = True 276 | return super(OCRedFileAdmin, self).add_view(request, form_url, extra_context) 277 | 278 | def change_view(self, request, object_id, form_url='', extra_context=None): 279 | self.form = OCRedFileViewForm 280 | extra_context = extra_context or {} 281 | extra_context['show_save'] = False 282 | extra_context['show_save_and_continue'] = False 283 | extra_context['ocr_show_save_and_add_another'] = False 284 | return super(OCRedFileAdmin, self).change_view(request, object_id, form_url, extra_context) 285 | 286 | def save_model(self, request, obj, form, change): 287 | return 
super(OCRedFileAdmin, self).save_model(request, obj, form, change) 288 | 289 | def delete_model(self, request, obj): 290 | return super(OCRedFileAdmin, self).delete_model(request, obj) 291 | 292 | 293 | admin.site.register(OCRedFile, OCRedFileAdmin) -------------------------------------------------------------------------------- /django_ocr_server/apiviews.py: -------------------------------------------------------------------------------- 1 | """ 2 | django_ocr_server/apiviews.py 3 | This file contains views for OCR Server based on Django REST API 4 | """ 5 | __author__ = "shmakovpn " 6 | __date__ = "2019-03-19" 7 | 8 | import os 9 | import mimetypes 10 | from django.urls import reverse_lazy 11 | 12 | from rest_framework import generics 13 | from rest_framework.views import APIView 14 | from rest_framework import status 15 | from rest_framework.response import Response 16 | from rest_framework.parsers import MultiPartParser 17 | from rest_framework.authentication import SessionAuthentication, TokenAuthentication 18 | from rest_framework.permissions import IsAuthenticated 19 | from django.utils.http import http_date 20 | from django.views.static import was_modified_since 21 | from django.http import HttpResponse, Http404, HttpResponseNotModified 22 | from django.db.models import Q 23 | from .models import * 24 | from .serializers import * 25 | from .exceptions import * 26 | from django.conf import settings 27 | 28 | 29 | class OcrApiView(APIView): 30 | """ 31 | Parent view class for all OCR Server API views 2019-04-11 32 | """ 33 | authentication_classes = ( 34 | TokenAuthentication, 35 | SessionAuthentication, 36 | ) 37 | permission_classes = (IsAuthenticated,) 38 | 39 | 40 | class UploadFile(OcrApiView): 41 | """ 42 | Uploads the 'file' to OCR Server, 43 | If 'file' already was uploaded to OCR Server, 44 | the view returns the information of the uploaded file 45 | and status_code 200. 
46 | Unless OCR Server processing 'file' and returns 47 | information about the new OCRedFile 2019-03-19. 48 | """ 49 | 50 | parser_classes = (MultiPartParser,) 51 | 52 | def post(self, request,): 53 | """ 54 | Uploads the 'file' to OCR Server, \ 55 | If 'file' already was uploaded to OCR Server, \ 56 | the view returns the information of the uploaded file \ 57 | and status_code 200. \ 58 | Unless OCR Server processing 'file' and returns \ 59 | information about the new OCRedFile 2019-03-19. 60 | :param request: rest framework request 61 | :return: rest framework response 62 | """ 63 | if 'file' not in request.FILES: 64 | return Response({ 65 | 'error': True, 66 | 'message': 'A file does not present', 67 | }, status=status.HTTP_400_BAD_REQUEST) 68 | ocred_file_serializer = OCRedFileSerializer(data={'file': request.FILES['file']}) 69 | try: 70 | ocred_file_serializer.is_valid(raise_exception=True) 71 | except (Md5DuplicationError, Md5PdfDuplicationError) as e: 72 | ocred_file = OCRedFile.objects.get(Q(md5=e.md5) | Q(ocred_pdf_md5=e.md5)) 73 | print(f"OCRed file already exists '{e.md5}'") 74 | ocred_file_serializer = OCRedFileSerializer(ocred_file, many=False) 75 | data = ocred_file_serializer.data 76 | return Response({ 77 | 'error': False, 78 | 'created': False, 79 | 'code': e.code, 80 | 'data': data 81 | }, status.HTTP_200_OK) 82 | except FileTypeError as e: 83 | return Response({ 84 | 'error': True, 85 | 'code': e.code, 86 | 'message': e.message, 87 | 'file_type': e.file_type, 88 | }, status=status.HTTP_400_BAD_REQUEST) 89 | ocred_file_serializer.save() 90 | data = ocred_file_serializer.data 91 | return Response({ 92 | 'error': False, 93 | 'created': True, 94 | 'data': data 95 | }, status=status.HTTP_201_CREATED) 96 | 97 | 98 | class OCRedFileList(OcrApiView): 99 | """ 100 | Returns list of OCRedFile instances in JSON format 2019-03-20 101 | """ 102 | def get(self, request, *args, **kwargs): 103 | """ 104 | Returns a list of OCRedFile instances in JSON format 
class Md5(OcrApiView):
    """
    Looks up an already uploaded file by md5 2019-03-24

    The given value is matched against both the 'md5' and the 'ocred_pdf_md5'
    fields of OCRedFile.
    """
    # NOTE(review): the default `md5=md5` binds whatever module-level name `md5`
    # is in scope when the class body runs (presumably brought in by one of the
    # star imports) — it is not a usable md5 string. Confirm the URL conf always
    # passes `md5`, then consider removing the default.
    def get(self, request, md5=md5):
        """
        Returns information about an already uploaded file,
        or a response saying that no file with md5=md5 or ocred_pdf_md5=md5 was found 2019-03-24
        :param request: rest framework request
        :param md5: the md5 to search for, matched against 'md5' and 'ocred_pdf_md5'
        :return: rest framework response, status 200 with the serialized OCRedFile
            when found, status 204 otherwise
        """
        try:
            ocred_file = OCRedFile.objects.get(Q(md5=md5) | Q(ocred_pdf_md5=md5))
        except OCRedFile.DoesNotExist:
            # NOTE(review): a 204 response normally carries no body — confirm
            # whether clients ever see this payload
            return Response({
                'error': False,
                'exists': False,
            }, status=status.HTTP_204_NO_CONTENT)
        data = OCRedFileSerializer(ocred_file).data
        return Response({
            'error': False,
            'exists': True,
            'data': data,
        }, status=status.HTTP_200_OK)
class RemoveAll(OcrApiView):
    """
    API view that deletes every OCRedFile 2019-03-24
    """
    def delete(self, request):
        """
        Calls delete() on each OCRedFile and reports how many there were 2019-03-24
        :param request: rest framework request
        :return: rest framework response with the number of OCRedFiles counted before deletion
        """
        everything = OCRedFile.objects.all()
        total = everything.count()  # counted before any instance is deleted
        for instance in everything:
            instance.delete()
        payload = {
            'error': False,
            'removed': True,
            'count': total,
        }
        return Response(payload, status=status.HTTP_200_OK)
| 'exists': True, 213 | 'can_remove_file': False, 214 | 'removed': False, 215 | }, status=status.HTTP_204_NO_CONTENT) 216 | ocred_file.remove_file() 217 | return Response({ 218 | 'error': False, 219 | 'exists': True, 220 | 'removed': True, 221 | }, status=status.HTTP_200_OK) 222 | 223 | 224 | class RemoveFileAll(OcrApiView): 225 | """ 226 | Removes files from all of instances of OCRedFile 2019-03-25 227 | """ 228 | def delete(self, request, ): 229 | """ 230 | Removes files from all of instances of OCRedFile 2019-03-25 231 | :param request: rest framework request 232 | :return: rest framework response 233 | """ 234 | old_counter = OCRedFile.Counters.num_removed_files 235 | ocred_files = OCRedFile.objects.all() 236 | for ocred_file in ocred_files: 237 | ocred_file.remove_file() 238 | return Response({ 239 | 'error': False, 240 | 'removed': True, 241 | 'count': OCRedFile.Counters.num_removed_files-old_counter, 242 | }, status=status.HTTP_200_OK) 243 | 244 | 245 | class RemovePdfMd5(OcrApiView): 246 | """ 247 | Removes the ocred_pdf from the instance of OCRedFile which has md5=md5 or ocred_pdf_md5=md5 2019-03-25 248 | """ 249 | def delete(self, request, md5=md5): 250 | """ 251 | Removes the ocred_pdf from the instance of OCRedFile which has md5=md5 or ocred_pdf_md5=md5 2019-03-25 252 | :param request: rest framework request 253 | :param md5: The md5 of OCRedFile whose ocred_pdf will be deleted 254 | :return: rest framework response 255 | """ 256 | try: 257 | ocred_file = OCRedFile.objects.get(Q(md5=md5) | Q(ocred_pdf_md5=md5)) 258 | except OCRedFile.DoesNotExist: 259 | return Response({ 260 | 'error': False, 261 | 'exists': False, 262 | 'removed': False, 263 | }, status=status.HTTP_204_NO_CONTENT) 264 | if not ocred_file.can_remove_pdf: 265 | return Response({ 266 | 'error': False, 267 | 'exists': True, 268 | 'can_remove_pdf': False, 269 | 'removed': False, 270 | }, status=status.HTTP_204_NO_CONTENT) 271 | ocred_file.remove_pdf() 272 | return Response({ 273 | 'error': 
False, 274 | 'exists': True, 275 | 'removed': True, 276 | }, status=status.HTTP_200_OK) 277 | 278 | 279 | class RemovePdfAll(OcrApiView): 280 | """ 281 | Removes ocred_pdfs from all of instances of OCRedFile 2019-03-25 282 | """ 283 | def delete(self, request, ): 284 | """ 285 | Removes ocred_pdfs from all of instances of OCRedFile 2019-03-25 286 | :param request: rest framework request 287 | :return: rest framework response 288 | """ 289 | old_counter = OCRedFile.Counters.num_removed_pdf 290 | ocred_files = OCRedFile.objects.all() 291 | for ocred_file in ocred_files: 292 | ocred_file.remove_pdf() 293 | return Response({ 294 | 'error': False, 295 | 'removed': True, 296 | 'count': OCRedFile.Counters.num_removed_pdf - old_counter, 297 | }, status=status.HTTP_200_OK) 298 | 299 | 300 | class CreatePdfMd5(OcrApiView): 301 | """ 302 | Creates ocred_pdf in the instance of OCRedFile whose md5=md5 or ocred_pdf_md5=md5 if it is possible 2019-03-25 303 | """ 304 | def get(self, request, md5=md5): 305 | """ 306 | Creates ocred_pdf in the instance of OCRedFile whose md5=md5 or ocred_pdf_md5=md5 if it is possible 2019-03-25 307 | :param request: rest framework request 308 | :param md5: the md5 of the instance of OCRedFile whose ocred_pdf will be created 309 | :return: rest framework response 310 | """ 311 | try: 312 | ocred_file = OCRedFile.objects.get(Q(md5=md5) | Q(ocred_pdf_md5=md5)) 313 | except OCRedFile.DoesNotExist: 314 | return Response({ 315 | 'error': False, 316 | 'exists': False, 317 | 'created': False, 318 | }, status=status.HTTP_204_NO_CONTENT) 319 | if not ocred_file.can_create_pdf: 320 | return Response({ 321 | 'error': False, 322 | 'exists': True, 323 | 'can_create_pdf': False, 324 | 'created': False, 325 | }, status=status.HTTP_204_NO_CONTENT) 326 | ocred_file.create_pdf() 327 | return Response({ 328 | 'error': False, 329 | 'exists': True, 330 | 'created': True, 331 | }, status=status.HTTP_201_CREATED) 332 | 333 | 334 | class CreatePdfAll(OcrApiView): 335 | """ 
class Clean(OcrApiView):
    """
    Cleans up the folders of OCRedFile.files and OCRedFile.ocred_pdfs
    from files that are not present in OCRedFiles 2019-04-12
    """
    def get(self, request, ):
        """
        Removes 'files' and 'ocred_pdfs' that are not related to any OCRedFile 2019-04-13
        :param request: not used
        :return: rest framework response listing the removed files and PDFs with their counts
        """
        removed_files, removed_pdfs = OCRedFile.cleanup()
        return Response({
            'removed files': removed_files,
            'removed files count': len(removed_files),
            'removed ocred_pdf': removed_pdfs,
            'removed ocred_pdf count': len(removed_pdfs),
            # Misspelled key from earlier versions, kept so that existing
            # clients reading it keep working
            'remoced ocred_pdf count': len(removed_pdfs),
        })
PWD = "%s/%s" % (settings.BASE_DIR, __package__)  # directory of the django-ocr-server/ocr application
UPLOAD_DIR = "%s/upload/" % PWD
PDF_DIR = "%s/pdf/" % PWD


class DownloadView(OcrApiView):
    """
    View for downloading OCRedFile.file or OCRedFile.ocred_pdf 2019-04-09
    """
    login_url = reverse_lazy('admin:index')

    def get(self, request, download_target, filename):
        """
        Sends a stored file to the client 2019-04-09
        :param request: a current request, its 'If-Modified-Since' header is honoured
        :param download_target: 'file' to read from UPLOAD_DIR, anything else reads from PDF_DIR
        :param filename: the bare name of the file inside the chosen folder
        :return: HttpResponse with the file content, or HttpResponseNotModified
        :raises Http404: if 'filename' is not a bare file name or the file does not exist
        """
        # 'filename' comes from the URL: refuse anything carrying a directory
        # part so that the request cannot leave the storage folder.
        if os.path.basename(filename) != filename:
            raise Http404('"%s" does not exist' % filename)
        base_dir = UPLOAD_DIR if download_target == 'file' else PDF_DIR
        path = base_dir + filename
        if not os.path.isfile(path):
            raise Http404('"%s" does not exist' % path)
        stat = os.stat(path)
        content_type, encoding = mimetypes.guess_type(path)
        content_type = content_type or 'application/octet-stream'
        if not was_modified_since(request.META.get('HTTP_IF_MODIFIED_SINCE'),
                                  stat.st_mtime, stat.st_size):
            return HttpResponseNotModified(content_type=content_type)
        # Read through a context manager so the file handle is always closed
        with open(path, 'rb') as stored_file:
            response = HttpResponse(stored_file.read(), content_type=content_type)
        response['Last-Modified'] = http_date(stat.st_mtime)
        response['Content-Length'] = stat.st_size
        if encoding:
            response['Content-Encoding'] = encoding
        return response
class DjangoOcrSettings:
    """The settings manager of **django_ocr_server**

    Every property returns the value of the same-named variable from
    the Django settings object, falling back to the same-named default from
    *django_ocr_server/default_settings.py* when the variable is not set.
    """
    @staticmethod
    def _lookup(name: str):
        """Returns the Django setting *name* or its packaged default if it is not set"""
        # A single name drives both lookups, so a setting key and its default
        # cannot drift apart (three properties previously read the wrong key).
        return getattr(_s, name, getattr(_ds, name))

    @property
    def OCR_STORE_FILES(self) -> bool:
        return bool(self._lookup('OCR_STORE_FILES'))

    @property
    def OCR_FILE_PREVIEW(self) -> bool:
        return bool(self._lookup('OCR_FILE_PREVIEW'))

    @property
    def OCR_TESSERACT_LANG(self) -> str:
        return str(self._lookup('OCR_TESSERACT_LANG'))

    @property
    def OCR_STORE_PDF(self) -> bool:
        return bool(self._lookup('OCR_STORE_PDF'))

    @property
    def OCR_STORE_FILES_DISABLED_LABEL(self) -> str:
        # Fixed: used to read the non-matching key 'OCR_STORE_FILES_LABEL'
        return str(self._lookup('OCR_STORE_FILES_DISABLED_LABEL'))

    @property
    def OCR_STORE_PDF_DISABLED_LABEL(self) -> str:
        # Fixed: used to return the value of OCR_FILE_REMOVED_LABEL
        return str(self._lookup('OCR_STORE_PDF_DISABLED_LABEL'))

    @property
    def OCR_FILE_REMOVED_LABEL(self) -> str:
        return str(self._lookup('OCR_FILE_REMOVED_LABEL'))

    @property
    def OCR_PDF_REMOVED_LABEL(self) -> str:
        return str(self._lookup('OCR_PDF_REMOVED_LABEL'))

    @property
    def OCR_ALLOWED_FILE_TYPES(self) -> List[str]:
        return list(self._lookup('OCR_ALLOWED_FILE_TYPES'))

    @property
    def OCR_FILES_UPLOAD_TO(self) -> str:
        return str(self._lookup('OCR_FILES_UPLOAD_TO'))

    @property
    def OCR_PDF_UPLOAD_TO(self) -> str:
        return str(self._lookup('OCR_PDF_UPLOAD_TO'))

    @property
    def OCR_FILES_TTL(self) -> timedelta:
        return self._lookup('OCR_FILES_TTL')

    @property
    def OCR_PDF_TTL(self) -> timedelta:
        # Fixed: used to read the lowercase key 'ocr_pdf_ttl', so a configured
        # OCR_PDF_TTL was never picked up
        return self._lookup('OCR_PDF_TTL')

    @property
    def OCR_TTL(self) -> timedelta:
        return self._lookup('OCR_TTL')
uploaded files (True) or not (False), default to True 21 | OCR_FILE_PREVIEW: bool = True #: Show file preview in admin (True) or not (False), default to True 22 | OCR_TESSERACT_LANG: str = 'rus+eng' #: Sets priority of using languages, default to 'rus+eng' 23 | OCR_STORE_PDF: bool = True #: Generate and store recognized searchable PDF (True) or not (False), default to True 24 | 25 | OCR_STORE_FILES_DISABLED_LABEL: str = 'store_files_disabled' 26 | """The text of storeing uploaded files disabled label in the admin interface""" 27 | 28 | OCR_STORE_PDF_DISABLED_LABEL: str = 'store_pdf_disabled' 29 | """The text of storeing recognized PDF disabled label in the admin interface""" 30 | 31 | OCR_FILE_REMOVED_LABEL: str = 'file_removed' 32 | """The text of the label of *file removed* in the admin interface""" 33 | 34 | OCR_PDF_REMOVED_LABEL: str = 'pdf_removed' 35 | """The text of the label of *PDF removed* in the admin interface""" 36 | 37 | OCR_ALLOWED_FILE_TYPES: List[str] = [ 38 | 'application/pdf', 39 | 'image/jpeg', 40 | 'image/png', 41 | 'image/bmp', 42 | 'image/tiff', 43 | ] 44 | """The types of file allowed to uploading""" 45 | """ 46 | 2019-10-22 shmakovpn. An error was found when trying to deploy Django-OCR_Server using Apache 47 | because usage of relative paths is a wrong way when Apache mod_wsgi is using 48 | https://modwsgi.readthedocs.io/en/develop/user-guides/application-issues.html#application-working-directory 49 | """ 50 | 51 | OCR_FILES_UPLOAD_TO: str = os.path.join(settings.BASE_DIR, __package__, 52 | 'upload') 53 | """The directory for saving uploaded files""" 54 | 55 | OCR_PDF_UPLOAD_TO: str = os.path.join(settings.BASE_DIR, __package__, 'pdf') 56 | """The directory for storeing searchable PDFs""" 57 | 58 | OCR_FILES_TTL: timedelta = timedelta(0) 59 | """ 60 | When current datetime will be grater then the datetime of file uploading plus this timedelta, 61 | the uploaded file will be removed. 
class Md5DuplicationError(ValidationError):
    """
    Raised when an OCRedFile with the same md5 already exists 2019-03-21/2021-01-06

    Subclasses customise the text and the error code through MESSAGE and CODE.
    """
    CODE = 'md5_exists'
    MESSAGE = 'OCRedFile with the same md5 already exists'
    md5 = None  # the duplicated md5 sum, set per instance by the constructor

    def __init__(self, md5: str):
        """
        :param md5: the md5 sum that is already present
        """
        self.md5 = md5
        super().__init__(
            message=f"{self.MESSAGE} '{self.md5}'",
            code=self.CODE,
        )
class UnresolvedDependencyError(RuntimeError):
    """
    Raised when a program fails because one of its dependencies is unresolved 2021-01-07

    The constructor arguments stay available as the attributes
    ``message``, ``program_name`` and ``dependency_name``.
    """
    dependency_name: Optional[str] = None
    program_name: Optional[str] = None
    message: Optional[str] = None

    def __init__(self, message: str, program_name: str, dependency_name: str):
        """
        :param message: the details of the failure
        :param program_name: the name of the program that failed
        :param dependency_name: the name of the unresolved dependency
        """
        description: str = f"'{program_name}' was failed via unresolved dependency '{dependency_name}': {message}"
        super().__init__(description)
        self.dependency_name = dependency_name
        self.program_name = program_name
        self.message = message
class OCRedFileAddForm(forms.ModelForm):
    """
    The form for uploading file for OCR 2019-03-18
    """
    def clean(self):
        """
        The clean for add OCRedFile form. Checks that a md5 sum of a uploaded file does not already
        exist in the OCRedFile.md5 field or in the OCRedFile.ocred_pdf_md5 field.
        Checks that uploaded file is an image or pdf 2019-03-18

        Stores the detected content type and the md5 sum of the uploaded file
        in the cleaned data under the keys 'file_type' and 'md5'.

        :return: a cleaned data dict
        :raises ValidationError: if no file was uploaded; the OCRedFile validators are
            called with raise_exception=True and may raise as well
        """
        # Local import keeps the block self-contained; debug output goes through
        # logging instead of print() so it never pollutes the server's stdout.
        import logging
        logger = logging.getLogger(__name__)

        cleaned_data = super().clean()
        file = self.files.get('file')
        if not file:
            raise ValidationError(_('A file does not present'),
                                  code='invalid')
        cleaned_data['file_type'] = file.content_type
        OCRedFile.is_valid_file_type(file_type=cleaned_data['file_type'], raise_exception=True)
        content = file.read()
        file.seek(0)  # rewind so the upload can be read again when the model is saved
        md5_txt = md5(content)
        logger.debug('OCRedFileAddForm->clean md5=%s', md5_txt)
        OCRedFile.is_valid_ocr_md5(md5_value=md5_txt, raise_exception=True)
        cleaned_data['md5'] = md5_txt
        return cleaned_data

    class Meta:
        model = OCRedFile
        exclude = ['uploaded', 'ocred', ]
        widgets = {
            'md5': forms.HiddenInput(),
            # 'file':
            'file_type': forms.HiddenInput(),
            'ocred': forms.HiddenInput(),
            'text': forms.HiddenInput(),
            'ocred_pdf': forms.HiddenInput(),
            'ocred_pdf_md5': forms.HiddenInput(),
            'pdf_num_pages': forms.HiddenInput(),
            'pdf_info': forms.HiddenInput(),
        }
        labels = {
            'file': 'File to upload',
        }
class Command(BaseCommand):
    """
    Management command that reports what OCRedFile.cleanup() removed 2019-04-18
    """
    help = 'CleanUps folders for OCRedFile.files and OCRedFile.ocred_pdfs from files do not present in OCRedFiles'

    def handle(self, *args, **options):
        """
        Runs OCRedFile.cleanup() and prints every removed file and PDF followed by the totals 2019-04-18
        :param args: not used
        :param options: not used
        :return: None
        """
        def emit(text):
            # every line of the report is written to stdout in the SUCCESS style
            self.stdout.write(self.style.SUCCESS(text))

        # index 0 holds the removed files, index 1 the removed PDFs
        report = OCRedFile.cleanup()
        for removed_file in report[0]:
            emit('File "%s" removed' % removed_file)
        for removed_pdf in report[1]:
            emit('PDF "%s" removed' % removed_pdf)
        emit('------')
        emit('Total files removed: "%s"' % len(report[0]))
        emit('Total pdf removed: "%s"' % len(report[1]))
4 | If the user with username=$1 is already exists changes its password=$2 5 | Then get or create auth_token for a the user 6 | Returns the Token of created/updated user 7 | """ 8 | __author__ = 'shmakovpn ' 9 | __date__ = '2019-12-03' 10 | 11 | 12 | from django.core.management.base import BaseCommand 13 | from django.contrib.auth.models import User 14 | from rest_framework.authtoken.models import Token 15 | 16 | 17 | class Command(BaseCommand): 18 | """ 19 | Creates user with username=$1 and password=$2 if it does not already exist. 20 | If the user with username=$1 is already exists changes its password=$2. 21 | Then get or create auth_token for a the user. 22 | Returns the Token of created/updated user. 23 | 2019-12-03 24 | """ 25 | help = """ 26 | Creates user with username=$1 and password=$2 if it does not already exist. 27 | If the user with username=$1 is already exists changes its password=$2. 28 | Then get or create auth_token for a the user. 29 | Returns the Token of created/updated user. 30 | 2019-12-03 31 | """ 32 | 33 | def add_arguments(self, parser): 34 | """ 35 | Configures command line arguments 36 | :param parser: 37 | :return: 38 | """ 39 | parser.add_argument('username', type=str) 40 | parser.add_argument('password', type=str) 41 | 42 | def handle(self, *args, **options): 43 | """ 44 | Creates user with username=$1 and password=$2 if it does not already exist. 45 | If the user with username=$1 is already exists changes its password=$2. 46 | Then get or create auth_token for a the user. 47 | Returns the Token of created/updated user. 48 | 2019-12-03 49 | :param args: an array of command line arguments. args 50 | :param options: not used 51 | :return: None 52 | """ 53 | username = options['username'] 54 | password = options['password'] 55 | user, created = User.objects.get_or_create(username=username) 56 | if not created: 57 | self.stderr.write(f"Warning: user with name '{username}' already exists. 
class Command(BaseCommand):
    """
    Performs a default database ping test. Prints result to stdout
    2019-12-05
    """
    help = """
    Performs a default database ping test. Prints result to stdout
    2019-12-05
    """

    def add_arguments(self, parser):
        """
        Configures command line arguments
        :param parser: the argument parser of the command
        :return: None
        """
        parser.add_argument('wait_interval', nargs='?', type=int, help="Time to wait in second before ping database")

    def handle(self, *args, **options):
        """
        Performs a default database ping test. Prints result to stdout
        2019-12-05

        Terminates the process with exit code 0 if the ping succeeded and 1 otherwise.

        :param args: not used
        :param options: ['wait_interval'] optional number of seconds to wait before the ping
        :return: None
        """
        import sys  # local import: the builtin exit() is injected by `site` and is not always available

        # 'wait_interval' is optional (nargs='?'), so it is None when omitted.
        # The former `if len(options)` check was always true and led to sleep(None).
        wait_interval = options.get('wait_interval')
        if wait_interval and wait_interval > 0:
            sleep(wait_interval)
        # NOTE(review): is_usable() is called without opening the connection first -
        # confirm the configured database backend handles a not yet opened connection.
        if connection.is_usable():
            self.stdout.write(self.style.SUCCESS("database ping success"))
            sys.exit(0)
        else:
            self.stderr.write(self.style.ERROR("database ping failed"))
            sys.exit(1)
class Command(BaseCommand):
    """
    Management command that runs OCRedFile.ttl() and prints the three counters it returns 2019-04-18

    See the help text below for the removal rules.
    """
    help = """
    Removes all instances of OCRedFile whose OCRedFile.uploaded+OCR_TTL lower current datetime
    if OCR_TTL does not 0, (NOTE: if OCR_TTL<0 all instances of OCRedFile will be removed, use only for tests).
    Removes all OCRedFile.files whose OCRedFile.uploaded+OCR_FILES_TTL lower current datetime
    if OCR_FILES_TTL does not 0,
    (NOTE: if OCR_FILES_TTL<0 all OCRedFile.files will be removed, use only for tests).
    Removes all OCRedFile.ocred_pdfs whose OCRedFile.uploaded+OCR_PDF_TTL lower current datetime
    if OCR_PDF_TTL does not 0,
    (NOTE: if OCR_PDF_TTL<0 all OCRedFile.ocred_pdfs will be removed, use only for tests).
    """

    def handle(self, *args, **options):
        """
        Calls OCRedFile.ttl() and writes the number of removed models, files and PDFs
        to stdout, one line per counter 2019-04-18
        :param args: not used
        :param options: not used
        :return: None
        """
        # the templates are listed in the order of the counters returned by OCRedFile.ttl()
        templates = (
            'Total models removed: %s',
            'Total files removed: %s',
            'Total pdf removed: %s',
        )
        counters = OCRedFile.ttl()
        for position, template in enumerate(templates):
            line = template % str(counters[position])
            self.stdout.write(self.style.SUCCESS(line))
class Migration(migrations.Migration):
    # Generated schema migration; must be applied after '0001_initial'.

    dependencies = [
        ('django_ocr_server', '0001_initial'),
    ]

    operations = [
        # Re-declares OCRedFile.file with max_length=500;
        # '0001_initial' declared the field without an explicit max_length.
        migrations.AlterField(
            model_name='ocredfile',
            name='file',
            field=models.FileField(max_length=500, null=True, upload_to=django_ocr_server.models.set_ocredfile_name, verbose_name='uploaded file'),
        ),
    ]
class OCRedFileSerializer(serializers.ModelSerializer):
    """
    The OCRedFile model serializer 2019-03-18
    """
    def is_valid(self, raise_exception=False):
        """
        The OCRedFile model serializer validator 2019-03-20

        Checks the content type and the md5 sum of the uploaded file through the
        OCRedFile validators before running the regular serializer validation.

        :param raise_exception: raise an exception instead of returning False
        :return: boolean True if the data is valid
        :raises serializers.ValidationError: if raise_exception is set and the uploaded
            file or its content type is missing
        """
        import logging  # local import keeps the block self-contained
        logger = logging.getLogger(__name__)

        try:
            uploaded = self.initial_data['file']
            file_type = uploaded.content_type
        except (KeyError, AttributeError, ValueError):
            # A missing 'file' key raises KeyError and an object without content_type
            # raises AttributeError; the former `except ValueError` caught neither.
            if raise_exception:
                raise serializers.ValidationError(
                    _('OCRedFileSerializer. The "content_type" of the "file" does not exist'))
            return False
        content = uploaded.read()
        uploaded.seek(0)  # rewind so the upload can be read again when the model is saved
        if not OCRedFile.is_valid_file_type(file_type=file_type, raise_exception=raise_exception):
            return False
        md5_value = md5(content)
        if not OCRedFile.is_valid_ocr_md5(md5_value=md5_value, raise_exception=raise_exception):
            logger.debug('OCRedFileSerializer.is_valid md5=%s failed', md5_value)
            return False
        return super().is_valid(raise_exception)

    @property
    def data(self):
        """
        This function returns filtered Serializer.data without 'file' and 'ocred_pdf' fields 2019-04-11
        :return: filtered Serializer.data dictionary without 'file' and 'ocred_pdf'
        """
        data = super().data
        for hidden_field in ('file', 'ocred_pdf'):
            data.pop(hidden_field, None)
        return data

    class Meta:
        model = OCRedFile
        fields = (
            'id',
            'md5',
            'file',
            'download_file',  # url for downloading OCRedFile.file if exists
            'file_type',
            'text',
            'uploaded',
            'ocred',
            'ocred_pdf',
            'download_ocred_pdf',  # url for downloading OCRedFile.ocred_pdf if exists
            'ocred_pdf_md5',
            'pdf_num_pages',
            'pdf_author',
            'pdf_creation_date',
            'pdf_creator',
            'pdf_mod_date',
            'pdf_producer',
            'pdf_title',
            'can_create_pdf',
            'can_remove_file',
            'can_remove_pdf',
        )
        # every field except 'file' and 'ocred_pdf' is read only
        extra_kwargs = {
            'id': {'read_only': True},
            'md5': {'read_only': True},
            'download_file': {'read_only': True},
            'file_type': {'read_only': True},
            'text': {'read_only': True},
            'uploaded': {'read_only': True},
            'ocred': {'read_only': True},
            'download_ocred_pdf': {'read_only': True},
            'ocred_pdf_md5': {'read_only': True},
            'pdf_num_pages': {'read_only': True},
            'pdf_author': {'read_only': True},
            'pdf_creation_date': {'read_only': True},
            'pdf_creator': {'read_only': True},
            'pdf_mod_date': {'read_only': True},
            'pdf_producer': {'read_only': True},
            'pdf_title': {'read_only': True},
            'can_create_pdf': {'read_only': True},
            'can_remove_file': {'read_only': True},
            'can_remove_pdf': {'read_only': True},
        }
-------------------------------------------------------------------------------- /django_ocr_server/static/django_ocr_server/style.css: -------------------------------------------------------------------------------- 1 | #nav { 2 | float: right; 3 | } -------------------------------------------------------------------------------- /django_ocr_server/templates/admin/django_ocr_server/ocredfile/submit_line.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/submit_line.html" %} 2 | {% load i18n admin_urls %} 3 | {% block submit-row %} 4 | {% if show_save %}{% endif %} 5 | {% if show_delete_link %} 6 | {% url opts|admin_urlname:'delete' original.pk|admin_urlquote as delete_url %} 7 | 8 | {% endif %} 9 | {% if show_save_as_new %}{% endif %} 10 | {% if show_save_and_add_another %} 11 | {% if ocr_show_save_and_add_another %} 12 | 13 | {% endif %} 14 | {% endif %} 15 | {% if show_save_and_continue %} 16 | 17 | {% elif ocr_show_save_and_view %} 18 | 19 | {% endif %} 20 | {% if show_close %}{% trans 'Close' %}{% endif %} 21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /django_ocr_server/templates/django_ocr_server/base.html: -------------------------------------------------------------------------------- 1 | {% load staticfiles %} 2 | 3 | 4 | 5 | 6 | OCR Server | {% block title %}{% endblock %} 7 | 8 | 9 | 12 | {% block external %} 13 | 14 | {% endblock %} 15 | 16 | 17 | 20 |

{% block head %}{% endblock %}

21 | {% block content %}{% endblock %} 22 | 23 | -------------------------------------------------------------------------------- /django_ocr_server/templates/django_ocr_server/forms/widgets/file_link.html: -------------------------------------------------------------------------------- 1 | {% if widget.store_files_disabled %} 2 | Store files is disabled. 3 | {% endif %} 4 | {% if widget.file_missing %} 5 | File is missing. 6 | {% elif widget.file_removed %} 7 | File removed. 8 | {% else %} 9 | {{ widget.filename|stringformat:'s' }} 10 | 11 | {% if widget.file_preview %} 12 |
13 | {% endif %} 14 | {% endif %} 15 | -------------------------------------------------------------------------------- /django_ocr_server/templates/django_ocr_server/forms/widgets/pdf_info.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 26 | 27 |
4 | {% if pdf_info.pdf_num_pages %} 5 |
nPages: {{ pdf_info.pdf_num_pages }}
6 | {% endif %} 7 | {% if pdf_info.pdf_author %} 8 |
Author: {{ pdf_info.pdf_author }}
9 | {% endif %} 10 | {% if pdf_info.pdf_creation_date %} 11 |
Created: {{ pdf_info.pdf_creation_date }}
12 | {% endif %} 13 | {% if pdf_info.pdf_creator %} 14 |
Creator: {{ pdf_info.pdf_creator }}
15 | {% endif %} 16 | {% if pdf_info.pdf_mod_date %} 17 |
Modified: {{ pdf_info.pdf_mod_date }}
18 | {% endif %} 19 | {% if pdf_info.pdf_producer %} 20 |
Producer: {{ pdf_info.pdf_producer }}
21 | {% endif %} 22 | {% if pdf_info.pdf_title %} 23 |
Title: {{ pdf_info.pdf_title }}
24 | {% endif %} 25 |
-------------------------------------------------------------------------------- /django_ocr_server/templates/django_ocr_server/forms/widgets/pdf_link.html: -------------------------------------------------------------------------------- 1 | {% if widget.store_pdf_disabled %} 2 | Store PDF is disabled. 3 | {% endif %} 4 | {% if widget.pdf_missing %} 5 | PDF is missing. 6 | {% elif widget.pdf_removed %} 7 | PDF removed. 8 | {% elif widget.pdf_exists %} 9 | {{ widget.filename|stringformat:'s' }} 10 | 11 | {% endif %} 12 | {% if widget.create_pdf_button %} 13 | 14 | {% endif %} 15 | -------------------------------------------------------------------------------- /django_ocr_server/templates/django_ocr_server/main_page.html: -------------------------------------------------------------------------------- 1 | {% extends package|add:'/base.html' %} 2 | {% block title %}Welcome to OCR Server{% endblock %} 3 | {% block head %}Welcome to OCR Server{% endblock %} 4 | {% block home_link %}home{% endblock %} 5 | {% block content %} 6 |
STORE_FILES={{ STORE_FILES }}
7 |
STORE_PDF={{ STORE_PDF }}
8 | {% endblock %} 9 | -------------------------------------------------------------------------------- /django_ocr_server/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | django_ocr_server/tests/__init__.py 3 | +++++++++++++++++++++++++++++++++++ 4 | 5 | | Author: shmakovpn 6 | | Date: 2021-01-07 7 | """ -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/deming.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/deming.pdf -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/empty_file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/empty_file.txt -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/not_empty_file.txt: -------------------------------------------------------------------------------- 1 | content 2 | -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/not_image.txt: -------------------------------------------------------------------------------- 1 | This is not an image or a pdf 2 | -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/shmakovpn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/shmakovpn.jpg -------------------------------------------------------------------------------- 
/django_ocr_server/tests/assets/shmakovpn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/shmakovpn.pdf -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/some_dir/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/some_dir/.gitkeep -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/test_eng.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/test_eng.pdf -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/test_eng.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/test_eng.png -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/test_eng_notext.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/django_ocr_server/tests/assets/test_eng_notext.pdf -------------------------------------------------------------------------------- /django_ocr_server/tests/assets/test_rus.png: -------------------------------------------------------------------------------- 
class TestDependencies(SimpleTestCase):
    """Checks that the command line tools used by the tests are installed.

    Every test runs the tool with a harmless argument, expects exit code 0
    and an empty stderr, and (where a version is printed) checks the major
    version against the minimum this test-suite requires.
    """

    def _run(self,
             cmd: List[str],
             stdin_data: bytes = b'') -> subprocess.CompletedProcess:
        """Run *cmd* feeding it *stdin_data* and return the finished process.

        A missing executable is reported as an ordinary test failure instead
        of an unhandled ``FileNotFoundError``.
        """
        try:
            return subprocess.run(cmd,
                                  input=stdin_data,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        except FileNotFoundError as e:
            self.fail(f'Testing the "{cmd[0]}" is exists was failed: {e}')

    def _assert_clean_exit(self,
                           result: subprocess.CompletedProcess) -> None:
        """Assert that the process exited with 0 and wrote nothing to stderr."""
        self.assertEqual(result.returncode, 0)
        self.assertEqual(result.stderr.decode(), '')

    def _assert_major_version(self, pattern: str, group: int, output: str,
                              minimum: int) -> None:
        """Extract the major version from *output* and compare it to *minimum*.

        :param pattern: regular expression locating the version number
        :param group: index of the capture group holding the major version
        :param output: text printed by the tool
        :param minimum: lowest acceptable major version
        """
        match: Optional[Match] = re.search(pattern, output)
        # Fail with a readable message rather than AttributeError on None.
        self.assertIsNotNone(
            match, f'Could not find a version number in: {output!r}')
        self.assertGreaterEqual(int(match.group(group)), minimum)

    def test_echo(self):
        """The testing that the *echo* command exists"""
        result = self._run(['echo', 'hello'])
        self.assertEqual(result.returncode, 0)
        self.assertEqual(result.stdout.decode(), 'hello\n')
        self.assertEqual(result.stderr.decode(), '')

    def test_cat(self):
        """The testing the *cat* command exists"""
        result = self._run(['cat'], stdin_data='hello'.encode())
        self.assertEqual(result.returncode, 0)
        self.assertEqual(result.stdout.decode(), 'hello')
        self.assertEqual(result.stderr.decode(), '')

    def test_tesseract(self):
        """The testing that the *tesseract* command exists"""
        result = self._run(['tesseract', '--version'])
        self._assert_clean_exit(result)
        self._assert_major_version(r'^(tesseract)\s(\d+)', 2,
                                   result.stdout.decode(), 4)

    def test_ocrmypdf(self):
        """The testing that the *ocrmypdf* command exists"""
        result = self._run(['ocrmypdf', '--version'])
        self._assert_clean_exit(result)
        self._assert_major_version(r'^(\d+)', 1, result.stdout.decode(), 11)

    def test_ghostscript(self):
        """The testing that the *gs* command exists"""
        result = self._run(['gs', '-version'])
        self._assert_clean_exit(result)
        self._assert_major_version(r'(Ghostscript)\s(\d+)', 2,
                                   result.stdout.decode(), 9)
6 | 7 | | Author: shmakovpn 8 | | Date: 2021-01-07 9 | """ 10 | from datetime import timedelta 11 | import os 12 | from typing import List 13 | from django.test import SimpleTestCase 14 | import django_ocr_server.default_settings as _ds 15 | from django.conf import settings 16 | 17 | 18 | class TestDefaultSettings(SimpleTestCase): 19 | def test_dstore_files(self) -> None: 20 | self.assertTrue(_ds.OCR_STORE_FILES) 21 | 22 | def test_file_preview(self) -> None: 23 | self.assertTrue(_ds.OCR_FILE_PREVIEW) 24 | 25 | def test_tesseract_lang(self) -> None: 26 | self.assertEqual(_ds.OCR_TESSERACT_LANG, 'rus+eng') 27 | 28 | def test_dstore_pdf(self) -> None: 29 | self.assertTrue(_ds.OCR_STORE_PDF) 30 | 31 | def test_dstore_files_disabled_label(self) -> None: 32 | self.assertEqual(_ds.OCR_STORE_FILES_DISABLED_LABEL, 33 | 'store_files_disabled') 34 | 35 | def test_dstore_pdf_disabled_label(self) -> None: 36 | self.assertEqual(_ds.OCR_STORE_PDF_DISABLED_LABEL, 37 | 'store_pdf_disabled') 38 | 39 | def test_file_removed_label(self) -> None: 40 | self.assertEqual(_ds.OCR_FILE_REMOVED_LABEL, 'file_removed') 41 | 42 | def test_pdf_removed_label(self) -> None: 43 | self.assertEqual(_ds.OCR_PDF_REMOVED_LABEL, 'pdf_removed') 44 | 45 | def test_allowed_file_types(self) -> None: 46 | self.assertEqual(len(_ds.OCR_ALLOWED_FILE_TYPES), 5) 47 | self.assertIn('application/pdf', _ds.OCR_ALLOWED_FILE_TYPES) 48 | self.assertIn('image/jpeg', _ds.OCR_ALLOWED_FILE_TYPES) 49 | self.assertIn('image/png', _ds.OCR_ALLOWED_FILE_TYPES) 50 | self.assertIn('image/bmp', _ds.OCR_ALLOWED_FILE_TYPES) 51 | self.assertIn('image/tiff', _ds.OCR_ALLOWED_FILE_TYPES) 52 | 53 | def test_files_upload_to(self) -> None: 54 | files_upload_to: str = os.path.join(settings.BASE_DIR, 55 | 'django_ocr_server', 'upload') 56 | self.assertEqual(_ds.OCR_FILES_UPLOAD_TO, files_upload_to) 57 | 58 | def test_pdf_upload_to(self) -> None: 59 | pdf_upload_to: str = os.path.join(settings.BASE_DIR, 60 | 'django_ocr_server', 'pdf') 61 | 
class TestOcrSettings(SimpleTestCase):
    """Tests for *ocr_settings* from django_ocr_server/conf.py."""

    @override_settings(OCR_STORE_FILES=None)
    def test_get_store_files__not_set(self) -> None:
        """Testing get_store_files when no OCR_STORE_FILES"""
        # Smoke check only: reading a setting through ocr_settings must not
        # raise while OCR_STORE_FILES is overridden with None.
        _ = ocr_settings.OCR_FILE_PREVIEW
        # NOTE: the former reveal_type() calls were removed. reveal_type is
        # only understood by static type checkers and raised NameError when
        # the test was actually executed.
        # TODO: restore the real assertion once the getter API is settled:
        # del settings.OCR_STORE_FILES
        # self.assertEqual(_u.get_store_files(), _s.STORE_FILES)
self.assertEqual(_u.get_store_files(), _s.STORE_FILES) 33 | 34 | def hello(self: TestOcrSettings): 35 | pass 36 | 37 | 38 | m: Callable[..., None] = hello 39 | 40 | 41 | # @override_settings(OCR_STORE_FILES=True) 42 | # def test_get_store_files__true(self): 43 | # """Testing get_store_files() when OCR_STORE_FILES is True""" 44 | # self.assertEqual(_u.get_store_files(), True) 45 | 46 | # @override_settings(OCR_STORE_FILES=False) 47 | # def test_get_store_files__false(self): 48 | # """Testing get_store_files() when OCR_STORE_FILES is False""" 49 | # self.assertEqual(_u.get_store_files(), False) 50 | 51 | # @override_settings(OCR_FILE_PREVIEW=None) 52 | # def test_get_file_preview__not_set(self): 53 | # """Testing get_file_preview() when no OCR_FILE_PREVIEW""" 54 | # del settings.OCR_FILE_PREVIEW 55 | # self.assertEqual(_u.get_file_preview(), _s.FILE_PREVIEW) 56 | 57 | # @override_settings(OCR_FILE_PREVIEW=True) 58 | # def test_get_file_preview__true(self): 59 | # """Testing get_file_preview() when OCR_FILE_PREVIEW is True""" 60 | # self.assertEqual(_u.get_file_preview(), True) 61 | 62 | # @override_settings(OCR_FILE_PREVIEW=False) 63 | # def test_get_file_preview__false(self): 64 | # """Testing get_file_preview() when OCR_FILE_PREVIEW is False""" 65 | # self.assertEqual(_u.get_file_preview(), False) 66 | 67 | # @override_settings(OCR_TESSERACT_LANG=None) 68 | # def test_get_tesseract_lang__not_set(self): 69 | # """Testing get_tesseract_lang() when no OCR_TESSERACT_LANG""" 70 | # del settings.OCR_TESSERACT_LANG 71 | # self.assertEqual(_u.get_tesseract_lang(), _s.TESSERACT_LANG) 72 | 73 | # @override_settings(OCR_TESSERACT_LANG='new+lang') 74 | # def test_get_tesseract_lang__overrided(self): 75 | # """Testing get_tesseract when OCR_TESSERACT_LANG is set""" 76 | # self.assertEqual(_u.get_tesseract_lang(), 'new+lang') 77 | 78 | # @override_settings(OCR_STORE_PDF=None) 79 | # def test_get_store_pdf__not_set(self): 80 | # """Testing get_store_pdf() when no OCR_STORE_PDF""" 
81 | # del settings.OCR_STORE_PDF 82 | # self.assertEqual(_u.get_store_pdf(), _s.STORE_PDF) 83 | 84 | # @override_settings(OCR_STORE_PDF=True) 85 | # def test_get_store_pdf__true(self): 86 | # """Testing get_store_pdf() when OCR_STORE_PDF is True""" 87 | # self.assertEqual(_u.get_store_pdf(), True) 88 | 89 | # @override_settings(OCR_STORE_PDF=False) 90 | # def test_get_store_pdf__false(self): 91 | # """Testing get_store_pdf() when OCR_STORE_PDF is False""" 92 | # self.assertEqual(_u.get_store_pdf(), False) 93 | 94 | # @override_settings(OCR_FILES_UPLOAD_TO=None) 95 | # def test_get_ocr_files_upload_to__not_set(self): 96 | # """Testing get_ocr_files_upload_to() when no OCR_FILES_UPLOAD_TO""" 97 | # del settings.OCR_FILES_UPLOAD_TO 98 | # self.assertEqual(_u.get_files_upload_to(), _s.FILES_UPLOAD_TO) 99 | 100 | # @override_settings(OCR_FILES_UPLOAD_TO='upload+to') 101 | # def test_get_ocr_files_upload_to__overrided(self): 102 | # """Testing get_ocr_file_upload_to() when is OCR_FILES_UPLOAD_TO is set""" 103 | # self.assertEqual(_u.get_files_upload_to(), 'upload+to') 104 | 105 | # @override_settings(OCR_PDF_UPLOAD_TO=None) 106 | # def test_get_ocr_pdf_upload_to__not_set(self): 107 | # """Testing get_pdf_upload_to() when no OCR_PDF_UPLOAD_TO""" 108 | # del settings.OCR_PDF_UPLOAD_TO 109 | # self.assertEqual(_u.get_pdf_upload_to(), _s.PDF_UPLOAD_TO) 110 | 111 | # @override_settings(OCR_PDF_UPLOAD_TO='pdf+to') 112 | # def test_get_ocr_pdf_upload_to__overrided(self): 113 | # """Testing get_pdf_upload_to() when OCR_PDF_UPLOAD_TO is set""" 114 | # self.assertEqual(_u.get_pdf_upload_to(), 'pdf+to') 115 | 116 | # @override_settings(OCR_FILES_TTL=None) 117 | # def test_ocr_files_ttl__not_set(self): 118 | # """Testing get_ocr_files_ttl() when no OCR_FILES_TTL""" 119 | # del settings.OCR_FILES_TTL 120 | # self.assertEqual(_u.get_ocr_files_ttl(), _s.FILES_TTL) 121 | 122 | # @override_settings(OCR_FILES_TTL=timedelta(days=36)) 123 | # def test_ocr_files_ttl__overrided(self): 124 | 
# """Testing get_ocr_files_ttl() when OCR_FILES_TTL is set""" 125 | # self.assertEqual(_u.get_ocr_files_ttl(), timedelta(days=36)) 126 | 127 | # @override_settings(OCR_PDF_TTL=None) 128 | # def test_ocr_pdf_ttl__not_set(self): 129 | # """Testing get_pdf_ttl() when no OCR_PDF_TTL""" 130 | # del settings.OCR_PDF_TTL 131 | # self.assertEqual(_u.get_ocr_pdf_ttl(), _s.PDF_TTL) 132 | 133 | # @override_settings(OCR_PDF_TTL=timedelta(days=36)) 134 | # def test_ocr_pdf_ttl__overrided(self): 135 | # """Testing get_ocr_pdf_ttl() when OCR_PDF_TTL is set""" 136 | # self.assertEqual(_u.get_ocr_pdf_ttl(), timedelta(days=36)) 137 | 138 | # @override_settings(OCR_TTL=None) 139 | # def test_ocr_ttl__not_set(self): 140 | # """Testing get_ocr_ttl() when no OCR_TTL""" 141 | # del settings.OCR_TTL 142 | # self.assertEqual(_u.get_ocr_ttl(), _s.TTL) 143 | 144 | # @override_settings(OCR_TTL=timedelta(days=36)) 145 | # def test_ocr_ttl__overrided(self): 146 | # """Testing get_ocr_ttl() when OCR_TTL is set""" 147 | # self.assertEqual(_u.get_ocr_ttl(), timedelta(days=36)) -------------------------------------------------------------------------------- /django_ocr_server/tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | django_ocr_server/tests/utils/__init__.py 3 | +++++++++++++++++++++++++++++++++++++++++ 4 | 5 | | Author: shmakovpn 6 | | Date: 2020-01-07 7 | """ -------------------------------------------------------------------------------- /django_ocr_server/tests/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | django_ocr_server/tests/utils/test_utils.py 3 | +++++++++++++++++++++++++++++++++++++++++++ 4 | 5 | | Author: shmakovpn 6 | | Date: 2021-01-07 7 | """ 8 | from typing import List, Match, Optional, NewType, Type 9 | from django.test import SimpleTestCase 10 | from unittest.mock import patch 11 | from io import StringIO 12 | import 
def test_read_binary_file(self):
    """Testing read_binary_file(path: str)"""
    # This module lives in tests/utils/, the fixtures in tests/assets/.
    assets_dir: Path = Path(
        os.path.join(os.path.dirname(os.path.dirname(__file__)), 'assets'))

    # An empty file yields empty bytes.
    empty_file: Path = Path(os.path.join(assets_dir, 'empty_file.txt'))
    empty_content: bytes = _u.read_binary_file(empty_file)
    self.assertEqual(empty_content, bytes())

    # A directory cannot be read as a file.
    folder: Path = Path(os.path.join(assets_dir, 'some_dir'))
    with self.assertRaisesMessage(
            IsADirectoryError, f"[Errno 21] Is a directory: '{folder}'"):
        _u.read_binary_file(folder)

    # A regular file is returned verbatim as bytes.
    not_empty_file: Path = Path(
        os.path.join(assets_dir, 'not_empty_file.txt'))
    not_empty_content: bytes = _u.read_binary_file(not_empty_file)
    self.assertIsInstance(not_empty_content, bytes)
    self.assertEqual(not_empty_content, 'content\n'.encode())

    # A missing file propagates FileNotFoundError.
    no_file: Path = Path(os.path.join(assets_dir, 'no_file.txt'))
    with self.assertRaisesMessage(
            FileNotFoundError,
            f"[Errno 2] No such file or directory: '{no_file}'"):
        _u.read_binary_file(no_file)
def test_parse_pdf_datetime(self):
    """Testing parse_pdf_datetime(pdf_datetime: str)"""
    # (raw value as found in PDF metadata, expected normalized value)
    expectations = (
        ("D:20190311033852+00'00'", '2019-03-11 03:38:52+0000'),
        ("D:20190412074856+03'00'", '2019-04-12 07:48:56+0300'),
        ("2019-03-17T09:52:26+07:00", '2019-03-17 09:52:26+0700'),
        ("D:20190317095226+07'00'", '2019-03-17 09:52:26+0700'),
        ("D:20190310075751Z00'00'", '2019-03-10 07:57:51+0000'),
        ("D:20190317034557Z00'00'", '2019-03-17 03:45:57+0000'),
        ('2020-01-10 17:57:31', '2020-01-10 17:57:31'),
        # the same values with a negative timezone offset
        ("D:20190311033852-00'00'", '2019-03-11 03:38:52-0000'),
        ("D:20190412074856-03'00'", '2019-04-12 07:48:56-0300'),
        ("2019-03-17T09:52:26-07:00", '2019-03-17 09:52:26-0700'),
        ("D:20190317095226-07'00'", '2019-03-17 09:52:26-0700'),
        ("D:20190310075751-Z00'00'", '2019-03-10 07:57:51-0000'),
        ("D:20190317034557-Z00'00'", '2019-03-17 03:45:57-0000'),
    )
    for raw_value, normalized in expectations:
        self.assertEqual(_u.parse_pdf_datetime(raw_value), normalized)

    # An unparsable value prints a warning and yields a timestamp instead.
    with patch('sys.stdout', new_callable=StringIO) as captured:
        fallback: str = _u.parse_pdf_datetime('not datetime')
        printed: str = captured.getvalue()
    self.assertEqual(
        printed,
        "could not parse pdf_datetime: 'notdatetime', using now() instead\n"
    )
    looks_like_datetime: Optional[Match] = re.search(
        r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d', fallback)
    self.assertIsNotNone(looks_like_datetime)
class PdfInfo""" 143 | pdf_info: _u.PdfInfo = _u.PdfInfo() 144 | self.assertEqual(pdf_info.author, '') 145 | self.assertEqual(pdf_info.creation_date, '') 146 | self.assertEqual(pdf_info.creator, '') 147 | self.assertEqual(pdf_info.mod_date, '') 148 | self.assertEqual(pdf_info.producer, '') 149 | self.assertEqual(pdf_info.title, '') 150 | self.assertEqual(pdf_info.num_pages, 0) 151 | self.assertEqual(str(pdf_info), f"PdfInfo("\ 152 | + f"author='', "\ 153 | + f"creation_date='', "\ 154 | + f"creator='', "\ 155 | + f"mod_date='', "\ 156 | + f"producer='', "\ 157 | + f"title='', "\ 158 | + f"num_pages=0"\ 159 | + f")") 160 | 161 | def test_pypdf_info_to_pdf_info(self): 162 | """Testing pypdf_inof_to_pdf_info(pypdf_info: PyPDF2.pdf.DocumentInformation)""" 163 | pdf_info: _u.PdfInfo = _u.pypdf_info_to_pdf_info({ 164 | '/Author': 'author', 165 | '/CreationDate': "D:20190412074856+03'00'", 166 | '/Creator': 'creator', 167 | '/ModDate': "D:20200412074856+03'00'", 168 | '/Producer': 'producer', 169 | '/Title': 'title' 170 | }) 171 | self.assertEqual(pdf_info.author, 'author') 172 | self.assertEqual(pdf_info.creation_date, '2019-04-12 07:48:56+0300') 173 | self.assertEqual(pdf_info.creator, 'creator') 174 | self.assertEqual(pdf_info.mod_date, '2020-04-12 07:48:56+0300') 175 | self.assertEqual(pdf_info.producer, 'producer') 176 | self.assertEqual(pdf_info.title, 'title') 177 | self.assertEqual(pdf_info.num_pages, 0) 178 | 179 | def test_get_pdf_info(self): 180 | """Testing get_pdf_info(pdf_content: bytes)""" 181 | tests_dir: Path = Path(os.path.dirname(os.path.dirname(__file__))) 182 | test_eng_pdf: Path = Path(os.path.join(tests_dir, 'test_eng.pdf')) 183 | test_eng_pdf_content: bytes = _u.read_binary_file(test_eng_pdf) 184 | test_eng_pdf_info: _u.PdfInfo = _u.get_pdf_info(test_eng_pdf_content) 185 | self.assertEqual(test_eng_pdf_info.author, '') 186 | self.assertEqual(test_eng_pdf_info.creation_date, 187 | '2019-03-10 07:57:51+0000') 188 | 
self.assertEqual(test_eng_pdf_info.creator, '') 189 | self.assertEqual(test_eng_pdf_info.mod_date, '') 190 | self.assertEqual(test_eng_pdf_info.producer, 'Tesseract 4.0.0-beta.1') 191 | self.assertEqual(test_eng_pdf_info.title, '') 192 | self.assertEqual(test_eng_pdf_info.num_pages, 1) 193 | 194 | with patch('sys.stdout', new_callable=StringIO) as patched_stdout: 195 | not_pdf_info: _u.PdfInfo = _u.get_pdf_info(bytes()) 196 | stdout_value: str = patched_stdout.getvalue() 197 | self.assertEqual( 198 | stdout_value, 199 | "PyPDF2.PdfFileReader exception: Cannot read an empty file\n") 200 | self.assertEqual(not_pdf_info.author, '') 201 | self.assertEqual(not_pdf_info.creation_date, '') 202 | self.assertEqual(not_pdf_info.creator, '') 203 | self.assertEqual(not_pdf_info.mod_date, '') 204 | self.assertEqual(not_pdf_info.producer, '') 205 | self.assertEqual(not_pdf_info.title, '') 206 | self.assertEqual(not_pdf_info.num_pages, 0) 207 | 208 | def test_cmd_stdin(self): 209 | """The testing cmd_stdin(args: List[str], stdin: bytes)""" 210 | cmd: List[str] = ['cat', '-'] 211 | stdin: bytes = 'hello'.encode() 212 | result: str = _u.cmd_stdin(cmd, stdin).decode() 213 | self.assertEqual(result, 'hello') 214 | 215 | not_cmd: List[str] = ['nocmdabracadabra'] 216 | with self.assertRaisesMessage( 217 | FileNotFoundError, 218 | f"[Errno 2] No such file or directory: '{not_cmd[0]}'"): 219 | _: str = _u.cmd_stdin(not_cmd, stdin=stdin).decode() 220 | 221 | def test_tesseract_strarg(self): 222 | """The testing TESSERACT_STRARG""" 223 | tesseract_lang: str = _u.get_tesseract_lang() 224 | tesseract_strargs: List[str] = _u.TESSERACT_STRARG 225 | self.assertEqual(tesseract_strargs[0], 'tesseract') 226 | self.assertEqual(tesseract_strargs[1], '-l') 227 | self.assertEqual(tesseract_strargs[2], tesseract_lang) 228 | self.assertEqual(tesseract_strargs[3], '-') 229 | self.assertEqual(tesseract_strargs[4], '-') 230 | self.assertEqual(len(tesseract_strargs), 5) 231 | 232 | def 
def test_ocr_img2str(self):
    """The testing ocr_img2str(stdin: bytes)"""
    # NOTE(review): the repository tree lists test_eng.png under
    # tests/assets/ - confirm that this path resolves to the asset.
    tests_dir: Path = Path(os.path.dirname(os.path.dirname(__file__)))
    test_eng_png: Path = Path(os.path.join(tests_dir, 'test_eng.png'))
    test_eng_png_content: bytes = _u.read_binary_file(test_eng_png)
    test_eng_ocred_text: str = _u.ocr_img2str(test_eng_png_content)
    # assertTrue(value, msg) only checks that *value* is truthy and treats
    # the second argument as the failure message, so the expected text was
    # never compared. assertIn performs the comparison and tolerates
    # surrounding whitespace in the recognized text.
    self.assertIn('A some english text to test Tesseract',
                  test_eng_ocred_text)
self.assertEqual(args[5], '--force-ocr') 275 | self.assertEqual(args[6], '--sidecar') 276 | self.assertEqual(args[7], '-') 277 | 278 | def test_ocr_pdf(self): 279 | """Testing ocr_pdf(pdf_content, filename)""" 280 | tests_dir: Path = Path(os.path.dirname(os.path.dirname(__file__))) 281 | -------------------------------------------------------------------------------- /django_ocr_server/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | django_ocr_server/urls.py 3 | OCR Server URL dispatcher 4 | """ 5 | __author__ = 'shmakopvn ' 6 | __date__ = '2019-03-19' 7 | 8 | from django.urls import path 9 | from django.conf import settings 10 | from django.conf.urls.static import static 11 | from django.views.generic.base import RedirectView 12 | from rest_framework.authtoken import views 13 | from rest_framework_swagger.views import get_swagger_view 14 | 15 | from django.urls.converters import register_converter 16 | from django.urls import reverse_lazy 17 | from .converters import Md5Converter, DonloadTargetConverter 18 | 19 | # importing views 20 | from .views import * 21 | from .apiviews import * 22 | 23 | register_converter(Md5Converter, 'md5') 24 | register_converter(DonloadTargetConverter, 'download_target') 25 | 26 | schema_view = get_swagger_view(title='OCR Server API') 27 | app_name = __package__ 28 | urlpatterns = [ 29 | path('', RedirectView.as_view(url=reverse_lazy('admin:index'), permanent=False), name='root'), 30 | path('login/', views.obtain_auth_token, name='login'), 31 | path('upload/', UploadFile.as_view(), name='upload'), 32 | path('list/', OCRedFileList.as_view(), name='list'), 33 | path('remove/file/all/', RemoveFileAll.as_view(), name='remove_file_all'), 34 | path('remove/file//', RemoveFileMd5.as_view(), name='remove_file_md5'), 35 | path('remove/pdf/all/', RemovePdfAll.as_view(), name='remove_pdf_all'), 36 | path('remove/pdf//', RemovePdfMd5.as_view(), name='remove_pdf_md5'), 37 | 
def read_binary_file(path: str) -> bytes:
    """
    Reads a file in binary mode and returns its whole content.

    :param path: path to a file
    :return: contents of the file as bytes
    :raises OSError: if the file cannot be opened or read
    """
    # The context manager closes the handle even when read() raises,
    # which the previous open()/read()/close() sequence did not guarantee.
    with open(path, 'rb') as f:
        return f.read()
#: date regex pattern
DATE_PATTERN: Pattern = re.compile(r'((\d\d\d\d)-?(\d\d)-?(\d\d))')


def removeDateHyphens(date_string: str) -> str:
    """Removes hyphens from the first date found in *date_string*"""
    return DATE_PATTERN.sub(r'\2\3\4', date_string, 1)


def parse_pdf_datetime(pdf_datetime: str) -> str:
    """
    Normalizes a date string such as the '/CreationDate' or '/ModDate'
    value returned by PdfFileReader.getDocumentInfo().

    Both the PDF form ("D:20190412074856+03'00'") and the ISO-like form
    ("2019-03-17T09:52:26-07:00") are accepted.

    :param pdf_datetime: the raw date string
    :return: a string 'YYYY-MM-DD HH:MM:SS' followed by the timezone as
        '+HH', '-HH', '+HHMM' or '-HHMM' when one is present. If the input
        does not start with 14 digits after normalization, a message is
        printed to stdout and the current local date and time are returned.
    """
    # Drop the 'D' marker, then every ':', 'T' and ' ' separator in one pass.
    pdf_datetime = pdf_datetime.strip('D').translate(
        str.maketrans('', '', ':T '))
    pdf_datetime = removeDateHyphens(pdf_datetime)

    if not re.match(r'^\d{14}', pdf_datetime):
        # The message reports the normalized string, not the raw input.
        print(
            f"could not parse pdf_datetime: '{pdf_datetime}', using now() instead")
        today = datetime.now()
        return f"{today.date()} {today.time()}"

    stamp: str = pdf_datetime[:14]
    parsed_datetime: str = (f'{stamp[0:4]}-{stamp[4:6]}-{stamp[6:8]} '
                            f'{stamp[8:10]}:{stamp[10:12]}:{stamp[12:14]}')

    # Whatever follows the 14 digits is the optional timezone part.
    tail: str = pdf_datetime[14:].strip('+')
    timezone_sign: str = '+'
    if tail.startswith('-'):
        timezone_sign = '-'
        tail = tail.strip('-')
    tail = tail.strip('Z')
    if re.match(r'^\d\d', tail):
        # timezone hours
        parsed_datetime = f'{parsed_datetime}{timezone_sign}{tail[0:2]}'
        # PDF dates quote the minutes: +03'00'
        tail = tail[2:].strip("'").strip('"')
        if re.match(r'^\d\d', tail):
            # timezone minutes
            parsed_datetime = f'{parsed_datetime}{tail[0:2]}'
    return parsed_datetime
def cmd_stdin(args: List[str], stdin: bytes) -> bytes:
    """
    Launches command using *args* and sends *stdin* to its standard input.

    :param args: the command and its arguments
    :param stdin: bytes written to the standard input of the command
    :return: everything the command wrote to its standard output
    :raises FileNotFoundError: if the executable does not exist

    The standard error of the command is captured and discarded, and the
    exit code is not checked.
    """
    # subprocess.run() wraps the Popen/communicate() sequence: passing
    # *input* creates the stdin pipe, feeds it and waits for the process.
    completed = subprocess.run(args,
                               input=stdin,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    return completed.stdout
def ocr_pdf(pdf_content: bytes, filename: str) -> str:
    """
    OCRs a pdf document by sending it to the standard input of the command
    built by get_ocr_pdf_cmd(filename), and returns the recognized text
    that the command writes to its standard output.

    :param pdf_content: the content of document as bytes
    :param filename: the output filename passed to the OCR command
    :return: a recognized text
    :raises RuntimeError: if the command exits with a non-zero code; the
        message carries the command line, the exit code and its stderr
    """
    args: List[str] = get_ocr_pdf_cmd(filename)
    completed = subprocess.run(args,
                               input=pdf_content,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    if completed.returncode:
        # errors='replace' keeps a diagnostic that is not valid UTF-8 from
        # masking the failure behind a UnicodeDecodeError.
        stderr_text: str = completed.stderr.decode(errors='replace')
        raise RuntimeError(
            f"Process '{' '.join(args)}' failed with code {completed.returncode}: {stderr_text}"
        )
    return completed.stdout.decode()
class FileLink(LinkWidget):
    """
    Widget of OCRedFile.file for using in the admin interface.

    Accepts an optional 'file_type' keyword argument; when it contains
    'image' and OCR_FILE_PREVIEW is enabled, the preview flag is set in
    the template context.
    """
    template_name: str = 'django_ocr_server/forms/widgets/file_link.html'
    file_type: Optional[str] = None

    def __init__(self, *args, **kwargs) -> None:
        # 'file_type' is consumed here and is not passed on to the parent
        if 'file_type' in kwargs:
            self.file_type = kwargs.pop('file_type')
        super().__init__(*args, **kwargs)

    def get_context(self, name, value, attrs):
        """
        Creates the context for rendering the widget template.

        Sets 'file_preview', 'store_files_disabled', 'file_missing' or
        'file_removed' flags in context['widget'], or 'filename' and
        the download 'url' when the stored value is a regular file name.
        """
        context = super().get_context(name, value, attrs)
        widget = context['widget']
        # file_type stays None when the widget was created without it, and
        # "'image' in None" would raise TypeError, so check it first.
        if (ocr_settings.OCR_FILE_PREVIEW and self.file_type
                and 'image' in self.file_type):
            widget['file_preview'] = True
        if not ocr_settings.OCR_STORE_FILES:
            widget['store_files_disabled'] = True
        if ocr_settings.OCR_STORE_FILES_DISABLED_LABEL in widget['value']:
            widget['file_missing'] = True
        elif ocr_settings.OCR_FILE_REMOVED_LABEL in widget['value']:
            widget['file_removed'] = True
        else:
            widget['filename'] = os.path.basename(str(value))
            widget['url'] = reverse(
                f'{__package__}:download',
                kwargs={
                    'download_target': 'file',
                    'filename': widget['filename']
                })
        return context
84 | If pdf file exists the context['pdf_exists'] will be True 85 | If pdf file does not exist and it is possible to create it context['create_pdf_button'] will be True 86 | """ 87 | context = super().get_context(name, value, attrs) 88 | 89 | if not ocr_settings.OCR_STORE_PDF: 90 | context['widget']['store_pdf_disabled'] = True 91 | if not context['widget']['value']: 92 | # value is empty, this means that OCRedFile.file is PDF and it has text, 93 | # or OCRedFile.file was ocred but OCRedFile.text is empty 94 | # In this case no need to show Remove button, and no need to show Create button 95 | return context 96 | if self.can_create_pdf: 97 | context['widget']['create_pdf_button'] = True 98 | if 'store_pdf_disabled' in context['widget']['value']: 99 | context['widget']['pdf_missing'] = True 100 | elif 'pdf_removed' in context['widget']['value']: 101 | context['widget']['pdf_removed'] = True 102 | else: 103 | context['widget']['filename'] = os.path.basename(str(value)) 104 | context['widget']['url'] = reverse( 105 | f'{__package__}:download', 106 | kwargs={ 107 | 'download_target': 'pdf', 108 | 'filename': context['widget']['filename'] 109 | }) 110 | context['widget']['pdf_exists'] = True 111 | return context 112 | 113 | 114 | class PdfInfo(Widget): 115 | template_name: str = 'django_ocr_server/forms/widgets/pdf_info.html' 116 | pdf_info = None 117 | 118 | def __init__(self, *args, **kwargs): 119 | if 'pdf_info' in kwargs: 120 | self.pdf_info = kwargs.pop('pdf_info') 121 | super().__init__(*args, **kwargs) 122 | 123 | def get_context(self, name, value, attrs): 124 | context = super(PdfInfo, self).get_context(name, value, attrs) 125 | context['pdf_info'] = self.pdf_info 126 | return context 127 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from 
the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | django -------------------------------------------------------------------------------- /doc/reuirements.txt: -------------------------------------------------------------------------------- 1 | # nothing to do -------------------------------------------------------------------------------- /doc/source/api_documentation.rst: -------------------------------------------------------------------------------- 1 | .. index:: API documentation 2 | 3 | API documentation 4 | ================= 5 | 6 | | Django-ocr-server provides API documentation using restframework.documentation and swagger. 
7 | | Visit http://localhost:8000/swagger and http://localhost:8000/docs/ -------------------------------------------------------------------------------- /doc/source/code-block.types.txt: -------------------------------------------------------------------------------- 1 | abap 2 | abnf 3 | ada, ada95, ada2005 4 | adl 5 | agda 6 | ahk, autohotkey 7 | alloy 8 | ampl 9 | antlr-as, antlr-actionscript 10 | antlr-cpp 11 | antlr-csharp, antlr-c# 12 | antlr-java 13 | antlr-objc 14 | antlr-perl 15 | antlr-python 16 | antlr-ruby, antlr-rb 17 | antlr 18 | apacheconf, aconf, apache 19 | apl 20 | applescript 21 | arduino 22 | as, actionscript 23 | as3, actionscript3 24 | aspectj 25 | aspx-cs 26 | aspx-vb 27 | asy, asymptote 28 | at, ambienttalk, ambienttalk/2 29 | autoit 30 | awk, gawk, mawk, nawk 31 | basemake 32 | bash, sh, ksh, shell 33 | bat, batch, dosbatch, winbatch 34 | bbcode 35 | bc 36 | befunge 37 | blitzbasic, b3d, bplus 38 | blitzmax, bmax 39 | bnf 40 | boo 41 | boogie 42 | brainfuck, bf 43 | bro 44 | bugs, winbugs, openbugs 45 | c-objdump 46 | c 47 | ca65 48 | cadl 49 | camkes, idl4 50 | cbmbas 51 | ceylon 52 | cfc 53 | cfengine3, cf3 54 | cfm 55 | cfs 56 | chai, chaiscript 57 | chapel, chpl 58 | cheetah, spitfire 59 | cirru 60 | clay 61 | clean 62 | clojure, clj 63 | clojurescript, cljs 64 | cmake 65 | cobol 66 | cobolfree 67 | coffee-script, coffeescript, coffee 68 | common-lisp, cl, lisp 69 | componentpascal, cp 70 | console, shell-session 71 | control, debcontrol 72 | coq 73 | cpp, c++ 74 | cpp-objdump, c++-objdumb, cxx-objdump 75 | cpsa 76 | crmsh, pcmk 77 | croc 78 | cryptol, cry 79 | csharp, c# 80 | csound, csound-orc 81 | csound-document, csound-csd 82 | csound-score, csound-sco 83 | css+django, css+jinja 84 | css+erb, css+ruby 85 | css+genshitext, css+genshi 86 | css+lasso 87 | css+mako 88 | css+mako 89 | css+mozpreproc 90 | css+myghty 91 | css+php 92 | css+smarty 93 | css 94 | cucumber, gherkin 95 | cuda, cu 96 | cypher 97 | cython, pyx, pyrex 98 | 
d-objdump 99 | d 100 | dart 101 | delphi, pas, pascal, objectpascal 102 | dg 103 | diff, udiff 104 | django, jinja 105 | docker, dockerfile 106 | doscon 107 | dpatch 108 | dtd 109 | duel, jbst, jsonml+bst 110 | dylan-console, dylan-repl 111 | dylan-lid, lid 112 | dylan 113 | earl-grey, earlgrey, eg 114 | easytrieve 115 | ebnf 116 | ec 117 | ecl 118 | eiffel 119 | elixir, ex, exs 120 | elm 121 | emacs, elisp, emacs-lisp 122 | erb 123 | erl 124 | erlang 125 | evoque 126 | extempore 127 | ezhil 128 | factor 129 | fan 130 | fancy, fy 131 | felix, flx 132 | fish, fishshell 133 | flatline 134 | fortran 135 | fortranfixed 136 | foxpro, vfp, clipper, xbase 137 | fsharp 138 | gap 139 | gas, asm 140 | genshi, kid, xml+genshi, xml+kid 141 | genshitext 142 | glsl 143 | gnuplot 144 | go 145 | golo 146 | gooddata-cl 147 | gosu 148 | groff, nroff, man 149 | groovy 150 | gst 151 | haml 152 | handlebars 153 | haskell, hs 154 | haxeml, hxml 155 | hexdump 156 | hsail, hsa 157 | html+cheetah, html+spitfire, htmlcheetah 158 | html+django, html+jinja, htmldjango 159 | html+evoque 160 | html+genshi, html+kid 161 | html+handlebars 162 | html+lasso 163 | html+mako 164 | html+mako 165 | html+myghty 166 | html+php 167 | html+smarty 168 | html+twig 169 | html+velocity 170 | html 171 | http 172 | hx, haxe, hxsl 173 | hybris, hy 174 | hylang 175 | i6t 176 | idl 177 | idris, idr 178 | iex 179 | igor, igorpro 180 | inform6, i6 181 | inform7, i7 182 | ini, cfg, dosini 183 | io 184 | ioke, ik 185 | irc 186 | isabelle 187 | j 188 | jade 189 | jags 190 | jasmin, jasminxt 191 | java 192 | javascript+mozpreproc 193 | jcl 194 | jlcon 195 | js+cheetah, javascript+cheetah, js+spitfire, javascript+spitfire 196 | js+django, javascript+django, js+jinja, javascript+jinja 197 | js+erb, javascript+erb, js+ruby, javascript+ruby 198 | js+genshitext, js+genshi, javascript+genshitext, javascript+genshi 199 | js+lasso, javascript+lasso 200 | js+mako, javascript+mako 201 | js+mako, javascript+mako 202 | js+myghty, 
javascript+myghty 203 | js+php, javascript+php 204 | js+smarty, javascript+smarty 205 | js, javascript 206 | jsgf 207 | json 208 | jsonld, json-ld 209 | jsp 210 | julia, jl 211 | kal 212 | kconfig, menuconfig, linux-config, kernel-config 213 | koka 214 | kotlin 215 | lagda, literate-agda 216 | lasso, lassoscript 217 | lcry, literate-cryptol, lcryptol 218 | lean 219 | less 220 | lhs, literate-haskell, lhaskell 221 | lidr, literate-idris, lidris 222 | lighty, lighttpd 223 | limbo 224 | liquid 225 | live-script, livescript 226 | llvm 227 | logos 228 | logtalk 229 | lsl 230 | lua 231 | make, makefile, mf, bsdmake 232 | mako 233 | mako 234 | maql 235 | mask 236 | mason 237 | mathematica, mma, nb 238 | matlab 239 | matlabsession 240 | minid 241 | modelica 242 | modula2, m2 243 | monkey 244 | moocode, moo 245 | moon, moonscript 246 | mozhashpreproc 247 | mozpercentpreproc 248 | mql, mq4, mq5, mql4, mql5 249 | mscgen, msc 250 | mupad 251 | mxml 252 | myghty 253 | mysql 254 | nasm 255 | ncl 256 | nemerle 257 | nesc 258 | newlisp 259 | newspeak 260 | nginx 261 | nimrod, nim 262 | nit 263 | nixos, nix 264 | nsis, nsi, nsh 265 | numpy 266 | objdump-nasm 267 | objdump 268 | objective-c++, objectivec++, obj-c++, objc++ 269 | objective-c, objectivec, obj-c, objc 270 | objective-j, objectivej, obj-j, objj 271 | ocaml 272 | octave 273 | odin 274 | ooc 275 | opa 276 | openedge, abl, progress 277 | pacmanconf 278 | pan 279 | parasail 280 | pawn 281 | perl, pl 282 | perl6, pl6 283 | php, php3, php4, php5 284 | pig 285 | pike 286 | pkgconfig 287 | plpgsql 288 | postgresql, postgres 289 | postscript, postscr 290 | pot, po 291 | pov 292 | powershell, posh, ps1, psm1 293 | praat 294 | prolog 295 | properties, jproperties 296 | protobuf, proto 297 | ps1con 298 | psql, postgresql-console, postgres-console 299 | puppet 300 | py3tb 301 | pycon 302 | pypylog, pypy 303 | pytb 304 | python, py, sage 305 | python3, py3 306 | qbasic, basic 307 | qml, qbs 308 | qvto, qvt 309 | racket, rkt 310 | 
ragel-c 311 | ragel-cpp 312 | ragel-d 313 | ragel-em 314 | ragel-java 315 | ragel-objc 316 | ragel-ruby, ragel-rb 317 | ragel 318 | raw 319 | rb, ruby, duby 320 | rbcon, irb 321 | rconsole, rout 322 | rd 323 | rebol 324 | red, red/system 325 | redcode 326 | registry 327 | resource, resourcebundle 328 | rexx, arexx 329 | rhtml, html+erb, html+ruby 330 | roboconf-graph 331 | roboconf-instances 332 | robotframework 333 | rql 334 | rsl 335 | rst, rest, restructuredtext 336 | rts, trafficscript 337 | rust 338 | sass 339 | sc, supercollider 340 | scala 341 | scaml 342 | scheme, scm 343 | scilab 344 | scss 345 | shen 346 | silver 347 | slim 348 | smali 349 | smalltalk, squeak, st 350 | smarty 351 | sml 352 | snobol 353 | sourceslist, sources.list, debsources 354 | sp 355 | sparql 356 | spec 357 | splus, s, r 358 | sql 359 | sqlite3 360 | squidconf, squid.conf, squid 361 | ssp 362 | stan 363 | swift 364 | swig 365 | systemverilog, sv 366 | tads3 367 | tap 368 | tcl 369 | tcsh, csh 370 | tcshcon 371 | tea 372 | termcap 373 | terminfo 374 | terraform, tf 375 | tex, latex 376 | text 377 | thrift 378 | todotxt 379 | trac-wiki, moin 380 | treetop 381 | ts, typescript 382 | turtle 383 | twig 384 | typoscript 385 | typoscriptcssdata 386 | typoscripthtmldata 387 | urbiscript 388 | vala, vapi 389 | vb.net, vbnet 390 | vcl 391 | vclsnippets, vclsnippet 392 | vctreestatus 393 | velocity 394 | verilog, v 395 | vgl 396 | vhdl 397 | vim 398 | wdiff 399 | x10, xten 400 | xml+cheetah, xml+spitfire 401 | xml+django, xml+jinja 402 | xml+erb, xml+ruby 403 | xml+evoque 404 | xml+lasso 405 | xml+mako 406 | xml+mako 407 | xml+myghty 408 | xml+php 409 | xml+smarty 410 | xml+velocity 411 | xml 412 | xquery, xqy, xq, xql, xqm 413 | xslt 414 | xtend 415 | xul+mozpreproc 416 | yaml+jinja, salt, sls 417 | yaml 418 | zephir | -------------------------------------------------------------------------------- /doc/source/commands.rst: 
-------------------------------------------------------------------------------- 1 | .. index:: Management Commands 2 | 3 | Management Commands 4 | =================== 5 | 6 | Run it to clean trash. It removes all uploaded files and PDFs that do not have related models in database. 7 | 8 | .. code-block:: shell-session 9 | 10 | $ python manage.py clean 11 | 12 | Run it to remove models, uploaded files and PDFs, whose time to live (TTL) has expired. 13 | 14 | .. code-block:: shell-session 15 | 16 | $ python manage.py ttl 17 | 18 | Create the user for API (return the AUTH-token). 19 | 20 | .. code-block:: shell-session 21 | 22 | $ python manage.py create_user username password 23 | b2db7002e037a4edb25aed33b04b97e468970376 24 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | doc/source/conf.py 3 | ++++++++++++++++++ 4 | 5 | Sphinx configuration file of django_ocr_server 6 | 7 | | Author: shmakovpn 8 | | Date: 2021-01-19 9 | """ 10 | from typing import List 11 | import os 12 | import sys 13 | 14 | # -- Project information ----------------------------------------------------- 15 | 16 | project: str = 'django_ocr_server' 17 | copyright: str = '2019, shmakovpn' 18 | author: str = 'shmakovpn' 19 | 20 | SCRIPT_DIR: str = os.path.dirname(os.path.abspath(__file__)) 21 | DOCS_DIR: str = os.path.dirname(SCRIPT_DIR) 22 | PROJECT_DIR: str = os.path.dirname(DOCS_DIR) 23 | PACKAGE_DIR: str = os.path.join(PROJECT_DIR, project) 24 | sys.path.insert(0, PROJECT_DIR) # needed to automodule 25 | 26 | from django_ocr_server.version import VERSION 27 | from django.conf import settings 28 | if not settings.configured: 29 | settings.configure( 30 | BASE_DIR=PROJECT_DIR, 31 | ) 32 | 33 | # mocking C modules 34 | # autodock_mock_imports: List[str] = [] 35 | 36 | # The short X.Y version 37 | version: str = VERSION 38 | # The full version, including 
alpha/beta/rc tags 39 | release: str = VERSION 40 | 41 | # -- General configuration --------------------------------------------------- 42 | 43 | # Add any Sphinx extension module names here, as strings. They can be 44 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 45 | # ones. 46 | extensions: List[str] = [ 47 | 'sphinx.ext.todo', 48 | 'sphinx.ext.viewcode', 49 | 'sphinx.ext.autodoc', 50 | 'sphinx.ext.intersphinx', 51 | ] 52 | 53 | master_doc: str = 'contents' 54 | 55 | # Add any paths that contain templates here, relative to this directory. 56 | templates_path: List[str] = ['_templates'] 57 | 58 | # List of patterns, relative to source directory, that match files and 59 | # directories to ignore when looking for source files. 60 | # This pattern also affects html_static_path and html_extra_path. 61 | exclude_patterns: List[str] = [] 62 | 63 | # -- Options for HTML output ------------------------------------------------- 64 | 65 | # The theme to use for HTML and HTML Help pages. See the documentation for 66 | # a list of builtin themes. 67 | # 68 | html_theme: str = 'sphinx_rtd_theme' 69 | 70 | # Add any paths that contain custom static files (such as style sheets) here, 71 | # relative to this directory. They are copied after the builtin static files, 72 | # so a file named "default.css" will overwrite the builtin "default.css". 73 | html_static_path: List[str] = ['_static'] 74 | 75 | html_css_files: List[str] = [ 76 | 'custom.css', 77 | ] -------------------------------------------------------------------------------- /doc/source/configuration.rst: -------------------------------------------------------------------------------- 1 | .. index:: Configuration 2 | .. index:: OCR_FILE_PREVIEW 3 | .. index:: OCR_TESSERACT_LANG 4 | .. index:: OCR_STORE_PDF 5 | .. index:: OCR_FILES_UPLOAD_TO 6 | .. index:: OCR_PDF_UPLOAD_TO 7 | .. index:: OCR_FILES_TTL 8 | .. index:: OCR_PDF_TTL 9 | .. 
index:: OCR_TTL 10 | 11 | Configuration 12 | ============= 13 | For changing your django_ocr_server behavior you can use 14 | several parameters in the settings.py of your django project. 15 | 16 | .. index:: OCR_STORE_FILES 17 | 18 | .. py:currentmodule:: django_ocr_server.default_settings 19 | 20 | .. autodata:: OCR_STORE_FILES 21 | 22 | .. autodata:: OCR_FILE_PREVIEW 23 | 24 | .. autodata:: OCR_TESSERACT_LANG 25 | 26 | .. autodata:: OCR_STORE_PDF 27 | 28 | .. autodata:: OCR_STORE_FILES_DISABLED_LABEL 29 | 30 | .. autodata:: OCR_STORE_PDF_DISABLED_LABEL 31 | 32 | .. autodata:: OCR_FILE_REMOVED_LABEL 33 | 34 | .. autodata:: OCR_PDF_REMOVED_LABEL 35 | 36 | .. autodata:: OCR_ALLOWED_FILE_TYPES 37 | 38 | .. autodata:: OCR_FILES_UPLOAD_TO 39 | 40 | .. autodata:: OCR_PDF_UPLOAD_TO 41 | 42 | Time to live settings 43 | +++++++++++++++++++++ 44 | 45 | .. autodata:: OCR_FILES_TTL 46 | 47 | .. autodata:: OCR_PDF_TTL 48 | 49 | .. autodata:: OCR_TTL 50 | -------------------------------------------------------------------------------- /doc/source/contents.rst: -------------------------------------------------------------------------------- 1 | .. include:: index.rst -------------------------------------------------------------------------------- /doc/source/creation_package.rst: -------------------------------------------------------------------------------- 1 | .. index:: Creation a distribution package 2 | 3 | Creation a distribution package 4 | =============================== 5 | As mentioned earlier, the automatic installation script 'install_ubuntu.sh' 6 | uses the package from the PyPI repository by default. To change this behavior or 7 | if you need your own distribution package you can build it. 8 | 9 | Run command 10 | | $cd path to cloned project from github 11 | | $python setup.py sdist 12 | 13 | Look in 'dist' directory, there is your package was created. 14 | 15 | Also you can continue automatic installation. The package will be used. 
-------------------------------------------------------------------------------- /doc/source/deploy.rst: -------------------------------------------------------------------------------- 1 | .. index:: Deploying to production 2 | 3 | Deploying to production 4 | ======================= 5 | 6 | .. index:: Linux Mint 19 deploy to production 7 | .. index:: Ubuntu bionic deploy to production 8 | 9 | Linux Mint 19 (Ubuntu bionic) 10 | ----------------------------- 11 | Installing nginx 12 | 13 | .. code-block:: shell-session 14 | 15 | $ sudo apt install nginx 16 | 17 | Installing uwsgi (on virtualenv django_ocr_server) 18 | 19 | .. code-block:: shell-session 20 | 21 | $ pip install uwsgi 22 | 23 | .. index:: uwsgi Linux Mint 19 configuration 24 | 25 | .. index:: uwsgi Ubuntu bionic configuration 26 | 27 | Create {path_to_your_project}/uwsgi.ini 28 | 29 | .. code-block:: cfg 30 | 31 | [uwsgi] 32 | chdir = {path_to_your_project} # e.g. /home/shmakovpn/ocr_server 33 | module = {your_project}.wsgi # e.g. ocr_server.wsgi 34 | home = {path_to_your_virtualenv} # e.g. /home/shmakovpn/.virtualenvs/django_ocr_server 35 | master = true 36 | processes = 10 37 | http = 127.0.0.1:8003 38 | vacuum = true 39 | 40 | .. index:: nginx Linux Mint 19 configuration 41 | 42 | .. index:: nginx Ubuntu bionic configuration 43 | 44 | Create /etc/nginx/sites-available/django_ocr_server.conf 45 | 46 | .. 
code-block:: nginx 47 | 48 | server { 49 | listen 80; # choose the port you want 50 | server_name _; 51 | charset utf-8; 52 | client_max_body_size 75M; 53 | location /static/rest_framework_swagger { 54 | alias {path_to_your_virtualenv}/lib/python3.7/site-packages/rest_framework_swagger/static/rest_framework_swagger; 55 | } 56 | location /static/rest_framework { 57 | alias {path_to_your_virtualenv}/lib/python3.7/site-packages/rest_framework/static/rest_framework; 58 | } 59 | location /static/admin { 60 | alias {path_to_your_virtualenv}/lib/python3.7/site-packages/django/contrib/admin/static/admin; 61 | } 62 | location / { 63 | proxy_pass http://127.0.0.1:8003; 64 | } 65 | } 66 | 67 | Enable the django_ocr_server site 68 | 69 | .. code-block:: shell-session 70 | 71 | $ sudo ln -s /etc/nginx/sites-available/django_ocr_server.conf /etc/nginx/sites-enabled/ 72 | 73 | Remove the nginx default site 74 | 75 | .. code-block:: shell-session 76 | 77 | $ sudo rm /etc/nginx/sites-enabled/default 78 | 79 | .. index:: systemd service unit Linux Mint 19 80 | 81 | .. index:: systemd service unit Ubuntu bionic 82 | 83 | Create the systemd service unit /etc/systemd/system/django-ocr-server.service 84 | 85 | .. code-block:: cfg 86 | 87 | [Unit] 88 | Description=uWSGI Django OCR Server 89 | After=syslog.service 90 | 91 | [Service] 92 | User={your user} 93 | Group={your group} 94 | Environment="PATH={path_to_your_virtualenv}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" 95 | ExecStart={path_to_your_virtualenv}/bin/uwsgi --ini {path_to_your_project}/uwsgi.ini 96 | RuntimeDirectory=uwsgi 97 | Restart=always 98 | KillSignal=SIGQUIT 99 | Type=notify 100 | StandardError=syslog 101 | NotifyAccess=all 102 | 103 | [Install] 104 | WantedBy=multi-user.target 105 | 106 | Reload systemd 107 | 108 | .. code-block:: shell-session 109 | 110 | $ sudo systemctl daemon-reload 111 | 112 | Start the django-ocr-server service 113 | 114 | ..
code-block:: shell-session 115 | 116 | $ sudo systemctl start django-ocr-server 117 | 118 | Enable the django-ocr-server service to start automatically after the server is booted 119 | 120 | .. code-block:: shell-session 121 | 122 | $ sudo systemctl enable django-ocr-server 123 | 124 | Start nginx 125 | 126 | .. code-block:: shell-session 127 | 128 | $ sudo systemctl start nginx 129 | 130 | Enable the nginx service to start automatically after the server is booted 131 | 132 | .. code-block:: shell-session 133 | 134 | $ sudo systemctl enable nginx 135 | 136 | Go to http://{your_server}:80 137 | You will be redirected to the admin page 138 | 139 | .. index:: Centos 7 deploy to production 140 | 141 | Centos 7 142 | -------- 143 | 144 | Installing nginx 145 | 146 | .. code-block:: shell-session 147 | 148 | $ sudo yum install nginx 149 | 150 | Installing uwsgi (on virtualenv django_ocr_server) 151 | 152 | .. code-block:: shell-session 153 | 154 | $ pip install uwsgi 155 | 156 | .. index:: uwsgi configuration Centos 7 157 | 158 | Create /var/www/ocr_server/uwsgi.ini 159 | 160 | .. code-block:: cfg 161 | 162 | [uwsgi] 163 | chdir = /var/www/ocr_server 164 | module = ocr_server.wsgi 165 | home = /var/www/ocr_server/venv 166 | master = true 167 | processes = 10 168 | http = 127.0.0.1:8003 169 | vacuum = true 170 | 171 | .. index:: systemd service unit centos 7 172 | 173 | Create the systemd service unit /etc/systemd/system/django-ocr-server.service 174 | 175 | ..
code-block:: cfg 176 | 177 | [Unit] 178 | Description=uWSGI Django OCR Server 179 | After=syslog.service 180 | 181 | [Service] 182 | User=nginx 183 | Group=nginx 184 | Environment="PATH=/var/www/ocr_server/venv/bin:/sbin:/bin:/usr/sbin:/usr/bin" 185 | ExecStart=/var/www/ocr_server/venv/bin/uwsgi --ini /var/www/ocr_server/uwsgi.ini 186 | RuntimeDirectory=uwsgi 187 | Restart=always 188 | KillSignal=SIGQUIT 189 | Type=notify 190 | StandardError=syslog 191 | NotifyAccess=all 192 | 193 | [Install] 194 | WantedBy=multi-user.target 195 | 196 | Reload systemd 197 | 198 | .. code-block:: shell-session 199 | 200 | $ sudo systemctl daemon-reload 201 | 202 | Change the owner of /var/www/ocr_server to nginx 203 | 204 | .. code-block:: shell-session 205 | 206 | $ sudo chown -R nginx:nginx /var/www/ocr_server 207 | 208 | Start the django-ocr-server service 209 | 210 | .. code-block:: shell-session 211 | 212 | $ sudo systemctl start django-ocr-server 213 | 214 | Check that the port is up 215 | 216 | .. code-block:: shell-session 217 | 218 | $ sudo netstat -anlpt | grep 8003 219 | 220 | You should get something like this: 221 | 222 | .. code-block:: shell-session 223 | 224 | tcp 0 0 127.0.0.1:8003 0.0.0.0:* LISTEN 2825/uwsgi 225 | 226 | Enable the django-ocr-server uwsgi service 227 | 228 | .. code-block:: shell-session 229 | 230 | $ sudo systemctl enable django-ocr-server 231 | 232 | .. index:: nginx Centos 7 configuration 233 | 234 | Edit /etc/nginx/nginx.conf 235 | 236 | ..
code-block:: nginx 237 | 238 | server { 239 | listen 80 default_server; 240 | listen [::]:80 default_server; 241 | server_name _; 242 | charset utf-8; 243 | client_max_body_size 75M; 244 | location /static/rest_framework_swagger { 245 | alias /var/www/ocr_server/venv/lib/python3.6/site-packages/rest_framework_swagger/static/rest_framework_swagger; 246 | } 247 | location /static/rest_framework { 248 | alias /var/www/ocr_server/venv/lib/python3.6/site-packages/rest_framework/static/rest_framework; 249 | } 250 | location /static/admin { 251 | alias /var/www/ocr_server/venv/lib/python3.6/site-packages/django/contrib/admin/static/admin; 252 | } 253 | location / { 254 | proxy_pass http://127.0.0.1:8003; 255 | } 256 | } 257 | 258 | .. index:: selinux Centos 7 configuration 259 | 260 | Configure SELinux 261 | | Django has a bug (https://code.djangoproject.com/ticket/29027#no1) 262 | | By default it stores uploaded files larger than 2.5 MB in the /tmp folder. A temp file gets the 263 | 'system_u:object_r:httpd_tmp_t:s0' SELinux context. Then Django tries to copy this file 264 | to the upload folder together with its SELinux context using os.setxattr() from lib/python3.6/shutil.py. 265 | This is wrong behavior, because in the upload folder the SELinux context of a file 266 | has to be 'httpd_sys_rw_content_t'. To solve the problem we have to create another folder for 267 | temp files with the 'httpd_sys_rw_content_t' context, for example /var/www/ocr_server/tmp. Then configure Django 268 | to store temp files in this folder. 269 | 270 | .. code-block:: shell-session 271 | 272 | $ sudo mkdir /var/www/ocr_server/tmp 273 | $ sudo chown {your_user} /var/www/ocr_server/tmp 274 | 275 | Change /var/www/ocr_server/ocr_server/settings.py 276 | 277 | .. code-block:: python 278 | 279 | FILE_UPLOAD_TEMP_DIR = os.path.join(BASE_DIR, 'tmp') 280 | 281 | Configure SELinux contexts 282 | 283 | ..
code-block:: shell-session 284 | 285 | $ sudo semanage port -a -t http_port_t -p tcp 8003 286 | $ sudo semanage fcontext -a -t httpd_sys_content_t '/var/www/ocr_server/venv/lib/python3.6/site-packages/rest_framework_swagger/static/rest_framework_swagger(/.*)?' 287 | $ sudo semanage fcontext -a -t httpd_sys_content_t '/var/www/ocr_server/venv/lib/python3.6/site-packages/rest_framework/static/rest_framework(/.*)?' 288 | $ sudo semanage fcontext -a -t httpd_sys_content_t '/var/www/ocr_server/venv/lib/python3.6/site-packages/django/contrib/admin/static/admin(/.*)?' 289 | $ find /var/www/ocr_server/venv/lib/python3.6/site-packages/ | grep '\.so' | grep -v '\.libs' | xargs -L1 sudo semanage fcontext -a -t httpd_sys_script_exec_t 290 | $ sudo semanage fcontext -a -t httpd_sys_script_exec_t '/var/www/ocr_server/venv/bin(/.*)?' 291 | $ sudo semanage fcontext -a -t httpd_sys_script_exec_t '/var/www/ocr_server/venv/lib/python3.6/site-packages/psycopg2/.libs(/.*)?' 292 | $ sudo semanage fcontext -a -t httpd_sys_rw_content_t '/var/www/ocr_server/django_ocr_server/upload(/.*)?' 293 | $ sudo semanage fcontext -a -t httpd_sys_rw_content_t '/var/www/ocr_server/django_ocr_server/pdf(/.*)?' 294 | $ sudo semanage fcontext -a -t httpd_sys_rw_content_t '/var/www/ocr_server/tmp(/.*)?' 295 | $ sudo restorecon -Rv /var/www/ocr_server 296 | $ sudo setsebool -P httpd_can_network_connect_db 1 297 | 298 | Start nginx service 299 | 300 | .. code-block:: shell-session 301 | 302 | $ sudo systemctl start nginx 303 | 304 | Enable nginx service 305 | 306 | .. code-block:: shell-session 307 | 308 | $ sudo systemctl enable nginx 309 | 310 | .. index:: firewall Centos 7 configuration 311 | 312 | Configure firewall 313 | 314 | ..
code-block:: shell-session 315 | 316 | $ sudo firewall-cmd --zone=public --add-service=http --permanent 317 | $ sudo firewall-cmd --reload 318 | 319 | | Go to http://{your_server}:80 320 | | You will be redirected to admin page 321 | -------------------------------------------------------------------------------- /doc/source/developer_guide.rst: -------------------------------------------------------------------------------- 1 | Developer Guide 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | developer_guide/default_settings.py 9 | developer_guide/conf.py 10 | -------------------------------------------------------------------------------- /doc/source/developer_guide/conf.py.rst: -------------------------------------------------------------------------------- 1 | django_ocr_server/conf.py 2 | ========================= 3 | 4 | The settings manager of **django_ocr_server**. 5 | 6 | .. automodule:: django_ocr_server.conf 7 | :members: -------------------------------------------------------------------------------- /doc/source/developer_guide/default_settings.py.rst: -------------------------------------------------------------------------------- 1 | django_ocr_server/default_settings.py 2 | ===================================== 3 | 4 | The default settings of **django_ocr_server**. 5 | 6 | .. 
literalinclude:: ../../../django_ocr_server/default_settings.py 7 | :language: python -------------------------------------------------------------------------------- /doc/source/django_ocr_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/doc/source/django_ocr_server.png -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to django_ocr_server's documentation! 2 | ============================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | Introduction 9 | Installation 10 | Configuration 11 | Deploying to production 12 | Usage examples 13 | Running tests 14 | API documentation 15 | Management commands 16 | Creation a distribution package 17 | developer_guide 18 | 19 | 20 | Indices and tables 21 | ================== 22 | 23 | * :ref:`genindex` 24 | * :ref:`modindex` 25 | 26 | -------------------------------------------------------------------------------- /doc/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. index:: Installation 2 | 3 | Installation 4 | ============ 5 | 6 | .. index:: Linux Mint 19 installation 7 | .. index:: Ubuntu bionic installation 8 | 9 | Linux Mint 19 (Ubuntu bionic) 10 | ----------------------------- 11 | Installing packages 12 | 13 | .. code-block:: shell-session 14 | 15 | $ sudo apt install g++ # need to build pdftotext 16 | $ sudo apt install libpoppler-cpp-dev # need to buid pdftotext 17 | 18 | Installing tesseract 19 | 20 | .. code-block:: shell-session 21 | 22 | $ sudo apt install tesseract-ocr 23 | $ sudo apt install tesseract-ocr-rus # install languages you want 24 | 25 | Installing ghostscript 26 | 27 | .. 
code-block:: shell-session 28 | 29 | $ sudo apt install ghostscript 30 | 31 | Installing python3.7 32 | 33 | .. code-block:: shell-session 34 | 35 | $ sudo apt install python3.7 36 | $ sudo apt install python3.7-dev 37 | 38 | Installing pip 39 | 40 | .. code-block:: shell-session 41 | 42 | $ sudo apt install python-pip 43 | 44 | Installing virtualenv 45 | 46 | .. code-block:: shell-session 47 | 48 | $ pip install --user virtualenv 49 | $ echo 'PATH=~/.local/bin:$PATH' >> ~/.bashrc 50 | $ source ~/.bashrc 51 | 52 | Installing virtualenvwrapper 53 | 54 | .. code-block:: shell-session 55 | 56 | $ pip install --user setuptools 57 | $ pip install --user wheel 58 | $ pip install --user virtualenvwrapper 59 | $ echo 'source ~/.local/bin/virtualenvwrapper.sh' >> ~/.bashrc 60 | $ source ~/.bashrc 61 | 62 | Creating virtualenv for django_ocr_server 63 | 64 | .. code-block:: shell-session 65 | 66 | $ mkvirtualenv django_ocr_server -p /usr/bin/python3.7 67 | 68 | 69 | Installing django-ocr-server (on virtualenv django_ocr_server). 70 | It installs Django as a dependency. 71 | 72 | .. code-block:: shell-session 73 | 74 | $ pip install django-ocr-server 75 | 76 | Create your Django project (on virtualenv django_ocr_server) 77 | 78 | .. code-block:: shell-session 79 | 80 | $ django-admin startproject ocr_server 81 | 82 | Go to project directory 83 | 84 | .. code-block:: shell-session 85 | 86 | $ cd ocr_server 87 | 88 | .. index:: settings.py Linux Mint 19 89 | .. index:: settings.py Ubuntu bionic 90 | 91 | 92 | Edit ocr_server/settings.py 93 | 94 | Add applications to INSTALLED_APPS 95 | 96 | .. code-block:: python 97 | 98 | INSTALLED_APPS = [ 99 | ... 100 | 'rest_framework', 101 | 'rest_framework.authtoken', 102 | 'django_ocr_server', 103 | 'rest_framework_swagger', 104 | ] 105 | 106 | .. index:: urls.py Linux Mint 19 107 | .. index:: urls.py Ubuntu bionic 108 | 109 | Edit ocr_server/urls.py 110 | 111 | .. 
code-block:: python 112 | 113 | from django.contrib import admin 114 | from django.urls import path, include 115 | from rest_framework.documentation import include_docs_urls 116 | 117 | admin.site.site_header = 'OCR Server Administration' 118 | admin.site.site_title = 'Welcome to OCR Server Administration Portal' 119 | 120 | urlpatterns = [ 121 | path('admin/', admin.site.urls, ), 122 | path('docs/', include_docs_urls(title='OCR Server API')), 123 | path('', include('django_ocr_server.urls'), ), 124 | ] 125 | 126 | Perform migrations (on virtualenv django_ocr_server) 127 | 128 | .. code-block:: shell-session 129 | 130 | $ python manage.py migrate 131 | 132 | Create superuser (on virtualenv django_ocr_server) 133 | 134 | .. code-block:: shell-session 135 | 136 | $ python manage.py createsuperuser 137 | 138 | Run server (on virtualenv django_ocr_server), then visit http://localhost:8000/ 139 | 140 | .. code-block:: shell-session 141 | 142 | $ python manage.py runserver 143 | 144 | .. index:: Linux Mint 19 automatic installation 145 | .. index:: Ubuntu bionic automatic installation 146 | 147 | Linux Mint 19 (Ubuntu bionic) automatic installation 148 | ----------------------------------------------------- 149 | 150 | Clone django_ocr_server from github 151 | 152 | .. code-block:: shell-session 153 | 154 | $ git clone https://github.com/shmakovpn/django_ocr_server.git 155 | 156 | Run the installation script using sudo 157 | 158 | .. code-block:: shell-session 159 | 160 | $ sudo {your_path}/django_ocr_server/install_ubuntu.sh 161 | 162 | The script creates an OS user named 'django_ocr_server' and installs all needed packages. 163 | It creates the virtual environment. 164 | It installs django_ocr_server (from PyPI by default, but you can create the package from the 165 | cloned repository, see the topic 'Creation a distribution package' for how to do this). 166 | Then it creates the django project named 'ocr_server' in the home directory of the 'django_ocr_server' OS user.
167 | After that, the script changes settings.py and urls.py, which are placed in ~django_ocr_server/ocr_server/ocr_server/. 168 | Finally it applies migrations and creates the superuser named 'admin' with the same password 'admin'. 169 | 170 | Run the server under the OS user django_ocr_server, then change the 'admin' password on the http://localhost:your_port/admin/ page. 171 | 172 | .. code-block:: shell-session 173 | 174 | $ sudo su 175 | # su django_ocr_server 176 | $ cd ~/ocr_server 177 | $ workon django_ocr_server 178 | $ python manage.py runserver 179 | 180 | .. index:: Centos 7 installation 181 | 182 | Centos 7 183 | -------- 184 | 185 | Install epel repository 186 | 187 | .. code-block:: shell-session 188 | 189 | $ sudo yum install epel-release 190 | 191 | Install yum-utils 192 | 193 | .. code-block:: shell-session 194 | 195 | $ sudo yum install yum-utils 196 | 197 | Install ghostscript (Interpreter for PostScript language & PDF needed for ocrmypdf) 198 | 199 | .. code-block:: shell-session 200 | 201 | $ sudo yum install ghostscript 202 | 203 | Install wget (a utility for retrieving files using the HTTP or FTP protocols, used to download qpdf, which is needed for ocrmypdf) 204 | 205 | .. code-block:: shell-session 206 | 207 | $ sudo yum install wget 208 | 209 | Install qpdf 210 | 211 | .. code-block:: shell-session 212 | 213 | $ cd /usr/local/src 214 | $ wget https://github.com/qpdf/qpdf/releases/download/release-qpdf-9.1.0/qpdf-9.1.0.tar.gz 215 | $ # TODO tar -zxvf qpdf-9.1.0.tar.gz 216 | $ # TODO cd qpdf-9.1.0 217 | $ # TODO ./configure 218 | $ # TODO make 219 | $ # TODO make install 220 | 221 | Install python 3.6 222 | 223 | .. code-block:: shell-session 224 | 225 | $ sudo yum install python36 226 | $ sudo yum install python36-devel 227 | 228 | Install gcc 229 | 230 | .. code-block:: shell-session 231 | 232 | $ sudo yum install gcc 233 | $ sudo yum install gcc-c++ 234 | 235 | Install poppler-cpp-devel (Development files for C++ wrapper for building pdftotext) 236 | 237 | ..
code-block:: shell-session 238 | 239 | $ sudo yum install poppler-cpp-devel 240 | 241 | .. index:: Tesseract OCR Centos 7 installation 242 | 243 | Install tesseract 244 | 245 | .. code-block:: shell-session 246 | 247 | $ sudo yum-config-manager --add-repo https://download.opensuse.org/repositories/home:/Alexander_Pozdnyakov/CentOS_7/ 248 | $ sudo bash -c "echo 'gpgcheck=0' >> /etc/yum.repos.d/download.opensuse.org_repositories_home_Alexander_Pozdnyakov_CentOS_7*.repo" 249 | $ sudo yum update 250 | $ sudo yum install tesseract 251 | $ sudo yum install tesseract-langpack-rus # install a language pack you need 252 | 253 | Install pip 254 | 255 | .. code-block:: shell-session 256 | 257 | $ sudo yum install python-pip 258 | 259 | Install virtualenv 260 | 261 | .. code-block:: shell-session 262 | 263 | $ sudo pip install virtualenv 264 | 265 | Create the virtual env for django_ocr_server 266 | 267 | .. code-block:: shell-session 268 | 269 | $ sudo virtualenv /var/www/ocr_server/venv -p /usr/bin/python3.6 --distribute 270 | 271 | Give rights to the project folder to your user 272 | 273 | .. code-block:: shell-session 274 | 275 | $ sudo chown -R {your_user} /var/www/ocr_server/ 276 | 277 | Activate virtualenv 278 | 279 | .. code-block:: shell-session 280 | 281 | $ source /var/www/ocr_server/venv/bin/activate 282 | 283 | .. index:: Postgresql 11 Centos 7 installation and configuration 284 | 285 | Install postgresql 11 (The Postgresql version 9.2 that is installing in Centos 7 by default returns an error when applying migrations ) 286 | 287 | .. code-block:: shell-session 288 | 289 | $ sudo rpm -Uvh https://yum.postgresql.org/11/redhat/rhel-7-x86_64/pgdg-redhat-repo-latest.noarch.rpm 290 | $ sudo yum install postgresql11-server 291 | $ sudo yum install postgresql-devel 292 | $ sudo /usr/pgsql-11/bin/postgresql-11-setup initdb 293 | 294 | Edit /var/lib/pgsql/11/data/pg_hba.conf 295 | 296 | .. 
code-block:: text 297 | 298 | host all all 127.0.0.1/32 md5 299 | host all all ::1/128 md5 300 | 301 | .. code-block:: bash 302 | 303 | $ sudo systemctl enable postgresql-11 304 | $ sudo systemctl start postgresql-11 305 | $ sudo -u postgres psql 306 | 307 | Create the database and it's user 308 | 309 | .. code-block:: psql 310 | 311 | create database django_ocr_server encoding utf8; 312 | create user django_ocr_server with password 'django_ocr_server'; 313 | alter database django_ocr_server owner to django_ocr_server; 314 | alter user django_ocr_server createdb; -- if you want to run tests 315 | \q 316 | 317 | Install python postgres database driver 318 | 319 | .. code-block:: bash 320 | 321 | $ pip install psycopg2-binary # (on virtualenv django_ocr_server) 322 | 323 | Installing django-ocr-server (on virtualenv django_ocr_server). It installs Django as a dependency 324 | 325 | .. code-block:: shell-session 326 | 327 | $ pip install django-ocr-server 328 | 329 | Create django project (on virtualenv django_ocr_server) 330 | 331 | .. code-block:: shell-session 332 | 333 | $ cd /var/www/ocr_server 334 | $ django-admin startproject ocr_server . 335 | 336 | .. index:: settings.py Centos 7 337 | 338 | Edit ocr_server/settings.py 339 | 340 | Add applications to INSTALLED_APPS 341 | 342 | .. code-block:: python 343 | 344 | INSTALLED_APPS = [ 345 | ... 346 | 'rest_framework', 347 | 'rest_framework.authtoken', 348 | 'django_ocr_server', 349 | 'rest_framework_swagger', 350 | ] 351 | 352 | .. index:: database configuration Centos 7 353 | 354 | Configure database connection 355 | 356 | .. code-block:: python 357 | 358 | DATABASES = { 359 | 'default': { 360 | 'ENGINE': 'django.db.backends.postgresql_psycopg2', 361 | 'NAME': 'django_ocr_server', 362 | 'USER': 'django_ocr_server', 363 | 'PASSWORD': 'django_ocr_server', 364 | 'HOST': 'localhost', 365 | 'PORT': '', 366 | } 367 | } 368 | 369 | .. index:: urls.py Centos 7 370 | 371 | Edit ocr_server/urls.py 372 | 373 | .. 
code-block:: python 374 | 375 | from django.contrib import admin 376 | from django.urls import path, include 377 | from rest_framework.documentation import include_docs_urls 378 | 379 | admin.site.site_header = 'OCR Server Administration' 380 | admin.site.site_title = 'Welcome to OCR Server Administration Portal' 381 | 382 | urlpatterns = [ 383 | path('admin/', admin.site.urls, ), 384 | path('docs/', include_docs_urls(title='OCR Server API')), 385 | path('', include('django_ocr_server.urls'), ), 386 | ] 387 | 388 | Apply migrations (on virtualenv django_ocr_server) 389 | 390 | .. code-block:: shell-session 391 | 392 | $ python manage.py migrate 393 | 394 | Create superuser (on virtualenv django_ocr_server) 395 | 396 | .. code-block:: shell-session 397 | 398 | $ python manage.py createsuperuser 399 | 400 | Run server (on virtualenv django_ocr_server), than visit http://localhost:8000/ 401 | 402 | .. code-block:: shell-session 403 | 404 | $ python manage.py runserver -------------------------------------------------------------------------------- /doc/source/introduction.rst: -------------------------------------------------------------------------------- 1 | .. index:: Introduction 2 | 3 | Introduction 4 | ============ 5 | 6 | Django-ocr-server lets you recognize images and PDF. It is using tesseract for this. 7 | https://github.com/tesseract-ocr/tesseract 8 | 9 | Django-ocr-server saves the result in the database. 10 | To prevent repeated recognition of the same file, 11 | it also saves the hash sum of the uploaded file. 12 | Therefore, when reloading an already existing file, the result returns immediately, 13 | bypassing the recognition process, which significantly reduces the load on the server. 14 | 15 | If as a result of recognition a non-empty text is received, a searchable PDF is created. 16 | 17 | For the searchable PDF is calculated hash sum too. 
18 | Therefore, if you upload a searchable PDF created by Django-ocr-server back to the server, 19 | the file will not be recognized again; the result will be returned immediately. 20 | 21 | The server can process not only images but also PDFs. 22 | It also analyzes each PDF: if the PDF already contains real text, 23 | this text will be used and the file will not be recognized, 24 | which reduces the load on the server and improves the quality of the output. 25 | 26 | .. image:: django_ocr_server.png 27 | 28 | Storage of uploaded files and created searchable PDFs can be disabled in the settings. 29 | 30 | For uploaded files, created searchable PDFs, 31 | and the processing results as a whole, 32 | you can specify in the settings a lifetime after which the data will be automatically deleted. 33 | 34 | To interact with Django-ocr-server you can use the API or the admin interface. -------------------------------------------------------------------------------- /doc/source/running_tests.rst: -------------------------------------------------------------------------------- 1 | .. index:: Running tests 2 | .. index:: django_ocr_server.tests 3 | 4 | Running tests 5 | ============= 6 | Run the tests under your django_ocr_server virtual environment 7 | $ python manage.py test django_ocr_server.tests -------------------------------------------------------------------------------- /doc/source/usage_examples.rst: -------------------------------------------------------------------------------- 1 | .. index:: Usage examples 2 | 3 | Usage examples 4 | ============== 5 | You can download all examples from https://github.com/shmakovpn/django_ocr_server/tree/master/usage_examples 6 | 7 | .. index:: curl usage example 8 | 9 | curl 10 | ---- 11 | Use curl with '@' before the path of the file to upload 12 | .. code-block:: bash 13 | 14 | #!/usr/bin/env bash 15 | curl -F "file=@example.png" localhost:8000/upload/ 16 | 17 | .. 
index:: Python usage example 18 | 19 | python 20 | ------ 21 | Use requests.post function 22 | .. code-block:: python 23 | 24 | import requests 25 | 26 | 27 | with open("example.png", 'rb') as fp: 28 | print(requests.post("http://localhost:8000/upload/", 29 | files={'file': fp}, ).content) 30 | 31 | .. index:: Perl usage example 32 | 33 | perl 34 | ---- 35 | Use LWP::UserAgent and HTTP::Request::Common 36 | .. code-block:: perl 37 | 38 | #!/usr/bin/perl 39 | use strict; 40 | use warnings FATAL => 'all'; 41 | use LWP::UserAgent; 42 | use HTTP::Request::Common; 43 | 44 | my $ua = LWP::UserAgent->new; 45 | my $url = "http://localhost:8000/upload/"; 46 | my $fname = "example.png"; 47 | 48 | my $req = POST($url, 49 | Content_Type => 'form-data', 50 | Content => [ 51 | file => [ $fname ] 52 | ]); 53 | 54 | my $response = $ua->request($req); 55 | 56 | if ($response->is_success()) { 57 | print "OK: ", $response->content; 58 | } else { 59 | print "Failed: ", $response->as_string; 60 | } 61 | 62 | .. index:: php usage example 63 | 64 | php 65 | --- 66 | Use CURLFile($file, $mime, $name) 67 | .. 
code-block:: php 68 | 69 | new CURLFile($file, $mime, $name), 85 | ); 86 | 87 | curl_setopt($ch, CURLOPT_POSTFIELDS, $postData); 88 | 89 | // Execute the request 90 | $response = curl_exec( $ch); 91 | echo($response); 92 | 93 | curl_close ($ch); 94 | 95 | ?> -------------------------------------------------------------------------------- /install_ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # install_ubuntu.sh automates installing Django-ocr-server 3 | # author: shmakovpn 4 | # date: 2019-04-23 5 | 6 | PROD='django-ocr-server' 7 | PACKAGE='django_ocr_server' 8 | USER='django_ocr_server' # OS username for Django project 9 | PROJECT='ocr_server' # name of the Django project 10 | PWD=$(dirname $(readlink -f $0)) 11 | 12 | # installs a package using apt (apt install -y package name) 2019-04-23 13 | # check that the package successfully installed 14 | install_apt() { 15 | dpkg -l | grep -qi "^ii\s\+$1\(\s\|:\)" > /dev/null 2>&1 # check package already installed 16 | if [ $? -ne 0 ]; then 17 | echo "Info: package '$1' is not installed. Installing" 18 | apt install -y $1 19 | if [ $? -ne 0 ]; then 20 | echo "Error. Could not install '$1'. 
Install it manually, then run this script again" 21 | exit 1 22 | else 23 | echo "Info: package '$1' installed successfully" 24 | fi 25 | else 26 | echo "Info: package '$1' is already installed" 27 | fi 28 | } 29 | 30 | 31 | # exec $1 from $USER, send evironment *_PROXY and PATH 2019-04-24 32 | exec_from_user() { 33 | sudo http_proxy=$http_proxy https_proxy=$https_proxy HTTP_PROXY=$HTTP_PROXY HTTPS_PROXY=$HTTPS_PROXY PATH=$PATH -H -i -u $USER $1 34 | } 35 | 36 | 37 | echo "This script automatically installs $PROD" 38 | 39 | # checks root privileges 40 | if [ $(id -u) -ne 0 ]; then 41 | echo "This script must be run as root" 42 | exit 1 43 | fi 44 | 45 | # DEBUG 01 46 | # userdel -r "$USER" # remove $USER and it's home directory 47 | # END DEBUG 01 48 | 49 | # check that user exists 50 | id "$USER" > /dev/null 2>&1 51 | if [ $? -ne 0 ]; then 52 | echo "Info: The user '$USER' does not exist, create it." 53 | # creating user for Django-ocr-server 54 | useradd -m $USER -s /bin/bash 55 | id "$USER" > /dev/null 2>&1 56 | if [ $? -ne 0 ]; then 57 | echo "Error. Could not create the user '$USER'" 58 | fi 59 | else 60 | echo "INFO: The user '$USER' is already exits" 61 | fi 62 | echo "$PROD will be installed under user '$USER'" 63 | 64 | eval "USER_HOME=$(echo ~$USER)" 65 | 66 | PATH="$USER_HOME/.local/bin:$PATH" # 67 | 68 | ls -alF $PWD/dist | grep -e "$PROD.*\.tar\.gz$" > /dev/null 2>&1 69 | # checking that the package django-ocr-server-{version}.tar.gz exists in the $PWD/dist directory 70 | if [ $? -eq 0 ]; then 71 | # the package django-ocr-server-{version}.tar.gz found in the $PWD/dist directory, copying it to the $USER directory 72 | cp $PWD/dist/$PROD*tar.gz $USER_HOME > /dev/null 2>&1 73 | if [ $? -ne 0 ]; then 74 | echo "Error. 
Could not copy $PWD/dist/$PROD*tar.gz to $USER_HOME" 75 | exit 1 76 | fi 77 | else 78 | # the package does not exist in the $PWD/dist directory 79 | # checking that the package exists in the $PWD directory 80 | ls -alF $PWD | grep -e "$PROD.*\.tar\.gz$" > /dev/null 2>&1 81 | if [ $? -eq 0 ]; then 82 | # the package django-ocr-server-{version}.tar.gz found in the $PWD directory, copying it to the $USER directory 83 | cp $PWD/$PROD*tar.gz $USER_HOME > /dev/null 2>&1 84 | if [ $? -ne 0 ]; then 85 | echo "Error. Could not copy $PWD/$PROD*tar.gz to $USER_HOME" 86 | exit 1 87 | fi 88 | else 89 | # the package django-ocr-server-{version}.tar.gz does not exist neither in the $PWD/dist nor in the $PWD direcory 90 | # do nothing, further installation process will try to download the package from PyPI 91 | echo "Info. The file $PROD*tar.gz not found, further installation process will try to download the package from PyPI" 92 | fi 93 | fi 94 | 95 | 96 | # Installing packages 97 | install_apt "g++" 98 | install_apt "libpoppler-cpp-dev" 99 | install_apt "tesseract-ocr" 100 | install_apt "tesseract-ocr-rus" 101 | install_apt "python3.7" 102 | install_apt "python3.7-dev" 103 | install_apt "python-pip" 104 | # install_apt "python-setuptools" # it will be installed later using pip 105 | 106 | # create installation script in $USER environment 107 | echo "#!/usr/bin/env bash" > $USER_HOME/install.sh 108 | echo "PACKAGE=$PACKAGE" >> $USER_HOME/install.sh 109 | echo "PROJECT=$PROJECT" >> $USER_HOME/install.sh 110 | echo "PROD=$PROD" >> $USER_HOME/install.sh 111 | echo "PROJECT_PATH=$PROJECT_PATH" >> $USER_HOME/install.sh 112 | chmod +x $USER_HOME/install.sh 113 | chown $USER:$USER $USER_HOME/install.sh 114 | cat $PWD/install_ubuntu/install.sh >> $USER_HOME/install.sh 115 | 116 | exec_from_user "$USER_HOME/install.sh" 117 | 118 | echo 'Installation successfully finished' -------------------------------------------------------------------------------- /install_ubuntu/install.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # cheks that pip package installed in user environment 4 | search_pip() { 5 | echo $1 | grep $PROD > /dev/null 2>&1 6 | if [ $? -eq 0 ]; then 7 | # looking for django-ocr-server 8 | pip freeze --all | grep $PROD > /dev/null 2>&1 9 | else 10 | # looking for another package 11 | pip freeze --all | grep -qi "^$1=" > /dev/null 2>&1 12 | fi 13 | } 14 | 15 | 16 | # installs a package using pip (sudo -H -u $USER pip install --user $1) 2019-04-24 17 | # checks that the package successfully installed 18 | install_pip() { 19 | search_pip $1 20 | if [ $? -ne 0 ]; then 21 | echo "INFO: '$1' is not installed. Installing" 22 | echo $1 | grep $PROD > /dev/null 2>&1 23 | if [ $? -eq 0 ]; then 24 | #installing django-ocr-server 25 | pip install $1 26 | else 27 | #installing another pip package 28 | pip install --user $1 29 | fi 30 | search_pip $1 31 | if [ $? -ne 0 ]; then 32 | echo "Error. Could not install '$1'. Install it manually, then run this script again" 33 | exit 1 34 | fi 35 | else 36 | echo "INFO: '$1' is already installed" 37 | fi 38 | } 39 | 40 | # appends line $1 to $USER_HOME/.bashrc if the string does not exist 2019-04-23 41 | append_bashrc() { 42 | #checking that the line $1 does not exist 43 | cat $HOME/.bashrc | grep -qi "^$1$" > /dev/null 2>&1 44 | if [ $? -ne 0 ]; then 45 | echo "$1" >> $HOME/.bashrc 46 | echo "INFO: The line '$1' appended into '~/.bashrc'" 47 | else 48 | echo "INFO: The line '$1' already exists in '~/.bashrc'" 49 | fi 50 | } 51 | 52 | # checks that app $1 is in the django project settings file 53 | is_app_in_django_settings() { 54 | # checking that the django project settings file exists 55 | if [ ! -f $SETTINGS_FILE ]; then 56 | echo "Error: The django project settings file '$SETTINGS_FILE' does not exist" 57 | exit 1 58 | fi 59 | cat $SETTINGS_FILE | grep -Pzo "INSTALLED_APPS\s?=\s?\[[\s\w\.,']*$1[\s\w\.,']*\]\n?" 
> /dev/null 2>&1 60 | # now $?=0 if app is in settings file 61 | # $? not 0 otherwise 62 | } 63 | 64 | # adds app $1 to the django project settings 65 | add_app2django_settings() { 66 | is_app_in_django_settings "$1" 67 | if [ $? -ne 0 ]; then 68 | echo "Info. The app '$1' is not in the django project settings file '$SETTINGS_FILE'. Adding." 69 | sed -i -e '1h;2,$H;$!d;g' -re "s/(INSTALLED_APPS\s?=\s?\[[\n '._a-zA-Z,]*)/\1 '$1',\n/g" $SETTINGS_FILE 70 | # checking that app $1 successfully added to django project settings file 71 | is_app_in_django_settings $1 72 | if [ $? -ne 0 ]; then 73 | echo "Error. Could not add the app '$1' to the django project settings file '$SETTINGS_FILE'. Add it manually, then run this script again." 74 | exit 1 75 | else 76 | echo "Info. The app '$1' was successfully added to the django settings file '$SETTINGS_FILE'." 77 | fi 78 | else 79 | echo "Info. The app '$1' is already in the django project settings file '$SETTINGS_FILE'" 80 | fi 81 | } 82 | 83 | IMPORT_STRING=0 # The boolean flag, means that the url string is adding to the django project urls file is an import string 84 | 85 | # checks that line is in the django project urls file 86 | is_line_in_django_urls() { 87 | # checking that the django project urls file exists 88 | if [ ! -f $URLS_FILE ]; then 89 | echo "Error. The django project urls fie '$URLS_FILE' does not exist" 90 | exit 1 91 | fi 92 | echo "$1" | grep -iP "^from .+ import .+$" > /dev/null 2>&1 93 | # check if $1 is an import string or is an url string 94 | if [ $? -eq 0 ]; then 95 | echo "Info. The url string='$1' is an import string" 96 | IMPORT_STRING=1 97 | cat $URLS_FILE | grep -i "^$1$" > /dev/null 2>&1 98 | else 99 | echo "Info. The url string='$1' is not an import string" 100 | IMPORT_STRING=0 101 | ESCAPED_STRING=`echo $1 | sed -re "s/\(/\\\\\(/g" | sed -re "s/\)/\\\\\)/g"` 102 | GREP_REQ="urlpatterns\s?=\s?\[[\s\w\.,\(\)'=\/]*$ESCAPED_STRING[\s\w\.,\(\)'=\/]*\]\n?" 
103 | cat $URLS_FILE | grep -Pzo "$GREP_REQ" > /dev/null 2>&1 104 | fi 105 | # return: $? will be not 0 if $1 does not exist in the $URLS_FILE 106 | } 107 | 108 | # adds string $1 to the django project urls file 109 | add_str2django_urls() { 110 | is_line_in_django_urls "$1" 111 | # checking that string $1 does not exist in django project urls file 112 | if [ $? -ne 0 ]; then 113 | echo "Info. The url string='$1' does not exists in URLS_FILE='$URLS_FILE'" 114 | if [ $IMPORT_STRING -eq 1 ]; then 115 | echo "Info. Adding the url string='$1' as an import string" 116 | sed -i -re "s/^(urlpatterns)/$1\n\n\1/g" $URLS_FILE 117 | is_line_in_django_urls "$1" 118 | if [ $? -ne 0 ]; then 119 | echo "Error: Could not add the url string='$1' to the django project urls file '$URLS_FILE'. Add it manually, then run this script again." 120 | exit 1 121 | else 122 | echo "Info. The url string='$1' was successfully added to the django urls file '$URLS_FILE'." 123 | fi 124 | else 125 | echo "Info. Adding the url string='$1' as a path string" 126 | sed -i -e '1h;2,$H;$!d;g' -re "s@(urlpatterns\s?=\s?\[[\n '._a-zA-Z,()=\/]*)@\1 $1\n@g" $URLS_FILE 127 | is_line_in_django_urls "$1" 128 | if [ $? -ne 0 ]; then 129 | echo "Error: Could not add the url string='$1' to the django project urls file '$URLS_FILE'. Add it manually, then run this script again." 130 | exit 1 131 | else 132 | echo "Info. The url string='$1' was successfully added to the django urls file '$URLS_FILE'." 133 | fi 134 | fi 135 | else 136 | echo "Info. 
The url string='$1' already exists in URLS_FILE='$URLS_FILE'" 137 | fi 138 | } 139 | 140 | 141 | # installing setuptools 142 | install_pip "setuptools" 143 | # installing 'virtualenv' under user 144 | install_pip "virtualenv" 145 | # append_bashrc "export PATH=$HOME/.local/bin:$PATH" 146 | append_bashrc "export PATH=$PATH" # 2019-05-04 147 | # installing 'wheel' under user 148 | install_pip "wheel" 149 | # installing 'virtualenvwrapper' under user 150 | install_pip "virtualenvwrapper" 151 | export VIRTUALENVWRAPPER_PYTHON=$(head -n 1 `which pip` | sed s/..//) 152 | append_bashrc "VIRTUALENVWRAPPER_PYTHON=$VIRTUALENVWRAPPER_PYTHON" 153 | append_bashrc ". $HOME/.local/bin/virtualenvwrapper.sh" 154 | source $HOME/.local/bin/virtualenvwrapper.sh 155 | 156 | # checking that django_ocr_server virtualenv exists 157 | ls -alF $HOME/.virtualenvs/ | grep "$PACKAGE/\$" > /dev/null 2>&1 158 | if [ $? -ne 0 ]; then 159 | # django_ocr_server virtualenv does not exist 160 | echo "Info: virtual environment '$PACKAGE' does not exit. Installing it." 161 | mkvirtualenv $PACKAGE -p /usr/bin/python3.7 162 | ls -alF $HOME/.virtualenvs/ | grep "$PACKAGE/\$" > /dev/null 2>&1 163 | if [ $? -ne 0 ]; then 164 | echo "Error. Could not create virtual environment '$PACKAGE'. Create it manually and run this script once again." 165 | exit 1 166 | else 167 | echo "Info. The virtual environment '$PACKAGE' created successfully." 168 | fi 169 | else 170 | # django_ocr_server virtualenv exists 171 | echo "Info: virtual environment '$PACKAGE' already exits. Working on it" 172 | workon $PACKAGE 173 | fi 174 | 175 | # checking that django-ocr-serer-{version}.tar.gz in $HOME folder 176 | ls -alF $HOME | grep -e "$PROD.*\.tar.gz$" > /dev/null 2>&1 177 | if [ $? -ne 0 ]; then 178 | # django-ocr-server-{version}.tar.gz package was not found in $HOME directory, try to install it from PyPI 179 | echo "Info. 
The $PROD-{version}.tar.gz package was not found in '$HOME' directory, try to install it from PyPI" 180 | install_pip "$PROD" 181 | else 182 | # The django-ocr-server package was found in $HOME directory, installing it. 183 | install_pip $HOME/`ls -alF | grep -e "$PROD.*\.tar.gz$" | sed -re "s/^.*($PROD.*\.tar\.gz)$/\1/"` 184 | fi 185 | 186 | # checking that Django project exists 187 | ls -alF $HOME | grep "$PROJECT/\$" > /dev/null 2>&1 188 | if [ $? -ne 0 ]; then 189 | echo "Info: Django project folder '~/$PROJECT' does not exist. Creating Django project" 190 | cd $HOME 191 | django-admin startproject $PROJECT 192 | ls -alF $HOME | grep "$PROJECT/\$" > /dev/null 2>&1 193 | if [ $? -ne 0 ]; then 194 | echo "Error: Could not start '$PROJECT' django project. Start it manually, then run this script again." 195 | exit 1 196 | fi 197 | cd $HOME/$PROJECT 198 | else 199 | echo "Info: Django project folder '~/$PROJECT' exists" 200 | fi 201 | 202 | SETTINGS_FILE="$HOME/$PROJECT/$PROJECT/settings.py" # the django project settings file 203 | add_app2django_settings "rest_framework" 204 | add_app2django_settings "rest_framework.authtoken" 205 | add_app2django_settings "django_ocr_server" 206 | add_app2django_settings "rest_framework_swagger" 207 | 208 | 209 | URLS_FILE="$HOME/$PROJECT/$PROJECT/urls.py" # the django project urls file 210 | add_str2django_urls "from django.contrib import admin" 211 | add_str2django_urls "from django.urls import path" 212 | add_str2django_urls "from django.urls import include" 213 | add_str2django_urls "from rest_framework.documentation import include_docs_urls" 214 | add_str2django_urls "path('docs/', include_docs_urls(title='OCR Server API'))," 215 | add_str2django_urls "path('', include('django_ocr_server.urls'), )," 216 | add_str2django_urls "path('admin/', admin.site.urls)," 217 | 218 | # applying migrations 219 | cd $HOME/$PROJECT 220 | python manage.py migrate 221 | 222 | # creating the django project superuser 223 | echo "from 
django.contrib.auth.models import User; user=User.objects.get_or_create(username='admin')[0]; user.is_staff=True; user.is_superuser=True; user.set_password('admin'); user.save(); print('Superuser successfully created with username \'admin\' and password \'admin\'')" | python manage.py shell 224 | 225 | # howto message 226 | echo "Go to '$HOME/$PROJECT'; workon '$PACKAGE'; python manage.py runserver; then goto http://localhost:8000" -------------------------------------------------------------------------------- /makedoc.py: -------------------------------------------------------------------------------- 1 | """ 2 | makedoc.py 3 | 4 | Generates the documentation of django_ocr_server 5 | 6 | Usage: 7 | 8 | .. code-block:: bash 9 | 10 | python makedoc.py 11 | 12 | Author: shmakovpn 13 | Date: 2020-01-13 14 | """ 15 | import os 16 | SCRIPT_DIR: str = os.path.dirname(os.path.abspath(__file__)) 17 | 18 | 19 | def run_sphinx() -> None: 20 | docs_dir: str = os.path.join(SCRIPT_DIR, 'doc') 21 | docs_source_dir: str = os.path.join(docs_dir, 'source') 22 | build_dir: str = os.path.join(docs_dir, 'build') 23 | html_dir: str = os.path.join(build_dir, 'html') 24 | cmd: str = f'sphinx-build -b html "{docs_source_dir}" "{html_dir}"' 25 | os.system(cmd) 26 | print('__END__') 27 | 28 | 29 | if __name__ == '__main__': 30 | run_sphinx() 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | django 2 | regex 3 | PyPDF 4 | pdftotext 5 | pytesseract 6 | ocrmypdf 7 | djangorestframework 8 | beautifulsoup4 9 | django-rest-swagger -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # djnago_ocr_server setup.cfg 2 | # author shmakovpn 3 | # date 2019-05-28 4 | [metadata] 5 | description-file = README.rst 6 | license_file = LICENSE.txt 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | setup.py 3 | ++++++++ 4 | 5 | django-ocr-server installation script 6 | 7 | | Author: shmakovpn 8 | | Date: 2019-04-16/2021-01-19 9 | """ 10 | 11 | from setuptools import setup, find_packages 12 | import os 13 | from django_ocr_server.version import VERSION 14 | 15 | # allow setup.py to be run from any path 16 | os.chdir(os.path.normpath(os.path.join(os.path.abspath(__file__), os.pardir))) 17 | 18 | with open(os.path.join(os.path.dirname(__file__), 'README.rst')) as readme: 19 | long_description = readme.read() 20 | 21 | setup( 22 | name='django-ocr-server', 23 | version=VERSION, 24 | packages=find_packages(), 25 | author='shmakovpn', 26 | author_email='shmakovpn@yandex.ru', 27 | url='https://github.com/shmakovpn/django_ocr_server', 28 | download_url= 29 | f'https://github.com/shmakovpn/django_ocr_server/archive/{VERSION}.zip', 30 | # desctiption='Django OCR Server', 31 | long_description=long_description, 32 | entry_points={ 33 | 'console_sripts': [], 34 | }, 35 | install_requires=[ 36 | 'Django>=2.1.7', 37 | 'regex>=2019.2.21', # Used to determine that the text layer of the loaded PDF document was automatically created as a result of recognition of images by the scanner as text in English, while the images contain text in Cyrillic. 
38 | 'PyPDF2>=1.26.0', # used to analizing PDF documents uploaded to the server 39 | 'pdftotext>=2.1.1', # used to extracting text from PDF documents uploaded to the server 40 | 'pytesseract>=0.2.6', # wrapper for tesseract-ocr https://github.com/tesseract-ocr/tesseract 41 | 'ocrmypdf>=8.2.0', # used for pdf recognition 42 | 'djangorestframework>=3.9.2', # used for API 43 | 'beautifulsoup4>=4.7.1', # used for tests 44 | 'django-rest-swagger>=2.2.0' # used for documentation 45 | ], 46 | include_package_data=True, 47 | # test_suite='tests', 48 | ) 49 | -------------------------------------------------------------------------------- /static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/static/favicon.ico -------------------------------------------------------------------------------- /upload_to_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # upload_to_pypi.sh uploads the builded python package of django_ocr_server 3 | # with the version set in django_ocr_server/__init__.py file in the __version__ variable 4 | # to pypi.org 5 | 6 | # Author: shmakovpn 7 | # Date: 2019-10-11 8 | 9 | SCRIPT_DIR=$(dirname $(readlink -f $0)) # directory of this script 10 | PACKAGE='django_ocr_server' 11 | 12 | PACKAGE_INIT_FILE=${SCRIPT_DIR}/${PACKAGE}/__init__.py 13 | 14 | if [[ ! 
-f ${PACKAGE_INIT_FILE} ]]; then 15 | echo "Error: init file '${PACKAGE_INIT_FILE}' does not exist" 16 | exit 1 17 | fi 18 | 19 | VERSION=$(cat "${PACKAGE_INIT_FILE}" | sed -re '/^__version__/!d; s/ //g; s/^__version__=//; s/#.*$//' -re "s/'//g") 20 | 21 | if [[ -z ${VERSION} ]]; then 22 | echo "Error: could not get version from '${PACKAGE_INIT_FILE}'" 23 | fi 24 | 25 | echo "Info: got version '${VERSION}'" 26 | 27 | PACKAGE_FILE="dist/$(echo ${PACKAGE} | sed -re 's/_/-/g')-${VERSION}.tar.gz" 28 | 29 | if [[ ! -f ${PACKAGE_FILE} ]]; then 30 | echo "Error: package file '${PACKAGE_FILE}' does not exist" 31 | exit 1 32 | fi 33 | 34 | python -m twine upload "${PACKAGE_FILE}" 35 | -------------------------------------------------------------------------------- /usage_examples/curl_example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl -F "name=@example.png" localhost:8000/upload/ -------------------------------------------------------------------------------- /usage_examples/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shmakovpn/django_ocr_server/4d694629c39c18a6c13bcdfafdb8258b78e5a859/usage_examples/example.png -------------------------------------------------------------------------------- /usage_examples/perl_example.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings FATAL => 'all'; 4 | use LWP::UserAgent; 5 | use HTTP::Request::Common; 6 | 7 | my $ua = LWP::UserAgent->new; 8 | my $url = "http://localhost:8000/upload/"; 9 | my $fname = "example.png"; 10 | 11 | my $req = POST($url, 12 | Content_Type => 'form-data', 13 | Content => [ 14 | file => [ $fname ] 15 | ]); 16 | 17 | my $response = $ua->request($req); 18 | 19 | if ($response->is_success()) { 20 | print "OK: ", $response->content; 21 | } else { 22 | print "Failed: ", 
$response->as_string; 23 | } -------------------------------------------------------------------------------- /usage_examples/php_example.php: -------------------------------------------------------------------------------- 1 | new CURLFile($file, $mime, $name), 17 | ); 18 | 19 | curl_setopt($ch, CURLOPT_POSTFIELDS, $postData); 20 | 21 | // Execute the request 22 | $response = curl_exec($ch); 23 | echo($response); 24 | 25 | curl_close ($ch); 26 | 27 | ?> 28 | -------------------------------------------------------------------------------- /usage_examples/python_example.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | with open("example.png", 'rb') as fp: 5 | print(requests.post("http://localhost:8000/upload/", 6 | files={'file': fp}, ).content) 7 | --------------------------------------------------------------------------------