├── .nvmrc ├── .stylelintignore ├── config ├── packages │ ├── test │ │ ├── twig.yaml │ │ ├── webpack_encore.yaml │ │ ├── framework.yaml │ │ ├── web_profiler.yaml │ │ └── monolog.yaml │ ├── twig.yaml │ ├── mailer.yaml │ ├── prod │ │ ├── routing.yaml │ │ ├── cache.yaml │ │ ├── webpack_encore.yaml │ │ ├── deprecations.yaml │ │ └── monolog.yaml │ ├── toolforge.yaml │ ├── doctrine.yaml │ ├── assets.yaml │ ├── dev │ │ ├── web_profiler.yaml │ │ ├── cache.yaml │ │ └── monolog.yaml │ ├── cache.yaml │ ├── routing.yaml │ ├── nelmio_api_doc.yaml │ ├── framework.yaml │ └── webpack_encore.yaml ├── routes │ ├── dev │ │ ├── framework.yaml │ │ └── web_profiler.yaml │ └── annotations.yaml ├── preload.php ├── routes.yaml ├── bundles.php └── services.yaml ├── .stylelintrc.json ├── docker ├── install.sh ├── run.sh └── setup.sh ├── public ├── favicon.ico └── index.php ├── .minus-x.json ├── templates ├── bundles │ └── NelmioApiDocBundle │ │ └── SwaggerUi │ │ └── index.html.twig ├── _tesseract_options.html.twig ├── _transkribus_help.html.twig ├── _transkribus_options.html.twig ├── transkribus.html.twig └── base.html.twig ├── src ├── Exception │ ├── EngineNotFoundException.php │ └── OcrException.php ├── Engine │ ├── EngineResult.php │ ├── EngineFactory.php │ ├── Image.php │ ├── GoogleCloudVisionEngine.php │ ├── TesseractEngine.php │ └── TranskribusEngine.php ├── Controller │ └── TranskribusController.php ├── Kernel.php ├── Twig │ └── AppExtension.php └── EventListener │ └── ExceptionListener.php ├── assets ├── images │ ├── OOjs_UI_icon_move.svg │ ├── Crop_-_The_Noun_Project.svg │ └── WikimediaOCR-logo.svg └── styles │ └── app.css ├── tests ├── OcrTestCase.php ├── bootstrap.php ├── fixtures │ └── google-account-keyfile.json ├── Twig │ └── AppExtensionTest.php └── Controller │ └── OcrControllerTest.php ├── bin ├── phpunit └── console ├── phpcs.xml.dist ├── .gitignore ├── Gruntfile.js ├── Dockerfile ├── i18n ├── vec.json ├── ja.json ├── ka.json ├── ce.json ├── lb.json ├── ro.json ├── pnb.json 
├── br.json ├── el.json ├── roa-tara.json ├── sat.json ├── hu.json ├── ps.json ├── io.json ├── te.json ├── az.json ├── lt.json ├── cs.json ├── pl.json ├── ru.json ├── vi.json ├── sv.json ├── zh-hant.json ├── sk.json ├── zh-hans.json ├── tl.json ├── id.json ├── fi.json ├── ko.json ├── he.json ├── fa.json ├── ar.json ├── sl.json ├── krc.json ├── it.json ├── tr.json ├── ms.json ├── bn.json ├── en.json ├── ia.json └── mk.json ├── package.json ├── .phpcs.xml ├── README.md ├── .phan └── config.php ├── .github └── workflows │ └── ci.yml ├── .env ├── phpunit.xml.dist ├── check_tesseract.sh ├── webpack.config.js ├── composer.json └── CONTRIBUTING.md /.nvmrc: -------------------------------------------------------------------------------- 1 | 16 2 | -------------------------------------------------------------------------------- /.stylelintignore: -------------------------------------------------------------------------------- 1 | public/bundles/ 2 | 3 | -------------------------------------------------------------------------------- /config/packages/test/twig.yaml: -------------------------------------------------------------------------------- 1 | twig: 2 | strict_variables: true 3 | -------------------------------------------------------------------------------- /.stylelintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "stylelint-config-wikimedia" 3 | } 4 | -------------------------------------------------------------------------------- /docker/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | npm install 5 | composer install 6 | -------------------------------------------------------------------------------- /config/packages/test/webpack_encore.yaml: -------------------------------------------------------------------------------- 1 | #webpack_encore: 2 | # strict_mode: false 3 | 
-------------------------------------------------------------------------------- /config/packages/twig.yaml: -------------------------------------------------------------------------------- 1 | twig: 2 | default_path: '%kernel.project_dir%/templates' 3 | -------------------------------------------------------------------------------- /config/packages/mailer.yaml: -------------------------------------------------------------------------------- 1 | framework: 2 | mailer: 3 | dsn: '%env(MAILER_DSN)%' 4 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wikimedia/wikimedia-ocr/HEAD/public/favicon.ico -------------------------------------------------------------------------------- /config/packages/prod/routing.yaml: -------------------------------------------------------------------------------- 1 | framework: 2 | router: 3 | strict_requirements: null 4 | -------------------------------------------------------------------------------- /config/packages/toolforge.yaml: -------------------------------------------------------------------------------- 1 | toolforge: 2 | intuition: 3 | domain: 'wikimedia-ocr' 4 | -------------------------------------------------------------------------------- /config/packages/doctrine.yaml: -------------------------------------------------------------------------------- 1 | doctrine: 2 | dbal: 3 | connections: 4 | default: 5 | -------------------------------------------------------------------------------- /.minus-x.json: -------------------------------------------------------------------------------- 1 | { 2 | "ignore": [ 3 | "./bin/.phpunit/phpunit-8.5.34-0/phpunit" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /config/packages/test/framework.yaml: 
-------------------------------------------------------------------------------- 1 | framework: 2 | test: true 3 | session: 4 | storage_factory_id: session.storage.factory.mock_file # mock-file sessions in tests; storage_id is ignored once a factory id is set 5 | -------------------------------------------------------------------------------- /config/packages/assets.yaml: -------------------------------------------------------------------------------- 1 | framework: 2 | assets: 3 | json_manifest_path: '%kernel.project_dir%/public/build/manifest.json' 4 | -------------------------------------------------------------------------------- /config/routes/dev/framework.yaml: -------------------------------------------------------------------------------- 1 | _errors: 2 | resource: '@FrameworkBundle/Resources/config/routing/errors.xml' 3 | prefix: /_error 4 | -------------------------------------------------------------------------------- /config/packages/test/web_profiler.yaml: -------------------------------------------------------------------------------- 1 | web_profiler: 2 | toolbar: false 3 | intercept_redirects: false 4 | 5 | framework: 6 | profiler: { collect: false } 7 | -------------------------------------------------------------------------------- /templates/bundles/NelmioApiDocBundle/SwaggerUi/index.html.twig: -------------------------------------------------------------------------------- 1 | {% extends '@!NelmioApiDoc/SwaggerUi/index.html.twig' %} 2 | 3 | {% block header %} 4 | {% endblock %} 5 | -------------------------------------------------------------------------------- /config/packages/dev/web_profiler.yaml: -------------------------------------------------------------------------------- 1 | web_profiler: 2 | toolbar: true 3 | intercept_redirects: false 4 | 5 | framework: 6 | profiler: { only_exceptions: false } 7 | -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker run -ti -p 
${DOCKER_OCR_PORT:-8000}:8000 --mount type=bind,source="$(cd "$(dirname $0)/.."; pwd)",target=/wikimedia-ocr wikimedia-ocr:latest 4 | -------------------------------------------------------------------------------- /config/routes/annotations.yaml: -------------------------------------------------------------------------------- 1 | controllers: 2 | resource: ../../src/Controller/ 3 | type: annotation 4 | 5 | kernel: 6 | resource: ../../src/Kernel.php 7 | type: annotation 8 | -------------------------------------------------------------------------------- /config/packages/dev/cache.yaml: -------------------------------------------------------------------------------- 1 | framework: 2 | cache: 3 | app: cache.adapter.filesystem 4 | pools: 5 | app_keys: 6 | adapter: cache.adapter.filesystem 7 | -------------------------------------------------------------------------------- /config/packages/prod/cache.yaml: -------------------------------------------------------------------------------- 1 | framework: 2 | cache: 3 | app: cache.adapter.filesystem 4 | pools: 5 | app_keys: 6 | adapter: cache.adapter.filesystem 7 | -------------------------------------------------------------------------------- /config/packages/prod/webpack_encore.yaml: -------------------------------------------------------------------------------- 1 | #webpack_encore: 2 | # Cache the entrypoints.json (rebuild Symfony's cache when entrypoints.json changes) 3 | # Available in version 1.2 4 | #cache: true 5 | -------------------------------------------------------------------------------- /src/Exception/EngineNotFoundException.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | move 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /config/packages/prod/deprecations.yaml: -------------------------------------------------------------------------------- 1 | # As of Symfony 5.1, deprecations are logged in the 
dedicated "deprecation" channel when it exists 2 | #monolog: 3 | # channels: [deprecation] 4 | # handlers: 5 | # deprecation: 6 | # type: stream 7 | # channels: [deprecation] 8 | # path: php://stderr 9 | -------------------------------------------------------------------------------- /config/routes.yaml: -------------------------------------------------------------------------------- 1 | #index: 2 | # path: / 3 | # controller: App\Controller\DefaultController::index 4 | 5 | app.swagger_ui: 6 | path: /api/doc 7 | methods: GET 8 | defaults: { _controller: nelmio_api_doc.controller.swagger_ui } 9 | 10 | app.swagger: 11 | path: /openapi.json 12 | methods: GET 13 | defaults: { _controller: nelmio_api_doc.controller.swagger } 14 | -------------------------------------------------------------------------------- /tests/OcrTestCase.php: -------------------------------------------------------------------------------- 1 | projectDir = self::$kernel->getProjectDir(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | bootEnv( dirname( __DIR__ ) . 
'/.env' ); 12 | } 13 | -------------------------------------------------------------------------------- /tests/fixtures/google-account-keyfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "project_id": "", 4 | "private_key_id": "", 5 | "private_key": "", 6 | "client_email": "", 7 | "client_id": "", 8 | "auth_uri": "https://accounts.google.com/o/oauth2/auth", 9 | "token_uri": "https://oauth2.googleapis.com/token", 10 | "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", 11 | "client_x509_cert_url": "" 12 | } 13 | -------------------------------------------------------------------------------- /docker/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "Building image" 6 | docker build -t wikimedia-ocr "$(dirname "$0")/.." 7 | 8 | echo "Running composer install && npm install" 9 | docker run --mount type=bind,source="$(cd "$(dirname "$0")/.."; pwd)",target=/wikimedia-ocr wikimedia-ocr:latest bash ./docker/install.sh 10 | 11 | echo $'\e[1;32m'Everything looks good. Run ./docker/run.sh and an instance should be available at the default port \(8000\) $'\e[0m' 12 | -------------------------------------------------------------------------------- /config/packages/nelmio_api_doc.yaml: -------------------------------------------------------------------------------- 1 | nelmio_api_doc: 2 | documentation: 3 | info: 4 | title: Wikimedia OCR 5 | description: A web service for Tesseract, Google and Transkribus OCR engines. 
6 | version: 1.0.0 7 | areas: 8 | path_patterns: 9 | - ^/api$ 10 | - ^/api/available_langs$ 11 | - ^/api/tesseract/available_psms$ 12 | - ^/api/transkribus/available_line_ids$ 13 | -------------------------------------------------------------------------------- /bin/phpunit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | bin/ 14 | config/ 15 | public/ 16 | src/ 17 | tests/ 18 | 19 | 20 | -------------------------------------------------------------------------------- /config/packages/test/monolog.yaml: -------------------------------------------------------------------------------- 1 | monolog: 2 | channels: ['main', 'nested', 'tesseract'] 3 | handlers: 4 | main: 5 | type: fingers_crossed 6 | action_level: error 7 | handler: nested 8 | excluded_http_codes: [404, 405] 9 | channels: ["!event"] 10 | nested: 11 | type: stream 12 | path: "%kernel.logs_dir%/%kernel.environment%.log" 13 | level: debug 14 | tesseract: 15 | type: fingers_crossed 16 | action_level: error 17 | handler: nested 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ###> symfony/framework-bundle ### 2 | /.env.local 3 | /.env.local.php 4 | /.env.*.local 5 | /config/secrets/prod/prod.decrypt.private.php 6 | /public/bundles/ 7 | /var/ 8 | /vendor/ 9 | ###< symfony/framework-bundle ### 10 | 11 | ###> squizlabs/php_codesniffer ### 12 | /.phpcs-cache 13 | /phpcs.xml 14 | ###< squizlabs/php_codesniffer ### 15 | 16 | ###> symfony/webpack-encore-bundle ### 17 | /node_modules/ 18 | /public/build/ 19 | npm-debug.log 20 | ###< symfony/webpack-encore-bundle ### 21 | 22 | ###> symfony/phpunit-bridge ### 23 | .phpunit 24 | .phpunit.result.cache 25 | /phpunit.xml 26 | ###< symfony/phpunit-bridge ### 27 | 
-------------------------------------------------------------------------------- /public/index.php: -------------------------------------------------------------------------------- 1 | bootEnv( dirname( __DIR__ ) . '/.env' ); 12 | 13 | if ( $_SERVER['APP_DEBUG'] ) { 14 | umask( 0000 ); 15 | 16 | Debug::enable(); 17 | } 18 | 19 | $kernel = new Kernel( $_SERVER['APP_ENV'], (bool)$_SERVER['APP_DEBUG'] ); 20 | $request = Request::createFromGlobals(); 21 | $response = $kernel->handle( $request ); 22 | $response->send(); 23 | $kernel->terminate( $request, $response ); 24 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | /* eslint-env node */ 2 | module.exports = function Gruntfile(grunt) 3 | { 4 | 5 | grunt.loadNpmTasks('grunt-banana-checker'); 6 | grunt.loadNpmTasks('grunt-stylelint'); 7 | 8 | grunt.initConfig({ 9 | banana: { 10 | all: { 11 | src: 'i18n/' 12 | } 13 | }, 14 | stylelint: { 15 | all: [ 16 | '**/*.{css,less}', 17 | '!node_modules/**', 18 | '!vendor/**', 19 | '!var/**', 20 | '!public/build/**' 21 | ] 22 | }, 23 | }); 24 | 25 | grunt.registerTask('test', ['stylelint', 'banana']); 26 | grunt.registerTask('default', 'test'); 27 | }; 28 | -------------------------------------------------------------------------------- /templates/_tesseract_options.html.twig: -------------------------------------------------------------------------------- 1 |
2 | {{ msg('tesseract-options') }} 3 |
4 | 5 | 10 |

11 | {{ msg('tesseract-psm-help') }} 12 |

13 |
14 |
15 | -------------------------------------------------------------------------------- /templates/_transkribus_help.html.twig: -------------------------------------------------------------------------------- 1 |
2 | {{ msg('transkribus-browse-public-models') }} 3 |
4 | 18 | {{ msg('transkribus-request-for-model') }} 19 | 20 |
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM php:7.4-buster 2 | 3 | WORKDIR /wikimedia-ocr 4 | 5 | EXPOSE 8000 6 | 7 | RUN apt-get update -q && apt-get install -y \ 8 | git \ 9 | wget \ 10 | curl \ 11 | libicu-dev \ 12 | libzip-dev \ 13 | unzip \ 14 | tesseract-ocr-all \ 15 | && docker-php-ext-install intl \ 16 | && docker-php-ext-install bcmath \ 17 | && wget -nv -O- https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer \ 18 | && wget -nv -O- https://get.symfony.com/cli/installer | bash \ 19 | && mv /root/.symfony/bin/symfony /usr/local/bin/symfony \ 20 | && curl -fsSL https://deb.nodesource.com/setup_16.x | bash - \ 21 | && apt-get install -y nodejs 22 | 23 | 24 | CMD npm run watch & symfony serve && fg 25 | -------------------------------------------------------------------------------- /i18n/vec.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Candalua" 5 | ] 6 | }, 7 | "title": "WikimediaOCR", 8 | "subtitle": "Scrivi zo el testo da le imagini", 9 | "form-heading": "Trascrivi un'imagine", 10 | "image-url": "URL de l'imagine", 11 | "image-url-help": "Scrivi qua un indirizo de na imagine ospità su de un server Wikimedia, tipo: $1", 12 | "image-alt-text": "L’imagine orixenal", 13 | "language-code": "Lengue (facoltatìo)", 14 | "engine": "Motór OCR", 15 | "submit": "Trascrivi", 16 | "copy-to-clipboard": "Copia sui apunti", 17 | "copied-to-clipboard": "Copià!", 18 | "documentation": "Documentasion", 19 | "version": "Versión $1", 20 | "report-issue": "Segnała on problema", 21 | "langs-placeholder": "Lassa vodo par capir en automàtego la lengoa." 
22 | } 23 | -------------------------------------------------------------------------------- /config/bundles.php: -------------------------------------------------------------------------------- 1 | [ 'all' => true ], 6 | Doctrine\Bundle\DoctrineBundle\DoctrineBundle::class => [ 'all' => true ], 7 | Wikimedia\ToolforgeBundle\ToolforgeBundle::class => [ 'all' => true ], 8 | Symfony\Bundle\TwigBundle\TwigBundle::class => [ 'all' => true ], 9 | Twig\Extra\TwigExtraBundle\TwigExtraBundle::class => [ 'all' => true ], 10 | Symfony\Bundle\WebProfilerBundle\WebProfilerBundle::class => [ 'dev' => true, 'test' => true ], 11 | Symfony\WebpackEncoreBundle\WebpackEncoreBundle::class => [ 'all' => true ], 12 | Symfony\Bundle\MonologBundle\MonologBundle::class => [ 'all' => true ], 13 | Nelmio\ApiDocBundle\NelmioApiDocBundle::class => [ 'all' => true ], 14 | ]; 15 | -------------------------------------------------------------------------------- /assets/images/Crop_-_The_Noun_Project.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/Engine/EngineResult.php: -------------------------------------------------------------------------------- 1 | text = $text; 22 | $this->warnings = $warnings; 23 | } 24 | 25 | /** 26 | * @return string 27 | */ 28 | public function getText(): string { 29 | return $this->text; 30 | } 31 | 32 | /** 33 | * @return string[] 34 | */ 35 | public function getWarnings(): array { 36 | return $this->warnings; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/Exception/OcrException.php: -------------------------------------------------------------------------------- 1 | i18nKey = $i18nKey; 23 | $this->i18nParams = $i18nParams; 24 | } 25 | 26 | /** 27 | * @return string 28 | */ 29 | public function getI18nKey(): string { 30 | return $this->i18nKey; 31 | } 32 | 33 | /** 34 | * @return 
mixed[] 35 | */ 36 | public function getI18nParams(): array { 37 | return $this->i18nParams; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /config/packages/dev/monolog.yaml: -------------------------------------------------------------------------------- 1 | monolog: 2 | channels: ['main', 'console', 'tesseract'] 3 | handlers: 4 | main: 5 | type: stream 6 | path: "%kernel.logs_dir%/%kernel.environment%.log" 7 | level: debug 8 | channels: ["!event"] 9 | # uncomment to get logging in your browser 10 | # you may have to allow bigger header sizes in your Web server configuration 11 | #firephp: 12 | # type: firephp 13 | # level: info 14 | #chromephp: 15 | # type: chromephp 16 | # level: info 17 | console: 18 | type: console 19 | process_psr_3_messages: false 20 | channels: ["!event", "!doctrine", "!console"] 21 | tesseract: 22 | type: fingers_crossed 23 | action_level: error 24 | handler: console 25 | -------------------------------------------------------------------------------- /i18n/ja.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Apple TD", 5 | "MathXplore", 6 | "もなー(偽物)", 7 | "組曲師" 8 | ] 9 | }, 10 | "title": "WikimediaOCR", 11 | "subtitle": "画像からテキストを書き写す", 12 | "image-url": "画像のURL", 13 | "language-code": "言語(オプション)", 14 | "engine": "OCRエンジン", 15 | "engine-name-transkribus": "トランスクリバス OCR", 16 | "copied-to-clipboard": "コピーしました", 17 | "version": "バージョン $1", 18 | "report-issue": "問題を報告", 19 | "langs-placeholder": "自動言語検出のために空白のままにします。", 20 | "transkribus-no-lang-error": "言語が選択されていません", 21 | "transkribus-browse-public-models": "Transkribus のすべての公開言語モデルを閲覧する", 22 | "transkribus-request-for-model": "Transkribus から OCR ツールにモデルを追加するリクエストを行う", 23 | "transkribus-line-id-none-option": "なし", 24 | "transkribus-job-id": "ジョブID", 25 | "transkribus-job-state": "進捗", 26 | "transkribus-job-description": "説明", 27 | "transkribus-job-start": 
"開始", 28 | "transkribus-job-end": "終了" 29 | } 30 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "devDependencies": { 3 | "@symfony/webpack-encore": "^1.8.2", 4 | "@babel/plugin-proposal-class-properties": "^7.18.6", 5 | "bootstrap": "3.4.1", 6 | "core-js": "^3.32.1", 7 | "cropperjs": "^1.6.0", 8 | "file-loader": "^6.2.0", 9 | "grunt": "^1.6.1", 10 | "grunt-banana-checker": "^0.11.0", 11 | "grunt-stylelint": "^0.19.0", 12 | "jquery": "^3.7.1", 13 | "regenerator-runtime": "^0.13.11", 14 | "select2": "^4.0.13", 15 | "select2-bootstrap-theme": "0.1.0-beta.10", 16 | "stylelint-config-wikimedia": "^0.15.0", 17 | "webpack-notifier": "^1.15.0" 18 | }, 19 | "license": "GPL-3.0-or-later", 20 | "private": true, 21 | "scripts": { 22 | "dev": "encore dev", 23 | "watch": "encore dev --watch", 24 | "build": "encore production --progress", 25 | "test": "grunt test" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/Controller/TranskribusController.php: -------------------------------------------------------------------------------- 1 | render( 'transkribus.html.twig', [ 24 | 'jobs' => $transkribusClient->getJobs(), 25 | ] ); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /.phpcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | . 12 | 13 | 14 | 15 | 16 | . 
17 | ./vendor/ 18 | ./var/ 19 | src/Kernel.php 20 | ./node_modules/ 21 | ./bin/.phpunit/ 22 | ./public/build/ 23 | ./public/bundles/ 24 | ./assets/ 25 | 26 | -------------------------------------------------------------------------------- /i18n/ka.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Გიო ოქრო" 5 | ] 6 | }, 7 | "title": "ვიკიმედიის OCR", 8 | "subtitle": "ტექსტის გაშიფვრა სურათიდან", 9 | "form-heading": "სურათის გაშიფვრა", 10 | "image-url": "სურათის URL", 11 | "image-url-help": "ჩასვით URL იმ სურათისა, რომელიც განთავსებულია ვიკიმედიის სერვერზე, როგორიცაა მაგალითად: $1", 12 | "image-url-error": "სურათის URL უნდა იწყებოდეს {{PLURAL:$1|შემდეგ დომენურ სახელზე|ერთ-ერთ შემდეგ დომენურ სახელზე}} და მთავრდებოდეს ფაილის დაშვებულ გაფართოებაზე: $2", 13 | "image-alt-text": "საწყისი სურათი", 14 | "language-code": "ენები (არასავალდებულო)", 15 | "engine": "OCR-ის ძრავა", 16 | "engine-not-found-warning": "მოთხოვნილი ძრავა „$1“ ვერ მოიძებნა. მის ნაცვლად, გამოიყენეთ ნაგულისხმები ძრავა „$2“.", 17 | "engine-invalid-langs-warning": "შემდეგი ენები უმოქმედოა ან ძრავის მხარდაჭერა არ აქვს და იგნორირება მოხდა: $1", 18 | "submit": "მთელი გვერდის გაშიფვრა", 19 | "submit-crop": "გარკვეული სივრცის გაშიფვრა", 20 | "drag-help": "აირჩიეთ ჩამოჭრის ხელსაწყო და მონიშნეთ სურათზე შესაბამისი ფართობის მართკუთხედი, რათა მხოლოდ მისი გაშიფვრა მოხდეს." 21 | } 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Wikimedia OCR 2 | ============= 3 | 4 | A web service and UI for providing OCR text from images hosted on MediaWiki wikis. 5 | Can be integrated into the [ProofreadPage extension](https://www.mediawiki.org/wiki/Extension:ProofreadPage) 6 | via the [Wikisource extension](https://www.mediawiki.org/wiki/Extension:Wikisource). 
7 | 8 | Documentation: 9 | * For system administrators: https://wikitech.wikimedia.org/wiki/Nova_Resource:Wikisource/Wikimedia_OCR 10 | * For Wikisource users: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR 11 | * Of the API: https://ocr.wmcloud.org/api/doc 12 | * For contributors: [CONTRIBUTING.md](https://github.com/wikimedia/wikimedia-ocr/blob/main/CONTRIBUTING.md) 13 | 14 | ![CI](https://github.com/wikimedia/wikimedia-ocr/workflows/CI/badge.svg) 15 | 16 | ## Licenses 17 | 18 | * Wikimedia OCR is GPL 3.0 or later (see the LICENSE file) 19 | * [Crop_-_The_Noun_Project.svg](https://commons.wikimedia.org/wiki/File:Crop_-_The_Noun_Project.svg) is CC0 20 | * [OOjs_UI_icon_move.svg](https://commons.wikimedia.org/wiki/File:OOjs_UI_icon_move.svg) is CC-BY-SA-4.0 21 | -------------------------------------------------------------------------------- /assets/images/WikimediaOCR-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/Engine/EngineFactory.php: -------------------------------------------------------------------------------- 1 | */ 11 | private $engines; 12 | 13 | /** 14 | * @param GoogleCloudVisionEngine $cloudVisionEngine 15 | * @param TesseractEngine $tesseractEngine 16 | * @param TranskribusEngine $transkribusEngine 17 | */ 18 | public function __construct( 19 | GoogleCloudVisionEngine $cloudVisionEngine, 20 | TesseractEngine $tesseractEngine, 21 | TranskribusEngine $transkribusEngine 22 | ) { 23 | $this->engines = [ 24 | 'google' => $cloudVisionEngine, 25 | 'tesseract' => $tesseractEngine, 26 | 'transkribus' => $transkribusEngine, 27 | ]; 28 | } 29 | 30 | /** 31 | * @param string $name 32 | * @return TesseractEngine|GoogleCloudVisionEngine|TranskribusEngine 33 | */ 34 | public function get( string $name ): EngineBase { 35 | if ( !isset( $this->engines[$name] ) ) { 36 | 
throw new EngineNotFoundException(); 37 | } 38 | return $this->engines[$name]; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /templates/_transkribus_options.html.twig: -------------------------------------------------------------------------------- 1 |
2 | {{ msg('transkribus-options') }} 3 |
4 | 5 | 18 |

19 | {{ msg('transkribus-line-help') }} 20 |

21 |
22 |
23 | -------------------------------------------------------------------------------- /i18n/ce.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Умар" 5 | ] 6 | }, 7 | "title": "ВикимедиаOCR", 8 | "subtitle": "Суьрташ тӀера текст йеша", 9 | "form-heading": "Сурт схьадастар", 10 | "image-url": "Суьртан URL", 11 | "image-url-help": "Викимедиа-сервер тӀехь хӀоттийначу суьртан URL чуйилла, масала: $1.", 12 | "image-alt-text": "Йуьхьанцара сурт", 13 | "language-code": "Меттанаш (тӀехь дац)", 14 | "engine": "OCR бух", 15 | "submit": "Йерриг агӀо транскрипци йан", 16 | "copy-to-clipboard": "Хийцаран буфере копийан.", 17 | "copied-to-clipboard": "Копийина!", 18 | "google-error": "Google сервисо йухадерзийна гӀалаташ: $1.", 19 | "image-retrieval-failed": "Сурт схьаэца аьтто ца баьлла: $1.", 20 | "documentation": "Документаци", 21 | "api-tooltip": "API документаци хьажа", 22 | "version": "Верси $1", 23 | "report-issue": "Проблемех хаам бар", 24 | "langs-placeholder": "Мотт авто къастош, меттиг йаьсса йита.", 25 | "langs-param-error": "ОРТ моторо ловш дай рогӀера {{PLURAL:$1|меттанаш}}: $2.", 26 | "tesseract-options": "Tesseract нисдаран гӀирс", 27 | "tesseract-psm-label": "АгӀонан сегментацин кеп", 28 | "transkribus-job-description": "Цуьнах лаьцна" 29 | } 30 | -------------------------------------------------------------------------------- /i18n/lb.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Robby", 5 | "Volvox" 6 | ] 7 | }, 8 | "title": "WikimediaOCR", 9 | "subtitle": "Text vu Biller transkribéieren", 10 | "form-heading": "E Bild transkribéieren", 11 | "image-url": "URL vum Bild", 12 | "image-url-help": "Setzt d'URL vun engem Bild an, dat op engem Wikimedia-Server gehost gëtt, wéi z. 
B.: $1", 13 | "image-url-error": "D'URL vum Bild muss mat {{PLURAL:$1|dem follgenden Domainnumm|engem vun de follgenden Domainnimm}} ufänken a mat enger gülteger Dateiendung ophalen: $2", 14 | "image-alt-text": "D'Originalbild", 15 | "language-code": "Sproochen (optional)", 16 | "engine-name-transkribus": "Transkribus-OCR", 17 | "submit": "Ganz Säit transkribéieren", 18 | "copied-to-clipboard": "Kopéiert!", 19 | "documentation": "Dokumentatioun", 20 | "version": "Versioun $1", 21 | "report-issue": "E Probleem mellen", 22 | "transkribus-default-error": "Feelercode '$1' :: D'Ufro konnt net ofgeschloss ginn, probéiert nach eng Kéier!", 23 | "transkribus-no-lang-error": "Et gouf keng Sprooch ausgewielt", 24 | "transkribus-job-state": "Status", 25 | "transkribus-job-description": "Beschreiwung", 26 | "transkribus-job-start": "Ugefaangen", 27 | "transkribus-job-end": "Fäerdeg" 28 | } 29 | -------------------------------------------------------------------------------- /i18n/ro.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Alesaru", 5 | "Dan Mihai Pitea" 6 | ] 7 | }, 8 | "title": "WikimediaOCR", 9 | "subtitle": "Transcriere text din imagini", 10 | "form-heading": "Transcriere text dintr-o imagine", 11 | "image-url": "URL-ul imaginii", 12 | "image-alt-text": "Imaginea originală", 13 | "language-code": "Limbi (opțional)", 14 | "engine-name-tesseract": "OCR Tesseract", 15 | "submit": "Transcrierea întregii pagini", 16 | "copy-to-clipboard": "Copiere în clipboard", 17 | "copied-to-clipboard": "Copiat cu succes!", 18 | "documentation": "Documentație", 19 | "api": "API-ul", 20 | "api-tooltip": "Afișare documentație API", 21 | "version": "Versiunea $1", 22 | "report-issue": "Raportează o problemă", 23 | "langs-placeholder": "Lăsați câmpul gol pentru detectarea automată a limbii.", 24 | "tesseract-options": "Opțiuni Tesseract", 25 | "tesseract-psm-8": "Tratează imaginea drept un singur 
cuvânt.", 26 | "tesseract-psm-9": "Tratează imaginea drept un singur cuvânt într-un cerc.", 27 | "tesseract-psm-10": "Tratează imaginea drept o singură literă.", 28 | "transkribus-jobs": "Transkribus Jobs", 29 | "transkribus-job-id": "ID job", 30 | "transkribus-job-end": "Terminat", 31 | "transkribus-job-waited": "Întârziere de pornire (minute)" 32 | } 33 | -------------------------------------------------------------------------------- /config/packages/framework.yaml: -------------------------------------------------------------------------------- 1 | # see https://symfony.com/doc/current/reference/configuration/framework.html 2 | framework: 3 | secret: '%env(APP_SECRET)%' 4 | #csrf_protection: true 5 | #http_method_override: true 6 | 7 | # Enables session support. Note that the session will ONLY be started if you read or write from it. 8 | # Remove or comment this section to explicitly disable session support. 9 | session: 10 | storage_factory_id: "session.storage.factory.native" 11 | cookie_secure: auto 12 | cookie_samesite: lax 13 | 14 | #esi: true 15 | #fragments: true 16 | php_errors: 17 | log: true 18 | 19 | http_client: 20 | default_options: 21 | retry_failed: 22 | max_retries: 3 23 | delay: 1000 24 | multiplier: 3 25 | max_delay: 5000 26 | jitter: 0.3 27 | http_codes: 28 | 0: ['GET', 'HEAD'] 29 | 423: true 30 | 425: true 31 | 429: true 32 | 502: true 33 | 503: true 34 | 500: [ 'GET', 'HEAD' ] 35 | 504: [ 'GET', 'HEAD' ] 36 | 507: [ 'GET', 'HEAD' ] 37 | 510: [ 'GET', 'HEAD' ] 38 | -------------------------------------------------------------------------------- /config/packages/prod/monolog.yaml: -------------------------------------------------------------------------------- 1 | monolog: 2 | channels: ['main', 'tesseract'] 3 | handlers: 4 | main: 5 | type: fingers_crossed 6 | action_level: error 7 | handler: nested 8 | excluded_http_codes: [404, 405] 9 | buffer_size: 50 # How many messages should be saved? 
Prevent memory leaks 10 | nested: 11 | type: stream 12 | path: php://stderr 13 | level: debug 14 | formatter: monolog.formatter.json 15 | console: 16 | type: console 17 | process_psr_3_messages: false 18 | channels: ["!event", "!doctrine"] 19 | mailer: 20 | type: deduplication 21 | time: 300 22 | handler: symfony_mailer 23 | symfony_mailer: 24 | type: symfony_mailer 25 | level: critical 26 | from_email: '%env(APP_MAIL_SENDER)%' 27 | to_email: 28 | - '%env(APP_LOG_RECIPIENT_1)%' 29 | - '%env(APP_LOG_RECIPIENT_2)%' 30 | subject: '%env(APP_LOG_SUBJECT)% %%message%%' 31 | formatter: monolog.formatter.html 32 | content_type: text/html 33 | tesseract: 34 | type: fingers_crossed 35 | action_level: error 36 | handler: nested 37 | -------------------------------------------------------------------------------- /config/packages/webpack_encore.yaml: -------------------------------------------------------------------------------- 1 | webpack_encore: 2 | # The path where Encore is building the assets - i.e. 
Encore.setOutputPath() 3 | output_path: '%kernel.project_dir%/public/build' 4 | # If multiple builds are defined (as shown below), you can disable the default build: 5 | # output_path: false 6 | 7 | # Set attributes that will be rendered on all script and link tags 8 | script_attributes: 9 | defer: true 10 | # link_attributes: 11 | 12 | # If using Encore.enableIntegrityHashes() and need the crossorigin attribute (default: false, or use 'anonymous' or 'use-credentials') 13 | # crossorigin: 'anonymous' 14 | 15 | # Preload all rendered script and link tags automatically via the HTTP/2 Link header 16 | # preload: true 17 | 18 | # Throw an exception if the entrypoints.json file is missing or an entry is missing from the data 19 | # strict_mode: false 20 | 21 | # If you have multiple builds: 22 | # builds: 23 | # pass "frontend" as the 3rd arg to the Twig functions 24 | # {{ encore_entry_script_tags('entry1', null, 'frontend') }} 25 | 26 | # frontend: '%kernel.project_dir%/public/frontend/build' 27 | 28 | # Cache the entrypoints.json (rebuild Symfony's cache when entrypoints.json changes) 29 | # Put in config/packages/prod/webpack_encore.yaml 30 | # cache: true 31 | -------------------------------------------------------------------------------- /i18n/pnb.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Bgo eiu" 5 | ] 6 | }, 7 | "title": "وِکیمیڈیا درشٹی سمبندی اکھر پچھاݨ", 8 | "subtitle": "تصویراں توں لکھت لے جاؤ", 9 | "form-heading": "تصویر توں لکھت لے جاؤ", 10 | "image-url": "کڑی دا پتہ", 11 | "image-url-help": "وِکیمیڈیا سروَر توں کڑی دا پتہ پایو، مثلاً: $1", 12 | "image-url-error": "{{PLURAL:$1|فائل دی قسم $2 نال اِتھوں پتہ ضروری اے}}", 13 | "image-alt-text": "پہلی تصویر", 14 | "language-code": "بولیاں", 15 | "engine": "لکھت پچھاݨ والا", 16 | "engine-name-google": "گوگل کلاؤڈ درشٹی سمبندی اکھر پچھاݨ", 17 | "engine-name-tesseract": "ٹیسرکٹ درشٹی سمبندی اکھر پچھاݨ", 18 | 
"engine-invalid-langs-warning": "ایہہ بولیاں سمجھی نہیں لگی: $1", 19 | "submit": "پورے صفحے توں لکھو", 20 | "submit-crop": "اِک کھیتر توں لکھو", 21 | "drag-mode-crop": "نواں کروپ کھیتر بݨاوݨ لئی چلیو", 22 | "drag-mode-crop-alt": "کروپ کرن دا بٹن", 23 | "copy-to-clipboard": "کاپی کرو", 24 | "copied-to-clipboard": "لکھت کاپی کیتی گئی اے!", 25 | "google-error": "گوگل توں غلطی آئی اے: $1", 26 | "image-retrieval-failed": "تصویر لبھ نہیں سکدی اے: $1", 27 | "documentation": "دستاویز", 28 | "api": "ایپلیکیشن پروگرامنگ اینٹرفیس", 29 | "api-tooltip": "اےپی‌آئی دستاویز ویکھو", 30 | "version": "ورژن $1", 31 | "report-issue": "مسئلہ ریپورٹ کرو", 32 | "langs-param-error": "{{PLURAL:$1|بولی سمجھی نہیں لگی|بولیاں سمجھی نہیں لگی}}: $2", 33 | "tesseract-options": "ٹیسرکٹ دیاں سیٹنگاں" 34 | } 35 | -------------------------------------------------------------------------------- /.phan/config.php: -------------------------------------------------------------------------------- 1 | null, 7 | 8 | 'directory_list' => [ 9 | 'src', 10 | 'vendor', 11 | ], 12 | 13 | 'exclude_file_regex' => '@^vendor/.*/(tests?|Tests?)/@', 14 | 15 | 'exclude_analysis_directory_list' => [ 16 | 'vendor/', 17 | ], 18 | 19 | 'suppress_issue_types' => [ 20 | // PHPCS does this already and without false positives. 
21 | 'PhanUnreferencedUseNormal', 22 | 'SecurityCheck-LikelyFalsePositive', 23 | ], 24 | 25 | 'enable_extended_internal_return_type_plugins' => true, 26 | 'generic_types_enabled' => true, 27 | 28 | 'null_casts_as_any_type' => false, 29 | 'scalar_implicit_cast' => false, 30 | // Note: dead code detection has false positives with symfony magic methods 31 | 32 | 'redundant_condition_detection' => true, 33 | 34 | 'quick_mode' => false, 35 | 36 | 'plugins' => [ 37 | 'UnreachableCodePlugin', 38 | 'PregRegexCheckerPlugin', 39 | 'UnusedSuppressionPlugin', 40 | 'DuplicateArrayKeyPlugin', 41 | 'DuplicateExpressionPlugin', 42 | 'RedundantAssignmentPlugin', 43 | 'StrictLiteralComparisonPlugin', 44 | 'DollarDollarPlugin', 45 | 'LoopVariableReusePlugin', 46 | 'StrictComparisonPlugin', 47 | 'SimplifyExpressionPlugin', 48 | 'vendor/drenso/phan-extensions/Plugin/Annotation/SymfonyAnnotationPlugin.php', 49 | 'vendor/mediawiki/phan-taint-check-plugin/GenericSecurityCheckPlugin.php', 50 | ], 51 | ]; 52 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | getParameterOption(['--env', '-e'], null, true)) { 24 | putenv('APP_ENV='.$_SERVER['APP_ENV'] = $_ENV['APP_ENV'] = $env); 25 | } 26 | 27 | if ($input->hasParameterOption('--no-debug', true)) { 28 | putenv('APP_DEBUG='.$_SERVER['APP_DEBUG'] = $_ENV['APP_DEBUG'] = '0'); 29 | } 30 | 31 | (new Dotenv())->bootEnv(dirname(__DIR__).'/.env'); 32 | 33 | if ($_SERVER['APP_DEBUG']) { 34 | umask(0000); 35 | 36 | if (class_exists(Debug::class)) { 37 | Debug::enable(); 38 | } 39 | } 40 | 41 | $kernel = new Kernel($_SERVER['APP_ENV'], (bool) $_SERVER['APP_DEBUG']); 42 | $application = new Application($kernel); 43 | $application->run($input); 44 | -------------------------------------------------------------------------------- /templates/transkribus.html.twig: 
-------------------------------------------------------------------------------- 1 | {% extends 'base.html.twig' %} 2 | 3 | {% block body %} 4 | 5 |

{{msg('transkribus-jobs')}}

6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | {% for job in jobs %} 19 | 20 | 21 | 22 | 23 | 24 | 29 | 34 | 35 | {% endfor %} 36 |
{{ msg('transkribus-job-id') }}{{ msg('transkribus-job-state') }}{{ msg('transkribus-job-description') }}{{ msg('transkribus-job-start') }}{{ msg('transkribus-job-end') }}{{ msg('transkribus-job-waited') }}
{{ job.jobId }}{{ job.state }}{{ job.description }}{{ (job.createTime/1000)|round | format_datetime( 'short', 'short' ) }} 25 | {% if job.endTime %} 26 | {{((job.endTime/1000)|round) | format_datetime( 'none', 'short' ) }} 27 | {% endif %} 28 | 30 | {# Start delay depends on startTime, so guard on it rather than endTime. #}{% if job.startTime %} 31 | {{ ( ( job.startTime - job.createTime ) / 1000 / 60 ) | round }} 32 | {% endif %} 33 | 
37 | 38 | {% endblock %} 39 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - '**' 10 | 11 | jobs: 12 | build: 13 | 14 | env: 15 | APP_ENV: test 16 | 17 | strategy: 18 | matrix: 19 | php: [ '7.3', '7.4', '8.0', '8.1' ] 20 | runPhan: [ true ] 21 | include: 22 | - php: '8.2' 23 | runPhan: false 24 | 25 | runs-on: ubuntu-latest 26 | 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v1 30 | 31 | - name: Set up PHP 32 | uses: shivammathur/setup-php@v2 33 | with: 34 | php-version: ${{matrix.php}} 35 | coverage: none 36 | extensions: ast 37 | 38 | - name: Read .nvmrc 39 | run: echo "NODE_VERSION=$(cat .nvmrc)" >> "$GITHUB_OUTPUT" # replaces the deprecated ::set-output command 40 | id: nvm 41 | 42 | - name: Set up Node ${{ steps.nvm.outputs.NODE_VERSION }} 43 | uses: actions/setup-node@v2 44 | with: 45 | node-version: ${{ steps.nvm.outputs.NODE_VERSION }} 46 | 47 | - name: Install tesseract 48 | run: | 49 | sudo apt-get update 50 | sudo apt-get install -y tesseract-ocr-all 51 | 52 | - name: Install wikimedia-ocr 53 | run: | 54 | composer install 55 | npm ci 56 | 57 | - name: Test 58 | run: | 59 | composer test-common 60 | npm run test 61 | npm run build 62 | git status 63 | git status | grep "nothing to commit, working tree clean" 64 | 65 | - name: Test (Phan) 66 | if: ${{ matrix.runPhan }} 67 | run: composer phan 68 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | # In all environments, the following files are loaded if they exist, 2 | # the latter taking precedence over the former: 3 | # 4 | # * .env contains default values for the environment variables needed by the app 5 | # * .env.local uncommitted file with local overrides 6 | # * .env.$APP_ENV 
committed environment-specific defaults 7 | # * .env.$APP_ENV.local uncommitted environment-specific overrides 8 | # 9 | # Real environment variables win over .env files. 10 | # 11 | # DO NOT DEFINE PRODUCTION SECRETS IN THIS FILE NOR IN ANY OTHER COMMITTED FILES. 12 | # 13 | # Run "composer dump-env prod" to compile .env files for production use (requires symfony/flex >=1.2). 14 | # https://symfony.com/doc/current/best_practices.html#use-environment-variables-for-infrastructure-configuration 15 | 16 | ###> symfony/framework-bundle ### 17 | APP_ENV=dev 18 | APP_SECRET=3f028b359f05763e6cf2049cec74579e 19 | ###< symfony/framework-bundle ### 20 | 21 | ###> symfony/mailer ### 22 | MAILER_DSN=smtp://mail.tools.wmflabs.org:25 23 | ###< symfony/mailer ### 24 | 25 | APP_GOOGLE_KEYFILE= 26 | 27 | APP_MAIL_SENDER=tools.wikimedia-ocr@tools.wmflabs.org 28 | APP_LOG_RECIPIENT_1=admin1@example.org 29 | APP_LOG_RECIPIENT_2=admin2@example.org 30 | APP_LOG_SUBJECT="[Wikimedia OCR]" 31 | 32 | # Comma-separated list of the host names (without protocols) of where images are stored. 
33 | APP_IMAGE_HOSTS=upload.wikimedia.org,upload.wikimedia.beta.wmflabs.org 34 | 35 | REDIS_HOST=redis.svc.tools.eqiad1.wikimedia.cloud 36 | 37 | APP_CACHE_TTL=3600 # 1 hour 38 | 39 | APP_TRANSKRIBUS_USERNAME= 40 | APP_TRANSKRIBUS_PASSWORD= 41 | -------------------------------------------------------------------------------- /src/Kernel.php: -------------------------------------------------------------------------------- 1 | import('../config/{packages}/*.yaml'); 18 | $container->import('../config/{packages}/'.$this->environment.'/*.yaml'); 19 | 20 | if (is_file(\dirname(__DIR__).'/config/services.yaml')) { 21 | $container->import('../config/services.yaml'); 22 | $container->import('../config/{services}_'.$this->environment.'.yaml'); 23 | } elseif (is_file($path = \dirname(__DIR__).'/config/services.php')) { 24 | (require $path)($container->withPath($path), $this); 25 | } 26 | } 27 | 28 | protected function configureRoutes(RoutingConfigurator $routes): void 29 | { 30 | $routes->import('../config/{routes}/'.$this->environment.'/*.yaml'); 31 | $routes->import('../config/{routes}/*.yaml'); 32 | 33 | if (is_file(\dirname(__DIR__).'/config/routes.yaml')) { 34 | $routes->import('../config/routes.yaml'); 35 | } elseif (is_file($path = \dirname(__DIR__).'/config/routes.php')) { 36 | (require $path)($routes->withPath($path), $this); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /i18n/br.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Fulup", 5 | "Iriep" 6 | ] 7 | }, 8 | "title": "WikimediaOCR", 9 | "subtitle": "Treuzskrivañ an destenn adalek skeudennoù", 10 | "form-heading": "Treuzskrivañ ur skeudenn", 11 | "image-url": "URL ar skeudenn", 12 | "image-url-help": "Ensoc'hañ URL ur skeudenn herberc'hiet war ur servijer Wikimedia evel da skouer: $1", 13 | "image-url-error": "URL ar skeudenn a rank kregiñ gant {{PLURAL:$1|an anv 
domani da-heul|unan eus an anvioù domani da-heul}} hag a rank echuiñ gant un astenn restr reizh: $2", 14 | "image-alt-text": "Ar skeudenn orin", 15 | "language-code": "Yezhoù (diret)", 16 | "engine": "Lusker OCR", 17 | "engine-not-found-warning": "Neket bet kavet al lusker azgoulennet '$1'. Implijout al lusker '$2' e-plas.", 18 | "engine-invalid-langs-warning": "Direizh eo ar yezhoù da heul pe n'int ket skoret gant ar c'heflusker ; lezet int bet a-gostez : $1", 19 | "submit": "Treuzskrivañ ar bajenn a-bezh", 20 | "submit-crop": "Treuzskrivañ an takad", 21 | "drag-help": "Diuzañ ar benveg didroc'hañ ha tresañ un hirgarrezenn war ar skeudenn dindan evit treuzskrivañ ul lodenn eus ar bajenn hepken.", 22 | "drag-mode-move": "Ruzañ a lakay ar skeudenn da fiñval", 23 | "drag-mode-move-alt": "Arlun a arouez an oberiadenn 'fiñval'", 24 | "drag-mode-crop": "Ruzañ a grouo un takad didroc'hañ nevez.", 25 | "copy-to-clipboard": "Eilañ er golver", 26 | "copied-to-clipboard": "Eilet !", 27 | "documentation": "Teuliadur", 28 | "version": "Stumm $1", 29 | "report-issue": "Titourañ un draen", 30 | "langs-placeholder": "Lezel goullo evit mat vo detektet ar yezh ent emgefre." 
31 | } 32 | -------------------------------------------------------------------------------- /i18n/el.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Jimkats", 5 | "Norhorn" 6 | ] 7 | }, 8 | "title": "WikimediaOCR", 9 | "form-heading": "Μεταγραφή εικόνας", 10 | "image-url": "URL εικόνας", 11 | "image-alt-text": "Η αρχική εικόνα", 12 | "language-code": "Γλώσσες (προαιρετικά)", 13 | "engine": "Μηχανή OCR", 14 | "engine-name-transkribus": "Transkribus OCR", 15 | "submit": "Μεταγραφή ολόκληρης της σελίδας", 16 | "submit-crop": "Περιοχή μεταγραφής", 17 | "copy-to-clipboard": "Αντιγραφή στο πρόχειρο", 18 | "copied-to-clipboard": "Αντιγράφηκε!", 19 | "google-error": "Η υπηρεσία Google επέστρεψε ένα σφάλμα: $1", 20 | "image-retrieval-failed": "Η ανάκτηση εικόνας απέτυχε: $1", 21 | "documentation": "Τεκμηρίωση", 22 | "api-tooltip": "Δείτε την τεκμηρίωση του API", 23 | "version": "Έκδοση $1", 24 | "report-issue": "Αναφέρετε ένα πρόβλημα", 25 | "loading-message": "Εκτέλεση μεταγραφής...", 26 | "tesseract-options": "Επιλογές Tesseract", 27 | "transkribus-language-code": "Γλωσσικό Μοντέλο", 28 | "transkribus-default-error": "Κωδικός σφάλματος '$1' :: Δεν είναι δυνατή η ολοκλήρωση του αιτήματος, προσπαθήστε ξανά!", 29 | "transkribus-no-lang-error": "Δεν επιλέχθηκε γλώσσα", 30 | "transkribus-options": "Επιλογές Transkribus", 31 | "transkribus-line-label": "Μοντέλο Ανίχνευσης Γραμμής", 32 | "transkribus-line-id-none-option": "Κανένα", 33 | "transkribus-jobs": "Εργασίες Transcribus", 34 | "transkribus-job-id": "ID εργασίας", 35 | "transkribus-job-state": "Κατάσταση", 36 | "transkribus-job-description": "Περιγραφή", 37 | "transkribus-job-start": "Ξεκίνησε", 38 | "transkribus-job-end": "Τελείωσε", 39 | "transkribus-job-waited": "Καθυστέρηση έναρξης (λεπτά)" 40 | } 41 | -------------------------------------------------------------------------------- /i18n/roa-tara.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Joetaras" 5 | ] 6 | }, 7 | "title": "UicchimedieOCR", 8 | "subtitle": "Trascrive 'u teste da l'immaggine", 9 | "form-heading": "Trascrive cumme immaggine", 10 | "image-url": "URL de l'immaggine", 11 | "image-url-help": "Mitte 'na URL de l'immaggine ospitate sus a 'u server de Uicchimedie cumme: $1", 12 | "image-url-error": "L'URL de l'immaggine adda accumenzà cu {{PLURAL:$1|stu|une de ste}} nome de dominie e addà spiccià cu 'n'estenzione de file valide: $2", 13 | "image-alt-text": "L'immaggine origgenale", 14 | "language-code": "Lènghe (facoltative)", 15 | "engine": "Motore OCR", 16 | "engine-not-found-warning": "'U motore cercate '$1' non g'ha state acchiate. Ause 'u motore predefinite '$2'.", 17 | "engine-invalid-langs-warning": "Ste lènghe non ge sò valide o non ge sò supportate da 'u motore e avènene cacate: $1", 18 | "submit": "Trascrive totte 'a pàgene", 19 | "submit-crop": "Trascrive l'arèe", 20 | "copy-to-clipboard": "Copie jndr'à l'appunde", 21 | "copied-to-clipboard": "Copiate!", 22 | "image-retrieval-failed": "Recupere de l'immaggine fallite: $1", 23 | "documentation": "Documendazione", 24 | "version": "Versione $1", 25 | "report-issue": "Segnale 'nu probbleme", 26 | "tesseract-options": "Opziune de tesseract", 27 | "tesseract-psm-label": "Metode d'a segmentazione d'a pàgene", 28 | "tesseract-psm-7": "Tratte l'immaggine cumme 'na linèe de test singole.", 29 | "tesseract-psm-8": "Tratte l'immaggine cumme 'na parole singole.", 30 | "tesseract-psm-9": "Tratte l'immaggine cumme 'na parole singole jndr'à 'nu cerchie.", 31 | "tesseract-psm-10": "Tratte l'immaggine cumme 'nu carattere singole.", 32 | "tesseract-psm-12": "Teste sparse cu OSD." 
33 | } 34 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | tests 28 | 29 | 30 | 31 | 32 | 33 | src 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 47 | 48 | -------------------------------------------------------------------------------- /i18n/sat.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "ᱤᱧ ᱢᱟᱛᱟᱞ" 5 | ] 6 | }, 7 | "title": "ᱣᱤᱠᱤᱢᱤᱰᱤᱭᱟ ᱳᱥᱤᱟᱨ", 8 | "subtitle": "ᱪᱤᱛᱟᱹᱨ ᱠᱷᱚᱱ ᱚᱞ ᱨᱮᱭᱟᱜ ᱚᱞ ᱚᱞ ᱢᱮ ᱾", 9 | "form-heading": "ᱢᱤᱫᱴᱟᱝ ᱪᱤᱛᱟᱹᱨ ᱴᱨᱟᱱᱤᱥᱠᱨᱤᱯ ᱢᱮ ᱾", 10 | "image-url": "ᱪᱤᱛᱟᱹᱨ ᱨᱮᱭᱟᱜ ᱤᱭᱩ ᱟᱨ ᱮᱞ ᱾", 11 | "image-url-help": "ᱩᱭᱠᱤᱢᱤᱰᱤᱭᱟ ᱥᱟᱨᱵᱷᱟᱨ ᱨᱮ ᱦᱳᱥᱴ ᱟᱠᱟᱱ ᱢᱤᱫᱴᱟᱝ ᱪᱤᱛᱟᱹᱨ ᱨᱮᱭᱟᱜ ᱤᱭᱩ ᱟᱨ ᱮᱞ ᱥᱮᱞᱮᱫ ᱢᱮ ᱡᱮᱞᱮᱠᱟ '1: $1'", 12 | "image-alt-text": "ᱢᱩᱲᱩᱫ ᱪᱤᱛᱟᱹᱨ ᱫᱚ ᱾", 13 | "language-code": "ᱯᱟᱹᱨᱥᱤ (ᱚᱯᱥᱟᱱᱟᱞ)", 14 | "engine": "ᱳᱥᱤᱟᱨ ᱤᱱᱡᱤᱱ", 15 | "engine-name-transkribus": "ᱴᱨᱮᱱᱥᱠᱨᱤᱵᱟᱥ ᱳᱥᱤᱟᱨ", 16 | "engine-not-found-warning": "ᱱᱮᱦᱚᱨ ᱞᱮᱱ ᱤᱱᱡᱤᱱ '$1' ᱫᱚ ᱵᱟᱝ ᱧᱟᱢ ᱟᱠᱟᱱᱟ ᱾ ᱚᱱᱟ ᱵᱚᱫᱚᱞ ᱛᱮ ᱰᱯᱷᱚᱞᱴ ᱤᱱᱡᱤᱱ '$2' ᱵᱮᱣᱦᱟᱨ ᱢᱮ ᱾", 17 | "submit": "ᱜᱚᱴᱟ ᱥᱟᱠᱟᱢ ᱫᱚ ᱚᱞ ᱢᱮ", 18 | "submit-crop": "ᱴᱚᱴᱷᱟ ᱫᱚ ᱚᱞ ᱢᱮ", 19 | "drag-help": "ᱠᱨᱚᱯ ᱴᱩᱞ ᱵᱟᱪᱷᱱᱟᱣ ᱢᱮ ᱟᱨ ᱥᱟᱠᱟᱢ ᱨᱮᱭᱟᱜ ᱢᱤᱫᱴᱟᱝ ᱴᱚᱴᱷᱟ ᱜᱮ ᱚᱞ ᱞᱟᱹᱜᱤᱫ ᱞᱟᱛᱟᱨ ᱨᱮ ᱪᱤᱛᱟᱹᱨ ᱨᱮ ᱢᱤᱫᱴᱟᱝ ᱟᱭᱢᱟᱜᱟᱱ ᱰᱨᱟᱜ ᱢᱮ ᱾", 20 | "drag-mode-move": "ᱰᱨᱟᱜᱽ ᱠᱚᱨᱟᱣ ᱪᱤᱛᱟᱹᱨ ᱫᱚ ᱪᱟᱞᱟᱣ ᱟᱭ ᱾", 21 | "drag-mode-move-alt": "'ᱢᱩᱵᱷ' ᱮᱠᱥᱚᱱ ᱨᱮᱭᱟᱜ ᱩᱫᱩᱜ ᱥᱚᱫᱚᱨ ᱟᱭᱠᱚᱱ᱾", 22 | "drag-mode-crop": "ᱰᱨᱟᱜᱽ ᱢᱤᱫᱴᱟᱝ ᱱᱟᱶᱟ ᱪᱟᱥ ᱴᱚᱴᱷᱟ ᱵᱮᱱᱟᱣᱼᱟ", 23 | "drag-mode-crop-alt": "'ᱪᱟᱥ' ᱠᱟᱹᱢᱤ ᱨᱮᱭᱟᱜ ᱩᱫᱩᱜ ᱥᱚᱫᱚᱨ ᱟᱭᱠᱚᱱ ᱾", 24 | "copy-to-clipboard": "ᱠᱞᱤᱯᱵᱳᱨᱰ ᱨᱮ ᱠᱚᱯᱤ ᱢᱮ", 25 | "copied-to-clipboard": "ᱠᱚᱯᱤ ᱠᱚᱨᱟᱣ ᱦᱩᱭ ᱟᱠᱟᱱᱟ", 26 | "google-error": "ᱜᱩᱜᱚᱞ ᱯᱚᱨᱤᱥᱮᱵᱟ ᱢᱤᱫᱴᱟᱝ ᱞᱨᱳᱴ ᱨᱩᱣᱟᱹᱲ ᱟᱠᱟᱫᱟ ᱾ $1", 27 | "image-retrieval-failed": "ᱪᱤᱛᱟᱹᱨ ᱧᱟᱢ ᱵᱟᱝ ᱦᱩᱭ ᱟᱠᱟᱱᱟ ᱾ $1", 28 | "version": "ᱵᱚᱨᱱᱚᱱ $1", 29 | "report-issue": "ᱟᱱᱟᱴ ᱨᱮ ᱨᱮᱯᱳᱨᱴ ᱢᱮ", 30 | 
"langs-placeholder": "ᱟᱡᱛᱮ ᱯᱟᱹᱨᱥᱤ ᱵᱟᱰᱟᱭ ᱞᱟᱹᱜᱤᱫ ᱵᱮᱠᱚᱞᱚᱠ ᱫᱚᱦᱚ ᱢᱮ ᱾", 31 | "loading-message": "ᱴᱨᱟᱱᱥᱤᱠᱯᱥᱚᱱ ᱠᱚᱨᱟᱣ...", 32 | "tesseract-psm-label": "ᱥᱟᱠᱟᱢ ᱦᱟᱹᱴᱤᱧ ᱦᱚᱨᱟ", 33 | "tesseract-psm-help": "ᱟᱨᱦᱚᱸ ᱱᱟᱯᱟᱭ ᱟᱭᱢᱟᱼᱠᱞᱩᱢ ᱨᱮᱭᱟᱜ ᱜᱚᱲᱚ ᱞᱟᱹᱜᱤᱫ \"ᱥᱯᱟᱨᱥ ᱴᱮᱥᱴ\" ᱵᱮᱣᱦᱟᱨ ᱢᱮ ᱾", 34 | "tesseract-psm-0": "ᱳᱭᱮᱨᱮᱱᱴᱮᱥᱚᱱ ᱟᱨ ᱥᱠᱨᱤᱯᱴ ᱰᱤᱴᱮᱠᱥᱚᱱ (ᱳᱮᱥᱰᱤ) ᱥᱩᱢᱩᱝ ᱾", 35 | "tesseract-psm-1": "ᱳᱹ ᱮᱥᱹᱰᱤ ᱥᱟᱶ ᱟᱡᱛᱮ ᱥᱟᱠᱟᱢ ᱦᱟᱹᱴᱤᱧ ᱾" 36 | } 37 | -------------------------------------------------------------------------------- /i18n/hu.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Dj", 5 | "Hanna Tardos", 6 | "Tacsipacsi" 7 | ] 8 | }, 9 | "title": "WikimédiaOCR", 10 | "subtitle": "Szöveg átírása képekből", 11 | "form-heading": "Kép átírása", 12 | "image-url": "Kép URL-címe", 13 | "image-url-help": "Egy tényleges képfájl teljes URL-je, a következő domainek egyikével: $1", 14 | "image-url-error": "A fájlnévnek egy érvényes kiterjesztéssel kell végződnie, és a következő {{PLURAL:$1|domainnévvel|domainnevek egyikével}} kell kezdődnie: $2", 15 | "image-alt-text": "Az eredeti kép", 16 | "language-code": "Nyelvek (nem kötelező)", 17 | "engine": "OCR-motor", 18 | "engine-not-found-warning": "A kért „$1” motor nem található. 
Ehelyett az alapértelmezett „$2” motort használja.", 19 | "engine-invalid-langs-warning": "A következő nyelvek érvénytelenek vagy a motor által nem támogatottak, ezért figyelmen kívül hagyták: $1", 20 | "submit": "Az egész oldal átírása", 21 | "submit-crop": "Átírási terület", 22 | "copy-to-clipboard": "Másolás a vágólapra", 23 | "copied-to-clipboard": "Kimásolva!", 24 | "google-error": "A Google-szolgáltatás hibát adott vissza: $1", 25 | "image-retrieval-failed": "A kép lekérése sikertelen: $1", 26 | "documentation": "Dokumentáció", 27 | "version": "$1 verzió", 28 | "report-issue": "Hibabejelentés", 29 | "langs-param-error": "Az OCR-motor a következő {{PLURAL:$1|nyelvet|nyelveket}} nem támogatja: $2", 30 | "tesseract-options": "Tesseract-beállítások", 31 | "transkribus-line-id-none-option": "Nincs", 32 | "transkribus-mixed-line-option": "Vegyes vonal tájolás", 33 | "transkribus-line-help": "Hagyd üresen, ha nem vagy biztos abban, hogy melyik vonalérzékelési modellt használja", 34 | "transkribus-job-id": "Feladatazonosító", 35 | "transkribus-job-state": "Állapot", 36 | "transkribus-job-description": "Leírás", 37 | "transkribus-job-start": "Elindult", 38 | "transkribus-job-end": "Kész", 39 | "transkribus-job-waited": "Indítási késleltetés (perc)" 40 | } 41 | -------------------------------------------------------------------------------- /tests/Twig/AppExtensionTest.php: -------------------------------------------------------------------------------- 1 | projectDir, 26 | new TesseractOCR() 27 | ); 28 | $transkribusEngine = new TranskribusEngine( 29 | new TranskribusClient( 30 | getenv( 'APP_TRANSKRIBUS_USERNAME' ), 31 | getenv( 'APP_TRANSKRIBUS_PASSWORD' ), 32 | new MockHttpClient(), 33 | new NullAdapter(), 34 | new NullAdapter() 35 | ), 36 | new Intuition(), 37 | $this->projectDir, 38 | new MockHttpClient() 39 | ); 40 | $this->ext = new AppExtension( $tesseractEngine, $transkribusEngine ); 41 | } 42 | 43 | /** 44 | * @covers AppExtension::getOcrLangName 45 | */ 46 
| public function testOcrLangName(): void { 47 | // Non-standard language code with name defined in models.json 48 | static::assertSame( 'Azərbaycan (qədim yazı)', $this->ext->getOcrLangName( 'aze_cyrl' ) ); 49 | 50 | // Standard language code (name provided by Intuition) 51 | static::assertSame( 'English', $this->ext->getOcrLangName( 'en' ) ); 52 | } 53 | 54 | /** 55 | * @covers AppExtension::getLineIdName 56 | */ 57 | public function testLineIdName(): void { 58 | static::assertSame( 'Balinese Line Detection Model', $this->ext->getLineIdName( 'bali' ) ); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /i18n/ps.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "شاه زمان پټان" 5 | ] 6 | }, 7 | "title": "ويکي‌رسنۍ ليدنيزه‌لوښې‌پېژندنه", 8 | "subtitle": "په ويکي‌سرچينې يا کوم بل ځای کې د ويکي‌رسنۍ خونديځ کې د سکين شوي انځورونو د ليک لمېسلو او کارولو لپاره يو توکی.", 9 | "form-heading": "انځور پوره‌لیکل", 10 | "image-url": "انځور وېبتړ", 11 | "image-url-help": "په ويکي‌رسنۍ پالنگر ې کوربه‌شوی انځور وېبتړ دننه کړئ لکه: $1", 12 | "image-alt-text": "ار انځور", 13 | "language-code": "ژبې (اختياري)", 14 | "engine": "ليدنيزه‌لوښې‌پېژندنه اينجن", 15 | "engine-name-transkribus": "پوره‌ليکلو ليدنيزه‌لوښې‌پېژندنه", 16 | "submit": "ټول مخ پوره‌ليکل", 17 | "submit-crop": "پوره‌ليکلو سيمه", 18 | "drag-mode-move": "کشول به انځور وخوځوي", 19 | "copy-to-clipboard": "ټينگدړې ته لمېسل", 20 | "copied-to-clipboard": "ولمېسل‌شو!", 21 | "google-error": "گوگل پالنگر يوه تېروتنه راوگرځوله: $1", 22 | "documentation": "لاسوند", 23 | "api-tooltip": "ای‌پي‌آی لاسوند کتل", 24 | "version": "$1 بلبڼه", 25 | "report-issue": "د يوې ستونزې خبر ورکول", 26 | "langs-placeholder": "د ژبې خپلکاره موندلو لپاره تش پرېښودل.", 27 | "langs-param-error": "لاندې {{PLURAL:$1|ژبه|ژبې}} د ليدنيزه‌لوښې‌پېژندنې اېنجن نه ملاتړ کوي: $2", 28 | "loading-message": "د پوره‌ليکلو 
ترسره‌کول...", 29 | "tesseract-psm-label": "مخ وېشلو چلند", 30 | "transkribus-language-code": "ژبې چلنوال", 31 | "transkribus-no-lang-error": "هېڅ ژبه غوره‌شوې نه ده", 32 | "transkribus-options": "پوره‌لیکلو خوښنې", 33 | "transkribus-line-label": "د کرښې موندل بېلگه", 34 | "transkribus-line-id-none-option": "هېڅ", 35 | "transkribus-mixed-line-option": "د يوځای شوې کرښې لوری", 36 | "transkribus-line-help": "که تاسو ډاډه نه ياست چې کومه د کرښې موندلو لوري بېلگه وکاروئ؛ نو تش يې پرېږدئ", 37 | "transkribus-jobs": "پوره‌لیکلو دندې", 38 | "transkribus-job-id": "دندې پېژند", 39 | "transkribus-job-state": "دريځ", 40 | "transkribus-job-description": "څرگنداوی", 41 | "transkribus-job-start": "پیل شو", 42 | "transkribus-job-end": "پای ته ورسېد", 43 | "transkribus-job-waited": "پيل ځنډ (دقيقې)" 44 | } 45 | -------------------------------------------------------------------------------- /assets/styles/app.css: -------------------------------------------------------------------------------- 1 | @import '~bootstrap'; 2 | @import '~select2'; 3 | 4 | /* Encore can't find '~select2-bootstrap-theme' with @import */ 5 | @import '../../node_modules/select2-bootstrap-theme/dist/select2-bootstrap.min.css'; 6 | 7 | .page-header { 8 | background-color: #f5f5f5; 9 | margin: 0 0 25px; 10 | padding: 24px 0 18px; 11 | } 12 | 13 | .container { 14 | max-width: 1170px; 15 | } 16 | 17 | .page-header .container { 18 | align-items: center; 19 | display: flex; 20 | /* Width of container + logo size and it's padding */ 21 | max-width: calc( 1170px + ( (50px + 20px) * 2) ); 22 | width: auto; 23 | } 24 | 25 | .logo { 26 | margin: 10px 20px 25px 0; 27 | } 28 | 29 | body.rtl .logo { 30 | float: right; 31 | margin: 10px 0 25px 20px; 32 | } 33 | 34 | .page-title { 35 | font-weight: bold; 36 | margin-bottom: 0; 37 | } 38 | 39 | .page-subtitle { 40 | font-size: 1em; 41 | } 42 | 43 | .form-heading { 44 | border-bottom: 1px solid #e5e5e5; 45 | font-size: 1.5em; 46 | margin: 25px 0; 47 | } 48 | 49 | fieldset, 50 
| .alert { 51 | max-width: 541px; 52 | } 53 | 54 | /* Avoid select2 input from exceeding viewport on smaller screens */ 55 | .select2-container { 56 | /* stylelint-disable declaration-no-important */ 57 | width: 100% !important; 58 | } 59 | 60 | .radio:first-of-type { 61 | margin-top: 0; 62 | } 63 | 64 | .engine-options { 65 | margin-top: 30px; 66 | } 67 | 68 | .engine-help { 69 | margin-top: 10px; 70 | } 71 | 72 | .submit-btn { 73 | margin-top: 40px; 74 | } 75 | 76 | .output-buttons { 77 | text-align: right; 78 | margin-bottom: 10px; 79 | } 80 | 81 | .nojs .nojs-hide { 82 | display: none; 83 | } 84 | 85 | .loader { 86 | background-color: #f5f5f5; 87 | padding: 12px; 88 | } 89 | 90 | .loader p { 91 | margin: 0; 92 | font-weight: bold; 93 | } 94 | 95 | @keyframes loader { 96 | to { 97 | transform: rotate( 360deg ); 98 | } 99 | } 100 | 101 | .glyphicon.glyphicon-refresh { 102 | margin-right: 5px; 103 | animation: loader 1500ms linear infinite; 104 | } 105 | -------------------------------------------------------------------------------- /i18n/io.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Joao Xavier" 5 | ] 6 | }, 7 | "title": "WikimedioOCR", 8 | "subtitle": "Trasskribar texti de imaji", 9 | "form-heading": "Transskribar ul imajo", 10 | "image-url": "URL dil imajo", 11 | "image-url-help": "Adjuntez URL por imajo qua esas che altra servero Wikimedia, exemple: $1", 12 | "image-url-error": "La URL di ula imajo mustas komencar kun {{PLURAL:$1|la sequanta domeno-nomo|un ek la sequanta domeno-nomi}}, e finar kun ula valida sufixo: $2", 13 | "image-alt-text": "L'originala imajo", 14 | "language-code": "Idiomi (fakultativa)", 15 | "engine": "Mashino OCR", 16 | "engine-not-found-warning": "La demandita utensilo '$1' ne uzesis. Vice ol, uzez l'utensilo ''default'', $2.", 17 | "engine-invalid-langs-warning": "La sequanta idiomi esas nevalida, o ne suportata dal utensilo. 
Pro to, li ignoresis: $1", 18 | "submit": "Transskriptez tota pagino", 19 | "submit-crop": "Transskribo-areo", 20 | "drag-mode-move": "Trananta, l'imajo movesos", 21 | "copy-to-clipboard": "Kopiez a ''clipboard''", 22 | "copied-to-clipboard": "Kopiita!", 23 | "google-error": "La servado Google montris eroro: $1", 24 | "image-retrieval-failed": "Faliis la rekupero dil imajo: $1", 25 | "documentation": "Dokumentigo", 26 | "api": "API", 27 | "api-tooltip": "Videz la dokumentigo dil API", 28 | "version": "Versiono $1", 29 | "report-issue": "Informez problemo", 30 | "langs-param-error": "La sequanta {{PLURAL:$1|linguo|lingui}} ne agnoskesas dal OCR-softwaro*: $2", 31 | "tesseract-options": "Tesseract-opcioni", 32 | "tesseract-psm-label": "Metodo por sementigo di pagini", 33 | "tesseract-psm-0": "Orientation and script detection (OSD) only.", 34 | "tesseract-psm-1": "Automatic page segmentation with OSD.", 35 | "tesseract-psm-2": "Automatic page segmentation, but no OSD, or OCR. (not implemented)", 36 | "tesseract-psm-3": "Fully automatic page segmentation, but no OSD. (Default)", 37 | "tesseract-psm-4": "Supozar singla kolumno di texto kun varianta grandesi.", 38 | "tesseract-psm-7": "Traktez l'imajo kom singla lineo di texto.", 39 | "tesseract-psm-8": "Traktez l'imajo kom singla vorto.", 40 | "tesseract-psm-9": "Traktez l'imajo kom singla vorto en cirklo.", 41 | "tesseract-psm-10": "Traktez l'imajo kom singla karaktero." 
42 | } 43 | -------------------------------------------------------------------------------- /src/Engine/Image.php: -------------------------------------------------------------------------------- 1 | imageUrl = $imageUrl; 31 | $this->crop = $crop; 32 | } 33 | 34 | /** 35 | * @return string 36 | */ 37 | public function getUrl(): string { 38 | return $this->imageUrl; 39 | } 40 | 41 | public function needsCropping(): bool { 42 | return isset( $this->crop['width'] ) && $this->crop['width'] > 0 43 | && isset( $this->crop['height'] ) && $this->crop['height'] > 0; 44 | } 45 | 46 | /** 47 | * @return Crop 48 | */ 49 | public function getCrop(): Crop { 50 | return new Crop( 51 | new Point( $this->crop['x'], $this->crop['y'] ), 52 | new Box( $this->crop['width'], $this->crop['height'] ) 53 | ); 54 | } 55 | 56 | public function hasData(): bool { 57 | return $this->data !== null; 58 | } 59 | 60 | /** 61 | * @return string 62 | */ 63 | public function getData(): string { 64 | if ( $this->data === null ) { 65 | throw new LogicException( 'Image::setData() must be called before getData()' ); 66 | } 67 | return $this->data; 68 | } 69 | 70 | /** 71 | * @param string $data 72 | */ 73 | public function setData( string $data ): void { 74 | $this->data = $data; 75 | } 76 | 77 | /** 78 | * Get the image data size in bytes. 79 | * @return int 80 | */ 81 | public function getSize(): int { 82 | if ( $this->data === null ) { 83 | throw new LogicException( 'Image::setData() must be called before getSize()' ); 84 | } 85 | return $this->size ?? 
strlen( $this->data ); 86 | } 87 | 88 | /** 89 | * @param int $size 90 | */ 91 | public function setSize( int $size ): void { 92 | $this->size = $size; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/Twig/AppExtension.php: -------------------------------------------------------------------------------- 1 | tesseractEngine = $tesseractEngine; 26 | $this->transkribusEngine = $transkribusEngine; 27 | } 28 | 29 | /** 30 | * Registry of custom TwigFunctions. 31 | * @return TwigFunction[] 32 | */ 33 | public function getFunctions(): array { 34 | return [ 35 | new TwigFunction( 'ocr_lang_name', [ $this, 'getOcrLangName' ] ), 36 | new TwigFunction( 'line_id_name', [ $this, 'getLineIdName' ] ), 37 | ]; 38 | } 39 | 40 | /** 41 | * Registry of custom TwigFilters. 42 | * @return TwigFilter[] 43 | */ 44 | public function getFilters(): array { 45 | return [ 46 | new TwigFilter( 'textarea_rows', [ $this, 'getTextareaRows' ] ), 47 | ]; 48 | } 49 | 50 | /** 51 | * Get the number of rows a textarea should be based on the size of the given text. 52 | * @param string $text 53 | * @return int 54 | */ 55 | public function getTextareaRows( string $text ): int { 56 | return max( 10, substr_count( $text, "\n" ) ); 57 | } 58 | 59 | /** 60 | * Get the name of the given language. This adds a few translations that don't exist in Intuition. 61 | * @param string|null $lang 62 | * @return string 63 | */ 64 | public function getOcrLangName( ?string $lang = null ): string { 65 | return $this->tesseractEngine->getModelTitle( $lang ); 66 | } 67 | 68 | /** 69 | * Get the name of the given line detection model ID. 
70 | * @param string|null $lineIdLang 71 | * @return string 72 | */ 73 | public function getLineIdName( ?string $lineIdLang = null ): string { 74 | return $this->transkribusEngine->getLineIdModelName( $lineIdLang ); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /check_tesseract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | # Note, this assumes that the output of `tesseract --version` will remain consistent. 6 | MIN_TESSERACT_VERSION="tesseract 4" 7 | 8 | if [ -n "${DISABLE_TESSERACT_CHECK+placeholder}" ]; then 9 | echo "DISABLE_TESSERACT_CHECK is set, skipping tesseract check." 10 | exit 0 11 | fi 12 | 13 | echo "Checking tesseract installation" 14 | 15 | if ! type tesseract &> /dev/null; then 16 | echo "Tesseract not found!" 17 | exit 1 18 | else 19 | echo "Tesseract executable OK" 20 | fi 21 | 22 | # Similar to what tesseract-ocr-for-php does 23 | CUR_TESSERACT_VERSION=$(tesseract --version | head -n1 | sed "s/tesseract v/tesseract /") 24 | CUR_MIN_VERSION=$( echo -e "$MIN_TESSERACT_VERSION\n$CUR_TESSERACT_VERSION" | sort -V | head -n1 ) 25 | if [ "$CUR_MIN_VERSION" != "$MIN_TESSERACT_VERSION" ]; then 26 | echo "Tesseract version mismatch: current is ${CUR_TESSERACT_VERSION}, minimum required is ${MIN_TESSERACT_VERSION}" 27 | exit 1 28 | else 29 | echo "Tesseract version OK" 30 | fi 31 | 32 | # For the future, we might make languages optional; we'd probably have to cache the result of `tesseract --list-langs`. 33 | 34 | if type jq &> /dev/null; then 35 | # Sort both just in case, and remove duplicates from the expected list to account for google having more variants that 36 | # map to the same code in tesseract (e.g. zh and zh-hans) 37 | # Skip deu_latf as it's not installed by default yet (but will be in the future). 
38 | AVAILABLE_LANGS=$(tesseract --list-langs | tail -n +2 | sort) 39 | EXPECTED_LANGS=$(jq -r '.tesseract | keys | to_entries[] | .value' public/models.json | sort -u | sed "/^deu_latf$/d" ) 40 | 41 | EXTRA_LOCAL_LANGS=$( comm -23 <( echo "$AVAILABLE_LANGS" ) <( echo "$EXPECTED_LANGS" ) ) 42 | MISSING_LOCAL_LANGS=$( comm -13 <( echo "$AVAILABLE_LANGS" ) <( echo "$EXPECTED_LANGS" ) ) 43 | 44 | if [ -z "$MISSING_LOCAL_LANGS" ]; then 45 | echo "All expected languages are installed" 46 | else 47 | echo -e "The following required languages are not installed:\n$MISSING_LOCAL_LANGS" 48 | exit 1 49 | fi 50 | if [ -n "$EXTRA_LOCAL_LANGS" ]; then 51 | echo -e "The following languages are installed but not supported:\n$EXTRA_LOCAL_LANGS" 52 | fi 53 | else 54 | echo "jq is not installed, skipping validation of available languages" 55 | fi 56 | 57 | echo "All checks passed!" 58 | -------------------------------------------------------------------------------- /i18n/te.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Chaduvari" 5 | ] 6 | }, 7 | "title": "WikimediaOCR", 8 | "subtitle": "బొమ్మల నుండి పాఠ్యాన్ని ఎత్తిరాయండి", 9 | "form-heading": "ఓ బొమ్మను ఎత్తిరాయండి", 10 | "image-url": "బొమ్మ URL", 11 | "image-url-help": "$1 లాంటి ఏదైనా వికీమీడియా సర్వరులో ఉన్న బొమ్మ URL ఇవ్వండి:", 12 | "image-url-error": "బొమ్మ URL తప్పనిసరిగా {{PLURAL:$1|కింది డొమెయిను పేరుతో|కింది డొమెయిను పేర్లలో ఏదో ఒకదానితో}} మొదలై, ఒక సరైన ఫైలు ఎక్‌స్టెన్షనుతో ముగియాలి: $2", 13 | "image-alt-text": "ఒరిజినలు బొమ్మ", 14 | "language-code": "భాషలు (ఐచ్ఛికం)", 15 | "engine": "OCR ఇంజను", 16 | "engine-name-google": "Google Cloud Vision OCR", 17 | "engine-name-tesseract": "Tesseract OCR", 18 | "engine-not-found-warning": "అడిగిన ఇంజను '$1' కనబడలేదు. దాని బదులు డిఫాల్టు ఇంజను '$2' ను వాడుతున్నాం.", 19 | "engine-invalid-langs-warning": "కింది భాషలు చెల్లనివి, లేదా ఇంజను వాటికి మద్దతు ఇవ్వడం లేదు. 
అంచేత వాటిని పక్కన పెట్టాం: $1", 20 | "submit": "మొత్తం పేజీ నంతటినీ ఎత్తిరాయి", 21 | "submit-crop": "ఈ ప్రాంతాన్ని ఎత్తిరాయి", 22 | "drag-help": "క్రాప్ పరికరాన్ని ఎంచుకుని, కొంత ప్రాంతాన్ని మాత్రమే ఎత్తిరాయాలంటే, కింద ఉన్న బొమ్మపై ఒక దీర్ఘ చతురస్ర రూపాన్ని లాగండి.", 23 | "drag-mode-move": "లాగితే బొమ్మ కదులుతుంది", 24 | "drag-mode-move-alt": "'కదిలించు' చర్యను సూచించే ఐకను", 25 | "drag-mode-crop": "లాగితే కొత్త క్రాప్ ప్రాంతాన్ని సృష్టిస్తుంది", 26 | "drag-mode-crop-alt": "'క్రాప్' చర్యను సూచించే ఐకను", 27 | "copy-to-clipboard": "క్లిప్‌బోర్డుకు కాపీ చెయ్యి", 28 | "copied-to-clipboard": "కాపీ చేసాం!", 29 | "google-error": "గూగుల్ సేవ ఓ లోపాన్ని చూపించింది: $1", 30 | "image-retrieval-failed": "బొమ్మను తేవడం విఫలమైంది: $1", 31 | "documentation": "డాక్యుమెంటేషను", 32 | "api": "API", 33 | "api-tooltip": "API డాక్యుమెంటేషన్ను చూపించు", 34 | "version": "వెర్షను $1", 35 | "report-issue": "సమస్యను నివేదించండి", 36 | "langs-placeholder": "భాషను ఆటోమాటిగ్గా ఎంచుకునేందుకు ఖాళీగా వదిలెయ్యండి.", 37 | "langs-param-error": "కింది {{PLURAL:$1|భాషకు|భాషలకు}} OCR ఇంజను మద్దతు ఇవ్వదు: $2", 38 | "tesseract-options": "Tesseract వికల్పాలు", 39 | "tesseract-psm-7": "బొమ్మను ఒకే పాఠ్యపు పంక్తిగా భావించు.", 40 | "tesseract-psm-8": "బొమ్మను ఒకే పదంగా భావించు.", 41 | "tesseract-psm-9": "బొమ్మను ఒక వృత్తంలో ఉన్న ఒకే పదంగా భావించు.", 42 | "tesseract-psm-10": "బొమ్మను ఒకే కారెక్టరుగా భావించు.", 43 | "tesseract-internal-error": "టెస్సరాక్ట్ ఇంజను ఏదో అంతర్గత లోపాన్ని చూపించింది.", 44 | "transkribus-no-lang-error": "భాష దేన్నీ ఎంచుకోలేదు", 45 | "transkribus-multiple-lang-error": "బహుళ భాషలకు అనుమతి లేదు, ఒకటే భాషను ఇవ్వండి" 46 | } 47 | -------------------------------------------------------------------------------- /i18n/az.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Nemoralis", 5 | "Toghrul Rahimli", 6 | "Şeyx Şamil", 7 | "Əkrəm", 8 | "Əkrəm Cəfər" 9 | ] 10 | }, 11 | "title": "WikimediaOCR", 12 | 
"subtitle": "Şəkillərdən mətni transkripsiya edin", 13 | "form-heading": "Şəkli transkripsiya edin", 14 | "image-url": "Şəkil URL", 15 | "image-url-help": "Vikimedia serverində yerləşdirilən şəkil URL-ni daxil edin, məsələn: $1", 16 | "image-url-error": "Şəkil URL-i {{PLURAL:$1|aşağıdakı domen adı|aşağıdakı domen adlarından biri}} ilə başlamalıdır və etibarlı fayl uzantısı ilə bitməlidir: $2", 17 | "image-alt-text": "Orijinal şəkil", 18 | "language-code": "Dillər (qeyri-məcburi)", 19 | "engine": "OCR motoru", 20 | "engine-name-transkribus": "Transkribus OCR", 21 | "engine-not-found-warning": "Tələb olunan \"$1\" mühərriki tapılmadı. Əvəzində defolt mühərrik olan \"$2\" istifadə edin.", 22 | "engine-invalid-langs-warning": "Aşağıdakı dillər etibarsızdır və ya mühərrik tərəfindən dəstəklənmir və nəzərə alınmayıb: $1", 23 | "submit": "Bütün səhifəni transkripsiya et", 24 | "submit-crop": "Transkripsiya sahəsi", 25 | "drag-help": "Kəsmə alətini seçin və səhifənin yalnız bir sahəsini transkripsiya etmək üçün aşağıdakı şəkildə düzbucaqlı çəkin.", 26 | "drag-mode-move": "Sürüşdürdükdə şəkil hərəkət edəcək", 27 | "drag-mode-move-alt": "\"Yerini dəyiş\" hərəkətini təmsil edən ikona.", 28 | "drag-mode-crop": "Sürüşdürmək yeni kəsim sahəsi yaradacaq", 29 | "drag-mode-crop-alt": "\"Kəsmə\" hərəkətini təmsil edən ikona.", 30 | "copy-to-clipboard": "Mübadilə buferinə kopiyala", 31 | "copied-to-clipboard": "Kopyalandı!", 32 | "google-error": "Google xidməti xəta verdi: $1", 33 | "image-retrieval-failed": "Şəklin alınması uğursuz oldu: $1", 34 | "documentation": "Sənədləşdirmə", 35 | "api-tooltip": "API sənədləşdirməsinə bax", 36 | "version": "$1 versiyası", 37 | "report-issue": "Bir problem bildir", 38 | "langs-placeholder": "Avtomatik dil aşkarlanması üçün boş buraxın.", 39 | "langs-param-error": "Aşağıdakı {{PLURAL:$1|dil|dillər}} OCR mühərriki tərəfindən dəstəklənmir: $2", 40 | "loading-message": "Transkripsiya həyata keçirilir...", 41 | "tesseract-options": "Tesseract 
seçimləri", 42 | "tesseract-psm-label": "Səhifə bölmə metodları", 43 | "tesseract-psm-help": "Daha yaxşı çox sütunlu dəstək üçün \"Seyrək mətn\"i sınayın.", 44 | "tesseract-psm-0": "Yalnız oriyentasiya və skript aşkarlanması (OSD).", 45 | "tesseract-psm-1": "OSD ilə avtomatik səhifə bölünməsi.", 46 | "tesseract-psm-2": "Avtomatik səhifə bölünməsi, lakin OSD və ya OCR yoxdur. (həyata keçirilmir)" 47 | } 48 | -------------------------------------------------------------------------------- /i18n/lt.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Nokeoo" 5 | ] 6 | }, 7 | "subtitle": "Transkribuoti tekstą iš paveikslėlių", 8 | "form-heading": "Transkribuoti paveikslėlį", 9 | "image-url": "Paveikslėlio URL", 10 | "image-url-help": "Įterpkite paveikslėlio URL, kuris talpinamas Vikimedija serveryje, pvz.: $1", 11 | "image-url-error": "Paveikslėlio URL turi prasidėti su {{PLURAL:$1|šiuo domeno pavadinimu|su vienu iš šių domeno pavadinimų}} ir baigtis su galimu failo plėtiniu: $2", 12 | "image-alt-text": "Originalus paveikslėlis", 13 | "language-code": "Kalbos (nebūtina)", 14 | "engine": "OCR variklis", 15 | "engine-not-found-warning": "Prašomas variklis '$1' nerastas. 
Vietoj to, naudojamas numatytasis variklis '$2'.", 16 | "engine-invalid-langs-warning": "Šios kalbos negalimos arba nepalaikomos variklio ir todėl buvo ignoruojamos: $1", 17 | "submit": "Transkribuoti visą puslapį", 18 | "submit-crop": "Transkribuoti plotą", 19 | "drag-help": "Pasirinkite apkarpymo įrankį ir nupieškite stačiakampį paveikslėlyje žemiau, kad transkribuotumėte tik puslapio plotą.", 20 | "drag-mode-move": "Tempimas perkels paveikslėlį", 21 | "drag-mode-move-alt": "Ikona, nurodanti perkėlimo veiksmą.", 22 | "drag-mode-crop": "Tempimas sukurs nauja iškarpos plotą", 23 | "drag-mode-crop-alt": "Ikona, nurodanti kirpimo veiksmą.", 24 | "copy-to-clipboard": "Kopijuoti į iškarpinę", 25 | "copied-to-clipboard": "Nukopijuota!", 26 | "google-error": "Google paslauga grąžino klaidą: $1", 27 | "image-retrieval-failed": "Paveikslėlio gavimas nepavyko: $1", 28 | "documentation": "Dokumentacija", 29 | "api-tooltip": "Žiūrėti API dokumentaciją", 30 | "version": "Versija $1", 31 | "report-issue": "Pranešti apie problemą", 32 | "langs-placeholder": "Palikite tuščia, kad kalba būtų nustatyta automatiškai.", 33 | "langs-param-error": "{{PLURAL:$1|Ši kalba nepalaikoma|Šios kalbos nepalaikomos}} OCR variklio: $2", 34 | "tesseract-psm-7": "Laikyti paveikslėlį vienos eilutės tekstu.", 35 | "tesseract-psm-8": "Laikyti paveikslėlį vienu žodžiu.", 36 | "tesseract-psm-9": "Laikyti paveikslėlį vienu žodžiu apskritime.", 37 | "tesseract-psm-10": "Laikyti paveikslėlį vienu simboliu.", 38 | "transkribus-language-code": "Kalbos modelis", 39 | "transkribus-unauthorized-error": "Klaidos kodas '$1' :: prašymas neleistinas", 40 | "transkribus-default-error": "Klaidos kodas '$1' :: nepavyko užbaigti prašymo, bandykite dar kartą!", 41 | "transkribus-no-lang-error": "Nepasirinkta jokia kalba", 42 | "transkribus-multiple-lang-error": "Kelios kalbos neleidžiamos, nurodykite vieną kalbą", 43 | "transkribus-line-id-none-option": "Nėra" 44 | } 45 | 
-------------------------------------------------------------------------------- /config/services.yaml: -------------------------------------------------------------------------------- 1 | # This file is the entry point to configure your own services. 2 | # Files in the packages/ subdirectory configure your dependencies. 3 | 4 | # Put parameters here that don't need to change on each machine where the app is deployed 5 | # https://symfony.com/doc/current/best_practices/configuration.html#application-related-configuration 6 | parameters: 7 | cache_ttl: '%env(APP_CACHE_TTL)%' 8 | 9 | services: 10 | # default configuration for services in *this* file 11 | _defaults: 12 | autowire: true # Automatically injects dependencies in your services. 13 | autoconfigure: true # Automatically registers your services as commands, event subscribers, etc. 14 | 15 | # makes classes in src/ available to be used as services 16 | # this creates a service per class whose id is the fully-qualified class name 17 | App\: 18 | resource: '../src/' 19 | exclude: 20 | - '../src/DependencyInjection/' 21 | - '../src/Kernel.php' 22 | - '../src/Tests/' 23 | 24 | # controllers are imported separately to make sure services can be injected 25 | # as action arguments even if you don't extend any base controller class 26 | App\Controller\: 27 | resource: '../src/Controller/' 28 | tags: ['controller.service_arguments'] 29 | 30 | # https://symfony.com/doc/current/service_container/parent_services.html 31 | App\Engine\EngineBase: 32 | arguments: 33 | $projectDir: '%kernel.project_dir%' 34 | calls: 35 | - setImageHosts: [ '%env(APP_IMAGE_HOSTS)%' ] 36 | 37 | App\Engine\TesseractEngine: 38 | parent: App\Engine\EngineBase 39 | 40 | App\Engine\GoogleCloudVisionEngine: 41 | parent: App\Engine\EngineBase 42 | arguments: 43 | $keyFile: '%env(APP_GOOGLE_KEYFILE)%' 44 | 45 | App\Engine\TranskribusEngine: 46 | parent: App\Engine\EngineBase 47 | 48 | App\Engine\TranskribusClient: 49 | arguments: 50 | $username: 
'%env(APP_TRANSKRIBUS_USERNAME)%' 51 | $password: '%env(APP_TRANSKRIBUS_PASSWORD)%' 52 | 53 | App\EventListener\ExceptionListener: 54 | arguments: 55 | - '@request_stack' 56 | - '@twig' 57 | - '@Krinkle\Intuition\Intuition' 58 | - '@monolog.logger.tesseract' 59 | tags: 60 | - { name: kernel.event_listener, event: kernel.exception } 61 | 62 | # Vendor services for autowiring 63 | thiagoalessio\TesseractOCR\TesseractOCR: 64 | 65 | # please note that last definitions always *replace* previous ones 66 | # add more service definitions when explicit configuration is needed 67 | -------------------------------------------------------------------------------- /webpack.config.js: -------------------------------------------------------------------------------- 1 | const Encore = require('@symfony/webpack-encore'); 2 | 3 | // Manually configure the runtime environment if not already configured yet by the "encore" command. 4 | // It's useful when you use tools that rely on webpack.config.js file. 5 | if (!Encore.isRuntimeEnvironmentConfigured()) { 6 | Encore.configureRuntimeEnvironment(process.env.NODE_ENV || 'dev'); 7 | } 8 | 9 | Encore 10 | // directory where compiled assets will be stored 11 | .setOutputPath('public/build/') 12 | // public path used by the web server to access the output path 13 | .setPublicPath('/build') 14 | // only needed for CDN's or sub-directory deploy 15 | //.setManifestKeyPrefix('build/') 16 | 17 | .copyFiles({ 18 | from: './assets/images', 19 | to: 'images/[path][name].[ext]' 20 | }) 21 | 22 | /* 23 | * ENTRY CONFIG 24 | * 25 | * Each entry will result in one JavaScript file (e.g. app.js) 26 | * and one CSS file (e.g. app.css) if your JavaScript imports CSS. 27 | */ 28 | .addEntry('app', './assets/app.js') 29 | 30 | // When enabled, Webpack "splits" your files into smaller pieces for greater optimization. 
31 | .splitEntryChunks() 32 | 33 | // will require an extra script tag for runtime.js 34 | // but, you probably want this, unless you're building a single-page app 35 | .enableSingleRuntimeChunk() 36 | 37 | /* 38 | * FEATURE CONFIG 39 | * 40 | * Enable & configure other features below. For a full 41 | * list of features, see: 42 | * https://symfony.com/doc/current/frontend.html#adding-more-features 43 | */ 44 | .cleanupOutputBeforeBuild() 45 | .enableBuildNotifications() 46 | .enableSourceMaps(!Encore.isProduction()) 47 | // enables hashed filenames (e.g. app.abc123.css) 48 | .enableVersioning(Encore.isProduction()) 49 | 50 | .configureBabel((config) => { 51 | config.plugins.push('@babel/plugin-proposal-class-properties'); 52 | }) 53 | 54 | // enables @babel/preset-env polyfills 55 | .configureBabelPresetEnv((config) => { 56 | config.useBuiltIns = 'usage'; 57 | config.corejs = 3; 58 | }) 59 | 60 | // enables Sass/SCSS support 61 | //.enableSassLoader() 62 | 63 | // uncomment if you use TypeScript 64 | //.enableTypeScriptLoader() 65 | 66 | // uncomment if you use React 67 | //.enableReactPreset() 68 | 69 | // uncomment to get integrity="..." attributes on your script & link tags 70 | // requires WebpackEncoreBundle 1.4 or higher 71 | //.enableIntegrityHashes(Encore.isProduction()) 72 | 73 | // uncomment if you're having problems with a jQuery plugin 74 | //.autoProvidejQuery() 75 | ; 76 | 77 | module.exports = Encore.getWebpackConfig(); 78 | -------------------------------------------------------------------------------- /tests/Controller/OcrControllerTest.php: -------------------------------------------------------------------------------- 1 | push( $request ); 35 | $request->setSession( new Session( new MockArraySessionStorage() ) ); 36 | $intuition = new Intuition( [] ); 37 | $gcv = new GoogleCloudVisionEngine( 38 | dirname( __DIR__ ) . 
'/fixtures/google-account-keyfile.json', 39 | $intuition, 40 | $this->projectDir, 41 | new MockHttpClient() 42 | ); 43 | $controller = new OcrController( 44 | $requestStack, 45 | $intuition, 46 | new EngineFactory( 47 | $gcv, 48 | new TesseractEngine( new MockHttpClient(), $intuition, $this->projectDir, new TesseractOCR() ), 49 | new TranskribusEngine( 50 | new TranskribusClient( 51 | getenv( 'APP_TRANSKRIBUS_USERNAME' ), 52 | getenv( 'APP_TRANSKRIBUS_PASSWORD' ), 53 | new MockHttpClient(), 54 | new NullAdapter(), 55 | new NullAdapter() 56 | ), 57 | $intuition, 58 | $this->projectDir, 59 | new MockHttpClient() 60 | ), 61 | ), 62 | new FilesystemAdapter() 63 | ); 64 | $this->assertSame( $expectedLangs, $controller->getLangs( $request ) ); 65 | } 66 | 67 | /** 68 | * @return mixed[] 69 | */ 70 | public function provideGetLang(): array { 71 | return [ 72 | [ 73 | [ 'lang' => 'ar' ], 74 | [ 'ar' ], 75 | ], 76 | [ 77 | [ 'langs' => [ 'a|b', 'c!', 'ab' ] ], 78 | [ 'ab', 'c' ], 79 | ], 80 | 'special characters' => [ 81 | [ 'langs' => [ 'sr-Latn', 'Canadian_Aboriginal' ] ], 82 | [ 'sr-Latn', 'Canadian_Aboriginal' ], 83 | ], 84 | 'numbers' => [ 85 | [ 'langs' => [ 'ru-petr1708' ] ], 86 | [ 'ru-petr1708' ], 87 | ], 88 | ]; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /i18n/cs.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Georg101" 5 | ] 6 | }, 7 | "title": "Wikimédia OCR", 8 | "subtitle": "Přepis textu z obrázků", 9 | "form-heading": "Přepsat obrázek", 10 | "image-url": "Adresa URL obrázku", 11 | "image-url-help": "Vložte adresu URL obrázku na server Wikimedia, například: $1", 12 | "image-url-error": "URL obrázku musí začínat na {{PLURAL:$1| nasledující název domény | jeden z následujících názvů domén}} a končit platnou príponou souboru: $2", 13 | "image-alt-text": "Původní obrázek", 14 | "language-code": "Jazyky (volitelné)", 15 | 
"engine": "engine OCR", 16 | "engine-not-found-warning": "Požadovaný modul „ $1 “ nebyl nalezen. Místo toho se používá výchozí modul „ $2 “.", 17 | "engine-invalid-langs-warning": "Následující jazyky jsou neplatné nebo nepodporované a byly ignorované: $1", 18 | "submit": "Přepsat", 19 | "copy-to-clipboard": "Kopírovat do schránky", 20 | "copied-to-clipboard": "Zkopírováno!", 21 | "google-error": "Služba Google vrátila chybu: $1", 22 | "image-retrieval-failed": "Načítání obrázku selhalo: $1", 23 | "documentation": "Dokumentace", 24 | "version": "Verze $1", 25 | "report-issue": "Nahlásit problém", 26 | "langs-placeholder": "Pro automatickou detekci jazyka nechejte pole prázdné.", 27 | "langs-param-error": "{{PLURAL:$1|Nasledující jazyk není podporovaný|Nasledující jazyky nejsou podporované}} modulem OCR: $2", 28 | "tesseract-options": "Možnosti Tesseractu", 29 | "tesseract-psm-label": "Metoda segmentace stránek", 30 | "tesseract-psm-help": "Vyzkoušejte „Řídký text“ pro lepší podporu vícero sloupců.", 31 | "tesseract-psm-0": "Jenom orientace a detekce skriptu (OSD).", 32 | "tesseract-psm-1": "Automatická segmentace stránek pomocí OSD.", 33 | "tesseract-psm-2": "Automatická segmentace stránek, ale bez OSD nebo OCR. (není implementováno)", 34 | "tesseract-psm-3": "Plně automatická segmentace stránek, ale bez OSD. (Výchozí)", 35 | "tesseract-psm-4": "Předpokládaný jeden sloupec textu s proměnlivou velikostí.", 36 | "tesseract-psm-5": "Předpokládaný jeden jednotný blok vertikálně zarovnaného textu.", 37 | "tesseract-psm-6": "Předpokládaný jeden jednotný blok textu.", 38 | "tesseract-psm-7": "Zacházet s obrázkem jako s jedním řádkem textu.", 39 | "tesseract-psm-8": "Zacházet s obrázkem jako s jedním slovem.", 40 | "tesseract-psm-9": "Zacházet s obrázkem jako s jedním slovem v kruhu.", 41 | "tesseract-psm-10": "Zacházet s obrázkem jako s jedním znakem.", 42 | "tesseract-psm-11": "Řídký text. 
Najděte co nejvíce textu v rozházeném pořadí.", 43 | "tesseract-psm-12": "Řídký text s OSD.", 44 | "tesseract-psm-13": "Syrový řádek. Zacházet s obrázkem jako s jedním řádkem textu a obcházet hacky, které jsou specifické pro Tesseract.", 45 | "tesseract-param-error": "Tesseract nepodporuje možnost ' $1 ' s hodnotou $2. Maximální hodnota: $3", 46 | "tesseract-internal-error": "Engine tesseractu vrátil interní chybu." 47 | } 48 | -------------------------------------------------------------------------------- /i18n/pl.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Ankry", 5 | "Chrumps", 6 | "Darellur", 7 | "Frozengeist", 8 | "Strebski", 9 | "WaldiSt", 10 | "Woytecr" 11 | ] 12 | }, 13 | "title": "WikimediaOCR", 14 | "subtitle": "Narzędzie do transkrypcji tekstu z skanowanych obrazów na Wikimedia Commons, do użytku na Wikisource i w innych miejscach.", 15 | "form-heading": "Wyodrębnij tekst", 16 | "image-url": "URL obrazu", 17 | "image-url-help": "Podaj adres URL obrazka na serwerze Wikimedia, takim jak $1", 18 | "image-alt-text": "Obrazek źródłowy", 19 | "language-code": "Języki (opcjonalnie)", 20 | "engine": "Silnik OCR", 21 | "engine-name-transkribus": "Transkribus OCR", 22 | "engine-not-found-warning": "Żądany silnik '$1' jest niedostępny. 
Będzie użyty silnik '$2'.", 23 | "engine-invalid-langs-warning": "Następujące języki są nieprawidłowe lub niewspierane i zostaną zignorowane: $1", 24 | "submit": "Wyodrębnij tekst z całej strony", 25 | "submit-crop": "Wyodrębnij z obszaru", 26 | "drag-help": "Aktywuj narzędzie przycinania i zaznacz na poniższym obrazku obszar, z którego będzie wyodrębniony tekst.", 27 | "drag-mode-move": "Przeciąganie przesunie obraz", 28 | "drag-mode-move-alt": "Ikona reprezentująca działanie „przesuń”.", 29 | "drag-mode-crop": "Przeciąganie utworzy nowy obszar przycinania", 30 | "drag-mode-crop-alt": "Ikona reprezentująca działanie „przytnij”.", 31 | "copy-to-clipboard": "Kopiuj do schowka", 32 | "copied-to-clipboard": "Skopiowano!", 33 | "google-error": "Usługa Google zgłosiła błąd: $1", 34 | "image-retrieval-failed": "Nie udało się pobrać obrazka: $1", 35 | "documentation": "Dokumentacja", 36 | "api-tooltip": "Dokumentacja API", 37 | "version": "Wersja $1", 38 | "report-issue": "Zgłoś problem", 39 | "langs-placeholder": "Pozostaw puste aby automatycznie rozpoznać język.", 40 | "loading-message": "Wykonywanie transkrypcji...", 41 | "tesseract-options": "Opcje tesseract", 42 | "tesseract-psm-label": "Metoda segmentacji stron", 43 | "tesseract-psm-1": "Automatyczna segmentacja stron z OSD.", 44 | "tesseract-psm-6": "Załóż jeden jednolity blok tekstu.", 45 | "tesseract-psm-7": "Traktuj obraz jako jedną linię tekstu.", 46 | "tesseract-psm-8": "Traktuj obraz jako jedno słowo.", 47 | "tesseract-psm-9": "Traktuj obraz jako jedno słowo w okręgu.", 48 | "tesseract-psm-10": "Traktuj obraz jako jeden znak.", 49 | "tesseract-internal-error": "Silnik tesseract zgłosił błąd wewnętrzny.", 50 | "transkribus-language-code": "Model Języka", 51 | "transkribus-unauthorized-error": "Kod błędu '$1' :: Żądanie nie jest autoryzowane", 52 | "transkribus-default-error": "Kod błędu '$1' :: Nie można zrealizować żądania, spróbuj ponownie!", 53 | "transkribus-browse-public-models": "Przeglądaj wszystkie 
modele języka publicznego dla Transkribus", 54 | "transkribus-request-for-model": "Złóż wniosek o dodanie modelu z Transkribus do narzędzia OCR", 55 | "transkribus-line-id-none-option": "Żaden", 56 | "transkribus-job-state": "Stan", 57 | "transkribus-job-description": "Opis", 58 | "transkribus-job-start": "Rozpoczęto", 59 | "transkribus-job-end": "Zakończono", 60 | "transkribus-job-waited": "Opóźnienie startu (minuty)" 61 | } 62 | -------------------------------------------------------------------------------- /i18n/ru.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "DDPAT", 5 | "Ice bulldog", 6 | "Kareyac", 7 | "Okras", 8 | "Pplex.vhs", 9 | "Smigles", 10 | "Thothsum" 11 | ] 12 | }, 13 | "title": "ВикимедиаOCR", 14 | "subtitle": "Инструмент для распознавания текста из отсканированных изображений с Викисклада для использования в Викитеке и других местах.", 15 | "form-heading": "Расшифровать изображение", 16 | "image-url": "URL изображения", 17 | "image-url-help": "Вставьте URL-адрес изображения, размещенного на сервере Викимедиа, например: $1", 18 | "image-url-error": "URL изображения должно начинаться со {{PLURAL:$1|следующих доменных имён}} и оканчиваться допустимым расширением файла: $2", 19 | "image-alt-text": "Исходное изображение", 20 | "language-code": "Языки (необязательно)", 21 | "engine": "Движок OCR", 22 | "engine-name-transkribus": "Transkribus OCR", 23 | "engine-not-found-warning": "Запрошенный движок '$1' не найден. 
Вместо этого используется движок по умолчанию — '$2'.", 24 | "engine-invalid-langs-warning": "Следующие языки недействительны или не поддерживаются движком и были проигнорированы: $1", 25 | "submit": "Транскрибировать всю страницу", 26 | "submit-crop": "Распознать область", 27 | "drag-help": "Выберите инструмент обрезки и перетащите прямоугольник на изображение ниже, чтобы распознать только одну область страницы.", 28 | "drag-mode-move": "Перетаскивание переместит изображение.", 29 | "copy-to-clipboard": "Скопировать в буфер обмена", 30 | "copied-to-clipboard": "Скопировано!", 31 | "google-error": "Служба Google вернула ошибку: $1", 32 | "image-retrieval-failed": "Не удалось получить изображение: $1", 33 | "documentation": "Документация", 34 | "api-tooltip": "Просмотреть документацию по API", 35 | "version": "Версия $1", 36 | "report-issue": "Сообщить об ошибке", 37 | "langs-placeholder": "Оставьте поле пустым для автоматического определения языка.", 38 | "langs-param-error": "Следующие {{PLURAL:$1|языки}} не поддерживаются движком ОРТ: $2", 39 | "loading-message": "Выполнение распознавания…", 40 | "tesseract-options": "Настройки Tesseract", 41 | "tesseract-psm-label": "Метод сегментации страницы", 42 | "tesseract-psm-1": "Автоматическая сегментация страниц с экранным меню.", 43 | "tesseract-psm-7": "Рассматривать изображение как одну текстовую строку.", 44 | "tesseract-psm-8": "Рассматривать изображение как одно слово.", 45 | "tesseract-psm-10": "Рассматривать изображение как один символ.", 46 | "tesseract-psm-12": "Разрезанный текст с экранным меню.", 47 | "tesseract-internal-error": "Механизм тессеракта возвратил внутреннюю ошибку.", 48 | "transkribus-language-code": "Языковая модель", 49 | "transkribus-empty-response-error": "Не удалось проанализировать результат из API Transkribus", 50 | "transkribus-no-lang-error": "Язык не выбран", 51 | "transkribus-multiple-lang-error": "Нельзя использовать несколько языков, выберите один язык", 52 | "transkribus-options": 
"Настройки Transkribus", 53 | "transkribus-job-id": "Идентификатор задачи", 54 | "transkribus-job-state": "Состояние", 55 | "transkribus-job-description": "Описание", 56 | "transkribus-job-start": "Начато", 57 | "transkribus-job-end": "Завершено", 58 | "transkribus-job-waited": "Задержка запуска (в минутах)" 59 | } 60 | -------------------------------------------------------------------------------- /i18n/vi.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Bapham123", 5 | "Minh Nguyen", 6 | "Nguyễn Mạnh An", 7 | "Vinhtantran" 8 | ] 9 | }, 10 | "title": "WikimediaOCR", 11 | "subtitle": "Tách văn bản từ hình", 12 | "form-heading": "Tách văn bản từ hình", 13 | "image-url": "URL hình", 14 | "image-url-help": "Nhập địa chỉ URL của tập tin hình ảnh được lưu trữ trên máy chủ Wikimedia chẳng hạn như: $1", 15 | "image-url-error": "Địa chỉ URL của hình phải bắt đầu bằng {{PLURAL:$1|tên miền sau|một trong các tên miền sau}} và kết thúc bằng một phần mở rộng tập tin hợp lệ: $2", 16 | "image-alt-text": "Hình ảnh gốc", 17 | "language-code": "Các ngôn ngữ (tùy chọn):", 18 | "engine": "Bộ máy OCR", 19 | "engine-not-found-warning": "Không tìm thấy bộ máy ‘$1’ được yêu cầu. 
Hãy dùng bộ máy mặc định ‘$2’.", 20 | "engine-invalid-langs-warning": "Các ngôn ngữ sau không hợp lệ hoặc bị bỏ qua vì bộ máy không hỗ trợ: $1", 21 | "submit": "Tách văn bản", 22 | "drag-mode-move-alt": "Biểu tượng biểu thị cho hành động 'di chuyển'.", 23 | "drag-mode-crop-alt": "Biểu tượng biểu thị cho hành động 'cắt xén'.", 24 | "copy-to-clipboard": "Chép vào bảng tạm", 25 | "copied-to-clipboard": "Đã sao chép!", 26 | "google-error": "Dịch vụ Google trả về lỗi: $1", 27 | "image-retrieval-failed": "Truy xuất ảnh thất bại: $1", 28 | "documentation": "Tài liệu hướng dẫn", 29 | "api": "API", 30 | "api-tooltip": "Xem tài liệu API", 31 | "version": "Phiên bản $1", 32 | "report-issue": "Báo cáo lỗi", 33 | "langs-placeholder": "Để trống để tự động phát hiện ngôn ngữ.", 34 | "langs-param-error": "{{PLURAL:$1|Ngôn ngữ|Các ngôn ngữ}} sau không được bộ máy OCR hỗ trợ: $2", 35 | "tesseract-options": "Tùy chọn cho Tesseract", 36 | "tesseract-psm-label": "Phương pháp phân đoạn trang", 37 | "tesseract-psm-help": "Hãy thử “Văn bản lẻ tẻ” để được hỗ trợ bố trí nhiều cột chính xác hơn.", 38 | "tesseract-psm-0": "Chỉ Phát hiện hướng viết và kiểu chữ (Orientation and script detection - OSD).", 39 | "tesseract-psm-1": "Phân đoạn trang tự động bằng OSD.", 40 | "tesseract-psm-2": "Phân đoạn trang tự động, nhưng không dùng OSD lẫn OCR. (chưa hiện thực)", 41 | "tesseract-psm-3": "Phân đoạn trang tự động hoàn toàn, nhưng không dùng OSD. (Mặc định)", 42 | "tesseract-psm-4": "Giả định là một cột văn bản với kích thước thay đổi.", 43 | "tesseract-psm-5": "Giả định là một khối đồng nhất với văn bản canh dọc.", 44 | "tesseract-psm-6": "Giả định là một khối văn bản đồng nhất.", 45 | "tesseract-psm-7": "Xem ảnh như một dòng văn bản duy nhất.", 46 | "tesseract-psm-8": "Xem ảnh như một từ duy nhất.", 47 | "tesseract-psm-9": "Xem ảnh như một từ duy nhất trong vòng tròn.", 48 | "tesseract-psm-10": "Xem ảnh như một ký tự duy nhất.", 49 | "tesseract-psm-11": "Văn bản lẻ tẻ. 
Tìm càng nhiều văn bản càng tốt không cần theo thứ tự cụ thể.", 50 | "tesseract-psm-12": "Dò văn bản bằng OSD.", 51 | "tesseract-psm-13": "Dòng thô. Xem ảnh như một dòng văn bản duy nhất, bỏ qua những mẹo đặc thù của Tesseract.", 52 | "tesseract-param-error": "Tùy chọn ‘$1’ với giá trị ‘$2’ không được Tesseract hỗ trợ. Giá trị tối đa: $3", 53 | "tesseract-internal-error": "Bộ máy tesseract trả về lỗi nội bộ.", 54 | "transkribus-no-lang-error": "Không có ngôn ngữ nào được chọn", 55 | "transkribus-line-id-none-option": "Không có" 56 | } 57 | -------------------------------------------------------------------------------- /i18n/sv.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "DraconicDark", 5 | "Sabelöga", 6 | "WikiPhoenix" 7 | ] 8 | }, 9 | "title": "Wikimedia OCR", 10 | "subtitle": "Transkribera text från bilder", 11 | "form-heading": "Transkribera en bild", 12 | "image-url": "Bild-URL", 13 | "image-url-help": "Ange en bild-URL som finns på en Wikimedia som exempelvis: $1", 14 | "image-url-error": "Bild-URL måste börja med {{PLURAL:$1|följande domännamn|en av följande domännamn}} och avslutas med ett giltigt filtillägg: $2", 15 | "image-alt-text": "Originalbilden", 16 | "language-code": "Språk (valfri)", 17 | "engine": "OCR-motor", 18 | "engine-not-found-warning": "Den begärda motorn '$1' hittades inte. 
Använd standardmotorn '$2' i stället.", 19 | "engine-invalid-langs-warning": "Följande språk är ogiltiga eller så stöds de inte av motorn och ignorerades: $1", 20 | "submit": "Transkribera hela sidan", 21 | "submit-crop": "Transkribera område", 22 | "drag-help": "Välj klippningsverktyget och rita en rektangel i bilden nedan för att bara transkribera en bit av sidan.", 23 | "drag-mode-move": "Drar du på bilden flyttas den", 24 | "drag-mode-move-alt": "Ikonen för åtgärden \"flytta\".", 25 | "drag-mode-crop": "Genom att dra kommer ett nytt klippningsområde skapas", 26 | "drag-mode-crop-alt": "Ikonen för åtgärden \"klipp\".", 27 | "copy-to-clipboard": "Kopiera till urklipp", 28 | "copied-to-clipboard": "Kopierades!", 29 | "google-error": "Google-tjänsten returnerade ett fel: $1", 30 | "image-retrieval-failed": "Bilden hittades inte: $1", 31 | "documentation": "Dokumentation", 32 | "api-tooltip": "Visa API-dokumentationen", 33 | "version": "Version $1", 34 | "report-issue": "Rapportera ett problem", 35 | "langs-placeholder": "Lämna som tom för automatisk språkidentifiering.", 36 | "langs-param-error": "Följande {{PLURAL:$1|språk}} stöds inte av OCR-motorn: $2", 37 | "tesseract-options": "Tesseract-alternativ", 38 | "tesseract-psm-label": "Metod för att segmentera sidan", 39 | "tesseract-psm-help": "Testa \"Gles text\" för bättre stöd för fler kolumner.", 40 | "tesseract-psm-0": "Bara orientering och skriptdetektering (OSD).", 41 | "tesseract-psm-1": "Automatisk sidsegmentering med OSD.", 42 | "tesseract-psm-2": "Automatisk segmentering av sida, men ingen OSD eller OCR. (inte implementerad)", 43 | "tesseract-psm-3": "Fullständig segmentering av sida, men ingen OSD. 
(Standard)", 44 | "tesseract-psm-4": "Anta en textkolumn i olika storlekar.", 45 | "tesseract-psm-5": "Anta ett enhetlig block med lodrät fixerad text.", 46 | "tesseract-psm-6": "Anta ett enhetligt textblock.", 47 | "tesseract-psm-7": "Behandla bilden som en textrad.", 48 | "tesseract-psm-8": "Behandla bilden som ett ord.", 49 | "tesseract-psm-9": "Behandla bilden som ett ord i en cirkel.", 50 | "tesseract-psm-10": "Behandla bilden som ett tecken.", 51 | "tesseract-psm-11": "Gles text. Hitta så mycket text som möjligt utan en bestämd ordning.", 52 | "tesseract-psm-12": "Gles text med OSD.", 53 | "tesseract-psm-13": "Rå rad. Behandla bilden som en textrad, förbigå hackningar som är Tesseract-specifika.", 54 | "tesseract-param-error": "Alternativet \"$1\" med ett värde av $2 stöds inte av Tesseract. Högsta värdet: $3", 55 | "tesseract-internal-error": "Tesseract-motorn returnerade ett internt fel." 56 | } 57 | -------------------------------------------------------------------------------- /i18n/zh-hant.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Cookai1205", 5 | "Kly", 6 | "Winston Sung", 7 | "捍粵者" 8 | ] 9 | }, 10 | "title": "維基媒體 OCR", 11 | "subtitle": "從圖片轉譯成文字", 12 | "form-heading": "轉譯一張圖片", 13 | "image-url": "圖像 URL", 14 | "image-url-help": "請插入一個托管在維基媒體伺服器的圖片 URL,例如像是:$1", 15 | "image-url-error": "圖片 URL必須以{{PLURAL:$1|以下網域名稱|以下網域名稱之一}}為開頭,並且要以有效的副檔名作為結尾:$2", 16 | "image-alt-text": "原始圖片", 17 | "language-code": "語言(非必填)", 18 | "engine": "OCR 引擎", 19 | "engine-name-transkribus": "Transkribus OCR", 20 | "engine-not-found-warning": "未找到請求的引擎「$1」。改使用預設引擎「$2」。", 21 | "engine-invalid-langs-warning": "以下語言無效或是不被引擎支援而被忽略:$1", 22 | "submit": "轉譯整個頁面", 23 | "submit-crop": "轉譯區域", 24 | "drag-help": "選擇裁剪工具,並在圖片下方拖曳一個矩形,來僅轉譯頁面上的單一區域。", 25 | "drag-mode-move": "拖曳將會移動圖片", 26 | "drag-mode-move-alt": "代表「移動」操作的圖示。", 27 | "drag-mode-crop": "拖曳將會產生新的裁剪區域", 28 | "drag-mode-crop-alt": 
"代表「裁剪」操作的圖示。", 29 | "copy-to-clipboard": "複製到剪貼簿", 30 | "copied-to-clipboard": "已複製!", 31 | "google-error": "Google 服務回傳一個錯誤:$1", 32 | "image-retrieval-failed": "圖片取回失敗:$1", 33 | "documentation": "文件", 34 | "api-tooltip": "檢視 API 文件", 35 | "version": "版本 $1", 36 | "report-issue": "問題回報", 37 | "langs-placeholder": "留空以自動偵測語言", 38 | "langs-param-error": "以下{{PLURAL:$1|語言}}不被 OCR 引擎支援:$2", 39 | "loading-message": "正在執行轉譯…", 40 | "tesseract-options": "Tesseract 選項", 41 | "tesseract-psm-label": "頁面拆分方式", 42 | "tesseract-psm-help": "嘗試「稀疏文字」來獲得較好的多行支援。", 43 | "tesseract-psm-0": "僅方向與文字檢測(OSD)。", 44 | "tesseract-psm-1": "以 OSD 來自動拆分頁面。", 45 | "tesseract-psm-2": "自動拆分頁面,但不使用 OSD 或是 OCR。(尚未實現)", 46 | "tesseract-psm-3": "全自動頁面拆分,但不使用 OSD。(預設)", 47 | "tesseract-psm-4": "視為文字內容長度可變的單一行。", 48 | "tesseract-psm-5": "視為有一個垂直文字對齊的均勻文字區塊。", 49 | "tesseract-psm-6": "視為有一個均勻文字區塊。", 50 | "tesseract-psm-7": "將圖片視為單行文字。", 51 | "tesseract-psm-8": "將圖片視為單一字詞。", 52 | "tesseract-psm-9": "將圖片視為環繞狀的單一字詞。", 53 | "tesseract-psm-10": "將圖片視為單一字元。", 54 | "tesseract-psm-11": "稀疏文字。以沒有特定的順序來盡可能找出文字。", 55 | "tesseract-psm-12": "以 OSD 來稀疏文字。", 56 | "tesseract-psm-13": "原始行。將圖片視為一行文字,繞過特定於 Tesseract 的駭客攻擊。", 57 | "tesseract-param-error": "帶有值$2的選項「$1」不被 Tesseract 支援。最大值為:$3", 58 | "tesseract-no-text-error": "Tesseract 引擎沒有回傳此圖片的任何文字。", 59 | "tesseract-internal-error": "Tesseract 引擎返回一個內部錯誤。", 60 | "transkribus-language-code": "語言模型", 61 | "transkribus-unauthorized-error": "錯誤代碼 '$1' :: 未經授權的請求", 62 | "transkribus-default-error": "錯誤代碼 '$1' :: 無法完成請求,請重試!", 63 | "transkribus-empty-response-error": "無法解析來自 Transkribus API 的結果", 64 | "transkribus-init-process-error": "無法初始化 Transkribus 程序", 65 | "transkribus-failed-process-error": "Transkribus 程序失敗", 66 | "transkribus-no-lang-error": "未選擇語言", 67 | "transkribus-multiple-lang-error": "不允許多個語言,請指定一種語言", 68 | "transkribus-browse-public-models": "瀏覽 Transkribus 的所有公開語言模型", 69 | "transkribus-request-for-model": "請求從 Transkribus 添加一個模型到 OCR 工具", 70 | 
"transkribus-options": "Transkribus 選項", 71 | "transkribus-line-label": "文字行檢測模型", 72 | "transkribus-line-id-none-option": "無", 73 | "transkribus-mixed-line-option": "混合直線方向", 74 | "transkribus-line-help": "如果您不確定要使用哪種直線檢測模型,請留空", 75 | "transkribus-jobs": "Transkribus 任務", 76 | "transkribus-job-id": "任務 ID", 77 | "transkribus-job-state": "狀態", 78 | "transkribus-job-description": "描述", 79 | "transkribus-job-start": "已啟動", 80 | "transkribus-job-end": "已完成", 81 | "transkribus-job-waited": "啟動延遲(分鐘)" 82 | } 83 | -------------------------------------------------------------------------------- /i18n/sk.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Yardom78" 5 | ] 6 | }, 7 | "title": "Wikimédia OCR", 8 | "subtitle": "Prepis textu z obrázkov", 9 | "form-heading": "Urobiť prepis z obrázka", 10 | "image-url": "URL obrázka", 11 | "image-url-help": "Vložte adresu URL obrázka na serveri Wikimedia, napríklad: $1", 12 | "image-url-error": "URL obrázka musí začínať na {{PLURAL:$1| nasledujúci názov domény | jeden z nasledujúcich názvov domén}} a končiť platnou príponou súboru: $2", 13 | "image-alt-text": "Pôvodný obrázok", 14 | "language-code": "Jazyky (voliteľné)", 15 | "engine": "OCR motor", 16 | "engine-not-found-warning": "Požadovaný prostriedok „ $1 “ sa nenašiel. 
Namiesto toho sa používa predvolený modul „ $2 “.", 17 | "engine-invalid-langs-warning": "Nasledujúce jazyky sú neplatné alebo nepodporované a boli ignorované: $1", 18 | "submit": "Prepísať celú stránku", 19 | "submit-crop": "Prepísať oblasť", 20 | "drag-help": "Vyberte nástroj orezávania a označte na obrázku nižšie, ktorú oblasť chcete prepísať.", 21 | "drag-mode-move": "Potiahnutie posunie obrázok", 22 | "drag-mode-move-alt": "Ikona reprezentuje akciu presunu.", 23 | "drag-mode-crop": "Potiahnutie vytvorí novú oblasť pre orezanie", 24 | "drag-mode-crop-alt": "Ikona reprezentujúca akciu orezania.", 25 | "copy-to-clipboard": "Skopírovať do schránky", 26 | "copied-to-clipboard": "Skopírované!", 27 | "google-error": "Služba Google vrátila chybu: $1", 28 | "image-retrieval-failed": "Načítanie obrázka zlyhalo: $1", 29 | "documentation": "Dokumentácia", 30 | "api-tooltip": "Pozrieť API dokumentáciu", 31 | "version": "Verzia $1", 32 | "report-issue": "Nahlásiť problém", 33 | "langs-placeholder": "Pre automatickú detekciu jazyka nechajte pole prázdne.", 34 | "langs-param-error": "{{PLURAL:$1|Nasledovný jazyk nie je podporovaný|Nasledovné jazyky nie sú podporované}} modulom OCR: $2", 35 | "tesseract-options": "Možnosti Tesseractu", 36 | "tesseract-psm-label": "Metóda segmentácie stránok", 37 | "tesseract-psm-help": "Vyskúšajte „Riedky text“ pre lepšiu podporu viacerých stĺpcov.", 38 | "tesseract-psm-0": "Iba orientácia a detekcia skriptu (OSD).", 39 | "tesseract-psm-1": "Automatická segmentácia stránok pomocou OSD.", 40 | "tesseract-psm-2": "Automatická segmentácia stránok, ale bez OSD alebo OCR. (nie je implementovaný)", 41 | "tesseract-psm-3": "Plne automatická segmentácia stránok, ale bez OSD. 
(Predvolené)", 42 | "tesseract-psm-4": "Predpokladaný jeden stĺpec textu s premenlivou veľkosťou.", 43 | "tesseract-psm-5": "Predpokladaný jeden jednotný blok vertikálne zarovnaného textu.", 44 | "tesseract-psm-6": "Predpokladaný jeden jednotný blok textu.", 45 | "tesseract-psm-7": "S obrázkom zaobchádzajte ako s jedným textovým riadkom.", 46 | "tesseract-psm-8": "S obrázkom zaobchádzajte ako s jedným slovom.", 47 | "tesseract-psm-9": "S obrázkom zaobchádzajte ako s jedným slovom v kruhu.", 48 | "tesseract-psm-10": "S obrázkom zaobchádzajte ako s jedným znakom.", 49 | "tesseract-psm-11": "Riedky text. Nájdite čo najviac textu v hocijakom poradí.", 50 | "tesseract-psm-12": "Riedky text s OSD.", 51 | "tesseract-psm-13": "Surová linka. S obrázkom zaobchádzajte ako s jedným textovým riadkom, pričom obídete hacky, ktoré sú špecifické pre Tesseract.", 52 | "tesseract-param-error": "Tesseract nepodporuje možnosť ' $1 ' s hodnotou $2. Maximálna hodnota: $3", 53 | "tesseract-internal-error": "Prostriedok tesseractu vrátil internú chybu." 
54 | } 55 | -------------------------------------------------------------------------------- /src/Engine/GoogleCloudVisionEngine.php: -------------------------------------------------------------------------------- 1 | imageAnnotator = new ImageAnnotatorClient( [ 'credentials' => $keyFile ] ); 36 | } 37 | } 38 | 39 | /** 40 | * @inheritDoc 41 | */ 42 | public static function getId(): string { 43 | return 'google'; 44 | } 45 | 46 | /** 47 | * @inheritDoc 48 | * @throws OcrException If the Google annotator client is not set up (key missing), or if the Vision response still carries an error after the direct-upload retry. 49 | */ 50 | public function getResult( 51 | string $imageUrl, 52 | string $invalidLangsMode, 53 | array $crop, 54 | ?array $langs = null 55 | ): EngineResult { 56 | $this->checkImageUrl( $imageUrl ); 57 | 58 | [ $validLangs, $invalidLangs ] = $this->filterValidLangs( $langs, $invalidLangsMode ); 59 | 60 | $imageContext = new ImageContext(); 61 | if ( $validLangs ) { 62 | $imageContext->setLanguageHints( $validLangs ); 63 | } 64 | 65 | if ( !$this->imageAnnotator ) { 66 | throw new OcrException( 'google-error', [ 'Key for Google OCR engine is missing' ] ); 67 | } 68 | 69 | $image = $this->getImage( $imageUrl, $crop ); 70 | $imageUrlOrData = $image->hasData() ? $image->getData() : $image->getUrl(); 71 | $response = $this->imageAnnotator->textDetection( $imageUrlOrData, [ 'imageContext' => $imageContext ] ); 72 | 73 | // Re-try with direct upload if the error returned is something similar to 74 | // "The URL does not appear to be accessible by us. Please double check or download the content and pass it in." 75 | // There doesn't seem to be a specific error code for this (it is usually 3, but that's also used for other 76 | // things), so it seems like we have to check the actual message string. 
77 | if ( $response->getError() 78 | && stripos( $response->getError()->getMessage(), 'download the content and pass it in' ) !== false 79 | ) { 80 | $image = $this->getImage( $imageUrl, $crop, self::DO_DOWNLOAD_IMAGE ); 81 | $response = $this->imageAnnotator->textDetection( $image->getData(), [ 'imageContext' => $imageContext ] ); 82 | } 83 | 84 | // Other errors, report to the user. 85 | if ( $response->getError() ) { 86 | throw new OcrException( 'google-error', [ $response->getError()->getMessage() ] ); 87 | } 88 | 89 | $annotation = $response->getFullTextAnnotation(); 90 | $resText = $annotation instanceof TextAnnotation ? $annotation->getText() : ''; 91 | $warnings = $invalidLangs ? [ $this->getInvalidLangsWarning( $invalidLangs ) ] : []; 92 | return new EngineResult( $resText, $warnings ); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /i18n/zh-hans.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | ")8", 5 | "Anterdc99", 6 | "Crowley666", 7 | "GuoPC", 8 | "LittlePaw365", 9 | "Midleading", 10 | "Shizhao", 11 | "Zhang8569", 12 | "列维劳德" 13 | ] 14 | }, 15 | "title": "维基媒体OCR", 16 | "subtitle": "从维基共享资源上的扫描图像转录文本,以供维基文库和其他项目使用的工具。", 17 | "form-heading": "转录图像", 18 | "image-url": "图像 URL", 19 | "image-url-help": "插入在维基媒体服务器上托管的图像 URL,例如: $1", 20 | "image-url-error": "图片URL必须以{{PLURAL:$1|以下域名|以下域名之一}}开头并以有效的文件扩展名结尾:$2", 21 | "image-alt-text": "原图", 22 | "language-code": "语言(可选)", 23 | "engine": "OCR引擎", 24 | "engine-name-transkribus": "Transkribus OCR", 25 | "engine-not-found-warning": "请求的引擎“$1”未找到。改用默认引擎“$2”。", 26 | "engine-invalid-langs-warning": "以下语言无效或不受引擎支持而被忽略:$1", 27 | "submit": "转录整页", 28 | "submit-crop": "转录范围", 29 | "drag-help": "选择裁剪工具,然后在下面的图像上拖拽出一个矩形,来仅转录页面上的某一区域。", 30 | "drag-mode-move": "拖曳操作会移动图像", 31 | "drag-mode-move-alt": "代表“移动”操作的图标。", 32 | "drag-mode-crop": "拖拽会创建新裁剪区域", 33 | "drag-mode-crop-alt": 
"代表“裁剪”操作的图标。", 34 | "copy-to-clipboard": "复制到剪贴板", 35 | "copied-to-clipboard": "已复制!", 36 | "google-error": "Google服务返回错误:$1", 37 | "image-retrieval-failed": "图像检索失败:$1", 38 | "documentation": "文档", 39 | "api-tooltip": "查看 API 文档", 40 | "version": "版本$1", 41 | "report-issue": "报告问题", 42 | "langs-placeholder": "留空以进行自动语言检测。", 43 | "langs-param-error": "以下{{PLURAL:$1|语言}}不受OCR引擎支持:$2", 44 | "loading-message": "正在执行转录...", 45 | "tesseract-options": "Tesseract选项", 46 | "tesseract-psm-label": "页面拆分方式", 47 | "tesseract-psm-help": "尝试“稀疏文本”来获得更好的多列支持。", 48 | "tesseract-psm-0": "仅方向和手写检测(OSD)。", 49 | "tesseract-psm-1": "使用OSD自动拆分页面。", 50 | "tesseract-psm-2": "自动拆分页面,但不使用OSD或OCR。(未实现)", 51 | "tesseract-psm-3": "全自动拆分页面,但不使用OSD。(默认)", 52 | "tesseract-psm-4": "假设有一列可变大小的文本。", 53 | "tesseract-psm-5": "假设有一个统一的垂直对齐的文本块。", 54 | "tesseract-psm-6": "假设有一个统一的文本块。", 55 | "tesseract-psm-7": "将图像视为一行文本。", 56 | "tesseract-psm-8": "将图像视为一个词。", 57 | "tesseract-psm-9": "将图像视为环状的一个词。", 58 | "tesseract-psm-10": "将图像视为一个字符。", 59 | "tesseract-psm-11": "稀疏文本。查找尽可能多的文本,没有特定的顺序。", 60 | "tesseract-psm-12": "使用OSD稀疏文本。", 61 | "tesseract-psm-13": "原始行。将图像视为一行文本,绕过特定于Tesseract的黑客攻击。", 62 | "tesseract-param-error": "带有值$2的“$1”选项不受Tesseract支持。最大值:$3", 63 | "tesseract-no-text-error": "Tesseract 引擎没有返回此图片的任何文本。", 64 | "tesseract-internal-error": "tesseract 引擎返回了一个内部错误。", 65 | "transkribus-language-code": "语言模型", 66 | "transkribus-unauthorized-error": "错误代码“ $1 ”:: 请求未经授权", 67 | "transkribus-default-error": "错误代码“ $1 ”:: 无法完成请求,请重试!", 68 | "transkribus-empty-response-error": "无法解析来自 Transkribus API 的结果", 69 | "transkribus-init-process-error": "无法初始化 Transkribus 进程", 70 | "transkribus-failed-process-error": "Transkribus 进程失败", 71 | "transkribus-no-lang-error": "未选择语言", 72 | "transkribus-multiple-lang-error": "不允许使用多种语言,请指定一种语言", 73 | "transkribus-browse-public-models": "浏览 Transkribus 的所有公开语言模型", 74 | "transkribus-request-for-model": "请求将 Transkribus 中的模型添加到 OCR 工具", 75 | "transkribus-options": 
"Transkribus 选项", 76 | "transkribus-line-label": "直线检测模型", 77 | "transkribus-line-id-none-option": "无", 78 | "transkribus-mixed-line-option": "混合直线方向", 79 | "transkribus-line-help": "如果您不确定要使用哪种直线检测模型,请留空", 80 | "transkribus-jobs": "Transkribus 任务", 81 | "transkribus-job-id": "任务 ID", 82 | "transkribus-job-state": "状态", 83 | "transkribus-job-description": "描述", 84 | "transkribus-job-start": "已开始", 85 | "transkribus-job-end": "已完成", 86 | "transkribus-job-waited": "开始延迟(分钟)" 87 | } 88 | -------------------------------------------------------------------------------- /templates/base.html.twig: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {% block title %}{{ msg('title') }}{% endblock %} 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | {% block stylesheets %} 17 | {{ encore_entry_link_tags('app') }} 18 | {% if is_rtl() %} 19 | 20 | {% endif %} 21 | {% endblock %} 22 | {% block javascripts %} 23 | {{ encore_entry_script_tags('app') }} 24 | {% endblock %} 25 | 26 | 27 | 38 |
39 | {# Output each queued flash message of type 'error'. #} {% for label, messages in app.flashes(['error']) %} 40 | {% for message in messages %} 41 |
42 | {{ message }} 43 |
44 | {% endfor %} 45 | {% endfor %} 46 | {# Output each queued flash message of type 'warning'. #} {% for label, messages in app.flashes(['warning']) %} 47 | {% for message in messages %} 48 |
49 | {{ message }} 50 |
51 | {% endfor %} 52 | {% endfor %} 53 | 54 | {# Main content block; empty here so that extending templates can fill it. #} {% block body %}{% endblock %} 55 | 56 |
57 | 74 |
75 | 76 | 77 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wikimedia/wikimedia-ocr", 3 | "description": "A simple wrapper around multiple OCR engines, enabling Wikisources to submit images for OCR and retrieve the resultant text.", 4 | "type": "project", 5 | "license": "GPL-3.0-or-later", 6 | "require": { 7 | "php": ">=7.3", 8 | "ext-bcmath": "*", 9 | "ext-ctype": "*", 10 | "ext-gd": "*", 11 | "ext-iconv": "*", 12 | "ext-json": "*", 13 | "google/cloud-vision": "^1.3", 14 | "imagine/imagine": "^1.2", 15 | "nelmio/api-doc-bundle": "^4.4", 16 | "predis/predis": "^2.2", 17 | "symfony/cache": "5.2.*", 18 | "symfony/console": "5.2.*", 19 | "symfony/dotenv": "5.2.*", 20 | "symfony/framework-bundle": "^5.4", 21 | "symfony/mailer": "^5.2", 22 | "symfony/monolog-bundle": "^3.7", 23 | "symfony/property-info": "5.2.*", 24 | "symfony/twig-bundle": "5.2.*", 25 | "symfony/webpack-encore-bundle": "^1.11", 26 | "symfony/yaml": "5.2.*", 27 | "thiagoalessio/tesseract_ocr": "^2.11", 28 | "twig/extra-bundle": "^2.12|^3.0", 29 | "twig/intl-extra": "^3.7", 30 | "twig/twig": "^2.12|^3.0", 31 | "wikimedia/toolforge-bundle": "^1.3" 32 | }, 33 | "require-dev": { 34 | "drenso/phan-extensions": "^3.3", 35 | "mediawiki/mediawiki-codesniffer": "^39.0", 36 | "mediawiki/minus-x": "^1.1", 37 | "mediawiki/phan-taint-check-plugin": "^4.0", 38 | "symfony/phpunit-bridge": "^5.2", 39 | "symfony/stopwatch": "^5.2", 40 | "symfony/web-profiler-bundle": "^5.2" 41 | }, 42 | "config": { 43 | "platform": { 44 | "php": "7.3.31" 45 | }, 46 | "optimize-autoloader": true, 47 | "preferred-install": { 48 | "*": "dist" 49 | }, 50 | "sort-packages": true 51 | }, 52 | "autoload": { 53 | "psr-4": { 54 | "App\\": "src/" 55 | } 56 | }, 57 | "autoload-dev": { 58 | "psr-4": { 59 | "App\\Tests\\": "tests/" 60 | } 61 | }, 62 | "replace": { 63 | "symfony/polyfill-ctype": "*", 
64 | "symfony/polyfill-iconv": "*", 65 | "symfony/polyfill-php72": "*" 66 | }, 67 | "scripts": { 68 | "auto-scripts": [ 69 | "./bin/console cache:clear", 70 | "./bin/console assets:install" 71 | ], 72 | "check-tesseract": "./check_tesseract.sh", 73 | "post-install-cmd": [ 74 | "@auto-scripts", 75 | "@check-tesseract" 76 | ], 77 | "post-update-cmd": [ 78 | "@auto-scripts", 79 | "@check-tesseract" 80 | ], 81 | "test": [ 82 | "@test-common", 83 | "@phan" 84 | ], 85 | "test-common": [ 86 | "composer validate", 87 | "phpcs -s -p .", 88 | "./bin/console lint:twig ./templates", 89 | "./bin/console lint:yaml ./config", 90 | "minus-x check .", 91 | "@check-tesseract", 92 | "./bin/phpunit" 93 | ], 94 | "phan": [ 95 | "phan --allow-polyfill-parser --long-progress-bar" 96 | ], 97 | "fix": "phpcbf" 98 | }, 99 | "conflict": { 100 | "symfony/symfony": "*" 101 | }, 102 | "extra": { 103 | "symfony": { 104 | "allow-contrib": false, 105 | "require": "5.2.*" 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /i18n/tl.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "GinawaSaHapon" 5 | ] 6 | }, 7 | "title": "WikimediaOCR", 8 | "subtitle": "Mag-transcribe ang teksto mula sa mga larawan", 9 | "form-heading": "Mag-transcribe ng larawan", 10 | "image-url": "URL ng larawan", 11 | "image-url-help": "Ilagay ang URL ng larawan na hino-host sa isang server ng Wikimedia tulad ng: $1", 12 | "image-url-error": "Dapat nagsisimula ang URL ng larawan sa {{PLURAL:$1|sumusunod na pangalan ng domain|mga sumusunod na pangalan ng domain}} at magtapos sa isang valid na file extension: $2", 13 | "image-alt-text": "Ang orihinal na larawan", 14 | "language-code": "Mga wika (di-kailangan)", 15 | "engine": "OCR engine", 16 | "engine-not-found-warning": "Hindi nakita ang hiniling na engine na '$1'. 
Gagamitin na lang ang default na engine na '$2'.", 17 | "engine-invalid-langs-warning": "Invalid o di-suportado ng engine ang mga sumusunod na wika kaya binalewala sila: $1", 18 | "submit": "I-transcribe ang buong pahina", 19 | "submit-crop": "Lugar na ita-transcribe", 20 | "drag-help": "Piliin ang crop tool at mag-drag ng isang parihaba sa larawan sa baba para i-transcribe lang ang isang partikular na lugar ng pahina.", 21 | "drag-mode-move": "Gagalaw ang larawan kung ida-drag", 22 | "drag-mode-move-alt": "Ang icon na kumakatawan sa kilos na 'galawin'.", 23 | "drag-mode-crop": "Gagawa ng bagong lugar na ika-crop kung ida-drag", 24 | "drag-mode-crop-alt": "Ang icon na kumakatawan sa kilos na 'i-crop'.", 25 | "copy-to-clipboard": "Kopyahin sa clipboard", 26 | "copied-to-clipboard": "Nakopya na!", 27 | "google-error": "Nagbalik ng error ang serbisyo ng Google: $1", 28 | "image-retrieval-failed": "Nabigo sa pagkuha sa larawan: $1", 29 | "documentation": "Dokumentasyon", 30 | "api-tooltip": "Tingnan ang dokumentasyon sa API", 31 | "version": "Bersyon $1", 32 | "report-issue": "Mag-ulat ng isyu", 33 | "langs-placeholder": "Bakantehin para sa kusang pag-detect sa wika.", 34 | "langs-param-error": "Hindi suportado ng OCR engine ang sumusunod na {{PLURAL:$1|wika|mga wika}}: $2", 35 | "tesseract-options": "Pagsasaayos sa Tesseract", 36 | "tesseract-psm-label": "Paraan ng segmentation sa pahina", 37 | "tesseract-psm-help": "Subukan ang \"Kalat-kalat na teksto\" para sa mas maayos na suporta sa mga maramihang hanay.", 38 | "tesseract-psm-0": "Orientation at script detection (OSD) lang.", 39 | "tesseract-psm-1": "Kusang segmentation sa pahina gamit OSD.", 40 | "tesseract-psm-2": "Kusang segmentation sa pahina, pero walang OSD, o OCR. (di na-implement)", 41 | "tesseract-psm-3": "Kusang segmentation sa pahina, pero walang OSD. 
(default)", 42 | "tesseract-psm-4": "I-assume ang isang hanay ng teksto na iba-iba ang sukat.", 43 | "tesseract-psm-5": "I-assume ang isang pantay na bloke ng naka-align nang patayo na teksto.", 44 | "tesseract-psm-6": "I-assume ang isang pantay na bloke ng teksto.", 45 | "tesseract-psm-7": "Tratuhin ang larawan bilang isang linya ng teksto.", 46 | "tesseract-psm-8": "Tratuhin ang larawan bilang isang salita.", 47 | "tesseract-psm-9": "Tratuhin ang larawan bilang isang salita sa loob ng isang bilog.", 48 | "tesseract-psm-10": "Tratuhin ang larawan bilang isang karakter.", 49 | "tesseract-psm-11": "Kalat-kalat na teksto. Maghanap ng teksto hanggat posible nang walang partikular na plano.", 50 | "tesseract-psm-12": "Kalat-kalat na teksto na may OSD.", 51 | "tesseract-psm-13": "Raw na linya. Tratuhin ang larawan bilang isang linya ng teksto, na nagba-bypass sa mga hack na specific sa Tesseract.", 52 | "tesseract-param-error": "Hindi suportado ang '$1' na may value na $2. Maximum na value: $3", 53 | "tesseract-internal-error": "Nagbalik ng isang internal error ang tesseract engine." 
54 | } 55 | -------------------------------------------------------------------------------- /i18n/id.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Mnam23", 5 | "Penyuwangi", 6 | "Veracious" 7 | ] 8 | }, 9 | "title": "WikimediaOCR", 10 | "subtitle": "Mentranskripkan teks dari gambar", 11 | "form-heading": "Transkripsi sebuah gambar", 12 | "image-url": "URL Gambar", 13 | "image-url-help": "Sisipkan URL gambar yang dihosting di peladen Wikimedia seperti: $1", 14 | "image-url-error": "URL gambar harus dimulai dengan {{PLURAL:$1|nama domain berikut|salah satu dari nama domain berikut}} dan diakhiri dengan ekstensi berkas valid: $2", 15 | "image-alt-text": "Gambar asli", 16 | "language-code": "Bahasa (opsional)", 17 | "engine": "Mesin OCR", 18 | "engine-name-transkribus": "Transkribus OCR", 19 | "engine-not-found-warning": "Mesin '$1' yang diminta tidak ditemukan. Menggunakan mesin bawaan '$2' sebagai gantinya.", 20 | "engine-invalid-langs-warning": "Bahasa berikut tidak valid atau tidak didukung mesin dan diabaikan: $1", 21 | "submit": "Transkripsikan seluruh halaman", 22 | "submit-crop": "Transkripsikan area", 23 | "drag-help": "Pilih perkakas pemangkas dan tarik sebuah persegi pada gambar di bawah untuk mentranskripsikan hanya satu area halaman.", 24 | "drag-mode-move": "Menarik akan memindahkan gambar", 25 | "drag-mode-move-alt": "Ikon merepresentasikan tindakan 'memindahkan'.", 26 | "drag-mode-crop": "Menarik akan membuat area pemangkasan baru", 27 | "drag-mode-crop-alt": "Ikon merepresentasikan tindakan 'pemangkasan'.", 28 | "copy-to-clipboard": "Salin ke papan klip", 29 | "copied-to-clipboard": "Tersalin!", 30 | "google-error": "Layanan Google mengembalikan kesalahan: $1", 31 | "image-retrieval-failed": "Pengambilan gambar gagal: $1", 32 | "documentation": "Dokumentasi", 33 | "api-tooltip": "Lihat dokumentasi API", 34 | "version": "Versi $1", 35 | "report-issue": "Laporkan 
masalah", 36 | "langs-placeholder": "Biarkan kosong untuk deteksi bahasa otomatis.", 37 | "langs-param-error": "{{PLURAL:$1|Bahasa|Bahasa-bahasa}} berikut tidak didukung oleh mesin OCR: $2", 38 | "tesseract-options": "Pilihan Tesseract", 39 | "tesseract-psm-label": "Metode segmentasi halaman", 40 | "tesseract-psm-help": "Coba \"Teks jarang\" untuk dukungan multi-kolom yang lebih baik.", 41 | "tesseract-psm-0": "Orientasi dan deteksi skrip (OSD) saja.", 42 | "tesseract-psm-1": "Segmentasi halaman otomatis dengan OSD.", 43 | "tesseract-psm-2": "Segmentasi halaman otomatis, tapi tanpa OSD, atau OCR. (tidak diterapkan)", 44 | "tesseract-psm-3": "Segmentasi halaman otomatis sepenuhnya, tetapi tanpa OSD. (Bawaan)", 45 | "tesseract-psm-4": "Asumsikan satu kolom teks dengan ukuran variabel.", 46 | "tesseract-psm-5": "Asumsikan satu blok seragam dari teks yang disejajarkan secara vertikal.", 47 | "tesseract-psm-6": "Asumsikan satu blok teks seragam.", 48 | "tesseract-psm-7": "Perlakukan gambar sebagai satu baris teks.", 49 | "tesseract-psm-8": "Perlakukan gambar sebagai kata tunggal.", 50 | "tesseract-psm-9": "Perlakukan gambar sebagai kata tunggal dalam lingkaran.", 51 | "tesseract-psm-10": "Perlakukan gambar sebagai karakter tunggal.", 52 | "tesseract-psm-11": "Teks jarang. Temukan teks sebanyak mungkin tanpa urutan tertentu.", 53 | "tesseract-psm-12": "Teks jarang dengan OSD.", 54 | "tesseract-psm-13": "Garis mentah. Perlakukan gambar sebagai satu baris teks, melewati peretasan yang spesifik Tesseract.", 55 | "tesseract-param-error": "Pilihan '$1' dengan nilai $2 tidak didukung oleh Tesseract. 
Nilai maksimum: $3", 56 | "tesseract-internal-error": "Mesin tesseract mengembalikan galat internal.", 57 | "transkribus-language-code": "Model bahasa", 58 | "transkribus-unauthorized-error": "Kode Galat '$1' :: Permintaan tidak diotorisasi", 59 | "transkribus-default-error": "Kode Galat '$1' :: Tak dapat menyelesaikan permintaan, coba lagi!", 60 | "transkribus-no-lang-error": "Tak ada bahasa yang dipilih" 61 | } 62 | -------------------------------------------------------------------------------- /i18n/fi.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "MITO", 5 | "Pyscowicz", 6 | "Veikk0.ma" 7 | ] 8 | }, 9 | "title": "WikimediaOCR", 10 | "subtitle": "Muunna kuvamuotoinen teksti raakatekstiksi", 11 | "form-heading": "Muunna kuvassa oleva kirjoitus tekstiksi", 12 | "image-url": "Kuvan verkko-osoite", 13 | "image-url-help": "Lisää Wikimedian palvelimella sijaitsevan kuvan verkko-osoite, esim.: $1", 14 | "image-url-error": "Kuvan verkko-osoitteen tulee alkaa {{PLURAL:$1|seuraavalla verkkotunnuksella|yhdellä seuraavista verkkotunnuksista}} ja loppua hyväksytyllä tiedostopäätteellä: $2", 15 | "image-alt-text": "Alkuperäinen kuva", 16 | "language-code": "Kielet (valinnainen)", 17 | "engine": "Tekstintunnistusmoottori", 18 | "engine-name-transkribus": "Transkribus OCR", 19 | "engine-not-found-warning": "Pyydettyä moottoria '$1' ei löytynyt. 
Käytetään sen sijasta oletusmoottoria '$2'.", 20 | "engine-invalid-langs-warning": "Seuraavat kielet ovat virheellisiä tai moottori ei tue niitä ja ne ohitettiin: $1", 21 | "submit": "Muunna koko sivu tekstiksi", 22 | "submit-crop": "Muunna alue tekstiksi", 23 | "drag-help": "Valitse rajaustyökalu ja piirrä alla olevaan kuvaan suorakulmio muuntaaksesi vain yhden alueen sivulta tekstiksi.", 24 | "drag-mode-move": "Raahaaminen siirtää kuvaa", 25 | "drag-mode-move-alt": "Siirtämistoimintoa esittävä kuvake.", 26 | "drag-mode-crop": "Raahaaminen luo uuden rajausalueen", 27 | "drag-mode-crop-alt": "Rajaustoimintoa esittävä kuvake.", 28 | "copy-to-clipboard": "Kopioi leikepöydälle", 29 | "copied-to-clipboard": "Kopioitu!", 30 | "google-error": "Googlen palvelu vastasi virheilmoituksella: $1", 31 | "image-retrieval-failed": "Kuvan noutaminen epäonnistui: $1", 32 | "documentation": "Käyttöohjeet", 33 | "api-tooltip": "Katso rajapinnan dokumentaatio", 34 | "version": "Versio $1", 35 | "report-issue": "Tee vikailmoitus", 36 | "langs-placeholder": "Jätä tyhjäksi tunnistaaksesi kielen automaattisesti.", 37 | "langs-param-error": "Tekstintunnistusmoottori ei tue seuraavaa {{PLURAL:$1|kieltä}}: $2", 38 | "tesseract-options": "Tesseractin asetukset", 39 | "tesseract-psm-label": "Sivunjakomenetelmä", 40 | "tesseract-psm-help": "Kokeile asetusta \"Harva teksti\" mikäli haluat paremman tuen useaan palstaan jaetulle tekstille.", 41 | "tesseract-psm-0": "Vain tekstin suunnan ja kirjoitusjärjestelmän tunnistus (OSD).", 42 | "tesseract-psm-1": "Automaattinen sivunjako ja tekstin suunnan ja kirjoitusjärjestelmän tunnistus", 43 | "tesseract-psm-2": "Automaattinen sivunjako, ei tekstin suunnan ja kirjoitusjärjestelmän tunnistusta eikä tekstintunnistusta. (ei vielä toteutettu)", 44 | "tesseract-psm-3": "Täysin automaattinen sivunjako, ei tekstin suunnan ja kirjoitusjärjestelmän tunnistusta. 
(Oletus)", 45 | "tesseract-psm-4": "Oleta yksi tekstipalsta ja vaihteleva kirjasinkoko.", 46 | "tesseract-psm-5": "Oleta yksi yhtenäinen, pystysuoraan tasattu tekstilohko.", 47 | "tesseract-psm-6": "Oleta yksi yhtenäinen tekstilohko.", 48 | "tesseract-psm-7": "Käsittele kuvaa yhtenä tekstirivinä.", 49 | "tesseract-psm-8": "Käsittele kuvaa yhtenä sanana.", 50 | "tesseract-psm-9": "Käsittele kuvaa yhtenä sanana, joka on ympyröity.", 51 | "tesseract-psm-10": "Käsittele kuvaa yhtenä kirjoitusmerkkinä.", 52 | "tesseract-psm-11": "Harva teksti. Etsi mahdollisimman paljon tekstiä järjestyksestä välittämättä.", 53 | "tesseract-psm-12": "Harva teksti ja tekstin suunnan ja kirjoitusjärjestelmän tunnistus.", 54 | "tesseract-psm-13": "Tekstirivi. Käsittele kuvaa yhtenä tekstirivinä (kiertää Tesseractia varten tehdyt niksit).", 55 | "tesseract-param-error": "\"$1\"-asetus ei tue arvoa $2. Enimmäisarvo on $3", 56 | "tesseract-internal-error": "Tesseract-moottori kohtasi sisäisen virheen.", 57 | "transkribus-empty-response-error": "Tulosta ei voitu jäsentää Transkribus API:sta", 58 | "transkribus-no-lang-error": "Kieltä ei valittu", 59 | "transkribus-multiple-lang-error": "Useita kieliä ei sallita, määritä yksi kieli" 60 | } 61 | -------------------------------------------------------------------------------- /i18n/ko.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Apzp79", 5 | "Namoroka", 6 | "Suleiman the Magnificent Television", 7 | "Ykhwong", 8 | "그냥기여자" 9 | ] 10 | }, 11 | "title": "위키미디어OCR", 12 | "subtitle": "위키미디어 공용의 스캔된 이미지에서 텍스트를 변환하여 위키문헌과 다른 곳에서 사용할 수 있는 도구입니다.", 13 | "form-heading": "그림을 변환하기", 14 | "image-url": "그림 URL", 15 | "image-url-help": "위키미디어 서버에 호스팅된 그림 URL을 삽입하여 다음을 입력합니다: $1", 16 | "image-url-error": "그림 URL은 {{PLURAL:$1|다음 도메인 이름으로|다음 도메인 이름 중 하나로}} 시작하여 유효한 파일 확장자로 끝나야 합니다: $2", 17 | "image-alt-text": "원본 그림", 18 | "language-code": "언어 (선택 사항)", 19 | "engine": "OCR 엔진", 
20 | "engine-name-google": "구글 클라우드 비전 OCR", 21 | "engine-name-tesseract": "테서랙트 OCR", 22 | "engine-name-transkribus": "트랜스크리버스 OCR", 23 | "engine-not-found-warning": "요청하신 '$1' 엔진을 찾을 수 없습니다. 대신 기본 엔진 '$2'을 사용합니다.", 24 | "engine-invalid-langs-warning": "다음 언어는 잘못되었거나 엔진에서 지원하지 않으므로 무시되었습니다: $1", 25 | "submit": "전체 페이지 변환", 26 | "submit-crop": "변환 영역", 27 | "drag-help": "자르기 도구를 선택하고 사각형을 아래의 그림 위로 드래그하여 페이지의 일부 영역만 변환합니다.", 28 | "drag-mode-move": "드래그하면 그림이 이동합니다", 29 | "drag-mode-move-alt": "'이동' 동작을 나타내는 아이콘입니다.", 30 | "drag-mode-crop": "드래그하면 새로운 자르기 영역이 생성됩니다", 31 | "drag-mode-crop-alt": "'자르기' 동작을 나타내는 아이콘입니다.", 32 | "copy-to-clipboard": "클립보드에 복사", 33 | "copied-to-clipboard": "복사 완료!", 34 | "google-error": "구글 서비스가 오류를 반환했습니다: $1", 35 | "image-retrieval-failed": "그림 검색에 실패하였습니다: $1", 36 | "documentation": "문서", 37 | "api-tooltip": "API 문서 보기", 38 | "version": "버전 $1", 39 | "report-issue": "문제점 보고", 40 | "langs-placeholder": "자동 언어 감지를 위해 비워두세요.", 41 | "langs-param-error": "다음 {{PLURAL:$1|언어}}는 OCR 엔진에서 지원되지 않습니다: $2", 42 | "loading-message": "변환 수행 중...", 43 | "tesseract-options": "테서랙트 옵션", 44 | "tesseract-psm-label": "페이지 분할 방법", 45 | "tesseract-psm-help": "더 나은 다단 지원을 위해 \"산발적인 텍스트\"를 사용해 보세요.", 46 | "tesseract-psm-0": "방향 및 문자 인식(OSD)만 수행합니다.", 47 | "tesseract-psm-1": "OSD를 이용하여 페이지를 자동 분할합니다.", 48 | "tesseract-psm-2": "OSD나 OCR 없이 페이지를 자동 분할합니다. (구현되지 않음)", 49 | "tesseract-psm-3": "OSD 없이 완전히 자동으로 페이지를 분할합니다. (기본값)", 50 | "tesseract-psm-4": "다양한 크기의 단일 텍스트 열을 가정합니다.", 51 | "tesseract-psm-5": "수직으로 정렬된 단일 텍스트 블록을 가정합니다.", 52 | "tesseract-psm-6": "단일 텍스트 블록을 가정합니다.", 53 | "tesseract-psm-7": "그림을 단일 텍스트 줄로 취급합니다.", 54 | "tesseract-psm-8": "그림을 하나의 단어로 취급합니다.", 55 | "tesseract-psm-9": "그림을 원 내부의 하나의 단어로 취급합니다.", 56 | "tesseract-psm-10": "그림을 단일 문자로 취급합니다.", 57 | "tesseract-psm-11": "산발적인 텍스트. 특별한 정렬 없이 텍스트를 가능한 한 많이 찾습니다.", 58 | "tesseract-psm-12": "OSD를 이용한 산발적인 텍스트.", 59 | "tesseract-psm-13": "가공되지 않은 줄. 
그림을 단일 텍스트 줄로 취급하여 테서랙트 고유의 해킹을 우회합니다.", 60 | "tesseract-param-error": "'$2'의 값을 가진 '$1' 옵션은 테러색트에서 지원되지 않습니다. 최댓값: $3", 61 | "tesseract-no-text-error": "테서랙트 엔진이 이 그림으로부터 아무런 텍스트를 반환하지 못했습니다.", 62 | "tesseract-internal-error": "테서랙트 엔진이 내부 오류를 반환했습니다.", 63 | "transkribus-language-code": "언어 모델", 64 | "transkribus-unauthorized-error": "오류 코드 '$1' :: 요청이 허가되지 않음", 65 | "transkribus-default-error": "오류 코드 '$1' :: 요청을 완료할 수 없음, 다시 시도하세요!", 66 | "transkribus-empty-response-error": "트랜스크리버스 API로부터 결과를 파싱하지 못했습니다", 67 | "transkribus-init-process-error": "트랜스크리버스 프로세스를 초기화하는데 실패했습니다", 68 | "transkribus-failed-process-error": "트랜스크리버스 프로세스 실패", 69 | "transkribus-no-lang-error": "선택한 언어가 없습니다", 70 | "transkribus-multiple-lang-error": "여러 언어는 허용되지 않습니다, 하나의 언어만 지정하세요", 71 | "transkribus-browse-public-models": "트랜스크리버스의 모든 공개 언어 모델 둘러보기", 72 | "transkribus-request-for-model": "트랜스크리버스에서 OCR 도구 모델 추가 요청하기", 73 | "transkribus-options": "트랜스크리버스 옵션", 74 | "transkribus-line-label": "줄 감지 모델", 75 | "transkribus-line-id-none-option": "없음", 76 | "transkribus-mixed-line-option": "혼합된 줄 방향", 77 | "transkribus-line-help": "무슨 줄 감지 모델을 사용해야 하는지 모르겠다면 이 부분을 비워두세요", 78 | "transkribus-jobs": "트랜스크리버스 업무", 79 | "transkribus-job-id": "업무 ID", 80 | "transkribus-job-state": "상태", 81 | "transkribus-job-description": "설명", 82 | "transkribus-job-start": "시작됨", 83 | "transkribus-job-end": "완료", 84 | "transkribus-job-waited": "시작 지연 (분)" 85 | } 86 | -------------------------------------------------------------------------------- /src/EventListener/ExceptionListener.php: -------------------------------------------------------------------------------- 1 | request = $requestStack->getCurrentRequest(); 49 | $this->session = $requestStack->getSession(); 50 | $this->twig = $twig; 51 | $this->intuition = $intuition; 52 | $this->tesseractLogger = $tesseractLogger; 53 | } 54 | 55 |
	/**
	 * Turn an OCR failure into an error response instead of letting it propagate.
	 *
	 * Only OcrException and the Tesseract library's UnsuccessfulCommandException (T282141) are
	 * handled, and only on the main request. Requests whose path contains '/api' get a JSON body
	 * (the merged request parameters plus an 'error' key); all other requests get the rendered
	 * output template with the message added as an 'error' flash. The response is always sent with
	 * HTTP 400 and a permissive Access-Control-Allow-Origin header.
	 *
	 * @param ExceptionEvent $event
	 */
	public function onKernelException( ExceptionEvent $event ): void {
		$throwable = $event->getThrowable();
		$fromTesseract = $throwable instanceof UnsuccessfulCommandException;

		// Anything that is neither an OcrException nor the library's UnsuccessfulCommandException
		// (T282141) is not ours to handle.
		if ( !$fromTesseract && !( $throwable instanceof OcrException ) ) {
			return;
		}
		// Sub-requests are left alone as well.
		if ( !$event->isMainRequest() ) {
			return;
		}

		$isApi = str_contains( $this->request->getPathInfo(), '/api' );
		// Query-string values override the controller's default parameters.
		$templateParams = array_merge(
			OcrController::$params,
			$this->request->query->all()
		);

		if ( $fromTesseract ) {
			// Keep the full details in the log; the user only sees a generic message.
			$this->tesseractLogger->critical( $throwable->__toString() );
			$message = $this->getMessageForTesseractException( $throwable );
		} else {
			$message = $this->intuition->msg(
				$throwable->getI18nKey(),
				[ 'variables' => $throwable->getI18nParams() ]
			);
		}

		if ( $isApi ) {
			$templateParams['error'] = $message;
			$response = new JsonResponse( $templateParams );
		} else {
			/** @var FlashBagInterface $flashes */
			$flashes = $this->session->getBag( 'flashes' );
			// @phan-suppress-next-line PhanUndeclaredMethod
			$flashes->add( 'error', $message );
			$html = $this->twig->render( 'output.html.twig', $templateParams );
			$response = new Response( $html );
		}

		$response->setStatusCode( Response::HTTP_BAD_REQUEST );
		// Allow cross-origin requests like we do for successful requests. See T285543
		$response->headers->set( 'Access-Control-Allow-Origin', '*' );
		$event->setResponse( $response );
	}

	/**
	 * Build the user-facing message for a failed Tesseract command.
	 *
	 * The exception's own text is deliberately not passed through, to avoid leaking potentially
	 * sensitive information; a generic localized message is returned instead.
	 *
	 * @param UnsuccessfulCommandException $exc @phan-unused-param
	 * @return string
	 */
	private function getMessageForTesseractException( UnsuccessfulCommandException $exc ): string {
		// TODO: How can we be more specific about what's gone wrong?
		return $this->intuition->msg( 'tesseract-internal-error' );
	}
114 | } 115 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Requirements # 2 | 3 | * PHP 7.2+ 4 | * [Composer](http://getcomposer.org/) 5 | * [Symfony CLI](https://symfony.com/download) 6 | 7 | If you need to make asset changes: 8 | 9 | * [Node](https://nodejs.org) with the version specified by the `.nvmrc` [nvm](https://github.com/nvm-sh/nvm#installing-and-updating) file. 10 | 11 | ## Installation ## 12 | 13 | * `composer install` 14 | * `npm install` 15 | 16 | ### For Google Cloud Vision Engine ### 17 | 18 | * Add the missing values from `.env` to a `.env.local` file 19 | * Enable the Cloud Vision API at https://console.cloud.google.com/apis/api/vision.googleapis.com/overview 20 | * Create a new Google service account at https://console.cloud.google.com/iam-admin/serviceaccounts Google gives you 1,000 free lookups per month. 21 | * Give the service account the *Compute Engine Service Account* role. 22 | * Add a new key for the service account, and download the key's JSON file. Nothing needs to be changed in this file. 23 | * Add the path of that file to your `.env.local` as `APP_GOOGLE_KEYFILE`. 24 | 25 | ### For Tesseract OCR Engine ### 26 | * Install [Tesseract](https://tesseract-ocr.github.io) and make sure it's in your `$PATH` 27 | 28 | ### For Transkribus OCR Engine ### 29 | 30 | You can [create a free account](https://readcoop.eu/transkribus/?sc=Transkribus) for Transkribus, and get a small number of free credits. 
31 | 32 | You will also need to set the *username* and *password* of your Transkribus account in `.env.local`: 33 | 34 | ```dotenv 35 | APP_TRANSKRIBUS_USERNAME=username 36 | APP_TRANSKRIBUS_PASSWORD=password 37 | ``` 38 | 39 | **Note**: You will require sufficient credits in your account to use the Transkribus API. 40 | 41 | ## Run the application ## 42 | * `symfony serve` to start the application 43 | * `npm run watch` if you need to make JS/CSS changes. Compiled assets are not committed. 44 | 45 | ## Using Redis for caching 46 | 47 | The application caches some data. 48 | In development this is done on the filesystem (in the `var/cache/dev/pools/` directory), 49 | and in production in Redis 50 | (the [Toolforge installation](https://wikitech.wikimedia.org/wiki/Help:Toolforge/Redis_for_Toolforge)). 51 | 52 | To test the Redis configuration locally, open an SSH tunnel to Toolforge's Redis server: 53 | 54 | ```console 55 | $ ssh -N -L 6379:redis.svc.tools.eqiad1.wikimedia.cloud:6379 login.toolforge.org 56 | ``` 57 | 58 | And set the following in `.env.local`: 59 | 60 | ```dotenv 61 | APP_ENV=prod 62 | REDIS_HOST=localhost 63 | ``` 64 | 65 | Then clear the application cache with 66 | 67 | ```console 68 | $ ./bin/console c:c 69 | ``` 70 | 71 | Docker Developer Environment 72 | ============================ 73 | 74 | _(beta: this is a very raw setup and needs improvements)_ 75 | 76 | ### Requirements 77 | 78 | - [Docker installation instructions][docker-install] 79 | 80 | [docker-install]: https://docs.docker.com/install/ 81 | 82 | ### Quickstart 83 | 84 | Setup container 85 | ``` 86 | ./docker/setup.sh 87 | ``` 88 | 89 | Run container 90 | ``` 91 | ./docker/run.sh 92 | ``` 93 | 94 | ## Structure of models.json 95 | 96 | The engines' model and language information is stored in `/public/models.json`, 97 | from where it's read and returned in the `/api/available_langs` API endpoint. 
98 | 99 | OCR engines take zero to many model names (often called 'languages' because 100 | there's a direct mapping to those, but we're moving away from this nomenclature 101 | now because it doesn't always hold true). 102 | 103 | `models.json` is first grouped by engine, and then each engine has a list of models. 104 | These are identified by a 'model code', which is what the user provides in the `langs[]` parameter. 105 | For some engines these are passed through to the actual engine process or API, 106 | but others don't have convenient model names and so we invent them 107 | and add whatever extra info is needed as additional properties within `models.json`. 108 | 109 | In addition to the model code, every model needs to have at least a `title` and `languages` property. 110 | 111 | * `title`: This is what's shown (unlocalized) to the user. 112 | * `languages`: An array of ISO 639 language codes. This is (or will be) what's used to group models when the user is browsing them. 113 | -------------------------------------------------------------------------------- /i18n/he.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Amire80", 5 | "Ghsuvr", 6 | "Leononon", 7 | "YaronSh" 8 | ] 9 | }, 10 | "title": "WikimediaOCR", 11 | "subtitle": "כלי לאחזור טקסט מתמונות סרוקות בוויקישיתוף, לשימוש בוויקיטקסט ובכל מקום אחר.", 12 | "form-heading": "לתמלל תמונה", 13 | "image-url": "כתובת URL של התמונה", 14 | "image-url-help": "נא להכניס כתובת URL של תמונה המאוחסנת בשרת ויקימדיה כמו: $1", 15 | "image-url-error": "קישור התמונה חייב להתחיל {{PLURAL:$1|בשם המתחם הבא|באחד משמות הבאים}} ולהסתיים בסיומת קובץ תקינה: $2", 16 | "image-alt-text": "התמונה המקורית", 17 | "language-code": "שפות (לא חובה)", 18 | "engine": "מנוע זיהוי התווים OCR", 19 | "engine-name-google": "Google Cloud Vision OCR", 20 | "engine-name-tesseract": "Tesseract OCR", 21 | "engine-name-transkribus": "Transkribus OCR", 22 | 
"engine-not-found-warning": "המנוע המבוקש $1 לא נמצא. נא לנסות את $2 במקום זה.", 23 | "engine-invalid-langs-warning": "השפות הבאות לא תקניות או שלא נתמכות על ידי המנוע ולכן נדחו: $1", 24 | "submit": "לתמלל את כל הדף", 25 | "submit-crop": "אזור תמלול", 26 | "drag-help": "נא לבחור את כלי החיתוך ולגרור את המלבן בתמונה למטה כדי לתמלל רק אזור אחד בדף.", 27 | "drag-mode-move": "גרירה תזיז את התמונה", 28 | "drag-mode-move-alt": "סמל מייצג פעולת 'הזזה'", 29 | "drag-mode-crop": "גרירה תיצור אזור חיתוך חדש", 30 | "drag-mode-crop-alt": "אייקון מייצג פעולת 'חיתוך'", 31 | "copy-to-clipboard": "העתקה ללוח", 32 | "copied-to-clipboard": "הועתק!", 33 | "google-error": "השירות של גוגל החזיר שגיאה: $1", 34 | "image-retrieval-failed": "אחזור התמונה נכשל: $1", 35 | "documentation": "תיעוד", 36 | "api": "API", 37 | "api-tooltip": "ר' את תיעוד ה־API", 38 | "version": "גרסה $1", 39 | "report-issue": "דיווח על טעות", 40 | "langs-placeholder": "יש להשאיר ריק לזיהוי שפה אוטומטי", 41 | "langs-param-error": "{{PLURAL:$1|השפה שציינת אינה נתמכת|השפות שציינת אינן נתמכות}} על־ידי מנוע זיהוי התווים: $2", 42 | "loading-message": "מתבצע זיהוי תווים...", 43 | "tesseract-options": "אפשרויות טסרקט", 44 | "tesseract-psm-label": "שיטת חלוקת הדף למקטעים", 45 | "tesseract-psm-help": "יש לנסות את המצב 'טקסט דליל' לתמיכה טובה יותר בריבוי עמודות", 46 | "tesseract-psm-0": "כיוון הדף וזיהוי הכתב בלבד (OSD).", 47 | "tesseract-psm-1": "חלוקת דף אוטומטית למקטעים בשיטת כיוון הטקסט וזיהוי הכתב (OSD).", 48 | "tesseract-psm-2": "חלוקת דף אוטומטית למקטעים, אבל ללא OSD או OCR. (לא ממומש)", 49 | "tesseract-psm-3": "חלוקת דף אוטומטית למקטעים, אבל ללא OSD. 
(ברירת מחדל)", 50 | "tesseract-psm-4": "להניח עמודה בודדת של טקסט בגדלים משתנים.", 51 | "tesseract-psm-5": "להניח שהטקסט מיושר אנכית באופן אחיד.", 52 | "tesseract-psm-6": "להניח שהטקסט מיושר באופן אחיד.", 53 | "tesseract-psm-7": "להתייחס לתמונה כאל שורת טקסט אחת.", 54 | "tesseract-psm-8": "להתייחס לתמונה כאל מילה אחת.", 55 | "tesseract-psm-9": "להתייחס לתמונה כאל מילה בודדת בעיגול.", 56 | "tesseract-psm-10": "להתייחס לתמונה כאל תו בודד.", 57 | "tesseract-psm-11": "דילול הטקסט. מציאת כמה שיותר טקסט ללא סדר מסוים", 58 | "tesseract-psm-12": "דילול הטקסט בעזרת OSD.", 59 | "tesseract-psm-13": "שורה גולמית. להתייחס לתמונה כשורת טקסט אחת, ולעקוף אפשרויות מיוחדות לטסרקט.", 60 | "tesseract-param-error": "האפשרות $1 עם הערך $2 לא נתמכת על־ידי טסרקט. הערך המרבי הוא $3", 61 | "tesseract-no-text-error": "מנוע Tesseract לא החזיר טקסט לתמונה הזאת.", 62 | "tesseract-internal-error": "מנוע טסרקט החזיר שגיאה פנימית.", 63 | "transkribus-language-code": "מודל שפה", 64 | "transkribus-unauthorized-error": "קוד שגיאה ‚$1’ :: הבקשה לא מאומתת", 65 | "transkribus-default-error": "קוד שגיאה ‚$1’ :: לא ניתן להשלים את הבקשה, נא לנסות שוב!", 66 | "transkribus-empty-response-error": "לא היה אפשר לפענח את התוצאות מ־Transkribus API", 67 | "transkribus-init-process-error": "הפעלת תהליך ה־Transkribus נכשלה", 68 | "transkribus-failed-process-error": "תהליך Transkribus נכשל", 69 | "transkribus-no-lang-error": "לא נבחרה שפה", 70 | "transkribus-multiple-lang-error": "אסור להשתמש בכמה שפות, יש לציין שפה אחת", 71 | "transkribus-browse-public-models": "עיין בכל דגמי השפה הציבוריים עבור Transkribus", 72 | "transkribus-request-for-model": "הגשת בקשה להוספת דגם מ־Transkribus לכלי OCR", 73 | "transkribus-options": "אפשרויות Transkribus", 74 | "transkribus-line-label": "דגם זיהוי קו", 75 | "transkribus-line-id-none-option": "אין", 76 | "transkribus-mixed-line-option": "כיוון קו מעורב", 77 | "transkribus-line-help": "נא להשאיר את זה ריק אם אינך בטוח באיזה דגם זיהוי קו להשתמש", 78 | "transkribus-jobs": "משימות 
ל־Transkribus", 79 | "transkribus-job-id": "מזהה משימה", 80 | "transkribus-job-state": "מצב", 81 | "transkribus-job-description": "תיאור", 82 | "transkribus-job-start": "התחילה", 83 | "transkribus-job-end": "הסתיימה", 84 | "transkribus-job-waited": "השהיית התחלה (דקות)" 85 | } 86 | -------------------------------------------------------------------------------- /src/Engine/TesseractEngine.php: -------------------------------------------------------------------------------- 1 | ocr = $tesseractOcr; 38 | } 39 | 40 |
	/**
	 * @inheritDoc
	 */
	public static function getId(): string {
		return 'tesseract';
	}

	/**
	 * @inheritDoc
	 */
	public function getResult(
		string $imageUrl,
		string $invalidLangsMode,
		array $crop,
		?array $langs = null
	): EngineResult {
		// Check the URL and fetch the image data.
		$this->checkImageUrl( $imageUrl );

		[ $validLangs, $invalidLangs ] = $this->filterValidLangs( $langs, $invalidLangsMode );

		$image = $this->getImage( $imageUrl, $crop, self::DO_DOWNLOAD_IMAGE );
		$this->ocr->imageData( $image->getData(), $image->getSize() );

		if ( $validLangs ) {
			$this->ocr->lang( ...$validLangs );
		}

		// Collect warnings up front so that every return path below reports the ignored languages.
		$warnings = $invalidLangs ? [ $this->getInvalidLangsWarning( $invalidLangs ) ] : [];

		// Env vars are passed through by the thiagoalessio/tesseract_ocr package to the tesseract command,
		// but when they're loaded from Symfony's .env they aren't actually available (by design),
		// so we have to load this one manually. We only process one image at a time, so don't benefit from
		// multiple threads. See https://github.com/tesseract-ocr/tesseract/issues/898 for some more info.
		putenv( 'OMP_THREAD_LIMIT=1' );
		try {
			$text = $this->ocr->run();
		} catch ( UnsuccessfulCommandException $e ) {
			// An UnsuccessfulCommandException is thrown when there's no output, but that's not an
			// actual error so we check for it here and just show a warning. The same exception class
			// is also used for other things, hence the message check here.
			if ( strpos( $e->getMessage(), 'The command did not produce any output' ) !== false ) {
				// Previously the invalid-languages warning was dropped on this path.
				$warnings[] = $this->intuition->msg( 'tesseract-no-text-error' );
				return new EngineResult( '', $warnings );
			}
			throw $e;
		}

		return new EngineResult( $text, $warnings );
	}

	/**
	 * Set the page segmentation mode.
	 * @param int $psm Must be between 0 and self::MAX_PSM (inclusive).
	 * @throws OcrException If $psm is outside that range.
	 */
	public function setPsm( int $psm ): void {
		$this->validateOption( 'psm', $psm, self::MAX_PSM );
		$this->ocr->psm( $psm );
	}

	/**
	 * Get available PSM IDs and values.
	 *
	 * Each entry is an array with a 'value' key (the numeric mode) and a 'label' key (its
	 * localized description).
	 *
	 * @return mixed[][]
	 */
	public function getAvailablePsms(): array {
		// NOTE(review): mode 2 is not offered here, presumably because Tesseract does not
		// implement it (the 'tesseract-psm-2' message says as much) - confirm.
		$psmIds = [ 0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 ];
		$psms = [];
		foreach ( $psmIds as $psmId ) {
			$psms[] = [
				'value' => $psmId,
				// The following messages can be used here: 'tesseract-psm-0', 'tesseract-psm-1',
				// 'tesseract-psm-3', 'tesseract-psm-4', 'tesseract-psm-5', 'tesseract-psm-6', 'tesseract-psm-7',
				// 'tesseract-psm-8', 'tesseract-psm-9', 'tesseract-psm-10', 'tesseract-psm-11', 'tesseract-psm-12',
				// 'tesseract-psm-13'
				'label' => $this->intuition->msg( 'tesseract-psm-' . $psmId ),
			];
		}
		return $psms;
	}

	/**
	 * Validates the given option.
	 *
	 * The value must lie between 0 and $maximum (inclusive). Negative values used to slip through
	 * and were handed to the tesseract command; they are now rejected with the same error as
	 * values that are too large.
	 *
	 * @param string $option Option name, used to look up the "tesseract-$option-label" message.
	 * @param int $given The value supplied by the user.
	 * @param int $maximum The largest value that is accepted.
	 * @throws OcrException If $given is negative or greater than $maximum.
	 */
	private function validateOption( string $option, int $given, int $maximum ): void {
		if ( $given < 0 || $given > $maximum ) {
			throw new OcrException(
				'tesseract-param-error',
				[
					$this->intuition->msg( "tesseract-$option-label" ),
					$given,
					$maximum,
				]
			);
		}
	}
137 | } 138 | -------------------------------------------------------------------------------- /i18n/fa.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Darafsh", 5 | "Ebrahim", 6 | "Ebraminio", 7 | "Jeeputer", 8 | "Mfatemi", 9 | "Yoosef Pooranvary" 10 | ] 11 | }, 12 | "title": "OCR ویکی‌مدیا", 13 | "subtitle": "ابزاری برای رونویسی متن از تصاویر اسکن‌شده در ویکی‌انبار، برای استفاده در ویکی‌نبشته و جاهای دیگر.", 14 | "form-heading": "رونویسی یک تصویر", 15 | "image-url": "نشانی تصویر", 16 | "image-url-help": "نشانی URL تصویری را که در کارگزار ویکی‌مدیا میزبانی شده است را وارد کنید، مانند: $1", 17 | "image-url-error": "نشانی URL تصویر باید با {{PLURAL:$1|the following domain name|one of the following domain names}} آغاز شده و با شناسه فایل معتبری پایان یابد: $2", 18 | "image-alt-text": "تصویر اصلی", 19 | "language-code": "زبان‌ها (اختیاری)", 20 | "engine": "موتور OCR", 21 | "engine-name-transkribus": "Transkribus OCR", 22 | "engine-not-found-warning": "موتور درخواستی $1 پیدا نشد. 
به جای آن از موتور پیش‌فرض $2 استفاده می‌شود.", 23 | "engine-invalid-langs-warning": "زبان‌های زیر نامعتبر بوده یا توسط موتور پشتیبانی نمی‌شوند و از آنها چشم‌پوشی شد: $1", 24 | "submit": "رونویسی تمام صفحه", 25 | "submit-crop": "ناحیه رونویسی", 26 | "drag-help": "ابزار برش را انتخاب کرده و چهارگوشی بر روی تصویر زیر بکشید تا فقط یک ناحیه از صفحه رونویسی شود.", 27 | "drag-mode-move": "با کشیدن تصویر حرکت خواهد کرد.", 28 | "drag-mode-move-alt": "آیکان نشان‌دهندهٔ عمل «جابجایی»", 29 | "drag-mode-crop": "با کشیدن ناحیه چیدن جدیدی درست خواهد شد", 30 | "drag-mode-crop-alt": "آیکان نشان‌دهندهٔ عمل «چیدن»", 31 | "copy-to-clipboard": "کپی به کلیپ‌بورد", 32 | "copied-to-clipboard": "کپی شد!", 33 | "google-error": "سرویس گوگل خطایی برگرداند: $1", 34 | "image-retrieval-failed": "بازیابی تصویر ناموفق بود: $1", 35 | "documentation": "مستندات", 36 | "api-tooltip": "اسناد API را ببینید", 37 | "version": "نسخهٔ $1", 38 | "report-issue": "گزارش مشکل", 39 | "langs-placeholder": "برای شناسایی خودکار زبان، خالی بگذارید.", 40 | "langs-param-error": "این {{PLURAL:$1|زبان|زبان‌ها}} توسط موتور OCR پشتیبانی نمی‌شوند: $2", 41 | "loading-message": "در حال انجام رونویسی...", 42 | "tesseract-options": "گزینه‌های Tesseract", 43 | "tesseract-psm-label": "روش بخش‌بندی صفحه", 44 | "tesseract-psm-help": "برای پشتیبانی بهتر از متن چندستونی «متن پراکنده» را انتخاب کن.", 45 | "tesseract-psm-0": "فقط تشخیص جهت و خط (OSD)", 46 | "tesseract-psm-1": "بخش‌بندی خودکار صفحه با OSD", 47 | "tesseract-psm-2": "بخش‌بندی خودکار صفحه، ولی بدون استفاده از OSD یا OCR. (هنوز اجرایی نشده است)", 48 | "tesseract-psm-3": "بخش‌بندی کاملاً خودکار صفحه، ولی بدون OSD. 
(پیش‌فرض)", 49 | "tesseract-psm-4": "فرض کن متن تک‌ستونی با اندازه‌های گوناگون است.", 50 | "tesseract-psm-5": "فرض کن بلوک یکنواختی از متن با چینش عمودی است.", 51 | "tesseract-psm-6": "فرض کن متن یک بلوک یکنواخت است.", 52 | "tesseract-psm-7": "با تصویر مانند یک خط متن رفتار کن.", 53 | "tesseract-psm-8": "با تصویر به عنوان تک‌واژه رفتار کن.", 54 | "tesseract-psm-9": "با تصویر به عنوان تک‌واژه‌ای در یک دایره رفتار کن.", 55 | "tesseract-psm-10": "با تصویر به عنوان یک نویسه رفتار کن", 56 | "tesseract-psm-11": "متن پراکنده. تا جایی که ممکن است بدون توجه به ترتیب، متن پیدا کن.", 57 | "tesseract-psm-12": "متن پراکنده با OSD", 58 | "tesseract-psm-13": "خط خام. با تصویر به عنوان یک خط متن رفتار کن و از ترفندهای اختصاصی Tesseract صرف نظر کن.", 59 | "tesseract-param-error": "گزینه $1 با مقدار $2 توسط Tesseract پشتیبانی نمی‌شود. مقدار بیشینه: $3", 60 | "tesseract-no-text-error": "موتور Tesseract هیچ متنی برای این تصویر بازنگرداند.", 61 | "tesseract-internal-error": "موتور tesseract خطای داخلی برگرداند.", 62 | "transkribus-language-code": "مدل زبانی", 63 | "transkribus-unauthorized-error": "کد خطا ' $1 ' :: درخواست مجاز نیست", 64 | "transkribus-default-error": "کد خطا ' $1 ' :: درخواست تکمیل نشد، دوباره امتحان کنید!", 65 | "transkribus-empty-response-error": "نتیجه بازگردانی‌شده از API ترنسکریبوس قابل تشخیص و تجزیه نیست", 66 | "transkribus-init-process-error": "فرآیند Transkribus راه‌اندازی نشد", 67 | "transkribus-failed-process-error": "فرآیند Transkribus ناموفق بود", 68 | "transkribus-no-lang-error": "هیچ زبانی را انتخاب نکردید", 69 | "transkribus-multiple-lang-error": "نمی‌توانید چند زبان را انتخاب کنید، پس فقط یکی را برگزینید", 70 | "transkribus-browse-public-models": "مشاهده همه مدل‌های زبانی در ترنسکریبوس", 71 | "transkribus-request-for-model": "ثبت درخواست افزودن یک مدل ترنسکریبوس به ابزار رونویسی", 72 | "transkribus-options": "گزینه‌های ترنسکربوس", 73 | "transkribus-line-label": "مدل تشخیص خط", 74 | "transkribus-line-id-none-option": "هیچ کدام", 75 | 
"transkribus-mixed-line-option": "چیدمان مختلط خطوط", 76 | "transkribus-line-help": "اگر نمی‌دانید از کدام مدل تشخیص خطوط استفاده کنید این مورد را خالی بگذارید", 77 | "transkribus-jobs": "وظایف ترنسکریبوس", 78 | "transkribus-job-id": "وظیفه ID", 79 | "transkribus-job-state": "وضعیت", 80 | "transkribus-job-description": "توضیحات", 81 | "transkribus-job-start": "آغاز شده", 82 | "transkribus-job-end": "پایان", 83 | "transkribus-job-waited": "تأخیر آغاز (دقیقه)" 84 | } 85 | -------------------------------------------------------------------------------- /i18n/ar.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Dr-Taher", 5 | "Mdktb", 6 | "Meno25", 7 | "Mohanad", 8 | "Mohanad Kh" 9 | ] 10 | }, 11 | "title": "ويكيميديا-أداة التعرف على الحروف", 12 | "subtitle": "أداة لنقل النصوص من الصور الممسوحة ضوئيًا على ويكيميديا كومنز، لاستخدامها على ويكي مصدر وأماكن أخرى.", 13 | "form-heading": "نسخ صورة", 14 | "image-url": "عنوان صفحة الشبكة للصورة (URL)", 15 | "image-url-help": "أدخل عنوان صفحة الشبكة لصورة مرفوعة على أحد خوادم ويكيميديا مثل: $1", 16 | "image-url-error": "يجب أن يبدأ عنوان صفحة الشبكة للصورة بـ {{PLURAL:$1|اسم النطاق التالي|أحد أسماء النطاقات التالية}} وينتهي بامتداد ملف صالح: $2", 17 | "image-alt-text": "الصورة الأصلية", 18 | "language-code": "اللغات (اختياري)", 19 | "engine": "أداة التعرف على الحروف", 20 | "engine-name-google": "التعرف الضوئي على الحروف في Google Cloud Vision", 21 | "engine-name-tesseract": "برنامج Tesseract OCR", 22 | "engine-name-transkribus": "ترانسكريبوس OCR", 23 | "engine-not-found-warning": "لم يتم العثور على الأداة المطلوبة ' $1 '. 
استخدام الأداة الافتراضية \" $2 \" بدلاً من ذلك.", 24 | "engine-invalid-langs-warning": "اللغات التالية غير صالحة أو غير مدعومة من قبل الأداة وجرى تجاهلها: $1", 25 | "submit": "نسخ الصفحة بأكملها", 26 | "submit-crop": "منطقة النسخ", 27 | "drag-help": "حدد أداة الاقتصاص واسحب مستطيلًا على الصورة أدناه لتحديد مساحة واحدة فقط من الصفحة.", 28 | "drag-mode-move": "السحب سيحرك الصورة", 29 | "drag-mode-move-alt": "رمز يمثل إجراء \"النقل\".", 30 | "drag-mode-crop": "سيؤدي السحب إلى إنشاء منطقة اقتصاص جديدة", 31 | "drag-mode-crop-alt": "رمز يمثل إجراء \"الاقتصاص\".", 32 | "copy-to-clipboard": "نسخ إلى الحافظة", 33 | "copied-to-clipboard": "نُسِخت!", 34 | "google-error": "أظهرت خدمة جوجل خطأً: $1", 35 | "image-retrieval-failed": "فشل استرداد الصورة: $1", 36 | "documentation": "التوثيق", 37 | "api": "واجهة برمجة التطبيقات", 38 | "api-tooltip": "عرض وثائق API", 39 | "version": "الإصدار $1", 40 | "report-issue": "أبلغ عن مشكلة", 41 | "langs-placeholder": "اتركه فارغًا للكشف التلقائي عن اللغة.", 42 | "langs-param-error": "التالي {{PLURAL:$1|هذه اللغة|هذه اللغات}} لا يدعمها أداة التعرف على الحروف: $2", 43 | "loading-message": "جاري النسخ...", 44 | "tesseract-options": "خيارات Tesseract", 45 | "tesseract-psm-label": "طريقة تجزئة الصفحة", 46 | "tesseract-psm-help": "جرب \"Sparse text\" للحصول على دعم أفضل للأعمدة المتعددة.", 47 | "tesseract-psm-0": "اكتشاف الاتجاه والنص (OSD) فقط.", 48 | "tesseract-psm-1": "تقسيم تلقائي للصفحة باستخدام OSD.", 49 | "tesseract-psm-2": "تجزئة الصفحة تلقائيًا، ولكن بدون OSD أو OCR. (لم تنفذ)", 50 | "tesseract-psm-3": "تجزئة تلقائية بالكامل للصفحة، ولكن بدون OSD. 
(افتراضي)", 51 | "tesseract-psm-4": "افترض عمودًا واحدًا من النص ذي الأحجام المتغيرة.", 52 | "tesseract-psm-5": "افترض كتلة نصية واحدة متجانسة، مع محاذاة رأسية.", 53 | "tesseract-psm-6": "افترض وجود كتلة نصية واحدة موحدة.", 54 | "tesseract-psm-7": "تعامل مع الصورة كسطر نصي واحد.", 55 | "tesseract-psm-8": "تعامل مع الصورة ككلمة واحدة.", 56 | "tesseract-psm-9": "تعامل مع الصورة ككلمة واحدة في دائرة.", 57 | "tesseract-psm-10": "تعامل مع الصورة كحرف واحد.", 58 | "tesseract-psm-11": "نص متفرق. ابحث عن أكبر قدر ممكن من النص بدون ترتيب معين.", 59 | "tesseract-psm-12": "نصوص متفرقة مع OSD.", 60 | "tesseract-psm-13": "سطر أولي. تعامل مع الصورة كسطر نصي واحد، متجاوزًا الاختراقات الخاصة بـ Tesseract.", 61 | "tesseract-param-error": "لا تدعم Tesseract خيار \"$1\" بقيمة $2. الحد الأقصى للقيمة: $3", 62 | "tesseract-no-text-error": "لم يرد محرك Tesseract أي نص لهذه الصورة.", 63 | "tesseract-internal-error": "أظهرت أداة tesseract خطأ داخلي.", 64 | "transkribus-language-code": "نموذج اللغة", 65 | "transkribus-unauthorized-error": "رمز الخطأ '$1' :: الطلب غير مصرح به", 66 | "transkribus-default-error": "رمز الخطأ '$1' :: غير قادر على إكمال الطلب، حاول مرة أخرى!", 67 | "transkribus-empty-response-error": "لم يتمكن من تحليل النتيجة من واجهة برمجة تطبيقات Transkribus", 68 | "transkribus-init-process-error": "فشل في تهيئة عملية Transkribus", 69 | "transkribus-failed-process-error": "فشلت عملية Transkribus", 70 | "transkribus-no-lang-error": "لم يتم اختيار اللغة", 71 | "transkribus-multiple-lang-error": "لا يُسمح باستخدام لغات متعددة، حدد لغة واحدة", 72 | "transkribus-browse-public-models": "تصفح جميع نماذج اللغة العامة لـ Transkribus", 73 | "transkribus-request-for-model": "قم بتقديم طلب لإضافة نموذج من Transkribus إلى أداة OCR", 74 | "transkribus-options": "خيارات ترانسكريبوس", 75 | "transkribus-line-label": "نموذج اكتشاف الخط", 76 | "transkribus-line-id-none-option": "لا شيء", 77 | "transkribus-mixed-line-option": "اتجاه الخط المختلط", 78 | "transkribus-line-help": "اتركه فارغًا إذا 
لم تكن متأكدًا من نموذج اكتشاف الخط الذي يجب استخدامه", 79 | "transkribus-jobs": "وظائف ترانسكريبوس", 80 | "transkribus-job-id": "معرف الوظيفة", 81 | "transkribus-job-state": "الحالة", 82 | "transkribus-job-description": "الوصف", 83 | "transkribus-job-start": "بدأ", 84 | "transkribus-job-end": "انتهى", 85 | "transkribus-job-waited": "تأخير البدء (بالدقائق)" 86 | } 87 | -------------------------------------------------------------------------------- /i18n/sl.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Eleassar" 5 | ] 6 | }, 7 | "title": "WikimediaOCR", 8 | "subtitle": "Prepisovanje besedila s slik", 9 | "form-heading": "Prepis slike", 10 | "image-url": "URL slike", 11 | "image-url-help": "Vstavite URL slike, ki gostuje v strežniku Wikimedie, npr.: $1", 12 | "image-url-error": "URL slike se mora začeti z {{PLURAL:$1|naslednjim domenskim imenom|enim od naslednjih domenskih imen}} in se končati z veljavno datotečno pripono: $2", 13 | "image-alt-text": "Izvorna slika", 14 | "language-code": "Jeziki (neobvezno)", 15 | "engine": "Motor OCR", 16 | "engine-name-google": "OCR Google Cloud Vision", 17 | "engine-name-transkribus": "Transkribus OCR", 18 | "engine-not-found-warning": "Zahtevanega motorja »$1« ni bilo mogoče najti. 
Namesto tega uporabite privzeti motor »$2«.", 19 | "engine-invalid-langs-warning": "Naslednji jeziki so neveljavni ali jih motor ne podpira in so bili prezrti: $1", 20 | "submit": "Prepiši celotno stran", 21 | "submit-crop": "Prepiši območje", 22 | "drag-help": "Izberite orodje za obrezovanje in povlecite pravokotnik na spodnji sliki, da prepišete samo določeno območje strani.", 23 | "drag-mode-move": "Z vlečenjem boste premaknili sliko", 24 | "drag-mode-move-alt": "Ikona, ki predstavlja dejanje »premakni«.", 25 | "drag-mode-crop": "Z vlečenjem boste določili novo območje za obrez", 26 | "drag-mode-crop-alt": "Ikona, ki predstavlja dejanje »obreži«.", 27 | "copy-to-clipboard": "Kopiraj v odložišče", 28 | "copied-to-clipboard": "Kopirano!", 29 | "google-error": "Googlova storitev je vrnila napako: $1", 30 | "image-retrieval-failed": "Pridobivanje slike ni uspelo: $1", 31 | "documentation": "Dokumentacija", 32 | "api-tooltip": "Oglejte si dokumentacijo API-ja", 33 | "version": "Različica $1", 34 | "report-issue": "Sporočite težavo", 35 | "langs-placeholder": "Za samodejno prepoznavo jezika pustite prazno.", 36 | "langs-param-error": "{{PLURAL:$1|Naslednjega jezika|Naslednjih jezikov}} motor OCR ne podpira: $2", 37 | "tesseract-options": "Možnosti Tesseracta", 38 | "tesseract-psm-label": "Način segmentacije strani", 39 | "tesseract-psm-help": "Za boljšo podporo več stolpcev preizkusite »Sparse text«.", 40 | "tesseract-psm-0": "Samo usmeritev in prepoznava pisave (OSD).", 41 | "tesseract-psm-1": "Samodejna segmentacija strani z OSD.", 42 | "tesseract-psm-2": "Samodejna segmentacija strani, vendar brez OSD ali OCR. (ni implementirano)", 43 | "tesseract-psm-3": "Popolnoma samodejna segmentacija strani, vendar brez OSD. 
(privzeto)", 44 | "tesseract-psm-4": "Predpostavi en sam stolpec besedila različnih velikosti.", 45 | "tesseract-psm-5": "Predpostavi en sam enotni blok navpično poravnanega besedila.", 46 | "tesseract-psm-6": "Predpostavi en sam enotni blok besedila.", 47 | "tesseract-psm-7": "Sliko obravnavaj kot eno samo vrstico besedila.", 48 | "tesseract-psm-8": "Sliko obravnavaj kot eno samo besedo.", 49 | "tesseract-psm-9": "Sliko obravnavaj kot eno samo besedo v krogu.", 50 | "tesseract-psm-10": "Sliko obravnavaj kot en sam znak.", 51 | "tesseract-psm-11": "Pičlo besedilo. Poišči čim več besedila brez določenega vrstnega reda.", 52 | "tesseract-psm-12": "Pičlo besedilo z OSD.", 53 | "tesseract-psm-13": "Neobdelana vrstica. Obravnavaj sliko kot eno samo vrstico besedila, da se preprečijo artefakti, specifični za Tesseract.", 54 | "tesseract-param-error": "Možnosti »$1« z vrednostjo $2 Tesseract ne podpira. Najvišja vrednost: $3", 55 | "tesseract-internal-error": "Motor Tesseract je vrnil notranjo napako.", 56 | "transkribus-language-code": "Jezikovni model", 57 | "transkribus-unauthorized-error": "Koda napake »$1« :: Zahtevek ni pooblaščen", 58 | "transkribus-default-error": "Koda napake »$1« :: Zahtevka ni mogoče dokončati, poskusite znova!", 59 | "transkribus-empty-response-error": "Rezultata API-ja Transkribus ni bilo mogoče razčleniti", 60 | "transkribus-init-process-error": "Inicializacija postopka Transkribus ni uspela", 61 | "transkribus-failed-process-error": "Postopek Transkribus ni uspel", 62 | "transkribus-no-lang-error": "Izbran ni bil noben jezik", 63 | "transkribus-multiple-lang-error": "Več jezikov ni dovoljenih, določite en jezik", 64 | "transkribus-browse-public-models": "Prebrskajte vse javne jezikovne modele za Transkribus", 65 | "transkribus-request-for-model": "Ustvarite prošnjo za dodajanje modela iz Transkribusa v orodje OCR", 66 | "transkribus-options": "Možnosti Transkribus", 67 | "transkribus-line-label": "Model zaznavanja linij", 68 | 
"transkribus-line-id-none-option": "Noben", 69 | "transkribus-mixed-line-option": "Mešana usmeritev linij", 70 | "transkribus-line-help": "Če niste prepričani, kateri model zaznavanja linij uporabiti, pustite prazno.", 71 | "transkribus-jobs": "Opravila Transkribus", 72 | "transkribus-job-id": "ID opravila", 73 | "transkribus-job-state": "Stanje", 74 | "transkribus-job-description": "Opis", 75 | "transkribus-job-start": "Začeto", 76 | "transkribus-job-end": "Končano", 77 | "transkribus-job-waited": "Začetna zakasnitev (minute)" 78 | } 79 | -------------------------------------------------------------------------------- /i18n/krc.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Къарачайлы" 5 | ] 6 | }, 7 | "title": "ВикимедиаOCR", 8 | "subtitle": "Суратдан текстни ачыкъла", 9 | "form-heading": "Суратны ачыкъла", 10 | "image-url": "Суратны URL-и", 11 | "image-url-help": "Викимедиа серверде орналгъан суратны URL-ин салыгъы, сёз ючюн: $1", 12 | "image-url-error": "Сурат URL-и {{PLURAL:$1|бу домен ат бла|бу домен атланы бири бла}} башланыргъа керекди эмда джараулу файл кенгериу бла бошалыргъа керекди: $2", 13 | "image-alt-text": "Оригинал сурат", 14 | "language-code": "Тилле (амалсыз тюлдюле)", 15 | "engine": "OCR мотор", 16 | "engine-name-transkribus": "Transkribus OCR", 17 | "engine-not-found-warning": "Изленнген мотор' $1 ' табылалмады. 
Буну орнуна дефолт мотор ' $2 ' хайырланады.", 18 | "engine-invalid-langs-warning": "Тюбюндеги тилле джараусуздула неда мотор джанындан дагъан алмайдыла, неда джокъгъа саналадыла: $1", 19 | "submit": "Бютеу бетни транскрипция эт", 20 | "submit-crop": "Транскрипция къыр", 21 | "drag-help": "Бетни джангы бир джерин копия этер ючюн къыркъыу адырны сайла эмда тюбюндеги суратда тик тёртмюйюшню тарт.", 22 | "drag-mode-move": "Суратны тартыу, аны орнундан тебдирликди", 23 | "drag-mode-move-alt": "'Ташыу' этиуню кёргюзген белги.", 24 | "drag-mode-crop": "Тартыу, джангы къыркъыу аламны къурлукъду", 25 | "drag-mode-crop-alt": "'Къыркъыу' этиуню кёргюзген белги.", 26 | "copy-to-clipboard": "Алмашдырыу буферге копия эт", 27 | "copied-to-clipboard": "Копия этилди!", 28 | "google-error": "Google къуллукъ халат къайтарды: $1", 29 | "image-retrieval-failed": "Сурат алынамады: $1", 30 | "documentation": "Документация", 31 | "api-tooltip": "API документациягъа къара", 32 | "version": "Версия $1", 33 | "report-issue": "Проблеманы юсюнден билдир", 34 | "langs-placeholder": "Тилни автомат халда айгъакълар ючюн къырны бош къойгъуз.", 35 | "langs-param-error": "Бу {{PLURAL:$1|тил|тилле}}, OCR мотор джанындан дагъан болмайдыла: $2", 36 | "loading-message": "Транскрипция этиле турады...", 37 | "tesseract-options": "Tesseract джарашдырыула", 38 | "tesseract-psm-label": "Бетни бёлюмлеме амал", 39 | "tesseract-psm-help": "Талай багъананы дагъаны ючюн \"Аралыкълы текстни\" сынагъыз.", 40 | "tesseract-psm-0": "Къуру ориентация бла скриптлени табыу (OSD)", 41 | "tesseract-psm-1": "OSD бла автомат халда бетни бёлюмлеме.", 42 | "tesseract-psm-2": "Автомат халда бет бёлюмлеме, алай OSD неда OCR болмагъанлай. (этилмеди)", 43 | "tesseract-psm-3": "Толу автомат халда бет бёлюмлеме, алай OSD тышында. 
(Дефолт)", 44 | "tesseract-psm-4": "Тюрленнген ёлчемледе джангыз бир текст багъананы баргъа сана.", 45 | "tesseract-psm-5": "Тик халда тюзетилген текстни джангыз бир типли блогу баргъа сана.", 46 | "tesseract-psm-6": "Джангыз бир типли текст блокну баргъа сана.", 47 | "tesseract-psm-7": "Суратха бир текст тизгинча къара.", 48 | "tesseract-psm-8": "Суратха бир джангыз сёза къара.", 49 | "tesseract-psm-9": "Суратны тёгерек ичинде джангыз сёзча кёр.", 50 | "tesseract-psm-10": "Суратха джангыз символча къара.", 51 | "tesseract-psm-11": "Аралыкълы текст. Белгили низам бла не къадар кёб текст табалсанг таб.", 52 | "tesseract-psm-12": "OSD бла аралыкълы текст.", 53 | "tesseract-psm-13": "Чий тизгин. Tesseract энчи хакланы атлатыб, джангыз текст тизгин кибик къара.", 54 | "tesseract-param-error": "$2 багъагъа ие болгъан \"$1\" сайлама Tesseract джанындан дагъан болмаз. Максимум багъасы: $3", 55 | "tesseract-no-text-error": "Tesseract тебдириучю бу сурат ючюн текстни къайтармады.", 56 | "tesseract-internal-error": "Tesseract механизм ич халатны ызына къайтарды.", 57 | "transkribus-language-code": "Тил модель", 58 | "transkribus-unauthorized-error": "Халатны коду «$1» :: Излем авторизация ётмегенди", 59 | "transkribus-default-error": "Халатны коду «$1» :: Излем тамамланалмайды, джангыдан сынагъыз!", 60 | "transkribus-empty-response-error": "Transkribus API эсеб анализ этилалмады.", 61 | "transkribus-init-process-error": "Transkribus процессни инициализациясы джетишимсиз болду", 62 | "transkribus-failed-process-error": "Transkribus процесс джетишимсиз болду", 63 | "transkribus-no-lang-error": "Сайланнган тил джокъду", 64 | "transkribus-multiple-lang-error": "Талай тил хайырланыргъа болмайды, бирни белгилегиз", 65 | "transkribus-browse-public-models": "Transkribus ючюн бютеу ачыкъ тил моделлега къара", 66 | "transkribus-request-for-model": "OCR адыргъа Transkribus моделин къошаргъа излем эт", 67 | "transkribus-options": "Transkribus Опцияла", 68 | "transkribus-line-label": 
"Сызны Эслеген Модель", 69 | "transkribus-line-id-none-option": "Джокъду", 70 | "transkribus-mixed-line-option": "Сызны Къатыш Ориентациясы", 71 | "transkribus-line-help": "Къайсы сыз эслеучю моделни хайырланыргъа билмей эсегиз, бош къоюгъуз.", 72 | "transkribus-jobs": "Transkribus Вакансия", 73 | "transkribus-job-id": "Вакансия ID", 74 | "transkribus-job-state": "Хал", 75 | "transkribus-job-description": "Ачыкълау", 76 | "transkribus-job-start": "Башланнганды", 77 | "transkribus-job-end": "Бошалгъанды", 78 | "transkribus-job-waited": "Башлауну кечикгени (минутла)" 79 | } 80 | -------------------------------------------------------------------------------- /i18n/it.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Ajeje Brazorf", 5 | "Beta16", 6 | "Candalua" 7 | ] 8 | }, 9 | "title": "WikimediaOCR", 10 | "subtitle": "Uno strumento per trascrivere testo da immagini scansionate su Wikimedia Commons, per l'uso su Wikisource e altrove.", 11 | "form-heading": "Trascrivi un'immagine", 12 | "image-url": "URL dell'immagine", 13 | "image-url-help": "Inserisci l'URL di un'immagine ospitata su un server Wikimedia come: $1", 14 | "image-url-error": "L'URL dell'immagine deve iniziare con {{PLURAL:$1|il seguente nome di dominio|uno dei seguenti nomi di dominio}} e terminare con un'estensione di file valida: $2", 15 | "image-alt-text": "L'immagine originale", 16 | "language-code": "Lingue (facoltativo)", 17 | "engine": "Motore di OCR", 18 | "engine-name-transkribus": "Transkribus OCR", 19 | "engine-not-found-warning": "Il motore richiesto '$1' non è stato trovato. 
Al suo posto verrà usato il motore predefinito '$2'.", 20 | "engine-invalid-langs-warning": "Le seguenti lingue non sono valide o non sono supportate dal motore e sono state ignorate: $1", 21 | "submit": "Trascrivi tutta la pagina", 22 | "submit-crop": "Trascrivi un'area", 23 | "drag-help": "Seleziona lo strumento di ritaglio e trascina un rettangolo sull'immagine sottostante per trascrivere solo un'area della pagina.", 24 | "drag-mode-move": "Il trascinamento sposterà l'immagine", 25 | "drag-mode-move-alt": "Icona che rappresenta l'azione 'sposta'.", 26 | "drag-mode-crop": "Il trascinamento creerà una nuova area di ritaglio", 27 | "drag-mode-crop-alt": "Icona che rappresenta l'azione 'ritaglia'.", 28 | "copy-to-clipboard": "Copia negli appunti", 29 | "copied-to-clipboard": "Copiato!", 30 | "google-error": "Il servizio Google ha restituito un errore: $1", 31 | "image-retrieval-failed": "Recupero dell'immagine non riuscito: $1", 32 | "documentation": "Documentazione", 33 | "api-tooltip": "Vedi la documentazione dell'API", 34 | "version": "Versione $1", 35 | "report-issue": "Segnala un problema", 36 | "langs-placeholder": "Lascia vuoto per il rilevamento automatico della lingua.", 37 | "langs-param-error": "{{PLURAL:$1|La lingua seguente non è supportata|Le lingue seguenti non sono supportate}} dal motore OCR: $2", 38 | "tesseract-options": "Opzioni di Tesseract", 39 | "tesseract-psm-label": "Metodo di segmentazione della pagina", 40 | "tesseract-psm-help": "Prova \"Testo sparso\" per un migliore supporto multicolonna.", 41 | "tesseract-psm-0": "Solo rilevamento orientamento e script (OSD).", 42 | "tesseract-psm-1": "Segmentazione automatica delle pagine con OSD.", 43 | "tesseract-psm-2": "Segmentazione automatica della pagina, ma nessun OSD o OCR. (non implementato)", 44 | "tesseract-psm-3": "Segmentazione della pagina completamente automatica, ma nessun OSD. 
(Predefinito)", 45 | "tesseract-psm-4": "Presupponi una singola colonna di testo di dimensioni variabili.", 46 | "tesseract-psm-5": "Presupponi un singolo blocco uniforme di testo allineato verticalmente.", 47 | "tesseract-psm-6": "Presupponi un singolo blocco uniforme di testo.", 48 | "tesseract-psm-7": "Tratta l'immagine come una singola riga di testo.", 49 | "tesseract-psm-8": "Tratta l'immagine come una singola parola.", 50 | "tesseract-psm-9": "Tratta l'immagine come una singola parola in un cerchio.", 51 | "tesseract-psm-10": "Tratta l'immagine come un singolo carattere.", 52 | "tesseract-psm-11": "Testo sparso. Trova più testo possibile senza un ordine particolare.", 53 | "tesseract-psm-12": "Testo sparso con OSD.", 54 | "tesseract-psm-13": "Linea grezza. Tratta l'immagine come una singola riga di testo, aggirando gli hack specifici di Tesseract.", 55 | "tesseract-param-error": "L'opzione '$1' con un valore di $2 non è supportata da Tesseract. Valore massimo: $3", 56 | "tesseract-internal-error": "Il motore Tesseract ha restituito un errore interno.", 57 | "transkribus-language-code": "Modello linguistico", 58 | "transkribus-unauthorized-error": "Codice di errore '$1' :: La richiesta non è autorizzata", 59 | "transkribus-default-error": "Codice di errore '$1' :: Impossibile completare la richiesta, riprova!", 60 | "transkribus-empty-response-error": "Impossibile analizzare il risultato dall'API Transkribus", 61 | "transkribus-init-process-error": "Impossibile inizializzare il processo Transkribus", 62 | "transkribus-failed-process-error": "Il processo Transkribus non è riuscito", 63 | "transkribus-no-lang-error": "Non è stata selezionata alcuna lingua", 64 | "transkribus-multiple-lang-error": "Non sono consentite più lingue, specificare una lingua", 65 | "transkribus-browse-public-models": "Sfoglia tutti i modelli di linguaggio pubblico per Transkribus", 66 | "transkribus-request-for-model": "Fai una richiesta per aggiungere un modello da Transkribus allo 
strumento OCR", 67 | "transkribus-line-id-none-option": "Nessuno", 68 | "transkribus-job-id": "ID attività", 69 | "transkribus-job-state": "Stato", 70 | "transkribus-job-description": "Descrizione", 71 | "transkribus-job-start": "Iniziato", 72 | "transkribus-job-end": "Finito", 73 | "transkribus-job-waited": "Ritardo di avvio (minuti)" 74 | } 75 | -------------------------------------------------------------------------------- /i18n/tr.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Hedda", 5 | "Leo", 6 | "Slickdaddy" 7 | ] 8 | }, 9 | "title": "WikimediaOCR", 10 | "subtitle": "Wikimedia Commons'ta taranmış görsellerden metinleri yazıya dökmek ve Vikikaynak ve diğer yerlerde kullanmak için bir araç.", 11 | "form-heading": "Bir görüntüyü metne dönüştürün", 12 | "image-url": "Resim URL'si", 13 | "image-url-help": "Bir Wikimedia sunucusunda barındırılan bir resim URL'si ekleyin, örneğin: $1", 14 | "image-url-error": "Resim URL'si {{PLURAL:$1|şu alan adıyla|şu alan adlarından biriyle}} başlamalı ve geçerli bir dosya uzantısıyla bitmelidir: $2", 15 | "image-alt-text": "Özgün resim", 16 | "language-code": "Diller (isteğe bağlı)", 17 | "engine": "OCR motoru", 18 | "engine-name-transkribus": "Transkribus OCR", 19 | "engine-not-found-warning": "İstenen motor ' $1 ' bulunamadı. 
Bunun yerine varsayılan motor ' $2 ' kullanılıyor.", 20 | "engine-invalid-langs-warning": "Aşağıdaki diller geçersizdir veya altyapı tarafından desteklenmez ve yok sayılır: $1", 21 | "submit": "Tüm sayfayı transkript et", 22 | "submit-crop": "Transkript alanı", 23 | "drag-help": "Sayfanın yalnızca bir alanını kopyalamak için kırpma aracını seçin ve aşağıdaki resimde bir dikdörtgeni sürükleyin.", 24 | "drag-mode-move": "Sürüklemek görüntüyü hareket ettirir", 25 | "drag-mode-move-alt": "'Taşı' eylemini temsil eden simge.", 26 | "drag-mode-crop": "Sürükleme, yeni bir kırpma alanı oluşturacak", 27 | "drag-mode-crop-alt": "'Kırpma' eylemini temsil eden simge.", 28 | "copy-to-clipboard": "Panoya kopyala", 29 | "copied-to-clipboard": "Kopyalandı!", 30 | "google-error": "Google hizmeti bir hata verdi: $1", 31 | "image-retrieval-failed": "Resim alınamadı: $1", 32 | "documentation": "Belgelendirme", 33 | "api": "API", 34 | "api-tooltip": "API belgelerini görüntüleyin", 35 | "version": "Sürüm $1", 36 | "report-issue": "Sorun bildirin", 37 | "langs-placeholder": "Otomatik dil algılama için boş bırakın.", 38 | "langs-param-error": "Şu {{PLURAL:$1|dil|diller}}, OCR motoru tarafından desteklenmemektedir: $2", 39 | "loading-message": "Transkripsiyon gerçekleştiriliyor...", 40 | "tesseract-options": "Tesseract seçenekleri", 41 | "tesseract-psm-label": "Sayfa bölümleme yöntemi", 42 | "tesseract-psm-help": "Daha iyi çoklu sütun desteği için \"Aralıklı metin\"i deneyin.", 43 | "tesseract-psm-0": "Yalnızca yönlendirme ve komut dosyası algılama (OSD).", 44 | "tesseract-psm-1": "OSD ile otomatik sayfa bölümleme.", 45 | "tesseract-psm-2": "Otomatik sayfa bölümleme ancak OSD veya OCR mevcut değil. (uygulanmadı)", 46 | "tesseract-psm-3": "Tam otomatik sayfa bölümleme ancak OSD mevcut değil. 
(Varsayılan)", 47 | "tesseract-psm-4": "Değişken boyutlarda tek bir metin sütunu varsayın.", 48 | "tesseract-psm-5": "Dikey olarak hizalanmış tek bir tek tip metin bloğu varsayın.", 49 | "tesseract-psm-6": "Tek bir tek tip metin bloğu varsayın.", 50 | "tesseract-psm-7": "Resmi, tek bir metin satırı olarak ele alın.", 51 | "tesseract-psm-8": "Resmi, tek bir kelime olarak ele alın.", 52 | "tesseract-psm-9": "Resmi bir daire içinde tek bir kelime olarak ele alın.", 53 | "tesseract-psm-10": "Resmi, tek bir karakter olarak ele alın.", 54 | "tesseract-psm-11": "Aralıklı metin. Belirli bir sıra gözetmeden mümkün olduğunca fazla metin bulun.", 55 | "tesseract-psm-12": "OSD ile aralıklı metin.", 56 | "tesseract-psm-13": "Ham satır. Görüntüyü, Tesseract'a özgü saldırıları atlayarak tek bir metin satırı olarak ele alın.", 57 | "tesseract-param-error": "$2 değerine sahip '$1' seçeneği Tesseract tarafından desteklenmez. Maksimum değer: $3", 58 | "tesseract-no-text-error": "Tesseract motoru bu görüntü için herhangi bir metin döndürmedi.", 59 | "tesseract-internal-error": "Tesseract motoru dahili bir hata verdi.", 60 | "transkribus-language-code": "Dil Modeli", 61 | "transkribus-unauthorized-error": "Hata Kodu '$1' :: İstek yetkilendirilmedi", 62 | "transkribus-default-error": "Hata Kodu '$1' :: İstek tamamlanamadı, tekrar deneyin!", 63 | "transkribus-empty-response-error": "Transkribus API'sinden sonuç ayrıştırılamadı", 64 | "transkribus-init-process-error": "Transkribus işlemi başlatılamadı", 65 | "transkribus-failed-process-error": "Transkribus işlemi başarısız oldu", 66 | "transkribus-no-lang-error": "Dil seçilmedi", 67 | "transkribus-multiple-lang-error": "Birden fazla dile izin verilmiyor, bir dil belirtin", 68 | "transkribus-browse-public-models": "Transkribus için tüm genel dil modellerine göz atın", 69 | "transkribus-request-for-model": "Transkribus'tan OCR aracına bir model eklemek için istekte bulunun", 70 | "transkribus-options": "Transkribus Seçenekleri", 71 | 
"transkribus-line-label": "Çizgi Algılama Modeli", 72 | "transkribus-line-id-none-option": "Hiçbiri", 73 | "transkribus-mixed-line-option": "Karışık Çizgi Yönü", 74 | "transkribus-line-help": "Hangi hat algılama modelini kullanacağınızdan emin değilseniz boş bırakın", 75 | "transkribus-jobs": "Transkribus İşleri", 76 | "transkribus-job-id": "İş Kimliği", 77 | "transkribus-job-state": "Durum", 78 | "transkribus-job-description": "Açıklama", 79 | "transkribus-job-start": "Başladı", 80 | "transkribus-job-end": "Bitti", 81 | "transkribus-job-waited": "Başlatma gecikmesi (dakika)" 82 | } 83 | -------------------------------------------------------------------------------- /i18n/ms.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Anakmalaysia", 5 | "Hakimi97", 6 | "Tofeiku" 7 | ] 8 | }, 9 | "title": "WikimediaOCR", 10 | "subtitle": "Transkripsikan teks daripada imej", 11 | "form-heading": "Transkripsikan imej", 12 | "image-url": "URL imej", 13 | "image-url-help": "Masukkan URL imej yang dihoskan pada pelayan Wikimedia seperti: $1", 14 | "image-url-error": "URL imej mesti bermula dengan {{PLURAL:$1|nama domain berikut|salah satu daripada nama domain berikut}} dan berakhir dengan sambungan fail yang sah: $2", 15 | "image-alt-text": "Imej asal", 16 | "language-code": "Bahasa (pilihan)", 17 | "engine": "Enjin OCR", 18 | "engine-name-transkribus": "Transkribus OCR", 19 | "engine-not-found-warning": "Enjin yang diminta '$1' tidak ditemui. 
Menggunakan enjin lalai '$2' sebaliknya.", 20 | "engine-invalid-langs-warning": "Bahasa berikut tidak sah atau tidak disokong oleh enjin dan diabaikan: $1", 21 | "submit": "Transkripsikan keseluruhan halaman", 22 | "submit-crop": "Transkripsikan lingkungan", 23 | "drag-help": "Pilih alat pangkas dan seret segi empat panjang pada imej di bawah untuk menyalin hanya satu lingkungan halaman.", 24 | "drag-mode-move": "Menyeret akan mengalihkan imej", 25 | "drag-mode-move-alt": "Ikon yang mewakili tindakan 'pindah'.", 26 | "drag-mode-crop": "Menyeret akan mewujudkan keluasan pangkas baharu", 27 | "drag-mode-crop-alt": "Ikon yang mewakili tindakan 'pangkas'.", 28 | "copy-to-clipboard": "Salin ke papan keratan (''clipboard''):", 29 | "copied-to-clipboard": "Disalin!", 30 | "google-error": "Perkhidmatan Google mengembalikan ralat: $1", 31 | "image-retrieval-failed": "Pengambilan semula imej gagal: $1", 32 | "documentation": "Pendokumenan", 33 | "api-tooltip": "Lihat pendokumenan API", 34 | "version": "Versi $1", 35 | "report-issue": "Laporkan masalah", 36 | "langs-placeholder": "Biarkan kosong untuk pengesanan bahasa automatik.", 37 | "langs-param-error": "{{PLURAL:$1|Bahasa|Bahasa-bahasa}} berikut tidak disokong oleh enjin OCR: $2", 38 | "loading-message": "Melakukan transkripsi...", 39 | "tesseract-options": "Pilihan Tesseract", 40 | "tesseract-psm-label": "Kaedah segmen halaman", 41 | "tesseract-psm-help": "Cuba \"Taburkan teks\" untuk sokongan berbilang lajur yang lebih baik.", 42 | "tesseract-psm-0": "Hanya pengesanan orientasi dan skrip (OSD).", 43 | "tesseract-psm-1": "Pensegmenan halaman automatik dengan OSD.", 44 | "tesseract-psm-2": "Pensegmenan halaman automatik, tetapi tiada OSD atau OCR. (tidak dilaksanakan)", 45 | "tesseract-psm-3": "Pensegmenan halaman automatik penuh, tetapi tiada OSD. 
(Lalai)", 46 | "tesseract-psm-4": "Andaikan satu lajur teks dengan saiz berubah-ubah.", 47 | "tesseract-psm-5": "Andaikan satu blok seragam tunggal teks yang dijajarkan secara menegak.", 48 | "tesseract-psm-6": "Andaikan satu blok seragam tunggal teks.", 49 | "tesseract-psm-7": "Anggap imej sebagai satu baris teks.", 50 | "tesseract-psm-8": "Anggap imej sebagai satu perkataan tunggal.", 51 | "tesseract-psm-9": "Anggap imej sebagai satu perkataan tunggal dalam lingkaran.", 52 | "tesseract-psm-10": "Anggap imej sebagai satu patah perkataan.", 53 | "tesseract-psm-11": "Taburkan teks. Cari sebanyak teks yang mungkin tanpa urutan tertentu.", 54 | "tesseract-psm-12": "Taburkan teks dengan OSD.", 55 | "tesseract-psm-13": "Barisan mentah. Anggap imej sebagai satu baris teks, memintas penggodaman yang khusus Tesseract.", 56 | "tesseract-param-error": "Pilihan '$1' dengan nilai $2 adalah tidak disokong oleh Tesseract. Nilai maksimum: $3", 57 | "tesseract-no-text-error": "Enjin Tesseract tidak mengembalikan sebarang teks untuk imej ini.", 58 | "tesseract-internal-error": "Enjin tesseract mengembalikan ralat dalaman.", 59 | "transkribus-language-code": "Model bahasa", 60 | "transkribus-unauthorized-error": "Kod Ralat '$1' :: Permintaan tidak dibenarkan", 61 | "transkribus-default-error": "Kod Ralat '$1' :: Tidak dapat memenuhi permintaan, cuba lagi!", 62 | "transkribus-empty-response-error": "Tidak dapat menghuraikan hasil daripada API Transkribus", 63 | "transkribus-init-process-error": "Gagal memulakan proses Transkribus", 64 | "transkribus-failed-process-error": "Proses transkribus gagal", 65 | "transkribus-no-lang-error": "Tiada bahasa dipilih", 66 | "transkribus-multiple-lang-error": "Keberbilangan bahasa tidak dibenarkan, khususkan satu bahasa", 67 | "transkribus-browse-public-models": "Semak imbas semua model bahasa awam untuk Transkribus", 68 | "transkribus-request-for-model": "Buat permintaan untuk menambah model daripada Transkribus kepada alat OCR", 69 | 
"transkribus-options": "Pilihan Transkribus", 70 | "transkribus-line-label": "Model Pengesanan Barisan", 71 | "transkribus-line-id-none-option": "Tiada", 72 | "transkribus-mixed-line-option": "Orientasi Barisan Campuran", 73 | "transkribus-line-help": "Biarkan kosong jika anda tidak pasti model pengesanan barisan mana yang hendak digunakan", 74 | "transkribus-jobs": "Tugasan Transkribus", 75 | "transkribus-job-id": "ID tugasan", 76 | "transkribus-job-state": "Keadaan", 77 | "transkribus-job-description": "Keterangan", 78 | "transkribus-job-start": "Bermula", 79 | "transkribus-job-end": "Selesai", 80 | "transkribus-job-waited": "Kelewatan bermula (minit)" 81 | } 82 | -------------------------------------------------------------------------------- /src/Engine/TranskribusEngine.php: -------------------------------------------------------------------------------- 1 | 'Balinese Line Detection Model', 24 | ]; 25 | 26 | /** 27 | * TranskribusEngine constructor. 28 | * @param TranskribusClient $transkribusClient 29 | * @param Intuition $intuition 30 | * @param string $projectDir 31 | * @param HttpClientInterface $httpClient 32 | */ 33 | public function __construct( 34 | TranskribusClient $transkribusClient, 35 | Intuition $intuition, 36 | string $projectDir, 37 | HttpClientInterface $httpClient 38 | ) { 39 | parent::__construct( $intuition, $projectDir, $httpClient ); 40 | 41 | $this->transkribusClient = $transkribusClient; 42 | } 43 | 44 | /** 45 | * @inheritDoc 46 | */ 47 | public static function getId(): string { 48 | return 'transkribus'; 49 | } 50 | 51 | /** 52 | * Get line detection models accepted by the engine 53 | * @param bool $onlyLineIds Whether to return only the line detection model IDs 54 | * @param bool $onlyLineIdLangs Whether to return only the line detection model IDs lang codes 55 | * @return string[] Line detection model lang codes or model IDs or model ID names 56 | */ 57 | public function getValidLineIds( bool $onlyLineIds = false, bool 
$onlyLineIdLangs = false ): array { 58 | $filteredLangList = array_filter( 59 | $this->getModelList(), static function ( $value ) { 60 | return isset( $value['line'] ) && $value['line'] !== ''; 61 | } 62 | ); 63 | 64 | $lineIdLangs = array_keys( $filteredLangList ); 65 | 66 | // return only the lang names as written in the models.json file 67 | if ( $onlyLineIdLangs ) { 68 | return $lineIdLangs; 69 | } 70 | 71 | // create a list that maps from lang name to line detection model name 72 | $lineIDList = []; 73 | foreach ( $lineIdLangs as $lineIdLang ) { 74 | $lineIDList[$lineIdLang] = $this->getLineIdModelName( $lineIdLang ); 75 | } 76 | 77 | // create a list that maps from line detection model ID to line detection model name 78 | $list = []; 79 | foreach ( $lineIdLangs as $lineIDKey ) { 80 | $list[$filteredLangList[$lineIDKey]['line']] = $lineIDList[$lineIDKey]; 81 | } 82 | 83 | // return only the line detection model IDs 84 | if ( $onlyLineIds ) { 85 | return array_keys( $list ); 86 | } 87 | 88 | return $list; 89 | } 90 | 91 | /** 92 | * Get name of the given line detection model from the language code 93 | * @param string|null $lineIdLang 94 | * @return string 95 | */ 96 | public function getLineIdModelName( ?string $lineIdLang = null ): string { 97 | return self::LINE_ID_MODEL_NAMES[$lineIdLang]; 98 | } 99 | 100 | /** 101 | * Set the line detection model ID for the Transkribus engine 102 | * @param int $lineId 103 | * @return void 104 | */ 105 | public function setLineId( int $lineId ): void { 106 | $this->lineId = $lineId; 107 | } 108 | 109 | /** 110 | * @inheritDoc 111 | * @throws OcrException 112 | */ 113 | public function getResult( 114 | string $imageUrl, 115 | string $invalidLangsMode, 116 | array $crop, 117 | ?array $langs = null 118 | ): EngineResult { 119 | $this->checkImageUrl( $imageUrl ); 120 | 121 | $points = ''; 122 | if ( $crop ) { 123 | $x = $crop['x']; 124 | $y = $crop['y']; 125 | $yPlusH = $crop['y'] + $crop['height']; 126 | $xPlusW = $crop['x'] 
+ $crop['width']; 127 | $points = $x . ',' . $y . ' ' . $xPlusW . ',' . 128 | $y . ' ' . $xPlusW . ',' . $yPlusH . ' ' . $x . ',' . $yPlusH; 129 | } 130 | 131 | $htrModelId = 0; 132 | [ $validLangs, $invalidLangs ] = $this->filterValidLangs( $langs, $invalidLangsMode ); 133 | if ( !$validLangs ) { 134 | throw new OcrException( 'transkribus-no-lang-error' ); 135 | } 136 | 137 | if ( count( $validLangs ) > 1 ) { 138 | throw new OcrException( 'transkribus-multiple-lang-error' ); 139 | } 140 | $modelCode = $validLangs[0]; 141 | $modelInfo = $this->getModelList()[$modelCode]; 142 | $htrModelId = (int)$modelInfo['htr']; 143 | $image = $this->getImage( $imageUrl, $crop, self::DO_DOWNLOAD_IMAGE ); 144 | $processId = $this->transkribusClient->initProcess( $image, $htrModelId, $this->lineId, $points ); 145 | 146 | $resText = ''; 147 | while ( $this->transkribusClient->processStatus !== 'FINISHED' ) { 148 | $resText = $this->transkribusClient->retrieveProcessResult( $processId ); 149 | sleep( 2 ); 150 | } 151 | 152 | $warnings = $invalidLangs ? 
[ $this->getInvalidLangsWarning( $invalidLangs ) ] : []; 153 | return new EngineResult( $resText, $warnings ); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /i18n/bn.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "Bodhisattwa", 5 | "Tahmid", 6 | "ZI Jony", 7 | "আফতাবুজ্জামান" 8 | ] 9 | }, 10 | "title": "উইকিমিডিয়া ওসিআর", 11 | "subtitle": "চিত্র থেকে পাঠ্য আহরণ করুন", 12 | "form-heading": "ছবি ট্রান্সক্রাইব করুন", 13 | "image-url": "চিত্রের ইউআরএল", 14 | "image-url-help": "উইকিমিডিয়া সার্ভারে হোস্ট করা একটি চিত্রের ইউআরএল সন্নিবেশ করুন যেমন: $1", 15 | "image-url-error": "ছবির ইউআরএল {{PLURAL:$1|নিম্নোক্ত ডোমেইন নাম|নিম্নোক্ত ডোমেইন নামগুলোর কোনো একটি}} দিয়ে শুরু হতে হবে এবং বৈধ ফাইল এক্সটেনশন দিয়ে শেষ হতে হবে: $2", 16 | "image-alt-text": "প্রকৃত চিত্র", 17 | "language-code": "ভাষা (ঐচ্ছিক)", 18 | "engine": "ওসিআর ইঞ্জিন", 19 | "engine-name-google": "গুগল ক্লাউড ভিশন ওসিআর", 20 | "engine-name-tesseract": "টেসারেক্ট ওসিআর", 21 | "engine-name-transkribus": "ট্রান্সক্রিবাস ওসিআর", 22 | "engine-not-found-warning": "অনুরোধকৃত ইঞ্জিন '$1' খুঁজে পাওয়া যায়নি। এর পরিবর্তে পূর্বনির্ধারিত ইঞ্জিন '$2' ব্যবহার করা হচ্ছে।", 23 | "engine-invalid-langs-warning": "নিম্নলিখিত ভাষাগুলি অবৈধ বা ইঞ্জিন দ্বারা সমর্থিত নয় এবং উপেক্ষা করা হয়েছে: $1", 24 | "submit": "পুরো পাতা থেকে আহরণ করুন", 25 | "submit-crop": "আহরণের এলাকা", 26 | "drag-help": "পৃষ্ঠার শুধুমাত্র অংশবিশেষ ট্রান্সক্রাইব করতে ক্রপ টুলটি নির্বাচন করুন এবং নিচের ছবির উপর একটি আয়তক্ষেত্র আকুন।", 27 | "drag-mode-move": "ড্র্যাগ করলে ছবিটি সরে যাবে", 28 | "drag-mode-move-alt": "'সরানো' ক্রিয়ার প্রতিনিধিত্বকারী আইকন।", 29 | "drag-mode-crop": "ড্যাগ করলে নতুন ক্রপ এলাকা তৈরি হবে", 30 | "drag-mode-crop-alt": "'ক্রপ' ক্রিয়ার প্রতিনিধিত্বকারী আইকন", 31 | "copy-to-clipboard": "ক্লিপবোর্ডে অনুলিপি করুন", 32 | "copied-to-clipboard": "অনুলিপি করা হয়েছে!", 33 | "google-error": "গুগল 
পরিষেবা একটি ত্রুটি প্রদান করেছে: $1", 34 | "image-retrieval-failed": "ছবি পুনরুদ্ধার ব্যর্থ হয়েছেঃ $1", 35 | "documentation": "নথিপত্র", 36 | "api": "এপিআই", 37 | "api-tooltip": "এপিআই নথিপত্র দেখুন", 38 | "version": "সংস্করণ $1", 39 | "report-issue": "সমস্যা প্রতিবেদন করুন", 40 | "langs-placeholder": "স্বয়ংক্রিয়ভাবে ভাষা সনাক্তকরণের জন্য ফাঁকা রাখুন।", 41 | "langs-param-error": "নিম্নলিখিত {{PLURAL:$1|ভাষাটি|ভাষাগুলি}} ওসিআর ইঞ্জিন দ্বারা সমর্থিত নয়: $2", 42 | "loading-message": "ট্রান্সক্রাইব করা হচ্ছে...", 43 | "tesseract-options": "টেসারেক্ট বিকল্প", 44 | "tesseract-psm-label": "পৃষ্ঠা বিভাজনের পদ্ধতি", 45 | "tesseract-psm-help": "আরও ভালো বহু-কলাম সহায়তার জন্য \"ছড়ানো বিক্ষিপ্ত পাঠ্য\" নির্বাচন করুন।", 46 | "tesseract-psm-0": "শুধু দিক এবং লিপি নির্ধারণ।", 47 | "tesseract-psm-1": "ওএসডি দিয়ে স্বয়ংক্রিয় পৃষ্ঠা বিভাজন।", 48 | "tesseract-psm-2": "স্বয়ংক্রিয় পৃষ্ঠা বিভাজন, কিন্তু ওএসডি বা ওসিআর ছাড়া। (বাস্তবায়িত হয়নি)", 49 | "tesseract-psm-3": "সম্পূর্ণ স্বয়ংক্রিয় পৃষ্ঠা বিভাজন, কিন্তু ওএসডি ছাড়া। (পূর্বনির্ধারিত)", 50 | "tesseract-psm-4": "পরিবর্তনশীল আকারযুক্ত পাঠ্যের একটি একক কলাম ধরে নিন।", 51 | "tesseract-psm-5": "উল্লম্বভাবে সারিবদ্ধ পাঠ্যের একটি একক অভিন্ন ব্লক ধরে নিন।", 52 | "tesseract-psm-6": "পাঠ্যের একটি একক অভিন্ন ব্লক ধরে নিন।", 53 | "tesseract-psm-7": "চিত্রটিকে একটি একক পাঠ্য লাইন হিসেবে বিবেচনা করুন।", 54 | "tesseract-psm-8": "চিত্রটিকে একটি শব্দ হিসেবে বিবেচনা করুন।", 55 | "tesseract-psm-9": "চিত্রটিকে একটি বৃত্তে একটি শব্দ হিসেবে বিবেচনা করুন।", 56 | "tesseract-psm-10": "চিত্রটিকে একটি একক অক্ষর হিসেবে বিবেচনা করুন।", 57 | "tesseract-psm-11": "ছড়ানো বিক্ষিপ্ত পাঠ্য। কোনও নির্দিষ্ট ক্রম ছাড়াই যতটা সম্ভব পাঠ্য খুঁজুন।", 58 | "tesseract-psm-12": "ওএসডিসহ স্পার্স পাঠ্য।", 59 | "tesseract-psm-13": "পাঠ্যের লাইন। টেসরাক্টের নিজস্ব হ্যাকগুলিকে বাদ দিয়ে চিত্রটিকে পাঠ্যের একটি একক লাইন হিসাবে বিবেচনা করুন।", 60 | "tesseract-param-error": "টেসার‍্যাক্টে '$1' অপশনে $2 মান ব্যবহার করা যাবে না। সর্বোচ্চ মান: $3", 61 | 
"tesseract-no-text-error": "টেসরাক্ট ইঞ্জিন এই ছবির জন্য কোনও লেখা প্রদান করেনি।", 62 | "tesseract-internal-error": "টেসারেক্ট ইঞ্জিনে একটি অভ্যন্তরীণ ত্রুটি হয়েছে।", 63 | "transkribus-language-code": "ভাষা মডেল", 64 | "transkribus-unauthorized-error": "ত্রুটি কোড '$1' :: অনুরোধটি অনুমোদিত নয়।", 65 | "transkribus-default-error": "ত্রুটি কোড '$1' :: অনুরোধ সম্পন্ন করা যায়নি, আবার চেষ্টা করুন!", 66 | "transkribus-empty-response-error": "ট্রান্সক্রিবাস এপিআই থেকে ফলাফল পার্স করা যায়নি", 67 | "transkribus-init-process-error": "ট্রান্সক্রিবাস প্রক্রিয়া আরম্ভকরণ ব্যর্থ হয়েছে", 68 | "transkribus-failed-process-error": "ট্রান্সক্রিবাস প্রক্রিয়া ব্যর্থ হয়েছে", 69 | "transkribus-no-lang-error": "কোনও ভাষা নির্বাচন করা হয়নি", 70 | "transkribus-multiple-lang-error": "একাধিক ভাষা অনুমোদিত নয়, একটি ভাষা নির্দিষ্ট করুন", 71 | "transkribus-browse-public-models": "ট্রান্সক্রিবাসের সকল পাবলিক ভাষার মডেল ব্রাউজ করুন", 72 | "transkribus-request-for-model": "ট্রান্সক্রিবাস থেকে ওসিআর সরঞ্জামে একটি মডেল যোগ করার জন্য অনুরোধ করুন", 73 | "transkribus-options": "ট্রান্সক্রিবাস বিকল্প", 74 | "transkribus-line-label": "রেখা সনাক্তকরণ মডেল", 75 | "transkribus-line-id-none-option": "কোনোটিই নয়", 76 | "transkribus-mixed-line-option": "মিশ্র রেখা অভিমুখায়ন", 77 | "transkribus-line-help": "কোন লাইন সনাক্তকরণ মডেল ব্যবহার করবেন তা নিশ্চিত না হলে খালি রাখুন", 78 | "transkribus-jobs": "ট্রান্সক্রিবাস জবস", 79 | "transkribus-job-id": "কাজের আইডি", 80 | "transkribus-job-state": "স্থিতি", 81 | "transkribus-job-description": "বিবরণ", 82 | "transkribus-job-start": "শুরু হয়েছে", 83 | "transkribus-job-end": "সম্পন্ন হয়েছে", 84 | "transkribus-job-waited": "শুরুর বিলম্ব (মিনিট)" 85 | } 86 | -------------------------------------------------------------------------------- /i18n/en.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": {}, 3 | "title": "WikimediaOCR", 4 | "subtitle": "A tool to transcribe text from scanned images on Wikimedia 
Commons, for use on Wikisource and elsewhere.", 5 | "form-heading": "Transcribe an image", 6 | "image-url": "Image URL", 7 | "image-url-help": "Insert an image URL hosted on a Wikimedia server such as: $1", 8 | "image-url-error": "Image URL must begin with {{PLURAL:$1|the following domain name|one of the following domain names}} and end with a valid file extension: $2", 9 | "image-alt-text": "The original image", 10 | "language-code": "Languages (optional)", 11 | "engine": "OCR engine", 12 | "engine-name-google": "Google Cloud Vision OCR", 13 | "engine-name-tesseract": "Tesseract OCR", 14 | "engine-name-transkribus": "Transkribus OCR", 15 | "engine-not-found-warning": "The requested engine '$1' was not found. Using the default engine '$2' instead.", 16 | "engine-invalid-langs-warning": "The following languages are invalid or not supported by the engine and were ignored: $1", 17 | "submit": "Transcribe whole page", 18 | "submit-crop": "Transcribe area", 19 | "drag-help": "Select the crop tool and drag a rectangle on the image below to transcribe only one area of the page.", 20 | "drag-mode-move": "Dragging will move the image", 21 | "drag-mode-move-alt": "Icon representing the 'move' action.", 22 | "drag-mode-crop": "Dragging will create a new crop area", 23 | "drag-mode-crop-alt": "Icon representing the 'crop' action.", 24 | "copy-to-clipboard": "Copy to clipboard", 25 | "copied-to-clipboard": "Copied!", 26 | "google-error": "The Google service returned an error: $1", 27 | "image-retrieval-failed": "Image retrieval failed: $1", 28 | "documentation": "Documentation", 29 | "api": "API", 30 | "api-tooltip": "View the API documentation", 31 | "version": "Version $1", 32 | "report-issue": "Report an issue", 33 | "langs-placeholder": "Leave blank for automatic language detection.", 34 | "langs-param-error": "The following {{PLURAL:$1|language is|languages are}} not supported by the OCR engine: $2", 35 | "loading-message": "Performing transcription...", 36 | 
"tesseract-options": "Tesseract options", 37 | "tesseract-psm-label": "Page segmentation method", 38 | "tesseract-psm-help": "Try \"Sparse text\" for better multi-column support.", 39 | "tesseract-psm-0": "Orientation and script detection (OSD) only.", 40 | "tesseract-psm-1": "Automatic page segmentation with OSD.", 41 | "tesseract-psm-2": "Automatic page segmentation, but no OSD, or OCR. (not implemented)", 42 | "tesseract-psm-3": "Fully automatic page segmentation, but no OSD. (Default)", 43 | "tesseract-psm-4": "Assume a single column of text of variable sizes.", 44 | "tesseract-psm-5": "Assume a single uniform block of vertically aligned text.", 45 | "tesseract-psm-6": "Assume a single uniform block of text.", 46 | "tesseract-psm-7": "Treat the image as a single text line.", 47 | "tesseract-psm-8": "Treat the image as a single word.", 48 | "tesseract-psm-9": "Treat the image as a single word in a circle.", 49 | "tesseract-psm-10": "Treat the image as a single character.", 50 | "tesseract-psm-11": "Sparse text. Find as much text as possible in no particular order.", 51 | "tesseract-psm-12": "Sparse text with OSD.", 52 | "tesseract-psm-13": "Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.", 53 | "tesseract-param-error": "The '$1' option with a value of $2 is not supported by Tesseract. 
Maximum value: $3", 54 | "tesseract-no-text-error": "The Tesseract engine did not return any text for this image.", 55 | "tesseract-internal-error": "The Tesseract engine returned an internal error.", 56 | "transkribus-language-code": "Language Model", 57 | "transkribus-unauthorized-error": "Error Code '$1' :: The request is not authorized", 58 | "transkribus-default-error": "Error Code '$1' :: Unable to complete request, try again!", 59 | "transkribus-empty-response-error": "Could not parse result from Transkribus API", 60 | "transkribus-init-process-error": "Failed to initialize Transkribus process", 61 | "transkribus-failed-process-error": "Transkribus process failed", 62 | "transkribus-no-lang-error": "No language was selected", 63 | "transkribus-multiple-lang-error": "Multiple languages are not allowed, specify one language", 64 | "transkribus-browse-public-models": "Browse all public language models for Transkribus", 65 | "transkribus-request-for-model": "Make a request to add a model from Transkribus to the OCR tool", 66 | "transkribus-options": "Transkribus Options", 67 | "transkribus-line-label": "Line Detection Model", 68 | "transkribus-line-id-none-option": "None", 69 | "transkribus-mixed-line-option": "Mixed Line Orientation", 70 | "transkribus-line-help": "Leave empty if you are not sure of which line detection model to use", 71 | "transkribus-jobs": "Transkribus Jobs", 72 | "transkribus-job-id": "Job ID", 73 | "transkribus-job-state": "State", 74 | "transkribus-job-description": "Description", 75 | "transkribus-job-start": "Started", 76 | "transkribus-job-end": "Finished", 77 | "transkribus-job-waited": "Start delay (minutes)" 78 | } 79 | -------------------------------------------------------------------------------- /i18n/ia.json: -------------------------------------------------------------------------------- 1 | { 2 | "@metadata": { 3 | "authors": [ 4 | "McDutchie" 5 | ] 6 | }, 7 | "title": "WikimediaOCR", 8 | "subtitle": "Transcriber texto de 
imagines", 9 | "form-heading": "Transcriber un imagine", 10 | "image-url": "URL del imagine", 11 | "image-url-help": "Insere un URL de imagine albergate sur un servitor de Wikimedia como: $1", 12 | "image-url-error": "Le adresse URL del imagine debe comenciar con {{PLURAL:$1|le sequente nomine|un del sequente nomines}} de dominio e terminar in un extension de nomine de file valide: $2", 13 | "image-alt-text": "Le imagine original", 14 | "language-code": "Linguas (optional)", 15 | "engine": "Motor OCR", 16 | "engine-name-transkribus": "OCR de Transkribus", 17 | "engine-not-found-warning": "Le motor '$1' requestate non ha essite trovate. Le motor predefinite '$2' es usate in su loco.", 18 | "engine-invalid-langs-warning": "Le sequente linguas non es valide o non es supportate per le motor e ha essite ignorate: $1", 19 | "submit": "Transcriber tote le pagina", 20 | "submit-crop": "Transcriber un area", 21 | "drag-help": "Selige le utensile de taliar e trahe un rectangulo sur le imagine hic infra pro transcriber solmente un area del pagina.", 22 | "drag-mode-move": "Traher displaciara le imagine", 23 | "drag-mode-move-alt": "Icone representante le action 'displaciar'.", 24 | "drag-mode-crop": "Traher creara un nove area de retalio", 25 | "drag-mode-crop-alt": "Icone representante le action 'taliar'.", 26 | "copy-to-clipboard": "Copiar al area de transferentia", 27 | "copied-to-clipboard": "Copiate!", 28 | "google-error": "Le servicio de Google ha restituite un error: $1", 29 | "image-retrieval-failed": "Recuperation de imagine fallite: $1", 30 | "documentation": "Documentation", 31 | "api-tooltip": "Vider le documentation del API", 32 | "version": "Version $1", 33 | "report-issue": "Reportar un problema", 34 | "langs-placeholder": "Lassar vacue pro detection automatic de lingua.", 35 | "langs-param-error": "Le sequente lingua{{PLURAL:$1||s}} non es supportate per le motor OCR: $2", 36 | "loading-message": "Transcription in curso…", 37 | "tesseract-options": "Optiones 
de Tesseract", 38 | "tesseract-psm-label": "Methodo de segmentation de pagina", 39 | "tesseract-psm-help": "Essaya \"Texto sparse\" pro un melior gestion de plure columnas.", 40 | "tesseract-psm-0": "Detection de orientation e de scriptura (OSD) solmente.", 41 | "tesseract-psm-1": "Segmentation automatic de paginas con OSD.", 42 | "tesseract-psm-2": "Segmentation automatic de paginas, ma sin OSD e sin OCR. (non implementate)", 43 | "tesseract-psm-3": "Segmentation de paginas totalmente automatic, ma sin OSD. (Option predefinite)", 44 | "tesseract-psm-4": "Assumer un sol columna de texto de varie grandores.", 45 | "tesseract-psm-5": "Assumer un sol bloco uniforme de texto alineate verticalmente.", 46 | "tesseract-psm-6": "Assumer un sol bloco uniforme de texto.", 47 | "tesseract-psm-7": "Tractar le imagine como un sol linea de texto.", 48 | "tesseract-psm-8": "Tractar le imagine como un sol parola.", 49 | "tesseract-psm-9": "Tractar le imagine como un sol parola in un circulo.", 50 | "tesseract-psm-10": "Tractar le imagine como un sol character.", 51 | "tesseract-psm-11": "Texto sparse. Trovar tante texto como possibile sin ordine particular.", 52 | "tesseract-psm-12": "Texto sparse con OSD.", 53 | "tesseract-psm-13": "Linea brute. Tractar le imagine como un sol linea de texto, contornante le trucos que es specific a Tesseract.", 54 | "tesseract-param-error": "Le option '$1' con un valor de $2 non es supportate per Tesseract. 
Valor maxime: $3", 55 | "tesseract-no-text-error": "Le motor Tesseract non ha restituite alcun texto pro iste imagine.", 56 | "tesseract-internal-error": "Le motor Tesseract indicava un error interne.", 57 | "transkribus-language-code": "Modello de lingua", 58 | "transkribus-unauthorized-error": "Codice de error '$1' :: Le requesta non es autorisate", 59 | "transkribus-default-error": "Codice de error '$1' :: Non pote terminar le requesta, tenta lo de novo!", 60 | "transkribus-empty-response-error": "Non poteva interpretar le resultato del API de Transkribus", 61 | "transkribus-init-process-error": "Non poteva initialisar le processo Transkribus", 62 | "transkribus-failed-process-error": "Processo Transkribus fallite", 63 | "transkribus-no-lang-error": "Necun lingua ha essite seligite", 64 | "transkribus-multiple-lang-error": "Plure linguas non es permittite; specifica solo un lingua", 65 | "transkribus-browse-public-models": "Percurrer tote le modellos public de lingua pro Transkribus", 66 | "transkribus-request-for-model": "Facer un demanda pro adder un modello de Transkribus al utensile OCR", 67 | "transkribus-options": "Optiones de Transkribus", 68 | "transkribus-line-label": "Modello de detection de linea", 69 | "transkribus-line-id-none-option": "Necun", 70 | "transkribus-mixed-line-option": "Orientation de linea mixte", 71 | "transkribus-line-help": "Lassa vacue si tu non es secur de qual modello de detection de linea utilisar", 72 | "transkribus-jobs": "Travalios de Transkribus", 73 | "transkribus-job-id": "ID del travalio", 74 | "transkribus-job-state": "Stato", 75 | "transkribus-job-description": "Description", 76 | "transkribus-job-start": "Initiate", 77 | "transkribus-job-end": "Finite", 78 | "transkribus-job-waited": "Demora de initio (minutas)" 79 | } 80 | -------------------------------------------------------------------------------- /i18n/mk.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"@metadata": { 3 | "authors": [ 4 | "Bjankuloski06" 5 | ] 6 | }, 7 | "title": "ОПЗ на Викимедија", 8 | "subtitle": "Препис на текст од слики", 9 | "form-heading": "Препис на слика", 10 | "image-url": "URL на сликата", 11 | "image-url-help": "Ова мора да почнува со едно од следниве доменски имиња и да биде целосна URL кон фактичката сликовна податотека: $1", 12 | "image-url-error": "URL-то на сликата мора да почнува со {{PLURAL:$1|следново доменско име|едно од следниве доменски имиња}} и да завршува со важечка податотечна наставка: $2", 13 | "image-alt-text": "Изворната слика.", 14 | "language-code": "Јазици (незадолж.)", 15 | "engine": "ОПЗ-погон", 16 | "engine-name-google": "ОПЗ на Google Cloud Vision", 17 | "engine-name-tesseract": "ОПЗ на Tesseract", 18 | "engine-name-transkribus": "Transkribus OCR", 19 | "engine-not-found-warning": "Не го најдов побараниот погон „$1“. Ќе го користам основно зададениот погон „$2“.", 20 | "engine-invalid-langs-warning": "Следниве јазици се неважечки или не се поддржани од погонот, па затоа се занемаруваат: $1", 21 | "submit": "Препиши ја целата страница", 22 | "submit-crop": "Препиши го подрачјето", 23 | "drag-help": "Изберете ја алатката за кастрење и повлечете правоаголник врз долуприкажаната слика за да се препише само тој дел од неа.", 24 | "drag-mode-move": "Повлекувањето ќе ја помести сликата", 25 | "drag-mode-move-alt": "Икона за дејството „поместување“.", 26 | "drag-mode-crop": "Повлекувањето ќе направи ново скастрено подрачје", 27 | "drag-mode-crop-alt": "Икона за дејството „кастрење“.", 28 | "copy-to-clipboard": "Копирај во меѓускладот", 29 | "copied-to-clipboard": "Ископирано!", 30 | "google-error": "Службата на Гугл даде грешка: $1", 31 | "image-retrieval-failed": "Не успеав да ја добијам сликата: $1", 32 | "documentation": "Документација", 33 | "api": "Приложник", 34 | "api-tooltip": "Погл. 
документација на приложникот", 35 | "version": "Верзија $1", 36 | "report-issue": "Пријави проблем", 37 | "langs-placeholder": "Оставете празно за автоматско утврдување на јазикот.", 38 | "langs-param-error": "{{PLURAL:$1|Следниов јазик не е поддржан|Следниве јазици не се поддржани}} од погонот на OCR: $2", 39 | "loading-message": "Вршам препис...", 40 | "tesseract-options": "Можности на Tesseract", 41 | "tesseract-psm-label": "Начин на разделување на страницата", 42 | "tesseract-psm-help": "Пробајте „Редок текст“ за подобра повеќестолбна поддршка.", 43 | "tesseract-psm-0": "Само препознавање на насока и писмо (ПНП).", 44 | "tesseract-psm-1": "Автоматско разделување на страницата со ПНП.", 45 | "tesseract-psm-2": "Автоматско разделување на страницата, но без ПНП или ОПЗ. (не е спроведено)", 46 | "tesseract-psm-3": "Наполно автоматско разделување на страницата, но без ПНП. (По основно)", 47 | "tesseract-psm-4": "Претпостави единечен столб од текст со променливи големини.", 48 | "tesseract-psm-5": "Претпостави единечен еднообразен блок на вертикално подреден текст.", 49 | "tesseract-psm-6": "Претпостави единечен еднообразен блок од текст.", 50 | "tesseract-psm-7": "Сметај ја сликата за еден ред текст.", 51 | "tesseract-psm-8": "Сметај ја сликата за еден збор.", 52 | "tesseract-psm-9": "Сметај ја сликата за еден збор во кружница.", 53 | "tesseract-psm-10": "Сметај ја сликата за еден знак.", 54 | "tesseract-psm-11": "Разреден текст. Најди што повеќе текст по било кој редослед.", 55 | "tesseract-psm-12": "Разреден текст со ПНП.", 56 | "tesseract-psm-13": "Сиров ред. Сметај ја сликата за еден ред текст, заобиколувајќи ги можностите особени за Tesseract.", 57 | "tesseract-param-error": "Можноста „$1“ со вредност $2 не е поддржана од Tesseract. 
Најголема допуштена вредност: $3", 58 | "tesseract-no-text-error": "Погонот Tesseract не даде никаков текст за оваа слика.", 59 | "tesseract-internal-error": "Погонот Tesseract даде внатрешна грешка.", 60 | "transkribus-language-code": "Јазичен модел", 61 | "transkribus-unauthorized-error": "Грешка „$1“ :: Барањето не е овластено", 62 | "transkribus-default-error": "Грешка „$1“ :: Не можам да го исполнам барањето. Пробајте повторно!", 63 | "transkribus-empty-response-error": "Не можев да го расчленам исходот од приложникот на Transkribus", 64 | "transkribus-init-process-error": "Не можев да ја покренам постапката на Transkribus", 65 | "transkribus-failed-process-error": "Постапката на Transkribus не успеа", 66 | "transkribus-no-lang-error": "Немате избрано јазик", 67 | "transkribus-multiple-lang-error": "Не се дозволени повеќе јазици. Укажете еден", 68 | "transkribus-browse-public-models": "Прелистајте ги сите јавни јазични модели за Transkribus", 69 | "transkribus-request-for-model": "Поднесете барање за додавање на модел од Transkribus кон алатката за оптичко препознавање", 70 | "transkribus-options": "Можности за Transkribus", 71 | "transkribus-line-label": "Модел за откривање редови", 72 | "transkribus-line-id-none-option": "Нема", 73 | "transkribus-mixed-line-option": "Мешана насоченост на редовите", 74 | "transkribus-line-help": "Оставете го празно ако не сте сигурни кој модел за откривање редови треба да се користи", 75 | "transkribus-jobs": "Задачи на Transkribus", 76 | "transkribus-job-id": "Назнака на задачата", 77 | "transkribus-job-state": "Состојба", 78 | "transkribus-job-description": "Опис", 79 | "transkribus-job-start": "Започнато", 80 | "transkribus-job-end": "Завршено", 81 | "transkribus-job-waited": "Одложување на почетокот (минути)" 82 | } 83 | --------------------------------------------------------------------------------