├── .github
    ├── FUNDING.yml
    ├── release-drafter.yml
    └── workflows
    │   ├── build-and-publish-docs.yaml
    │   ├── build-and-publish.yml
    │   ├── create-release.yml
    │   └── lint-and-test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── README.md
├── README_EN.md
├── configs
    ├── yomitoku-layout-parser-rtdtrv2-open-beta.yaml
    ├── yomitoku-table-structure-recognizer-rtdtrv2-open-beta.yaml
    ├── yomitoku-text-detector-dbnet-open-beta.yaml
    ├── yomitoku-text-recognizer-parseq-open-beta.yaml
    └── yomitoku-text-recognizer-parseq-small-open-beta.yaml
├── demo
    ├── sample.pdf
    ├── setting_document_anaysis.py
    ├── simple_document_analysis.py
    ├── simple_layout.py
    ├── simple_ocr.py
    └── text_detector.yaml
├── dockerfile
├── docs
    ├── assets
    │   └── logo.svg
    ├── cli.en.md
    ├── cli.ja.md
    ├── configuration.en.md
    ├── configuration.ja.md
    ├── index.en.md
    ├── index.ja.md
    ├── installation.en.md
    ├── installation.ja.md
    ├── mcp.en.md
    ├── mcp.ja.md
    ├── module.en.md
    └── module.ja.md
├── gallery.md
├── mkdocs.yml
├── pyproject.toml
├── pytest.ini
├── scripts
    └── register_hugging_face_hub.py
├── src
    └── yomitoku
    │   ├── __init__.py
    │   ├── base.py
    │   ├── cli
    │       ├── __init__.py
    │       ├── main.py
    │       └── mcp_server.py
    │   ├── configs
    │       ├── __init__.py
    │       ├── cfg_layout_parser_rtdtrv2.py
    │       ├── cfg_layout_parser_rtdtrv2_v2.py
    │       ├── cfg_table_structure_recognizer_rtdtrv2.py
    │       ├── cfg_text_detector_dbnet.py
    │       ├── cfg_text_detector_dbnet_v2.py
    │       ├── cfg_text_recognizer_parseq.py
    │       ├── cfg_text_recognizer_parseq_small.py
    │       └── cfg_text_recognizer_parseq_v2.py
    │   ├── constants.py
    │   ├── data
    │       ├── __init__.py
    │       ├── dataset.py
    │       └── functions.py
    │   ├── document_analyzer.py
    │   ├── export
    │       ├── __init__.py
    │       ├── export_csv.py
    │       ├── export_html.py
    │       ├── export_json.py
    │       └── export_markdown.py
    │   ├── layout_analyzer.py
    │   ├── layout_parser.py
    │   ├── models
    │       ├── __init__.py
    │       ├── dbnet_plus.py
    │       ├── layers
    │       │   ├── __init__.py
    │       │   ├── activate.py
    │       │   ├── dbnet_feature_attention.py
    │       │   ├── parseq_transformer.py
    │       │   ├── rtdetr_backbone.py
    │       │   ├── rtdetr_hybrid_encoder.py
    │       │   └── rtdetrv2_decoder.py
    │       ├── parseq.py
    │       └── rtdetr.py
    │   ├── ocr.py
    │   ├── onnx
    │       └── .gitkeep
    │   ├── postprocessor
    │       ├── __init__.py
    │       ├── dbnet_postporcessor.py
    │       ├── parseq_tokenizer.py
    │       └── rtdetr_postprocessor.py
    │   ├── reading_order.py
    │   ├── resource
    │       ├── MPLUS1p-Medium.ttf
    │       └── charset.txt
    │   ├── table_structure_recognizer.py
    │   ├── text_detector.py
    │   ├── text_recognizer.py
    │   └── utils
    │       ├── __init__.py
    │       ├── graph.py
    │       ├── logger.py
    │       ├── misc.py
    │       ├── searchable_pdf.py
    │       └── visualizer.py
├── static
    ├── in
    │   ├── demo.jpg
    │   ├── gallery1.jpg
    │   ├── gallery2.jpg
    │   ├── gallery3.jpg
    │   ├── gallery4.jpg
    │   ├── gallery5.jpg
    │   ├── gallery6.jpg
    │   └── gallery7.jpeg
    ├── logo
    │   └── horizontal.png
    └── out
    │   ├── demo_html.png
    │   ├── figures
    │       ├── in_demo_p1_figure_0.png
    │       ├── in_gallery1_p1_figure_0.png
    │       ├── in_gallery1_p1_figure_1.png
    │       ├── in_gallery1_p1_figure_10.png
    │       ├── in_gallery1_p1_figure_2.png
    │       ├── in_gallery1_p1_figure_3.png
    │       ├── in_gallery1_p1_figure_4.png
    │       ├── in_gallery1_p1_figure_5.png
    │       ├── in_gallery1_p1_figure_6.png
    │       ├── in_gallery1_p1_figure_7.png
    │       ├── in_gallery1_p1_figure_8.png
    │       ├── in_gallery1_p1_figure_9.png
    │       ├── in_gallery3_p1_figure_0.png
    │       ├── in_gallery3_p1_figure_1.png
    │       ├── in_gallery5_p1_figure_0.png
    │       ├── in_gallery5_p1_figure_1.png
    │       ├── in_gallery6_p1_figure_0.png
    │       ├── in_gallery6_p1_figure_1.png
    │       └── in_gallery7_p1_figure_0.png
    │   ├── in_demo_p1.html
    │   ├── in_demo_p1.md
    │   ├── in_demo_p1_layout.jpg
    │   ├── in_demo_p1_ocr.jpg
    │   ├── in_gallery1_p1.html
    │   ├── in_gallery1_p1.md
    │   ├── in_gallery1_p1_layout.jpg
    │   ├── in_gallery1_p1_ocr.jpg
    │   ├── in_gallery2_p1.html
    │   ├── in_gallery2_p1.md
    │   ├── in_gallery2_p1_layout.jpg
    │   ├── in_gallery2_p1_ocr.jpg
    │   ├── in_gallery3_p1.html
    │   ├── in_gallery3_p1.md
    │   ├── in_gallery3_p1_layout.jpg
    │   ├── in_gallery3_p1_ocr.jpg
    │   ├── in_gallery4_p1.html
    │   ├── in_gallery4_p1.md
    │   ├── in_gallery4_p1_layout.jpg
    │   ├── in_gallery4_p1_ocr.jpg
    │   ├── in_gallery5_p1.html
    │   ├── in_gallery5_p1.md
    │   ├── in_gallery5_p1_layout.jpg
    │   ├── in_gallery5_p1_ocr.jpg
    │   ├── in_gallery6_p1.html
    │   ├── in_gallery6_p1.md
    │   ├── in_gallery6_p1_layout.jpg
    │   ├── in_gallery6_p1_ocr.jpg
    │   ├── in_gallery7_p1.html
    │   ├── in_gallery7_p1.md
    │   ├── in_gallery7_p1_layout.jpg
    │   └── in_gallery7_p1_ocr.jpg
├── tests
    ├── data
    │   ├── invalid.jpg
    │   ├── invalid.pdf
    │   ├── rgba.png
    │   ├── sampldoc.tif
    │   ├── small.jpg
    │   ├── subdir
    │   │   └── test.jpg
    │   ├── test.bmp
    │   ├── test.jpg
    │   ├── test.pdf
    │   ├── test.png
    │   ├── test.tiff
    │   ├── test.txt
    │   └── test_gray.jpg
    ├── test_base.py
    ├── test_cli.py
    ├── test_data.py
    ├── test_document_analyzer.py
    ├── test_export.py
    ├── test_layout_analyzer.py
    ├── test_ocr.py
    └── yaml
    │   ├── layout_parser.yaml
    │   ├── table_structure_recognizer.yaml
    │   ├── text_detector.yaml
    │   └── text_recognizer.yaml
└── uv.lock


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12 | polar: # Replace with a single Polar username
13 | buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
14 | thanks_dev: # Replace with a single thanks.dev username
15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
16 | 


--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 |   name-template: "v$RESOLVED_VERSION"
 3 |   tag-template: "v$RESOLVED_VERSION"
 4 |   categories: # categorize
 5 |     - title: "🚀 機能追加"
 6 |       labels:
 7 |         - "enhancement"
 8 |     - title: "🔧  リファクタ"
 9 |       labels:
10 |         - "refactoring"
11 |     - title: "🐛 バグ修正"
12 |       labels:
13 |         - "bug"
14 |     - title: "✅ テスト"
15 |       labels:
16 |         - "test"
17 |     - title: "📖 ドキュメント"
18 |       labels:
19 |         - "documentation"
20 |   change-template: "- $TITLE @$AUTHOR (#$NUMBER)"
21 |   change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
22 |   autolabeler: # auto add labels based on branches or titles
23 |     - label: "enhancement"
24 |       branch:
25 |         - '/feature\/.+/'
26 |         - '/feat\/.+/'
27 |     - label: "release"
28 |       branch:
29 |         - '/release\/.+/'
30 |     - label: "refactoring"
31 |       branch:
32 |         - '/refactor\/.+/'
33 |       title:
34 |         - "/refactor/i"
35 |     - label: "bug"
36 |       branch:
37 |         - '/fix\/.+/'
38 |         - '/bug\/.+/'
39 |       title:
40 |         - "/fix/i"
41 |         - "/bug/i"
42 |     - label: "test"
43 |       branch:
44 |         - '/test\/.+/'
45 |     - label: "documentation"
46 |       branch:
47 |         - '/doc\/.+/'
48 |       title:
49 |         - "/doc/i"
50 |   version-resolver: # resolve next version based on tags ($RESOLVED_VERSION)
51 |     major:
52 |       labels:
53 |         - "breaking"
54 |     minor:
55 |       labels:
56 |         - "enhancement"
57 |     default: patch
58 |   template: |
59 |     ## 変更
60 |   
61 |     $CHANGES
62 |   


--------------------------------------------------------------------------------
/.github/workflows/build-and-publish-docs.yaml:
--------------------------------------------------------------------------------
 1 | name: ci
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 | permissions:
 7 |   contents: write
 8 | jobs:
 9 |   deploy-docs:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v3
13 |       - uses: actions/setup-python@v4
14 |         with:
15 |           python-version: 3.x
16 |       - run: | 
17 |           pip install tox
18 |           tox -e docs


--------------------------------------------------------------------------------
/.github/workflows/build-and-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish to PyPI
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | jobs:
 8 |   publish_PyPI:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v2
12 |         with:
13 |           fetch-depth: 0
14 |           tags: true
15 |       - name: Set up Python 3.9
16 |         uses: actions/setup-python@v3
17 |         with:
18 |           python-version: "3.9"
19 |       - name: Install uv
20 |         uses: astral-sh/setup-uv@v3
21 |         with:
22 |           enable-cache: true
23 |       - name: Install dependencies
24 |         run: uv sync --dev
25 |       - name: build
26 |         run: uv build
27 |       - name: Publish to PyPI
28 |         run: uv publish --token ${{ secrets.PYPI }}
29 | 


--------------------------------------------------------------------------------
/.github/workflows/create-release.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Release Drafter
 3 | 
 4 | on:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 |   pull_request_target: 
 9 |     types: [opened, reopened, synchronize]
10 | 
11 | permissions:
12 |   contents: read
13 | 
14 | jobs:
15 |   update_release_draft:
16 |     permissions:
17 |       contents: write
18 |       pull-requests: write
19 |     runs-on: ubuntu-latest
20 |     steps:
21 |       - uses: release-drafter/release-drafter@v6
22 |         env:
23 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
24 | 


--------------------------------------------------------------------------------
/.github/workflows/lint-and-test.yml:
--------------------------------------------------------------------------------
 1 | name: Python Lint
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches:
 6 |       - main
 7 | 
 8 | permissions:
 9 |   contents: read
10 | 
11 | jobs:
12 |   lint-and-test:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - name: Cleanup disk space
17 |         run: |
18 |           sudo apt-get clean
19 |           sudo rm -rf /usr/share/dotnet
20 |           sudo rm -rf /usr/local/lib/android
21 |           sudo rm -rf /opt/ghc
22 |           df -h
23 |       - name: Set up Python 3.12
24 |         uses: actions/setup-python@v5
25 |         with:
26 |           python-version: "3.12"
27 |       - name: Install uv
28 |         uses: astral-sh/setup-uv@v3
29 |         with:
30 |           enable-cache: true
31 |       - name: pin python version
32 |         run: uv python pin 3.12
33 |       - name: Update apt
34 |         run: sudo apt update
35 |       - name: Install dependencies
36 |         run: sudo apt install -y libopencv-dev poppler-utils
37 |       - name: Install tox-uv
38 |         run: uv tool install tox --with tox-uv
39 |       - name: Run linter
40 |         run: tox -e lint
41 |       - name: Run tests
42 |         run: tox -p -e py310,py311,py312
43 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python-generated files
 2 | __pycache__/
 3 | *.py[oc]
 4 | build/
 5 | dist/
 6 | wheels/
 7 | *.egg-info
 8 | 
 9 | # Virtual environments
10 | .venv
11 | .tox
12 | 
13 | # Dev tools cache
14 | .ruff_cache
15 | .pytest_cache
16 | 
17 | .DS_Store
18 | 
19 | dataset/
20 | weights/
21 | results/
22 | 
23 | .coverage*
24 | 
25 | *.onnx


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: local
 3 |     hooks:
 4 |       - id: lint
 5 |         name: lint
 6 |         entry: bash -c '.tox/lint/bin/ruff check --fix'
 7 |         language: system
 8 |         types: [python]
 9 |       - id: format
10 |         name: format
11 |         entry: bash -c '.tox/lint/bin/ruff format'
12 |         language: system
13 |         types: [python]


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 日本語版 | [English](README_EN.md)
 2 | 
 3 | <img src="static/logo/horizontal.png" width="800px">
 4 | 
 5 | ![Python](https://img.shields.io/badge/Python-3.10|3.11|3.12-F9DC3E.svg?logo=python&logoColor=&style=flat)
 6 | ![Pytorch](https://img.shields.io/badge/Pytorch-2.5-EE4C2C.svg?logo=Pytorch&style=flat)
 7 | ![CUDA](https://img.shields.io/badge/CUDA->=11.8-76B900.svg?logo=NVIDIA&style=flat)
 8 | ![OS](https://img.shields.io/badge/OS-Linux|Mac|Win-1793D1.svg?&style=flat)
 9 | [![Document](https://img.shields.io/badge/docs-live-brightgreen)](https://kotaro-kinoshita.github.io/yomitoku/)
10 | [![PyPI Downloads](https://static.pepy.tech/badge/yomitoku)](https://pepy.tech/projects/yomitoku)
11 | 
12 | ## 🌟 概要
13 | 
14 | YomiToku は日本語に特化した AI 文章画像解析エンジン(Document AI)です。画像内の文字の全文 OCR およびレイアウト解析機能を有しており、画像内の文字情報や図表を認識、抽出、変換します。
15 | 
16 | - 🤖 日本語データセットで学習した 4 種類(文字位置の検知、文字列認識、レイアウト解析、表の構造認識)の AI モデルを搭載しています。4 種類のモデルはすべて独自に学習されたモデルで日本語文書に対して、高精度に推論可能です。
17 | - 🇯🇵 各モデルは日本語の文書画像に特化して学習されており、7000 文字を超える日本語文字の認識をサポート、手書き文字、縦書きなど日本語特有のレイアウト構造の文書画像の解析も可能です。（日本語以外にも英語の文書に対しても対応しています）。
18 | - 📈 レイアウト解析、表の構造解析, 読み順推定機能により、文書画像のレイアウトの意味的構造を壊さずに情報を抽出することが可能です。
19 | - 📄 多様な出力形式をサポートしています。html やマークダウン、json、csv のいずれかのフォーマットに変換可能です。また、文書内に含まれる図表、画像の抽出の出力も可能です。文書画像をサーチャブルPDFに変換する処理もサポートしています。
20 | - ⚡ GPU 環境で高速に動作し、効率的に文書の文字起こし解析が可能です。また、VRAM も 8GB 以内で動作し、ハイエンドな GPU を用意する必要はありません。
21 | 
22 | ## 🖼️ デモ
23 | 
24 | [gallery.md](gallery.md)にも複数種類の画像の検証結果を掲載しています。
25 | 
26 | |                          入力画像                          |                       OCR の結果                        |
27 | | :--------------------------------------------------------: | :-----------------------------------------------------: |
28 | |        <img src="static/in/demo.jpg" width="400px">        | <img src="static/out/in_demo_p1_ocr.jpg" width="400px"> |
29 | |                    レイアウト解析の結果                    |     エクスポート<br>(HTML で出力したものをスクショ)     |
30 | | <img src="static/out/in_demo_p1_layout.jpg" width="400px"> |   <img src="static/out/demo_html.png" width="400px">    |
31 | 
32 | Markdown でエクスポートした結果に関してはリポジトリ内の[static/out/in_demo_p1.md](static/out/in_demo_p1.md)を参照
33 | 
34 | - `赤枠` : 図、画像等の位置
35 | - `緑枠` : 表領域全体の位置
36 | - `ピンク枠` : 表のセル構造(セル上の文字は [行番号, 列番号] (rowspan x colspan)を表します)
37 | - `青枠` : 段落、テキストグループ領域
38 | - `赤矢印` : 読み順推定の結果
39 | 
40 | 画像の出典:[「令和 6 年版情報通信白書 3 章 2 節 AI の進化に伴い発展するテクノロジー」](https://www.soumu.go.jp/johotsusintokei/whitepaper/ja/r06/pdf/n1410000.pdf)：（総務省） を加工して作成
41 | 
42 | ## 📣 リリース情報
43 | 
44 | - 2025 年  4 月  4 日 YomiToku v0.8.0 手書き文字認識のサポート
45 | - 2024 年 11 月 26 日 YomiToku v0.5.1 (beta) を公開
46 | 
47 | ## 💡 インストールの方法
48 | 
49 | ```
50 | pip install yomitoku
51 | ```
52 | 
53 | - pytorch はご自身の CUDA のバージョンにあったものをインストールしてください。デフォルトでは CUDA12.4 以上に対応したものがインストールされます。
54 | - pytorch は 2.5 以上のバージョンに対応しています。その関係で CUDA11.8 以上のバージョンが必要になります。対応できない場合は、リポジトリ内の Dockerfile を利用してください。
55 | 
56 | ## 🚀 実行方法
57 | 
58 | ```
59 | yomitoku ${path_data} -f md -o results -v --figure --lite
60 | ```
61 | 
62 | - `${path_data}` 解析対象の画像が含まれたディレクトリか画像ファイルのパスを直接指定してください。ディレクトリを対象とした場合はディレクトリのサブディレクトリ内の画像も含めて処理を実行します。
63 | - `-f`, `--format` 出力形式のファイルフォーマットを指定します。(json, csv, html, md, pdf(searchable-pdf) をサポート)
64 | - `-o`, `--outdir` 出力先のディレクトリ名を指定します。存在しない場合は新規で作成されます。
65 | - `-v`, `--vis` を指定すると解析結果を可視化した画像を出力します。
66 | - `-l`, `--lite` を指定すると軽量モデルで推論を実行します。通常より高速に推論できますが、若干、精度が低下する可能性があります。
67 | - `-d`, `--device` モデルを実行するためのデバイスを指定します。gpu が利用できない場合は cpu で推論が実行されます。(デフォルト: cuda)
68 | - `--ignore_line_break` 画像の改行位置を無視して、段落内の文章を連結して返します。（デフォルト：画像通りの改行位置で改行します。）
69 | - `--figure_letter` 検出した図表に含まれる文字も出力ファイルにエクスポートします。
70 | - `--figure` 検出した図、画像を出力ファイルにエクスポートします。
71 | - `--encoding` エクスポートする出力ファイルの文字エンコーディングを指定します。サポートされていない文字コードが含まれる場合は、その文字を無視します。(utf-8, utf-8-sig, shift-jis, euc-jp, cp932)
72 | - `--combine` PDFを入力に与えたときに、複数ページが含まれる場合に、それらの予測結果を一つのファイルに統合してエクスポートします。
73 | - `--ignore_meta` 文章のheader, footerなどの文字情報を出力ファイルに含めません。
74 | 
75 | その他のオプションに関しては、ヘルプを参照
76 | 
77 | ```
78 | yomitoku --help
79 | ```
80 | 
81 | **NOTE**
82 | 
83 | - GPU での実行を推奨します。CPU を用いての推論向けに最適化されておらず、処理時間が長くなります。
84 | - Yomitoku は文書 OCR 向けに最適化されており、情景 OCR(看板など紙以外にプリントされた文字の読み取り)向けには最適化されていません。
85 | - AI-OCR の識別精度を高めるために、入力画像の解像度が重要です。低解像度画像では識別精度が低下します。最低でも画像の短辺を 720px 以上の画像で推論することをお勧めします。
86 | 
87 | ## 📝 ドキュメント
88 | 
89 | パッケージの詳細は[ドキュメント](https://kotaro-kinoshita.github.io/yomitoku/)を確認してください。
90 | 
91 | ## LICENSE
92 | 
93 | 本リポジトリ内に格納されているソースコードおよび本プロジェクトに関連する HuggingFaceHub 上のモデルの重みファイルのライセンスは CC BY-NC-SA 4.0 に従います。
94 | 非商用での個人利用、研究目的での利用はご自由にお使いください。
95 | 商用目的での利用に関しては、別途、商用ライセンスを提供しますので、https://www.mlism.com/ にお問い合わせください。
96 | 
97 | YomiToku © 2024 by Kotaro Kinoshita is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
98 | 


--------------------------------------------------------------------------------
/configs/yomitoku-layout-parser-rtdtrv2-open-beta.yaml:
--------------------------------------------------------------------------------
 1 | hf_hub_repo: KotaroKinoshita/yomitoku-layout-parser-rtdtrv2-open-beta
 2 | thresh_score: 0.5
 3 | data:
 4 |   img_size:
 5 |   - 640
 6 |   - 640
 7 | PResNet:
 8 |   depth: 50
 9 |   variant: d
10 |   freeze_at: 0
11 |   return_idx:
12 |   - 1
13 |   - 2
14 |   - 3
15 |   num_stages: 4
16 |   freeze_norm: true
17 | HybridEncoder:
18 |   in_channels:
19 |   - 512
20 |   - 1024
21 |   - 2048
22 |   feat_strides:
23 |   - 8
24 |   - 16
25 |   - 32
26 |   hidden_dim: 256
27 |   use_encoder_idx:
28 |   - 2
29 |   num_encoder_layers: 1
30 |   nhead: 8
31 |   dim_feedforward: 1024
32 |   dropout: 0.0
33 |   enc_act: gelu
34 |   expansion: 1.0
35 |   depth_mult: 1
36 |   act: silu
37 | RTDETRTransformerv2:
38 |   num_classes: 6
39 |   feat_channels:
40 |   - 256
41 |   - 256
42 |   - 256
43 |   feat_strides:
44 |   - 8
45 |   - 16
46 |   - 32
47 |   hidden_dim: 256
48 |   num_levels: 3
49 |   num_layers: 6
50 |   num_queries: 300
51 |   num_denoising: 100
52 |   label_noise_ratio: 0.5
53 |   box_noise_scale: 1.0
54 |   eval_spatial_size:
55 |   - 640
56 |   - 640
57 |   eval_idx: -1
58 |   num_points:
59 |   - 4
60 |   - 4
61 |   - 4
62 |   cross_attn_method: default
63 |   query_select_method: default
64 | category:
65 | - tables
66 | - figures
67 | - paragraphs
68 | - section_headings
69 | - page_header
70 | - page_footer
71 | role:
72 | - section_headings
73 | - page_header
74 | - page_footer
75 | 


--------------------------------------------------------------------------------
/configs/yomitoku-table-structure-recognizer-rtdtrv2-open-beta.yaml:
--------------------------------------------------------------------------------
 1 | hf_hub_repo: KotaroKinoshita/yomitoku-table-structure-recognizer-rtdtrv2-open-beta
 2 | thresh_score: 0.4
 3 | data:
 4 |   img_size:
 5 |   - 640
 6 |   - 640
 7 | PResNet:
 8 |   depth: 50
 9 |   variant: d
10 |   freeze_at: 0
11 |   return_idx:
12 |   - 1
13 |   - 2
14 |   - 3
15 |   num_stages: 4
16 |   freeze_norm: true
17 | HybridEncoder:
18 |   in_channels:
19 |   - 512
20 |   - 1024
21 |   - 2048
22 |   feat_strides:
23 |   - 8
24 |   - 16
25 |   - 32
26 |   hidden_dim: 256
27 |   use_encoder_idx:
28 |   - 2
29 |   num_encoder_layers: 1
30 |   nhead: 8
31 |   dim_feedforward: 1024
32 |   dropout: 0.0
33 |   enc_act: gelu
34 |   expansion: 1.0
35 |   depth_mult: 1
36 |   act: silu
37 | RTDETRTransformerv2:
38 |   num_classes: 3
39 |   feat_channels:
40 |   - 256
41 |   - 256
42 |   - 256
43 |   feat_strides:
44 |   - 8
45 |   - 16
46 |   - 32
47 |   hidden_dim: 256
48 |   num_levels: 3
49 |   num_layers: 6
50 |   num_queries: 300
51 |   num_denoising: 100
52 |   label_noise_ratio: 0.5
53 |   box_noise_scale: 1.0
54 |   eval_spatial_size:
55 |   - 640
56 |   - 640
57 |   eval_idx: -1
58 |   num_points:
59 |   - 4
60 |   - 4
61 |   - 4
62 |   cross_attn_method: default
63 |   query_select_method: default
64 | category:
65 | - row
66 | - col
67 | - span


--------------------------------------------------------------------------------
/configs/yomitoku-text-detector-dbnet-open-beta.yaml:
--------------------------------------------------------------------------------
 1 | hf_hub_repo: KotaroKinoshita/yomitoku-text-detector-dbnet-open-beta
 2 | backbone:
 3 |   name: resnet50
 4 |   dilation: true
 5 | decoder:
 6 |   in_channels:
 7 |   - 256
 8 |   - 512
 9 |   - 1024
10 |   - 2048
11 |   hidden_dim: 256
12 |   adaptive: true
13 |   serial: true
14 |   smooth: false
15 |   k: 50
16 | data:
17 |   shortest_size: 1280
18 |   limit_size: 1600
19 | post_process:
20 |   min_size: 2
21 |   thresh: 0.2
22 |   box_thresh: 0.5
23 |   max_candidates: 1500
24 |   unclip_ratio: 7.0
25 | visualize:
26 |   color:
27 |   - 0
28 |   - 255
29 |   - 0
30 |   heatmap: false


--------------------------------------------------------------------------------
/configs/yomitoku-text-recognizer-parseq-open-beta.yaml:
--------------------------------------------------------------------------------
 1 | hf_hub_repo: KotaroKinoshita/yomitoku-text-recognizer-parseq-open-beta
 2 | charset: /home/kinoshita/Projects/know-how/yomitoku/src/yomitoku/resource/charset.txt
 3 | num_tokens: 7312
 4 | max_label_length: 100
 5 | decode_ar: 1
 6 | refine_iters: 1
 7 | data:
 8 |   num_workers: 4
 9 |   batch_size: 128
10 |   img_size:
11 |   - 32
12 |   - 800
13 | encoder:
14 |   patch_size:
15 |   - 8
16 |   - 8
17 |   num_heads: 8
18 |   embed_dim: 512
19 |   mlp_ratio: 4
20 |   depth: 12
21 | decoder:
22 |   embed_dim: 512
23 |   num_heads: 8
24 |   mlp_ratio: 4
25 |   depth: 1
26 | visualize:
27 |   font: src/yomitoku/resource/MPLUS1p-Medium.ttf
28 |   color:
29 |   - 0
30 |   - 0
31 |   - 255
32 |   font_size: 18
33 | 


--------------------------------------------------------------------------------
/configs/yomitoku-text-recognizer-parseq-small-open-beta.yaml:
--------------------------------------------------------------------------------
 1 | hf_hub_repo: KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta
 2 | charset: /home/kinoshita/Projects/know-how/yomitoku/src/yomitoku/resource/charset.txt
 3 | num_tokens: 7312
 4 | max_label_length: 100
 5 | decode_ar: 1
 6 | refine_iters: 1
 7 | data:
 8 |   num_workers: 4
 9 |   batch_size: 128
10 |   img_size:
11 |   - 32
12 |   - 800
13 | encoder:
14 |   patch_size:
15 |   - 16
16 |   - 16
17 |   num_heads: 8
18 |   embed_dim: 384
19 |   mlp_ratio: 4
20 |   depth: 9
21 | decoder:
22 |   embed_dim: 384
23 |   num_heads: 8
24 |   mlp_ratio: 4
25 |   depth: 1
26 | visualize:
27 |   font: src/yomitoku/resource/MPLUS1p-Medium.ttf
28 |   color:
29 |   - 0
30 |   - 0
31 |   - 255
32 |   font_size: 18
33 | 


--------------------------------------------------------------------------------
/demo/sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/demo/sample.pdf


--------------------------------------------------------------------------------
/demo/setting_document_anaysis.py:
--------------------------------------------------------------------------------
1 | from yomitoku import DocumentAnalyzer
2 | 
3 | if __name__ == "__main__":
4 |     configs = {"ocr": {"text_detector": {"path_cfg": "demo/text_detector.yaml"}}}
5 | 
6 |     analyzer = DocumentAnalyzer(configs=configs, visualize=True, device="cuda")
7 | 


--------------------------------------------------------------------------------
/demo/simple_document_analysis.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | 
 3 | from yomitoku import DocumentAnalyzer
 4 | from yomitoku.data.functions import load_pdf
 5 | 
 6 | if __name__ == "__main__":
 7 |     PATH_IMGE = "demo/sample.pdf"
 8 |     analyzer = DocumentAnalyzer(visualize=True, device="cuda")
 9 |     # PDFファイルを読み込み
10 |     imgs = load_pdf(PATH_IMGE)
11 |     for i, img in enumerate(imgs):
12 |         results, ocr_vis, layout_vis = analyzer(img)
13 |         # HTML形式で解析結果をエクスポート
14 |         results.to_html(f"output_{i}.html", img=img)
15 |         # 可視化画像を保存
16 |         cv2.imwrite(f"output_ocr_{i}.jpg", ocr_vis)
17 |         cv2.imwrite(f"output_layout_{i}.jpg", layout_vis)
18 | 


--------------------------------------------------------------------------------
/demo/simple_layout.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | 
 3 | from yomitoku import LayoutAnalyzer
 4 | from yomitoku.data.functions import load_pdf
 5 | 
 6 | if __name__ == "__main__":
 7 |     analyzer = LayoutAnalyzer(visualize=True, device="cuda")
 8 |     # PDFファイルを読み込み
 9 |     imgs = load_pdf("demo/sample.pdf")
10 |     for i, img in enumerate(imgs):
11 |         results, layout_vis = analyzer(img)
12 | 
13 |         # JSON形式で解析結果をエクスポート
14 |         results.to_json(f"output_{i}.json")
15 |         cv2.imwrite(f"output_layout_{i}.jpg", layout_vis)
16 | 


--------------------------------------------------------------------------------
/demo/simple_ocr.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | 
 3 | from yomitoku import OCR
 4 | from yomitoku.data.functions import load_pdf
 5 | 
 6 | if __name__ == "__main__":
 7 |     ocr = OCR(visualize=True, device="cuda")
 8 |     # PDFファイルを読み込み
 9 |     imgs = load_pdf("demo/sample.pdf")
10 |     import time
11 | 
12 |     start = time.time()
13 |     for i, img in enumerate(imgs):
14 |         results, ocr_vis = ocr(img)
15 | 
16 |         # JSON形式で解析結果をエクスポート
17 |         results.to_json(f"output_{i}.json")
18 |         cv2.imwrite(f"output_ocr_{i}.jpg", ocr_vis)
19 | 


--------------------------------------------------------------------------------
/demo/text_detector.yaml:
--------------------------------------------------------------------------------
1 | #hf_hub_repo: yomitoku-text-detector-dbnet-open-beta
2 | post_process:
3 |   thresh: 0.1
4 |   unclip_ratio: 2.5
5 | 


--------------------------------------------------------------------------------
/dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04
 2 | 
 3 | ENV TZ=Asia/Tokyo
 4 | ENV DEBIAN_FRONTEND=noninteractive
 5 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 6 | 
 7 | RUN apt -y update && apt -y upgrade
 8 | 
 9 | ARG PYTHON_VERSION=3.9
10 | ENV DEBIAN_FRONTEND=noninteractive
11 | 
12 | RUN apt install -y --no-install-recommends \
13 |     software-properties-common \
14 |     build-essential \
15 |     curl \
16 |     wget \
17 |     git \
18 |     ca-certificates \
19 |     poppler-utils \
20 |     libopencv-dev \
21 |     && add-apt-repository ppa:deadsnakes/ppa \
22 |     && apt update \
23 |     && apt install -y --no-install-recommends \
24 |     python${PYTHON_VERSION} \
25 |     python${PYTHON_VERSION}-dev \
26 |     python3-pip \
27 |     python3-venv \
28 |     && rm -rf /var/lib/apt/lists/*
29 | 
30 | RUN python${PYTHON_VERSION} --version
31 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1
32 | 
33 | RUN python -m pip install --upgrade pip
34 | 
35 | RUN pip install yomitoku
36 | 
37 | WORKDIR /workspace


--------------------------------------------------------------------------------
/docs/cli.en.md:
--------------------------------------------------------------------------------
  1 | # CLI Usage
  2 | 
  3 | The model weight files are downloaded from Hugging Face Hub only during the first execution.
  4 | 
  5 | ```
  6 | yomitoku ${path_data} -v -o results
  7 | ```
  8 | 
  9 | - `${path_data}`: Specify the path to a directory containing images to be analyzed or directly provide the path to an image file. If a directory is specified, images in its subdirectories will also be processed.
 10 | - `-f`, `--format`: Specify the output file format. Supported formats are json, csv, html, md, and pdf (searchable-pdf).
 11 | - `-o`, `--outdir`: Specify the name of the output directory. If it does not exist, it will be created.
 12 | - `-v`, `--vis`: If specified, outputs visualized images of the analysis results.
 13 | 
 14 | **NOTE**
 15 | 
 16 | - Only printed text recognition is supported. While it may occasionally read handwritten text, official support is not provided.
 17 | - YomiToku is optimized for document OCR and is not designed for scene OCR (e.g., text printed on non-paper surfaces like signs).
 18 | - The resolution of input images is critical for improving the accuracy of AI-OCR recognition. Low-resolution images may lead to reduced recognition accuracy. It is recommended to use images with a minimum short side resolution of 720px for inference.
 19 | 
 20 | ## Reference for Help
 21 | 
 22 | Display the options available for the CLI with `--help` or `-h`.
 23 | 
 24 | ```
 25 | yomitoku -h
 26 | ```
 27 | 
 28 | ## Running in Lightweight Mode
 29 | 
 30 | By using the --lite option, it is possible to perform inference with a lightweight model. This enables faster analysis compared to the standard mode. However, the accuracy of character recognition may decrease.
 31 | 
 32 | ```
 33 | yomitoku ${path_data} --lite -v
 34 | ```
 35 | 
 36 | ## Specifying Output Format
 37 | 
 38 | You can specify the output format of the analysis results using the --format or -f option. Supported output formats include JSON, CSV, HTML, MD (Markdown), and PDF (searchable PDF).
 39 | 
 40 | ```
 41 | yomitoku ${path_data} -f md
 42 | ```
 43 | 
 44 | - `pdf`: Detect the text in the image and embed it into the PDF as invisible text, converting the file into a searchable PDF.
 45 | 
 46 | ## Specifying the Output Device
 47 | 
 48 | You can specify the device for running the model using the -d or --device option. Supported options are cuda, cpu, and mps. If a GPU is not available, inference will be performed on the CPU. (Default: cuda)
 49 | 
 50 | ```
 51 | yomitoku ${path_data} -d cpu
 52 | ```
 53 | 
 54 | ## Ignoring Line Breaks
 55 | 
 56 | In the normal mode, line breaks are applied based on the information described in the image. By using the --ignore_line_break option, you can ignore the line break positions in the image and return the same sentence within a paragraph as a single connected output.
 57 | 
 58 | ```
 59 | yomitoku ${path_data} --ignore_line_break
 60 | ```
 61 | 
 62 | ## Outputting Figures and Graph Images
 63 | 
 64 | In the normal mode, information about figures or images contained in document images is not output. By using the --figure option, you can extract figures and images included in the document image, save them as separate image files, and include links to the detected individual images in the output file.
 65 | 
 66 | ```
 67 | yomitoku ${path_data} --figure
 68 | ```
 69 | 
 70 | ## Outputting Text Contained in Figures and Images
 71 | 
 72 | In normal mode, text information contained within figures or images is not included in the output file. By using the --figure_letter option, text information within figures and images will also be included in the output file.
 73 | 
 74 | ```
 75 | yomitoku ${path_data} --figure_letter
 76 | ```
 77 | 
 78 | ## Specifying the Character Encoding of the Output File
 79 | 
 80 | You can specify the character encoding of the output file using the `--encoding` option. Supported encodings include `utf-8`, `utf-8-sig`, `shift-jis`, `euc-jp`, and `cp932`. If unsupported characters are encountered, they will be ignored and not included in the output.
 81 | 
 82 | ```
 83 | yomitoku ${path_data} --encoding utf-8-sig
 84 | ```
 85 | 
 86 | ## Specifying the Path to Config Files
 87 | 
 88 | Specify the path to the config files for each module as follows:
 89 | 
 90 | - `--td_cfg`: Path to the YAML file containing the config for the Text Detector
 91 | - `--tr_cfg`: Path to the YAML file containing the config for the Text Recognizer
 92 | - `--lp_cfg`: Path to the YAML file containing the config for the Layout Parser
 93 | - `--tsr_cfg`: Path to the YAML file containing the config for the Table Structure Recognizer
 94 | 
 95 | ```
 96 | yomitoku ${path_data} --td_cfg ${path_yaml}
 97 | ```
 98 | 
 99 | ## Do not include metadata in the output file
100 | 
101 | You can exclude metadata such as headers and footers from the output file.
102 | ```
103 | yomitoku ${path_data} --ignore_meta
104 | ```
105 | 
106 | ## Combine multiple pages
107 | 
108 | If the PDF contains multiple pages, you can export them as a single file.
109 | 
110 | ```
111 | yomitoku ${path_data} -f md --combine
112 | ```
113 | 
114 | ## Specifying Reading Order
115 | 
116 | By default, *Auto* mode automatically detects whether a document is written horizontally or vertically and estimates the appropriate reading order. However, you can explicitly specify a custom reading order. In *Auto* mode, horizontal documents use `top2bottom`, and vertical documents use `right2left`.
117 | 
118 | ```
119 | yomitoku ${path_data} --reading_order left2right
120 | ```
121 | 
122 | * `top2bottom`: Prioritizes reading from top to bottom. Useful for multi-column documents such as word processor files with vertical flow.
123 | 
124 | * `left2right`: Prioritizes reading from left to right. Suitable for layouts like receipts or health insurance cards, where key-value text pairs are arranged in columns.
125 | 
126 | * `right2left`: Prioritizes reading from right to left. Effective for vertically written documents.


--------------------------------------------------------------------------------
/docs/cli.ja.md:
--------------------------------------------------------------------------------
  1 | # CLI Usage
  2 | 
  3 | 初回の実行時のみ、HuggingFaceHub からモデルの重みファイルをダウンロードします。
  4 | 以下のコマンドにて、文書画像の解析を実行します。
  5 | 
  6 | ```
  7 | yomitoku ${path_data} -v -o results
  8 | ```
  9 | 
 10 | - `${path_data}` 解析対象の画像が含まれたディレクトリか画像ファイルのパスを直接指定してください。ディレクトリを対象とした場合はディレクトリのサブディレクトリ内の画像も含めて処理を実行します。ファイル形式は pdf, jpeg, png, bmp, tiff をサポートしています。
 11 | - `-o`, `--outdir` 出力先のディレクトリ名を指定します。存在しない場合は新規で作成されます。
 12 | - `-v`, `--vis` を指定すると解析結果を可視化した画像を出力します。
 13 | 
 14 | **Note**:
 15 | 
 16 | - 活字のみの識別をサポートしております。手書き文字に関しては、読み取れる場合もありますが、公式にはサポートしておりません。
 17 | - OCR は文書 OCR と情景 OCR(看板など紙以外にプリントされた文字)に大別されますが、Yomitoku は文書 OCR 向けに最適化されています。
 18 | - AI-OCR の識別精度を高めるために、入力画像の解像度が重要です。低解像度画像では識別精度が低下します。画像の短辺を 1000px 以上の画像で推論することをお勧めします。
 19 | 
 20 | ## ヘルプの参照
 21 | 
 22 | `--help`, `-h`にて CLI に指定可能なオプションを表示します。
 23 | 
 24 | ## 軽量モードでの実行
 25 | 
 26 | `--lite`オプションを付与することで、軽量モデルを使用して、推論することが可能です。通常モードより高速に解析が実行可能です。ただし、文字の認識精度が低下する可能性があります。
 27 | 
 28 | ```
 29 | yomitoku ${path_data} --lite -v
 30 | ```
 31 | 
 32 | ## 出力フォーマットの指定
 33 | 
 34 | - `-f`, `--format` 出力形式のファイルフォーマットを指定します。(json, csv, html, md, pdf(searchable-pdf) をサポート)
 35 | 
 36 | ```
 37 | yomitoku ${path_data} -f md
 38 | ```
 39 | 
 40 | - pdf: 画像内の文字情報を認識し、文字情報を透明テキストとして、PDFに埋め込むことで、サーチャブルPDFに変換します。
 41 | 
 42 | ## 出力デバイスの指定
 43 | 
 44 | - `-d`, `--device` オプションを使用することで、モデルを実行するためのデバイスを指定します。(cuda | cpu | mps)。gpu が利用できない場合は cpu で推論が実行されます。(デフォルト: cuda)
 45 | 
 46 | ```
 47 | yomitoku ${path_data} -d cpu
 48 | ```
 49 | 
 50 | ## 改行の無視
 51 | 
 52 | 通常モードでは、画像内で記述された情報に従い、改行を行います。 `--ignore_line_break` オプションを使用することで、画像の改行位置を無視して、段落内の同一文章を連結して返すことが可能です。
 53 | 
 54 | ```
 55 | yomitoku ${path_data} --ignore_line_break
 56 | ```
 57 | 
 58 | ## 図やグラフ画像の出力
 59 | 
 60 | 通常モードでは、文書画像内の含まれる図や画像の情報を出力しません。`--figure`オプションを使用することで、文書画像に含まれる、図や画像を切り出し、個別の画像として保存、また、出力ファイル内に検出した個別の画像に対するリンクを出力します。
 61 | 
 62 | ```
 63 | yomitoku ${path_data} --figure
 64 | ```
 65 | 
 66 | ## 図や画像内に含まれる文字の出力
 67 | 
 68 | 通常モードでは、図や画像内に含まれる文字情報は出力ファイルに出力しません。 `--figure_letter` オプションを使用することで、画像や図に含まれる文字情報も出力ファイルに出力します。
 69 | 
 70 | ```
 71 | yomitoku ${path_data} --figure_letter
 72 | ```
 73 | 
 74 | ## 出力ファイルの文字コードの指定
 75 | 
 76 | 出力ファイルの文字コードを`--encoding`オプションにて指定できます。(utf-8, utf-8-sig, shift-jis, euc-jp, cp932)。サポートされていない文字コードが含まれる場合は、その文字を無視し、出力しません。
 77 | 
 78 | ```
 79 | yomitoku ${path_data} --encoding utf-8-sig
 80 | ```
 81 | 
 82 | ## コンフィグのパスの指定
 83 | 
 84 | 各モジュールに対する config ファイルのパスを指定します。
 85 | 
 86 | - `--td_cfg`: Text Detector に対する config が記述された yaml ファイルに対するパス
 87 | - `--tr_cfg`: Text Recognizer に対する config が記述された yaml ファイルに対するパス
 88 | - `--lp_cfg`: Layout Parser に対する config が記述された yaml ファイルに対するパス
 89 | - `--tsr_cfg`: Table Structure Recognizer に対する config が記述された yaml ファイルに対するパス
 90 | 
 91 | ```
 92 | yomitoku ${path_data} --td_cfg ${path_yaml}
 93 | ```
 94 | 
 95 | ## メタ情報を出力ファイルに加えない
 96 | 
 97 | ヘッダーやフッター等のメタデータを出力ファイルに加えないようにすることができます。
 98 | 
 99 | ```
100 | yomitoku ${path_data} --ignore_meta
101 | ```
102 | 
103 | ## 複数ページを統合する
104 | 
105 | PDFに複数ページが含まれる場合に複数ページを一つのファイルにまとめてエクスポートできます。
106 | 
107 | ```
108 | yomitoku ${path_data} -f md --combine
109 | ```
110 | 
111 | ## 読み取り順を指定する
112 | Autoでは、横書きのドキュメント、縦書きのドキュメントを識別し、自動で読み取り順を推定しますが、任意の読み取り順を指定することが可能です。デフォルトでは横書きの文書は`top2bottom`, 縦書きは`right2left`になります。
113 | 
114 | ```
115 | yomitoku ${path_data} --reading_order left2right
116 | ```
117 | 
118 | - `top2bottom`: 上から下方向に優先的に読み取り順を推定します。段組みのワードドキュメントなどに対して、有効です。
119 | 
120 | - `left2right`: 左から右方向に優先的に読み取り順を推定します。レシートや保険証などキーに対して、値を示すテキストが段組みになっているようなレイアウトに有効です。
121 | 
122 | - `right2left`: 右から左方向に優先的に読み取り順を推定します。縦書きのドキュメントに対して有効です。


--------------------------------------------------------------------------------
/docs/configuration.en.md:
--------------------------------------------------------------------------------
 1 | # Configuration
 2 | 
 3 | The configurable parameters for each module are explained.
 4 | 
 5 | ## Text Detector
 6 | 
 7 | ### input data
 8 | 
 9 | ```yaml
10 | data:
11 |   # If the number of pixels on the shorter side of the image falls below the specified value, the image will be enlarged to ensure that it meets or exceeds the pixel count set here.
12 |   shortest_size: int 
13 | 
14 |   # If the number of pixels on the longer side of the image exceeds the specified value, the image will be resized to ensure that it is equal to or less than the pixel count set here.
15 |   limit_size: int 
16 | ```
17 | 
18 | ### post process
19 | 
20 | ```yaml
21 | post_process:
22 |   # If the size of the larger side of the detected area falls below the specified value, the area will be removed.
23 |   min_size: int 
24 | 
25 |   # This is the threshold for the model's prediction score. Pixels with prediction scores below the specified threshold will be treated as background regions.
26 |   thresh: float 
27 | 
28 |   # The threshold for the average prediction score within a detected region. Regions whose average score falls below the specified threshold will be excluded.
29 |   box_thresh: float 
30 | 
31 |   # The maximum number of detectable text regions.
32 |   max_candidates: int 
33 | 
34 |   # A parameter to set the size of the margin area for text regions. Larger values increase the margin around text regions, allowing for detection with more whitespace, while smaller values result in tighter detection.
35 |   unclip_ratio: float
36 | ```
37 | ### Visualization
38 | 
39 | ```yaml
40 | visualize:
41 |   # The color of the bounding box for the detected regions.
42 |   color: [B, G, R] 
43 | 
44 |   # Whether to visualize and render the model's prediction heatmap.
45 |   heatmap: boolean 
46 | ```
47 | 
48 | ## Text Recognizer
49 | 
50 | ### maximum text length 
51 | ```yaml
52 | # The maximum string length that can be predicted. 
53 | max_label_length: int 
54 | ```
55 | 
56 | ### input data
57 | 
58 | ```yaml
59 | data:
60 |   # The number of images used for batch processing.
61 |   batch_size: int 
62 | ```
63 | 
64 | ### visualization
65 | 
66 | ```yaml
67 | visualize:
68 |   # The path to the font used for visualizing the predicted result strings.
69 |   font: str 
70 | 
71 |   # The color of the font used for visualizing the predicted result strings.
72 |   color: [BGR]
73 | 
74 |   # The font size of the predicted result strings.
75 |   font_size: int 
76 | ```
77 | 
78 | ## Layout_parser
79 | 
80 | ### threshold of prediction score
81 | 
82 | ```yaml
83 | # The threshold for the model's prediction score. Regions whose prediction score falls below the specified threshold will be excluded.
84 | thresh_score: float 
85 | ```
86 | 
87 | ## Table Structure Recognizer
88 | 
89 | ### threshold of prediction score
90 | 
91 | ```yaml
92 | # The threshold for the model's prediction score. Regions whose prediction score falls below the specified threshold will be excluded.
93 | thresh_score: float
94 | ```
95 | 


--------------------------------------------------------------------------------
/docs/configuration.ja.md:
--------------------------------------------------------------------------------
 1 | # Configuration
 2 | 
 3 | 各モジュールに対して、設定可能なパラメータについて説明します。
 4 | 
 5 | ## Text Detector
 6 | 
 7 | ### 入力画像サイズに関する設定
 8 | 
 9 | ```yaml
10 | data:
11 |   # 画像の短辺ピクセル数が設定した数値を下回る場合にここで設定した画像のピクセル数以上になるように画像を拡大します。
12 |   shortest_size: int 
13 | 
14 |   #画像の長辺ピクセル数が設定した数値を上回る場合にここで設定した画像のピクセル数以下になるように画像を縮小します。
15 |   limit_size: int 
16 | ```
17 | 
18 | ### 後処理
19 | 
20 | ```yaml
21 | post_process:
22 |   #検出した領域の辺の大きさが設定した数値を下回る場合に領域を除去します。
23 |   min_size: int 
24 | 
25 |   #モデルの予測スコアに対する閾値で、予測スコアが設定した閾値を下回るピクセルを背景領域として扱います。
26 |   thresh: float 
27 | 
28 |   #領域内の予測の平均スコアに対する閾値で、閾値を下回る領域を除外する
29 |   box_thresh: float 
30 | 
31 |   #検出可能なテキスト領域数の上限
32 |   max_candidates: int 
33 | 
34 |   #テキスト領域のマージン領域の大きさを設定するためのパラメータ。大きいほど、テキスト領域のマージンを大きくし、余白を持たせた検出が可能になり、小さいほどタイトな検出になる。
35 |   unclip_ratio: float
36 | ```
37 | 
38 | ### 可視化設定
39 | 
40 | ```yaml
41 | visualize:
42 |   #検出領域のバウンディングボックスの色の設定
43 |   color: [B, G, R] 
44 | 
45 |   #モデルの予測ヒートマップを可視化、描画するか
46 |   heatmap: boolean 
47 | ```
48 | 
49 | ## Text Recognizer
50 | 
51 | ### 文字列長
52 | 
53 | ```yaml
54 | #予測可能な最大文字列長
55 | max_label_length: int
56 | ```
57 | 
58 | ### 入力画像
59 | 
60 | ```yaml
61 | data:
62 |    #バッチ処理に用いる画像数
63 |   batch_size: int
64 | ```
65 | 
66 | ### 可視化設定
67 | 
68 | ```yaml
69 | visualize:
70 |   # 予測結果文字列の可視化に用いるフォントのパス
71 |   font: str 
72 | 
73 |   # 予測結果文字列の可視化に用いるフォントの色
74 |   color: [BGR] 
75 | 
76 |   # 予測結果文字列のフォントの大きさ
77 |   font_size: int
78 | ```
79 | 
80 | ## Layout_parser
81 | 
82 | ### 予測スコアに対する閾値
83 | 
84 | ```yaml
85 | #モデルの予測スコアに対する閾値で、予測スコアが設定した閾値を下回る領域を除外します。
86 | thresh_score: float 
87 | ```
88 | 
89 | ## Table Structure Recognizer
90 | 
91 | ### 予測スコアに対する閾値
92 | 
93 | ```yaml
94 | #モデルの予測スコアに対する閾値で、予測スコアが設定した閾値を下回る領域を除外します。
95 | thresh_score: float 
96 | ```
97 | 


--------------------------------------------------------------------------------
/docs/index.en.md:
--------------------------------------------------------------------------------
 1 | ## 🌟 Introduction
 2 | 
 3 | YomiToku is a Document AI engine specialized in Japanese document image analysis. It provides full OCR (optical character recognition) and layout analysis capabilities, enabling the recognition, extraction, and conversion of text and diagrams from images.
 4 | 
 5 | - 🤖 Equipped with four AI models trained on Japanese datasets: text detection, text recognition, layout analysis, and table structure recognition. All models are independently trained and optimized for Japanese documents, delivering high-precision inference.
 6 | - 🇯🇵 Each model is specifically trained for Japanese document images, supporting the recognition of over 7,000 Japanese characters, including vertical text and other layout structures unique to Japanese documents. (It also supports English documents.)
 7 | - 📈 By leveraging layout analysis, table structure parsing, and reading order estimation, it extracts information while preserving the semantic structure of the document layout.
 8 | - 📄 Supports a variety of output formats, including HTML, Markdown, JSON, and CSV. It also allows for the extraction of diagrams and images contained within the documents. It also supports converting document images into fully text-searchable PDFs.
 9 | - ⚡ Operates efficiently in GPU environments, enabling fast document transcription and analysis. It requires less than 8GB of VRAM, eliminating the need for high-end GPUs.
10 | 
11 | ## 🙋 FAQ
12 | 
13 | ### Q. Is it possible to use YomiToku in an environment without internet access?
14 | 
15 | A. Yes, it is possible.
16 | YomiToku connects to Hugging Face Hub to automatically download model files during the first execution, requiring internet access at that time. However, you can manually download the files in advance, allowing YomiToku to operate in an offline environment. For details, please refer to [Module Usage](module.en.md) under the section "Using YomiToku in an Offline Environment."
17 | 
18 | ### Q. Is commercial use allowed?
19 | 
20 | A. This package is licensed under CC BY-NC 4.0. It is available for free for personal and research purposes. For commercial use, a paid commercial license is required. Please contact the developers for further details.
21 | 


--------------------------------------------------------------------------------
/docs/index.ja.md:
--------------------------------------------------------------------------------
 1 | ## 🌟 概要
 2 | 
 3 | YomiToku は日本語に特化した AI 文章画像解析エンジン(Document AI)です。画像内の文字の全文 OCR およびレイアウト解析機能を有しており、画像内の文字情報や図表を認識、抽出、変換します。
 4 | 
 5 | - 🤖 日本語データセットで学習した 4 種類(文字位置の検知、文字列認識、レイアウト解析、表の構造認識)の AI モデルを搭載しています。4 種類のモデルはすべて独自に学習されたモデルで日本語文書に対して、高精度に推論可能です。
 6 | - 🇯🇵 各モデルは日本語の文書画像に特化して学習されており、7000 文字を超える日本語文字の認識をサポート、手書き文字、縦書きなど日本語特有のレイアウト構造の文書画像の解析も可能です。（日本語以外にも英語の文書に対しても対応しています）。
 7 | - 📈 レイアウト解析、表の構造解析, 読み順推定機能により、文書画像のレイアウトの意味的構造を壊さずに情報を抽出することが可能です。
 8 | - 📄 多様な出力形式をサポートしています。html やマークダウン、json、csv のいずれかのフォーマットに変換可能です。また、文書内に含まれる図表、画像の抽出の出力も可能です。文書画像を全文検索可能なサーチャブルPDFに変換する処理もサポートしています。
 9 | - ⚡ GPU 環境で高速に動作し、効率的に文書の文字起こし解析が可能です。また、VRAM も 8GB 以内で動作し、ハイエンドな GPU を用意する必要はありません。
10 | 
11 | ## 🙋 FAQ
12 | 
13 | ### Q. インターネットに接続できない環境での動作は可能ですか？
14 | 
15 | A. 可能です。Yomitoku は初回実行時に HuggingFaceHub にアクセスし、自動でモデルファイルのダウンロードを行いますが、この際にインターネットに接続します。しかし、事前に手動でダウンロードすることでインターネットへ接続できない環境でも動作可能です。詳しくは[Module Usage](module.ja.md)の「インターネットに接続できない環境での利用」を参照してください。
16 | 
17 | ### Q. 商用利用は可能ですか？
18 | 
19 | A. 本パッケージは [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) に従います。個人の利用や研究利用に関しては無償でご利用いただけます。商用利用に関しては、別途、有償の商用ライセンスを発行しますので、開発者まで問い合わせください。
20 | 


--------------------------------------------------------------------------------
/docs/installation.en.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | 
 4 | This package requires Python 3.10 or later and PyTorch 2.5 or later for execution. PyTorch must be installed according to your CUDA version. A GPU with more than 8GB of VRAM is recommended. While it can run on a CPU, please note that the processing is not currently optimized for CPUs, which may result in longer execution times.
 5 | 
 6 | ## from PYPI
 7 | 
 8 | ```bash
 9 | pip install yomitoku
10 | ```
11 | 
12 | ## using uv
13 | This repository uses the package management tool [uv](https://docs.astral.sh/uv/). After installing uv, clone the repository and execute the following commands:
14 | 
15 | ```bash
16 | uv sync
17 | ```
18 | 
19 | Using GPU with onnxruntime
20 | ```bash
21 | uv sync --extra gpu
22 | ```
23 | 
24 | When using uv, you need to modify the following part of the pyproject.toml file to match your CUDA version. By default, PyTorch compatible with CUDA 12.4 will be downloaded.
25 | 
26 | ```toml
27 | [[tool.uv.index]]
28 | name = "pytorch-cuda124"
29 | url = "https://download.pytorch.org/whl/cu124"
30 | explicit = true
31 | ```
32 | 
33 | 
34 | ## using docker
35 | 
36 | A Dockerfile is provided in the root of the repository, which you are welcome to use.
37 | 
38 | ```bash
39 | docker build -t yomitoku .
40 | ```
41 | 
42 | === "GPU"
43 | 
44 |     ```bash
45 |     docker run -it --gpus all -v $(pwd):/workspace --name yomitoku yomitoku /bin/bash
46 |     ```
47 | 
48 | === "CPU"
49 | 
50 |     ```bash
51 |     docker run -it -v $(pwd):/workspace --name yomitoku yomitoku /bin/bash
52 |     ```
53 | 


--------------------------------------------------------------------------------
/docs/installation.ja.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | 本パッケージは Python3.10+, Pytorch が実行に必要です。Pytorch はご自身の環境に合わせて、インストールが必要です。計算機は GPU(> VRAM 8G)を推奨しています。CPU でも動作しますが、現在、CPU 向けに処理が最適化されておらず、実行に時間がかかりますのでご注意ください。
 4 | 
 5 | ## PYPI からインストール
 6 | 
 7 | ```bash
 8 | pip install yomitoku
 9 | ```
10 | 
11 | ## uv でのインストール
12 | 
13 | 本リポジトリはパッケージ管理ツールに [uv](https://docs.astral.sh/uv/) を使用しています。uv をインストール後、リポジトリをクローンし、以下のコマンドを実行してください
14 | 
15 | ```bash
16 | uv sync
17 | ```
18 | 
19 | onnxruntimeの実行にGPUを使用する場合
20 | ```bash
21 | uv sync --extra gpu
22 | ```
23 | 
24 | uvを利用する場合、`pyproject.toml`の以下の部分をご自身のcudaのバージョンに合わせて修正する必要があります。デフォルトではCUDA12.4に対応したpytorchがダウンロードされます。
25 | 
26 | ```toml
27 | [[tool.uv.index]]
28 | name = "pytorch-cuda124"
29 | url = "https://download.pytorch.org/whl/cu124"
30 | explicit = true
31 | ```
32 | 
33 | ## Docker 環境での実行
34 | 
35 | リポジトリの直下に dockerfile を配置していますので、そちらも活用いただけます。
36 | 
37 | ```bash
38 | docker build -t yomitoku .
39 | ```
40 | 
41 | === "GPU"
42 | 
43 |     ```bash
44 |     docker run -it --gpus all -v $(pwd):/workspace --name yomitoku yomitoku /bin/bash
45 |     ```
46 | 
47 | === "CPU"
48 | 
49 |     ```bash
50 |     docker run -it -v $(pwd):/workspace --name yomitoku yomitoku /bin/bash
51 |     ```
52 | 


--------------------------------------------------------------------------------
/docs/mcp.en.md:
--------------------------------------------------------------------------------
 1 | # MCP
 2 | 
 3 | This section explains how to use the Yomitoku MCP server in conjunction with Claude Desktop.
 4 | 
 5 | ## Installing Yomitoku
 6 | 
 7 | First, install Yomitoku by following the "using uv" section in [Installation](installation.en.md).
 8 | 
 9 | However, to add `mcp` as a dependency during installation, include `mcp` in `--extra` as shown below.
10 | 
11 | ```bash
12 | uv sync --extra mcp
13 | ```
14 | 
15 | ## Setting up Claude Desktop
16 | 
17 | Next, add the following configuration to the `mcpServers` section of the Claude Desktop configuration file. (Refer to [here](https://modelcontextprotocol.io/quickstart/user) for how to open the configuration file)
18 | 
19 | ```json
20 | {
21 |   "mcpServers": {
22 |     "yomitoku": {
23 |       "command": "uv",
24 |       "args": [
25 |         "--directory",
26 |         "(Absolute path of the directory where Yomitoku was cloned)",
27 |         "run",
28 |         "yomitoku_mcp"
29 |       ],
30 |       "env": {
31 |         "RESOURCE_DIR": "(Absolute path of the directory containing files for OCR)"
32 |       }
33 |     }
34 |   }
35 | }
36 | ```
37 | 
38 | For example, if you executed `git clone https://github.com/kotaro-kinoshita/yomitoku.git` in `/Users/your-username/workspace`, then `(Directory where Yomitoku was cloned)` would be `/Users/your-username/workspace/yomitoku`, and if you use `sample.pdf` in the `yomitoku/demo` directory, specify `(Directory containing files for OCR)` as `/Users/your-username/workspace/yomitoku/demo`.
39 | 
40 | ## Using Claude Desktop
41 | 
42 | * Please restart Claude Desktop to apply changes to the configuration file.
43 | 
44 | For example, if you use `yomitoku/demo/sample.pdf` as a sample, instruct as follows:
45 | 
46 | ```txt
47 | Analyze sample.pdf using OCR and translate it into English.
48 | ```
49 | 
50 | ## Starting the SSE Server
51 | 
52 | Set the `RESOURCE_DIR` environment variable to the path of the folder containing the images to be processed by OCR.
53 | 
54 | ```
55 | export RESOURCE_DIR="path of dataset"
56 | ```
57 | 
58 | Start the SSE server using the following command:
59 | 
60 | ```
61 | uv run yomitoku_mcp -t sse
62 | ```
63 | 
64 | The SSE server endpoint will be available at `http://127.0.0.1:8000/sse`.
65 | 


--------------------------------------------------------------------------------
/docs/mcp.ja.md:
--------------------------------------------------------------------------------
 1 | # MCP
 2 | 
 3 | ここではYomitokuのMCPサーバーをClaude Desktopに連携して利用する方法を説明します。
 4 | 
 5 | ## Yomitokuのインストール
 6 | 
 7 | まずは
 8 | [Installation](installation.ja.md)の「uvでのインストール」に従ってYomitokuをインストールしてください。
 9 | 
10 | ただし、`mcp`を依存関係に追加するためにインストール時には下記のように`--extra`に`mcp`を加えます。
11 | 
12 | ```bash
13 | uv sync --extra mcp
14 | ```
15 | 
16 | 
17 | ## Claude Desktopの設定
18 | 
19 | 次にClaude Desktopの設定ファイルの`mcpServers`に以下のように設定を追加します。(設定ファイルの開き方は[こちら](https://modelcontextprotocol.io/quickstart/user)を参照してください)
20 | 
21 | ```json
22 | {
23 |   "mcpServers": {
24 |     "yomitoku": {
25 |       "command": "uv",
26 |       "args": [
27 |         "--directory",
28 |         "(YomitokuをCloneしたディレクトリの絶対パス)",
29 |         "run",
30 |         "yomitoku_mcp"
31 |       ],
32 |       "env": {
33 |         "RESOURCE_DIR": "(OCR対象のファイルがあるディレクトリの絶対パス)"
34 |       }
35 |     }
36 |   }
37 | }
38 | ```
39 | 
40 | 
41 | 例えば、`/Users/your-username/workspace`で`git clone https://github.com/kotaro-kinoshita/yomitoku.git`を実行した場合は、`(YomitokuをCloneしたディレクトリ)`は`/Users/your-username/workspace/yomitoku`となり、`yomitoku/demo`ディレクトリの`sample.pdf`を用いる場合は`(OCR対象のファイルがあるディレクトリ)`を`/Users/your-username/workspace/yomitoku/demo`と指定します。
42 | 
43 | ## Claude Desktopでの利用
44 | 
45 | ※ 設定ファイルの変更を反映するにはClaude Desktopを再起動してください。
46 | 
47 | 例えば`yomitoku/demo/sample.pdf`をサンプルとして用いる場合、下記のように指示してください。
48 | 
49 | ```txt
50 | sample.pdfをOCRで解析して要約してください。
51 | ```
52 | 
53 | ## SSEサーバーの起動
54 | 環境変数の`RESOURCE_DIR`にOCRの対象画像が含まれたフォルダのパスを設定してください。
55 | ```
56 | export RESOURCE_DIR="path of dataset"
57 | ```
58 | 
59 | 以下のコマンドでSSEサーバーを起動します。
60 | ```
61 | uv run yomitoku_mcp -t sse
62 | ```
63 | 
64 | `http://127.0.0.1:8000/sse`がSSEサーバーのエンドポイントになります。


--------------------------------------------------------------------------------
/docs/module.ja.md:
--------------------------------------------------------------------------------
  1 | # モジュールとしてのコード内での利用
  2 | 
  3 | ## Document Analyzer の利用
  4 | 
  5 | Document Analyzer は OCR およびレイアウト解析を実行し、それらの結果を統合した解析結果を返却します。段落、表の構造解析、抽出、図表の検知など様々なユースケースにご利用いただけます。
  6 | 
  7 | <!--codeinclude-->
  8 | 
  9 | [demo/simple_document_analysis.py](../demo/simple_document_analysis.py)
 10 | 
 11 | <!--/codeinclude-->
 12 | 
 13 | - `visualize` を True にすると各処理結果を可視化した結果を第２、第 3 戻り値に OCR、レイアウト解析の処理結果をそれぞれ格納し、返却します。False にした場合は None を返却します。描画処理のための計算が増加しますので、デバッグ用途でない場合は、False を推奨します。
 14 | - `device` には処理に用いる計算機を指定します。Default は"cuda". GPU が利用できない場合は、自動で CPU モードに切り替えて処理を実行します。
 15 | - `configs`を活用すると、パイプラインの処理のより詳細のパラメータを設定できます。
 16 | 
 17 | `DocumentAnalyzer` の処理結果のエクスポートは以下に対応しています。
 18 | 
 19 | - `to_json()`: JSON 形式(\*.json)
 20 | - `to_html()`: HTML 形式(\*.html)
 21 | - `to_csv()`: カンマ区切り CSV 形式(\*.csv)
 22 | - `to_markdown()`: マークダウン形式(\*.md)
 23 | 
 24 | ## AI-OCR のみの利用
 25 | 
 26 | AI-OCR では、テキスト検知と検知したテキストに対して、認識処理を実行し、画像内の文字の位置と読み取り結果を返却します。
 27 | 
 28 | <!--codeinclude-->
 29 | 
 30 | [demo/simple_ocr.py](../demo/simple_ocr.py)
 31 | 
 32 | <!--/codeinclude-->
 33 | 
 34 | - `visualize` を True にすると各処理結果を可視化した結果を第２、第 3 戻り値に OCR、レイアウト解析の処理結果をそれぞれ格納し、返却します。False にした場合は None を返却します。描画処理のための計算が増加しますので、デバッグ用途でない場合は、False を推奨します。
 35 | - `device` には処理に用いる計算機を指定します。Default は"cuda". GPU が利用できない場合は、自動で CPU モードに切り替えて処理を実行します。
 36 | - `configs`を活用すると、パイプラインの処理のより詳細のパラメータを設定できます。
 37 | 
 38 | `OCR`の処理結果のエクスポートは JSON 系形式(`to_json()`)のみサポートしています。
 39 | 
 40 | ## Layout Analyzer のみの利用
 41 | 
 42 | LayoutAnalyzer では、テキスト検知と検知したテキストに対して、段落、図表の検知および表の構造解析処理 AI を実行し、文書内のレイアウト構造を解析します。
 43 | 
 44 | <!--codeinclude-->
 45 | 
 46 | [demo/simple_layout.py](../demo/simple_layout.py)
 47 | 
 48 | <!--/codeinclude-->
 49 | 
 50 | - `visualize` を True にすると各処理結果を可視化した結果を第２、第 3 戻り値に OCR、レイアウト解析の処理結果をそれぞれ格納し、返却します。False にした場合は None を返却します。描画処理のための計算が増加しますので、デバッグ用途でない場合は、False を推奨します。
 51 | - `device` には処理に用いる計算機を指定します。Default は"cuda". GPU が利用できない場合は、自動で CPU モードに切り替えて処理を実行します。
 52 | - `configs`を活用すると、パイプラインの処理のより詳細のパラメータを設定できます。
 53 | 
 54 | `LayoutAnalyzer`の処理結果のエクスポートは JSON 系形式(`to_json()`)のみサポートしています。
 55 | 
 56 | ## パイプラインの詳細設定
 57 | 
 58 | Config を与えることで、より細かい振る舞いを調整できます。モジュールに対して、以下のパラメータを設定可能です。
 59 | 
 60 | - model_name: モデルのアーキテクチャを与えます
 61 | - path_cfg: ハイパーパラメータを与えた config のパスを入力します。
 62 | - device: 推論に使用するデバイスを与えます。(cuda | cpu | mps)
 63 | - visualize: 可視化処理の実施の有無を指定します。(boolean)
 64 | - from_pretrained: Pretrained モデルを使用するかどうかを指定します(boolean)
 65 | - infer_onnx: torch の代わりに onnxruntime を使用して、推論するかどうかを指定します(boolean)
 66 | 
 67 | **サポートされるモデルの種類(model_name)**
 68 | 
 69 | - TextRecognizer: "parseq", "parseq-small"
 70 | - TextDetector: "dbnet"
 71 | - LayoutParser: "rtdetrv2"
 72 | - TableStructureRecognizer: "rtdetrv2"
 73 | 
 74 | ### Config の記述方法
 75 | 
 76 | config は辞書形式で与えます。config を与えることでモジュールごとに異なる計算機で処理を実行したり、詳細のパラメータの設定が可能です。例えば以下のような config を与えると、OCR 処理は GPU で実行し、レイアウト解析機能は CPU で実行します。
 77 | 
 78 | ```python
 79 | from yomitoku import DocumentAnalyzer
 80 | 
 81 | if __name__ == "__main__":
 82 |     configs = {
 83 |         "ocr": {
 84 |             "text_detector": {
 85 |                 "device": "cuda",
 86 |             },
 87 |             "text_recognizer": {
 88 |                 "device": "cuda",
 89 |             },
 90 |         },
 91 |         "layout_analyzer": {
 92 |             "layout_parser": {
 93 |                 "device": "cpu",
 94 |             },
 95 |             "table_structure_recognizer": {
 96 |                 "device": "cpu",
 97 |             },
 98 |         },
 99 |     }
100 | 
101 |     DocumentAnalyzer(configs=configs)
102 | ```
103 | 
104 | ## yaml ファイルでのパラメータの定義
105 | 
106 | Config に yaml ファイルのパスを与えることで、推論時の細部のパラメータの調整が可能です。yaml ファイルの例はリポジトリ内の`configs`ディレクトリ内にあります。モデルのネットワークのパラメータは変更できませんが、後処理のパラメータや入力画像のサイズなどは一部変更が可能です。変更可能なパラメータは[configuration](configuration.ja.md)を参考にしてください。
107 | 
108 | たとえば、以下のように`Text Detector`の後処理の閾値を yaml で定義し、config にパスを設定することができます。config ファイルはすべてのパラメータを記載する必要はなく、変更が必要なパラメータのみの記載が可能です。
109 | 
110 | `text_detector.yaml`の記述
111 | 
112 | ```yaml
113 | post_process:
114 |   thresh: 0.1
115 |   unclip_ratio: 2.5
116 | ```
117 | 
118 | yaml ファイルのパスを config に格納する
119 | 
120 | <!--codeinclude-->
121 | 
122 | [demo/setting_document_anaysis.py](../demo/setting_document_anaysis.py)
123 | 
124 | <!--/codeinclude-->
125 | 
126 | ## インターネットに接続できない環境での利用
127 | 
128 | Yomitoku は初回の実行時に HuggingFaceHub からモデルを自動でダウンロードします。その際にインターネット環境が必要ですが、事前に手動でダウンロードすることでインターネットに接続できない環境でも実行することが可能です。
129 | 
130 | 1. [Git Large File Storage](https://docs.github.com/ja/repositories/working-with-files/managing-large-files/installing-git-large-file-storage)をインストール
131 | 
132 | 2. 事前にインターネットに接続できる環境でモデルリポジトリをダウンロードします。クローンしたリポジトリはご自身のツールで動作環境にコピーしてください。
133 | 
134 | 以下は huggingfacehub からモデルリポジトリをダウンロードするコマンド
135 | 
136 | ```sh
137 | git clone https://huggingface.co/KotaroKinoshita/yomitoku-table-structure-recognizer-rtdtrv2-open-beta
138 | 
139 | git clone https://huggingface.co/KotaroKinoshita/yomitoku-layout-parser-rtdtrv2-open-beta
140 | 
141 | git clone https://huggingface.co/KotaroKinoshita/yomitoku-text-detector-dbnet-open-beta
142 | 
143 | git clone https://huggingface.co/KotaroKinoshita/yomitoku-text-recognizer-parseq-open-beta
144 | ```
145 | 
146 | 3. yomitoku のリポジトリの直下にモデルリポジトリを配置し、yaml ファイルの`hf_hub_repo`でローカルのモデルレポジトリを参照します。以下は `text_detector.yaml` の例です。同様に他のモジュールに対しても yaml ファイルを定義します。
147 | 
148 | ```yaml
149 | hf_hub_repo: yomitoku-text-detector-dbnet-open-beta
150 | ```
151 | 
152 | 4. yaml ファイルのパスを config に格納する
153 | 
154 | <!--codeinclude-->
155 | 
156 | [demo/setting_document_anaysis.py](../demo/setting_document_anaysis.py)
157 | 
158 | <!--/codeinclude-->
159 | 


--------------------------------------------------------------------------------
/gallery.md:
--------------------------------------------------------------------------------
 1 | |                      入力画像                       |                           OCR                            |                       レイアウト解析                        |              markdown              |
 2 | | :-------------------------------------------------: | :------------------------------------------------------: | :---------------------------------------------------------: | :--------------------------------: |
 3 | | <img src="static/in/gallery1.jpg" width="400px"> ※1 | <img src="static/out/in_gallery1_p1_ocr.jpg" width="400px"> ※1 | <img src="static/out/in_gallery1_p1_layout.jpg" width="400px"> ※1 | [results1](static/out/in_gallery1_p1.md) |
 4 | | <img src="static/in/gallery2.jpg" width="400px"> ※2 | <img src="static/out/in_gallery2_p1_ocr.jpg" width="400px"> ※2 | <img src="static/out/in_gallery2_p1_layout.jpg" width="400px"> ※2 | [results2](static/out/in_gallery2_p1.md) |
 5 | | <img src="static/in/gallery3.jpg" width="400px"> ※3 | <img src="static/out/in_gallery3_p1_ocr.jpg" width="400px"> ※3 | <img src="static/out/in_gallery3_p1_layout.jpg" width="400px"> ※3 | [results3](static/out/in_gallery3_p1.md) |
 6 | | <img src="static/in/gallery5.jpg" width="400px"> ※4 | <img src="static/out/in_gallery5_p1_ocr.jpg" width="400px"> ※4 | <img src="static/out/in_gallery5_p1_layout.jpg" width="400px"> ※4 | [results4](static/out/in_gallery5_p1.md) |
 7 | |  <img src="static/in/gallery4.jpg" width="400px">   |  <img src="static/out/in_gallery4_p1_ocr.jpg" width="400px">   |  <img src="static/out/in_gallery4_p1_layout.jpg" width="400px">   |   [results5](static/out/in_gallery4_p1.md) |
 8 | |  <img src="static/in/gallery6.jpg" width="400px">   |  <img src="static/out/in_gallery6_p1_ocr.jpg" width="400px">   |  <img src="static/out/in_gallery6_p1_layout.jpg" width="400px">   | [results6](static/out/in_gallery6_p1.md) |
 9 | |  <img src="static/in/gallery7.jpeg" width="400px">   |  <img src="static/out/in_gallery7_p1_ocr.jpg" width="400px">   |  <img src="static/out/in_gallery7_p1_layout.jpg" width="400px">   | [results7](static/out/in_gallery7_p1.md) |
10 | 
11 | - ※1 出典:[「広報誌 令和 6 年 11 月号」](https://www.soumu.go.jp/menu_news/kouhoushi/koho/2411.html)：（総務省） を加工して作成
12 | - ※2 出典:[「令和 7 年度歳出予算概算要求書（東日本大震災復興特別会計）」](https://www.soumu.go.jp/main_content/000967305.pdf)：（総務省） を加工して作成
 13 | - ※3 出典: [「文部科学広報 2019 年 4 月号」](https://www.mext.go.jp/b_menu/kouhou/08121808/001/1416239.htm)：（文部科学省） を加工して作成
 14 | - ※4 出典: [「運転免許の更新等運転免許に関する諸手続について」](https://www.npa.go.jp/policies/application/license_renewal/index.html)：（警察庁） を加工して作成
15 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: YomiToku
 2 | theme:
 3 |   name: material
 4 |   features:
 5 |     - navigation.tabs
 6 |   logo: assets/logo.svg
 7 |   icon:
 8 |     custom_dir: overrides # on
 9 |     repo: fontawesome/brands/github
10 | 
11 |   palette:
12 |     - media: "(prefers-color-scheme: light)"
13 |       scheme: default
14 |       primary: indigo
15 |       accent: blue
16 |       toggle:
17 |         icon: material/toggle-switch
18 |         name: Switch to dark mode
19 |     - media: "(prefers-color-scheme: dark)"
20 |       scheme: slate
21 |       primary: black
22 |       accent: indigo
23 |       toggle:
24 |         icon: material/toggle-switch-off-outline
25 |         name: Switch to system preference
26 | 
# Markdown extensions enabled for the documentation site (one entry each).
markdown_extensions:
  - abbr
  - attr_list
  - pymdownx.snippets
  - pymdownx.critic
  - pymdownx.caret
  - pymdownx.keys
  - pymdownx.mark
  - pymdownx.tilde
  - footnotes
  - def_list
  - md_in_html
  - pymdownx.tasklist:
      custom_checkbox: true
  - toc:
      permalink: true
  - pymdownx.superfences:
      custom_fences:
        - name: mermaid
          class: mermaid
          format: !!python/name:pymdownx.superfences.fence_code_format
  - pymdownx.emoji:
      emoji_index: !!python/name:material.extensions.emoji.twemoji
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
      options:
        custom_icons:
          - overrides/.icons
  - pymdownx.highlight:
      anchor_linenums: true
      line_spans: __span
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.tabbed:
      alternate_style: true
  - admonition
  - pymdownx.details
  - pymdownx.arithmatex:
      generic: true
66 | 
67 | nav:
68 |   - Home: index.md
69 |   - Installation: installation.md
70 |   - CLI Usage: cli.md
71 |   - Module Usage: module.md
72 |   - MCP: mcp.md
73 | 
74 | repo_url: https://github.com/kotaro-kinoshita/yomitoku-dev
75 | 
76 | plugins:
77 |   - search:
78 |   - codeinclude:
79 |   - i18n:
80 |       languages:
81 |         - locale: ja
82 |           name: 日本語
83 |           default: true
84 |         - locale: en
85 |           name: English
86 |           link: /yomitoku/en/
87 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["hatchling", "uv-dynamic-versioning"]
  3 | build-backend = "hatchling.build"
  4 | 
  5 | [tool.hatch.version]
  6 | source = "uv-dynamic-versioning"
  7 | 
  8 | [project]
  9 | name = "yomitoku"
 10 | dynamic = ["version"]
 11 | authors = [{name = "Kotaro Kinoshita", email = "kotaro.kinoshita@mlism.com"}]
 12 | description = "Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language."
 13 | readme = "README.md"
 14 | license = {text = "CC BY-NC-SA 4.0"}
 15 | requires-python = ">=3.10,<3.13"
 16 | keywords = ["Japanese", "OCR", "Deep Learning"]
 17 | dependencies = [
 18 |     "huggingface-hub>=0.26.1",
 19 |     "lxml>=5.3.0",
 20 |     "omegaconf>=2.3.0",
 21 |     "opencv-python>=4.10.0.84",
 22 |     "pyclipper>=1.3.0.post6",
 23 |     "pydantic>=2.9.2",
 24 |     "shapely>=2.0.6",
 25 |     "timm>=1.0.11",
 26 |     "torchvision>=0.20.0",
 27 |     "torch>=2.5.0",
 28 |     "pypdfium2>=4.30.0",
 29 |     "onnx>=1.17.0",
 30 |     "torch>=2.5.0",
 31 |     "torchvision>=0.20.0",
 32 |     "onnxruntime>=1.20.1",
 33 |     "reportlab>=4.4.1",
 34 |     "jaconv>=0.4.0",
 35 | ]
 36 | 
 37 | [tool.uv-dynamic-versioning]
 38 | vcs = "git"
 39 | style = "semver"
 40 | format = "{base}"
 41 | 
 42 | [tool.uv]
 43 | dev-dependencies = [
 44 |     "mkdocs-codeinclude-plugin>=0.2.1",
 45 |     "mkdocs-material>=9.5.44",
 46 |     "mkdocs-static-i18n",
 47 |     "mkdocs>=1.6.1",
 48 |     "plantuml-markdown",
 49 |     "pygments",
 50 |     "pytest-cov>=5.0.0",
 51 |     "pytest>=8.3.3",
 52 |     "ruff>=0.7.0",
 53 | ]
 54 | 
 55 | [tool.uv.sources]
 56 | torch = [
 57 |     { index = "pytorch-cpu", marker = "platform_system == 'Darwin'"},
 58 | ]
 59 | torchvision = [
 60 |     { index = "pytorch-cpu", marker = "platform_system == 'Darwin'"},
 61 | ]
 62 | 
 63 | [[tool.uv.index]]
 64 | name = "pytorch-cuda124"
 65 | url = "https://download.pytorch.org/whl/cu124"
 66 | explicit = true
 67 | 
 68 | [[tool.uv.index]]
 69 | name = "pytorch-cpu"
 70 | url = "https://download.pytorch.org/whl/cpu"
 71 | explicit = true
 72 | 
 73 | [project.scripts]
 74 | yomitoku = "yomitoku.cli.main:main"
 75 | yomitoku_mcp = "yomitoku.cli.mcp_server:main"
 76 | 
 77 | [project.optional-dependencies]
 78 | mcp = [
 79 |     "mcp[cli]>=1.6.0",
 80 | ]
 81 | 
 82 | [tool.tox]
 83 | legacy_tox_ini = """
 84 | [tox]
 85 | envlist = lint, py310, py311, py312, docs
 86 | 
 87 | [testenv]
 88 | deps = pytest
 89 | commands = 
 90 |     pytest tests
 91 | 
 92 | [testenv:lint]
 93 | basepython = python3.12
 94 | deps = ruff
 95 | commands = 
 96 |     ruff check
 97 | 
 98 | [testenv:docs]
 99 | basepython = python3.12
100 | deps = 
101 |     mkdocs-material
102 |     pygments 
103 |     plantuml_markdown 
104 |     mkdocs-static-i18n
105 |     mkdocs-codeinclude-plugin
106 | commands = 
107 |     mkdocs gh-deploy --force
108 | """
109 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/pytest.ini


--------------------------------------------------------------------------------
/scripts/register_hugging_face_hub.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | 
 4 | from yomitoku.layout_parser import LayoutParser
 5 | from yomitoku.table_structure_recognizer import TableStructureRecognizer
 6 | from yomitoku.text_detector import TextDetector
 7 | from yomitoku.text_recognizer import TextRecognizer
 8 | 
 9 | 
def get_module(module_name, device):
    """Build the yomitoku module selected by ``module_name``.

    Args:
        module_name: One of ``"text_detector"``, ``"text_recognizer"``,
            ``"layout_parser"`` or ``"table_structure_recognizer"``.
        device: Device identifier forwarded to the module constructor.

    Returns:
        The newly constructed module instance.

    Raises:
        ValueError: If ``module_name`` is not one of the names listed above.
    """
    if module_name == "text_detector":
        return TextDetector(from_pretrained=True, device=device)

    if module_name == "text_recognizer":
        return TextRecognizer(from_pretrained=True, device=device)

    if module_name == "layout_parser":
        return LayoutParser(from_pretrained=True, device=device)

    if module_name == "table_structure_recognizer":
        # Unlike the other three, this module is built with from_pretrained=False.
        return TableStructureRecognizer(from_pretrained=False, device=device)

    raise ValueError(f"Invalid module name: {module_name}")
40 | 
41 | 
def main(args):
    """Save the selected module's model and push it to the Hugging Face Hub.

    Args:
        args: Parsed command-line namespace; ``module``, ``device``, ``owner``
            and ``name`` are read here. ``checkpoint`` is only referenced by
            the disabled code below.
    """
    model = get_module(args.module, args.device).model
    # Loading weights from a local checkpoint is currently disabled:
    # model.load_state_dict(
    #    torch.load(args.checkpoint, map_location="cpu")["model"]
    # )

    repo_id = f"{args.owner}/{args.name}"
    model.save_pretrained(args.name)
    model.push_to_hub(repo_id)
50 | 
51 | 
if __name__ == "__main__":
    # --module, --owner and --name are mandatory: left unset they would be None,
    # producing the repository id "None/None" or an unclear failure later on.
    parser = argparse.ArgumentParser()
    parser.add_argument("--module", type=str, required=True)
    # Only referenced by the disabled checkpoint-loading code in main().
    parser.add_argument("--checkpoint", type=str)
    parser.add_argument("--owner", type=str, required=True)
    parser.add_argument("--name", type=str, required=True)
    parser.add_argument("--device", type=str, default="cuda")
    args = parser.parse_args()

    main(args)
62 | 


--------------------------------------------------------------------------------
/src/yomitoku/__init__.py:
--------------------------------------------------------------------------------
 1 | from importlib.metadata import version
 2 | 
 3 | from .document_analyzer import DocumentAnalyzer
 4 | from .layout_analyzer import LayoutAnalyzer
 5 | from .layout_parser import LayoutParser
 6 | from .ocr import OCR
 7 | from .table_structure_recognizer import TableStructureRecognizer
 8 | from .text_detector import TextDetector
 9 | from .text_recognizer import TextRecognizer
10 | 
11 | __all__ = [
12 |     "OCR",
13 |     "LayoutParser",
14 |     "TableStructureRecognizer",
15 |     "TextDetector",
16 |     "TextRecognizer",
17 |     "LayoutAnalyzer",
18 |     "DocumentAnalyzer",
19 | ]
20 | __version__ = version(__package__)
21 | 


--------------------------------------------------------------------------------
/src/yomitoku/base.py:
--------------------------------------------------------------------------------
import functools
import time
from pathlib import Path
from typing import Union

import torch
from omegaconf import OmegaConf
from pydantic import BaseModel, ConfigDict, Extra

from .export import export_json
from .utils.logger import set_logger
 11 | 
 12 | logger = set_logger(__name__, "INFO")
 13 | 
 14 | 
 15 | def load_yaml_config(path_config: str):
 16 |     path_config = Path(path_config)
 17 |     if not path_config.exists():
 18 |         raise FileNotFoundError(f"Config file not found: {path_config}")
 19 | 
 20 |     with open(path_config, "r") as file:
 21 |         yaml_config = OmegaConf.load(file)
 22 |     return yaml_config
 23 | 
 24 | 
 25 | def load_config(
 26 |     default_config,
 27 |     path_config: Union[str, None] = None,
 28 | ):
 29 |     cfg = OmegaConf.structured(default_config)
 30 |     if path_config is not None:
 31 |         yaml_config = load_yaml_config(path_config)
 32 |         cfg = OmegaConf.merge(cfg, yaml_config)
 33 |     return cfg
 34 | 
 35 | 
 36 | def observer(cls, func):
 37 |     def wrapper(*args, **kwargs):
 38 |         try:
 39 |             start = time.time()
 40 |             result = func(*args, **kwargs)
 41 |             elapsed = time.time() - start
 42 |             logger.info(f"{cls.__name__} {func.__name__} elapsed_time: {elapsed}")
 43 |         except Exception as e:
 44 |             logger.error(f"Error occurred in {cls.__name__} {func.__name__}: {e}")
 45 |             raise e
 46 |         return result
 47 | 
 48 |     return wrapper
 49 | 
 50 | 
 51 | class BaseSchema(BaseModel):
 52 |     class Config:
 53 |         extra = Extra.forbid
 54 |         validate_assignment = True
 55 | 
 56 |     def to_json(self, out_path: str, **kwargs):
 57 |         return export_json(self, out_path, **kwargs)
 58 | 
 59 | 
 60 | class BaseModule:
 61 |     model_catalog = None
 62 | 
 63 |     def __init__(self):
 64 |         if self.model_catalog is None:
 65 |             raise NotImplementedError
 66 | 
 67 |         if not issubclass(self.model_catalog.__class__, BaseModelCatalog):
 68 |             raise ValueError(
 69 |                 f"{self.model_catalog.__class__} is not SubClass BaseModelCatalog."
 70 |             )
 71 | 
 72 |         if len(self.model_catalog.list_model()) == 0:
 73 |             raise ValueError("No model is registered.")
 74 | 
 75 |     def __new__(cls, *args, **kwds):
 76 |         logger.info(f"Initialize {cls.__name__}")
 77 |         cls.__call__ = observer(cls, cls.__call__)
 78 |         return super().__new__(cls)
 79 | 
 80 |     def load_model(self, name, path_cfg, from_pretrained=True):
 81 |         default_cfg, Net = self.model_catalog.get(name)
 82 |         self._cfg = load_config(default_cfg, path_cfg)
 83 |         if from_pretrained:
 84 |             self.model = Net.from_pretrained(self._cfg.hf_hub_repo, cfg=self._cfg)
 85 |         else:
 86 |             self.model = Net(cfg=self._cfg)
 87 | 
 88 |     def save_config(self, path_cfg):
 89 |         OmegaConf.save(self._cfg, path_cfg)
 90 | 
 91 |     def log_config(self):
 92 |         logger.info(OmegaConf.to_yaml(self._cfg))
 93 | 
 94 |     @classmethod
 95 |     def catalog(cls):
 96 |         display = ""
 97 |         for model in cls.model_catalog.list_model():
 98 |             display += f"{model} "
 99 |         logger.info(f"{cls.__name__} Implemented Models")
100 |         logger.info(display)
101 | 
102 |     @property
103 |     def device(self):
104 |         return self._device
105 | 
106 |     @device.setter
107 |     def device(self, device):
108 |         if "cuda" in device:
109 |             if torch.cuda.is_available():
110 |                 self._device = torch.device(device)
111 |             else:
112 |                 self._device = torch.device("cpu")
113 |                 logger.warning("CUDA is not available. Use CPU instead.")
114 |         else:
115 |             self._device = torch.device("cpu")
116 | 
117 | 
class BaseModelCatalog:
    """Registry mapping a case-insensitive model name to ``(config, model)``."""

    def __init__(self):
        # Keys are stored lower-cased; values are (config, model) tuples.
        self.catalog = {}

    def get(self, model_name):
        """Return the ``(config, model)`` pair registered under ``model_name``.

        Raises:
            ValueError: If no model is registered under that name.
        """
        model_name = model_name.lower()
        if model_name in self.catalog:
            return self.catalog[model_name]

        raise ValueError(f"Unknown model: {model_name}")

    def register(self, model_name, config, model):
        """Register ``(config, model)`` under ``model_name``.

        The key is lower-cased because ``get`` lower-cases its argument; a
        mixed-case registration could otherwise never be retrieved.

        Raises:
            ValueError: If the name (ignoring case) is already registered.
        """
        key = model_name.lower()
        if key in self.catalog:
            raise ValueError(f"{model_name} is already registered.")

        self.catalog[key] = (config, model)

    def list_model(self):
        """Return the registered model names as a list."""
        return list(self.catalog.keys())
137 | 


--------------------------------------------------------------------------------
/src/yomitoku/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/src/yomitoku/cli/__init__.py


--------------------------------------------------------------------------------
/src/yomitoku/cli/mcp_server.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import io
  3 | import json
  4 | import os
  5 | from argparse import ArgumentParser
  6 | from pathlib import Path
  7 | 
  8 | from mcp.server.fastmcp import Context, FastMCP
  9 | 
 10 | from yomitoku import DocumentAnalyzer
 11 | from yomitoku.data.functions import load_image, load_pdf
 12 | from yomitoku.export import (
 13 |     convert_csv,
 14 |     convert_html,
 15 |     convert_json,
 16 |     convert_markdown,
 17 | )
 18 | 
# RESOURCE_DIR is read from the environment once, at import time; importing
# this module without the variable set fails immediately with ValueError.
try:
    RESOURCE_DIR = os.environ["RESOURCE_DIR"]
except KeyError:
    raise ValueError("Environment variable 'RESOURCE_DIR' is not set.")
 23 | 
 24 | 
 25 | analyzer = None
 26 | 
 27 | 
 28 | async def load_analyzer(ctx: Context) -> DocumentAnalyzer:
 29 |     """
 30 |     Load the DocumentAnalyzer instance if not already loaded.
 31 | 
 32 |     Args:
 33 |         ctx (Context): The context in which the analyzer is being loaded.
 34 | 
 35 |     Returns:
 36 |         DocumentAnalyzer: The loaded document analyzer instance.
 37 |     """
 38 |     global analyzer
 39 |     if analyzer is None:
 40 |         await ctx.info("Load document analyzer")
 41 |         analyzer = DocumentAnalyzer(visualize=False, device="cuda")
 42 |     return analyzer
 43 | 
 44 | 
 45 | mcp = FastMCP("yomitoku")
 46 | 
 47 | 
 48 | @mcp.tool()
 49 | async def process_ocr(ctx: Context, filename: str, output_format: str) -> str:
 50 |     """
 51 |     Perform OCR on the specified file in the resource direcory and convert
 52 |     the results to the desired format.
 53 | 
 54 |     Args:
 55 |         ctx (Context): The context in which the OCR processing is executed.
 56 |         filename (str): The name of the file to process in the resource directory.
 57 |         output_format (str): The desired format for the output. The available options are:
 58 |             - json: Outputs the text as structured data along with positional information.
 59 |             - markdown: Outputs texts and tables in Markdown format.
 60 |             - html: Outputs texts and tables in HTML format.
 61 |             - csv: Outputs texts and tables in CSV format.
 62 | 
 63 |     Returns:
 64 |         str: The OCR results converted to the specified format.
 65 |     """
 66 |     analyzer = await load_analyzer(ctx)
 67 | 
 68 |     await ctx.info("Start ocr processing")
 69 | 
 70 |     file_path = os.path.join(RESOURCE_DIR, filename)
 71 |     if Path(file_path).suffix[1:].lower() in ["pdf"]:
 72 |         imgs = load_pdf(file_path)
 73 |     else:
 74 |         imgs = load_image(file_path)
 75 | 
 76 |     results = []
 77 |     for page, img in enumerate(imgs):
 78 |         analyzer.img = img
 79 |         result, _, _ = await analyzer.run(img)
 80 |         results.append(result)
 81 |         await ctx.report_progress(page + 1, len(imgs))
 82 | 
 83 |     if output_format == "json":
 84 |         return json.dumps(
 85 |             [
 86 |                 convert_json(
 87 |                     result,
 88 |                     out_path=None,
 89 |                     ignore_line_break=True,
 90 |                     img=img,
 91 |                     export_figure=False,
 92 |                     figure_dir=None,
 93 |                 ).model_dump()
 94 |                 for img, result in zip(imgs, results)
 95 |             ],
 96 |             ensure_ascii=False,
 97 |             sort_keys=True,
 98 |             separators=(",", ": "),
 99 |         )
100 |     elif output_format == "markdown":
101 |         return "\n".join(
102 |             [
103 |                 convert_markdown(
104 |                     result,
105 |                     out_path=None,
106 |                     ignore_line_break=True,
107 |                     img=img,
108 |                     export_figure=False,
109 |                 )[0]
110 |                 for img, result in zip(imgs, results)
111 |             ]
112 |         )
113 |     elif output_format == "html":
114 |         return "\n".join(
115 |             [
116 |                 convert_html(
117 |                     result,
118 |                     out_path=None,
119 |                     ignore_line_break=True,
120 |                     img=img,
121 |                     export_figure=False,
122 |                     export_figure_letter="",
123 |                 )[0]
124 |                 for img, result in zip(imgs, results)
125 |             ]
126 |         )
127 |     elif output_format == "csv":
128 |         output = io.StringIO()
129 |         writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
130 |         for img, result in zip(imgs, results):
131 |             elements = convert_csv(
132 |                 result,
133 |                 out_path=None,
134 |                 ignore_line_break=True,
135 |                 img=img,
136 |                 export_figure=False,
137 |             )
138 |             for element in elements:
139 |                 if element["type"] == "table":
140 |                     writer.writerows(element["element"])
141 |                 else:
142 |                     writer.writerow([element["element"]])
143 |                 writer.writerow([""])
144 |         return output.getvalue()
145 |     else:
146 |         raise ValueError(
147 |             f"Unsupported output format: {output_format}."
148 |             " Supported formats are json, markdown, html or csv."
149 |         )
150 | 
151 | 
@mcp.resource("file://list")
async def get_file_list() -> list[str]:
    """
    List the entries of the resource directory.

    Returns:
        list[str]: The names returned by ``os.listdir`` for the resource directory.
    """
    entries = os.listdir(RESOURCE_DIR)
    return entries
161 | 
162 | 
def run_mcp_server(transport="stdio", mount_path=None):
    """
    Run the MCP server.

    Args:
        transport: ``"stdio"`` (default) or ``"sse"``.
        mount_path: Mount path forwarded to the server; only used with ``"sse"``.

    Raises:
        ValueError: If ``transport`` is neither ``"stdio"`` nor ``"sse"``.
    """

    if transport == "stdio":
        mcp.run()
    elif transport == "sse":
        mcp.run(transport=transport, mount_path=mount_path)
    else:
        # An unknown transport used to return silently without starting anything.
        raise ValueError(
            f"Unsupported transport: {transport}. Supported transports are stdio or sse."
        )
172 | 
173 | 
def main():
    """Parse the command-line options and start the MCP server with them."""
    cli = ArgumentParser(description="Run the MCP server.")

    # Transport selection; stdio is used when the option is omitted.
    cli.add_argument(
        "--transport",
        "-t",
        type=str,
        default="stdio",
        choices=["stdio", "sse"],
        help="Transport method for the MCP server.",
    )
    cli.add_argument(
        "--mount_path",
        "-m",
        type=str,
        default=None,
        help="Mount path for the MCP server (only used with SSE transport).",
    )

    options = cli.parse_args()
    run_mcp_server(transport=options.transport, mount_path=options.mount_path)
193 | 
194 | 
195 | if __name__ == "__main__":
196 |     main()
197 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/__init__.py:
--------------------------------------------------------------------------------
 1 | from .cfg_layout_parser_rtdtrv2 import LayoutParserRTDETRv2Config
 2 | from .cfg_layout_parser_rtdtrv2_v2 import LayoutParserRTDETRv2V2Config
 3 | from .cfg_table_structure_recognizer_rtdtrv2 import (
 4 |     TableStructureRecognizerRTDETRv2Config,
 5 | )
 6 | from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
 7 | from .cfg_text_detector_dbnet_v2 import TextDetectorDBNetV2Config
 8 | from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
 9 | from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig
10 | from .cfg_text_recognizer_parseq_v2 import TextRecognizerPARSeqV2Config
11 | 
12 | 
13 | __all__ = [
14 |     "TextDetectorDBNetConfig",
15 |     "TextRecognizerPARSeqConfig",
16 |     "LayoutParserRTDETRv2Config",
17 |     "TableStructureRecognizerRTDETRv2Config",
18 |     "TextRecognizerPARSeqSmallConfig",
19 |     "LayoutParserRTDETRv2V2Config",
20 |     "TextDetectorDBNetV2Config",
21 |     "TextRecognizerPARSeqV2Config",
22 | ]
23 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_layout_parser_rtdtrv2.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | 
@dataclass
class Data:
    """Defaults for the ``data`` section of ``LayoutParserRTDETRv2Config``."""

    # Two-element size list; presumably the model input resolution — confirm
    # the axis order against the code that consumes it.
    img_size: List[int] = field(default_factory=lambda: [640, 640])
 8 | 
 9 | 
@dataclass
class BackBone:
    """Defaults for the ``PResNet`` section of ``LayoutParserRTDETRv2Config``."""

    depth: int = 50
    variant: str = "d"
    freeze_at: int = 0
    # Presumably the indices of the backbone stages whose outputs are used —
    # confirm against the PResNet implementation.
    return_idx: List[int] = field(default_factory=lambda: [1, 2, 3])
    num_stages: int = 4
    freeze_norm: bool = True
18 | 
19 | 
@dataclass
class Encoder:
    """Defaults for the ``HybridEncoder`` section of ``LayoutParserRTDETRv2Config``."""

    # Both lists have three entries, one per input feature level.
    in_channels: List[int] = field(default_factory=lambda: [512, 1024, 2048])
    feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])

    # intra
    hidden_dim: int = 256
    use_encoder_idx: List[int] = field(default_factory=lambda: [2])
    num_encoder_layers: int = 1
    nhead: int = 8
    dim_feedforward: int = 1024
    dropout: float = 0.0
    enc_act: str = "gelu"

    # cross
    expansion: float = 1.0
    depth_mult: int = 1
    act: str = "silu"
38 | 
39 | 
@dataclass
class Decoder:
    """Defaults for the ``RTDETRTransformerv2`` section of ``LayoutParserRTDETRv2Config``."""

    # Equals the number of entries in LayoutParserRTDETRv2Config.category.
    num_classes: int = 6
    feat_channels: List[int] = field(default_factory=lambda: [256, 256, 256])
    feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])
    hidden_dim: int = 256
    num_levels: int = 3

    num_layers: int = 6
    num_queries: int = 300

    # Denoising-related settings; presumably only relevant to training — confirm.
    num_denoising: int = 100
    label_noise_ratio: float = 0.5
    box_noise_scale: float = 1.0
    # Same default as Data.img_size.
    eval_spatial_size: List[int] = field(default_factory=lambda: [640, 640])

    eval_idx: int = -1

    num_points: List[int] = field(default_factory=lambda: [4, 4, 4])
    cross_attn_method: str = "default"
    query_select_method: str = "default"
61 | 
62 | 
@dataclass
class LayoutParserRTDETRv2Config:
    """Top-level default configuration for the RT-DETRv2 layout parser (open beta)."""

    # Hugging Face Hub repository id of the pretrained weights.
    hf_hub_repo: str = "KotaroKinoshita/yomitoku-layout-parser-rtdtrv2-open-beta"
    # Presumably the minimum detection score that is kept — confirm in the parser.
    thresh_score: float = 0.5
    data: Data = field(default_factory=Data)
    PResNet: BackBone = field(default_factory=BackBone)
    HybridEncoder: Encoder = field(default_factory=Encoder)
    RTDETRTransformerv2: Decoder = field(default_factory=Decoder)

    # Six category names, the same count as Decoder.num_classes; presumably
    # index-aligned with the model's class ids — confirm.
    category: List[str] = field(
        default_factory=lambda: [
            "tables",
            "figures",
            "paragraphs",
            "section_headings",
            "page_header",
            "page_footer",
        ]
    )

    # Subset of ``category``; how these roles are applied is not visible here.
    role: List[str] = field(
        default_factory=lambda: [
            "section_headings",
            "page_header",
            "page_footer",
        ]
    )
90 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_layout_parser_rtdtrv2_v2.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | 
@dataclass
class Data:
    """Defaults for the ``data`` section of ``LayoutParserRTDETRv2V2Config``."""

    # Two-element size list; presumably the model input resolution — confirm
    # the axis order against the code that consumes it.
    img_size: List[int] = field(default_factory=lambda: [640, 640])
 8 | 
 9 | 
@dataclass
class BackBone:
    """Defaults for the ``PResNet`` section of ``LayoutParserRTDETRv2V2Config``."""

    depth: int = 50
    variant: str = "d"
    freeze_at: int = 0
    # Presumably the indices of the backbone stages whose outputs are used —
    # confirm against the PResNet implementation.
    return_idx: List[int] = field(default_factory=lambda: [1, 2, 3])
    num_stages: int = 4
    freeze_norm: bool = True
18 | 
19 | 
@dataclass
class Encoder:
    """Defaults for the ``HybridEncoder`` section of ``LayoutParserRTDETRv2V2Config``."""

    # Both lists have three entries, one per input feature level.
    in_channels: List[int] = field(default_factory=lambda: [512, 1024, 2048])
    feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])

    # intra
    hidden_dim: int = 256
    use_encoder_idx: List[int] = field(default_factory=lambda: [2])
    num_encoder_layers: int = 1
    nhead: int = 8
    dim_feedforward: int = 1024
    dropout: float = 0.0
    enc_act: str = "gelu"

    # cross
    expansion: float = 1.0
    depth_mult: int = 1
    act: str = "silu"
38 | 
39 | 
@dataclass
class Decoder:
    """Defaults for the ``RTDETRTransformerv2`` section of ``LayoutParserRTDETRv2V2Config``."""

    # Equals the number of entries in LayoutParserRTDETRv2V2Config.category.
    num_classes: int = 6
    feat_channels: List[int] = field(default_factory=lambda: [256, 256, 256])
    feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])
    hidden_dim: int = 256
    num_levels: int = 3

    num_layers: int = 6
    num_queries: int = 300

    # Denoising-related settings; presumably only relevant to training — confirm.
    num_denoising: int = 100
    label_noise_ratio: float = 0.5
    box_noise_scale: float = 1.0
    # Same default as Data.img_size.
    eval_spatial_size: List[int] = field(default_factory=lambda: [640, 640])

    eval_idx: int = -1

    num_points: List[int] = field(default_factory=lambda: [4, 4, 4])
    cross_attn_method: str = "default"
    query_select_method: str = "default"
61 | 
62 | 
@dataclass
class LayoutParserRTDETRv2V2Config:
    """Top-level default configuration for the v2 weights of the RT-DETRv2 layout parser."""

    # Hugging Face Hub repository id of the pretrained weights.
    hf_hub_repo: str = "KotaroKinoshita/yomitoku-layout-parser-rtdtrv2-v2"
    # Presumably the minimum detection score that is kept — confirm in the parser.
    thresh_score: float = 0.5
    data: Data = field(default_factory=Data)
    PResNet: BackBone = field(default_factory=BackBone)
    HybridEncoder: Encoder = field(default_factory=Encoder)
    RTDETRTransformerv2: Decoder = field(default_factory=Decoder)

    # Six category names, the same count as Decoder.num_classes; presumably
    # index-aligned with the model's class ids — confirm.
    category: List[str] = field(
        default_factory=lambda: [
            "tables",
            "figures",
            "paragraphs",
            "section_headings",
            "page_header",
            "page_footer",
        ]
    )

    # Subset of ``category``; how these roles are applied is not visible here.
    role: List[str] = field(
        default_factory=lambda: [
            "section_headings",
            "page_header",
            "page_footer",
        ]
    )
90 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | 
@dataclass
class Data:
    """Defaults for the ``data`` section of ``TableStructureRecognizerRTDETRv2Config``."""

    # Two-element size list; presumably the model input resolution — confirm
    # the axis order against the code that consumes it.
    img_size: List[int] = field(default_factory=lambda: [640, 640])
 8 | 
 9 | 
@dataclass
class BackBone:
    """Defaults for the ``PResNet`` section of ``TableStructureRecognizerRTDETRv2Config``."""

    depth: int = 50
    variant: str = "d"
    freeze_at: int = 0
    # Presumably the indices of the backbone stages whose outputs are used —
    # confirm against the PResNet implementation.
    return_idx: List[int] = field(default_factory=lambda: [1, 2, 3])
    num_stages: int = 4
    freeze_norm: bool = True
18 | 
19 | 
@dataclass
class Encoder:
    """Defaults for the ``HybridEncoder`` section of ``TableStructureRecognizerRTDETRv2Config``."""

    # Both lists have three entries, one per input feature level.
    in_channels: List[int] = field(default_factory=lambda: [512, 1024, 2048])
    feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])

    # intra
    hidden_dim: int = 256
    use_encoder_idx: List[int] = field(default_factory=lambda: [2])
    num_encoder_layers: int = 1
    nhead: int = 8
    dim_feedforward: int = 1024
    dropout: float = 0.0
    enc_act: str = "gelu"

    # cross
    expansion: float = 1.0
    depth_mult: int = 1
    act: str = "silu"
38 | 
39 | 
@dataclass
class Decoder:
    """Defaults for the ``RTDETRTransformerv2`` section of ``TableStructureRecognizerRTDETRv2Config``."""

    # Equals the number of entries in TableStructureRecognizerRTDETRv2Config.category.
    num_classes: int = 3
    feat_channels: List[int] = field(default_factory=lambda: [256, 256, 256])
    feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])
    hidden_dim: int = 256
    num_levels: int = 3

    num_layers: int = 6
    num_queries: int = 300

    # Denoising-related settings; presumably only relevant to training — confirm.
    num_denoising: int = 100
    label_noise_ratio: float = 0.5
    box_noise_scale: float = 1.0  # 1.0 0.4 (author's note; meaning of the second value is unclear)
    # Same default as Data.img_size.
    eval_spatial_size: List[int] = field(default_factory=lambda: [640, 640])

    eval_idx: int = -1

    num_points: List[int] = field(default_factory=lambda: [4, 4, 4])
    cross_attn_method: str = "default"
    query_select_method: str = "default"
61 | 
62 | 
@dataclass
class TableStructureRecognizerRTDETRv2Config:
    """Top-level default configuration for the RT-DETRv2 table structure recognizer."""

    # Hugging Face Hub repository id of the pretrained weights.
    hf_hub_repo: str = (
        "KotaroKinoshita/yomitoku-table-structure-recognizer-rtdtrv2-open-beta"
    )
    # Presumably the minimum detection score that is kept — confirm in the recognizer.
    thresh_score: float = 0.4
    data: Data = field(default_factory=Data)
    PResNet: BackBone = field(default_factory=BackBone)
    HybridEncoder: Encoder = field(default_factory=Encoder)
    RTDETRTransformerv2: Decoder = field(default_factory=Decoder)

    # Three category names, the same count as Decoder.num_classes; presumably
    # index-aligned with the model's class ids — confirm.
    category: List[str] = field(
        default_factory=lambda: [
            "row",
            "col",
            "span",
        ]
    )
81 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_text_detector_dbnet.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | 
@dataclass
class BackBone:
    """Backbone section (``backbone``) of ``TextDetectorDBNetConfig``."""

    name: str = "resnet50"  # backbone identifier; how it is resolved is outside this file
    dilation: bool = True  # NOTE(review): presumably enables dilated convolutions — confirm in the model code
 9 | 
10 | 
@dataclass
class Decoder:
    """Decoder section (``decoder``) of ``TextDetectorDBNetConfig``.

    NOTE(review): field semantics are defined by the DBNet decoder
    implementation, which is not visible here.
    """

    # Annotated with typing.List, like the other list-valued config fields in
    # this file, instead of the builtin generic ``list[int]``.
    in_channels: List[int] = field(default_factory=lambda: [256, 512, 1024, 2048])
    hidden_dim: int = 256
    adaptive: bool = True
    serial: bool = True
    smooth: bool = False
    k: int = 50
19 | 
20 | 
@dataclass
class Data:
    """Input-size section (``data``) of ``TextDetectorDBNetConfig``.

    NOTE(review): presumably a target for the image's shorter side and an
    upper bound on image size, in pixels — confirm in the preprocessing code.
    """

    shortest_size: int = 1280
    limit_size: int = 1600
25 | 
26 | 
@dataclass
class PostProcess:
    """Post-processing section (``post_process``) of ``TextDetectorDBNetConfig``.

    NOTE(review): the names match the usual DBNet post-processing vocabulary
    (map threshold, box score threshold, candidate cap, unclip ratio); the
    exact meaning is defined by the post-processor, not visible here.
    """

    min_size: int = 2
    thresh: float = 0.15
    box_thresh: float = 0.5
    max_candidates: int = 1500
    unclip_ratio: float = 7.0
34 | 
35 | 
@dataclass
class Visualize:
    """Visualization section (``visualize``) of ``TextDetectorDBNetConfig``."""

    # Three-component colour; NOTE(review): channel order is not stated here — confirm in the visualizer.
    color: List[int] = field(default_factory=lambda: [0, 255, 0])
    heatmap: bool = False
40 | 
41 | 
@dataclass
class TextDetectorDBNetConfig:
    """Top-level default configuration for the DBNet text detector.

    Holds the Hugging Face Hub repository name and one instance of each
    sub-config declared in this module; each sub-config is created per
    instance through ``default_factory``.
    """

    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-detector-dbnet-open-beta"
    backbone: BackBone = field(default_factory=BackBone)
    decoder: Decoder = field(default_factory=Decoder)
    data: Data = field(default_factory=Data)
    post_process: PostProcess = field(default_factory=PostProcess)
    visualize: Visualize = field(default_factory=Visualize)
50 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_text_detector_dbnet_v2.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | 
@dataclass
class BackBone:
    """Backbone section (``backbone``) of ``TextDetectorDBNetV2Config``."""

    name: str = "resnet50"  # backbone identifier; how it is resolved is outside this file
    dilation: bool = True  # NOTE(review): presumably enables dilated convolutions — confirm in the model code
 9 | 
10 | 
@dataclass
class Decoder:
    """Decoder section (``decoder``) of ``TextDetectorDBNetV2Config``.

    NOTE(review): field semantics are defined by the DBNet decoder
    implementation, which is not visible here.
    """

    # Annotated with typing.List, like the other list-valued config fields in
    # this file, instead of the builtin generic ``list[int]``.
    in_channels: List[int] = field(default_factory=lambda: [256, 512, 1024, 2048])
    hidden_dim: int = 256
    adaptive: bool = True
    serial: bool = True
    smooth: bool = False
    k: int = 50
19 | 
20 | 
@dataclass
class Data:
    """Input-size section (``data``) of ``TextDetectorDBNetV2Config``.

    NOTE(review): presumably a target for the image's shorter side and an
    upper bound on image size, in pixels — confirm in the preprocessing code.
    """

    shortest_size: int = 1280
    limit_size: int = 1600
25 | 
26 | 
@dataclass
class PostProcess:
    """Post-processing section (``post_process``) of ``TextDetectorDBNetV2Config``.

    NOTE(review): the names match the usual DBNet post-processing vocabulary
    (map threshold, box score threshold, candidate cap, unclip ratio); the
    exact meaning is defined by the post-processor, not visible here.
    """

    min_size: int = 2
    thresh: float = 0.2
    box_thresh: float = 0.5
    max_candidates: int = 1500
    unclip_ratio: float = 5.0
34 | 
35 | 
@dataclass
class Visualize:
    """Visualization section (``visualize``) of ``TextDetectorDBNetV2Config``."""

    # Three-component colour; NOTE(review): channel order is not stated here — confirm in the visualizer.
    color: List[int] = field(default_factory=lambda: [0, 255, 0])
    heatmap: bool = False
40 | 
41 | 
@dataclass
class TextDetectorDBNetV2Config:
    """Top-level default configuration for the v2 DBNet text detector.

    Holds the Hugging Face Hub repository name and one instance of each
    sub-config declared in this module; each sub-config is created per
    instance through ``default_factory``.
    """

    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-detector-dbnet-v2"
    backbone: BackBone = field(default_factory=BackBone)
    decoder: Decoder = field(default_factory=Decoder)
    data: Data = field(default_factory=Data)
    post_process: PostProcess = field(default_factory=PostProcess)
    visualize: Visualize = field(default_factory=Visualize)
50 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_text_recognizer_parseq.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | from ..constants import ROOT_DIR
 5 | 
 6 | 
@dataclass
class Data:
    """Data section (``data``) of ``TextRecognizerPARSeqConfig``."""

    num_workers: int = 4
    batch_size: int = 128
    # NOTE(review): presumably [height, width] of the recognizer input — confirm against the resize helper.
    img_size: List[int] = field(default_factory=lambda: [32, 800])
12 | 
13 | 
@dataclass
class Encoder:
    """Encoder section (``encoder``) of ``TextRecognizerPARSeqConfig``.

    NOTE(review): the fields look like transformer hyper-parameters; their
    exact use is defined by the model code, not visible here.
    """

    patch_size: List[int] = field(default_factory=lambda: [8, 8])
    num_heads: int = 8
    embed_dim: int = 512
    mlp_ratio: int = 4
    depth: int = 12
21 | 
22 | 
@dataclass
class Decoder:
    """Decoder section (``decoder``) of ``TextRecognizerPARSeqConfig``.

    The default ``embed_dim`` equals the encoder's default in this module (512).
    """

    embed_dim: int = 512
    num_heads: int = 8
    mlp_ratio: int = 4
    depth: int = 1
29 | 
30 | 
@dataclass
class Visualize:
    """Visualization section (``visualize``) of ``TextRecognizerPARSeqConfig``."""

    # Font file path built by string concatenation under the package's ROOT_DIR.
    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
    font_size: int = 18
36 | 
37 | 
@dataclass
class TextRecognizerPARSeqConfig:
    """Top-level default configuration for the PARSeq text recognizer."""

    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-open-beta"
    # Charset file path built by string concatenation under the package's ROOT_DIR.
    charset: str = str(ROOT_DIR + "/resource/charset.txt")
    # NOTE(review): presumably tied to the size of the charset file — confirm before editing either.
    num_tokens: int = 7312
    max_label_length: int = 100
    decode_ar: int = 1
    refine_iters: int = 1

    data: Data = field(default_factory=Data)
    encoder: Encoder = field(default_factory=Encoder)
    decoder: Decoder = field(default_factory=Decoder)

    visualize: Visualize = field(default_factory=Visualize)
52 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_text_recognizer_parseq_small.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | from ..constants import ROOT_DIR
 5 | 
 6 | 
@dataclass
class Data:
    """Data section (``data``) of ``TextRecognizerPARSeqSmallConfig``."""

    num_workers: int = 4
    batch_size: int = 128
    # NOTE(review): presumably [height, width] of the recognizer input — confirm against the resize helper.
    img_size: List[int] = field(default_factory=lambda: [32, 800])
12 | 
13 | 
@dataclass
class Encoder:
    """Encoder section (``encoder``) of ``TextRecognizerPARSeqSmallConfig``.

    NOTE(review): the fields look like transformer hyper-parameters; their
    exact use is defined by the model code, not visible here.
    """

    patch_size: List[int] = field(default_factory=lambda: [16, 16])
    num_heads: int = 8
    embed_dim: int = 384
    mlp_ratio: int = 4
    depth: int = 9
21 | 
22 | 
@dataclass
class Decoder:
    """Decoder section (``decoder``) of ``TextRecognizerPARSeqSmallConfig``.

    The default ``embed_dim`` equals the encoder's default in this module (384).
    """

    embed_dim: int = 384
    num_heads: int = 8
    mlp_ratio: int = 4
    depth: int = 1
29 | 
30 | 
@dataclass
class Visualize:
    """Visualization section (``visualize``) of ``TextRecognizerPARSeqSmallConfig``."""

    # Font file path built by string concatenation under the package's ROOT_DIR.
    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
    font_size: int = 18
36 | 
37 | 
@dataclass
class TextRecognizerPARSeqSmallConfig:
    """Top-level default configuration for the small PARSeq text recognizer."""

    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta"
    # Charset file path built by string concatenation under the package's ROOT_DIR.
    charset: str = str(ROOT_DIR + "/resource/charset.txt")
    # NOTE(review): presumably tied to the size of the charset file — confirm before editing either.
    num_tokens: int = 7312
    max_label_length: int = 100
    decode_ar: int = 1
    refine_iters: int = 1

    data: Data = field(default_factory=Data)
    encoder: Encoder = field(default_factory=Encoder)
    decoder: Decoder = field(default_factory=Decoder)

    visualize: Visualize = field(default_factory=Visualize)
52 | 


--------------------------------------------------------------------------------
/src/yomitoku/configs/cfg_text_recognizer_parseq_v2.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import List
 3 | 
 4 | from ..constants import ROOT_DIR
 5 | 
 6 | 
@dataclass
class Data:
    """Data section (``data``) of ``TextRecognizerPARSeqV2Config``."""

    num_workers: int = 4
    batch_size: int = 128
    # NOTE(review): presumably [height, width] of the recognizer input — confirm against the resize helper.
    img_size: List[int] = field(default_factory=lambda: [32, 800])
12 | 
13 | 
@dataclass
class Encoder:
    """Encoder section (``encoder``) of ``TextRecognizerPARSeqV2Config``.

    NOTE(review): the fields look like transformer hyper-parameters; their
    exact use is defined by the model code, not visible here.
    """

    patch_size: List[int] = field(default_factory=lambda: [8, 8])
    num_heads: int = 8
    embed_dim: int = 512
    mlp_ratio: int = 4
    depth: int = 12
21 | 
22 | 
@dataclass
class Decoder:
    """Decoder section (``decoder``) of ``TextRecognizerPARSeqV2Config``.

    The default ``embed_dim`` equals the encoder's default in this module (512).
    """

    embed_dim: int = 512
    num_heads: int = 8
    mlp_ratio: int = 4
    depth: int = 1
29 | 
30 | 
@dataclass
class Visualize:
    """Visualization section (``visualize``) of ``TextRecognizerPARSeqV2Config``."""

    # Font file path built by string concatenation under the package's ROOT_DIR.
    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
    font_size: int = 18
36 | 
37 | 
@dataclass
class TextRecognizerPARSeqV2Config:
    """Top-level default configuration for the v2 PARSeq text recognizer."""

    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-middle-v2"
    # Charset file path built by string concatenation under the package's ROOT_DIR.
    charset: str = str(ROOT_DIR + "/resource/charset.txt")
    # NOTE(review): presumably tied to the size of the charset file — confirm before editing either.
    num_tokens: int = 7312
    max_label_length: int = 100
    decode_ar: int = 1
    refine_iters: int = 1

    data: Data = field(default_factory=Data)
    encoder: Encoder = field(default_factory=Encoder)
    decoder: Decoder = field(default_factory=Decoder)

    visualize: Visualize = field(default_factory=Visualize)
52 | 


--------------------------------------------------------------------------------
/src/yomitoku/constants.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
# Absolute path of the directory that contains this module.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Output format names ("markdown" and "md" are both listed).
SUPPORT_OUTPUT_FORMAT = ["json", "csv", "html", "markdown", "md", "pdf"]
# Input file extensions, written without the leading dot.
SUPPORT_INPUT_FORMAT = ["jpg", "jpeg", "png", "bmp", "tiff", "tif", "pdf"]
# NOTE(review): presumably pixel limits used when validating input images
# (hard minimum / warn-below size) — confirm at the call sites.
MIN_IMAGE_SIZE = 32
WARNING_IMAGE_SIZE = 720

# Fixed list of three-component colours.
# NOTE(review): channel order (RGB vs BGR) is not stated here — confirm where it is drawn.
PALETTE = [
    [255, 0, 0],
    [0, 255, 0],
    [0, 0, 255],
    [255, 255, 0],
    [0, 255, 255],
    [255, 0, 255],
    [128, 0, 0],
    [0, 128, 0],
    [0, 0, 128],
    [255, 128, 0],
    [0, 255, 128],
    [128, 0, 255],
    [128, 255, 0],
    [0, 128, 255],
    [255, 0, 128],
    [255, 128, 128],
    [128, 255, 128],
    [128, 128, 255],
    [255, 255, 128],
    [255, 128, 255],
    [128, 255, 255],
    [128, 128, 128],
]
33 | 


--------------------------------------------------------------------------------
/src/yomitoku/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .functions import load_image, load_pdf
2 | 
3 | __all__ = ["load_image", "load_pdf"]
4 | 


--------------------------------------------------------------------------------
/src/yomitoku/data/dataset.py:
--------------------------------------------------------------------------------
 1 | from torch.utils.data import Dataset
 2 | from torchvision import transforms as T
 3 | 
 4 | from .functions import (
 5 |     extract_roi_with_perspective,
 6 |     resize_with_padding,
 7 |     rotate_text_image,
 8 |     validate_quads,
 9 | )
10 | 
11 | from concurrent.futures import ThreadPoolExecutor
12 | 
13 | 
class ParseqDataset(Dataset):
    """In-memory dataset of text-region crops cut out of a single image.

    Every quad is preprocessed eagerly in ``__init__``; quads for which
    ``preprocess`` returns ``None`` are dropped, so ``len(dataset)`` may be
    smaller than ``len(quads)``. ``__getitem__`` applies ``ToTensor`` followed
    by ``Normalize(0.5, 0.5)`` to the stored crop.
    """

    def __init__(self, cfg, img, quads, num_workers=8):
        """
        Args:
            cfg: config object; ``cfg.data.img_size`` is passed to
                ``resize_with_padding``.
            img: image array the quads refer to.
            quads: iterable of quadrilaterals, one per text region.
            num_workers: number of threads used to preprocess the quads.
        """
        # NOTE(review): this used to assign ``img[:, :, ::-1]`` to ``self.img``
        # and then overwrite it with ``img`` a few lines later, so the channel
        # flip never took effect. The dead store is dropped here to keep the
        # effective behaviour; confirm whether a flip was actually intended.
        self.img = img
        self.quads = quads
        self.cfg = cfg
        self.transform = T.Compose(
            [
                T.ToTensor(),
                T.Normalize(0.5, 0.5),
            ]
        )

        # executor.map yields results in input order, so crop order follows quad order.
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            data = list(executor.map(self.preprocess, self.quads))

        self.data = [tensor for tensor in data if tensor is not None]

    def preprocess(self, quad):
        """Crop, rotate and resize one quad; return ``None`` if it is rejected."""
        if validate_quads(self.img, quad) is None:
            return None

        roi_img = extract_roi_with_perspective(self.img, quad)

        if roi_img is None:
            return None

        roi_img = rotate_text_image(roi_img, thresh_aspect=2)
        resized = resize_with_padding(roi_img, self.cfg.data.img_size)

        return resized

    def __len__(self):
        """Number of crops that survived preprocessing."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the transformed crop at ``index``."""
        return self.transform(self.data[index])
51 | 


--------------------------------------------------------------------------------
/src/yomitoku/export/__init__.py:
--------------------------------------------------------------------------------
 1 | from .export_csv import export_csv, save_csv, convert_csv
 2 | from .export_html import export_html, save_html, convert_html
 3 | from .export_json import export_json, save_json, convert_json
 4 | from .export_markdown import export_markdown, save_markdown, convert_markdown
 5 | 
 6 | __all__ = [
 7 |     "export_html",
 8 |     "export_markdown",
 9 |     "export_csv",
10 |     "export_json",
11 |     "save_html",
12 |     "save_markdown",
13 |     "save_csv",
14 |     "save_json",
15 |     "convert_html",
16 |     "convert_markdown",
17 |     "convert_csv",
18 |     "convert_json",
19 | ]
20 | 


--------------------------------------------------------------------------------
/src/yomitoku/export/export_csv.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import os
  3 | 
  4 | from ..utils.misc import save_image
  5 | 
  6 | 
  7 | def table_to_csv(table, ignore_line_break):
  8 |     num_rows = table.n_row
  9 |     num_cols = table.n_col
 10 | 
 11 |     table_array = [["" for _ in range(num_cols)] for _ in range(num_rows)]
 12 | 
 13 |     for cell in table.cells:
 14 |         row = cell.row - 1
 15 |         col = cell.col - 1
 16 |         row_span = cell.row_span
 17 |         col_span = cell.col_span
 18 |         contents = cell.contents
 19 | 
 20 |         if ignore_line_break:
 21 |             contents = contents.replace("\n", "")
 22 | 
 23 |         for i in range(row, row + row_span):
 24 |             for j in range(col, col + col_span):
 25 |                 if i == row and j == col:
 26 |                     table_array[i][j] = contents
 27 |     return table_array
 28 | 
 29 | 
 30 | def paragraph_to_csv(paragraph, ignore_line_break):
 31 |     contents = paragraph.contents
 32 | 
 33 |     if ignore_line_break:
 34 |         contents = contents.replace("\n", "")
 35 | 
 36 |     return contents
 37 | 
 38 | 
 39 | def save_figure(
 40 |     figures,
 41 |     img,
 42 |     out_path,
 43 |     figure_dir="figures",
 44 | ):
 45 |     assert img is not None, "img is required for saving figures"
 46 | 
 47 |     for i, figure in enumerate(figures):
 48 |         x1, y1, x2, y2 = map(int, figure.box)
 49 |         figure_img = img[y1:y2, x1:x2, :]
 50 |         save_dir = os.path.dirname(out_path)
 51 |         save_dir = os.path.join(save_dir, figure_dir)
 52 |         os.makedirs(save_dir, exist_ok=True)
 53 | 
 54 |         filename = os.path.splitext(os.path.basename(out_path))[0]
 55 |         figure_name = f"{filename}_figure_{i}.png"
 56 |         figure_path = os.path.join(save_dir, figure_name)
 57 |         save_image(figure_img, figure_path)
 58 | 
 59 | 
 60 | def convert_csv(
 61 |     inputs,
 62 |     out_path,
 63 |     ignore_line_break,
 64 |     img=None,
 65 |     export_figure: bool = True,
 66 |     figure_dir="figures",
 67 | ):
 68 |     elements = []
 69 |     for table in inputs.tables:
 70 |         table_csv = table_to_csv(table, ignore_line_break)
 71 | 
 72 |         elements.append(
 73 |             {
 74 |                 "type": "table",
 75 |                 "box": table.box,
 76 |                 "element": table_csv,
 77 |                 "order": table.order,
 78 |             }
 79 |         )
 80 | 
 81 |     for paraghraph in inputs.paragraphs:
 82 |         contents = paragraph_to_csv(paraghraph, ignore_line_break)
 83 |         elements.append(
 84 |             {
 85 |                 "type": "paragraph",
 86 |                 "box": paraghraph.box,
 87 |                 "element": contents,
 88 |                 "order": paraghraph.order,
 89 |             }
 90 |         )
 91 | 
 92 |     elements = sorted(elements, key=lambda x: x["order"])
 93 | 
 94 |     if export_figure:
 95 |         save_figure(
 96 |             inputs.figures,
 97 |             img,
 98 |             out_path,
 99 |             figure_dir=figure_dir,
100 |         )
101 | 
102 |     return elements
103 | 
104 | 
def export_csv(
    inputs,
    out_path: str,
    ignore_line_break: bool = False,
    encoding: str = "utf-8",
    img=None,
    export_figure: bool = True,
    figure_dir="figures",
):
    """Convert ``inputs`` with ``convert_csv``, write the result to ``out_path`` and return the elements."""
    elements = convert_csv(
        inputs,
        out_path,
        ignore_line_break,
        img=img,
        export_figure=export_figure,
        figure_dir=figure_dir,
    )
    save_csv(elements, out_path, encoding)
    return elements
125 | 
126 | 
def save_csv(
    elements,
    out_path,
    encoding,
):
    """Write the elements to ``out_path`` as CSV.

    A ``"table"`` element contributes one CSV row per table row; any other
    element contributes a single one-field row. Each element is followed by a
    row holding one empty field. Characters that ``encoding`` cannot represent
    are dropped (``errors="ignore"``).
    """
    with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as fp:
        out = csv.writer(fp, quoting=csv.QUOTE_MINIMAL)
        for item in elements:
            payload = item["element"]
            rows = payload if item["type"] == "table" else [[payload]]
            out.writerows(rows)
            out.writerow([""])
141 | 


--------------------------------------------------------------------------------
/src/yomitoku/export/export_html.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | from html import escape
  4 | from lxml import etree, html
  5 | 
  6 | from ..utils.misc import save_image
  7 | 
  8 | 
  9 | def convert_text_to_html(text):
 10 |     """
 11 |     入力されたテキストをHTMLに変換する関数。
 12 |     URLを検出してリンク化せずそのまま表示し、それ以外はHTMLエスケープする。
 13 |     """
 14 |     url_regex = re.compile(r"https?://[^\s<>]")
 15 | 
 16 |     def replace_url(match):
 17 |         url = match.group(0)
 18 |         return escape(url)
 19 | 
 20 |     return url_regex.sub(replace_url, escape(text))
 21 | 
 22 | 
 23 | def add_td_tag(contents, row_span, col_span):
 24 |     return f'<td rowspan="{row_span}" colspan="{col_span}">{contents}</td>'
 25 | 
 26 | 
 27 | def add_table_tag(contents):
 28 |     return f'<table border="1" style="border-collapse: collapse">{contents}</table>'
 29 | 
 30 | 
 31 | def add_tr_tag(contents):
 32 |     return f"<tr>{contents}</tr>"
 33 | 
 34 | 
 35 | def add_p_tag(contents):
 36 |     return f"<p>{contents}</p>"
 37 | 
 38 | 
 39 | def add_html_tag(text):
 40 |     return f"<html><body>{text}</body></html>"
 41 | 
 42 | 
 43 | def add_h1_tag(contents):
 44 |     return f"<h1>{contents}</h1>"
 45 | 
 46 | 
 47 | def table_to_html(table, ignore_line_break):
 48 |     pre_row = 1
 49 |     rows = []
 50 |     row = []
 51 |     for cell in table.cells:
 52 |         if cell.row != pre_row:
 53 |             rows.append(add_tr_tag("".join(row)))
 54 |             row = []
 55 | 
 56 |         row_span = cell.row_span
 57 |         col_span = cell.col_span
 58 |         contents = cell.contents
 59 | 
 60 |         if contents is None:
 61 |             contents = ""
 62 | 
 63 |         contents = convert_text_to_html(contents)
 64 | 
 65 |         if ignore_line_break:
 66 |             contents = contents.replace("\n", "")
 67 |         else:
 68 |             contents = contents.replace("\n", "<br>")
 69 | 
 70 |         row.append(add_td_tag(contents, row_span, col_span))
 71 |         pre_row = cell.row
 72 |     else:
 73 |         rows.append(add_tr_tag("".join(row)))
 74 | 
 75 |     table_html = add_table_tag("".join(rows))
 76 | 
 77 |     return {
 78 |         "box": table.box,
 79 |         "order": table.order,
 80 |         "html": table_html,
 81 |     }
 82 | 
 83 | 
 84 | def paragraph_to_html(paragraph, ignore_line_break):
 85 |     contents = paragraph.contents
 86 |     contents = convert_text_to_html(contents)
 87 | 
 88 |     if ignore_line_break:
 89 |         contents = contents.replace("\n", "")
 90 |     else:
 91 |         contents = contents.replace("\n", "<br>")
 92 | 
 93 |     if paragraph.role == "section_headings":
 94 |         contents = add_h1_tag(contents)
 95 | 
 96 |     return {
 97 |         "box": paragraph.box,
 98 |         "order": paragraph.order,
 99 |         "html": add_p_tag(contents),
100 |     }
101 | 
102 | 
def figure_to_html(
    figures,
    img,
    out_path,
    export_figure_letter=False,
    ignore_line_break=False,
    figure_dir="figures",
    width=200,
):
    """Save every figure as a PNG and return the HTML elements that reference them.

    Images go to ``<dirname(out_path)>/<figure_dir>/`` and are named
    ``<stem of out_path>_figure_<index>.png``. Each figure yields one
    ``<img>`` element; when ``export_figure_letter`` is true, the figure's
    paragraphs (sorted by their own ``order``) follow it, all tagged with the
    figure's ``order``.

    Args:
        figures: sequence of objects exposing ``box`` (x1, y1, x2, y2),
            ``order`` and ``paragraphs``.
        img: image array indexed as ``img[y, x, channel]``; must not be ``None``.
        out_path: path of the main export file.
        export_figure_letter: also emit the text found inside each figure.
        ignore_line_break: forwarded to ``paragraph_to_html``.
        figure_dir: sub-directory name, also used in the ``src`` attribute.
        width: value of the ``width`` attribute of each ``<img>``.

    Returns:
        list[dict]: elements with the keys ``order`` and ``html``.
    """
    assert img is not None, "img is required for saving figures"

    elements = []
    # Nothing to write: do not create the figure directory.
    if not figures:
        return elements

    # Directory and file stem do not depend on the figure, so resolve them once.
    save_dir = os.path.join(os.path.dirname(out_path), figure_dir)
    os.makedirs(save_dir, exist_ok=True)
    filename = os.path.splitext(os.path.basename(out_path))[0]

    for i, figure in enumerate(figures):
        x1, y1, x2, y2 = map(int, figure.box)
        figure_name = f"{filename}_figure_{i}.png"
        save_image(img[y1:y2, x1:x2, :], os.path.join(save_dir, figure_name))

        elements.append(
            {
                "order": figure.order,
                "html": f'<img src="{figure_dir}/{figure_name}" width="{width}"><br>',
            }
        )

        if export_figure_letter:
            for paragraph in sorted(figure.paragraphs, key=lambda x: x.order):
                # Named ``paragraph_html`` so the module-level lxml ``html``
                # import is not shadowed inside this function.
                paragraph_html = paragraph_to_html(paragraph, ignore_line_break)["html"]
                elements.append(
                    {
                        "order": figure.order,
                        "html": paragraph_html,
                    }
                )

    return elements
147 | 
148 | 
def convert_html(
    inputs,
    out_path,
    ignore_line_break,
    export_figure,
    export_figure_letter,
    img=None,
    figure_width=200,
    figure_dir="figures",
):
    """Convert one analysis result to pretty-printed HTML.

    Tables, paragraphs and (optionally) figures are rendered to HTML
    fragments, sorted by ``order``, concatenated, and re-serialised through
    lxml. An empty result yields an empty string.

    Returns:
        tuple[str, list[dict]]: the formatted HTML and the sorted elements.
    """
    elements = [table_to_html(table, ignore_line_break) for table in inputs.tables]
    elements += [
        paragraph_to_html(paragraph, ignore_line_break)
        for paragraph in inputs.paragraphs
    ]

    if export_figure:
        elements += figure_to_html(
            inputs.figures,
            img,
            out_path,
            export_figure_letter,
            ignore_line_break,
            width=figure_width,
            figure_dir=figure_dir,
        )

    # Stable sort keeps the table/paragraph/figure sequence for equal orders.
    elements.sort(key=lambda x: x["order"])

    html_string = "".join(element["html"] for element in elements)
    if html_string:
        parsed_html = html.fromstring(html_string)
        formatted_html = etree.tostring(
            parsed_html, pretty_print=True, encoding="unicode"
        )
    else:
        formatted_html = ""

    return formatted_html, elements
192 | 
193 | 
def export_html(
    inputs,
    out_path: str,
    ignore_line_break: bool = False,
    export_figure: bool = True,
    export_figure_letter: bool = False,
    img=None,
    figure_width=200,
    figure_dir="figures",
    encoding: str = "utf-8",
):
    """Convert ``inputs`` with ``convert_html``, write the HTML to ``out_path`` and return it."""
    formatted_html, _elements = convert_html(
        inputs,
        out_path,
        ignore_line_break,
        export_figure,
        export_figure_letter,
        img=img,
        figure_width=figure_width,
        figure_dir=figure_dir,
    )
    save_html(formatted_html, out_path, encoding)
    return formatted_html
219 | 
220 | 
def save_html(
    html,
    out_path,
    encoding,
):
    """Write the ``html`` string to ``out_path``.

    Characters that ``encoding`` cannot represent are dropped
    (``errors="ignore"``).
    """
    fp = open(out_path, "w", encoding=encoding, errors="ignore")
    with fp:
        fp.write(html)
228 | 


--------------------------------------------------------------------------------
/src/yomitoku/export/export_json.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | from ..utils.misc import save_image
 5 | 
 6 | 
 7 | def paragraph_to_json(paragraph, ignore_line_break):
 8 |     if ignore_line_break:
 9 |         paragraph.contents = paragraph.contents.replace("\n", "")
10 | 
11 | 
def table_to_json(table, ignore_line_break):
    """Strip newlines from every cell of ``table`` in place when ``ignore_line_break`` is true.

    The cell objects themselves are modified; nothing is returned. Cells
    whose ``contents`` is ``None`` are left untouched instead of raising.
    """
    if not ignore_line_break:
        return

    for cell in table.cells:
        if cell.contents is not None:
            cell.contents = cell.contents.replace("\n", "")
16 | 
17 | 
def save_figure(
    figures,
    img,
    out_path,
    figure_dir="figures",
):
    """Crop every figure out of ``img`` and save it as a PNG next to ``out_path``.

    Files are written to ``<dirname(out_path)>/<figure_dir>/`` and named
    ``<stem of out_path>_figure_<index>.png``.

    Args:
        figures: sequence of objects whose ``box`` is ``(x1, y1, x2, y2)``.
        img: image array indexed as ``img[y, x, channel]``; must not be ``None``.
        out_path: path of the main export file, used to derive directory and stem.
        figure_dir: sub-directory name for the figure images.
    """
    assert img is not None, "img is required for saving figures"

    # Nothing to write: do not create the figure directory.
    if not figures:
        return

    # Directory and file stem do not depend on the figure, so resolve them once.
    save_dir = os.path.join(os.path.dirname(out_path), figure_dir)
    os.makedirs(save_dir, exist_ok=True)
    filename = os.path.splitext(os.path.basename(out_path))[0]

    for i, figure in enumerate(figures):
        x1, y1, x2, y2 = map(int, figure.box)
        figure_path = os.path.join(save_dir, f"{filename}_figure_{i}.png")
        save_image(img[y1:y2, x1:x2, :], figure_path)
37 | 
38 | 
def convert_json(inputs, out_path, ignore_line_break, img, export_figure, figure_dir):
    """Prepare ``inputs`` for JSON export and return it.

    Only ``DocumentAnalyzerSchema`` instances are processed: newline stripping
    is applied in place to tables and paragraphs, and figures are saved when
    ``export_figure`` is true. Any other input is returned unchanged.
    """
    # Imported here rather than at module level, as in the original code.
    from yomitoku.document_analyzer import DocumentAnalyzerSchema

    if not isinstance(inputs, DocumentAnalyzerSchema):
        return inputs

    for table in inputs.tables:
        table_to_json(table, ignore_line_break)

    for paragraph in inputs.paragraphs:
        paragraph_to_json(paragraph, ignore_line_break)

    if export_figure:
        save_figure(inputs.figures, img, out_path, figure_dir=figure_dir)

    return inputs
59 | 
60 | 
def export_json(
    inputs,
    out_path,
    ignore_line_break=False,
    encoding: str = "utf-8",
    img=None,
    export_figure=False,
    figure_dir="figures",
):
    """Run ``convert_json`` on ``inputs``, dump ``inputs.model_dump()`` to ``out_path`` and return ``inputs``."""
    converted = convert_json(
        inputs, out_path, ignore_line_break, img, export_figure, figure_dir
    )
    payload = converted.model_dump()
    save_json(payload, out_path, encoding)
    return converted
86 | 
87 | 
def save_json(data, out_path, encoding):
    """Write ``data`` to ``out_path`` as JSON.

    Output uses a 4-space indent, sorted keys and unescaped non-ASCII text.
    Characters that ``encoding`` cannot represent are dropped
    (``errors="ignore"``).
    """
    dump_options = {
        "ensure_ascii": False,
        "indent": 4,
        "sort_keys": True,
        "separators": (",", ": "),
    }
    with open(out_path, "w", encoding=encoding, errors="ignore") as fp:
        json.dump(data, fp, **dump_options)
98 | 


--------------------------------------------------------------------------------
/src/yomitoku/export/export_markdown.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | 
  4 | from ..utils.misc import save_image
  5 | 
  6 | 
  7 | def escape_markdown_special_chars(text):
  8 |     special_chars = r"([`*{}[\]()#+!~|-])"
  9 |     return re.sub(special_chars, r"\\\1", text)
 10 | 
 11 | 
 12 | def paragraph_to_md(paragraph, ignore_line_break):
 13 |     contents = escape_markdown_special_chars(paragraph.contents)
 14 | 
 15 |     if ignore_line_break:
 16 |         contents = contents.replace("\n", "")
 17 |     else:
 18 |         contents = contents.replace("\n", "<br>")
 19 | 
 20 |     if paragraph.role == "section_headings":
 21 |         contents = "# " + contents
 22 | 
 23 |     return {
 24 |         "order": paragraph.order,
 25 |         "box": paragraph.box,
 26 |         "md": contents + "\n",
 27 |     }
 28 | 
 29 | 
 30 | def table_to_md(table, ignore_line_break):
 31 |     num_rows = table.n_row
 32 |     num_cols = table.n_col
 33 | 
 34 |     table_array = [["" for _ in range(num_cols)] for _ in range(num_rows)]
 35 | 
 36 |     for cell in table.cells:
 37 |         row = cell.row - 1
 38 |         col = cell.col - 1
 39 |         row_span = cell.row_span
 40 |         col_span = cell.col_span
 41 |         contents = cell.contents
 42 | 
 43 |         for i in range(row, row + row_span):
 44 |             for j in range(col, col + col_span):
 45 |                 contents = escape_markdown_special_chars(contents)
 46 |                 if ignore_line_break:
 47 |                     contents = contents.replace("\n", "")
 48 |                 else:
 49 |                     contents = contents.replace("\n", "<br>")
 50 | 
 51 |                 if i == row and j == col:
 52 |                     table_array[i][j] = contents
 53 | 
 54 |     table_md = ""
 55 |     for i in range(num_rows):
 56 |         row = "|".join(table_array[i])
 57 |         table_md += f"|{row}|\n"
 58 | 
 59 |         if i == 0:
 60 |             header = "|".join(["-" for _ in range(num_cols)])
 61 |             table_md += f"|{header}|\n"
 62 | 
 63 |     return {
 64 |         "order": table.order,
 65 |         "box": table.box,
 66 |         "md": table_md,
 67 |     }
 68 | 
 69 | 
 70 | def figure_to_md(
 71 |     figures,
 72 |     img,
 73 |     out_path,
 74 |     export_figure_letter=False,
 75 |     ignore_line_break=False,
 76 |     width=200,
 77 |     figure_dir="figures",
 78 | ):
 79 |     assert img is not None, "img is required for saving figures"
 80 | 
 81 |     elements = []
 82 |     for i, figure in enumerate(figures):
 83 |         x1, y1, x2, y2 = map(int, figure.box)
 84 |         figure_img = img[y1:y2, x1:x2, :]
 85 |         save_dir = os.path.dirname(out_path)
 86 |         save_dir = os.path.join(save_dir, figure_dir)
 87 |         os.makedirs(save_dir, exist_ok=True)
 88 | 
 89 |         filename = os.path.splitext(os.path.basename(out_path))[0]
 90 |         figure_name = f"{filename}_figure_{i}.png"
 91 |         figure_path = os.path.join(save_dir, figure_name)
 92 |         save_image(figure_img, figure_path)
 93 | 
 94 |         elements.append(
 95 |             {
 96 |                 "order": figure.order,
 97 |                 "md": f'<img src="{figure_dir}/{figure_name}" width="{width}px"><br>',
 98 |             }
 99 |         )
100 | 
101 |         if export_figure_letter:
102 |             paragraphs = sorted(figure.paragraphs, key=lambda x: x.order)
103 |             for paragraph in paragraphs:
104 |                 element = paragraph_to_md(paragraph, ignore_line_break)
105 |                 element = {
106 |                     "order": figure.order,
107 |                     "md": element["md"],
108 |                 }
109 |                 elements.append(element)
110 | 
111 |     return elements
112 | 
113 | 
def convert_markdown(
    inputs,
    out_path,
    ignore_line_break=False,
    img=None,
    export_figure_letter=False,
    export_figure=True,
    figure_width=200,
    figure_dir="figures",
):
    """Convert one analysis result to Markdown.

    Tables, paragraphs and (optionally) figures are rendered to Markdown
    fragments, sorted by ``order`` and joined with newlines.

    Returns:
        tuple[str, list[dict]]: the Markdown text and the sorted elements.
    """
    elements = [table_to_md(table, ignore_line_break) for table in inputs.tables]
    elements += [
        paragraph_to_md(paragraph, ignore_line_break)
        for paragraph in inputs.paragraphs
    ]

    if export_figure:
        elements += figure_to_md(
            inputs.figures,
            img,
            out_path,
            export_figure_letter,
            ignore_line_break,
            figure_width,
            figure_dir=figure_dir,
        )

    # Stable sort keeps the table/paragraph/figure sequence for equal orders.
    elements.sort(key=lambda x: x["order"])
    markdown = "\n".join(element["md"] for element in elements)
    return markdown, elements
147 | 
148 | 
def export_markdown(
    inputs,
    out_path: str,
    ignore_line_break: bool = False,
    img=None,
    export_figure_letter=False,
    export_figure=True,
    figure_width=200,
    figure_dir="figures",
    encoding: str = "utf-8",
):
    """Convert analysis results to Markdown and write them to ``out_path``.

    All conversion options are passed through to ``convert_markdown``; the
    resulting text is written with ``save_markdown`` using ``encoding``.

    Returns:
        The Markdown text that was written.
    """
    # Only the rendered text is needed here; the element list is discarded.
    markdown, _ = convert_markdown(
        inputs,
        out_path,
        ignore_line_break=ignore_line_break,
        img=img,
        export_figure_letter=export_figure_letter,
        export_figure=export_figure,
        figure_width=figure_width,
        figure_dir=figure_dir,
    )
    save_markdown(markdown, out_path, encoding)
    return markdown
173 | 
174 | 
def save_markdown(markdown, out_path, encoding):
    """Write ``markdown`` to ``out_path`` as text in the given ``encoding``.

    The file is opened with ``errors="ignore"``, so characters that cannot be
    represented in ``encoding`` are dropped instead of raising.
    """
    with open(out_path, "w", encoding=encoding, errors="ignore") as md_file:
        md_file.write(markdown)
182 | 


--------------------------------------------------------------------------------
/src/yomitoku/layout_analyzer.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from .base import BaseSchema
 4 | from .layout_parser import Element, LayoutParser
 5 | from .table_structure_recognizer import (
 6 |     TableStructureRecognizer,
 7 |     TableStructureRecognizerSchema,
 8 | )
 9 | 
10 | 
class LayoutAnalyzerSchema(BaseSchema):
    # Result container built by LayoutAnalyzer.__call__.
    # Paragraph elements taken from the layout parser's results.
    paragraphs: List[Element]
    # One table-structure result per table box found by the layout parser.
    tables: List[TableStructureRecognizerSchema]
    # Figure elements taken from the layout parser's results.
    figures: List[Element]
15 | 
16 | 
class LayoutAnalyzer:
    """Runs layout parsing followed by table structure recognition."""

    def __init__(self, configs={}, device="cuda", visualize=False):
        """Create the layout parser and the table structure recognizer.

        Args:
            configs: Dict that may contain ``"layout_parser"`` and/or
                ``"table_structure_recognizer"`` entries; each entry overrides
                the keyword arguments passed to the matching component.
            device: Default ``device`` keyword for both components.
            visualize: Default ``visualize`` keyword for both components.

        Raises:
            ValueError: If ``configs`` is not a dict.
        """
        if not isinstance(configs, dict):
            raise ValueError(
                "configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku-dev/usage/"
            )

        # Both components start from the same defaults; per-component config
        # entries take precedence over them.
        layout_parser_kwargs = dict(device=device, visualize=visualize)
        layout_parser_kwargs.update(configs.get("layout_parser", {}))

        table_structure_recognizer_kwargs = dict(device=device, visualize=visualize)
        table_structure_recognizer_kwargs.update(
            configs.get("table_structure_recognizer", {})
        )

        self.layout_parser = LayoutParser(**layout_parser_kwargs)
        self.table_structure_recognizer = TableStructureRecognizer(
            **table_structure_recognizer_kwargs
        )

    def __call__(self, img):
        """Analyze ``img`` and return ``(LayoutAnalyzerSchema, vis)``.

        The table boxes found by the layout parser are handed to the table
        structure recognizer together with the layout parser's ``vis`` output.
        """
        layout, vis = self.layout_parser(img)
        boxes = [found_table.box for found_table in layout.tables]
        tables, vis = self.table_structure_recognizer(img, boxes, vis=vis)

        analyzed = LayoutAnalyzerSchema(
            paragraphs=layout.paragraphs,
            tables=tables,
            figures=layout.figures,
        )
        return analyzed, vis
60 | 


--------------------------------------------------------------------------------
/src/yomitoku/models/__init__.py:
--------------------------------------------------------------------------------
 1 | from .dbnet_plus import DBNet
 2 | from .parseq import PARSeq
 3 | from .rtdetr import RTDETRv2
 4 | 
 5 | __all__ = [
 6 |     "DBNet",
 7 |     "PARSeq",
 8 |     "RTDETRv2",
 9 | ]
10 | 


--------------------------------------------------------------------------------
/src/yomitoku/models/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/src/yomitoku/models/layers/__init__.py


--------------------------------------------------------------------------------
/src/yomitoku/models/layers/activate.py:
--------------------------------------------------------------------------------
 1 | # Copyright(c) 2023 lyuwenyu
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch.nn as nn
15 | 
16 | 
def get_activation(act: str, inplace: bool = True):
    """Return an activation module for ``act``.

    Args:
        act: ``None`` (yields ``nn.Identity``), an ``nn.Module`` instance
            (returned unchanged), or a case-insensitive name: ``"silu"``,
            ``"swish"``, ``"relu"``, ``"leaky_relu"``, ``"gelu"`` or
            ``"hardsigmoid"``.
        inplace: Assigned to the module's ``inplace`` attribute when the
            created module has one.

    Returns:
        The activation ``nn.Module``.

    Raises:
        RuntimeError: If ``act`` is a name that is not supported.
    """
    if act is None:
        return nn.Identity()

    if isinstance(act, nn.Module):
        return act

    # "swish" is an alias of SiLU.
    factories = {
        "silu": nn.SiLU,
        "swish": nn.SiLU,
        "relu": nn.ReLU,
        "leaky_relu": nn.LeakyReLU,
        "gelu": nn.GELU,
        "hardsigmoid": nn.Hardsigmoid,
    }

    name = act.lower()
    if name not in factories:
        raise RuntimeError(
            f"Unsupported activation {act!r}; expected one of {sorted(factories)}"
        )

    m = factories[name]()

    # Not every activation exposes ``inplace`` (e.g. GELU does not define it).
    if hasattr(m, "inplace"):
        m.inplace = inplace

    return m
52 | 


--------------------------------------------------------------------------------
/src/yomitoku/models/layers/dbnet_feature_attention.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | 
  5 | 
  6 | class ScaleChannelAttention(nn.Module):
  7 |     def __init__(self, in_planes, out_planes, num_features, init_weight=True):
  8 |         super(ScaleChannelAttention, self).__init__()
  9 |         self.avgpool = nn.AdaptiveAvgPool2d(1)
 10 |         print(self.avgpool)
 11 |         self.fc1 = nn.Conv2d(in_planes, out_planes, 1, bias=False)
 12 |         self.bn = nn.BatchNorm2d(out_planes)
 13 |         self.fc2 = nn.Conv2d(out_planes, num_features, 1, bias=False)
 14 |         if init_weight:
 15 |             self._initialize_weights()
 16 | 
 17 |     def _initialize_weights(self):
 18 |         for m in self.modules():
 19 |             if isinstance(m, nn.Conv2d):
 20 |                 nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
 21 |                 if m.bias is not None:
 22 |                     nn.init.constant_(m.bias, 0)
 23 |             if isinstance(m, nn.BatchNorm2d):
 24 |                 nn.init.constant_(m.weight, 1)
 25 |                 nn.init.constant_(m.bias, 0)
 26 | 
 27 |     def forward(self, x):
 28 |         global_x = self.avgpool(x)
 29 |         global_x = self.fc1(global_x)
 30 |         global_x = F.relu(self.bn(global_x))
 31 |         global_x = self.fc2(global_x)
 32 |         global_x = F.softmax(global_x, 1)
 33 |         return global_x
 34 | 
 35 | 
 36 | class ScaleChannelSpatialAttention(nn.Module):
 37 |     def __init__(self, in_planes, out_planes, num_features, init_weight=True):
 38 |         super(ScaleChannelSpatialAttention, self).__init__()
 39 |         self.channel_wise = nn.Sequential(
 40 |             nn.AdaptiveAvgPool2d(1),
 41 |             nn.Conv2d(in_planes, out_planes, 1, bias=False),
 42 |             # nn.BatchNorm2d(out_planes),
 43 |             nn.ReLU(),
 44 |             nn.Conv2d(out_planes, in_planes, 1, bias=False),
 45 |         )
 46 |         self.spatial_wise = nn.Sequential(
 47 |             # Nx1xHxW
 48 |             nn.Conv2d(1, 1, 3, bias=False, padding=1),
 49 |             nn.ReLU(),
 50 |             nn.Conv2d(1, 1, 1, bias=False),
 51 |             nn.Sigmoid(),
 52 |         )
 53 |         self.attention_wise = nn.Sequential(
 54 |             nn.Conv2d(in_planes, num_features, 1, bias=False), nn.Sigmoid()
 55 |         )
 56 |         if init_weight:
 57 |             self._initialize_weights()
 58 | 
 59 |     def _initialize_weights(self):
 60 |         for m in self.modules():
 61 |             if isinstance(m, nn.Conv2d):
 62 |                 nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
 63 |                 if m.bias is not None:
 64 |                     nn.init.constant_(m.bias, 0)
 65 |             if isinstance(m, nn.BatchNorm2d):
 66 |                 nn.init.constant_(m.weight, 1)
 67 |                 nn.init.constant_(m.bias, 0)
 68 | 
 69 |     def forward(self, x):
 70 |         # global_x = self.avgpool(x)
 71 |         # shape Nx4x1x1
 72 |         global_x = self.channel_wise(x).sigmoid()
 73 |         # shape: NxCxHxW
 74 |         global_x = global_x + x
 75 |         # shape:Nx1xHxW
 76 |         x = torch.mean(global_x, dim=1, keepdim=True)
 77 |         global_x = self.spatial_wise(x) + global_x
 78 |         global_x = self.attention_wise(global_x)
 79 |         return global_x
 80 | 
 81 | 
 82 | class ScaleSpatialAttention(nn.Module):
 83 |     def __init__(self, in_planes, out_planes, num_features, init_weight=True):
 84 |         super(ScaleSpatialAttention, self).__init__()
 85 |         self.spatial_wise = nn.Sequential(
 86 |             # Nx1xHxW
 87 |             nn.Conv2d(1, 1, 3, bias=False, padding=1),
 88 |             nn.ReLU(),
 89 |             nn.Conv2d(1, 1, 1, bias=False),
 90 |             nn.Sigmoid(),
 91 |         )
 92 |         self.attention_wise = nn.Sequential(
 93 |             nn.Conv2d(in_planes, num_features, 1, bias=False), nn.Sigmoid()
 94 |         )
 95 |         if init_weight:
 96 |             self._initialize_weights()
 97 | 
 98 |     def _initialize_weights(self):
 99 |         for m in self.modules():
100 |             if isinstance(m, nn.Conv2d):
101 |                 nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
102 |                 if m.bias is not None:
103 |                     nn.init.constant_(m.bias, 0)
104 |             if isinstance(m, nn.BatchNorm2d):
105 |                 nn.init.constant_(m.weight, 1)
106 |                 nn.init.constant_(m.bias, 0)
107 | 
108 |     def forward(self, x):
109 |         global_x = torch.mean(x, dim=1, keepdim=True)
110 |         global_x = self.spatial_wise(global_x) + x
111 |         global_x = self.attention_wise(global_x)
112 |         return global_x
113 | 
114 | 
class ScaleFeatureSelection(nn.Module):
    """Re-weights a list of feature maps with learned attention scores.

    A 3x3 convolution is applied to the concatenated features, an attention
    module turns the result into ``out_features_num`` score channels, and each
    feature map in ``features_list`` is multiplied by its own score channel.
    """

    def __init__(
        self,
        in_channels,
        inter_channels,
        out_features_num=4,
        attention_type="scale_spatial",
    ):
        """
        Args:
            in_channels: Channels of the concatenated input.
            inter_channels: Channels produced by the 3x3 convolution and fed
                to the attention module.
            out_features_num: Number of feature maps to weight.
            attention_type: ``"scale_spatial"``, ``"scale_channel_spatial"``
                or ``"scale_channel"``.

        Raises:
            ValueError: If ``attention_type`` is not one of the supported
                names. Previously this was accepted silently and only failed
                later in ``forward`` with an AttributeError.
        """
        super(ScaleFeatureSelection, self).__init__()
        self.in_channels = in_channels
        self.inter_channels = inter_channels
        self.out_features_num = out_features_num
        self.conv = nn.Conv2d(in_channels, inter_channels, 3, padding=1)
        self.type = attention_type
        if self.type == "scale_spatial":
            self.enhanced_attention = ScaleSpatialAttention(
                inter_channels, inter_channels // 4, out_features_num
            )
        elif self.type == "scale_channel_spatial":
            self.enhanced_attention = ScaleChannelSpatialAttention(
                inter_channels, inter_channels // 4, out_features_num
            )
        elif self.type == "scale_channel":
            self.enhanced_attention = ScaleChannelAttention(
                inter_channels, inter_channels // 2, out_features_num
            )
        else:
            raise ValueError(
                f"Unknown attention_type {attention_type!r}; expected "
                "'scale_spatial', 'scale_channel_spatial' or 'scale_channel'"
            )

    def _initialize_weights(self, m):
        """Initialize a single module ``m`` based on its class name.

        Not called anywhere in this class; it takes a module argument, so it
        looks intended for ``nn.Module.apply`` -- confirm against callers.
        """
        classname = m.__class__.__name__
        if classname.find("Conv") != -1:
            nn.init.kaiming_normal_(m.weight.data)
        elif classname.find("BatchNorm") != -1:
            m.weight.data.fill_(1.0)
            m.bias.data.fill_(1e-4)

    def forward(self, concat_x, features_list):
        """Weight each feature map by its attention score and concatenate.

        Args:
            concat_x: Concatenated features with ``in_channels`` channels.
            features_list: ``out_features_num`` feature maps to re-weight.

        Returns:
            The weighted feature maps concatenated along the channel axis.
        """
        concat_x = self.conv(concat_x)
        score = self.enhanced_attention(concat_x)
        assert len(features_list) == self.out_features_num
        if self.type not in ["scale_channel_spatial", "scale_spatial"]:
            # The remaining attention type does not keep the spatial size,
            # so the score is resized to match the feature maps.
            shape = features_list[0].shape[2:]
            score = F.interpolate(score, size=shape, mode="bilinear")
        weighted = [
            score[:, i : i + 1] * features_list[i]
            for i in range(self.out_features_num)
        ]
        return torch.cat(weighted, dim=1)
161 | 


--------------------------------------------------------------------------------
/src/yomitoku/models/rtdetr.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from huggingface_hub import PyTorchModelHubMixin
 3 | 
 4 | from .layers.rtdetr_backbone import PResNet
 5 | from .layers.rtdetr_hybrid_encoder import HybridEncoder
 6 | from .layers.rtdetrv2_decoder import RTDETRTransformerv2
 7 | 
 8 | 
 9 | class RTDETRv2(nn.Module, PyTorchModelHubMixin):
10 |     def __init__(self, cfg):
11 |         super().__init__()
12 |         self.cfg = cfg
13 |         self.backbone = PResNet(**cfg.PResNet)
14 |         self.encoder = HybridEncoder(**cfg.HybridEncoder)
15 |         self.decoder = RTDETRTransformerv2(**cfg.RTDETRTransformerv2)
16 | 
17 |     def forward(self, x, targets=None):
18 |         x = self.backbone(x)
19 |         x = self.encoder(x)
20 |         x = self.decoder(x, targets)
21 | 
22 |         return x
23 | 


--------------------------------------------------------------------------------
/src/yomitoku/ocr.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from pydantic import conlist
 4 | 
 5 | from yomitoku.text_detector import TextDetector
 6 | from yomitoku.text_recognizer import TextRecognizer
 7 | 
 8 | from .base import BaseSchema
 9 | 
10 | 
class WordPrediction(BaseSchema):
    # One detected-and-recognized word; instances are built from the dicts
    # produced by ocr_aggregate.

    # Exactly four points, each a list of exactly two ints
    # (presumably the x, y corners of the word's quadrilateral -- confirm).
    points: conlist(
        conlist(int, min_length=2, max_length=2),
        min_length=4,
        max_length=4,
    )
    # Recognized text, taken from the recognizer's `contents`.
    content: str
    # Taken from the recognizer's `directions`.
    direction: str
    # Recognizer score for this word.
    rec_score: float
    # Detector score for this word.
    det_score: float
21 | 
22 | 
class OCRSchema(BaseSchema):
    # Result container returned by OCR.__call__: one entry per word.
    words: List[WordPrediction]
25 | 
26 | 
def ocr_aggregate(det_outputs, rec_outputs):
    """Pair detector and recognizer outputs into per-word dicts.

    Args:
        det_outputs: Object exposing ``points`` and ``scores``.
        rec_outputs: Object exposing ``contents``, ``scores`` and
            ``directions``.

    Returns:
        A list of dicts with keys ``points``, ``content``, ``direction``,
        ``det_score`` and ``rec_score``. The sequences are zipped, so the
        result is as long as the shortest of them.
    """
    paired = zip(
        det_outputs.points,
        det_outputs.scores,
        rec_outputs.contents,
        rec_outputs.scores,
        rec_outputs.directions,
    )
    return [
        {
            "points": quad,
            "content": text,
            "direction": text_direction,
            "det_score": detection_score,
            "rec_score": recognition_score,
        }
        for quad, detection_score, text, recognition_score, text_direction in paired
    ]
46 | 
47 | 
class OCR:
    """Runs text detection followed by text recognition."""

    def __init__(self, configs={}, device="cuda", visualize=False):
        """Create the text detector and the text recognizer.

        Args:
            configs: Dict that may contain ``"text_detector"`` and/or
                ``"text_recognizer"`` entries; each entry overrides the
                keyword arguments passed to the matching component.
            device: Default ``device`` keyword for both components.
            visualize: Default ``visualize`` keyword for both components.

        Raises:
            ValueError: If ``configs`` is not a dict.
        """
        if not isinstance(configs, dict):
            raise ValueError(
                "configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku-dev/usage/"
            )

        # Shared defaults first; per-component config entries override them.
        text_detector_kwargs = dict(device=device, visualize=visualize)
        text_detector_kwargs.update(configs.get("text_detector", {}))

        text_recognizer_kwargs = dict(device=device, visualize=visualize)
        text_recognizer_kwargs.update(configs.get("text_recognizer", {}))

        self.detector = TextDetector(**text_detector_kwargs)
        self.recognizer = TextRecognizer(**text_recognizer_kwargs)

    def __call__(self, img):
        """Detect and recognize text in ``img``.

        Args:
            img (np.ndarray): cv2 image(BGR)

        Returns:
            ``(OCRSchema, vis)`` where ``vis`` is the visualization output of
            the recognizer, which itself received the detector's ``vis``.
        """
        detected, vis = self.detector(img)
        recognized, vis = self.recognizer(img, detected.points, vis=vis)

        results = OCRSchema(words=ocr_aggregate(detected, recognized))
        return results, vis
85 | 


--------------------------------------------------------------------------------
/src/yomitoku/onnx/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/src/yomitoku/onnx/.gitkeep


--------------------------------------------------------------------------------
/src/yomitoku/postprocessor/__init__.py:
--------------------------------------------------------------------------------
 1 | from .dbnet_postporcessor import DBnetPostProcessor
 2 | from .parseq_tokenizer import ParseqTokenizer
 3 | from .rtdetr_postprocessor import RTDETRPostProcessor
 4 | 
 5 | __all__ = [
 6 |     "DBnetPostProcessor",
 7 |     "RTDETRPostProcessor",
 8 |     "ParseqTokenizer",
 9 | ]
10 | 


--------------------------------------------------------------------------------
/src/yomitoku/postprocessor/dbnet_postporcessor.py:
--------------------------------------------------------------------------------
  1 | import cv2
  2 | import math
  3 | import numpy as np
  4 | import pyclipper
  5 | from shapely.geometry import Polygon
  6 | 
  7 | 
class DBnetPostProcessor:
    """Converts a DBNet probability map into scored quadrilateral boxes.

    The map is thresholded, contours are extracted, each contour is fitted
    with a minimum-area rectangle, filtered by size and mean score, expanded
    ("unclipped") and finally rescaled to the destination image size.
    """

    def __init__(self, min_size, thresh, box_thresh, max_candidates, unclip_ratio):
        # min_size: minimum short-side length of a candidate rectangle.
        self.min_size = min_size
        # thresh: binarization threshold applied to the probability map.
        self.thresh = thresh
        # box_thresh: minimum mean score a candidate must reach to be kept.
        self.box_thresh = box_thresh
        # max_candidates: upper bound on the number of contours examined.
        self.max_candidates = max_candidates
        # unclip_ratio: base expansion factor used by `unclip`.
        self.unclip_ratio = unclip_ratio

    def __call__(self, preds, image_size):
        """
        pred:
            binary: text region segmentation map, with shape (N, H, W)
            thresh: [if exists] thresh hold prediction with shape (N, H, W)
            thresh_binary: [if exists] binarized with threshhold, (N, H, W)

        Only ``preds["binary"]`` is read here, and only its first item.
        NOTE(review): the code indexes ``[0]`` once more before requiring a
        2-D bitmap, so the map presumably carries an extra channel axis
        (N, 1, H, W) -- confirm against the model output.

        Args:
            preds: Mapping with a ``"binary"`` entry.
            image_size: ``(height, width)`` of the destination image.

        Returns:
            ``(quads, scores)``: a list of 4-point boxes and a parallel list
            of scores.
        """
        pred = preds["binary"][0]
        segmentation = self.binarize(pred)[0]
        height, width = image_size
        quads, scores = self.boxes_from_bitmap(pred, segmentation, width, height)
        return quads, scores

    def binarize(self, pred):
        """Return a boolean mask of values strictly above ``self.thresh``."""
        return pred > self.thresh

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """
        _bitmap: single map with shape (H, W),
            whose values are binarized as {0, 1}

        Args:
            pred: Score map; its first entry along axis 0 is used for scoring.
            _bitmap: Binarized map described above.
            dest_width: Width the box x-coordinates are rescaled to.
            dest_height: Height the box y-coordinates are rescaled to.

        Returns:
            ``(boxes, scores)``: boxes as nested lists of integer points and
            the mean score of each kept contour.
        """

        assert len(_bitmap.shape) == 2
        bitmap = _bitmap.cpu().numpy()  # Binarized map as a numpy array

        # Take the first entry along axis 0 so pred matches the bitmap's axes.
        pred = pred.cpu().detach().numpy()[0]
        height, width = bitmap.shape
        contours, _ = cv2.findContours(
            (bitmap * 255).astype(np.uint8),
            cv2.RETR_LIST,
            cv2.CHAIN_APPROX_SIMPLE,
        )

        # Only the first max_candidates contours are examined.
        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for index in range(num_contours):
            contour = contours[index].squeeze(1)
            points, sside = self.get_mini_boxes(contour)

            # Drop regions whose fitted rectangle is too thin.
            if sside < self.min_size:
                continue
            points = np.array(points)
            # Score is the mean of pred inside the contour polygon.
            score = self.box_score_fast(pred, contour)

            if self.box_thresh > score:
                continue

            # Expand the rectangle, then re-fit a minimum-area box around it.
            box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)
            # Non-int sizes (e.g. tensor/numpy scalars) are converted via .item().
            if not isinstance(dest_width, int):
                dest_width = dest_width.item()
                dest_height = dest_height.item()

            # Rescale from bitmap coordinates to the destination size and clip.
            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height
            )

            # NOTE(review): int16 limits coordinates to 32767 -- confirm that
            # destination images never exceed that size.
            boxes.append(box.astype(np.int16).tolist())
            scores.append(score)

        return boxes, scores

    def unclip(self, box, unclip_ratio=7):
        """Expand ``box`` outward by a size-dependent offset.

        Returns the expanded polygon(s) from pyclipper as a numpy array.
        """
        # Small characters tend to get cut off, while large characters end up
        # with excessively large margins.
        # To address this, the expansion parameter is adjusted dynamically
        # according to the character size.
        # Note: this rule is a heuristic with no theoretical basis.
        poly = Polygon(box)
        width = box[:, 0].max() - box[:, 0].min()
        height = box[:, 1].max() - box[:, 1].min()
        box_dist = min(width, height)
        # The ratio shrinks as the shorter axis-aligned extent grows.
        ratio = unclip_ratio / math.sqrt(box_dist)

        distance = poly.area * ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        """Fit a minimum-area rectangle and return its ordered corners.

        Returns:
            ``(box, short_side)``: four corner points and the smaller of the
            rectangle's two side lengths.
        """
        bounding_box = cv2.minAreaRect(contour)
        # Sort corners by x: the first two are the left pair, the last two the right pair.
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
        # Left pair: the smaller-y corner goes first, the larger-y corner last.
        if points[1][1] > points[0][1]:
            index_1 = 0
            index_4 = 1
        else:
            index_1 = 1
            index_4 = 0
        # Right pair: the smaller-y corner is second, the larger-y corner third.
        if points[3][1] > points[2][1]:
            index_2 = 2
            index_3 = 3
        else:
            index_2 = 3
            index_3 = 2

        box = [
            points[index_1],
            points[index_2],
            points[index_3],
            points[index_4],
        ]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """Return the mean of ``bitmap`` inside the polygon ``_box``.

        Only the polygon's bounding region (clipped to the bitmap) is
        examined; ``_box`` itself is not modified.
        """
        h, w = bitmap.shape[:2]
        box = _box.copy()
        xmin = np.clip(np.floor(box[:, 0].min()).astype(int), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype(int), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype(int), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype(int), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        # Shift the polygon into the cropped region's coordinates.
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
139 | 


--------------------------------------------------------------------------------
/src/yomitoku/postprocessor/parseq_tokenizer.py:
--------------------------------------------------------------------------------
  1 | # Scene Text Recognition Model Hub
  2 | # Copyright 2022 Darwin Bautista
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     https://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | from abc import ABC, abstractmethod
 17 | from typing import Optional
 18 | 
 19 | import torch
 20 | from torch import Tensor
 21 | from torch.nn.utils.rnn import pad_sequence
 22 | 
 23 | 
 24 | class BaseTokenizer(ABC):
 25 |     def __init__(
 26 |         self,
 27 |         charset: str,
 28 |         specials_first: tuple = (),
 29 |         specials_last: tuple = (),
 30 |     ) -> None:
 31 |         self._itos = specials_first + tuple(charset) + specials_last
 32 |         self._stoi = {s: i for i, s in enumerate(self._itos)}
 33 | 
 34 |     def __len__(self):
 35 |         return len(self._itos)
 36 | 
 37 |     def _tok2ids(self, tokens: str) -> list[int]:
 38 |         return [self._stoi[s] for s in tokens]
 39 | 
 40 |     def _ids2tok(self, token_ids: list[int], join: bool = True) -> str:
 41 |         tokens = [self._itos[i] for i in token_ids]
 42 |         return "".join(tokens) if join else tokens
 43 | 
 44 |     @abstractmethod
 45 |     def encode(
 46 |         self, labels: list[str], device: Optional[torch.device] = None
 47 |     ) -> Tensor:
 48 |         """Encode a batch of labels to a representation suitable for the model.
 49 | 
 50 |         Args:
 51 |             labels: List of labels. Each can be of arbitrary length.
 52 |             device: Create tensor on this device.
 53 | 
 54 |         Returns:
 55 |             Batched tensor representation padded to the max label length. Shape: N, L
 56 |         """
 57 |         raise NotImplementedError
 58 | 
 59 |     @abstractmethod
 60 |     def _filter(self, probs: Tensor, ids: Tensor) -> tuple[Tensor, list[int]]:
 61 |         """Internal method which performs the necessary filtering prior to decoding."""
 62 |         raise NotImplementedError
 63 | 
 64 |     def decode(
 65 |         self, token_dists: Tensor, raw: bool = False
 66 |     ) -> tuple[list[str], list[Tensor]]:
 67 |         """Decode a batch of token distributions.
 68 | 
 69 |         Args:
 70 |             token_dists: softmax probabilities over the token distribution. Shape: N, L, C
 71 |             raw: return unprocessed labels (will return list of list of strings)
 72 | 
 73 |         Returns:
 74 |             list of string labels (arbitrary length) and
 75 |             their corresponding sequence probabilities as a list of Tensors
 76 |         """
 77 |         batch_tokens = []
 78 |         batch_probs = []
 79 |         for dist in token_dists:
 80 |             probs, ids = dist.max(-1)  # greedy selection
 81 |             if not raw:
 82 |                 probs, ids = self._filter(probs, ids)
 83 |             tokens = self._ids2tok(ids, not raw)
 84 |             probs = probs.cpu().numpy()
 85 |             probs = float(probs.prod())
 86 |             batch_tokens.append(tokens)
 87 |             batch_probs.append(probs)
 88 |         return batch_tokens, batch_probs
 89 | 
 90 | 
 91 | class ParseqTokenizer(BaseTokenizer):
 92 |     BOS = "[B]"
 93 |     EOS = "[E]"
 94 |     PAD = "[P]"
 95 | 
 96 |     def __init__(self, charset: str) -> None:
 97 |         specials_first = (self.EOS,)
 98 |         specials_last = (self.BOS, self.PAD)
 99 |         super().__init__(charset, specials_first, specials_last)
100 |         self.eos_id, self.bos_id, self.pad_id = [
101 |             self._stoi[s] for s in specials_first + specials_last
102 |         ]
103 | 
104 |     def encode(
105 |         self, labels: list[str], device: Optional[torch.device] = None
106 |     ) -> Tensor:
107 |         batch = [
108 |             torch.as_tensor(
109 |                 [self.bos_id] + self._tok2ids(y) + [self.eos_id],
110 |                 dtype=torch.long,
111 |                 device=device,
112 |             )
113 |             for y in labels
114 |         ]
115 |         return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)
116 | 
117 |     def _filter(self, probs: Tensor, ids: Tensor) -> tuple[Tensor, list[int]]:
118 |         ids = ids.tolist()
119 |         try:
120 |             eos_idx = ids.index(self.eos_id)
121 |         except ValueError:
122 |             eos_idx = len(ids)  # Nothing to truncate.
123 |         # Truncate after EOS
124 |         ids = ids[:eos_idx]
125 |         probs = probs[: eos_idx + 1]  # but include prob. for EOS (if it exists)
126 |         return probs, ids
127 | 


--------------------------------------------------------------------------------
/src/yomitoku/postprocessor/rtdetr_postprocessor.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2023 lyuwenyu
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     https://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | import torch
 17 | import torch.nn as nn
 18 | import torch.nn.functional as F
 19 | import torchvision
 20 | 
 21 | 
 22 | def mod(a, b):
 23 |     out = a - a // b * b
 24 |     return out
 25 | 
 26 | 
 27 | class RTDETRPostProcessor(nn.Module):
 28 |     __share__ = [
 29 |         "num_classes",
 30 |         "use_focal_loss",
 31 |         "num_top_queries",
 32 |         "remap_mscoco_category",
 33 |     ]
 34 | 
 35 |     def __init__(
 36 |         self,
 37 |         num_classes=80,
 38 |         use_focal_loss=True,
 39 |         num_top_queries=300,
 40 |         remap_mscoco_category=False,
 41 |     ) -> None:
 42 |         super().__init__()
 43 |         self.use_focal_loss = use_focal_loss
 44 |         self.num_top_queries = num_top_queries
 45 |         self.num_classes = int(num_classes)
 46 |         self.remap_mscoco_category = remap_mscoco_category
 47 |         self.deploy_mode = False
 48 | 
 49 |     def extra_repr(self) -> str:
 50 |         return f"use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}"
 51 | 
 52 |     def clamp(self, boxes, h, w):
 53 |         boxes[:, 0] = torch.clamp(boxes[:, 0], min=torch.Tensor([0]), max=None)
 54 |         boxes[:, 1] = torch.clamp(boxes[:, 1], min=torch.Tensor([0]), max=None)
 55 |         boxes[:, 2] = torch.clamp(boxes[:, 2], min=torch.Tensor([0]), max=w)
 56 |         boxes[:, 3] = torch.clamp(boxes[:, 3], min=torch.Tensor([0]), max=h)
 57 |         return boxes
 58 | 
 59 |     # def forward(self, outputs, orig_target_sizes):
 60 |     def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):
 61 |         logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
 62 |         # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
 63 | 
 64 |         bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
 65 |         bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
 66 | 
 67 |         w, h = orig_target_sizes.unbind(1)
 68 | 
 69 |         if self.use_focal_loss:
 70 |             scores = F.sigmoid(logits)
 71 |             scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
 72 |             # TODO for older tensorrt
 73 |             # labels = index % self.num_classes
 74 |             labels = mod(index, self.num_classes)
 75 |             index = index // self.num_classes
 76 |             boxes = bbox_pred.gather(
 77 |                 dim=1,
 78 |                 index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]),
 79 |             )
 80 | 
 81 |         else:
 82 |             scores = F.softmax(logits)[:, :, :-1]
 83 |             scores, labels = scores.max(dim=-1)
 84 |             if scores.shape[1] > self.num_top_queries:
 85 |                 scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
 86 |                 labels = torch.gather(labels, dim=1, index=index)
 87 |                 boxes = torch.gather(
 88 |                     boxes,
 89 |                     dim=1,
 90 |                     index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]),
 91 |                 )
 92 | 
 93 |         # TODO for onnx export
 94 |         if self.deploy_mode:
 95 |             return labels, boxes, scores
 96 | 
 97 |         # TODO
 98 |         if self.remap_mscoco_category:
 99 |             from ...data.dataset import mscoco_label2category
100 | 
101 |             labels = (
102 |                 torch.tensor(
103 |                     [mscoco_label2category[int(x.item())] for x in labels.flatten()]
104 |                 )
105 |                 .to(boxes.device)
106 |                 .reshape(labels.shape)
107 |             )
108 | 
109 |         results = []
110 |         for lab, box, sco in zip(labels, boxes, scores):
111 |             lab = lab[sco > threshold]
112 |             box = box[sco > threshold]
113 |             sco = sco[sco > threshold]
114 | 
115 |             lab = lab.cpu().numpy()
116 |             sco = sco.cpu().numpy()
117 | 
118 |             box = self.clamp(box.cpu(), h.cpu(), w.cpu()).numpy()
119 | 
120 |             result = dict(labels=lab, boxes=box, scores=sco)
121 |             results.append(result)
122 | 
123 |         return results
124 | 
125 |     def deploy(
126 |         self,
127 |     ):
128 |         self.eval()
129 |         self.deploy_mode = True
130 |         return self
131 | 


--------------------------------------------------------------------------------
/src/yomitoku/resource/MPLUS1p-Medium.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/src/yomitoku/resource/MPLUS1p-Medium.ttf


--------------------------------------------------------------------------------
/src/yomitoku/text_detector.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | import os
  6 | from pydantic import conlist
  7 | 
  8 | from .base import BaseModelCatalog, BaseModule, BaseSchema
  9 | from .configs import (
 10 |     TextDetectorDBNetConfig,
 11 |     TextDetectorDBNetV2Config,
 12 | )
 13 | from .data.functions import (
 14 |     array_to_tensor,
 15 |     resize_shortest_edge,
 16 |     standardization_image,
 17 | )
 18 | from .models import DBNet
 19 | from .postprocessor import DBnetPostProcessor
 20 | from .utils.visualizer import det_visualizer
 21 | from .constants import ROOT_DIR
 22 | 
 23 | import onnx
 24 | import onnxruntime
 25 | 
 26 | 
 27 | class TextDetectorModelCatalog(BaseModelCatalog):
 28 |     def __init__(self):
 29 |         super().__init__()
 30 |         self.register("dbnet", TextDetectorDBNetConfig, DBNet)
 31 |         self.register("dbnetv2", TextDetectorDBNetV2Config, DBNet)
 32 | 
 33 | 
# Result schema returned by TextDetector.__call__.
class TextDetectorSchema(BaseSchema):
    # One entry per detected region: exactly 4 points, each an [int, int] pair.
    points: List[
        conlist(
            conlist(int, min_length=2, max_length=2),
            min_length=4,
            max_length=4,
        )
    ]
    # One score per detected region.
    scores: List[float]
 43 | 
 44 | 
 45 | class TextDetector(BaseModule):
 46 |     model_catalog = TextDetectorModelCatalog()
 47 | 
 48 |     def __init__(
 49 |         self,
 50 |         model_name="dbnetv2",
 51 |         path_cfg=None,
 52 |         device="cuda",
 53 |         visualize=False,
 54 |         from_pretrained=True,
 55 |         infer_onnx=False,
 56 |     ):
 57 |         super().__init__()
 58 |         self.load_model(
 59 |             model_name,
 60 |             path_cfg,
 61 |             from_pretrained=from_pretrained,
 62 |         )
 63 | 
 64 |         self.device = device
 65 |         self.visualize = visualize
 66 | 
 67 |         self.model.eval()
 68 |         self.post_processor = DBnetPostProcessor(**self._cfg.post_process)
 69 |         self.infer_onnx = infer_onnx
 70 | 
 71 |         if infer_onnx:
 72 |             name = self._cfg.hf_hub_repo.split("/")[-1]
 73 |             path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
 74 |             if not os.path.exists(path_onnx):
 75 |                 self.convert_onnx(path_onnx)
 76 | 
 77 |             self.model = None
 78 | 
 79 |             model = onnx.load(path_onnx)
 80 |             if torch.cuda.is_available() and device == "cuda":
 81 |                 self.sess = onnxruntime.InferenceSession(
 82 |                     model.SerializeToString(), providers=["CUDAExecutionProvider"]
 83 |                 )
 84 |             else:
 85 |                 self.sess = onnxruntime.InferenceSession(model.SerializeToString())
 86 | 
 87 |             self.model = None
 88 | 
 89 |         if self.model is not None:
 90 |             self.model.to(self.device)
 91 | 
 92 |     def convert_onnx(self, path_onnx):
 93 |         dynamic_axes = {
 94 |             "input": {0: "batch_size", 2: "height", 3: "width"},
 95 |             "output": {0: "batch_size", 2: "height", 3: "width"},
 96 |         }
 97 | 
 98 |         dummy_input = torch.randn(1, 3, 256, 256, requires_grad=True)
 99 | 
100 |         torch.onnx.export(
101 |             self.model,
102 |             dummy_input,
103 |             path_onnx,
104 |             opset_version=14,
105 |             input_names=["input"],
106 |             output_names=["output"],
107 |             dynamic_axes=dynamic_axes,
108 |         )
109 | 
110 |     def preprocess(self, img):
111 |         img = img.copy()
112 |         img = img[:, :, ::-1].astype(np.float32)
113 |         resized = resize_shortest_edge(
114 |             img, self._cfg.data.shortest_size, self._cfg.data.limit_size
115 |         )
116 |         normalized = standardization_image(resized)
117 |         tensor = array_to_tensor(normalized)
118 |         return tensor
119 | 
120 |     def postprocess(self, preds, image_size):
121 |         return self.post_processor(preds, image_size)
122 | 
123 |     def __call__(self, img):
124 |         """apply the detection model to the input image.
125 | 
126 |         Args:
127 |             img (np.ndarray): target image(BGR)
128 |         """
129 | 
130 |         ori_h, ori_w = img.shape[:2]
131 |         tensor = self.preprocess(img)
132 | 
133 |         if self.infer_onnx:
134 |             input = tensor.numpy()
135 |             results = self.sess.run(["output"], {"input": input})
136 |             preds = {"binary": torch.tensor(results[0])}
137 |         else:
138 |             with torch.inference_mode():
139 |                 tensor = tensor.to(self.device)
140 |                 preds = self.model(tensor)
141 | 
142 |         quads, scores = self.postprocess(preds, (ori_h, ori_w))
143 |         outputs = {"points": quads, "scores": scores}
144 | 
145 |         results = TextDetectorSchema(**outputs)
146 | 
147 |         vis = None
148 |         if self.visualize:
149 |             vis = det_visualizer(
150 |                 img,
151 |                 quads,
152 |                 preds=preds,
153 |                 vis_heatmap=self._cfg.visualize.heatmap,
154 |                 line_color=tuple(self._cfg.visualize.color[::-1]),
155 |             )
156 | 
157 |         return results, vis
158 | 


--------------------------------------------------------------------------------
/src/yomitoku/text_recognizer.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | import os
  6 | import unicodedata
  7 | from pydantic import conlist
  8 | 
  9 | from .base import BaseModelCatalog, BaseModule, BaseSchema
 10 | from .configs import (
 11 |     TextRecognizerPARSeqConfig,
 12 |     TextRecognizerPARSeqSmallConfig,
 13 |     TextRecognizerPARSeqV2Config,
 14 | )
 15 | from .data.dataset import ParseqDataset
 16 | from .models import PARSeq
 17 | from .postprocessor import ParseqTokenizer as Tokenizer
 18 | from .utils.misc import load_charset
 19 | from .utils.visualizer import rec_visualizer
 20 | 
 21 | from .constants import ROOT_DIR
 22 | import onnx
 23 | import onnxruntime
 24 | 
 25 | 
 26 | class TextRecognizerModelCatalog(BaseModelCatalog):
 27 |     def __init__(self):
 28 |         super().__init__()
 29 |         self.register("parseq", TextRecognizerPARSeqConfig, PARSeq)
 30 |         self.register("parseqv2", TextRecognizerPARSeqV2Config, PARSeq)
 31 |         self.register("parseq-small", TextRecognizerPARSeqSmallConfig, PARSeq)
 32 | 
 33 | 
# Result schema returned by TextRecognizer.__call__.
class TextRecognizerSchema(BaseSchema):
    # Recognized text, one string per region.
    contents: List[str]
    # Text direction per region; TextRecognizer.postprocess emits
    # "vertical" or "horizontal".
    directions: List[str]
    # One score per region.
    scores: List[float]
    # One entry per region: exactly 4 points, each an [int, int] pair.
    points: List[
        conlist(
            conlist(int, min_length=2, max_length=2),
            min_length=4,
            max_length=4,
        )
    ]
 45 | 
 46 | 
 47 | class TextRecognizer(BaseModule):
 48 |     model_catalog = TextRecognizerModelCatalog()
 49 | 
 50 |     def __init__(
 51 |         self,
 52 |         model_name="parseqv2",
 53 |         path_cfg=None,
 54 |         device="cuda",
 55 |         visualize=False,
 56 |         from_pretrained=True,
 57 |         infer_onnx=False,
 58 |     ):
 59 |         super().__init__()
 60 |         self.load_model(
 61 |             model_name,
 62 |             path_cfg,
 63 |             from_pretrained=from_pretrained,
 64 |         )
 65 |         self.charset = load_charset(self._cfg.charset)
 66 |         self.tokenizer = Tokenizer(self.charset)
 67 | 
 68 |         self.device = device
 69 | 
 70 |         self.model.tokenizer = self.tokenizer
 71 |         self.model.eval()
 72 | 
 73 |         self.visualize = visualize
 74 | 
 75 |         self.infer_onnx = infer_onnx
 76 | 
 77 |         if infer_onnx:
 78 |             name = self._cfg.hf_hub_repo.split("/")[-1]
 79 |             path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
 80 |             if not os.path.exists(path_onnx):
 81 |                 self.convert_onnx(path_onnx)
 82 | 
 83 |             self.model = None
 84 | 
 85 |             model = onnx.load(path_onnx)
 86 |             if torch.cuda.is_available() and device == "cuda":
 87 |                 self.sess = onnxruntime.InferenceSession(
 88 |                     model.SerializeToString(), providers=["CUDAExecutionProvider"]
 89 |                 )
 90 |             else:
 91 |                 self.sess = onnxruntime.InferenceSession(model.SerializeToString())
 92 | 
 93 |         if self.model is not None:
 94 |             self.model.to(self.device)
 95 | 
 96 |     def preprocess(self, img, polygons):
 97 |         dataset = ParseqDataset(self._cfg, img, polygons)
 98 |         dataloader = self._make_mini_batch(dataset)
 99 | 
100 |         return dataloader
101 | 
102 |     def _make_mini_batch(self, dataset):
103 |         mini_batches = []
104 |         mini_batch = []
105 |         for data in dataset:
106 |             data = torch.unsqueeze(data, 0)
107 |             mini_batch.append(data)
108 | 
109 |             if len(mini_batch) == self._cfg.data.batch_size:
110 |                 mini_batches.append(torch.cat(mini_batch, 0))
111 |                 mini_batch = []
112 |         else:
113 |             if len(mini_batch) > 0:
114 |                 mini_batches.append(torch.cat(mini_batch, 0))
115 | 
116 |         return mini_batches
117 | 
118 |     def convert_onnx(self, path_onnx):
119 |         img_size = self._cfg.data.img_size
120 |         input = torch.randn(1, 3, *img_size, requires_grad=True)
121 |         dynamic_axes = {
122 |             "input": {0: "batch_size"},
123 |             "output": {0: "batch_size"},
124 |         }
125 | 
126 |         self.model.export_onnx = True
127 |         torch.onnx.export(
128 |             self.model,
129 |             input,
130 |             path_onnx,
131 |             opset_version=14,
132 |             input_names=["input"],
133 |             output_names=["output"],
134 |             do_constant_folding=True,
135 |             dynamic_axes=dynamic_axes,
136 |         )
137 | 
138 |     def postprocess(self, p, points):
139 |         pred, score = self.tokenizer.decode(p)
140 |         pred = [unicodedata.normalize("NFKC", x) for x in pred]
141 | 
142 |         directions = []
143 |         for point in points:
144 |             point = np.array(point)
145 |             w = np.linalg.norm(point[0] - point[1])
146 |             h = np.linalg.norm(point[1] - point[2])
147 | 
148 |             direction = "vertical" if h > w * 2 else "horizontal"
149 |             directions.append(direction)
150 | 
151 |         return pred, score, directions
152 | 
153 |     def __call__(self, img, points, vis=None):
154 |         """
155 |         Apply the recognition model to the input image.
156 | 
157 |         Args:
158 |             img (np.ndarray): target image(BGR)
159 |             points (list): list of quadrilaterals. Each quadrilateral is represented as a list of 4 points sorted clockwise.
160 |             vis (np.ndarray, optional): rendering image. Defaults to None.
161 |         """
162 | 
163 |         dataloader = self.preprocess(img, points)
164 |         preds = []
165 |         scores = []
166 |         directions = []
167 |         for data in dataloader:
168 |             if self.infer_onnx:
169 |                 input = data.numpy()
170 |                 results = self.sess.run(["output"], {"input": input})
171 |                 p = torch.tensor(results[0])
172 |             else:
173 |                 with torch.inference_mode():
174 |                     data = data.to(self.device)
175 |                     p = self.model(data).softmax(-1)
176 | 
177 |             pred, score, direction = self.postprocess(p, points)
178 |             preds.extend(pred)
179 |             scores.extend(score)
180 |             directions.extend(direction)
181 | 
182 |         outputs = {
183 |             "contents": preds,
184 |             "scores": scores,
185 |             "points": points,
186 |             "directions": directions,
187 |         }
188 |         results = TextRecognizerSchema(**outputs)
189 | 
190 |         if self.visualize:
191 |             if vis is None:
192 |                 vis = img.copy()
193 |             vis = rec_visualizer(
194 |                 vis,
195 |                 results,
196 |                 font_size=self._cfg.visualize.font_size,
197 |                 font_color=tuple(self._cfg.visualize.color[::-1]),
198 |                 font_path=self._cfg.visualize.font,
199 |             )
200 | 
201 |         return results, vis
202 | 


--------------------------------------------------------------------------------
/src/yomitoku/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/src/yomitoku/utils/__init__.py


--------------------------------------------------------------------------------
/src/yomitoku/utils/graph.py:
--------------------------------------------------------------------------------
 1 | class Node:
 2 |     def __init__(self, id, prop):
 3 |         self.id = id
 4 |         self.prop = prop
 5 |         self.parents = []
 6 |         self.children = []
 7 | 
 8 |         self.is_locked = False
 9 | 
10 |     def add_link(self, node):
11 |         if node in self.children:
12 |             return
13 | 
14 |         self.children.append(node)
15 |         node.parents.append(self)
16 | 
17 |     def __repr__(self):
18 |         if "contents" in self.prop:
19 |             return self.prop["contents"]
20 |         return "table"
21 | 


--------------------------------------------------------------------------------
/src/yomitoku/utils/logger.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | from logging import Formatter, StreamHandler, getLogger
 3 | 
 4 | 
 5 | def set_logger(name, level="INFO"):
 6 |     logger = getLogger(name)
 7 |     logger.setLevel(level)
 8 |     handler = StreamHandler()
 9 |     handler.setLevel(level)
10 |     format = Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
11 |     handler.setFormatter(format)
12 |     logger.addHandler(handler)
13 | 
14 |     warnings.filterwarnings("ignore")
15 |     return logger
16 | 


--------------------------------------------------------------------------------
/src/yomitoku/utils/misc.py:
--------------------------------------------------------------------------------
  1 | import cv2
  2 | 
  3 | 
  4 | def load_charset(charset_path):
  5 |     with open(charset_path, "r", encoding="utf-8") as f:
  6 |         charset = f.read()
  7 |     return charset
  8 | 
  9 | 
 10 | def filter_by_flag(elements, flags):
 11 |     assert len(elements) == len(flags)
 12 |     return [element for element, flag in zip(elements, flags) if flag]
 13 | 
 14 | 
 15 | def save_image(img, path):
 16 |     success, buffer = cv2.imencode(".jpg", img)
 17 |     if not success:
 18 |         raise ValueError("Failed to encode image")
 19 | 
 20 |     with open(path, "wb") as f:
 21 |         f.write(buffer.tobytes())
 22 | 
 23 | 
 24 | def calc_overlap_ratio(rect_a, rect_b):
 25 |     intersection = calc_intersection(rect_a, rect_b)
 26 |     if intersection is None:
 27 |         return 0, None
 28 | 
 29 |     ix1, iy1, ix2, iy2 = intersection
 30 | 
 31 |     overlap_width = ix2 - ix1
 32 |     overlap_height = iy2 - iy1
 33 |     bx1, by1, bx2, by2 = rect_b
 34 | 
 35 |     b_area = (bx2 - bx1) * (by2 - by1)
 36 |     overlap_area = overlap_width * overlap_height
 37 | 
 38 |     overlap_ratio = overlap_area / b_area
 39 |     return overlap_ratio, intersection
 40 | 
 41 | 
 42 | def is_contained(rect_a, rect_b, threshold=0.8):
 43 |     """二つの矩形A, Bが与えられたとき、矩形Bが矩形Aに含まれるかどうかを判定する。
 44 |     ずれを許容するため、重複率求め、thresholdを超える場合にTrueを返す。
 45 | 
 46 | 
 47 |     Args:
 48 |         rect_a (np.array): x1, y1, x2, y2
 49 |         rect_b (np.array): x1, y1, x2, y2
 50 |         threshold (float, optional): 判定の閾値. Defaults to 0.9.
 51 | 
 52 |     Returns:
 53 |         bool: 矩形Bが矩形Aに含まれる場合True
 54 |     """
 55 | 
 56 |     overlap_ratio, _ = calc_overlap_ratio(rect_a, rect_b)
 57 | 
 58 |     if overlap_ratio > threshold:
 59 |         return True
 60 | 
 61 |     return False
 62 | 
 63 | 
 64 | def calc_intersection(rect_a, rect_b):
 65 |     ax1, ay1, ax2, ay2 = map(int, rect_a)
 66 |     bx1, by1, bx2, by2 = map(int, rect_b)
 67 | 
 68 |     # 交差領域の左上と右下の座標
 69 |     ix1 = max(ax1, bx1)
 70 |     iy1 = max(ay1, by1)
 71 |     ix2 = min(ax2, bx2)
 72 |     iy2 = min(ay2, by2)
 73 | 
 74 |     overlap_width = max(0, ix2 - ix1)
 75 |     overlap_height = max(0, iy2 - iy1)
 76 | 
 77 |     if overlap_width == 0 or overlap_height == 0:
 78 |         return None
 79 | 
 80 |     return [ix1, iy1, ix2, iy2]
 81 | 
 82 | 
 83 | def is_intersected_horizontal(rect_a, rect_b, threshold=0.5):
 84 |     _, ay1, _, ay2 = map(int, rect_a)
 85 |     _, by1, _, by2 = map(int, rect_b)
 86 | 
 87 |     # 交差領域の左上と右下の座標
 88 |     iy1 = max(ay1, by1)
 89 |     iy2 = min(ay2, by2)
 90 | 
 91 |     min_height = min(ay2 - ay1, by2 - by1)
 92 | 
 93 |     overlap_height = max(0, iy2 - iy1)
 94 | 
 95 |     if (overlap_height / min_height) < threshold:
 96 |         return False
 97 | 
 98 |     return True
 99 | 
100 | 
def is_intersected_vertical(rect_a, rect_b):
    """Return True when the x-ranges of the two rectangles overlap.

    Coordinates are truncated to ``int`` first; ranges that merely touch do
    not count as overlapping.
    """
    a_left, _, a_right, _ = (int(v) for v in rect_a)
    b_left, _, b_right, _ = (int(v) for v in rect_b)

    # Left and right edges of the overlapping x-range.
    return min(a_right, b_right) - max(a_left, b_left) > 0
115 | 
116 | 
def quad_to_xyxy(quad):
    """Return the axis-aligned bounding box ``(x1, y1, x2, y2)`` of a quadrilateral."""
    xs = [px for px, _ in quad]
    ys = [py for _, py in quad]

    return min(xs), min(ys), max(xs), max(ys)
124 | 
125 | 
def convert_table_array(table):
    """Expand a table's cells into a dense ``n_row`` x ``n_col`` grid of strings.

    Cell ``row``/``col`` are 1-based. A cell spanning several rows or columns
    has its contents copied into every grid position it covers; uncovered
    positions stay ``""``.
    """
    n_rows = table.n_row
    n_cols = table.n_col

    grid = []
    for _ in range(n_rows):
        grid.append([""] * n_cols)

    for cell in table.cells:
        top = cell.row - 1
        left = cell.col - 1
        bottom = top + cell.row_span
        right = left + cell.col_span
        text = cell.contents

        for r in range(top, bottom):
            for c in range(left, right):
                grid[r][c] = text

    return grid
144 | 
145 | 
def convert_table_array_to_dict(table_array, header_row=1):
    """Convert a 2-D table into a list of ``{header: value}`` row dictionaries.

    Args:
        table_array: list of rows, each a list of strings.
        header_row (int, optional): number of leading rows that form the
            header. The header cells of a column are joined with ``"_"``;
            with ``header_row=0`` the keys are ``col_<index>``. Defaults to 1.

    Returns:
        list[dict]: one dictionary per data row. An empty table yields an
        empty list (this used to raise IndexError). Columns that end up with
        the same header share a key, so the rightmost value wins.
    """
    n_rows = len(table_array)
    if n_rows == 0:
        return []

    n_cols = len(table_array[0])

    header_cols = []
    for i in range(n_cols):
        header = [table_array[j][i] for j in range(header_row)]
        header_cols.append("_".join(header) if len(header) > 0 else f"col_{i}")

    table_dict = []
    for i in range(header_row, n_rows):
        table_dict.append({header_cols[j]: table_array[i][j] for j in range(n_cols)})

    return table_dict
169 | 


--------------------------------------------------------------------------------
/src/yomitoku/utils/searchable_pdf.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from PIL import Image
  4 | from io import BytesIO
  5 | 
  6 | from reportlab.pdfgen import canvas
  7 | from reportlab.pdfbase.ttfonts import TTFont
  8 | from reportlab.pdfbase import pdfmetrics
  9 | from reportlab.pdfbase.pdfmetrics import stringWidth
 10 | 
 11 | import numpy as np
 12 | import jaconv
 13 | 
 14 | from ..constants import ROOT_DIR
 15 | 
# Default TrueType font file; create_searchable_pdf uses it when the caller
# passes no font_path.
FONT_PATH = ROOT_DIR + "/resource/MPLUS1p-Medium.ttf"
 17 | 
 18 | 
 19 | def _poly2rect(points):
 20 |     """
 21 |     Convert a polygon defined by its corner points to a rectangle.
 22 |     The points should be in the format [[x1, y1], [x2, y2], [x3, y3], [x4, y4]].
 23 |     """
 24 |     points = np.array(points, dtype=int)
 25 |     x_min = points[:, 0].min()
 26 |     x_max = points[:, 0].max()
 27 |     y_min = points[:, 1].min()
 28 |     y_max = points[:, 1].max()
 29 | 
 30 |     return [x_min, y_min, x_max, y_max]
 31 | 
 32 | 
 33 | def _calc_font_size(content, bbox_height, bbox_width):
 34 |     rates = np.arange(0.5, 1.0, 0.01)
 35 | 
 36 |     min_diff = np.inf
 37 |     best_font_size = None
 38 |     for rate in rates:
 39 |         font_size = bbox_height * rate
 40 |         text_w = stringWidth(content, "MPLUS1p-Medium", font_size)
 41 |         diff = abs(text_w - bbox_width)
 42 |         if diff < min_diff:
 43 |             min_diff = diff
 44 |             best_font_size = font_size
 45 | 
 46 |     return best_font_size
 47 | 
 48 | 
 49 | def to_full_width(text):
 50 |     fw_map = {
 51 |         "\u00a5": "\uffe5",  # ¥ → ￥
 52 |         "\u00b7": "\u30fb",  # · → ・
 53 |         " ": "\u3000",  # 半角スペース→全角スペース
 54 |     }
 55 | 
 56 |     TO_FULLWIDTH = str.maketrans(fw_map)
 57 | 
 58 |     jaconv_text = jaconv.h2z(text, kana=True, ascii=True, digit=True)
 59 |     jaconv_text = jaconv_text.translate(TO_FULLWIDTH)
 60 | 
 61 |     return jaconv_text
 62 | 
 63 | 
 64 | def create_searchable_pdf(images, ocr_results, output_path, font_path=None):
 65 |     if font_path is None:
 66 |         font_path = FONT_PATH
 67 | 
 68 |     pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", font_path))
 69 | 
 70 |     packet = BytesIO()
 71 |     c = canvas.Canvas(packet)
 72 | 
 73 |     for i, (image, ocr_result) in enumerate(zip(images, ocr_results)):
 74 |         image = Image.fromarray(image[:, :, ::-1])  # Convert BGR to RGB
 75 |         pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", FONT_PATH))
 76 | 
 77 |         image_path = f"tmp_{i}.png"
 78 |         image.save(image_path)
 79 |         w, h = image.size
 80 | 
 81 |         c.setPageSize((w, h))
 82 |         c.drawImage(image_path, 0, 0, width=w, height=h)
 83 |         os.remove(image_path)  # Clean up temporary image file
 84 | 
 85 |         for word in ocr_result.words:
 86 |             text = word.content
 87 |             bbox = _poly2rect(word.points)
 88 |             direction = word.direction
 89 | 
 90 |             x1, y1, x2, y2 = bbox
 91 |             bbox_height = y2 - y1
 92 |             bbox_width = x2 - x1
 93 | 
 94 |             if direction == "vertical":
 95 |                 text = to_full_width(text)
 96 | 
 97 |             if direction == "horizontal":
 98 |                 font_size = _calc_font_size(text, bbox_height, bbox_width)
 99 |             else:
100 |                 font_size = _calc_font_size(text, bbox_width, bbox_height)
101 | 
102 |             c.setFont("MPLUS1p-Medium", font_size)
103 |             c.setFillColorRGB(1, 1, 1, alpha=0)  # 透明
104 |             if direction == "vertical":
105 |                 base_y = h - y2 + (bbox_height - font_size)
106 |                 for j, ch in enumerate(text):
107 |                     c.saveState()
108 |                     c.translate(x1 + font_size * 0.5, base_y - (j - 1) * font_size)
109 |                     c.rotate(-90)
110 |                     c.drawString(0, 0, ch)
111 |                     c.restoreState()
112 |             else:
113 |                 base_y = h - y2 + (bbox_height - font_size) * 0.5
114 |                 c.drawString(x1, base_y, text)
115 |         c.showPage()
116 |     c.save()
117 | 
118 |     with open(output_path, "wb") as f:
119 |         f.write(packet.getvalue())
120 | 


--------------------------------------------------------------------------------
/src/yomitoku/utils/visualizer.py:
--------------------------------------------------------------------------------
  1 | import cv2
  2 | import numpy as np
  3 | from PIL import Image, ImageDraw, ImageFont, features
  4 | from ..constants import PALETTE
  5 | from .logger import set_logger
  6 | 
# Module-level logger; rec_visualizer uses it to warn when libraqm is missing.
logger = set_logger(__name__, "INFO")
  8 | 
  9 | 
 10 | def _reading_order_visualizer(img, elements, line_color, tip_size):
 11 |     out = img.copy()
 12 |     for i, element in enumerate(elements):
 13 |         if i == 0:
 14 |             continue
 15 | 
 16 |         prev_element = elements[i - 1]
 17 |         cur_x1, cur_y1, cur_x2, cur_y2 = element.box
 18 |         prev_x1, prev_y1, prev_x2, prev_y2 = prev_element.box
 19 | 
 20 |         cur_center = (
 21 |             cur_x1 + (cur_x2 - cur_x1) / 2,
 22 |             cur_y1 + (cur_y2 - cur_y1) / 2,
 23 |         )
 24 |         prev_center = (
 25 |             prev_x1 + (prev_x2 - prev_x1) / 2,
 26 |             prev_y1 + (prev_y2 - prev_y1) / 2,
 27 |         )
 28 | 
 29 |         arrow_length = np.linalg.norm(np.array(cur_center) - np.array(prev_center))
 30 | 
 31 |         # tipLength を計算（矢印長さに対する固定サイズの割合）
 32 |         if arrow_length > 0:
 33 |             tip_length = tip_size / arrow_length
 34 |         else:
 35 |             tip_length = 0  # 長さが0なら矢じりもゼロ
 36 | 
 37 |         cv2.arrowedLine(
 38 |             out,
 39 |             (int(prev_center[0]), int(prev_center[1])),
 40 |             (int(cur_center[0]), int(cur_center[1])),
 41 |             line_color,
 42 |             2,
 43 |             tipLength=tip_length,
 44 |         )
 45 |     return out
 46 | 
 47 | 
 48 | def reading_order_visualizer(
 49 |     img,
 50 |     results,
 51 |     line_color=(0, 0, 255),
 52 |     tip_size=10,
 53 |     visualize_figure_letter=False,
 54 | ):
 55 |     elements = results.paragraphs + results.tables + results.figures
 56 |     elements = sorted(elements, key=lambda x: x.order)
 57 | 
 58 |     out = _reading_order_visualizer(img, elements, line_color, tip_size)
 59 | 
 60 |     if visualize_figure_letter:
 61 |         for figure in results.figures:
 62 |             out = _reading_order_visualizer(
 63 |                 out, figure.paragraphs, line_color=(0, 255, 0), tip_size=5
 64 |             )
 65 | 
 66 |     return out
 67 | 
 68 | 
 69 | def det_visualizer(img, quads, preds=None, vis_heatmap=False, line_color=(0, 255, 0)):
 70 |     out = img.copy()
 71 |     h, w = out.shape[:2]
 72 |     if vis_heatmap:
 73 |         preds = preds["binary"][0]
 74 |         binary = preds.detach().cpu().numpy()
 75 |         binary = binary.squeeze(0)
 76 |         binary = (binary * 255).astype(np.uint8)
 77 |         binary = cv2.resize(binary, (w, h), interpolation=cv2.INTER_LINEAR)
 78 |         heatmap = cv2.applyColorMap(binary, cv2.COLORMAP_JET)
 79 |         out = cv2.addWeighted(out, 0.5, heatmap, 0.5, 0)
 80 | 
 81 |     for quad in quads:
 82 |         quad = np.array(quad).astype(np.int32)
 83 |         out = cv2.polylines(out, [quad], True, line_color, 1)
 84 |     return out
 85 | 
 86 | 
 87 | def layout_visualizer(results, img):
 88 |     out = img.copy()
 89 |     results_dict = results.dict()
 90 |     for id, (category, preds) in enumerate(results_dict.items()):
 91 |         for element in preds:
 92 |             box = element["box"]
 93 |             role = element["role"]
 94 | 
 95 |             if role is None:
 96 |                 role = ""
 97 |             else:
 98 |                 role = f"({role})"
 99 | 
100 |             color = PALETTE[id % len(PALETTE)]
101 |             x1, y1, x2, y2 = tuple(map(int, box))
102 |             out = cv2.rectangle(out, (x1, y1), (x2, y2), color, 2)
103 |             out = cv2.putText(
104 |                 out,
105 |                 category + role,
106 |                 (x1, y1),
107 |                 cv2.FONT_HERSHEY_SIMPLEX,
108 |                 0.5,
109 |                 color,
110 |                 2,
111 |             )
112 | 
113 |     return out
114 | 
115 | 
def table_visualizer(img, table):
    """Draw each table cell's box with a ``[row, col] (row_span x col_span)`` label.

    Args:
        img: image to draw on; a copy is modified and returned.
        table: object whose ``cells`` expose ``box``, ``row``, ``col``,
            ``row_span`` and ``col_span``.
    """
    out = img.copy()
    for cell in table.cells:
        label = f"[{cell.row}, {cell.col}] ({cell.row_span}x{cell.col_span})"

        x1, y1, x2, y2 = map(int, cell.box)
        out = cv2.rectangle(out, (x1, y1), (x2, y2), (255, 0, 255), 2)
        out = cv2.putText(
            out,
            label,
            (x1, y1),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (255, 0, 0),
            2,
        )

    return out
141 | 
142 | 
def rec_visualizer(
    img,
    outputs,
    font_path,
    font_size=12,
    font_color=(255, 0, 0),
):
    """Render recognized text next to each quadrilateral.

    Horizontal text is drawn ``font_size`` pixels above the first corner of
    the quadrilateral; vertical text is drawn ``font_size`` pixels to its
    left, top to bottom. Vertical rendering needs libraqm; without it every
    string is rendered horizontally and a warning is logged.

    Args:
        img: image to draw on; a copy is modified and returned.
        outputs: object exposing ``contents``, ``points`` and ``directions``.
        font_path: TrueType font file used for rendering.
        font_size (int, optional): font size in pixels. Defaults to 12.
        font_color (tuple, optional): fill color. Defaults to (255, 0, 0).

    Returns:
        np.ndarray: the rendered image.
    """
    out = img.copy()
    pillow_img = Image.fromarray(out)
    draw = ImageDraw.Draw(pillow_img)
    has_raqm = features.check_feature(feature="raqm")
    if not has_raqm:
        logger.warning(
            "libraqm is not installed. Vertical text rendering is not supported. Rendering horizontally instead."
        )

    # The font does not change between words, so load it once, and only when
    # there is something to draw.
    font = None
    for pred, quad, direction in zip(
        outputs.contents, outputs.points, outputs.directions
    ):
        if font is None:
            font = ImageFont.truetype(font_path, font_size)

        quad = np.array(quad).astype(np.int32)
        anchor_x, anchor_y = quad[0][0], quad[0][1]

        if direction == "horizontal" or not has_raqm:
            position = (anchor_x, anchor_y - font_size)
            draw.text(position, pred, font=font, fill=font_color)
        else:
            position = (anchor_x - font_size, anchor_y)
            draw.text(
                position,
                pred,
                font=font,
                fill=font_color,
                direction="ttb",
            )

    out = np.array(pillow_img)
    return out
187 | 


--------------------------------------------------------------------------------
/static/in/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/demo.jpg


--------------------------------------------------------------------------------
/static/in/gallery1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/gallery1.jpg


--------------------------------------------------------------------------------
/static/in/gallery2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/gallery2.jpg


--------------------------------------------------------------------------------
/static/in/gallery3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/gallery3.jpg


--------------------------------------------------------------------------------
/static/in/gallery4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/gallery4.jpg


--------------------------------------------------------------------------------
/static/in/gallery5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/gallery5.jpg


--------------------------------------------------------------------------------
/static/in/gallery6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/gallery6.jpg


--------------------------------------------------------------------------------
/static/in/gallery7.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/in/gallery7.jpeg


--------------------------------------------------------------------------------
/static/logo/horizontal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/logo/horizontal.png


--------------------------------------------------------------------------------
/static/out/demo_html.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/demo_html.png


--------------------------------------------------------------------------------
/static/out/figures/in_demo_p1_figure_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_demo_p1_figure_0.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_0.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_1.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_10.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_2.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_3.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_4.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_5.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_6.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_7.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_8.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery1_p1_figure_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery1_p1_figure_9.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery3_p1_figure_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery3_p1_figure_0.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery3_p1_figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery3_p1_figure_1.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery5_p1_figure_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery5_p1_figure_0.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery5_p1_figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery5_p1_figure_1.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery6_p1_figure_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery6_p1_figure_0.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery6_p1_figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery6_p1_figure_1.png


--------------------------------------------------------------------------------
/static/out/figures/in_gallery7_p1_figure_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/figures/in_gallery7_p1_figure_0.png


--------------------------------------------------------------------------------
/static/out/in_demo_p1.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 |   <p>Al の進化に伴う課題と現状の取組</p>
 3 |   <p>第1節</p>
 4 |   <p>第4章<br/>デジタルテクノロジーの課題と現状の対応策</p>
 5 |   <table border="1" style="border-collapse: collapse">
 6 |     <tr>
 7 |       <td rowspan="1" colspan="3">図表I-4-1-1<br/>生成AIの課題</td>
 8 |     </tr>
 9 |     <tr>
10 |       <td rowspan="1" colspan="2">リスク</td>
11 |       <td rowspan="1" colspan="1">事例</td>
12 |     </tr>
13 |     <tr>
14 |       <td rowspan="8" colspan="1">従来型AI<br/>から存在<br/>するリスク</td>
15 |       <td rowspan="1" colspan="1">バイアスのある結果及び差別的な結果の出力</td>
16 |       <td rowspan="1" colspan="1">●IT企業が自社で開発したAI人材採用システムが女性を差別するという機械学習面の欠陥を持<br/>ち合わせていた</td>
17 |     </tr>
18 |     <tr>
19 |       <td rowspan="1" colspan="1">フィルターバブル及びエコーチェンバー現象</td>
20 |       <td rowspan="1" colspan="1">● SNS 等によるレコメンドを通じた社会の分断が生じている</td>
21 |     </tr>
22 |     <tr>
23 |       <td rowspan="1" colspan="1">多様性の喪失</td>
24 |       <td rowspan="1" colspan="1">●社会全体が同じモデルを、同じ温度感で使った場合、導かれる意見及び回答がLLMによって<br/>収束してしまい、多様性が失われる可能性がある</td>
25 |     </tr>
26 |     <tr>
27 |       <td rowspan="1" colspan="1">不適切な個人情報の取扱い</td>
28 |       <td rowspan="1" colspan="1">●透明性を欠く個人情報の利用及び個人情報の政治利用も問題視されている</td>
29 |     </tr>
30 |     <tr>
31 |       <td rowspan="1" colspan="1">生命、身体、財産の侵害</td>
32 |       <td rowspan="1" colspan="1">●AI が不適切な判断を下すことで、自動運転車が事故を引き起こし、生命や財産に深刻な損害<br/>を与える可能性がある<br/>●トリアージにおいては、AIが順位を決定する際に倫理的なバイアスを持つことで、公平性の<br/>喪失等が生じる可能性がある</td>
33 |     </tr>
34 |     <tr>
35 |       <td rowspan="1" colspan="1">データ汚染攻撃</td>
36 |       <td rowspan="1" colspan="1">●AIの学習実施時及びサービス運用時には学習データへの不正データ混入、サービス運用時で<br/>はアプリケーション自体を狙ったサイバー攻撃等のリスクが存在する</td>
37 |     </tr>
38 |     <tr>
39 |       <td rowspan="1" colspan="1">ブラックボックス化、判断に関する説明の要求</td>
40 |       <td rowspan="1" colspan="1">●AIの判断のブラックボックス化に起因する問題も生じている<br/>●AIの判断に関する透明性を求める動きも上がっている</td>
41 |     </tr>
42 |     <tr>
43 |       <td rowspan="1" colspan="1">エネルギー使用量及び環境の負荷</td>
44 |       <td rowspan="1" colspan="1">●AIの利用拡大により、計算リソースの需要も拡大しており、結果として、データセンターが<br/>増大しエネルギー使用量の増加が懸念されている</td>
45 |     </tr>
46 |     <tr>
47 |       <td rowspan="7" colspan="1">生成AIで<br/>特に顕在化<br/>したリスク</td>
48 |       <td rowspan="1" colspan="1">悪用</td>
49 |       <td rowspan="1" colspan="1">●AIの詐欺目的での利用も問題視されている</td>
50 |     </tr>
51 |     <tr>
52 |       <td rowspan="1" colspan="1">機密情報の流出</td>
53 |       <td rowspan="1" colspan="1">●AIの利用においては、個人情報や機密情報がプロンプトとして入力され、そのAIからの出力<br/>等を通じて流出してしまうリスクがある</td>
54 |     </tr>
55 |     <tr>
56 |       <td rowspan="1" colspan="1">ハルシネーション</td>
57 |       <td rowspan="1" colspan="1">●生成AIが事実と異なることをもっともらしく回答する「ハルシネーション」に関してはAI開<br/>発者・提供者への訴訟も起きている</td>
58 |     </tr>
59 |     <tr>
60 |       <td rowspan="1" colspan="1">偽情報、誤情報を鵜呑みにすること</td>
61 |       <td rowspan="1" colspan="1">●生成AIが生み出す誤情報を鵜呑みにすることがリスクとなりうる<br/>●ディープフェイクは、各国で悪用例が相次いでいる</td>
62 |     </tr>
63 |     <tr>
64 |       <td rowspan="1" colspan="1">著作権との関係</td>
65 |       <td rowspan="1" colspan="1">●知的財産権の取扱いへの議論が提起されている</td>
66 |     </tr>
67 |     <tr>
68 |       <td rowspan="1" colspan="1">資格等との関係</td>
69 |       <td rowspan="1" colspan="1">●生成AIの活用を通じた業法免許や資格等の侵害リスクも考えうる</td>
70 |     </tr>
71 |     <tr>
72 |       <td rowspan="1" colspan="1">バイアスの再生成</td>
73 |       <td rowspan="1" colspan="1">●生成AIは既存の情報に基づいて回答を作るため既存の情報に含まれる偏見を増幅し、不公平<br/>や差別的な出力が継続/拡大する可能性がある</td>
74 |     </tr>
75 |   </table>
76 |   <p/>
77 |   <h1>1 主要なLLMの概要</h1>
78 |   <p>(出典)「AI事業者ガイドライン(第1.0版)」別添(概要)</p>
79 |   <p>生成AIの基盤となる大規模言語モデル(LLM)の開発では、マイクロソフトやグーグルなど米<br/>国ビックテック企業などが先行している状況にある。</p>
80 |   <p>しかし、日本以外の企業·研究機関がクローズに研究開発を進めたLLM を活用するだけでは、<br/>LLM構築の過程がブラックボックス化してしまい、LLMを活用する際の権利侵害や情報漏えいな<br/>どの懸念を払拭できない。日本語に強いLLMの利活用のためには、構築の過程や用いるデータが<br/>明らかな、透明性の高い安心して利活用できる国産のLLM構築が必要となる*3。すでに日本の企業<br/>においても、独自にLLM開発に取り組んでおり、ここではその動向を紹介する。</p>
81 |   <p>ビッグテック企業が開発したLLMと比べると、日本では、中規模モデルのLLMが開発されてい<br/>る傾向が見られる(図表 I-4-1-2)。</p>
82 |   <p>*3 産業技術総合研究所プレスリリース「産総研の計算資源ABCIを用いて世界トップレベルの生成AIの開発を開始一産総研·東京工業大学·<br/>LLM-jp(国立情報学研究所主宰)が協力ー」(2023年10月17日), &lt; &lt; https://www.aist go.jp/aist_j/news/pr20231017.html&gt; (2022<br/>参照)</p>
83 |   <p>令和6年版 情報通信白書 第I部 47</p>
84 | </div>
85 | 


--------------------------------------------------------------------------------
/static/out/in_demo_p1.md:
--------------------------------------------------------------------------------
 1 | Al の進化に伴う課題と現状の取組
 2 | 
 3 | 第1節
 4 | 
 5 | 第4章<br>デジタルテクノロジーの課題と現状の対応策
 6 | 
 7 | |図表I\-4\-1\-1<br>生成AIの課題|||
 8 | |-|-|-|
 9 | |リスク||事例|
10 | |従来型AI<br>から存在<br>するリスク|バイアスのある結果及び差別的な結果の出力|●IT企業が自社で開発したAI人材採用システムが女性を差別するという機械学習面の欠陥を持<br>ち合わせていた|
11 | ||フィルターバブル及びエコーチェンバー現象|● SNS 等によるレコメンドを通じた社会の分断が生じている|
12 | ||多様性の喪失|●社会全体が同じモデルを、同じ温度感で使った場合、導かれる意見及び回答がLLMによって<br>収束してしまい、多様性が失われる可能性がある|
13 | ||不適切な個人情報の取扱い|●透明性を欠く個人情報の利用及び個人情報の政治利用も問題視されている|
14 | ||生命、身体、財産の侵害|●AI が不適切な判断を下すことで、自動運転車が事故を引き起こし、生命や財産に深刻な損害<br>を与える可能性がある<br>●トリアージにおいては、AIが順位を決定する際に倫理的なバイアスを持つことで、公平性の<br>喪失等が生じる可能性がある|
15 | ||データ汚染攻撃|●AIの学習実施時及びサービス運用時には学習データへの不正データ混入、サービス運用時で<br>はアプリケーション自体を狙ったサイバー攻撃等のリスクが存在する|
16 | ||ブラックボックス化、判断に関する説明の要求|●AIの判断のブラックボックス化に起因する問題も生じている<br>●AIの判断に関する透明性を求める動きも上がっている|
17 | ||エネルギー使用量及び環境の負荷|●AIの利用拡大により、計算リソースの需要も拡大しており、結果として、データセンターが<br>増大しエネルギー使用量の増加が懸念されている|
18 | |生成AIで<br>特に顕在化<br>したリスク|悪用|●AIの詐欺目的での利用も問題視されている|
19 | ||機密情報の流出|●AIの利用においては、個人情報や機密情報がプロンプトとして入力され、そのAIからの出力<br>等を通じて流出してしまうリスクがある|
20 | ||ハルシネーション|●生成AIが事実と異なることをもっともらしく回答する「ハルシネーション」に関してはAI開<br>発者・提供者への訴訟も起きている|
21 | ||偽情報、誤情報を鵜呑みにすること|●生成AIが生み出す誤情報を鵜呑みにすることがリスクとなりうる<br>●ディープフェイクは、各国で悪用例が相次いでいる|
22 | ||著作権との関係|●知的財産権の取扱いへの議論が提起されている|
23 | ||資格等との関係|●生成AIの活用を通じた業法免許や資格等の侵害リスクも考えうる|
24 | ||バイアスの再生成|●生成AIは既存の情報に基づいて回答を作るため既存の情報に含まれる偏見を増幅し、不公平<br>や差別的な出力が継続/拡大する可能性がある|
25 | 
26 | # 1 主要なLLMの概要
27 | 
28 | \(出典\)「AI事業者ガイドライン\(第1.0版\)」別添\(概要\)
29 | 
30 | 生成AIの基盤となる大規模言語モデル\(LLM\)の開発では、マイクロソフトやグーグルなど米<br>国ビックテック企業などが先行している状況にある。
31 | 
32 | しかし、日本以外の企業·研究機関がクローズに研究開発を進めたLLM を活用するだけでは、<br>LLM構築の過程がブラックボックス化してしまい、LLMを活用する際の権利侵害や情報漏えいな<br>どの懸念を払拭できない。日本語に強いLLMの利活用のためには、構築の過程や用いるデータが<br>明らかな、透明性の高い安心して利活用できる国産のLLM構築が必要となる\*3。すでに日本の企業<br>においても、独自にLLM開発に取り組んでおり、ここではその動向を紹介する。
33 | 
34 | ビッグテック企業が開発したLLMと比べると、日本では、中規模モデルのLLMが開発されてい<br>る傾向が見られる\(図表 I\-4\-1\-2\)。
35 | 
36 | \*3 産業技術総合研究所プレスリリース「産総研の計算資源ABCIを用いて世界トップレベルの生成AIの開発を開始一産総研·東京工業大学·<br>LLM\-jp\(国立情報学研究所主宰\)が協力ー」\(2023年10月17日\), < < https://www.aist go.jp/aist_j/news/pr20231017.html> \(2022<br>参照\)
37 | 
38 | 令和6年版 情報通信白書 第I部 47
39 | 


--------------------------------------------------------------------------------
/static/out/in_demo_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_demo_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_demo_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_demo_p1_ocr.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery1_p1.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 |   <p>TELEWORK TELEWORK TELEWORK</p>
 3 |   <p>WORK TELEWORK TELEWORK TELEWORK TELEWORK TELEWORK</p>
 4 |   <p/>
 5 |   <h1>テレワークのさらなる普及·定着に向け<br/>「テレワーク月間」<br/>を実施します!</h1>
 6 |   <p>今年度は育児・介護休業法の改正法が成立し、<br/>来年度以降は3歳未満の子を育てる社員がテレ<br/>ワークを選択できるように措置を講ずることが、<br/>事業主の努力義務になります。</p>
 7 |   <p>総務省では、地方や中小企業等を含め、テレ<br/>ワークの活用により、多様な働き手が活躍の機<br/>会を持てるよう、テレビ番組とのタイアップを<br/>通じて、テレワークの必要性を改めて実感でき<br/>るような情報発信を行います。また、全国の総<br/>合通信局等では、それぞれの地域における取組<br/>やテレワーク活用を進める先進企業を紹介する<br/>催しを実施します。</p>
 8 |   <p>2 MIC 2024 November Vol.287</p>
 9 |   <p>総務省は、内閣官房、内閣府、デジタル庁、<br/>厚生労働省、経済産業省、国土交通省、観光庁、<br/>環境省、日本テレワーク協会、日本テレワーク<br/>学会と連携して、11月をテレワーク月間とし、<br/>テレワークのさらなる普及·定着に向けた各種<br/>イベント等を集中的に開催します。テレワーク<br/>月間期間中は駅構内やイベント会場等にポス<br/>ターを掲出するほか、ホームページ(※)にて、テ<br/>レワーク実施団体·実施者の登録受付や、テレ<br/>ワーク活用に関するトピック·コンテンツの掲<br/>載、各種イベントに係る情報発信等を行います。</p>
10 |   <p>令和6年度テレワーク月間ポスター</p>
11 |   <p>特集)</p>
12 |   <p>テレワークのさらなる普及・定着に向け「テレワーク月間」を実施します! TELEWORK</p>
13 |   <p>テレワーク月間最終週の11月25日(月)には、内閣府、総務省、厚生労働省、<br/>経済産業省、国土交通省主催で「働く、を変える」テレワークイベントを開催し、<br/>テレワークトップランナー 2024 (総務大臣賞)、輝くテレワーク大賞(厚生労<br/>働大臣)および地方創生テレワークアワード(地方創生担当大臣賞)の合同表彰<br/>式を実施予定です。</p>
14 |   <p>総務省では、平成27年度から、テレワークの導入·活用を進めている企業・<br/>団体を「テレワーク先駆者」とし、その中から十分な実績を持つ企業・団体等を<br/>「テレワーク先駆者百選」として公表するとともに、平成28年度には「テレワー<br/>ク先駆者百選 総務大臣賞」を創設し、「テレワーク先駆者百選」の中から特に優<br/>れた取組を表彰してきました。</p>
15 |   <p>新型コロナウイルス感染症の拡大に際して、企業·団体等においてテレワー<br/>クの導入が進んだ経緯を踏まえ、令和5年度からは、名称や一部の審査基準を<br/>見直したうえで、新たに「テレワークトップランナー」として先進企業の公表、<br/>表彰を開始しました。本年は、テレワークの活用による経営効果の発揮やテレ<br/>ワーク導入が馴染まないと思われている業態の企業におけるテレワーク活用·<br/>業務改革等について、特色ある優れた取組等を実施している企業·団体を「テ<br/>レワークトップランナー 2024」として選定·公表し、その中から特に優れた取<br/>組を「テレワークトップランナー 2024総務大臣賞」として表彰します 。※ 今年度の<br/>表彰団体の募集は、すでに終了しています。</p>
16 |   <p>表彰式は、会場(御茶ノ水ソラシティ、東京都)での観覧の他、オンライン配<br/>信も実施予定です。</p>
17 |   <p>テレワークの必要性について考え直すきっかけとなるよう、各種イベントに<br/>ご参加いただくとともに、テレワーク月間実施団体·実施者としての登録の呼<br/>びかけについても、是非、ご協力をお願いします。</p>
18 |   <p/>
19 |   <h1>テレワークトップランナー 2023 総務大臣賞受賞企業</h1>
20 |   <table border="1" style="border-collapse: collapse">
21 |     <tr>
22 |       <td rowspan="1" colspan="1">企業名<br/>(五十音順)</td>
23 |       <td rowspan="1" colspan="1">業種、所在地、<br/>従業員数</td>
24 |       <td rowspan="1" colspan="1">取組の特徴</td>
25 |     </tr>
26 |     <tr>
27 |       <td rowspan="1" colspan="1">アフラック生命<br/>保険株式会社</td>
28 |       <td rowspan="1" colspan="1">金融 ·保険業<br/>東京都、4,910人</td>
29 |       <td rowspan="1" colspan="1">・20-30 代の女性の離職率が半減、育児に関わる短時間勤務制度を利用<br/>する社員の人数が27.9%減少(フルタイムの増加)など、女性の仕事<br/>と家庭の両立、キャリア形成に寄与<br/>·通勤手当 39.3%削減、紙帳票のペーパレス化等により、コストダウンの<br/>効果も顕在化</td>
30 |     </tr>
31 |     <tr>
32 |       <td rowspan="1" colspan="1">株式会社<br/>キャリア・マム</td>
33 |       <td rowspan="1" colspan="1">サービス業<br/>東京都、38人</td>
34 |       <td rowspan="1" colspan="1">·11万人の主婦会員のうち、年間約3,000 人の地方在住テレワーカーに<br/>業務発注を行い、就労支援を実施するとともに、在宅ワーカーとしての<br/>人材育成を実施<br/>・地方自治体と連携し、地域でのテレワーカーの創出、テレワーク活用の<br/>裾野拡大に貢献</td>
35 |     </tr>
36 |     <tr>
37 |       <td rowspan="1" colspan="1">シェイプウィン<br/>株式会社</td>
38 |       <td rowspan="1" colspan="1">専門・<br/>技術サービス業<br/>東京都、17人</td>
39 |       <td rowspan="1" colspan="1">・フルリモートでの勤務も可能とし、求人応募数が約7倍に増加。東京<br/>では人材獲得の競争が激しい PR·マーケティング系の専門人材を地方<br/>や海外から採用<br/>·離職率は約80%から約 14%まで低下</td>
40 |     </tr>
41 |     <tr>
42 |       <td rowspan="1" colspan="1">株式会社<br/>スタッフサービス・<br/>クラウドワーク</td>
43 |       <td rowspan="1" colspan="1">サービス業<br/>神奈川県、454人</td>
44 |       <td rowspan="1" colspan="1">・通勤が困難な 454 名の重度身体障がい者の雇用を創出。入社1年後の<br/>定着率は 97.3%<br/>・入社後、配属前2ヶ月間のコミュニケーション研修を実施する他、自<br/>主性を重んじた1日3回の定時ミーティングにより、社員による主体的<br/>なチーム運営を実現</td>
45 |     </tr>
46 |     <tr>
47 |       <td rowspan="1" colspan="1">株式会社<br/>テレワーク<br/>マネジメント</td>
48 |       <td rowspan="1" colspan="1">専門・<br/>技術サービス業<br/>北海道、11人</td>
49 |       <td rowspan="1" colspan="1">・社内SNS及びバーチャルオフィスにて社内コミュニケーションを統一し、<br/>全員が同じルールの下で活用することを徹底することで、効率の良い意<br/>思疎通、社員の一体感の醸成を図っている<br/>·簡単な操作で細かく労働時間を記録できるシステムの運用により、フェ<br/>アに働ける環境を実現</td>
50 |     </tr>
51 |     <tr>
52 |       <td rowspan="1" colspan="1">株式会社<br/>プログレス</td>
53 |       <td rowspan="1" colspan="1">情報通信業<br/>東京都、86人</td>
54 |       <td rowspan="1" colspan="1">·コミュニケーションの促進に向け、対面でのチームビルディング、バーチャ<br/>ルオフィスの活用等、多数の取り組みを進める他、コミュニケーション<br/>時の留意点等を示したガイドを全社員に向けて公開<br/>·リーダー間でのメンバー状態の共有、社員への毎月サーベイを実施し、<br/>社員の変化に対し、きめ細やか且つ早期にフォローできる仕組みを構築</td>
55 |     </tr>
56 |   </table>
57 |   <p>主催者代表挨拶<br/>(小森総務大臣政務官)</p>
58 |   <p>表彰状授与</p>
59 |   <p>受賞企業集合写真(3府省合同)</p>
60 |   <p>テレワークトップランナー 2023<br/>ロゴマーク</p>
61 |   <p>テレワーク月間関連イベントの様子</p>
62 |   <p>※総務省主催セミナー「ニューノー<br/>マル時代に求められる働き方·<br/>環境整備の実態」</p>
63 |   <p>セミナーのアーカイブ動画を、<br/>テレワーク月間ホームページに<br/>て公開しております。</p>
64 |   <p>https://teleworkgekkan.go.jp/events/<br/>telework-seminar-202403.html</p>
65 |   <p>2024 November Vol.287 MIC 3</p>
66 | </div>
67 | 


--------------------------------------------------------------------------------
/static/out/in_gallery1_p1.md:
--------------------------------------------------------------------------------
 1 | TELEWORK TELEWORK TELEWORK
 2 | 
 3 | WORK TELEWORK TELEWORK TELEWORK TELEWORK TELEWORK
 4 | 
 5 | # テレワークのさらなる普及·定着に向け<br>「テレワーク月間」<br>を実施します\!
 6 | 
 7 | 今年度は育児・介護休業法の改正法が成立し、<br>来年度以降は3歳未満の子を育てる社員がテレ<br>ワークを選択できるように措置を講ずることが、<br>事業主の努力義務になります。
 8 | 
 9 | 総務省では、地方や中小企業等を含め、テレ<br>ワークの活用により、多様な働き手が活躍の機<br>会を持てるよう、テレビ番組とのタイアップを<br>通じて、テレワークの必要性を改めて実感でき<br>るような情報発信を行います。また、全国の総<br>合通信局等では、それぞれの地域における取組<br>やテレワーク活用を進める先進企業を紹介する<br>催しを実施します。
10 | 
11 | 2 MIC 2024 November Vol.287
12 | 
13 | 総務省は、内閣官房、内閣府、デジタル庁、<br>厚生労働省、経済産業省、国土交通省、観光庁、<br>環境省、日本テレワーク協会、日本テレワーク<br>学会と連携して、11月をテレワーク月間とし、<br>テレワークのさらなる普及·定着に向けた各種<br>イベント等を集中的に開催します。テレワーク<br>月間期間中は駅構内やイベント会場等にポス<br>ターを掲出するほか、ホームページ\(※\)にて、テ<br>レワーク実施団体·実施者の登録受付や、テレ<br>ワーク活用に関するトピック·コンテンツの掲<br>載、各種イベントに係る情報発信等を行います。
14 | 
15 | 令和6年度テレワーク月間ポスター
16 | 
17 | 特集\)
18 | 
19 | テレワークのさらなる普及・定着に向け「テレワーク月間」を実施します\! TELEWORK
20 | 
21 | テレワーク月間最終週の11月25日\(月\)には、内閣府、総務省、厚生労働省、<br>経済産業省、国土交通省主催で「働く、を変える」テレワークイベントを開催し、<br>テレワークトップランナー 2024 \(総務大臣賞\)、輝くテレワーク大賞\(厚生労<br>働大臣\)および地方創生テレワークアワード\(地方創生担当大臣賞\)の合同表彰<br>式を実施予定です。
22 | 
23 | 総務省では、平成27年度から、テレワークの導入·活用を進めている企業・<br>団体を「テレワーク先駆者」とし、その中から十分な実績を持つ企業・団体等を<br>「テレワーク先駆者百選」として公表するとともに、平成28年度には「テレワー<br>ク先駆者百選 総務大臣賞」を創設し、「テレワーク先駆者百選」の中から特に優<br>れた取組を表彰してきました。
24 | 
25 | 新型コロナウイルス感染症の拡大に際して、企業·団体等においてテレワー<br>クの導入が進んだ経緯を踏まえ、令和5年度からは、名称や一部の審査基準を<br>見直したうえで、新たに「テレワークトップランナー」として先進企業の公表、<br>表彰を開始しました。本年は、テレワークの活用による経営効果の発揮やテレ<br>ワーク導入が馴染まないと思われている業態の企業におけるテレワーク活用·<br>業務改革等について、特色ある優れた取組等を実施している企業·団体を「テ<br>レワークトップランナー 2024」として選定·公表し、その中から特に優れた取<br>組を「テレワークトップランナー 2024総務大臣賞」として表彰します 。※ 今年度の<br>表彰団体の募集は、すでに終了しています。
26 | 
27 | 表彰式は、会場\(御茶ノ水ソラシティ、東京都\)での観覧の他、オンライン配<br>信も実施予定です。
28 | 
29 | テレワークの必要性について考え直すきっかけとなるよう、各種イベントに<br>ご参加いただくとともに、テレワーク月間実施団体·実施者としての登録の呼<br>びかけについても、是非、ご協力をお願いします。
30 | 
31 | # テレワークトップランナー 2023 総務大臣賞受賞企業
32 | 
33 | |企業名<br>\(五十音順\)|業種、所在地、<br>従業員数|取組の特徴|
34 | |-|-|-|
35 | |アフラック生命<br>保険株式会社|金融 ·保険業<br>東京都、4,910人|・20\-30 代の女性の離職率が半減、育児に関わる短時間勤務制度を利用<br>する社員の人数が27.9%減少\(フルタイムの増加\)など、女性の仕事<br>と家庭の両立、キャリア形成に寄与<br>·通勤手当 39.3%削減、紙帳票のペーパレス化等により、コストダウンの<br>効果も顕在化|
36 | |株式会社<br>キャリア・マム|サービス業<br>東京都、38人|·11万人の主婦会員のうち、年間約3,000 人の地方在住テレワーカーに<br>業務発注を行い、就労支援を実施するとともに、在宅ワーカーとしての<br>人材育成を実施<br>・地方自治体と連携し、地域でのテレワーカーの創出、テレワーク活用の<br>裾野拡大に貢献|
37 | |シェイプウィン<br>株式会社|専門・<br>技術サービス業<br>東京都、17人|・フルリモートでの勤務も可能とし、求人応募数が約7倍に増加。東京<br>では人材獲得の競争が激しい PR·マーケティング系の専門人材を地方<br>や海外から採用<br>·離職率は約80%から約 14%まで低下|
38 | |株式会社<br>スタッフサービス・<br>クラウドワーク|サービス業<br>神奈川県、454人|・通勤が困難な 454 名の重度身体障がい者の雇用を創出。入社1年後の<br>定着率は 97.3%<br>・入社後、配属前2ヶ月間のコミュニケーション研修を実施する他、自<br>主性を重んじた1日3回の定時ミーティングにより、社員による主体的<br>なチーム運営を実現|
39 | |株式会社<br>テレワーク<br>マネジメント|専門・<br>技術サービス業<br>北海道、11人|・社内SNS及びバーチャルオフィスにて社内コミュニケーションを統一し、<br>全員が同じルールの下で活用することを徹底することで、効率の良い意<br>思疎通、社員の一体感の醸成を図っている<br>·簡単な操作で細かく労働時間を記録できるシステムの運用により、フェ<br>アに働ける環境を実現|
40 | |株式会社<br>プログレス|情報通信業<br>東京都、86人|·コミュニケーションの促進に向け、対面でのチームビルディング、バーチャ<br>ルオフィスの活用等、多数の取り組みを進める他、コミュニケーション<br>時の留意点等を示したガイドを全社員に向けて公開<br>·リーダー間でのメンバー状態の共有、社員への毎月サーベイを実施し、<br>社員の変化に対し、きめ細やか且つ早期にフォローできる仕組みを構築|
41 | 
42 | 主催者代表挨拶<br>\(小森総務大臣政務官\)
43 | 
44 | 表彰状授与
45 | 
46 | 受賞企業集合写真\(3府省合同\)
47 | 
48 | テレワークトップランナー 2023<br>ロゴマーク
49 | 
50 | テレワーク月間関連イベントの様子
51 | 
52 | ※総務省主催セミナー「ニューノー<br>マル時代に求められる働き方·<br>環境整備の実態」
53 | 
54 | セミナーのアーカイブ動画を、<br>テレワーク月間ホームページに<br>て公開しております。
55 | 
56 | https://teleworkgekkan.go.jp/events/<br>telework\-seminar\-202403.html
57 | 
58 | 2024 November Vol.287 MIC 3
59 | 


--------------------------------------------------------------------------------
/static/out/in_gallery1_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery1_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery1_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery1_p1_ocr.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery2_p1.md:
--------------------------------------------------------------------------------
 1 | 復興特 1
 2 | 
 3 | # 令 和 7 年 度 歳 出 概 算 要 求 額 総 表
 4 | 
 5 | # 9101東 日本大震災復興特別会計 \(総務省\)
 6 | 
 7 | \(単位:千円\)
 8 | 
 9 | |||||||||||||
10 | |-|-|-|-|-|-|-|-|-|-|-|-|
11 | |要求<br>番号|区<br>分|前 年 度 予 算 額|||7 年 度 概 算 要 求 額||||||明細書<br>頁数|
12 | |||一般行政経費|その他の経費|計<br>\(A\)|一般行政経費|その他の経費|計<br>\(日\)|||||
13 | |1<br>2|01 東 日 本 大 震 災 復 興|0|57,263,734|57,263,734|0|898,529|898,529|||△<br>56,365,205|3|
14 | ||20 総 務 省|0|56,973,678|56,973,678|0|0|0|||△<br>56,973,678|3|
15 | ||010 総 務 本 省|0|56,973,678|56,973,678|0|0|0|||56,973,678|3|
16 | ||228 地 方 交 付 税 交 付 金|0|56,973,678|56,973,678|0|0|0|||56,973,678|3|
17 | ||01\-31 地方交付税交付金財源の交付税及び譲与税配<br>付金特別会計へ繰入れに必要な経費|0|56,973,678|56,973,678|0|0|0|||56,973,678|3|
18 | ||31 復<br>興<br>庁|0|290,056|290,056|0|898,529|898,529|||608,473|3|
19 | ||010 復<br>庁|0|290,056|290,056|0|898,529|898,529|||608,473|3|
20 | ||213 生 活 基 盤 行 政 復 興 政策 費|0|269,131|269,131|0|468,183|468,183|||199,052|3|
21 | |2|01\-95 情報通信技術の利活用高度化に必要な経費|0|1,255|1,255|0|1,255|1,255|||0|3|
22 | |3|05\-95 情報通信技術の利用環境整備に必要な経費|0|114,697|114,697|0|107,166|107,166|||△<br>7,531|3|
23 | |4|30\-95 消防防災体制等の整備に必要な経費|0|153,179|153,179|0|359,762|359,762|||206,583|3|
24 | ||225 生 活 基 盤 行 政 復 興 事 業 費|0|20,925|20,925|0|430,346|430,346|||409,421|4|
25 | |5|01\-95 消防防災体制等の整備に必要な経費|0|20,925|20,925|0|430,346|430,346|||409,421|4|
26 | ||計<br>計|0|57,263,734|57,263,734|0|898,529|898,529|||△<br>56,365,205||
27 | 


--------------------------------------------------------------------------------
/static/out/in_gallery2_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery2_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery2_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery2_p1_ocr.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery3_p1.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 |   <p>特集 文教·科学技術施策の動向と展開</p>
 3 |   <p/>
 4 |   <h1>2019年度税制改正の概要</h1>
 5 |   <p/>
 6 |   <h1>1.教育資金の一括贈与に係る贈与税の非課税措置<br/>の拡充</h1>
 7 |   <p>祖父母等から孫等に対して教育資金を一括贈与し<br/>た場合の贈与税の非課税措置について、以下の見直<br/>しを行った上で、適用期限を2年延長することが認<br/>められました(2021年3月31日まで)。</p>
 8 |   <p>○教育資金管理契約の終了年齢を、従来の30歳から、<br/>在学中であることを条件に40歳まで引き上げる</p>
 9 |   <p/>
10 |   <h1>文部科学省大臣官房政策課</h1>
11 |   <p>○3歳以上の孫等の教育資金の範囲を、学校等や教<br/>育訓練給付の支給対象となる教育訓練に係る費用<br/>に限定(習い事等は対象外)。</p>
12 |   <p>○贈与から3年以内に祖父母等が亡くなった場合、<br/>孫等が23歳以上であれば贈与の残額を相続財産に<br/>加算する(在学中の場合を除く)。</p>
13 |   <p>これにより、世代間の資産移転を通じた教育負担<br/>の軽減を引き続き促進してまいります。</p>
14 |   <p/>
15 |   <h1>2.特定の学資としての資金の貸付けに係る消費貸借<br/>契約書の印紙税の非課税措置の延長</h1>
16 |   <p>公益法人・学校法人等が実施する、経済的理由に<br/>より修学困難な生徒又は学生に対する無利息等の条<br/>件で行われる奨学金貸与事業の借用証書等に係る印<br/>紙税の非課税措置について、その適用期限を3年延<br/>長することが認められました(2022年3月31日<br/>まで)。</p>
17 |   <p>これにより、引き続き、奨学金貸与に係る学生の<br/>負担軽減を図ってまいります。</p>
18 |   <p/>
19 |   <h1>3.試験研究を行った場合の法人税額等の特別控除<br/>の延長及び拡充</h1>
20 |   <p>民間企業の研究開発投資の維持·拡大に貢献し、<br/>競争力を強化するため、民間企業が試験研究を行っ<br/>た場合に法人税額等の控除を受けられる研究開発税<br/>制について、以下の5点が認められました。</p>
21 |   <p>1ベンチャー企業の総額型の控除上限について法</p>
22 |   <p>人税額の40%(現行:25%)に引上げ</p>
23 |   <p>2オープンイノベーション型における研究開発型<br/>ベンチャーとの共同研究における控除率を25%<br/>(現行:20%)に引き上げるとともに控除上限<br/>を10%(現行:5%)に引上げ</p>
24 |   <p>3総額型の控除率の上限を14%(原則:10%)と<br/>する特例の適用期限を2年延長</p>
25 |   <p>5大学等との共同研究に係る費用について、研究開発のプロジェクトマネジメント業務等を担うURA(リサーチ·アドミニストレータ)の人件費の<br/>適用を明確化</p>
26 |   <p>4試験研究費の対売上比率が10%を超えた場合の控除上限の上乗せ措置の簡素化</p>
27 |   <p/>
28 |   <h1>(その他)</h1>
29 |   <p>9</p>
30 |   <p>文部科学広報 No.233 2019年4月号</p>
31 | </div>
32 | 


--------------------------------------------------------------------------------
/static/out/in_gallery3_p1.md:
--------------------------------------------------------------------------------
 1 | 特集 文教·科学技術施策の動向と展開
 2 | 
 3 | # 2019年度税制改正の概要
 4 | 
 5 | # 1.教育資金の一括贈与に係る贈与税の非課税措置<br>の拡充
 6 | 
 7 | 祖父母等から孫等に対して教育資金を一括贈与し<br>た場合の贈与税の非課税措置について、以下の見直<br>しを行った上で、適用期限を2年延長することが認<br>められました\(2021年3月31日まで\)。
 8 | 
 9 | ○教育資金管理契約の終了年齢を、従来の30歳から、<br>在学中であることを条件に40歳まで引き上げる
10 | 
11 | # 文部科学省大臣官房政策課
12 | 
13 | ○3歳以上の孫等の教育資金の範囲を、学校等や教<br>育訓練給付の支給対象となる教育訓練に係る費用<br>に限定\(習い事等は対象外\)。
14 | 
15 | ○贈与から3年以内に祖父母等が亡くなった場合、<br>孫等が23歳以上であれば贈与の残額を相続財産に<br>加算する\(在学中の場合を除く\)。
16 | 
17 | これにより、世代間の資産移転を通じた教育負担<br>の軽減を引き続き促進してまいります。
18 | 
19 | # 2.特定の学資としての資金の貸付けに係る消費貸借<br>契約書の印紙税の非課税措置の延長
20 | 
21 | 公益法人・学校法人等が実施する、経済的理由に<br>より修学困難な生徒又は学生に対する無利息等の条<br>件で行われる奨学金貸与事業の借用証書等に係る印<br>紙税の非課税措置について、その適用期限を3年延<br>長することが認められました\(2022年3月31日<br>まで\)。
22 | 
23 | これにより、引き続き、奨学金貸与に係る学生の<br>負担軽減を図ってまいります。
24 | 
25 | # 3.試験研究を行った場合の法人税額等の特別控除<br>の延長及び拡充
26 | 
27 | 民間企業の研究開発投資の維持·拡大に貢献し、<br>競争力を強化するため、民間企業が試験研究を行っ<br>た場合に法人税額等の控除を受けられる研究開発税<br>制について、以下の5点が認められました。
28 | 
29 | 1ベンチャー企業の総額型の控除上限について法
30 | 
31 | 人税額の40%\(現行:25%\)に引上げ
32 | 
33 | 2オープンイノベーション型における研究開発型<br>ベンチャーとの共同研究における控除率を25%<br>\(現行:20%\)に引き上げるとともに控除上限<br>を10%\(現行:5%\)に引上げ
34 | 
35 | 3総額型の控除率の上限を14%\(原則:10%\)と<br>する特例の適用期限を2年延長
36 | 
37 | 5大学等との共同研究に係る費用について、研究開発のプロジェクトマネジメント業務等を担うURA\(リサーチ·アドミニストレータ\)の人件費の<br>適用を明確化
38 | 
39 | 4試験研究費の対売上比率が10%を超えた場合の控除上限の上乗せ措置の簡素化
40 | 
41 | # \(その他\)
42 | 
43 | 9
44 | 
45 | 文部科学広報 No.233 2019年4月号
46 | 


--------------------------------------------------------------------------------
/static/out/in_gallery3_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery3_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery3_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery3_p1_ocr.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery4_p1.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 |   <p/>
 3 |   <h1>たまごのお店<br/>むこたま</h1>
 4 |   <p>むこたま 柏の葉店</p>
 5 |   <p>〒2770871</p>
 6 |   <p>千葉県柏市若柴186番地 中央146街区1<br/>ARAGE 112<br/>KOIL LINK G</p>
 7 |   <p>TEL:0471-28-8905</p>
 8 |   <p>https://www.mukodai.com</p>
 9 |   <p>登録番号:T7040002093726</p>
10 |   <p>端末番号:6B4A</p>
11 |   <p>AM9:00~PM7:00</p>
12 |   <p>お買い上げ、誠にありがとうございます。</p>
13 |   <p>またのお越しをお待ちしております。</p>
14 |   <p>2024-11-13 13:26:15</p>
15 |   <p/>
16 |   <h1>むこたまソフト ※</h1>
17 |   <p>¥529</p>
18 |   <p>端末取引ID:50631</p>
19 |   <p>小計<br/>¥529</p>
20 |   <p>合計<br/>¥529</p>
21 |   <p>内消費税<br/>(¥39)</p>
22 |   <p>(8%対象 ¥529 内消費税 ¥39)</p>
23 |   <p>合計点数<br/>1点</p>
24 |   <p>お預り金額<br/>¥1,000</p>
25 |   <p>注) ※は軽減税率(8%)適用</p>
26 |   <p>お釣り<br/>¥471</p>
27 |   <p>オンラインでもご購入いただけます!</p>
28 |   <p>http://www.mukotama.com/</p>
29 |   <p>No. 7314719750041</p>
30 | </div>
31 | 


--------------------------------------------------------------------------------
/static/out/in_gallery4_p1.md:
--------------------------------------------------------------------------------
 1 | # たまごのお店<br>むこたま
 2 | 
 3 | むこたま 柏の葉店
 4 | 
 5 | 〒2770871
 6 | 
 7 | ARAGE 112<br>千葉県柏市若柴186番地 中央146街区1<br>KOIL LINK G
 8 | 
 9 | TEL:0471\-28\-8905
10 | 
11 | https://www.mukodai.com
12 | 
13 | 登録番号:T7040002093726
14 | 
15 | 端末番号:6B4A
16 | 
17 | AM9:00\~PM7:00
18 | 
19 | お買い上げ、誠にありがとうございます。
20 | 
21 | またのお越しをお待ちしております。
22 | 
23 | 2024\-11\-13 13:26:15
24 | 
25 | 端末取引ID:50631
26 | 
27 | # むこたまソフト ※
28 | 
29 | ¥529
30 | 
31 | 小計<br>¥529
32 | 
33 | 合計<br>¥529
34 | 
35 | 内消費税<br>\(¥39\)
36 | 
37 | \(8%対象 ¥529 内消費税 ¥39\)
38 | 
39 | 合計点数<br>1点
40 | 
41 | お預り金額<br>¥1,000
42 | 
43 | 注\) ※は軽減税率\(8%\)適用
44 | 
45 | お釣り<br>¥471
46 | 
47 | オンラインでもご購入いただけます\!
48 | 
49 | http://www.mukotama.com/
50 | 
51 | No. 7314719750041
52 | 


--------------------------------------------------------------------------------
/static/out/in_gallery4_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery4_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery4_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery4_p1_ocr.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery5_p1.html:
--------------------------------------------------------------------------------
1 | <div>
2 |   <p/>
3 |   <h1>氏名<br/>日 本 花 子<br/>昭和61年 5月 1日生</h1>
4 | </div>
5 | 


--------------------------------------------------------------------------------
/static/out/in_gallery5_p1.md:
--------------------------------------------------------------------------------
1 | # 氏名<br>日 本 花 子<br>昭和61年 5月 1日生
2 | 


--------------------------------------------------------------------------------
/static/out/in_gallery5_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery5_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery5_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery5_p1_ocr.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery6_p1.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 |   <p>(1)</p>
 3 |   <table border="1" style="border-collapse: collapse">
 4 |     <tr>
 5 |       <td rowspan="1" colspan="2">栄養成分表示(1袋50gあたり)</td>
 6 |     </tr>
 7 |     <tr>
 8 |       <td rowspan="1" colspan="1">エネルギー</td>
 9 |       <td rowspan="1" colspan="1">101kcal</td>
10 |     </tr>
11 |     <tr>
12 |       <td rowspan="1" colspan="1">たんぱく質</td>
13 |       <td rowspan="1" colspan="1">√<br/>13.6g</td>
14 |     </tr>
15 |     <tr>
16 |       <td rowspan="1" colspan="1">脂質</td>
17 |       <td rowspan="1" colspan="1">0.2g<br/>1</td>
18 |     </tr>
19 |     <tr>
20 |       <td rowspan="1" colspan="1">炭水化物</td>
21 |       <td rowspan="1" colspan="1">13.6g</td>
22 |     </tr>
23 |     <tr>
24 |       <td rowspan="1" colspan="1">糖 質</td>
25 |       <td rowspan="1" colspan="1">8.8g</td>
26 |     </tr>
27 |     <tr>
28 |       <td rowspan="1" colspan="1">食物繊維</td>
29 |       <td rowspan="1" colspan="1">4.8g</td>
30 |     </tr>
31 |     <tr>
32 |       <td rowspan="1" colspan="1">食塩相当量</td>
33 |       <td rowspan="1" colspan="1">11.9g</td>
34 |     </tr>
35 |     <tr>
36 |       <td rowspan="1" colspan="1">カリウム</td>
37 |       <td rowspan="1" colspan="1">523mg</td>
38 |     </tr>
39 |     <tr>
40 |       <td rowspan="1" colspan="1">カルシウム</td>
41 |       <td rowspan="1" colspan="1">83mg</td>
42 |     </tr>
43 |   </table>
44 |   <p>ごみを出すときは<br/>各市町村の区分<br/>に従ってください</p>
45 |   <p>4 901159 304208</p>
46 |   <p>●お客様相談室<br/>0120-041-965<br/>受付9:00~17:00<br/>(土·日·祝日を除く)</p>
47 |   <table border="1" style="border-collapse: collapse">
48 |     <tr>
49 |       <td rowspan="1" colspan="1">(1)<br/>名 称</td>
50 |       <td rowspan="1" colspan="1">塩こんぶ<br/>(1)</td>
51 |     </tr>
52 |     <tr>
53 |       <td rowspan="1" colspan="1">原材料名</td>
54 |       <td rowspan="1" colspan="1">昆布(北海道産)、醤油(大豆·小麦<br/>を含む)、食塩、醤油加工品(大豆·<br/>小麦を含む)/調味料(アミノ酸<br/>等)、甘味料(ソルビトール、甘草)、<br/>増粘多糖類</td>
55 |     </tr>
56 |     <tr>
57 |       <td rowspan="1" colspan="1">内容量</td>
58 |       <td rowspan="1" colspan="1">50g</td>
59 |     </tr>
60 |     <tr>
61 |       <td rowspan="1" colspan="1">賞味期限</td>
62 |       <td rowspan="1" colspan="1">表面下部に記載</td>
63 |     </tr>
64 |     <tr>
65 |       <td rowspan="1" colspan="1">保存方法</td>
66 |       <td rowspan="1" colspan="1">直射日光、高温多湿を避け常温で<br/>保存してください。</td>
67 |     </tr>
68 |     <tr>
69 |       <td rowspan="1" colspan="1">製造者</td>
70 |       <td rowspan="1" colspan="1">株式会社くらこん<br/>〒573-1132<br/>大阪府枚方市招提田近2-1-3<br/>(製造所固有記号は賞味期限の後に記載)</td>
71 |     </tr>
72 |   </table>
73 |   <p>&lt;食物アレルギーをお持ちのお客様へ&gt;<br/>食品表示法によるアレルギー物質を含む食品28品目について、以下に記載いたしました。</p>
74 |   <p>●使用している食品<br/>小麦・大豆</p>
75 |   <p>※1 この商品に含まれる全ての小麦は醤油由来です。<br/>※2 同じ製造ラインで、乳成分を含む製品を製造しています。</p>
76 |   <p>えび・かに・くるみ・そば・卵・乳成分(上記※2参照)・落花生(ピーナッツ)・アー<br/>モンド・あわび・いか・いくら・オレンジ・カシューナッツ・キライフルーツ・牛肉・<br/>ごま・さけ・さば・鶏肉・バナナ・豚肉・まつたけ・もも・やまいも・りんご・ゼラチン</p>
77 |   <p>●使用していない食品</p>
78 | </div>
79 | 


--------------------------------------------------------------------------------
/static/out/in_gallery6_p1.md:
--------------------------------------------------------------------------------
 1 | \(1\)
 2 | 
 3 | |栄養成分表示\(1袋50gあたり\)||
 4 | |-|-|
 5 | |エネルギー|101kcal|
 6 | |たんぱく質|√<br>13.6g|
 7 | |脂質|0.2g<br>1|
 8 | |炭水化物|13.6g|
 9 | |糖 質|8.8g|
10 | |食物繊維|4.8g|
11 | |食塩相当量|11.9g|
12 | |カリウム|523mg|
13 | |カルシウム|83mg|
14 | 
15 | ごみを出すときは<br>各市町村の区分<br>に従ってください
16 | 
17 | 4 901159 304208
18 | 
19 | ●お客様相談室<br>0120\-041\-965<br>受付9:00\~17:00<br>\(土·日·祝日を除く\)
20 | 
21 | |\(1\)<br>名 称|塩こんぶ<br>\(1\)|
22 | |-|-|
23 | |原材料名|昆布\(北海道産\)、醤油\(大豆·小麦<br>を含む\)、食塩、醤油加工品\(大豆·<br>小麦を含む\)/調味料\(アミノ酸<br>等\)、甘味料\(ソルビトール、甘草\)、<br>増粘多糖類|
24 | |内容量|50g|
25 | |賞味期限|表面下部に記載|
26 | |保存方法|直射日光、高温多湿を避け常温で<br>保存してください。|
27 | |製造者|株式会社くらこん<br>〒573\-1132<br>大阪府枚方市招提田近2\-1\-3<br>\(製造所固有記号は賞味期限の後に記載\)|
28 | 
29 | <食物アレルギーをお持ちのお客様へ><br>食品表示法によるアレルギー物質を含む食品28品目について、以下に記載いたしました。
30 | 
31 | ●使用している食品<br>小麦・大豆
32 | 
33 | ※1 この商品に含まれる全ての小麦は醤油由来です。<br>※2 同じ製造ラインで、乳成分を含む製品を製造しています。
34 | 
35 | えび・かに・くるみ・そば・卵・乳成分\(上記※2参照\)・落花生\(ピーナッツ\)・アー<br>モンド・あわび・いか・いくら・オレンジ・カシューナッツ・キライフルーツ・牛肉・<br>ごま・さけ・さば・鶏肉・バナナ・豚肉・まつたけ・もも・やまいも・りんご・ゼラチン
36 | 
37 | ●使用していない食品
38 | 


--------------------------------------------------------------------------------
/static/out/in_gallery6_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery6_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery6_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery6_p1_ocr.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery7_p1.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 |   <img src="figures/in_gallery7_p1_figure_0.png" width="200"/>
 3 |   <br/>
 4 |   <p>国民民主党<br/>Democratic Party For the People</p>
 5 |   <p/>
 6 |   <h1>103万円の壁」引き上げ結局どうなった?!</h1>
 7 |   <p>↑</p>
 8 |   <p>·年末の与党案(123万案)<br/>・2月の新与党案</p>
 9 |   <p>+10万円</p>
10 |   <p>+37万円</p>
11 |   <p>+10万円</p>
12 |   <p>恒次措置</p>
13 |   <p>累進税率<br/>10%<br/>20%</p>
14 |   <p>5%</p>
15 |   <p>+30万円</p>
16 |   <p>+10万円;+5万円</p>
17 |   <p>給与所得控除</p>
18 |   <p>基礎控除58万円</p>
19 |   <p>課税対象所得</p>
20 |   <p>&gt;2年間限定</p>
21 |   <p>収入</p>
22 |   <p>103<br/>190 200<br/>160<br/>475 665 850<br/>2400 2500<br/>2450</p>
23 |   <p>減税額<br/>年2.3万</p>
24 |   <p>年2.4万<br/>年2.0万 年3.0万</p>
25 | </div>
26 | 


--------------------------------------------------------------------------------
/static/out/in_gallery7_p1.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery7_p1.md


--------------------------------------------------------------------------------
/static/out/in_gallery7_p1_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery7_p1_layout.jpg


--------------------------------------------------------------------------------
/static/out/in_gallery7_p1_ocr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/static/out/in_gallery7_p1_ocr.jpg


--------------------------------------------------------------------------------
/tests/data/invalid.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/invalid.jpg


--------------------------------------------------------------------------------
/tests/data/invalid.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/invalid.pdf


--------------------------------------------------------------------------------
/tests/data/rgba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/rgba.png


--------------------------------------------------------------------------------
/tests/data/sampldoc.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/sampldoc.tif


--------------------------------------------------------------------------------
/tests/data/small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/small.jpg


--------------------------------------------------------------------------------
/tests/data/subdir/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/subdir/test.jpg


--------------------------------------------------------------------------------
/tests/data/test.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/test.bmp


--------------------------------------------------------------------------------
/tests/data/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/test.jpg


--------------------------------------------------------------------------------
/tests/data/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/test.pdf


--------------------------------------------------------------------------------
/tests/data/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/test.png


--------------------------------------------------------------------------------
/tests/data/test.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/test.tiff


--------------------------------------------------------------------------------
/tests/data/test.txt:
--------------------------------------------------------------------------------
1 | dummy


--------------------------------------------------------------------------------
/tests/data/test_gray.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kotaro-kinoshita/yomitoku/87e3aed0c1f49b879baf9db24926c376f8353eb9/tests/data/test_gray.jpg


--------------------------------------------------------------------------------
/tests/test_base.py:
--------------------------------------------------------------------------------
  1 | from unittest.mock import patch
  2 | 
  3 | import pytest
  4 | 
  5 | from yomitoku.base import (
  6 |     BaseModelCatalog,
  7 |     BaseModule,
  8 |     load_config,
  9 |     load_yaml_config,
 10 | )
 11 | from yomitoku.configs import LayoutParserRTDETRv2Config
 12 | from yomitoku.models import RTDETRv2
 13 | 
 14 | 
 15 | def test_load_yaml_config():
 16 |     path_cfg = "tests/yaml/dummy.yaml"
 17 | 
 18 |     with pytest.raises(FileNotFoundError):
 19 |         load_yaml_config(path_cfg)
 20 | 
 21 |     with pytest.raises(ValueError):
 22 |         load_yaml_config("tests/data/test.jpg")
 23 | 
 24 |     path_cfg = "tests/yaml/layout_parser.yaml"
 25 |     yaml_config = load_yaml_config(path_cfg)
 26 |     assert yaml_config.thresh_score == 0.8
 27 | 
 28 | 
 29 | def test_load_config():
 30 |     default_config = LayoutParserRTDETRv2Config
 31 |     path_config = "tests/yaml/dummy.yaml"
 32 |     path_config = "tests/yaml/layout_parser.yaml"
 33 |     cfg = load_config(default_config, path_config)
 34 |     assert cfg.thresh_score == 0.8
 35 |     assert cfg.hf_hub_repo == "KotaroKinoshita/yomitoku-layout-parser-rtdtrv2-open-beta"
 36 | 
 37 | 
 38 | class TestModelCatalog(BaseModelCatalog):
 39 |     def __init__(self):
 40 |         super().__init__()
 41 |         self.register(
 42 |             "test",
 43 |             LayoutParserRTDETRv2Config,
 44 |             RTDETRv2,
 45 |         )
 46 | 
 47 | 
 48 | class TestModule(BaseModule):
 49 |     model_catalog = TestModelCatalog()
 50 | 
 51 |     def __init__(self):
 52 |         super().__init__()
 53 | 
 54 |     def __call__(self):
 55 |         pass
 56 | 
 57 | 
 58 | def test_base_model(tmp_path):
 59 |     module = TestModule()
 60 |     module.load_model("test", None)
 61 |     assert isinstance(module.model, RTDETRv2)
 62 | 
 63 |     module.save_config(tmp_path / "config.yaml")
 64 |     data = load_yaml_config(tmp_path / "config.yaml")
 65 |     default = LayoutParserRTDETRv2Config()
 66 |     assert data.hf_hub_repo == default.hf_hub_repo
 67 | 
 68 |     module.log_config()
 69 |     module.catalog()
 70 | 
 71 | 
 72 | def test_base_catalog():
 73 |     catalog = TestModelCatalog()
 74 |     assert catalog.list_model() == ["test"]
 75 | 
 76 |     with pytest.raises(ValueError):
 77 |         catalog.get("dummy")
 78 | 
 79 |     list = catalog.list_model()
 80 |     assert list == ["test"]
 81 | 
 82 |     with pytest.raises(ValueError):
 83 |         catalog.register("test", None, None)
 84 | 
 85 | 
 86 | def test_base_call():
 87 |     with patch("yomitoku.base.observer") as mock:
 88 |         module = TestModule()
 89 |         module()
 90 |         mock.assert_called_once()
 91 | 
 92 | 
 93 | def test_invalid_base_model():
 94 |     class InvalidModel(BaseModule):
 95 |         def __init__(self):
 96 |             super().__init__()
 97 | 
 98 |     with pytest.raises(NotImplementedError):
 99 |         InvalidModel()
100 | 
101 |     class InvalidModel(BaseModule):
102 |         model_catalog = 1
103 | 
104 |     with pytest.raises(ValueError):
105 |         InvalidModel()
106 | 
107 |     class InvalidModelCatalog(BaseModelCatalog):
108 |         def __init__(self):
109 |             self.catalog = {}
110 | 
111 |     class InvalidModel(BaseModule):
112 |         model_catalog = InvalidModelCatalog()
113 | 
114 |     with pytest.raises(ValueError):
115 |         InvalidModel()
116 | 


--------------------------------------------------------------------------------
/tests/test_data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pytest
  3 | 
  4 | from yomitoku.data.functions import (
  5 |     array_to_tensor,
  6 |     load_image,
  7 |     load_pdf,
  8 |     resize_shortest_edge,
  9 |     resize_with_padding,
 10 |     rotate_text_image,
 11 |     standardization_image,
 12 |     validate_quads,
 13 | )
 14 | 
 15 | 
 16 | def test_load_image():
 17 |     with pytest.raises(FileNotFoundError):
 18 |         load_image("dummy.jpg")
 19 | 
 20 |     with pytest.raises(ValueError):
 21 |         load_image("tests/data/test.txt")
 22 | 
 23 |     with pytest.raises(ValueError):
 24 |         load_image("tests/data/small.jpg")
 25 | 
 26 |     with pytest.raises(ValueError):
 27 |         load_image("tests/data/test.pdf")
 28 | 
 29 |     with pytest.raises(ValueError):
 30 |         load_image("tests/data/invalid.jpg")
 31 | 
 32 |     targets = [
 33 |         "tests/data/test.jpg",
 34 |         "tests/data/test.png",
 35 |         "tests/data/test.tiff",
 36 |         "tests/data/test.bmp",
 37 |         "tests/data/test_gray.jpg",
 38 |         "tests/data/rgba.png",
 39 |         "tests/data/sampldoc.tif",
 40 |     ]
 41 | 
 42 |     for target in targets:
 43 |         image = load_image(target)
 44 |         assert len(image) >= 1
 45 |         assert image[0].shape[2] == 3
 46 |         assert image[0].shape[0] > 32
 47 |         assert image[0].shape[1] > 32
 48 |         assert image[0].dtype == "uint8"
 49 | 
 50 | 
 51 | def test_load_pdf():
 52 |     with pytest.raises(FileNotFoundError):
 53 |         load_pdf("dummy.pdf")
 54 | 
 55 |     with pytest.raises(ValueError):
 56 |         load_pdf("tests/data/test.txt")
 57 | 
 58 |     with pytest.raises(ValueError):
 59 |         load_pdf("tests/data/invalid.pdf")
 60 | 
 61 |     targets = [
 62 |         "tests/data/test.jpg",
 63 |         "tests/data/test.png",
 64 |         "tests/data/test.tiff",
 65 |         "tests/data/test.bmp",
 66 |         "tests/data/test_gray.jpg",
 67 |     ]
 68 | 
 69 |     for target in targets:
 70 |         with pytest.raises(ValueError):
 71 |             load_pdf(target)
 72 | 
 73 |     target = "tests/data/test.pdf"
 74 |     images = load_pdf(target)
 75 |     assert len(images) == 2
 76 |     for image in images:
 77 |         assert image.shape[2] == 3
 78 |         assert image.shape[0] > 0
 79 |         assert image.shape[1] > 0
 80 |         assert image.dtype == "uint8"
 81 | 
 82 | 
 83 | def test_resize_shortest_edge():
 84 |     img = np.zeros((1920, 1920, 3), dtype=np.uint8)
 85 |     resized = resize_shortest_edge(img, 1280, 1500)
 86 |     h, w = resized.shape[:2]
 87 |     assert min(h, w) == 1280
 88 |     assert h % 32 == 0
 89 |     assert w % 32 == 0
 90 | 
 91 |     img = np.zeros((1280, 1920, 3), dtype=np.uint8)
 92 |     resized = resize_shortest_edge(img, 1280, 1600)
 93 |     h, w = resized.shape[:2]
 94 |     assert max(h, w) == 1600
 95 |     assert h % 32 == 0
 96 |     assert w % 32 == 0
 97 | 
 98 |     resized = resize_shortest_edge(img, 1000, 1000)
 99 |     h, w = resized.shape[:2]
100 |     assert h % 32 == 0
101 |     assert w % 32 == 0
102 | 
103 | 
def test_standardization_image():
    """standardization_image must keep the input shape and return float32 data."""
    source = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
    result = standardization_image(source)
    assert result.shape == source.shape
    assert result.dtype == "float32"
109 | 
110 | 
def test_array_to_tensor():
    """A (100, 50, 3) array must come back as a tensor of shape (1, 3, 100, 50)."""
    source = np.random.randint(0, 255, (100, 50, 3), dtype=np.uint8)
    result = array_to_tensor(source)
    assert result.shape == (1, 3, 100, 50)
115 | 
116 | 
def test_rotate_image():
    """With thresh_aspect=2, a 100x30 crop is turned to 30x100 and a 30x100 crop is left as is."""
    # First input has 100 rows and 30 columns; the result must have them swapped.
    tall = np.random.randint(0, 255, (100, 30, 3), dtype=np.uint8)
    assert rotate_text_image(tall, thresh_aspect=2).shape == (30, 100, 3)

    # Second input already has 30 rows and 100 columns; the shape must not change.
    wide = np.random.randint(0, 255, (30, 100, 3), dtype=np.uint8)
    assert rotate_text_image(wide, thresh_aspect=2).shape == (30, 100, 3)
125 | 
126 | 
def test_resize_with_padding():
    """resize_with_padding must return a (50, 100, 3) array for each tested input shape."""
    target_size = (50, 100)
    # Inputs: already the target shape, wider than the target, taller than the target.
    input_shapes = ((50, 100, 3), (50, 150, 3), (60, 100, 3))

    for input_shape in input_shapes:
        source = np.random.randint(0, 255, input_shape, dtype=np.uint8)
        result = resize_with_padding(source, target_size)
        assert result.shape == (50, 100, 3)
139 | 
140 | 
def test_validate_quads():
    """validate_quads must return None for malformed or out-of-image quads and a truthy value otherwise."""
    img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)

    rejected_quads = (
        # Only three points.
        [[0, 0], [0, 10], [10, 10]],
        # First point has a single coordinate.
        [[0], [0, 10], [10, 10], [10, 0]],
        # Second coordinate of 150 exceeds the 100-pixel image.
        [[0, 0], [0, 150], [10, 150], [10, 0]],
        # First coordinate of 150 exceeds the 100-pixel image.
        [[150, 0], [150, 10], [10, 10], [10, 0]],
        # Negative first coordinate.
        [[-1, 0], [-1, 10], [10, 10], [10, 0]],
        # Negative second coordinate.
        [[0, -1], [0, 10], [10, 10], [10, -1]],
    )
    for bad_quad in rejected_quads:
        assert validate_quads(img, bad_quad) is None

    accepted_quads = (
        [[0, 0], [0, 10], [10, 10], [10, 0]],
        [[0, 0], [0, 20], [10, 20], [10, 0]],
        [[10, 0], [10, 30], [80, 30], [80, 0]],
    )
    for good_quad in accepted_quads:
        assert validate_quads(img, good_quad)
175 | 


--------------------------------------------------------------------------------
/tests/test_layout_analyzer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from omegaconf import OmegaConf
 4 | 
 5 | from yomitoku.layout_analyzer import LayoutAnalyzer
 6 | 
 7 | 
 8 | def test_layout():
 9 |     device = "cpu"
10 |     visualize = True
11 |     config = {
12 |         "layout_parser": {
13 |             "path_cfg": "tests/yaml/layout_parser.yaml",
14 |         },
15 |         "table_structure_recognizer": {
16 |             "path_cfg": "tests/yaml/table_structure_recognizer.yaml",
17 |         },
18 |     }
19 | 
20 |     analyzer = LayoutAnalyzer(configs=config, device=device, visualize=visualize)
21 | 
22 |     assert analyzer.layout_parser.device == torch.device(device)
23 |     assert analyzer.table_structure_recognizer.device == torch.device(device)
24 | 
25 |     assert analyzer.layout_parser.visualize == visualize
26 |     assert analyzer.table_structure_recognizer.visualize == visualize
27 | 
28 |     layout_parser_cfg = OmegaConf.load(config["layout_parser"]["path_cfg"])
29 |     table_structure_recognizer_cfg = OmegaConf.load(
30 |         config["table_structure_recognizer"]["path_cfg"]
31 |     )
32 | 
33 |     assert analyzer.layout_parser.thresh_score == layout_parser_cfg.thresh_score
34 | 
35 |     assert (
36 |         analyzer.table_structure_recognizer.thresh_score
37 |         == table_structure_recognizer_cfg.thresh_score
38 |     )
39 | 
40 | 
def test_layout_invalid_path():
    """Building LayoutAnalyzer with the "tests/yaml/dummy.yaml" config path must raise FileNotFoundError."""
    bad_config = {"layout_parser": {"path_cfg": "tests/yaml/dummy.yaml"}}

    with pytest.raises(FileNotFoundError):
        LayoutAnalyzer(configs=bad_config)
49 | 


--------------------------------------------------------------------------------
/tests/test_ocr.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from omegaconf import OmegaConf
 4 | 
 5 | from yomitoku.ocr import OCR
 6 | 
 7 | 
 8 | def test_ocr():
 9 |     device = "cpu"
10 |     visualize = True
11 |     config = {
12 |         "text_detector": {
13 |             "path_cfg": "tests/yaml/text_detector.yaml",
14 |         },
15 |         "text_recognizer": {
16 |             "path_cfg": "tests/yaml/text_recognizer.yaml",
17 |         },
18 |     }
19 | 
20 |     ocr = OCR(configs=config, device=device, visualize=visualize)
21 | 
22 |     assert ocr.detector.device == torch.device(device)
23 |     assert ocr.recognizer.device == torch.device(device)
24 |     assert ocr.detector.visualize == visualize
25 |     assert ocr.recognizer.visualize == visualize
26 | 
27 |     text_detector_cfg = OmegaConf.load(config["text_detector"]["path_cfg"])
28 |     text_recognizer_cfg = OmegaConf.load(config["text_recognizer"]["path_cfg"])
29 | 
30 |     assert ocr.detector.post_processor.thresh == text_detector_cfg.post_process.thresh
31 | 
32 |     assert ocr.recognizer.model.refine_iters == text_recognizer_cfg.refine_iters
33 | 
34 | 
def test_ocr_invalid_path():
    """Building OCR with the "tests/yaml/dummy.yaml" config path must raise FileNotFoundError."""
    bad_config = {"text_detector": {"path_cfg": "tests/yaml/dummy.yaml"}}

    with pytest.raises(FileNotFoundError):
        OCR(configs=bad_config)
44 | 


--------------------------------------------------------------------------------
/tests/yaml/layout_parser.yaml:
--------------------------------------------------------------------------------
1 | thresh_score: 0.8
2 | 


--------------------------------------------------------------------------------
/tests/yaml/table_structure_recognizer.yaml:
--------------------------------------------------------------------------------
1 | thresh_score: 0.8
2 | 


--------------------------------------------------------------------------------
/tests/yaml/text_detector.yaml:
--------------------------------------------------------------------------------
1 | post_process:
2 |   thresh: 0.4
3 | 


--------------------------------------------------------------------------------
/tests/yaml/text_recognizer.yaml:
--------------------------------------------------------------------------------
1 | refine_iters: 0
2 | 


--------------------------------------------------------------------------------