├── .github
    └── workflows
    │   ├── Ruff_python_formater.yml
    │   ├── autotools.yaml
    │   ├── msys2.yaml
    │   └── stale.yml
├── .gitignore
├── AUTHORS
├── CMakeLists.txt
├── COPYING
├── ChangeLog
├── INSTALL
├── Makefile.am
├── NEWS
├── README.md
├── autogen.sh
├── configure.ac
├── doc
    ├── Makefile.am
    ├── PATENTS
    ├── fcd14492.pdf
    └── jbig2enc.html
├── images
    └── feyn.tif
├── jbig2topdf.py
└── src
    ├── Makefile.am
    ├── jbig2.cc
    ├── jbig2arith.cc
    ├── jbig2arith.h
    ├── jbig2comparator.cc
    ├── jbig2comparator.h
    ├── jbig2enc.cc
    ├── jbig2enc.h
    ├── jbig2segments.h
    ├── jbig2structs.h
    ├── jbig2sym.cc
    └── jbig2sym.h


/.github/workflows/Ruff_python_formater.yml:
--------------------------------------------------------------------------------
 1 | name: Ruff_python_formater
 2 | 
 3 | on: [ push, pull_request ]
 4 | 
 5 | jobs:
 6 |   build:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v4
10 |       - name: Install Python
11 |         uses: actions/setup-python@v5
12 |         with:
13 |           python-version: "3.12"
14 |       - name: Get Python changed files
15 |         id: changed-py-files
16 |         uses: tj-actions/changed-files@v45
17 |         with:
18 |           files: |
19 |             *.py
20 |             **/*.py
21 |       - name: Install dependencies
22 |         if: steps.changed-py-files.outputs.any_changed == 'true'
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           pip install ruff isort
26 |       # Update output format to enable automatic inline annotations.
27 |       - name: Run Ruff
28 |         if: steps.changed-py-files.outputs.any_changed == 'true'
29 |         # Hand the file list to the shell through an environment variable
30 |         # instead of expanding it inside the script text, so that a crafted
31 |         # file name cannot inject shell commands.
32 |         env:
33 |           ALL_CHANGED_FILES: ${{ steps.changed-py-files.outputs.all_changed_files }}
34 |         run: |
35 |           # Word splitting is intended here: the list is space separated.
36 |           ruff check --output-format=github ${ALL_CHANGED_FILES}
37 |           # Only report unsorted imports; files rewritten in CI are discarded,
38 |           # so a plain "isort ." could never fail the job.
39 |           isort --check-only --diff ${ALL_CHANGED_FILES}
40 | 


--------------------------------------------------------------------------------
/.github/workflows/autotools.yaml:
--------------------------------------------------------------------------------
 1 | name: autotools
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - src/**
 7 |       - configure.ac
 8 |       - Makefile.am
 9 |   pull_request:
10 |     paths:
11 |       - src/**
12 |       - configure.ac
13 |       - Makefile.am
14 |   workflow_dispatch:
15 | 
16 | jobs:
17 | 
18 |   linux:
19 |     runs-on: ${{ matrix.config.os }}
20 |     strategy:
21 |       fail-fast: false
22 |       matrix:
23 |         config:
24 |           - { icon: '🟧', name: ubuntu-24.04-clang-18-autotools, os: ubuntu-24.04, cxx: clang++-18} #installed
25 |           - { icon: '🟨', name: ubuntu-24.04-gcc-14-autotools, os: ubuntu-24.04, cxx: g++-14} #installed
26 | 
27 |     steps:
28 |     - name: '🧰 Checkout'
29 |       uses: actions/checkout@v4
30 |       with:
31 |         fetch-depth: 0
32 |         persist-credentials: false
33 | 
34 |     - name: '${{matrix.config.icon}} Setup ${{matrix.config.name}}'
35 |       run: |
36 |            sudo apt-get update
37 |            sudo apt-get install -y ${{ matrix.config.cxx }}
38 |            sudo apt-get install autoconf libleptonica-dev -y
39 |            ${{ matrix.config.cxx }} --version
40 |            git log -3 --pretty=format:'%h %ad %s | %an'
41 | 
42 |     - name: '️#️⃣ get-hash'
43 |       id: hash  # read later as steps.hash.outputs.sha_short
44 |       run: echo "sha_short=$(git describe --tags --always)" >> "$GITHUB_OUTPUT"  # --always: use the commit hash if no tag is reachable
45 | 
46 |     - name: '🔧 Configure jbig2enc ${{steps.hash.outputs.sha_short}}'
47 |       run: |
48 |            ./autogen.sh
49 |            ./configure '--disable-shared' 'CXX=${{matrix.config.cxx}}' 'CXXFLAGS=-g -O2 -static'
50 | 
51 |     - name: '🚧 Build and install jbig2enc'
52 |       run: |
53 |            make
54 |            sudo make install
55 | 
56 |     - name: 'ℹ️ Display version'
57 |       run: |
58 |            jbig2 -V
59 | 
60 |     - name: '🏃 Run test'
61 |       run: |
62 |            jbig2 -a -p -v images/feyn.tif > feyn.jb2
63 |            python3 jbig2topdf.py -s feyn.jb2 > feyn.pdf
64 |       if: success() || failure()
65 | 


--------------------------------------------------------------------------------
/.github/workflows/msys2.yaml:
--------------------------------------------------------------------------------
  1 | name: Msys2
  2 | 
  3 | on:
  4 |   pull_request:
  5 |     paths:
  6 |       - src/**
  7 |       - configure.ac
  8 |       - Makefile.am
  9 |   workflow_dispatch:
 10 | 
 11 | jobs:
 12 | 
 13 |   build:
 14 |     runs-on: windows-latest
 15 |     strategy:
 16 |       fail-fast: false
 17 |       matrix:
 18 |         include:
 19 |           - { icon: '🟦', sys: mingw64 }
 20 |           - { icon: '🟨', sys: ucrt64  }
 21 |           # - { icon: '⬛', sys: mingw32 }  error: target not found: mingw-w64-i686-leptonica
 22 |     name: 🐜${{ matrix.icon }} ${{ matrix.sys }}
 23 |     defaults:
 24 |       run:
 25 |         shell: msys2 {0}
 26 |     steps:
 27 | 
 28 |     - name: '🧰 Checkout'
 29 |       uses: actions/checkout@v4
 30 |       with:
 31 |         fetch-depth: 0
 32 |         persist-credentials: false
 33 | 
 34 |     - name: '${{ matrix.icon }} Setup MSYS2'
 35 |       uses: msys2/setup-msys2@v2
 36 |       with:
 37 |         msystem: ${{matrix.sys}}
 38 |         update: true
 39 |         install: >-
 40 |           git
 41 |           make
 42 |           autoconf
 43 |           automake
 44 |           libtool
 45 |           python3
 46 |           pkg-config
 47 |           zip
 48 |           binutils
 49 |         pacboy: >-
 50 |           zlib:x
 51 |           leptonica:p
 52 |           gcc:p
 53 | 
 54 |     - name: '️#️⃣ get-hash'
 55 |       id: hash
 56 |       # Label the build with the nearest tag; --always falls back to the
 57 |       # abbreviated commit hash when no tag is reachable from HEAD.
 58 |       run: echo "sha_short=$(git describe --tags --always)" >> "$GITHUB_OUTPUT"
 59 | 
 60 |     - name: '🔧 Configure jbig2enc'
 61 |       run: |
 62 |            ./autogen.sh
 63 |            ./configure '--disable-shared' 'CXXFLAGS=-g -O2 -static'
 64 |            # ./configure '--disable-shared' 'CXXFLAGS=-g -O2'
 65 | 
 66 |     - name: '🚧 Build and install jbig2enc'
 67 |       run: |
 68 |            # make LDFLAGS="-all-static"
 69 |            make
 70 |            make install
 71 | 
 72 |     - name: 'ℹ️ Display version'
 73 |       run: |
 74 |            jbig2 -V
 75 | 
 76 |     - name: '🏃 Run test'
 77 |       run: |
 78 |            jbig2 -a -p -v images/feyn.tif > feyn.jb2
 79 |            python3 jbig2topdf.py -s feyn.jb2 > feyn.pdf
 80 | 
 81 |     - name: '🍃 List files in the repository'
 82 |       run: |
 83 |            # ls -lR
 84 |            # find /c/ -name "libstdc++-6.dll"
 85 |            # find /c/ -name "libgcc_s_seh-1.dll"
 86 |            # find /c/ -name "libleptonica-6.dll"
 87 |            # ls -lR ${{ github.workspace }}
 88 |            # dir /?
 89 |            # dir /S
 90 |            ls -lR /${{matrix.sys}}/bin/*.dll
 91 |            objdump -p src/jbig2.exe | grep "DLL Name:"
 92 | 
 93 |     - name: '📦 Make zip'
 94 |       run: |
 95 |         name=jbig2enc_${{ steps.hash.outputs.sha_short }}-${{matrix.sys}}
 96 |         mkdir -p $name/{doc,lib}
 97 |         # Copy programs and library
 98 |         cp jbig2topdf.py src/jbig2.exe $name/
 99 |         cp /${{matrix.sys}}/lib/{libjbig2enc.la,libjbig2enc.a} $name/lib
100 |         # Copy documentation files
101 |         cp AUTHORS ChangeLog COPYING INSTALL NEWS README.md doc/jbig2enc.html doc/PATENTS $name/doc
102 |         # Copy dependencies
103 |         cp /${{matrix.sys}}/bin/{libstdc++-6.dll,libgcc_s_seh-1.dll,libleptonica-6.dll,libpng16-16.dll,libwebp-7.dll,libjpeg-8.dll,libwebpmux-3.dll,libwinpthread-1.dll,libsharpyuv-0.dll,libtiff-6.dll,libLerc.dll,liblzma-5.dll,libgif-7.dll,libopenjp2-7.dll,libdeflate.dll,libjbig-0.dll,zlib1.dll,libzstd.dll} $name/
104 |         zip -r $name.zip $name/
105 |     - name: '⏫ Upload Build Results'
106 |       uses: actions/upload-artifact@v4
107 |       with:
108 |         name: jbig2enc_${{ steps.hash.outputs.sha_short }}-${{matrix.sys}}
109 |         path: jbig2enc_${{ steps.hash.outputs.sha_short }}-${{matrix.sys}}.zip
110 |         retention-days: 5


--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
 2 | #
 3 | # You can adjust the behavior by modifying this file.
 4 | # For more information, see:
 5 | # https://github.com/actions/stale
 6 | name: Mark stale issues and pull requests
 7 | 
 8 | on:
 9 |   schedule:
10 |   - cron: '22 2 * * *'
11 | 
12 | jobs:
13 |   stale:
14 | 
15 |     runs-on: ubuntu-latest
16 |     permissions:
17 |       issues: write
18 |       pull-requests: write
19 | 
20 |     steps:
21 |     - uses: actions/stale@v9  # current major; v5 targets a Node runtime GitHub has deprecated
22 |       with:
23 |         repo-token: ${{ secrets.GITHUB_TOKEN }}
24 |         stale-issue-message: 'This issue has been automatically marked as stale because it has had no recent activity. It will be closed if no further activity occurs.'
25 |         stale-pr-message: 'This pull request has been automatically marked as stale because it has had no recent activity. It will be closed if no further activity occurs.'
26 |         stale-issue-label: 'no-issue-activity'
27 |         stale-pr-label: 'no-pr-activity'
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | autom4te*
 2 | test*
 3 | *.o
 4 | *.a
 5 | *.lo
 6 | *.la
 7 | Makefile
 8 | /libtool
 9 | /src/.libs
10 | /src/jbig2
11 | Debug*
12 | Release*
13 | *.suo
14 | *.ncb
15 | *.user
16 | *.patch
17 | *.pdf
18 | *.png
19 | *.jpg
20 | *.jb2
21 | *.sym
22 | *.tiff
23 | *.tif
24 | *.exe
25 | *.[0-9]{0,4}
26 | *.[0-9][0-9][0-9][0-9]
27 | .hg*
28 | *m4*
29 | config.*
30 | configure
31 | install-sh
32 | *.in
33 | missing
34 | ltmain.sh
35 | build*
36 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
 1 | Author:
 2 | =======
 3 | Adam Langley <agl@imperialviolet.org>
 4 | 
 5 | Contributors:
 6 | =============
 7 | Dan Bloomberg <dan.bloomberg@gmail.com>
 8 | Misty De Meo <mistydemeo@gmail.com>
 9 | zdenop <zdenop@gmail.com>
10 | Steven Lee http://www.rubypdf.com
11 | Radim Hatlapatka <hata.radim@gmail.com>
12 | Alexander Kobel <a-kobel@a-kobel.de>
13 | James R. Barlow <james@purplerock.ca>
14 | MarkJoy <markjoy999@gmail.com>
15 | zvezdochiot <mykaralw@yandex.ru>
16 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.10)
  2 | project(jbig2enc CXX)
  3 | set(Version "0.30")
  4 | 
  5 | set(CMAKE_CXX_STANDARD 14)
  6 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
  7 | set(CMAKE_CXX_EXTENSIONS OFF)
  8 | 
  9 | if(NOT CMAKE_BUILD_TYPE)
 10 |   message(STATUS "Setting build type to 'Release' as none was specified.")
 11 |   set(CMAKE_BUILD_TYPE
 12 |       Release
 13 |       CACHE STRING "Choose the type of build." FORCE)
 14 |   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release")
 15 | endif()
 16 | 
 17 | # ##############################################################################
 18 | # jbig2enc library
 19 | # ##############################################################################
 20 | add_definitions(-DVERSION="${Version}")
 21 | 
 22 | if(WIN32)
 23 |   set(LIBS ${LIBS} Ws2_32)
 24 |   if(MSVC)
 25 |     add_definitions(-DWIN32)
 26 |     add_definitions(-D_LIB)
 27 |     add_definitions(-D_CONSOLE)
 28 |     add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 29 |     add_definitions(-D_SCL_SECURE_NO_WARNINGS)
 30 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MD")
 31 |     # turn off warnings in Release
 32 |     # C4267: '=': conversion from 'size_t' to 'int', possible loss of data
 33 |     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /wd4267")
 34 |     # C4996: 'x': The POSIX name for this item is deprecated. Instead,
 35 |     # use the ISO C and C++ conformant name:
 36 |     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /wd4996")
 37 |     # C4334: '<<': result of 32-bit shift implicitly converted to 64
 38 |     # bits (was 64-bit shift intended?)
 39 |     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /wd4334")
 40 |     # C4244: 'x': conversion from 'a' to 'b', possible loss of data
 41 |     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /wd4244")
 42 |     # C4305: 'initializing': truncation from 'double' to 'float'
 43 |     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /wd4305")
 44 |     # Enable multiprocessor builds on Visual Studio
 45 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
 46 |   endif()
 47 | else()
 48 |   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -DNDEBUG")
 49 |   set(CMAKE_CXX_FLAGS_DEBUG
 50 |       "${CMAKE_CXX_FLAGS_DEBUG} -DUNIFICATION_DEBUGGING -Wall -DDEBUG -pedantic -Og -g -noalign"
 51 |   )
 52 | endif()
 53 | 
 54 | if(NOT DEFINED CMAKE_DEBUG_POSTFIX)
 55 |   set(CMAKE_DEBUG_POSTFIX "d")
 56 | endif()
 57 | 
 58 | find_package(Leptonica REQUIRED)
 59 | message(STATUS "Found Leptonica library: ${Leptonica_VERSION}")
 60 | include_directories(${Leptonica_INCLUDE_DIRS})
 61 | link_directories(${Leptonica_LIBRARY_DIRS})
 62 | 
 63 | set(libjbig2enc_src
 64 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2arith.cc"
 65 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2comparator.cc"
 66 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2enc.cc"
 67 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2sym.cc")
 68 | set(libjbig2enc_hdr
 69 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2arith.h"
 70 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2comparator.h"
 71 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2enc.h"
 72 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2segments.h"
 73 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2structs.h"
 74 |     "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2sym.h")
 75 | add_library(libjbig2enc ${libjbig2enc_src} ${libjbig2enc_hdr})
 76 | set_target_properties(libjbig2enc PROPERTIES DEBUG_POSTFIX
 77 |                                              ${CMAKE_DEBUG_POSTFIX})
 78 | if(MSVC)
 79 |     # Linking to setargv.obj enables wildcard globbing for the
 80 |     # command line utilities, when compiling with MSVC
 81 |     # https://learn.microsoft.com/en-us/cpp/c-language/expanding-wildcard-arguments
 82 |     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} setargv.obj")
 83 | endif()
 84 | 
 85 | install(
 86 |   TARGETS libjbig2enc
 87 |   RUNTIME DESTINATION bin
 88 |   LIBRARY DESTINATION lib
 89 |   ARCHIVE DESTINATION lib)
 90 | install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2enc.h" DESTINATION include/)
 91 | install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/jbig2topdf.py" DESTINATION bin/)
 92 | 
 93 | # ##############################################################################
 94 | # Programs
 95 | # ##############################################################################
 96 | 
 97 | add_executable(jbig2 "${CMAKE_CURRENT_SOURCE_DIR}/src/jbig2.cc")
 98 | set_target_properties(jbig2 PROPERTIES DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX})
 99 | target_link_libraries(jbig2 PRIVATE ${LIBS} libjbig2enc)
100 | target_link_libraries(jbig2 PUBLIC ${Leptonica_LIBRARIES})
101 | install(
102 |   TARGETS jbig2
103 |   RUNTIME DESTINATION bin
104 |   LIBRARY DESTINATION lib
105 |   ARCHIVE DESTINATION lib)
106 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | See also doc/PATENTS.
 2 | 
 3 | Copyright 2006 Google Inc. All Rights Reserved.
 4 | Author: agl@imperialviolet.org (Adam Langley)
 5 | 
 6 | Copyright (C) 2006 Google Inc.
 7 | 
 8 | Licensed under the Apache License, Version 2.0 (the "License");
 9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 | 
12 |      http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.


--------------------------------------------------------------------------------
/ChangeLog:
--------------------------------------------------------------------------------
 1 | 0.30: (2024-12-24)
 2 |   * Reject odd input depths that pixThresholdToBinary doesn't support.
 3 |   * Fix build with Leptonica >=1.83
 4 |   * Replace vs2008 solution with CMake
 5 |   * Fix `Error heap-use-after-free`
 6 |   * Fix typos in code
 7 |   * Support python 3, drop support for python2
 8 |   * Rename pdf.py to jbig2topdf.py
 9 |   * Add `-D` switch to set dpi for images w/o dpi information
10 |   * Add support for standalone .jb2
11 |   * Suppress a chatty informational message unless requested
12 |   * Neat symbolic threshold 0.92
13 |   * Fix Endian issue on ARM #63
14 |   * Neat symbolic threshold 0.92
15 |   * Make the weight parameter adjustable
16 |   * Add global BW threshold on 8 bpp images
17 |   * Replace obsolete macro `AC_PROG_LIBTOOL`
18 | 
19 | 0.29 (2017-01-30)
20 |   * fix build with leptonica>=1.70
21 |   * Remove incorrect reference count update.
22 | 
23 | 0.28 (2012-09-19):
24 |   * Update to the latest Leptonica (1.68)
25 |   * autotools support, VC++ 2008 solution
26 |   * fix binary file open mode on Windows
27 |   * version info (-V --version)
28 |   * pdf.py now correctly retains DPI from input images (thanks to Steven Lee
29 |     http://blog.rubypdf.com/2011/09/09/jbig2-pdf-py-patch-the-right-way-to-get-dpi/)
30 |   * R. Hatlapatka: option to use autoThresholding. Improved version from
31 |     bachelor thesis JBIG2 compression http://is.muni.cz/th/208155/fi_b/.
32 | 
33 | 0.27 (2009-04-29):
34 |   * Update to the latest Leptonica (1.58)
35 |   * Fix comments in pdf.py (thanks Dan)
36 | 
37 | 0.26:
38 |   * Update to the latest Leptonica (1.53)
39 | 
40 | 0.25:
41 |    * Should build on Windows if you get a leptonica which does so (thanks to
42 |      Dwight Kelly and a man known only as Dennis)
43 | 
44 | 0.24:
45 |    * Update to leptonica 1.38
46 |    * Add ability to gray symbols but it's commented out because it does little
47 |      good
48 |    * Refinement support broke because of memory savings in leptonica
49 | 
50 | 0.23:
51 |    * Quick release to support leptonica 1.37 (now bundled)
52 | 
53 | 0.22:
54 |    * Added segmentation support to extract text from mixed images
55 |    * Flyspeck removal disabled for now
56 |    * A few minor fixes
57 | 
58 | 0.21:
59 |    * Patch from Alberto Accomazzi to fix the pdf.py script and to output more
60 |      sensible filenames in pdf mode
61 |    * Symbols which are only used once are now included in a per-page dictionary
62 |      (great for scans of old books)
63 |    * Fixed several scale bugs (one at 65K symbols and one at 100K symbols)
64 |    * Page numbers > 256 went wrong
65 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
 1 | # Building
 2 | 
 3 | ## Prerequisites
 4 | 
 5 | * installed [Leptonica](http://www.leptonica.org/) including development parts
 6 | * installed [cmake](https://cmake.org/) or [autotools](https://www.gnu.org/software/automake/manual/html_node/Autotools-Introduction.html)
 7 | * installed C++ compiler (gcc, clang, MSVC)
 8 | * installed [git](https://git-scm.com/)
 9 | 
10 | 
11 | 
12 | ## Unix-like OS
13 | 
14 | If you're running a Unix-like OS, such as Linux, BSD, Mac OS X or msys just run:
15 | 
16 | ```
17 | ./autogen.sh
18 | ./configure
19 | make
20 | make install (or sudo make install)
21 | ```
22 | 
23 | 
24 | ## CMake
25 | 
26 | The CMake build requires a Leptonica installation that was itself built with CMake.
27 | 
28 | 
29 | ### Windows
30 | 
31 | 
32 | *Note*: The `cat`, `rm` and `dos2unix` tools are part of [git for windows](https://gitforwindows.org/). You can add them to your path with `set PATH=%PATH%;C:\Program Files\Git\usr\bin`. Adjust path `f:\win64` to your Leptonica installation.
33 | 
34 | ```
35 | "c:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" x64
36 | set INSTALL_DIR=f:\win64
37 | set INCLUDE_DIR=f:\win64\include
38 | set LIB_DIR=f:\win64\lib
39 | set PATH=%PATH%;%INSTALL_DIR%\bin
40 | ```
41 | 
42 | ### Configuration
43 | 
44 | ```
45 | git clone --depth 1 https://github.com/agl/jbig2enc
46 | cmake -Bbuild -DCMAKE_INSTALL_PREFIX=%INSTALL_DIR% -DCMAKE_PREFIX_PATH=%INSTALL_DIR%
47 | cmake --build build --config Release
48 | ```
49 | 
50 | ### Install
51 | 
52 | ```
53 | cmake --build build --config Release --target install
54 | ```
55 | 
56 | ### Uninstall
57 | 
58 | ```
59 | cat build/install_manifest.txt | dos2unix | xargs rm
60 | 
61 | ```
62 | 
63 | ### Clean
64 | 
65 | ```
66 | rm -r build/*
67 | ```


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | ACLOCAL_AMFLAGS = -I m4
2 | SUBDIRS = src doc
3 | dist_bin_SCRIPTS = jbig2topdf.py
4 | dist_doc_DATA = AUTHORS ChangeLog COPYING INSTALL NEWS README.md
5 | EXTRA_DIST = autogen.sh
6 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | See ChangeLog


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![autotools](https://github.com/agl/jbig2enc/actions/workflows/autotools.yaml/badge.svg)](https://github.com/agl/jbig2enc/actions/workflows/autotools.yaml) [![Msys2](https://github.com/agl/jbig2enc/actions/workflows/msys2.yaml/badge.svg)](https://github.com/agl/jbig2enc/actions/workflows/msys2.yaml)
 2 | 
 3 | This is an encoder for [JBIG2](doc/fcd14492.pdf).
 4 | 
 5 | JBIG2 encodes bi-level (1 bpp) images using a number of clever tricks to get
 6 | better compression than G4. This encoder can:
 7 |    * Generate JBIG2 files, or fragments for embedding in PDFs
 8 |    * Generic region encoding
 9 |    * Perform symbol extraction, classification and text region coding
10 |    * Perform refinement coding and,
11 |    * Compress multipage documents
12 | 
13 | It uses the (Apache-ish licensed) Leptonica library:
14 |   http://leptonica.com/
15 | 
16 | Version 1.74 or later is required.
17 | 
18 | ## Known bugs
19 | 
20 | The refinement coding causes Acrobat to crash. It's not known if this is a bug
21 | in Acrobat, though it may well be.
22 | 
23 | 
24 | ## Usage
25 | 
26 | _Note_: Windows Command Prompt does not support wildcard expansion, so `*.jpg` will not work. You'll need to manually expand the file names yourself or you need to use the latest git code and [MSVC build](https://learn.microsoft.com/en-us/cpp/c-language/expanding-wildcard-arguments).
27 | 
28 | See the `jbig2enc.h` header for the high level API, or the `jbig2` program for an
29 | example of usage:
30 | 
31 | ```
32 | $ jbig2 -s -a -p -v *.jpg && python3 jbig2topdf.py output >out.pdf
33 | ```
34 | 
35 | or with standalone mode:
36 | 
37 | ```
38 | $ jbig2 -a -p -v images/feyn.tif > feyn.jb2 && python3 jbig2topdf.py -s feyn.jb2 > feyn.pdf
39 | ```
40 | 
41 | to encode jbig2 files for pdf creation.
42 | If you want to encode an image and preview the output before including it in a pdf:
43 | 
44 | ```
45 | $ jbig2 -s -S -p -v -O out.png *.jpg
46 | ```
47 | 
48 | If you want to encode an image as jbig2 (can be viewed in [STDU Viewer](http://www.stdutility.com/stduviewer.html) on Windows) run:
49 | 
50 | ```
51 | $ jbig2 -s images/feyn.tif >feyn.jb2
52 | ```
53 | 
54 | ### Links:
55 | 
56 | * [jbig2enc-samples](https://github.com/zdenop/jbig2enc-samples)
57 | * [jbig2enc-minidjvu](https://github.com/ImageProcessing-ElectronicPublications/jbig2enc-minidjvu)
58 | 


--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # This is a simple script which is meant to help developers
 4 | # better deal with the GNU autotools, specifically:
 5 | #
 6 | #   aclocal
 7 | #   autoconf
 8 | #   automake
 9 | #
10 | # The whole thing is quite complex...
11 | #
12 | # The idea is to run this collection of tools on a single platform,
13 | # typically the main development platform, running a recent version of
14 | # autoconf. In theory, if we had these tools on each platform where we
15 | # ever expected to port the software, we would never need to checkin
16 | # more than a few autotools configuration files. However, the whole
17 | # idea is to generate a configure script and associated files in a way
18 | # that is portable across platforms, so we *have* to check in a whole
19 | # bunch of files generated by all these tools.
20 | 
21 | # The real source files are:
22 | #
23 | # acinclude.m4 (used by aclocal)
24 | # configure.ac (main autoconf file)
25 | # Makefile.am, */Makefile.am (automake config files)
26 | #
27 | # All the rest is auto-generated.
28 | 
29 | # create the m4 directory if it does not exist
30 | if [ ! -d m4 ];  then
31 | 	mkdir m4
32 | fi
33 | 
34 | bail_out()
35 | {
36 | 	echo 
37 | 	echo "  Something went wrong, bailing out!" 
38 | 	echo 
39 | 	exit 1
40 | }
41 | 
42 | # --- Step 1: Generate aclocal.m4
43 | 
44 | echo "Running aclocal"
45 | aclocal || bail_out
46 | 
47 | # --- Step 2:
48 | 
49 | echo "Running libtoolize"
50 | libtoolize -f -c || glibtoolize -f -c || bail_out
51 | libtoolize --automake || glibtoolize --automake || bail_out
52 | 
53 | # --- Step 3: Generate Makefile.in, 
54 | # Using --add-missing --copy makes sure that, if these files are missing,
55 | # they are copied from the system so they can be used in a distribution.
56 | 
57 | echo "Running automake --add-missing --copy"
58 | automake --add-missing -c > /dev/null || bail_out
59 | 
60 | # --- Step 4: Generate configure from:
61 | #             . configure.ac
62 | #
63 | 
64 | echo "Running autoconf"
65 | autoconf || bail_out
66 | 
67 | echo ""
68 | echo "All done."
69 | echo "To build the software now, do something like:"
70 | echo ""
71 | echo "$ ./configure [...other options]"
72 | echo "$ make"
73 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
 1 | AC_PREREQ([2.71])
 2 | AC_INIT([jbig2enc],[0.30],[agl@imperialviolet.org],[jbig2enc-0.30],[https://github.com/agl/jbig2enc])
 3 | AC_CONFIG_MACRO_DIR([m4])
 4 | AM_INIT_AUTOMAKE([-Wall -Werror foreign no-dependencies])
 5 | 
 6 | # AM_PROG_AR is needed by automake 1.12+; the guard keeps automake 1.11 working.
 7 | m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 8 | 
 9 | # Detect the C++ compiler first, then initialize libtool exactly once.
10 | AC_PROG_CXX
11 | LT_INIT
12 | 
13 | # Release versioning
14 | GENERIC_MAJOR_VERSION=0
15 | GENERIC_MINOR_VERSION=30
16 | GENERIC_MICRO_VERSION=0
17 | 
18 | # API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION)
19 | GENERIC_API_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION
20 | GENERIC_LIBRARY_VERSION=$GENERIC_MAJOR_VERSION:$GENERIC_MINOR_VERSION
21 | AC_SUBST(GENERIC_API_VERSION)
22 | AC_SUBST(GENERIC_MAJOR_VERSION)
23 | 
24 | AC_SUBST(GENERIC_LIBRARY_VERSION)
25 | PACKAGE=$GENERIC_LIBRARY_NAME
26 | AC_SUBST(GENERIC_LIBRARY_NAME)
27 | 
28 | GENERIC_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION.$GENERIC_MICRO_VERSION
29 | GENERIC_RELEASE=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION
30 | AC_SUBST(GENERIC_RELEASE)
31 | AC_SUBST(GENERIC_VERSION)
32 | 
33 | # default conditional
34 | AM_CONDITIONAL(MINGW, false)
35 | 
36 | #############################
37 | #
38 | # Platform specific setup
39 | #
40 | #############################
41 | AC_CANONICAL_HOST
42 | case "${host_os}" in
43 | 	mingw*)
44 | 		AC_DEFINE_UNQUOTED([MINGW], 1, [This is a MinGW system])
45 | 		dnl Try to detect winsock2 on mingw32/64 systems.
46 | 		AC_CHECK_LIB(ws2_32, [_head_libws2_32_a])
47 | 		AC_CHECK_LIB(ws2_32, [_head_lib32_libws2_32_a])
48 | 		AC_CHECK_LIB(ws2_32, [_head_lib64_libws2_32_a])
49 | 		;;
50 | 	*)
51 | 		# default
52 | 		;;
53 | esac
54 | 
55 | # Abort through the standard configure error path when the Leptonica
56 | # library cannot be linked, instead of a bare non-portable "exit -1".
57 | AC_CHECK_LIB([leptonica], [findFileFormatStream], [],
58 | 	[AC_MSG_ERROR([Leptonica not detected.])])
59 | PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false])
60 | if $have_lept; then
61 | 	CPPFLAGS="$CPPFLAGS $LEPTONICA_CFLAGS"
62 | else
63 | 	AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
64 | fi
65 | 
66 | # The math library is mandatory; fail through the standard configure
67 | # error path instead of a bare non-portable "exit -1".
68 | AC_CHECK_LIB([m], [sqrt], [],
69 | 	[AC_MSG_ERROR([libm not detected.])])
70 | 
71 | AC_CONFIG_FILES([
72 | 	Makefile
73 | 	src/Makefile
74 | 	doc/Makefile
75 | ])
76 | AC_OUTPUT
77 | 


--------------------------------------------------------------------------------
/doc/Makefile.am:
--------------------------------------------------------------------------------
1 | dist_doc_DATA = PATENTS jbig2enc.html
2 | 


--------------------------------------------------------------------------------
/doc/PATENTS:
--------------------------------------------------------------------------------
 1 | This software is a description of processes which may be patented.
 2 | 
 3 | Use of this software may require patent licenses in some countries.
 4 | You are directed to annex I of the JBIG2 specification for information.
 5 | 
 6 | Some information could be found at:
 7 |     http://www.jpeg.org/jbig/index.html
 8 |     http://www.cl.cam.ac.uk/~mgk25/jbigkit/patents/
 9 |     http://www.jpeg.org/public/fcd14492.pdf
10 |     http://itscj.ipsj.or.jp/sc29/open/29view/29n55161.doc
11 | 


--------------------------------------------------------------------------------
/doc/fcd14492.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/agl/jbig2enc/494055cb47708bd015ae607962b02100584898fa/doc/fcd14492.pdf


--------------------------------------------------------------------------------
/doc/jbig2enc.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 |   <head>
  3 |     <title>jbig2enc: Documentation</title>
  4 |   </head>
  5 | 
  6 |   <body style="max-width: 70em; font-family: Arial; text-align: justify;">
  7 |     <h1><tt>jbig2enc</tt>: Documentation</h1>
  8 |     <p>Adam Langley <tt>&lt;agl@imperialviolet.org&gt;</tt></p>
  9 | 
 10 |     <h5>What is JBIG2</h5>
 11 | 
 12 |     <p>JBIG2 is an image compression standard from the same people who brought
 13 |     you the JPEG format. It compresses 1bpp (black and white) images only.
 14 |     These images can consist of <i>only</i> black and white, there are no
 15 |     shades of gray - that would be a grayscale image. Any "gray" areas must,
 16 |     therefore be simulated using black dots in a pattern called <a
 17 |       href="http://en.wikipedia.org/wiki/Halftone">halftoning</a>.</p>
 18 | 
 19 |     <p>The JBIG2 standard has several major areas:</p>
 20 |     <ul>
 21 |       <li>Generic region coding</li>
 22 |       <li>Symbol encoding (and text regions)</li>
 23 |       <li>Refinement</li>
 24 |       <li>Halftoning</li>
 25 |     </ul>
 26 | 
 27 |     <p>There are two major compression technologies which JBIG2 builds on:
 28 |     <a href="http://en.wikipedia.org/wiki/Arithmetic_coding">arithmetic encoding</a>
 29 |     and <a href="http://en.wikipedia.org/wiki/Huffman_coding">Huffman encoding</a>. You can
 30 |     choose between them and use both in the same JBIG2 file, but this is rare.
 31 |     Arithmetic encoding is slower, but compresses better. Huffman encoding was
 32 |     included in the standard because one of the (intended) users of JBIG2 was
 33 |     fax machines and they might not have the processing power for arithmetic
 34 |     coding.</p>
 35 | 
 36 |     <p><tt>jbig2enc</tt> <i>only</i> supports arithmetic encoding</p>
 37 | 
 38 |     <h5>Generic region coding</h5>
 39 | 
 40 |     <p>Generic region coding is used to compress bitmaps. It is progressive and
 41 |     uses a context around the current pixel to be decoded to estimate the
 42 |     probability that the pixel will be black. If the probability is 50% it uses
 43 |     a single bit to encode that pixel. If the probability is 99% then it takes less
 44 |     than a bit to encode a black pixel, but more than a bit to encode a white
 45 |     one.</p>
 46 | 
 47 |     <p>The context can only refer to pixels above and to the left of the
 48 |     current pixel, because the decoder doesn't know the values of any of the
 49 |     other pixels yet (pixels are decoded left-to-right, top-to-bottom). Based
 50 |     on the values of these pixels it estimates a probability and updates its
 51 |     estimation for that context based on the actual pixel found. All contexts
 52 |     start off with a 50% chance of being black.</p>
 53 | 
 54 |     <p>You can encode whole pages with this and you will end up with a perfect
 55 |     reconstruction of the page. However, we can do better...</p>
 56 | 
 57 |     <h5>Symbol encoding</h5>
 58 | 
 59 |     <p>Most input images to JBIG2 encoders are scanned text. These have many
 60 |     repeating symbols (letters). The idea of symbol encoding is to encode what
 61 |     a letter &ldquo;a&rdquo; looks like and, for all the &ldquo;a&rdquo;s on
 62 |     the page, just give their locations. (This is lossy encoding)</p>
 63 | 
 64 |     <p>Unfortunately, all scanned images have noise in them: no two
 65 |     &ldquo;a&rdquo;s will look quite the same so we have to group all the
 66 |     symbols on a page into groups. Hopefully each member of a given group will
 67 |     be the same letter, otherwise we might place the wrong letter on the page!
 68 |     These, very surprising, errors are called cootoots.</p>
 69 | 
 70 |     <p>However, assuming that we group the symbols correctly, we can get great
 71 |     compression this way. Remember that the stricter the classifier, the more
 72 |     symbol groups (classes) will be generated, leading to bigger files. But,
 73 |     also, there is a lower risk of cootoots (misclassification).</p>
 74 | 
 75 |     <p>This is great, but we can do better...</p>
 76 | 
 77 |     <h5>Symbol retention</h5>
 78 | 
 79 |     <p>Symbol retention is the process of compressing multi-page documents by
 80 |     extracting the symbols from all the pages at once and classifying them all
 81 |     together. Thus we only have to encode a single letter &ldquo;a&rdquo; for
 82 |     the whole document (in an ideal world).</p>
 83 | 
 84 |     <p>This is obviously slower, but generates smaller files (about half the
 85 |     size on average, with a decent number of similar typeset pages).</p>
 86 | 
 87 |     <p>One downside you should be aware of: If you are generating JBIG2 streams
 88 |     for inclusion to a linearised PDF file, the PDF reader has to download all
 89 |     the symbols before it can display the first page. There is a solution to this
 90 |     involving multiple dictionaries and symbol importing, but that's not
 91 |     currently supported by <tt>jbig2enc</tt>.</p>
 92 | 
 93 |     <h5>Refinement</h5>
 94 | 
 95 |     <p>Symbol encoding is lossy because of noise, which is classified away and
 96 |     also because the symbol classifier is imperfect. Refinement allows us, when
 97 |     placing a symbol on the page, to encode the difference between the actual
 98 |     symbol at that location, and what the classifier told us was &ldquo;close
 99 |     enough&rdquo;. We can choose to do this for each symbol on the page, so we
100 |     don't have to refine when we are only a couple of pixels off. If we refine
101 |     whenever we see a wrong pixel, we have lossless encoding using symbols.</p>
102 | 
103 |     <h5>Halftoning</h5>
104 | 
105 |     <p><tt>jbig2enc</tt> doesn't support this at all - so I will only mention
106 |     this quickly. The JBIG2 standard supports the efficient encoding of
107 |     halftoning by building a dictionary of halftone blocks (like the
108 |     dictionaries of symbols which we build for text pages). The lack of support
109 |     for halftones in G4 (the old fax standard) was a major weakness.</p>
110 | 
111 |     <h5>Some numbers</h5>
112 | 
113 |     <p>My sample is a set of 90 scanned pages from the middle of a
114 |     recent book. The scanned images are 300dpi grayscale and they are being
115 |     upsampled to 600dpi 1-bpp for encoding.</p>
116 | 
117 |     <ul>
118 |       <li>Generic encoding each page: 3435177 bytes</li>
119 |       <li>Symbol encoding each page (default classifier settings): 1075185 bytes</li>
120 |       <li>Symbol encoding with refinement for more than 10 incorrect pixels: 3382605 bytes</li>
121 | 
122 |     </ul>
123 | 
124 |     <h2>Command line options</h2>
125 | 
126 |     <p><tt>jbig2enc</tt> comes with a handy command line tool for encoding
127 |     images.</p>
128 | 
129 |     <ul>
130 |       <li><tt>-d | --duplicate-line-removal</tt>: When encoding generic
131 |       regions each scan line can be tagged to indicate that it's the same as
132 |       the last scanline - and encoding that scanline is skipped. This
133 |       drastically reduces the encoding time (by a factor of about 2 on some
134 |       images) although it doesn't typically save any bytes. This is an option
135 |       because some versions of <tt>jbig2dec</tt> (an open source decoding
136 |       library) cannot handle this.</li>
137 | 
138 |       <li><tt>-p | --pdf</tt>: The PDF spec includes support for JBIG2
139 |       (Syntax&rarr;Filters&rarr;JBIG2Decode in the PDF references for versions
140 |       1.4 and above). However, PDF requires a slightly different format for
141 |       JBIG2 streams: no file/page headers or trailers and all pages are
142 |       numbered 1. In symbol mode the output is to a series of files:
143 |       <tt>symboltable</tt> and <tt>page-</tt><i>n</i> (numbered from 0)</li>
144 | 
145 |       <li><tt>-s | --symbol-mode</tt>: use symbol encoding. Turn on for scanned
146 |       text pages.</li>
147 | 
148 |       <li><tt>-t &lt;threshold&gt;</tt>: sets the fraction of pixels which have
149 |       to match in order for two symbols to be classed the same. This isn't
150 |       strictly true, as there are other tests as well, but increasing this will
151 |       generally increase the number of symbol classes.</li>
152 | 
153 |       <li><tt>-w &lt;weight&gt;</tt>: sets weightfactor (0.1-0.9) that corrects
154 |       thresh for thick characters.</li>
155 | 
156 |       <li><tt>-T &lt;threshold&gt;</tt>: sets the black threshold (0-255). Any gray value darker
157 |       than this is considered black. Anything lighter is considered white.</li>
158 | 
159 |       <li><tt>-r | --refine &lt;tolerance&gt;</tt>: (requires <tt>-s</tt>) turn
160 |       on refinement for symbols with more than <tt>tolerance</tt> incorrect
161 |       pixels. (10 is a good value for 300dpi, try 40 for 600dpi). Note: this is
162 |       known to crash Adobe products.</li>
163 | 
164 |       <li><tt>-O &lt;outfile&gt;</tt>: dump a PNG of the 1 bpp image before
165 |       encoding. Can be used to test loss.</li>
166 | 
167 |       <li><tt>-2</tt> or <tt>-4</tt>: upscale either two or four times before
168 |       converting to black and white.</li>
169 | 
170 |       <li><tt>-S</tt> Segment an image into text and non-text regions. This isn't perfect, but running images through the symbol compressor is terrible so it's worth doing if your input has images in it (like a magazine page). You can also give the <tt>--image-output</tt> option to set a filename to which the parts which were removed are written (PNG format).</li>
171 |   </ul>
172 |   </body>
173 | </html>
174 | 


--------------------------------------------------------------------------------
/images/feyn.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/agl/jbig2enc/494055cb47708bd015ae607962b02100584898fa/images/feyn.tif


--------------------------------------------------------------------------------
/jbig2topdf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Copyright 2006 Google Inc.
  3 | # Author: agl@imperialviolet.org (Adam Langley)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | # JBIG2 Encoder
 18 | # https://github.com/agl/jbig2enc
 19 | 
 20 | import glob
 21 | import struct
 22 | import sys
 23 | from pathlib import Path
 24 | 
 25 | # This is a very simple script to make a PDF file out of the output of a
 26 | # multipage symbol compression.
 27 | # Run ./jbig2 -s -p <other options> image1.jpeg image2.jpeg ...
 28 | # python jbig2topdf.py output > out.pdf
 29 | 
 30 | 
 31 | dpi = 72  # Fallback resolution (dots per inch) used by create_pdf when a page header reports 0 for xres/yres
 32 | 
 33 | 
 34 | class Ref:
 35 |     def __init__(self, x: int):
 36 |         self.x = x
 37 | 
 38 |     def __str__(self) -> str:
 39 |         return f"{self.x} 0 R"
 40 | 
 41 | 
 42 | class Dict:
 43 |     def __init__(self, values: dict = None):
 44 |         if values is None:
 45 |             values = {}
 46 |         self.d = values.copy()
 47 | 
 48 |     def __str__(self) -> str:
 49 |         entries = [f"/{key} {value}" for key, value in self.d.items()]
 50 |         return f"<< {' '.join(entries)} >>\n"
 51 | 
 52 | 
 53 | class Obj:
 54 |     next_id = 1
 55 | 
 56 |     def __init__(self, d: dict = None, stream: str = None):
 57 |         if d is None:
 58 |             d = {}
 59 |         if stream is not None:
 60 |             d["Length"] = str(len(stream))
 61 |         self.d = Dict(d)
 62 |         self.stream = stream
 63 |         self.id = Obj.next_id
 64 |         Obj.next_id += 1
 65 | 
 66 |     def __str__(self) -> str:
 67 |         result = [str(self.d)]
 68 |         if self.stream is not None:
 69 |             result.append(f"stream\n{self.stream}\nendstream\n")
 70 |         result.append("endobj\n")
 71 |         return "".join(result)
 72 | 
 73 | 
 74 | class Doc:
 75 |     def __init__(self):
 76 |         self.objs = []
 77 |         self.pages = []
 78 | 
 79 |     def add_object(self, obj: Obj) -> Obj:
 80 |         """Adds an object to the document."""
 81 |         self.objs.append(obj)
 82 |         return obj
 83 | 
 84 |     def add_page(self, page: Obj) -> Obj:
 85 |         """Adds a page to the document and the list of objects."""
 86 |         self.pages.append(page)
 87 |         return self.add_object(page)
 88 | 
 89 |     def __str__(self) -> str:
 90 |         output = []
 91 |         offsets = []
 92 |         current_offset = 0
 93 | 
 94 |         def add_line(line: str):
 95 |             nonlocal current_offset
 96 |             output.append(line)
 97 |             current_offset += len(line) + 1  # Adding 1 for the newline character
 98 | 
 99 |         # PDF header
100 |         add_line("%PDF-1.4")
101 | 
102 |         # Add each object and track its byte offset
103 |         for obj in self.objs:
104 |             offsets.append(current_offset)
105 |             add_line(f"{obj.id} 0 obj")
106 |             add_line(str(obj))
107 | 
108 |         # Cross-reference table
109 |         xref_start = current_offset
110 |         add_line("xref")
111 |         add_line(f"0 {len(offsets) + 1}")
112 |         add_line("0000000000 65535 f ")
113 |         for offset in offsets:
114 |             add_line(f"{offset:010} 00000 n ")
115 | 
116 |         # Trailer and EOF
117 |         add_line("trailer")
118 |         add_line(f"<< /Size {len(offsets) + 1}\n/Root 1 0 R >>")
119 |         add_line("startxref")
120 |         add_line(str(xref_start))
121 |         add_line("%%EOF")
122 | 
123 |         return "\n".join(output)
124 | 
125 | 
126 | def ref(x: int) -> str:
127 |     """Creates a PDF reference string."""
128 |     return f"{x} 0 R"
129 | 
130 | 
131 | def create_pdf(symboltable: str = "symboltable", pagefiles: list = None):
132 |     """Creates a PDF document from a symbol table and a list of page files."""
133 |     pagefiles = pagefiles or glob.glob("page-*")
134 |     doc = Doc()
135 | 
136 |     # Add catalog and outlines objects
137 |     catalog_obj = Obj({"Type": "/Catalog", "Outlines": ref(2), "Pages": ref(3)})
138 |     outlines_obj = Obj({"Type": "/Outlines", "Count": "0"})
139 |     pages_obj = Obj({"Type": "/Pages"})
140 | 
141 |     doc.add_object(catalog_obj)
142 |     doc.add_object(outlines_obj)
143 |     doc.add_object(pages_obj)
144 | 
145 |     # Read symbol table if it exists
146 |     symd = None
147 |     if symboltable:
148 |         try:
149 |             sym_file = Path(symboltable).read_bytes()
150 |             symd = doc.add_object(Obj({}, sym_file.decode("latin1")))
151 |         except IOError:
152 |             sys.stderr.write(f"Error reading symbol table: {symboltable}\n")
153 |             return
154 | 
155 |     page_objs = []
156 |     pagefiles.sort()
157 | 
158 |     for p in pagefiles:
159 |         try:
160 |             contents = Path(p).read_bytes()
161 |         except IOError:
162 |             sys.stderr.write(f"Error reading page file: {p}\n")
163 |             continue
164 | 
165 |         try:
166 |             width, height, xres, yres = struct.unpack(">IIII", contents[11:27])
167 |         except struct.error:
168 |             sys.stderr.write(f"Error unpacking page file: {p}\n")
169 |             continue
170 | 
171 |         # Set default resolution if missing
172 |         xres = xres or dpi
173 |         yres = yres or dpi
174 | 
175 |         # Create XObject (image) for the page
176 |         lexicon = {
177 |             "Type": "/XObject",
178 |             "Subtype": "/Image",
179 |             "Width": str(width),
180 |             "Height": str(height),
181 |             "ColorSpace": "/DeviceGray",
182 |             "BitsPerComponent": "1",
183 |             "Filter": "/JBIG2Decode",
184 |         }
185 |         if symd:
186 |             lexicon["DecodeParms"] = f"<< /JBIG2Globals {symd.id} 0 R >>"
187 |         xobj = Obj(
188 |             lexicon,
189 |             contents.decode("latin1"),
190 |         )
191 | 
192 |         # Create content stream for the page
193 |         contents_obj = Obj(
194 |             {},
195 |             f"q {float(width * 72) / xres} 0 0 {float(height * 72) / yres} 0 0 cm /Im1 Do Q",
196 |         )
197 | 
198 |         # Create resource dictionary for the page
199 |         resources_obj = Obj(
200 |             {"ProcSet": "[/PDF /ImageB]", "XObject": f"<< /Im1 {xobj.id} 0 R >>"}
201 |         )
202 | 
203 |         # Create the page object
204 |         page_obj = Obj(
205 |             {
206 |                 "Type": "/Page",
207 |                 "Parent": "3 0 R",
208 |                 "MediaBox": f"[ 0 0 {float(width * 72) / xres} {float(height * 72) / yres} ]",
209 |                 "Contents": ref(contents_obj.id),
210 |                 "Resources": ref(resources_obj.id),
211 |             }
212 |         )
213 | 
214 |         # Add objects to the document
215 |         for obj in (xobj, contents_obj, resources_obj, page_obj):
216 |             doc.add_object(obj)
217 | 
218 |         page_objs.append(page_obj)
219 | 
220 |         # Update pages object
221 |         pages_obj.d.d["Count"] = str(len(page_objs))
222 |         pages_obj.d.d["Kids"] = "[" + " ".join([ref(x.id) for x in page_objs]) + "]"
223 | 
224 |     # Output the final PDF document to stdout
225 |     sys.stdout.buffer.write(str(doc).encode("latin1"))
226 | 
227 | 
228 | def usage(script, msg):
229 |     """Display usage information and an optional error message."""
230 |     if msg:
231 |         sys.stderr.write(f"{script}: {msg}\n")
232 |     sys.stderr.write(f"""
233 | Usage:
234 |   {script} [basename] > out.pdf
235 |   {script} -s [page.jb2]... > out.pdf
236 | 
237 |   Read symbol table from `basename.sym` and pages from `basename.[0-9]*`
238 |     if basename not given: symbol table from `symboltable`, pages from `page-*`
239 | 
240 |   -s: standalone mode (no global symbol table)
241 | """)
242 |     sys.exit(1)
243 | 
244 | 
245 | def validate_file_exists(file_path: str, script: str, error_msg: str) -> None:
246 |     """Validates that a file exists, otherwise exits with usage error."""
247 |     if not Path(file_path).exists():
248 |         usage(script, error_msg)
249 | 
250 | 
251 | def parse_args(script: str) -> tuple:
252 |     """Parses command-line arguments and returns the symbol table and page files."""
253 |     if "-s" in sys.argv:
254 |         # Standalone mode, no global symbol table
255 |         pages = [arg for arg in sys.argv[1:] if arg != "-s"]
256 |         return "", pages
257 |     elif len(sys.argv) == 2:
258 |         base_name = sys.argv[1]
259 |         sym = f"{base_name}.sym"
260 |         pages = glob.glob(f"{base_name}.[0-9]*")
261 |     elif len(sys.argv) == 1:
262 |         sym = "symboltable"
263 |         pages = glob.glob("page-*")
264 |     else:
265 |         usage(script, "wrong number of arguments!")
266 | 
267 |     # Validate that the symbol table and pages exist
268 |     validate_file_exists(sym, script, f"symbol table '{sym}' not found!")
269 |     if not pages:
270 |         usage(script, "no pages found!")
271 | 
272 |     return sym, pages
273 | 
274 | 
275 | if __name__ == "__main__":  # script entry point
276 |     sym, pages = parse_args(sys.argv[0])  # argv[0] only labels the usage/error text
277 |     create_pdf(sym, pages)  # writes the finished PDF to stdout
278 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | # Flags added to every C++ compile and every link in this directory.
 2 | AM_CXXFLAGS = -Wall
 3 | AM_LDFLAGS = -Wl,-E
 4 | 
 5 | # The encoder library (built through libtool) and the headers listed for
 6 | # installation.
 7 | # NOTE(review): jbig2enc.h exists in src/ but is not listed in include_HEADERS;
 8 | # confirm it is installed/distributed from somewhere else.
 9 | lib_LTLIBRARIES = libjbig2enc.la
10 | libjbig2enc_la_SOURCES = jbig2enc.cc jbig2arith.cc jbig2sym.cc jbig2comparator.cc
11 | libjbig2enc_la_LDFLAGS = -no-undefined -version-info $(GENERIC_LIBRARY_VERSION)
12 | include_HEADERS = jbig2arith.h jbig2sym.h jbig2structs.h jbig2segments.h jbig2comparator.h
13 | 
14 | # The command-line tool, linked against the library built above.
15 | bin_PROGRAMS = jbig2
16 | jbig2_SOURCES = jbig2.cc
17 | jbig2_LDADD = libjbig2enc.la 
18 | jbig2_LDFLAGS = -static
19 | 
20 | # MINGW is an Automake conditional (presumably set up in configure.ac; TODO
21 | # confirm); when it holds, the Winsock library is added to the link.
22 | if MINGW
23 | jbig2_LDADD += -lws2_32
24 | endif 
17 | 


--------------------------------------------------------------------------------
/src/jbig2.cc:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Google Inc. All Rights Reserved.
  2 | // Author: agl@imperialviolet.org (Adam Langley)
  3 | //
  4 | // Copyright (C) 2006 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #include <vector>
 19 | 
 20 | #include <sys/types.h>
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <fcntl.h>
 24 | #include <string.h>
 25 | #ifdef _MSC_VER
 26 | #include <io.h>
 27 | #else
 28 | #include <unistd.h>
 29 | #endif
 30 | 
 31 | #include <leptonica/allheaders.h>
 32 | #if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || LIBLEPT_MAJOR_VERSION > 1
 33 | #include "leptonica/pix_internal.h"
 34 | #endif
 35 | 
 36 | #include "jbig2enc.h"
 37 | 
 38 | #if defined(WIN32)
 39 | #define WINBINARY O_BINARY  // NOTE(review): not used in this excerpt; presumably an extra open() flag - confirm
 40 | #else
 41 | #define WINBINARY 0
 42 | #endif
 43 | 
 44 | #define JBIG2_THRESHOLD_MIN 0.4f  // smallest value accepted for -t
 45 | #define JBIG2_THRESHOLD_MAX 0.97f  // largest value accepted for -t
 46 | #define JBIG2_THRESHOLD_DEF 0.92f  // classification threshold when -t is not given
 47 | #define JBIG2_WEIGHT_MIN 0.1f  // smallest value accepted for -w
 48 | #define JBIG2_WEIGHT_MAX 0.9f  // largest value accepted for -w
 49 | #define JBIG2_WEIGHT_DEF 0.5f  // classification weight when -w is not given
 50 | #define BW_THRESHOLD_MIN 0  // presumably the lower bound for -T; its use is outside this excerpt
 51 | #define BW_THRESHOLD_MAX 255  // presumably the upper bound for -T; its use is outside this excerpt
 52 | #define BW_LOCAL_THRESHOLD_DEF 200  // initial bw_threshold (local thresholding is the default)
 53 | #define BW_GLOBAL_THRESHOLD_DEF 128  // bw_threshold selected by -G / --global
 54 | 
 55 | // Prints the command-line synopsis and the option list to stderr.
 56 | // argv0 is the program name shown on the "Usage:" line.  The list is kept in
 57 | // step with the options main() parses: it includes the long form --basename
 58 | // and -h/--help, and marks -r as disabled because main() rejects it.
 59 | static void
 60 | usage(const char *argv0) {
 61 |   fprintf(stderr, "Usage: %s [options] <input filenames...>\n", argv0);
 62 |   fprintf(stderr, "Options:\n");
 63 |   fprintf(stderr, "  -h --help: print this message\n");
 64 |   fprintf(stderr, "  -b --basename <basename>: output file root name when using symbol coding\n");
 65 |   fprintf(stderr, "  -d --duplicate-line-removal: use TPGD in generic region coder\n");
 66 |   fprintf(stderr, "  -p --pdf: produce PDF ready data\n");
 67 |   fprintf(stderr, "  -s --symbol-mode: use text region, not generic coder\n");
 68 |   fprintf(stderr, "  -t <threshold>: set classification threshold for symbol coder (def: %0.2f)\n", JBIG2_THRESHOLD_DEF);
 69 |   fprintf(stderr, "  -w <weight>: set classification weight for symbol coder (def: %0.2f)\n", JBIG2_WEIGHT_DEF);
 70 |   fprintf(stderr, "  -T <bw threshold>: set 1 bpp threshold (def: %d)\n", BW_LOCAL_THRESHOLD_DEF);
 71 |   fprintf(stderr, "  -G --global: use global BW threshold on 8 bpp images;\n"
 72 |                   "               the default is to use local (adaptive) thresholding\n");
 73 |   fprintf(stderr, "  -r --refine: use refinement (requires -s: lossless); currently disabled\n");
 74 |   fprintf(stderr, "  -O <outfile>: dump thresholded image as PNG\n");
 75 |   fprintf(stderr, "  -2: upsample 2x before thresholding\n");
 76 |   fprintf(stderr, "  -4: upsample 4x before thresholding\n");
 77 |   fprintf(stderr, "  -S: remove images from mixed input and save separately\n");
 78 |   fprintf(stderr, "  -j --jpeg-output: write images from mixed input as JPEG\n");
 79 |   fprintf(stderr, "  -a --auto-thresh: use automatic thresholding in symbol encoder\n");
 80 |   fprintf(stderr, "  -D --dpi: force dpi\n");
 81 |   fprintf(stderr, "  --no-hash: disables use of hash function for automatic thresholding\n");
 82 |   fprintf(stderr, "  -V --version: version info\n");
 83 |   fprintf(stderr, "  -v: be verbose\n");
 84 | }
 80 | 
 81 | static bool verbose = false;  // gates the diagnostic output written to stderr (e.g. in segment_image)
 82 | 
 83 | 
 84 | // Writes a one-line description of `pix` to stderr: width x height, depth,
 85 | // x/y resolution and reference count.  A non-NULL `msg` is printed first,
 86 | // followed by a space; a NULL `pix` is reported instead of dereferenced.
 87 | static void
 88 | pixInfo(PIX *pix, const char *msg) {
 89 |   if (msg) {
 90 |     fprintf(stderr, "%s ", msg);
 91 |   }
 92 | 
 93 |   if (!pix) {
 94 |     fputs("NULL pointer!\n", stderr);
 95 |     return;
 96 |   }
 97 | 
 98 |   fprintf(stderr, "%u x %u (%d bits) %udpi x %udpi, refcount = %u\n",
 99 |           pix->w, pix->h, pix->d, pix->xres, pix->yres, pix->refcount);
100 | }
 94 | 
 95 | #ifdef WIN32
 96 | // -----------------------------------------------------------------------------
 97 | // Windows, sadly, lacks asprintf
 98 | //
 99 | // Formats into a freshly malloc()ed, NUL-terminated buffer stored in *strp
100 | // (release it with free()).  Returns the number of characters written, not
101 | // counting the terminator; on failure returns -1 and sets *strp to NULL.
102 | // -----------------------------------------------------------------------------
103 | #include <stdarg.h>
104 | int
105 | asprintf(char **strp, const char *fmt, ...) {
106 |   va_list va;
107 | 
108 |   // First pass: only measure.  The va_list is consumed by vsnprintf, so it
109 |   // is closed here and started again for the second pass instead of reused.
110 |   va_start(va, fmt);
111 |   const int required = vsnprintf(NULL, 0, fmt, va);
112 |   va_end(va);
113 |   if (required < 0) {
114 |     *strp = NULL;
115 |     return -1;
116 |   }
117 | 
118 |   char *const buffer = (char *) malloc((size_t) required + 1);
119 |   if (buffer == NULL) {
120 |     *strp = NULL;
121 |     return -1;
122 |   }
123 | 
124 |   // Second pass: format for real with a fresh argument list.
125 |   va_start(va, fmt);
126 |   const int ret = vsnprintf(buffer, (size_t) required + 1, fmt, va);
127 |   va_end(va);
128 |   if (ret < 0) {
129 |     free(buffer);
130 |     *strp = NULL;
131 |     return -1;
132 |   }
133 | 
134 |   *strp = buffer;
135 |   return ret;
136 | }
137 | #endif
115 | 
116 | // -----------------------------------------------------------------------------
117 | // Morphological operations for segmenting an image into text regions
118 | // -----------------------------------------------------------------------------
119 | static const char *segment_mask_sequence = "r11";  // pixMorphSequence() input that yields pixmask4 in segment_image
120 | static const char *segment_seed_sequence = "r1143 + o4.4 + x4"; /* maybe o6.6 */  // yields pixseed4 in segment_image
121 | static const char *segment_dilation_sequence = "d3.3";  // applied to the seed-filled result (pixsf4) in segment_image
122 | 
123 | // -----------------------------------------------------------------------------
124 | // Takes two pix as input, generated from the same original image:
125 | //   1. pixb   - a binary thresholded image
126 | //   2. piximg - a color or grayscale image
127 | // and segments them by finding the areas that contain color or grayscale
128 | // graphics.  These graphics regions are removed from the input binary
129 | // image, and they are retained in the returned color-or-grayscale image.
130 | // The upshot is that after this routine has been run:
131 | //  (a) the input binary image contains only text, and is NULL if there
132 | //      is no text, and
133 | //  (b) the returned color-or-grayscale image contains only the graphics,
134 | //      and is NULL if there is no graphics.
135 | // The input color-or-grayscale image is not affected.
136 | //
137 | // Thanks to Dan Bloomberg for this
138 | // -----------------------------------------------------------------------------
139 | 
140 | static PIX*
141 | segment_image(PIX **ppixb, PIX *piximg) {
142 |   PIX *pixb = *ppixb;  // working alias of the caller's binary image
143 |   // Make a mask over the non-text (graphics) part of the input 1 bpp image
144 |   // Do this by making a seed and mask, and filling the seed into the mask
145 |   PIX *pixmask4 = pixMorphSequence(pixb, (char *) segment_mask_sequence, 0);
146 |   PIX *pixseed4 = pixMorphSequence(pixb, (char *) segment_seed_sequence, 0);
147 |   PIX *pixsf4 = pixSeedfillBinary(NULL, pixseed4, pixmask4, 8);
148 |   PIX *pixd4 = pixMorphSequence(pixsf4, (char *) segment_dilation_sequence, 0);
149 |   PIX *pixd = pixExpandBinaryPower2(pixd4, 4);  // presumably scales the mask back up to the input size - confirm
150 |   pixDestroy(&pixd4);  // the intermediates are no longer needed once pixd exists
151 |   pixDestroy(&pixsf4);
152 |   pixDestroy(&pixseed4);
153 |   pixDestroy(&pixmask4);
154 |   if (verbose) pixInfo(pixd, "mask image: ");
155 | 
156 |   // Remove pixels over the graphics part from the text mask.  This
157 |   // side-effects the input binary mask.
158 |   pixSubtract(pixb, pixb, pixd);
159 | 
160 |   // Set up table to count pixels in the text and graphics masks
161 |   static l_int32 *tab = NULL;  // created on the first call, then reused; never freed here
162 |   if (tab == NULL) tab = makePixelSumTab8();
163 | 
164 |   // If no graphics portion is found, destroy the graphics mask and return NULL
165 |   l_int32  pcount;
166 |   pixCountPixels(pixd, &pcount, tab);
167 |   if (verbose) fprintf(stderr, "pixel count of graphics image: %u\n", pcount);  // NOTE(review): l_int32 printed with %u
168 |   if (pcount < 100) {  // fewer than 100 counted pixels is treated as "no graphics"
169 |     pixDestroy(&pixd);
170 |     return NULL;
171 |   }
172 | 
173 |   // If no text portion is found, destroy the input binary image.
174 |   pixCountPixels(pixb, &pcount, tab);
175 |   if (verbose) fprintf(stderr, "pixel count of binary image: %u\n", pcount);
176 |   if (pcount < 100) {  // fewer than 100 counted pixels is treated as "no text"
177 |     pixDestroy(ppixb);  // destroy & set caller handle to NULL
178 |     pixb = NULL;  // needed later in this function for pixInfo()
179 |   }
180 | 
181 |   PIX *piximg1;  // piximg brought to a depth of 1, 8 or 32 (per the calls below)
182 |   if (piximg->d == 1 || piximg->d == 8 || piximg->d == 32) {
183 |     piximg1 = pixClone(piximg);
184 |   } else if (piximg->d > 8) {
185 |     piximg1 = pixConvertTo32(piximg);
186 |   } else {
187 |     piximg1 = pixConvertTo8(piximg, FALSE);
188 |   }
189 | 
190 |   PIX *pixd1;  // the graphics mask, converted to the same depth as piximg1
191 |   if (piximg1->d == 32) {
192 |     pixd1 = pixConvertTo32(pixd);
193 |   } else if (piximg1->d == 8) {
194 |     pixd1 = pixConvertTo8(pixd, FALSE);
195 |   } else {
196 |     pixd1 = pixClone(pixd);
197 |   }
198 |   pixDestroy(&pixd);  // pixd1 carries the mask from here on
199 | 
200 |   if (verbose) {
201 |     pixInfo(pixd1, "binary mask image:");
202 |     pixInfo(piximg1, "graphics image:");
203 |   }
204 |   pixRasteropFullImage(pixd1, piximg1, PIX_SRC | PIX_DST);  // merges piximg1 into pixd1 in place (presumably a bitwise OR - confirm)
205 | 
206 |   pixDestroy(&piximg1);  // releases this function's handle only
207 |   if (verbose) {
208 |     pixInfo(pixb, "segmented binary text image:");
209 |     pixInfo(pixd1, "segmented graphics image:");
210 |   }
211 | 
212 |   return pixd1;  // the result is handed to the caller
213 | }
214 | 
215 | int
216 | main(int argc, char **argv) {
217 |   bool duplicate_line_removal = false;
218 |   bool pdfmode = false;
219 |   bool globalmode = false;
220 |   int bw_threshold = BW_LOCAL_THRESHOLD_DEF;
221 |   float threshold = JBIG2_THRESHOLD_DEF;
222 |   float weight = JBIG2_WEIGHT_DEF;
223 |   bool symbol_mode = false;
224 |   bool refine = false;
225 |   bool up2 = false, up4 = false;
226 |   const char *output_threshold_image = NULL;
227 |   const char *basename = "output";
228 |   l_int32 img_fmt = IFF_PNG;
229 |   const char *img_ext = "png";
230 |   bool segment = false;
231 |   bool auto_thresh = false;
232 |   bool hash = true;
233 |   int dpi = 0;
234 |   int i;
235 | 
236 |   #ifdef WIN32
237 |     int result = _setmode(_fileno(stdout), _O_BINARY);
238 |     if (result == -1)
239 |       fprintf(stderr, "Cannot set mode to binary for stdout\n");
240 |   #endif
241 | 
242 |   for (i = 1; i < argc; ++i) {
243 |     if (strcmp(argv[i], "-h") == 0 ||
244 |         strcmp(argv[i], "--help") == 0) {
245 |       usage(argv[0]);
246 |       return 0;
247 |       continue;
248 |     }
249 | 
250 |     if (strcmp(argv[i], "-V") == 0 ||
251 |         strcmp(argv[i], "--version") == 0) {
252 |       fprintf(stderr, "jbig2enc %s\n", getVersion());
253 | 
254 |       char *versionStrP;
255 |       versionStrP = getLeptonicaVersion();
256 |       fprintf(stderr, " %s\n", versionStrP);
257 |       lept_free(versionStrP);
258 | 
259 |       versionStrP = getImagelibVersions();
260 |       fprintf(stderr, "  %s\n", versionStrP);
261 |       lept_free(versionStrP);
262 |       return 0;
263 |     }
264 | 
265 |     if (strcmp(argv[i], "-b") == 0 ||
266 |         strcmp(argv[i], "--basename") == 0) {
267 |       basename = argv[i+1];
268 |       i++;
269 |       continue;
270 |     }
271 | 
272 |     if (strcmp(argv[i], "-d") == 0 ||
273 |         strcmp(argv[i], "--duplicate-line-removal") == 0) {
274 |       duplicate_line_removal = true;
275 |       continue;
276 |     }
277 | 
278 |     if (strcmp(argv[i], "-p") == 0 ||
279 |         strcmp(argv[i], "--pdf") == 0) {
280 |       pdfmode = true;
281 |       continue;
282 |     }
283 | 
284 |     if (strcmp(argv[i], "-s") == 0 ||
285 |         strcmp(argv[i], "--symbol-mode") == 0) {
286 |       symbol_mode = true;
287 |       continue;
288 |     }
289 | 
290 |     if (strcmp(argv[i], "-r") == 0 ||
291 |         strcmp(argv[i], "--refine") == 0) {
292 |       fprintf(stderr, "Refinement broke in recent releases since it's "
293 |                       "rarely used. If you need it you should bug "
294 |                       "agl@imperialviolet.org to fix it\n");
295 |       return 1;
296 |       refine = true;
297 |       continue;
298 |     }
299 | 
300 |     if (strcmp(argv[i], "-2") == 0) {
301 |       up2 = true;
302 |       continue;
303 |     }
304 |     if (strcmp(argv[i], "-4") == 0) {
305 |       up4 = true;
306 |       continue;
307 |     }
308 | 
309 |     if (strcmp(argv[i], "-O") == 0) {
310 |       output_threshold_image = argv[i+1];
311 |       i++;
312 |       continue;
313 |     }
314 | 
315 |     if (strcmp(argv[i], "-S") == 0) {
316 |       segment = true;
317 |       continue;
318 |     }
319 | 
320 |     if (strcmp(argv[i], "-j") == 0 ||
321 |         strcmp(argv[i], "--jpeg-output") == 0) {
322 |       img_ext = "jpg";
323 |       img_fmt = IFF_JFIF_JPEG;
324 |       continue;
325 |     }
326 | 
327 |     if (strcmp(argv[i], "-t") == 0) {
328 |       char *endptr;
329 |       threshold = strtod(argv[i+1], &endptr);
330 |       if (*endptr) {
331 |         fprintf(stderr, "Cannot parse float value: %s\n", argv[i+1]);
332 |         usage(argv[0]);
333 |         return 1;
334 |       }
335 | 
336 |       if ((threshold < JBIG2_THRESHOLD_MIN) ||
337 |           (threshold > JBIG2_THRESHOLD_MAX)) {
338 |         fprintf(stderr, "Invalid value for threshold\n");
339 |         fprintf(stderr, "(must be between %0.2f and %0.2f)\n",
340 |                 JBIG2_THRESHOLD_MIN, JBIG2_THRESHOLD_MAX);
341 |         return 10;
342 |       }
343 |       i++;
344 |       continue;
345 |      }
346 | 
347 |     if (strcmp(argv[i], "-w") == 0) {
348 |       char *endptr;
349 |       weight = strtod(argv[i+1], &endptr);
350 |       if (*endptr) {
351 |         fprintf(stderr, "Cannot parse float value: %s\n", argv[i+1]);
352 |         usage(argv[0]);
353 |         return 1;
354 |       }
355 | 
356 |       if ((weight < JBIG2_WEIGHT_MIN) || (weight > JBIG2_WEIGHT_MAX)) {
357 |         fprintf(stderr, "Invalid value for weight\n");
358 |         fprintf(stderr, "(must be between %0.2f and %0.2f)\n",
359 |                 JBIG2_WEIGHT_MIN, JBIG2_WEIGHT_MAX);
360 |         return 10;
361 |       }
362 |       i++;
363 |       continue;
364 |     }
365 | 
366 |     // Local BW thresholding is the default.  However, if global
367 |     // BW thresholding is requested, use its default threshold.
368 |     if (strcmp(argv[i], "-G") == 0 ||
369 |         strcmp(argv[i], "--global") == 0) {
370 |       globalmode = true;
371 |       bw_threshold = BW_GLOBAL_THRESHOLD_DEF;
372 |       continue;
373 |     }
374 | 
375 |     // If a BW threshold value is requested, overwrite the default value.
376 |     if (strcmp(argv[i], "-T") == 0) {
377 |       char *endptr;
378 |       bw_threshold = strtol(argv[i+1], &endptr, 10);
379 |       if (*endptr) {
380 |         fprintf(stderr, "Cannot parse int value: %s\n", argv[i+1]);
381 |         usage(argv[0]);
382 |         return 1;
383 |       }
384 |       if (bw_threshold < BW_THRESHOLD_MIN || bw_threshold > BW_THRESHOLD_MAX) {
385 |         fprintf(stderr, "Invalid bw threshold: (%d..%d)\n",
386 |                 BW_THRESHOLD_MIN, BW_THRESHOLD_MAX);
387 |         return 11;
388 |       }
389 |       i++;
390 |       continue;
391 |     }
392 | 
393 |     // engage auto thresholding
394 |     if (strcmp(argv[i], "--auto-thresh") == 0 ||
395 |         strcmp(argv[i], "-a") == 0 ) {
396 |       auto_thresh = true;
397 |       continue;
398 |     }
399 | 
400 |     if (strcmp(argv[i], "--no-hash") == 0) {
401 |       hash = false;
402 |       continue;
403 |     }
404 | 
405 |     if (strcmp(argv[i], "-v") == 0) {
406 |       verbose = true;
407 |       continue;
408 |     }
409 | 
410 |     if (strcmp(argv[i], "-D") == 0 ||
411 |         strcmp(argv[i], "--dpi") == 0) {
412 |       char *endptr;
413 |       long t_dpi = strtol(argv[i+1], &endptr, 10);
414 |       if (*endptr) {
415 |     fprintf(stderr, "Cannot parse int value: %s\n", argv[i+1]);
416 |     usage(argv[0]);
417 |     return 1;
418 |       }
419 |       if (t_dpi <= 0 || t_dpi > 9600) {
420 |         fprintf(stderr, "Invalid dpi: (1..9600)\n");
421 |         return 12;
422 |       } 
423 |       dpi = (int)t_dpi;
424 |       i++;
425 |       continue;
426 |     }
427 | 
428 |     break;
429 |   }
430 | 
431 |   if (i == argc) {
432 |     fprintf(stderr, "No filename given\n\n");
433 |     usage(argv[0]);
434 |     return 4;
435 |   }
436 | 
437 |   if (refine && !symbol_mode) {
438 |     fprintf(stderr, "Refinement makes not sense unless in symbol mode!\n");
439 |     fprintf(stderr, "(if you have -r, you must have -s)\n");
440 |     return 5;
441 |   }
442 | 
443 |   if (up2 && up4) {
444 |     fprintf(stderr, "Can't have both -2 and -4!\n");
445 |     return 6;
446 |   }
447 | 
448 |   struct jbig2ctx *ctx = jbig2_init(threshold, weight, 0, 0,
449 |                          !pdfmode, refine ? 10 : -1);
450 |   int pageno = -1;
451 | 
452 |   int numsubimages=0, subimage=0, num_pages = 0;
453 |   while (i < argc) {
454 |     if (subimage==numsubimages) {
455 |       subimage = numsubimages = 0;
456 |       FILE *fp;
457 |       if (verbose) fprintf(stderr, "Processing \"%s\"...\n", argv[i]);
458 |       if ((fp=lept_fopen(argv[i], "r"))==NULL) {
459 |         fprintf(stderr, "Unable to open \"%s\"\n", argv[i]);
460 |         return 1;
461 |       }
462 |       l_int32 filetype;
463 |       findFileFormatStream(fp, &filetype);
464 |       if (filetype==IFF_TIFF && tiffGetCount(fp, &numsubimages)) {
465 |         return 1;
466 |       }
467 |       lept_fclose(fp);
468 |     }
469 | 
470 |     PIX *source;
471 |     if (numsubimages<=1) {
472 |       source = pixRead(argv[i]);
473 |       numsubimages = 0;
474 |     } else {
475 |       source = pixReadTiff(argv[i], subimage++);
476 |     }
477 | 
478 |     if (dpi != 0 && source->xres == 0 && source->yres == 0) {
479 |       source->xres = dpi;
480 |       source->yres = dpi;
481 |     }
482 | 
483 |     if (!source) return 3;
484 |     if (verbose)
485 |       pixInfo(source, "source image:");
486 | 
487 |     PIX *pixl, *gray, *adapt, *pixt;
488 |     if ((pixl = pixRemoveColormap(source, REMOVE_CMAP_BASED_ON_SRC)) == NULL) {
489 |       fprintf(stderr, "Failed to remove colormap from %s\n", argv[i]);
490 |       return 1;
491 |     }
492 |     pixDestroy(&source);
493 |     pageno++;
494 | 
495 |     if (pixl->d > 1) {
496 |       if (pixl->d > 8) {
497 |         gray = pixConvertRGBToGrayFast(pixl);
498 |         if (!gray) return 1;
499 |       } else if (pixl->d == 4 || pixl->d == 8) {
500 |         gray = pixClone(pixl);
501 |       } else {
502 |         fprintf(stderr, "Unsupported input image depth: %d\n", pixl->d);
503 |         return 1;
504 |       }
505 |       if (!globalmode) {
506 |         adapt = pixCleanBackgroundToWhite(gray, NULL, NULL, 1.0, 90, 190);
507 |       } else {
508 |         adapt = pixClone(gray);
509 |       }
510 |       pixDestroy(&gray);
511 |       if (up2) {
512 |         pixt = pixScaleGray2xLIThresh(adapt, bw_threshold);
513 |       } else if (up4) {
514 |         pixt = pixScaleGray4xLIThresh(adapt, bw_threshold);
515 |       } else {
516 |         pixt = pixThresholdToBinary(adapt, bw_threshold);
517 |       }
518 |       pixDestroy(&adapt);
519 |     } else {
520 |       pixt = pixClone(pixl);
521 |     }
522 |     if (!pixt) {
523 |       fprintf(stderr, "Failed to convert input image to binary\n");
524 |       return 1;
525 |     }
526 |     if (verbose)
527 |       pixInfo(pixt, "thresholded image:");
528 | 
529 |     if (output_threshold_image) {
530 |       pixWrite(output_threshold_image, pixt, IFF_PNG);
531 |     }
532 | 
533 |     if (segment && pixl->d > 1) {
534 |       // If no text is found, pixt is destroyed
535 |       PIX *graphics = segment_image(&pixt, pixl);
536 |       pixDestroy(&pixl);  // if pixt == NULL, the loop exits at 'continue'
537 |       if (graphics) {
538 |         if (verbose)
539 |           pixInfo(graphics, "graphics image:");
540 |         char *filename;
541 |         asprintf(&filename, "%s.%04d.%s", basename, pageno, img_ext);
542 |         pixWrite(filename, graphics, img_fmt);
543 |         free(filename);
544 |         pixDestroy(&graphics);
545 |       } else if (verbose) {
546 |         fprintf(stderr, "%s: no graphics found in input image\n", argv[i]);
547 |       }
548 |       if (pixt == NULL) {
549 |         fprintf(stderr, "%s: no text portion found in input image\n", argv[i]);
550 |         i++;
551 |         continue;
552 |       }
553 |     }
554 | 
555 |     pixDestroy(&pixl);
556 | 
557 |     if (!symbol_mode) {
558 |       int length;
559 |       uint8_t *ret;
560 |       ret = jbig2_encode_generic(pixt, !pdfmode, 0, 0, duplicate_line_removal,
561 |                                  &length);
562 |       write(1, ret, length);
563 |       return 0;
564 |     }
565 | 
566 |     jbig2_add_page(ctx, pixt);
567 |     pixDestroy(&pixt);
568 |     num_pages++;
569 |     if (subimage==numsubimages) {
570 |       i++;
571 |     }
572 |   }
573 | 
574 |   if (auto_thresh) {
575 |     if (hash) {
576 |       jbig2enc_auto_threshold_using_hash(ctx);
577 |     } else {
578 |       jbig2enc_auto_threshold(ctx);
579 |     }
580 |   }
581 | 
582 |   uint8_t *ret;
583 |   int length;
584 |   ret = jbig2_pages_complete(ctx, &length);
585 |   if (pdfmode) {
586 |     char *filename;
587 |     asprintf(&filename, "%s.sym", basename);
588 |     const int fd = open(filename, O_WRONLY | O_TRUNC | O_CREAT | WINBINARY, 0600);
589 |     free(filename);
590 |     if (fd < 0) abort();
591 |     write(fd, ret, length);
592 |     close(fd);
593 |   } else {
594 |     write(1, ret, length);
595 |   }
596 |   free(ret);
597 | 
598 |   for (int i = 0; i < num_pages; ++i) {
599 |     ret = jbig2_produce_page(ctx, i, -1, -1, &length);
600 |     if (pdfmode) {
601 |       char *filename;
602 |       asprintf(&filename, "%s.%04d", basename, i);
603 |       const int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | WINBINARY, 0600);
604 |       free(filename);
605 |       if (fd < 0) abort();
606 |       write(fd, ret, length);
607 |       close(fd);
608 |     } else {
609 |       write(1, ret, length);
610 |     }
611 |     free(ret);
612 |   }
613 | 
614 |   jbig2_destroy(ctx);
615 |   return 0;
616 | 
617 | }
618 | 
619 | 


--------------------------------------------------------------------------------
/src/jbig2arith.cc:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Google Inc. All Rights Reserved.
  2 | // Author: agl@imperialviolet.org (Adam Langley)
  3 | //
  4 | // Copyright (C) 2006 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #include "jbig2arith.h"
 19 | 
 20 | #include <string.h>
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | 
 24 | #define u64 uint64_t
 25 | #define u32 uint32_t
 26 | #define u16 uint16_t
 27 | #define u8  uint8_t
 28 | 
 29 | // C++ doesn't have C99 restricted pointers, but GCC does allow __restrict__
 30 | #if !defined(WIN32)
 31 | #define restrict __restrict__
 32 | #else
 33 | #define restrict
 34 | #endif
 35 | 
 36 | // -----------------------------------------------------------------------------
 37 | // This is the structure for a single state of the adaptive arithmetic compressor
 38 | // -----------------------------------------------------------------------------
 39 | struct context {
 40 |   u16 qe;       // amount encode_bit subtracts from the interval register ctx->a
 41 |   u8 mps, lps;  // ctbl index of the next state after coding an MPS / an LPS
 42 | };
 43 | 
 44 | // -----------------------------------------------------------------------------
 45 | // And this is the table of states for that adaptive compressor. Each row is {qe, next state after an MPS, next state after an LPS}
 46 | // -----------------------------------------------------------------------------
 47 | struct context ctbl[] = {
 48 |   // This is the standard state table from Table E.1 of the standard.
 49 |   // The switch has been omitted: instead STATETABLE is expanded twice below
 50 |   // and the SWITCH(...) entries jump into the other copy (index +/- 46)
 51 | #define STATETABLE \
 52 |   {0x5601, F( 1), SWITCH(F( 1))},\
 53 |   {0x3401, F( 2), F( 6)},\
 54 |   {0x1801, F( 3), F( 9)},\
 55 |   {0x0ac1, F( 4), F(12)},\
 56 |   {0x0521, F( 5), F(29)},\
 57 |   {0x0221, F(38), F(33)},\
 58 |   {0x5601, F( 7), SWITCH(F( 6))},\
 59 |   {0x5401, F( 8), F(14)},\
 60 |   {0x4801, F( 9), F(14)},\
 61 |   {0x3801, F(10), F(14)},\
 62 |   {0x3001, F(11), F(17)},\
 63 |   {0x2401, F(12), F(18)},\
 64 |   {0x1c01, F(13), F(20)},\
 65 |   {0x1601, F(29), F(21)},\
 66 |   {0x5601, F(15), SWITCH(F(14))},\
 67 |   {0x5401, F(16), F(14)},\
 68 |   {0x5101, F(17), F(15)},\
 69 |   {0x4801, F(18), F(16)},\
 70 |   {0x3801, F(19), F(17)},\
 71 |   {0x3401, F(20), F(18)},\
 72 |   {0x3001, F(21), F(19)},\
 73 |   {0x2801, F(22), F(19)},\
 74 |   {0x2401, F(23), F(20)},\
 75 |   {0x2201, F(24), F(21)},\
 76 |   {0x1c01, F(25), F(22)},\
 77 |   {0x1801, F(26), F(23)},\
 78 |   {0x1601, F(27), F(24)},\
 79 |   {0x1401, F(28), F(25)},\
 80 |   {0x1201, F(29), F(26)},\
 81 |   {0x1101, F(30), F(27)},\
 82 |   {0x0ac1, F(31), F(28)},\
 83 |   {0x09c1, F(32), F(29)},\
 84 |   {0x08a1, F(33), F(30)},\
 85 |   {0x0521, F(34), F(31)},\
 86 |   {0x0441, F(35), F(32)},\
 87 |   {0x02a1, F(36), F(33)},\
 88 |   {0x0221, F(37), F(34)},\
 89 |   {0x0141, F(38), F(35)},\
 90 |   {0x0111, F(39), F(36)},\
 91 |   {0x0085, F(40), F(37)},\
 92 |   {0x0049, F(41), F(38)},\
 93 |   {0x0025, F(42), F(39)},\
 94 |   {0x0015, F(43), F(40)},\
 95 |   {0x0009, F(44), F(41)},\
 96 |   {0x0005, F(45), F(42)},\
 97 |   {0x0001, F(45), F(43)},
 98 | #undef F
 99 | #define F(x) x
100 | #define SWITCH(x) (x + 46)
101 |   STATETABLE  // first copy: indices 0..45, state numbers used as-is
102 | #undef SWITCH
103 | #undef F
104 | 
105 | #define F(x) (x + 46)
106 | #define SWITCH(x) ((x) - 46)
107 |   STATETABLE  // second copy: indices 46..91, every state number offset by 46
108 | #undef SWITCH
109 | #undef F
110 | };
111 | 
112 | #if defined(__GNUC__) && __GNUC__ >= 4
113 | #define BRANCH_OPT
114 | #endif
115 | 
116 | // Branch-prediction hints (GCC __builtin_expect); plain parenthesised pass-through elsewhere
117 | #ifdef BRANCH_OPT
118 | #define likely(x)       __builtin_expect((x),1)
119 | #define unlikely(x)     __builtin_expect((x),0)
120 | #else
121 | #define likely(x)       (x)
122 | #define unlikely(x)     (x)
123 | #endif
124 | 
125 | // Prepares ctx for first use: clears the context tables, resets the coder registers, allocates output storage (see comments in .h file)
126 | void jbig2enc_init(struct jbig2enc_ctx *ctx) {
127 |   memset(ctx->context, 0, JBIG2_MAX_CTX);
128 |   memset(ctx->intctx, 0, 13 * 512);
129 |   ctx->a = 0x8000;
130 |   ctx->c = 0;
131 |   ctx->ct = 12;
132 |   ctx->bp = -1;  // no byte pending yet: byteout only emits once bp >= 0
133 |   ctx->b = 0;
134 |   ctx->outbuf_used = 0;
135 |   ctx->outbuf = (u8 *) malloc(JBIG2_OUTPUTBUFFER_SIZE);
136 |   if (!ctx->outbuf) abort();  // emit() stores through outbuf without checking it
137 |   ctx->output_chunks = new std::vector<uint8_t *>;
138 |   ctx->iaidctx = NULL;  // allocated on demand by jbig2enc_iaid
139 | }
140 | 
141 | // Returns the coder to its starting state without touching buffered output:
142 | // context tables zeroed, IAID table released, registers reset (see comments in .h file)
143 | void jbig2enc_reset(struct jbig2enc_ctx *ctx) {
144 |   memset(ctx->intctx, 0, 13 * 512);
145 |   memset(ctx->context, 0, JBIG2_MAX_CTX);
146 |   free(ctx->iaidctx);
147 |   ctx->iaidctx = NULL;
148 |   ctx->b = 0;
149 |   ctx->bp = -1;
150 |   ctx->ct = 12;
151 |   ctx->c = 0;
152 |   ctx->a = 0x8000;
153 | }
154 | 
155 | // Discards all buffered output: frees every filled chunk and rewinds the current buffer (see comments in .h file)
156 | void jbig2enc_flush(struct jbig2enc_ctx *ctx) {
157 |   std::vector<uint8_t *> &chunks = *ctx->output_chunks;
158 |   for (size_t n = 0; n < chunks.size(); ++n) {
159 |     free(chunks[n]);
160 |   }
161 |   chunks.clear();
162 | 
163 |   // nothing is buffered or pending any more
164 |   ctx->outbuf_used = 0;
165 |   ctx->bp = -1;
166 | }
167 | 
168 | // Releases everything ctx owns: filled chunks, the chunk list, the current buffer and the IAID table
169 | void jbig2enc_dealloc(struct jbig2enc_ctx *ctx) {
170 |   std::vector<uint8_t *> *const chunks = ctx->output_chunks;
171 |   for (size_t n = 0; n < chunks->size(); ++n) {
172 |     free((*chunks)[n]);
173 |   }
174 |   delete chunks;
175 | 
176 |   free(ctx->outbuf);
177 |   free(ctx->iaidctx);
178 | }
179 | 
180 | // -----------------------------------------------------------------------------
181 | // Emit a byte from the compressor by appending to the current output buffer.
182 | // If the buffer is full, queue it on output_chunks and allocate a new one
183 | // -----------------------------------------------------------------------------
184 | static void inline
185 | emit(struct jbig2enc_ctx *restrict ctx) {
186 |   if (unlikely(ctx->outbuf_used == JBIG2_OUTPUTBUFFER_SIZE)) {
187 |     ctx->output_chunks->push_back(ctx->outbuf);
188 |     ctx->outbuf = (u8 *) malloc(JBIG2_OUTPUTBUFFER_SIZE);
189 |     if (!ctx->outbuf) abort();  // out of memory: the store below would go through NULL
190 |     ctx->outbuf_used = 0;
191 |   }
192 |   ctx->outbuf[ctx->outbuf_used++] = ctx->b;  // ctx->b is the byte being written out
193 | }
194 | 
195 | // -----------------------------------------------------------------------------
196 | // The BYTEOUT procedure from the standard: moves the top bits of the code register c into the pending byte b
197 | // -----------------------------------------------------------------------------
198 | static void
199 | byteout(struct jbig2enc_ctx *restrict ctx) {
200 |   if (ctx->b == 0xff) goto rblock;  // pending byte is 0xff: the next byte only gets 7 bits
201 | 
202 |   if (ctx->c < 0x8000000) goto lblock;  // no carry out of the code register
203 |   ctx->b += 1;  // propagate the carry into the pending byte
204 |   if (ctx->b != 0xff) goto lblock;
205 |   ctx->c &= 0x7ffffff;  // drop the carry bit; b became 0xff, so fall through to rblock
206 | 
207 | rblock:
208 |   if (ctx->bp >= 0) {  // bp starts at -1, so the first call has no byte to emit
209 | #ifdef TRACE
210 |     printf("emit %x\n", ctx->b);
211 | #endif
212 |     emit(ctx);
213 |   }
214 |   ctx->b = ctx->c >> 20;  // new pending byte taken from bit 20 upwards
215 |   ctx->bp++;
216 |   ctx->c &= 0xfffff;
217 |   ctx->ct = 7;  // 7 shifts until the next byteout
218 |   return;
219 | 
220 | lblock:
221 |   if (ctx->bp >= 0) {  // same first-call guard as in rblock
222 | #ifdef TRACE
223 |     printf("emit %x\n", ctx->b);
224 | #endif
225 |     emit(ctx);
226 |   }
227 |   ctx->b = ctx->c >> 19;  // new pending byte taken from bit 19 upwards
228 |   ctx->bp++;
229 |   ctx->c &= 0x7ffff;
230 |   ctx->ct = 8;  // 8 shifts until the next byteout
231 |   return;
232 | }
233 | 
234 | // -----------------------------------------------------------------------------
235 | // A merging of the ENCODE, CODELPS and CODEMPS procedures from the standard: codes bit d with the adaptive state held in context[ctxnum]
236 | // -----------------------------------------------------------------------------
237 | static void
238 | encode_bit(struct jbig2enc_ctx *restrict ctx, u8 *restrict context, u32 ctxnum, u8 d) {
239 |   const u8 i = context[ctxnum];  // index into ctbl of this context's current state
240 |   const u8 mps = i >= 46 ? 1 : 0;  // ctbl holds two 46-entry copies; indices 46..91 are the MPS == 1 copy
241 |   const u16 qe = ctbl[i].qe;
242 | 
243 | #ifdef CODER_DEBUGGING
244 |   fprintf(stderr, "B: %d %d %d %d\n", ctxnum, qe, ctx->a, d);
245 | #endif
246 | 
247 | #ifdef TRACE
248 |   static int ec = 0;
249 |   printf("%d\t%d %d %x %x %x %d %x %d\n", ec++, i, mps, qe, ctx->a, ctx->c, ctx->ct, ctx->b, ctx->bp);
250 | #endif
251 | 
252 |   if (unlikely(d != mps)) goto codelps;
253 | #ifdef SURPRISE_MAP
254 |   {
255 |   u8 b = static_cast<unsigned char>
256 |     (((static_cast<float>(qe) / 0xac02) * 255));
257 |   write(3, &b, 1);
258 |   }
259 | #endif
260 |   ctx->a -= qe;  // CODEMPS path
261 |   if (unlikely((ctx->a & 0x8000) == 0)) {  // interval dropped below 0x8000: renormalise
262 |     if (unlikely(ctx->a < qe)) {
263 |       ctx->a = qe;
264 |     } else {
265 |       ctx->c += qe;
266 |     }
267 |     context[ctxnum] = ctbl[i].mps;  // the state only advances when renormalising
268 |     goto renorme;
269 |   } else {
270 |     ctx->c += qe;
271 |   }
272 | 
273 |   return;
274 | 
275 | codelps:
276 | #ifdef SURPRISE_MAP
277 |   {
278 |   u8 b = static_cast<unsigned char>
279 |     ((1.0f - (static_cast<float>(qe) / 0xac02)) * 255);
280 |   write(3, &b, 1);
281 |   }
282 | #endif
283 |   ctx->a -= qe;  // CODELPS path: always followed by renormalisation
284 |   if (ctx->a < qe) {
285 |     ctx->c += qe;
286 |   } else {
287 |     ctx->a = qe;
288 |   }
289 |   context[ctxnum] = ctbl[i].lps;
290 | 
291 | renorme:
292 |   do {
293 |     ctx->a <<= 1;
294 |     ctx->c <<= 1;
295 |     ctx->ct -= 1;
296 |     if (unlikely(!ctx->ct)) {  // shift counter exhausted: move a byte out of c
297 |       byteout(ctx);
298 |     }
299 |   } while ((ctx->a & 0x8000) == 0);
300 | }
301 | 
302 | // -----------------------------------------------------------------------------
303 | // The FINALISE procedure from the standard
304 | // -----------------------------------------------------------------------------
305 | static void
306 | encode_final(struct jbig2enc_ctx *restrict ctx) {
307 |   // SETBITS: set the low 16 bits of c, stepping back by 0x8000 if that reaches c + a
308 |   const u32 tempc = ctx->c + ctx->a;
309 |   ctx->c |= 0xffff;
310 |   if (ctx->c >= tempc) {
311 |     ctx->c -= 0x8000;
312 |   }
313 | 
314 |   ctx->c <<= ctx->ct;  // flush the register: two byteouts, then emit the last pending byte
315 |   byteout(ctx);
316 |   ctx->c <<= ctx->ct;
317 |   byteout(ctx);
318 |   emit(ctx);
319 |   if (ctx->b != 0xff) {  // terminate with 0xff 0xac; skip the 0xff if it was just emitted
320 | #ifdef TRACE
321 |     printf("emit 0xff\n");
322 | #endif
323 |     ctx->b = 0xff;
324 |     emit(ctx);
325 |   }
326 | #ifdef TRACE
327 |   printf("emit 0xac\n");
328 | #endif
329 |   ctx->b = 0xac;
330 |   emit(ctx);
331 | }
332 | 
333 | // Public entry point that terminates the coded data by running encode_final on ctx (see comments in .h file)
334 | void
335 | jbig2enc_final(struct jbig2enc_ctx *restrict ctx) {
336 |   encode_final(ctx);
337 | }
338 | 
339 | // -----------------------------------------------------------------------------
340 | // When encoding integers there are a number of different cases. This structure
341 | // contains all the information for one of those cases
342 | // -----------------------------------------------------------------------------
343 | struct intencrange_s {
344 |   int bot, top;  // the inclusive range [bot, top] of values for which this is valid
345 |   u8 data, bits; // the bits of data to write first, and the number which are valid
346 |                  // These bits are taken from the bottom of the u8, least-significant bit first
347 |   u16 delta;     // the amount to subtract from the absolute value before encoding it
348 |   u8 intbits;    // number of bits used to encode what remains, most-significant bit first
349 | };
350 | 
351 | // table for how to encode integers of a given range; columns are {bot, top, data, bits, delta, intbits}
352 | static struct intencrange_s intencrange[] = {
353 |   {0,   3,  0, 2, 0, 2},
354 |   {-1, -1,  9, 4, 0, 0},
355 |   {-3, -2,  5, 3, 2, 1},
356 |   {4,  19,  2, 3, 4, 4},
357 |   {-19,-4,  3, 3, 4, 4},
358 |   {20, 83,  6, 4, 20, 6},
359 |   {-83,-20, 7, 4, 20, 6},
360 |   {84, 339, 14,5, 84, 8},
361 |   {-339,-84,15,5, 84, 8},
362 |   {340,4435,30,6, 340, 12},
363 |   {-4435,-340,31,6,340, 12},
364 |   {4436,2000000000,62,6,4436, 32},
365 |   {-2000000000,-4436,63,6,4436, 32}
366 | };
367 | 
368 | // Codes the OOB value for integer procedure proc as the fixed bit string 1,0,0,0 (see comments in .h file)
369 | void jbig2enc_oob(struct jbig2enc_ctx *restrict ctx, int proc) {
370 |   // {context index, bit} pairs, in coding order
371 |   static const u8 steps[4][2] = { {1, 1}, {3, 0}, {6, 0}, {12, 0} };
372 |   u8 *const table = ctx->intctx[proc];
373 | 
374 |   for (int k = 0; k < 4; ++k) {
375 |     encode_bit(ctx, table, steps[k][0], steps[k][1]);
376 |   }
377 | }
378 | 
379 | // Codes value with integer procedure proc: a range prefix from intencrange[], then the offset bits (see comments in .h file)
380 | void jbig2enc_int(struct jbig2enc_ctx *restrict ctx, int proc, int value) {
381 |   u8 *const context = ctx->intctx[proc];
382 |   int i;
383 | 
384 |   if (value > 2000000000 || value < -2000000000) abort();  // outside every range in intencrange[]
385 | 
386 |   u32 prev = 1;  // context index: the bits coded so far behind a leading 1
387 | 
388 |   for (i = 0; ; ++i) {  // terminates: the check above guarantees that one range matches
389 |     if (intencrange[i].bot <= value && intencrange[i].top >= value) break;
390 |   }
391 |   if (value < 0) value = -value;
392 |   value -= intencrange[i].delta;
393 | 
394 |   u8 data = intencrange[i].data;
395 |   for (int j = 0; j < intencrange[i].bits; ++j) {
396 |     const u8 v = data & 1;
397 |     encode_bit(ctx, context, prev, v);
398 |     data >>= 1;
399 |     if (prev & 0x100) {
400 |       // prev >= 256: keep bit 8 set and only the low 9 bits
401 |       prev = (((prev << 1) | v) & 0x1ff) | 0x100;
402 |     } else {
403 |       prev = (prev << 1) | v;
404 |     }
405 |   }
406 | 
407 |   // Move the data to the top of an unsigned word. Shifting by 32 (intbits == 0)
408 |   // or shifting a signed value into the sign bit is undefined, so use a u32 and skip the empty case.
409 |   const int nbits = intencrange[i].intbits;
410 |   u32 bits = nbits > 0 ? static_cast<u32>(value) << (32 - nbits) : 0;
411 |   for (int j = 0; j < nbits; ++j) {
412 |     const u8 v = bits >> 31;
413 |     encode_bit(ctx, context, prev, v);
414 |     bits <<= 1;  // roll the next bit into place
415 |     if (prev & 0x100) {
416 |       // prev >= 256: keep bit 8 set and only the low 9 bits
417 |       prev = (((prev << 1) | v) & 0x1ff) | 0x100;
418 |     } else {
419 |       prev = (prev << 1) | v;
420 |     }
421 |   }
422 | }
423 | 
424 | // Codes value as a symcodelen-bit code, most significant bit first; the context table is allocated on first use (see comments in .h file)
425 | void jbig2enc_iaid(struct jbig2enc_ctx *restrict ctx, int symcodelen, int value) {
426 |   if (!ctx->iaidctx) {
427 |     // we've not yet allocated the context index buffer for this
428 |     ctx->iaidctx = (u8 *) malloc(1 << symcodelen);
429 |     if (!ctx->iaidctx) abort();  // out of memory: memset below would write through NULL
430 |     memset(ctx->iaidctx, 0, 1 << symcodelen);
431 |   }
432 |   const u32 mask = (1 << (symcodelen + 1)) - 1;
433 |   // roll the data to the top of an unsigned word; a shift by 32 (symcodelen == 0) is undefined, so skip it
434 |   u32 bits = symcodelen > 0 ? static_cast<u32>(value) << (32 - symcodelen) : 0;
435 |   u32 prev = 1;
436 |   for (int i = 0; i < symcodelen; ++i) {
437 |     const u32 tval = prev & mask;
438 |     const u8 v = bits >> 31;
439 |     encode_bit(ctx, ctx->iaidctx, tval, v);
440 |     prev = (prev << 1) | v;
441 |     bits <<= 1;
442 |   }
443 | }
444 | 
445 | // This is the test input to the coder as given in the standard (H.2). NOTE(review): no use of it is visible in this part of the file - confirm before removing
446 | static const u8 input[] = { 0, 2, 0, 0x51, 0, 0, 0, 0xc0, 0x03, 0x52, 0x87,
447 |   0x2a, 0xaa, 0xaa, 0xaa, 0xaa, 0x82, 0xc0, 0x20, 0, 0xfc, 0xd7, 0x9e, 0xf6,
448 |   0xbf, 0x7f, 0xed, 0x90, 0x4f, 0x46, 0xa3, 0xbf } ;
449 | 
450 | // -----------------------------------------------------------------------------
451 | // Fetches one value from a row-major image of mx by my entries, reading as zero
452 | // above, below and to the right of it. x < 0 is not checked, so callers must not pass it
453 | // -----------------------------------------------------------------------------
454 | static u8 image_get(const u8 *restrict image, int x, int y, int mx, int my) {
455 |   const bool outside = (y < 0) || (x >= mx) || (y >= my);
456 |   if (outside) return 0;
457 |   // inside the image (for x >= 0): plain row-major lookup
458 |   return image[mx * y + x];
459 | }
460 | 
461 | // Bytes produced so far: every filled chunk plus the used part of the current buffer (see comments in .h file)
462 | unsigned jbig2enc_datasize(const struct jbig2enc_ctx *ctx) {
463 |   const size_t full_chunks = ctx->output_chunks->size();
464 |   return JBIG2_OUTPUTBUFFER_SIZE * full_chunks + ctx->outbuf_used;
465 | }
466 | 
467 | // Copies all output produced so far into buffer: the filled chunks in order, then the
468 | // used part of the current buffer. buffer needs room for jbig2enc_datasize(ctx) bytes (see comments in .h file)
469 | void jbig2enc_tobuffer(const struct jbig2enc_ctx *restrict ctx, u8 *restrict buffer) {
470 |   const std::vector<u8 *> &chunks = *ctx->output_chunks;
471 |   u8 *out = buffer;
472 | 
473 |   for (size_t n = 0; n < chunks.size(); ++n) {
474 |     memcpy(out, chunks[n], JBIG2_OUTPUTBUFFER_SIZE);
475 |     out += JBIG2_OUTPUTBUFFER_SIZE;
476 |   }
477 |   memcpy(out, ctx->outbuf, ctx->outbuf_used);
478 | }
479 | 
480 | // This is the context used for the TPGD bits
481 | #define TPGDCTX 0x9b25
482 | 
483 | // -----------------------------------------------------------------------------
484 | // Codes an mx by my image in Leptonica's 1bpp packed format. Each row is some
485 | // number of 32-bit words. Pixels are in native-byte-order in each word. With duplicate_line_removal, a row equal to the one above is signalled by one bit and not coded.
486 | // -----------------------------------------------------------------------------
487 | void
488 | jbig2enc_bitimage(struct jbig2enc_ctx *restrict ctx, const u8 *restrict idata,
489 |                   int mx, int my, bool duplicate_line_removal) {
490 |   const u32 *restrict data = (u32 *) idata;
491 |   u8 *const context = ctx->context;
492 |   const unsigned words_per_row = (mx + 31) / 32;  // rows are padded to whole 32-bit words
493 |   const unsigned bytes_per_row = words_per_row * 4;
494 | 
495 |   u8 ltp = 0, sltp = 0;  // ltp: this row equals the previous one; sltp: ltp changed since the previous row
496 | 
497 |   for (int y = 0; y < my; ++y) {
498 |     int x = 0;
499 | 
500 |     // the c* values store the context bits for each row. The template is fixed
501 |     // as template 0 with the floating bits in the default locations.
502 |     u16 c1, c2, c3;
503 |     // the w* values contain words from each of the rows: w1 is from two rows
504 |     // up etc. The next bit to roll onto the context values are kept at the top
505 |     // of these words.
506 |     u32 w1, w2, w3;
507 |     w1 = w2 = w3 = 0;  // rows above the top of the image read as zero
508 | 
509 |     if (y >= 2) w1 = data[(y - 2) * words_per_row];
510 |     if (y >= 1) {
511 |       w2 = data[(y - 1) * words_per_row];
512 | 
513 |       if (duplicate_line_removal) {
514 |         // it's possible that the last row was the same as this row
515 |         if (memcmp(&data[y * words_per_row], &data[(y - 1) * words_per_row],
516 |                    bytes_per_row) == 0) {
517 |           sltp = ltp ^ 1;
518 |           ltp = 1;
519 |         } else {
520 |           sltp = ltp;
521 |           ltp = 0;
522 |         }
523 |       }
524 |     }
525 |     if (duplicate_line_removal) {
526 |       encode_bit(ctx, context, TPGDCTX, sltp);  // one flag bit per row, coded in its own context
527 |       if (ltp) continue;  // identical to the row above: nothing more to code for this row
528 |     }
529 |     w3 = data[y * words_per_row];
530 | 
531 |     // the top three bits are the start of the context c1
532 |     c1 = w1 >> 29;
533 |     c2 = w2 >> 28;
534 |     // and we need to remove the used bits from the w* vars
535 |     w1 <<= 3;
536 |     w2 <<= 4;
537 |     c3 = 0;
538 |     for (x = 0; x < mx; ++x) {
539 |       const u16 tval = (c1 << 11) | (c2 << 4) | c3;  // 16-bit context: 5 bits of c1, 7 of c2, 4 of c3
540 |       const u8 v = (w3 & 0x80000000) >> 31;  // the pixel being coded
541 | 
542 |       //fprintf(stderr, "%d %d %d %d\n", x, y, tval, v);
543 |       encode_bit(ctx, context, tval, v);
544 |       c1 <<= 1;
545 |       c2 <<= 1;
546 |       c3 <<= 1;
547 |       c1 |= (w1 & 0x80000000) >> 31;
548 |       c2 |= (w2 & 0x80000000) >> 31;
549 |       c3 |= v;
550 |       const int m = x % 32;
551 |       if (m == 28 && y >= 2) {
552 |         // need to roll in another word from two lines up
553 |         const unsigned wordno = (x / 32) + 1;
554 |         if (wordno >= words_per_row) {
555 |           w1 = 0;
556 |         } else {
557 |           w1 = data[(y - 2) * words_per_row + wordno];
558 |         }
559 |       } else {
560 |         w1 <<= 1;
561 |       }
562 | 
563 |       if (m == 27 && y >= 1) {
564 |         // need to roll in another word from the last line
565 |         const unsigned wordno = (x / 32) + 1;
566 |         if (wordno >= words_per_row) {
567 |           w2 = 0;
568 |         } else {
569 |           w2 = data[(y - 1) * words_per_row + wordno];
570 |         }
571 |       } else {
572 |         w2 <<= 1;
573 |       }
574 | 
575 |       if (m == 31) {
576 |         // need to roll in another word from this line
577 |         const unsigned wordno = (x / 32) + 1;
578 |         if (wordno >= words_per_row) {
579 |           w3 = 0;
580 |         } else {
581 |           w3 = data[y * words_per_row + wordno];
582 |         }
583 |       } else {
584 |         w3 <<= 1;
585 |       }
586 | 
587 |       c1 &= 31;   // keep c1 to 5 bits
588 |       c2 &= 127;  // keep c2 to 7 bits
589 |       c3 &= 15;   // keep c3 to 4 bits
590 |     }
591 |   }
592 | }
593 | 
594 | void
595 | jbig2enc_refine(struct jbig2enc_ctx *__restrict__ ctx,
596 |                 const uint8_t *__restrict__ itempl, int tx, int ty,
597 |                 const uint8_t *__restrict__ itarget, int mx, int my,
598 |                 int ox, int oy) {
599 |   const u32 *restrict templdata = (u32 *) itempl;
600 |   const u32 *restrict data = (u32 *) itarget;
601 |   u8 *restrict const context = ctx->context;
602 | 
603 |   static int image_counter = 0;
604 | 
605 |   image_counter++;
606 | 
607 | #ifdef SYM_DEBUGGING
608 |   fprintf(stderr, "refine:%d %d %d %d\n", tx, ty, mx, my);
609 | #endif
610 | 
611 |   const unsigned templwords_per_row = (tx + 31) / 32;
612 |   const unsigned words_per_row = (mx + 31) / 32;
613 | 
614 |   for (int y = 0; y < my; ++y) {
615 |     int x;
616 |     const int temply = y + oy;
617 |     // the template is fixed to the 13 pixel template with the floating bits in
618 |     // the default locations.
619 |     // we have 5 words of context. The first three are the last, current and
620 |     // next rows of the template. The last two are the last and current rows of
621 |     // the target.
622 |     // To form the 14 bits of content these are packed from the least
623 |     // significant bits rightward.
624 |     u16 c1, c2, c3, c4, c5;
625 |     // the w* values contain words from each of the corresponding rows. The
626 |     // next bit to be part of the context is kept at the top of these words
627 |     u32 w1, w2, w3, w4, w5;
628 |     w1 = w2 = w3 = w4 = w5 = 0;
629 | 
630 |     if (temply >= 1 && (temply - 1) < ty) w1 = templdata[(temply - 1) * templwords_per_row];
631 |     if (temply >= 0 && temply < ty) w2 = templdata[temply * templwords_per_row];
632 |     if (temply >= -1 && temply + 1 < ty) w3 = templdata[(temply + 1) * templwords_per_row];
633 | 
634 |     // the x offset prevents a hassel because we are dealing with bits. Thus we
635 |     // restrict it to being {-1, 0, 1}.
636 |     if (y >= 1) w4 = data[(y - 1) * words_per_row];
637 |     w5 = data[y * words_per_row];
638 | 
639 |     const int shiftoffset = 30 + ox;
640 |     c1 = w1 >> shiftoffset;
641 |     c2 = w2 >> shiftoffset;
642 |     c3 = w3 >> shiftoffset;
643 | 
644 |     c4 = w4 >> 30;
645 |     c5 = 0;
646 | 
647 |     // the w* should contain the next bit to be included in the context, in the
648 |     // MSB position. Thus we need to roll the used bits out of the way.
649 |     const int bits_to_trim = 2 - ox;
650 |     w1 <<= bits_to_trim;
651 |     w2 <<= bits_to_trim;
652 |     w3 <<= bits_to_trim;
653 | 
654 |     w4 <<= 2;
655 | 
656 |     for (x = 0; x < mx; ++x) {
657 |       const u16 tval = (c1 << 10) | (c2 << 7) | (c3 << 4) | (c4 << 1) | c5;
658 |       const u8 v = w5 >> 31;
659 | 
660 | #ifdef SYM_DEBUGGING
661 |       fprintf(stderr, "%d %d %d %d\n", x, y, tval, v);
662 | #endif
663 |       encode_bit(ctx, context, tval, v);
664 |       c1 <<= 1;
665 |       c2 <<= 1;
666 |       c3 <<= 1;
667 |       c4 <<= 1;
668 |       c1 |= w1 >> 31;
669 |       c2 |= w2 >> 31;
670 |       c3 |= w3 >> 31;
671 |       c4 |= w4 >> 31;
672 |       c5 = v;
673 | 
674 |       const int m = x % 32;
675 |       const unsigned wordno = (x / 32) + 1;
676 |       if (m == 29 + ox) {
677 |         // have run out of bits in the w[123] values. Need to get more.
678 | 
679 |         if (wordno >= templwords_per_row) {
680 |           w1 = w2 = w3 = 0;
681 |         } else {
682 |           if (temply >= 1 && (temply - 1 < ty)) {
683 |             w1 = templdata[(temply - 1) * templwords_per_row + wordno];
684 |           } else {
685 |             w1 = 0;
686 |           }
687 |           if (temply >= 0 && temply < ty) {
688 |             w2 = templdata[temply * templwords_per_row + wordno];
689 |           } else {
690 |             w2 = 0;
691 |           }
692 |           if (temply >= -1 && (temply + 1) < ty) {
693 |             w3 = templdata[(temply + 1) * templwords_per_row + wordno];
694 |           } else {
695 |             w3 = 0;
696 |           }
697 |         }
698 |       } else {
699 |         w1 <<= 1;
700 |         w2 <<= 1;
701 |         w3 <<= 1;
702 |       }
703 | 
704 |       if (m == 29 && y >= 1) {
705 |         // run out of data from w4
706 |         if (wordno >= words_per_row) {
707 |           w4 = 0;
708 |         } else {
709 |           w4 = data[(y - 1) * words_per_row + wordno];
710 |         }
711 |       } else {
712 |         w4 <<= 1;
713 |       }
714 | 
715 |       if (m == 31) {
716 |         // run out of data from w5
717 |         if (wordno >= words_per_row) {
718 |           w5 = 0;
719 |         } else {
720 |           w5 = data[y * words_per_row + wordno];
721 |         }
722 |       } else {
723 |         w5 <<= 1;
724 |       }
725 | 
726 |       c1 &= 7;
727 |       c2 &= 7;
728 |       c3 &= 7;
729 |       c4 &= 7;
730 |     }
731 |   }
732 | }
733 | 
734 | // see comments in .h file
735 | void
736 | jbig2enc_image(struct jbig2enc_ctx *restrict ctx, const u8 *restrict data,
737 |                int mx, int my, bool duplicate_line_removal) {
738 |   u8 *const context = ctx->context;
739 |   u8 ltp = 0;
740 |   u8 sltp = 0;
741 |   for (int y = 0; y < my; ++y) {
742 |     int x = 0;
743 |     u16 c1 = (image_get(data, x, y - 2, mx, my) << 2) |
744 |              (image_get(data, x + 1, y - 2, mx, my) << 1) |
745 |              (image_get(data, x + 2, y - 2, mx, my));
746 |     u16 c2 = (image_get(data, x, y - 1, mx, my) << 3) |
747 |              (image_get(data, x + 1, y - 1, mx, my) << 2) |
748 |              (image_get(data, x + 2, y - 1, mx, my) << 1) |
749 |              (image_get(data, x + 3, y - 1, mx, my));
750 |     u16 c3 = 0;
751 |     if (y > 0) {
752 |       // it's possible that the last row was the same as this row
753 |       if (memcmp(&data[y * mx], &data[(y - 1) * mx], mx) == 0) {
754 |         sltp = ltp ^ 1;
755 |         ltp = 1;
756 |       } else {
757 |         sltp = ltp;
758 |         ltp = 0;
759 |       }
760 |     }
761 |     if (duplicate_line_removal) {
762 |       encode_bit(ctx, context, TPGDCTX, sltp);
763 |       if (ltp) continue;
764 |     }
765 |     for (x = 0; x < mx; ++x) {
766 |       const u16 tval = (c1 << 11) | (c2 << 4) | c3;
767 |       const u8 v = image_get(data, x, y, mx, my);
768 |       encode_bit(ctx, context, tval, v);
769 |       c1 <<= 1;
770 |       c2 <<= 1;
771 |       c3 <<= 1;
772 |       c1 |= image_get(data, x + 3, y - 2, mx, my);
773 |       c2 |= image_get(data, x + 4, y - 1, mx, my);
774 |       c3 |= v;
775 |       c1 &= 31;
776 |       c2 &= 127;
777 |       c3 &= 15;
778 |     }
779 |   }
780 | }
781 | 


--------------------------------------------------------------------------------
/src/jbig2arith.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Google Inc. All Rights Reserved.
  2 | // Author: agl@imperialviolet.org (Adam Langley)
  3 | //
  4 | // Copyright (C) 2006 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #ifndef JBIG2ENC_JBIG2ENC_H__
 19 | #define JBIG2ENC_JBIG2ENC_H__
 20 | 
 21 | #if defined(sun)
 22 | #include <sys/types.h>
 23 | #else
 24 | #include <stdint.h>
 25 | #endif
 26 | 
 27 | #include <vector>
 28 | 
 29 | #define JBIG2_MAX_CTX 65536
 30 | #define JBIG2_OUTPUTBUFFER_SIZE 20 * 1024
 31 | 
 32 | #ifdef _MSC_VER
 33 | #define __restrict__ __restrict
 34 | #endif
 35 | 
 36 | //#define JBIG2_DEBUGGING
 37 | //#define CODER_DEBUGGING
 38 | //#define SYM_DEBUGGING
 39 | //#define SYMBOL_COMPRESSION_DEBUGGING
 40 | 
 41 | // -----------------------------------------------------------------------------
 42 | // This is the context for the arithmetic encoder used in JBIG2. The coder is a
 43 | // state machine and there are many different states used - one for coding
 44 | // images, many more for coding numbers etc.
 45 | //
 46 | // When outputting data, the bytes are collected into chunks of size
 47 | // JBIG2_OUTPUTBUFFER_SIZE. These are chained in a linked list.
 48 | // -----------------------------------------------------------------------------
 49 | struct jbig2enc_ctx {
 50 |   // these are the current state of the arithmetic coder
 51 |   uint32_t c;
 52 |   uint16_t a;
 53 |   uint8_t ct, b;
 54 |   int bp;
 55 | 
 56 |   // This is a list of output chunks, not including the current one
 57 |   std::vector<uint8_t *> *output_chunks;
 58 |   uint8_t *outbuf;  // this is the current output chunk
 59 |   int outbuf_used;  // number of bytes used in outbuf
 60 |   uint8_t context[JBIG2_MAX_CTX];  // state machine context for encoding images
 61 |   uint8_t intctx[13][512];  // 512 bytes of context indexes for each of 13 different int decodings
 62 |                             // this data is also used for refinement coding
 63 |   uint8_t *iaidctx;  // size of this context not known at construction time
 64 | };
 65 | 
 66 | // these are the proc numbers for encoding different classes of integers
 67 | enum {
 68 |   JBIG2_IAAI = 0,
 69 |   JBIG2_IADH,
 70 |   JBIG2_IADS,
 71 |   JBIG2_IADT,
 72 |   JBIG2_IADW,
 73 |   JBIG2_IAEX,
 74 |   JBIG2_IAFS,
 75 |   JBIG2_IAIT,
 76 |   JBIG2_IARDH,
 77 |   JBIG2_IARDW,
 78 |   JBIG2_IARDX,
 79 |   JBIG2_IARDY,
 80 |   JBIG2_IARI
 81 | };
 82 | 
 83 | // -----------------------------------------------------------------------------
 84 | // Returns the number of bytes of output in the given context
 85 | //
 86 | // Before doing this you should make sure that the coder is _flush()'ed
 87 | // -----------------------------------------------------------------------------
 88 | unsigned jbig2enc_datasize(const struct jbig2enc_ctx *ctx);
 89 | 
 90 | // -----------------------------------------------------------------------------
 91 | // Writes the output of the given context to a buffer. The buffer must be at
 92 | // least long enough to contain all the data (see _datasize)
 93 | // -----------------------------------------------------------------------------
 94 | void jbig2enc_tobuffer(const struct jbig2enc_ctx *__restrict__ ctx,
 95 |                        uint8_t *__restrict__ buffer);
 96 | 
 97 | // -----------------------------------------------------------------------------
 98 | // Encode an integer of a given class. proc is one of JBIG2_IA* and specifies
 99 | // the type of the number. IAID is special and is handled by another function.
100 | // -----------------------------------------------------------------------------
101 | void jbig2enc_int(struct jbig2enc_ctx *__restrict__ ctx, int proc, int value);
102 | 
103 | 
104 | // -----------------------------------------------------------------------------
105 | // Encode an IAID number. This needs to know how many bits to use.
106 | // -----------------------------------------------------------------------------
107 | void jbig2enc_iaid(struct jbig2enc_ctx *__restrict__ ctx, int symcodelen,
108 |                    int value);
109 | 
110 | // -----------------------------------------------------------------------------
111 | // Encode the special out-of-bounds (-0) number for a given type. proc is one
112 | // of JBIG2_IA*
113 | // -----------------------------------------------------------------------------
114 | void jbig2enc_oob(struct jbig2enc_ctx *__restrict__ ctx, int proc);
115 | 
116 | // -----------------------------------------------------------------------------
117 | // Encode a bitmap with the arithmetic encoder.
118 | //   data: an array of mx * my bytes
119 | //   mx: width of the bitmap in pixels
120 | //   my: height of the bitmap in pixels
121 | //   duplicate_line_removal: if true, TPGD is used
122 | //
123 | // TPGD often takes very slightly more bytes to encode, but cuts the time taken
124 | // by half.
125 | // -----------------------------------------------------------------------------
126 | void jbig2enc_image(struct jbig2enc_ctx *__restrict__ ctx,
127 |                     const uint8_t *__restrict__ data, int mx, int my,
128 |                     bool duplicate_line_removal);
129 | 
130 | // -----------------------------------------------------------------------------
131 | // This function takes almost the same arguments as _image, above. But in this
132 | // case the data pointer points to packed data.
133 | //
134 | // This is designed for Leptonica's 1bpp packed format images. Each row is some
135 | // number of 32-bit words.
136 | //
137 | // *The pad bits at the end of each line must be zero.*
138 | // -----------------------------------------------------------------------------
139 | void jbig2enc_bitimage(struct jbig2enc_ctx *__restrict__ ctx,
140 |                        const uint8_t *__restrict__ data, int mx, int my,
141 |                        bool duplicate_line_removal);
142 | 
143 | 
144 | // -----------------------------------------------------------------------------
145 | // Encode the refinement of an exemplar to a bitmap.
146 | //
147 | // This encodes the difference between two images. If the template image is
148 | // close to the final image the amount of data needed should hopefully be
149 | // small.
150 | //   templ: the template image
151 | //   tx, ty: the size of the template image
152 | //   target: the desired image
153 | //   mx, my: the size of the desired image
154 | //   ox, oy: offset of the desired image from the template image.
155 | //           ox is limited to [-1, 0, 1]
156 | //
157 | // This uses Leptonica's 1bpp packed images (see comments above last function).
158 | //
159 | // *The pad bits at the end of each line, for both images, must be zero*
160 | // -----------------------------------------------------------------------------
161 | void jbig2enc_refine(struct jbig2enc_ctx *__restrict__ ctx,
162 |                      const uint8_t *__restrict__ templ, int tx, int ty,
163 |                      const uint8_t *__restrict__ target, int mx, int my,
164 |                      int ox, int oy);
165 | 
166 | // -----------------------------------------------------------------------------
167 | // Init a new context
168 | // -----------------------------------------------------------------------------
169 | void jbig2enc_init(struct jbig2enc_ctx *ctx);
170 | 
171 | // -----------------------------------------------------------------------------
172 | // Destroy a context
173 | // -----------------------------------------------------------------------------
174 | void jbig2enc_dealloc(struct jbig2enc_ctx *ctx);
175 | 
176 | // -----------------------------------------------------------------------------
177 | // Flush all the data stored in a context
178 | // -----------------------------------------------------------------------------
179 | void jbig2enc_flush(struct jbig2enc_ctx *ctx);
180 | 
181 | // -----------------------------------------------------------------------------
182 | // Reset the arithmetic coder back to an init state
183 | // -----------------------------------------------------------------------------
184 | void jbig2enc_reset(struct jbig2enc_ctx *ctx);
185 | 
186 | // -----------------------------------------------------------------------------
187 | // Flush any remaining arithmetic encoder context to the output.
188 | // -----------------------------------------------------------------------------
189 | void jbig2enc_final(struct jbig2enc_ctx *ctx);
190 | 
191 | #endif  // JBIG2ENC_JBIG2ENC_H__
192 | 


--------------------------------------------------------------------------------
/src/jbig2comparator.cc:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 Google Inc. All Rights Reserved.
  2 | // Author: hata.radim@gmail.com (Radim Hatlapatka)
  3 | //
  4 | // Copyright (C) 2012 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #include <map>
 19 | #include <vector>
 20 | #include <algorithm>
 21 | 
 22 | #include <stdio.h>
 23 | #include <string.h>
 24 | 
 25 | #ifdef _MSC_VER
 26 | #define _USE_MATH_DEFINES
 27 | #include <math.h>
 28 | #endif
 29 | 
 30 | #include <leptonica/allheaders.h>
 31 | 
 32 | #include <math.h>
 33 | #if defined(sun)
 34 | #include <sys/types.h>
 35 | #else
 36 | #include <stdint.h>
 37 | #endif
 38 | 
 39 | #define u64 uint64_t
 40 | #define u32 uint32_t
 41 | #define u16 uint16_t
 42 | #define u8  uint8_t
 43 | 
 44 | bool
 45 | jbig2enc_are_equivalent(PIX *const first_template, PIX *const second_template) {
 46 |   l_int32 w, h, d;
 47 | 
 48 |   if (!pixSizesEqual(first_template, second_template)) {
 49 |     return false;
 50 |   }
 51 | 
 52 |   l_int32 first_wpl = pixGetWpl(first_template);
 53 |   l_int32 second_wpl = pixGetWpl(second_template);
 54 | 
 55 |   if (first_wpl != second_wpl) {
 56 |     return false;
 57 |   }
 58 | 
 59 |   PIX *pixd = pixXor(NULL, first_template, second_template);
 60 | 
 61 |   pixGetDimensions(pixd, &w, &h, &d);
 62 | 
 63 |   l_int32 init = 0;
 64 |   l_int32 *pcount = &init;
 65 |   l_int32 *above = &init;
 66 | 
 67 |   // counting number of ON pixels in first_template
 68 |   if (pixCountPixels(first_template, pcount, NULL)) {
 69 |     fprintf(stderr, "Unable to count pixels\n");
 70 |     pixDestroy(&pixd);
 71 |     return false;
 72 |   }
 73 | 
 74 |   // shortcut to failure if the symbols are significantly different.
 75 |   l_int32 thresh = (*pcount) * 0.25;
 76 |   if (pixThresholdPixelSum(pixd, thresh, above, NULL)) {
 77 |     fprintf(stderr, "Unable to count pixels of XORed pixes\n");
 78 |     pixDestroy(&pixd);
 79 |     return false;
 80 |   }
 81 | 
 82 |   if ((*above) == 1) {
 83 |     pixDestroy(&pixd);
 84 |     return false;
 85 |   }
 86 | 
 87 |   l_uint32 init_unsigned = 0;
 88 |   l_uint32 *pval = &init_unsigned;
 89 |   const int divider = 9;
 90 |   const int vertical = divider * 2;
 91 |   const int horizontal = divider * 2;
 92 | 
 93 |   l_uint32 parsed_pix_counts[divider][divider];
 94 |   l_uint32 horizontal_parsed_pix_counts[horizontal][divider];
 95 |   l_uint32 vertical_parsed_pix_counts[divider][vertical];
 96 | 
 97 |   if (d != 1) {
 98 |     return false;
 99 |   }
100 | 
101 |   int vertical_part = h/divider;
102 |   int horizontal_part = w/divider;
103 | 
104 |   int horizontal_module_counter = 0;
105 |   int vertical_module_counter = 0;
106 | 
107 |   // counting area of ellipse and taking percentage of it as point_thresh
108 |   int a, b;
109 |   if (vertical_part < horizontal_part) {
110 |     a = horizontal_part / 2;
111 |     b = vertical_part / 2;
112 |   } else {
113 |     a = vertical_part / 2;
114 |     b = horizontal_part / 2;
115 |   }
116 | 
117 |   float point_thresh = a * b * M_PI;
118 |   l_int32 vline_thresh = (vertical_part * (horizontal_part/2))*0.9;
119 |   l_int32 hline_thresh = (horizontal_part * (vertical_part/2))*0.9;
120 | 
121 |   // iterate through submatrixes
122 |   for (int horizontal_position = 0; horizontal_position < divider; horizontal_position++) {
123 |     int horizontal_start = horizontal_part*horizontal_position + horizontal_module_counter;
124 |     int horizontal_end;
125 |     if (horizontal_position == (divider-1)) {
126 |       horizontal_module_counter = 0;
127 |       horizontal_end = w;
128 |     } else {
129 |       if (((w - horizontal_module_counter) % divider)>0) {
130 |         horizontal_end = horizontal_start + horizontal_part + 1;
131 |         horizontal_module_counter++;
132 |       } else {
133 |         horizontal_end = horizontal_start + horizontal_part;
134 |       }
135 |     }
136 | 
137 |     for (int vertical_position = 0; vertical_position < divider; vertical_position++) {
138 |       int vertical_start = vertical_part*vertical_position + vertical_module_counter;
139 |       int vertical_end;
140 |       if (vertical_position == (divider-1)) {
141 |         vertical_module_counter = 0;
142 |         vertical_end = h;
143 |       } else {
144 |         if (((h - vertical_module_counter) % divider)>0) {
145 |           vertical_end = vertical_start + vertical_part + 1;
146 |           vertical_module_counter++;
147 |         } else {
148 |           vertical_end = vertical_start + vertical_part;
149 |         }
150 |       }
151 | 
152 |       // making sum of ON pixels in submatrix and saving the result to matrix of sums.
153 |       int left_count = 0;
154 |       int right_count = 0;
155 |       int down_count = 0;
156 |       int up_count = 0;
157 | 
158 |       int horizontal_center = (horizontal_start + horizontal_end) / 2;
159 |       int vertical_center = (vertical_start + vertical_end) / 2;
160 | 
161 |       for (int i = horizontal_start; i < horizontal_end; i++) {
162 |         for (int j = vertical_start; j < vertical_end; j++) {
163 |           if (pixGetPixel(pixd, i, j, pval)) {
164 |             fprintf(stderr, "unable to read pixel from pix\n");
165 |             break;
166 |           }
167 | 
168 |           if (*pval == 1) {
169 |             if (i < horizontal_center) {
170 |               left_count++;
171 |             } else {
172 |               right_count++;
173 |             }
174 |             if (j < vertical_center) {
175 |               up_count++;
176 |             } else {
177 |               down_count++;
178 |             }
179 |           }
180 |         }
181 |       }
182 |       parsed_pix_counts[horizontal_position][vertical_position] = left_count + right_count;
183 | 
184 |       horizontal_parsed_pix_counts[horizontal_position*2][vertical_position] = left_count;
185 |       horizontal_parsed_pix_counts[(horizontal_position*2)+1][vertical_position] = right_count;
186 | 
187 |       vertical_parsed_pix_counts[horizontal_position][vertical_position*2] = up_count;
188 |       vertical_parsed_pix_counts[horizontal_position][(vertical_position*2)+1] = down_count;
189 |     }
190 |   }
191 | 
192 |   pixDestroy(&pixd);
193 | 
194 |   // check for horizontal lines
195 |   for (int i = 0; (i < (divider*2)-1); i++) {
196 |     for (int j = 0; j < (divider-1); j++) {
197 |       int horizontal_sum = 0;
198 |       for (int x = 0; x < 2; x++) {
199 |         for (int y = 0; y < 2; y++) {
200 |           horizontal_sum += horizontal_parsed_pix_counts[i+x][j+y];
201 |         }
202 |       }
203 |       if (horizontal_sum >= hline_thresh) {
204 |         return 0;
205 |       }
206 |     }
207 |   }
208 | 
209 |   // check for vertical lines
210 |   for (int i = 0; i < (divider-1); i++) {
211 |     for (int j = 0; j < ((divider*2)-1); j++) {
212 |       int vertical_sum = 0;
213 |       for (int x = 0; x < 2; x++) {
214 |         for (int y = 0; y < 2; y++) {
215 |         vertical_sum += vertical_parsed_pix_counts[i+x][j+y];
216 |         }
217 |       }
218 |       if (vertical_sum >= vline_thresh) {
219 |         return 0;
220 |       }
221 |     }
222 |   }
223 | 
224 |   // check for cross lines
225 |   for (int i = 0; i < (divider - 2); i++) {
226 |     for (int j = 0; j < (divider - 2); j++) {
227 |       int left_cross = 0;
228 |       int right_cross = 0;
229 |       for (int x = 0; x < 3; x++) {
230 |         for (int y = 0; y < 3; y++) {
231 |           if (x == y) {
232 |             left_cross += parsed_pix_counts[i+x][j+y];
233 |           }
234 |           if ((2-x) == y) {
235 |             right_cross += parsed_pix_counts[i+x][j+y];
236 |           }
237 |         }
238 |       }
239 |       if ((left_cross >= hline_thresh) || (right_cross >= hline_thresh)) {
240 |         return 0;
241 |       }
242 |     }
243 |   }
244 | 
245 |   // check whether four submatrixes of XORed PIX data contains more ON pixels
246 |   // than concrete percentage of ON pixels of first_template.
247 | 
248 |   for (int i = 0; i < (divider-1); i++) {
249 |     for (int j = 0; j < (divider-1); j++) {
250 |       int sum = 0;
251 |       for (int x = 0; x < 2; x++) {
252 |         for (int y = 0; y < 2; y++) {
253 |           sum += parsed_pix_counts[i+x][j+y];
254 |         }
255 |       }
256 |       if (sum >= point_thresh) {
257 |         return 0;
258 |       }
259 |     }
260 |   }
261 |   return 1;
262 | }
263 | 


--------------------------------------------------------------------------------
/src/jbig2comparator.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2012 Google Inc. All Rights Reserved.
 2 | // Author: hata.radim@gmail.com (Radim Hatlapatka)
 3 | //
 4 | // Copyright (C) 2012 Google Inc.
 5 | //
 6 | // Licensed under the Apache License, Version 2.0 (the "License");
 7 | // you may not use this file except in compliance with the License.
 8 | // You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | // Unless required by applicable law or agreed to in writing, software
13 | // distributed under the License is distributed on an "AS IS" BASIS,
14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | // See the License for the specific language governing permissions and
16 | // limitations under the License.
17 | 
18 | #ifndef JBIG2ENC_JBIG2COMPARATOR_H__
19 | #define JBIG2ENC_JBIG2COMPARATOR_H__
20 | 
21 | #if defined(sun)
22 | #include <sys/types.h>
23 | #else
24 | #include <stdint.h>
25 | #endif
26 | 
27 | #include <leptonica/allheaders.h>
28 | 
29 | struct Pix;
30 | 
31 | // -----------------------------------------------------------------------------
32 | // jbig2enc_are_equivalent compares two pix and tells whether they are
33 | // equivalent by trying to decide if the symbols look the same from a visual
34 | // point of view. See http://is.muni.cz/th/208155/fi_m.
35 | //
36 | // It works by looking for accumulations of differences between two templates.
37 | //
38 | // If the difference is bigger than a certain percentage of one of the
39 | // templates they are considered different; if no such difference exists then
40 | // they are equivalent.
41 | //
42 | // Parts of this function should be recreated using leptonica functions, which
43 | // should speed up the process, but the principle should remain the same and
44 | // the result as well.
45 | // -----------------------------------------------------------------------------
46 | bool jbig2enc_are_equivalent(PIX *const firstTemplate,
47 |                              PIX *const secondTemplate);
48 | 
49 | #endif  // JBIG2ENC_JBIG2COMPARATOR_H__
50 | 


--------------------------------------------------------------------------------
/src/jbig2enc.cc:
--------------------------------------------------------------------------------
   1 | // Copyright 2006 Google Inc. All Rights Reserved.
   2 | // Author: agl@imperialviolet.org (Adam Langley)
   3 | //
   4 | // Copyright (C) 2006 Google Inc.
   5 | //
   6 | // Licensed under the Apache License, Version 2.0 (the "License");
   7 | // you may not use this file except in compliance with the License.
   8 | // You may obtain a copy of the License at
   9 | //
  10 | //      http://www.apache.org/licenses/LICENSE-2.0
  11 | //
  12 | // Unless required by applicable law or agreed to in writing, software
  13 | // distributed under the License is distributed on an "AS IS" BASIS,
  14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 | // See the License for the specific language governing permissions and
  16 | // limitations under the License.
  17 | 
  18 | #include <map>
  19 | #include <list>
  20 | #include <vector>
  21 | #include <algorithm>
  22 | 
  23 | #include <stdio.h>
  24 | #include <string.h>
  25 | 
  26 | #include <leptonica/allheaders.h>
  27 | #if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || LIBLEPT_MAJOR_VERSION > 1
  28 | #include "leptonica/pix_internal.h"
  29 | #include "leptonica/array_internal.h"
  30 | #endif
  31 | 
  32 | #include <math.h>
  33 | #if defined(sun)
  34 | #include <sys/types.h>
  35 | #else
  36 | #include <stdint.h>
  37 | #endif
  38 | 
  39 | #define u64 uint64_t
  40 | #define u32 uint32_t
  41 | #define u16 uint16_t
  42 | #define u8  uint8_t
  43 | 
  44 | #include "jbig2arith.h"
  45 | #include "jbig2sym.h"
  46 | #include "jbig2structs.h"
  47 | #include "jbig2segments.h"
  48 | #include "jbig2comparator.h"
  49 | 
  50 | // -----------------------------------------------------------------------------
  51 | // Returns the version identifier as a static string.
  52 | // -----------------------------------------------------------------------------
  53 | char const *getVersion() {
  54 |   return VERSION;
  55 | }
  56 | 
  57 | // -----------------------------------------------------------------------------
  58 | // Removes spots which are less than size x size pixels
  59 | //
  60 | // Note, this has a side-effect of removing a few pixels
  61 | // that from components you want to keep.
  62 | //
  63 | // If that's a problem, you do a binary reconstruction
  64 | // (from seedfill.c):
  65 | // -----------------------------------------------------------------------------
  66 | static PIX *
  67 | remove_flyspecks(PIX *const source, const int size) {
  68 |   Sel *sel_5h = selCreateBrick(1, size, 0, 2, SEL_HIT);
  69 |   Sel *sel_5v = selCreateBrick(size, 1, 2, 0, SEL_HIT);
  70 | 
  71 |   Pix *pixt = pixOpen(NULL, source, sel_5h);
  72 |   Pix *pixd = pixOpen(NULL, source, sel_5v);
  73 |   pixOr(pixd, pixd, pixt);
  74 |   pixDestroy(&pixt);
  75 |   selDestroy(&sel_5h);
  76 |   selDestroy(&sel_5v);
  77 | 
  78 |   return pixd;
  79 | }
  80 | 
  81 | // -----------------------------------------------------------------------------
  82 | // Returns the number of bits needed to encode v symbols
  83 | // -----------------------------------------------------------------------------
  84 | static unsigned
  85 | log2up(int v) {
  86 |   unsigned r = 0;
  87 |   const bool is_pow_of_2 = (v & (v - 1)) == 0;
  88 | 
  89 |   while (v >>= 1) r++;
  90 |   if (is_pow_of_2) return r;
  91 | 
  92 |   return r + 1;
  93 | }
  94 | 
  95 | // -----------------------------------------------------------------------------
  96 | // This is the context for a multi-page JBIG2 document.
  97 | // -----------------------------------------------------------------------------
  98 | struct jbig2ctx {
  99 |   struct JbClasser *classer;  // the leptonica classifier
 100 |   int xres, yres;  // ppi for the X and Y direction
 101 |   bool full_headers;  // true if we are producing a full JBIG2 file
 102 |   bool pdf_page_numbering;  // true if all text pages are page "1" (pdf mode)
 103 |   int segnum;  // current segment number
 104 |   int symtab_segment;  // the segment number of the symbol table
 105 |   // a map from page number a list of components for that page
 106 |   std::map<int, std::vector<int> > pagecomps;
 107 |   // for each page, the list of symbols which are only used on that page
 108 |   std::map<int, std::vector<unsigned> > single_use_symbols;
 109 |   // the number of symbols in the global symbol table
 110 |   int num_global_symbols;
 111 |   std::vector<int> page_xres, page_yres;
 112 |   std::vector<int> page_width, page_height;
 113 |   // Used to store the mapping from symbol number to the index in the global
 114 |   // symbol dictionary.
 115 |   std::map<int, int> symmap;
 116 |   bool refinement;
 117 |   PIXA *avg_templates;  // grayed templates
 118 |   int refine_level;
 119 |   // only used when using refinement
 120 |     // the number of the first symbol of each page
 121 |     std::vector<int> baseindexes;
 122 | };
 123 | 
 124 | // see comments in .h file
 125 | struct jbig2ctx *
 126 | jbig2_init(float thresh, float weight, int xres, int yres, bool full_headers,
 127 |            int refine_level) {
 128 |   struct jbig2ctx *ctx = new jbig2ctx;
 129 |   ctx->xres = xres;
 130 |   ctx->yres = yres;
 131 |   ctx->full_headers = full_headers;
 132 |   ctx->pdf_page_numbering = !full_headers;
 133 |   ctx->segnum = 0;
 134 |   ctx->symtab_segment = -1;
 135 |   ctx->refinement = refine_level >= 0;
 136 |   ctx->refine_level = refine_level;
 137 |   ctx->avg_templates = NULL;
 138 | 
 139 |   ctx->classer = jbCorrelationInitWithoutComponents(JB_CONN_COMPS, 9999, 9999,
 140 |                                                     thresh, weight);
 141 | 
 142 |   return ctx;
 143 | }
 144 | 
 145 | #if defined(UNIFICATION_DEBUGGING)
 146 | static void
 147 | print_list(std::list<int> &l) {
 148 |   for (std::list<int>::iterator it = l.begin(); it != l.end(); it++) {
 149 |     fprintf(stderr, "%d, ", (*it));
 150 |   }
 151 |   fprintf(stderr, "\n");
 152 | }
 153 | #endif
 154 | 
 155 | // -----------------------------------------------------------------------------
 156 | // unite_templates unites templates of the same character to chosen character
 157 | // template
 158 | //
 159 | //   ctx: structure containing templates of symbols.
 160 | //   target_char: char that will remain (united char will be replaced by this
 161 | //       char).
 162 | //   char_to_be_united: array of indexes to templates that should be replaced
 163 | //       by targetCharTemplate.
 164 | //   n: number of templates to be united.
 165 | //
 166 | // returns 0 on success and non-zero on error.
 167 | //
 168 | // TODO: find out which is the first index and transfer to this position target char
 169 | // -----------------------------------------------------------------------------
 170 | static int
 171 | unite_templates(struct jbig2ctx *ctx,
 172 |                 int new_representant,
 173 |                 std::list<int> &templates_to_be_united) {
 174 |   if (!ctx) {
 175 |     fprintf(stderr, "ctx not given");
 176 |     return 1;
 177 |   }
 178 | 
 179 |   if (templates_to_be_united.empty()) {
 180 |     fprintf(stderr, "given no templates for uniting");
 181 |     return 1;
 182 |   }
 183 | 
 184 | #if defined(UNIFICATION_DEBUGGING)
 185 |   fprintf(stderr, "Uniting templates to point to template %d:\n", new_representant);
 186 |   print_list(templates_to_be_united);
 187 | #endif
 188 | 
 189 |   // check if new_representant exists
 190 |   if ((new_representant < 0) ||
 191 |     (new_representant >= pixaGetCount(ctx->classer->pixat))) {
 192 |     fprintf(stderr, "new representant template out of range");
 193 |     return 1;
 194 |   }
 195 | 
 196 |   for (std::list<int>::iterator it = templates_to_be_united.begin();
 197 |         it != templates_to_be_united.end(); it++) {
 198 |     int second_template = (*it);
 199 |     if ((second_template < 0) ||
 200 |         (second_template >= pixaGetCount(ctx->classer->pixat))) {
 201 |       fprintf(stderr, "template: %d out of range", (*it));
 202 |       return 1;
 203 |     }
 204 | 
 205 |     // reindexing
 206 |     for (int i = 0; i < ctx->classer->naclass->n; i++) {
 207 |       int n;
 208 |       numaGetIValue(ctx->classer->naclass, i, &n);
 209 |       if (n == second_template) {
 210 |         numaSetValue(ctx->classer->naclass, i, new_representant);
 211 |       }
 212 |     }
 213 | #if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || LIBLEPT_MAJOR_VERSION > 1
 214 |     ctx->classer->pixat->pix[new_representant]->refcount += ctx->classer->pixat->pix[second_template]->refcount;
 215 | #else
 216 |     pixChangeRefcount(ctx->classer->pixat->pix[new_representant],pixGetRefcount(ctx->classer->pixat->pix[second_template]));
 217 | #endif
 218 |   }
 219 |   return 0;
 220 | }
 221 | 
 222 | 
 223 | static int
 224 | remove_templates(jbig2ctx *ctx, std::list<int> &templates_to_remove) {
 225 |   if (!ctx) {
 226 |     fprintf(stderr, "ctx not given\n");
 227 |     return 1;
 228 |   }
 229 |   if (templates_to_remove.empty()) {
 230 |     fprintf(stderr, "given no templates to remove\n");
 231 |     return 0;
 232 |   }
 233 |   templates_to_remove.sort();
 234 | 
 235 | #ifdef UNIFICATION_DEBUGGING
 236 |   fprintf(stderr, "Removing templates: ");
 237 |   print_list(templates_to_remove);
 238 | #endif
 239 | 
 240 |   std::list<int>::iterator it = templates_to_remove.begin();
 241 |   PIXA *pixat = ctx->classer->pixat;
 242 | 
 243 |   // index: represents pointer to dictionary (PIXAT) and is processed in
 244 |   //     reverse.
 245 |   // it: represents pointer to actual representant in list which should be
 246 |   //     removed.
 247 |   int last = templates_to_remove.back();
 248 |   for (int index = (pixat->n - 1); ((it != templates_to_remove.end()) && (index >= (*it))); index--) {
 249 |     // check if we assign PIX which should not be removed
 250 |     if (index == last) {
 251 |       templates_to_remove.pop_back();
 252 |       last = templates_to_remove.back();
 253 |     } else {
 254 |       PIX * end_pix;
 255 |       PIX * copied_pix;
 256 |       BOXA * boxa;
 257 |       int newIndex = (*it);
 258 |       if (index != newIndex) {
 259 |         end_pix = ctx->classer->pixat->pix[index];
 260 |         copied_pix = pixCopy(NULL, end_pix);
 261 |         boxa = ctx->classer->pixat->boxa;
 262 |         l_int32 nbox = boxaGetCount(boxa);
 263 |         BOX * box = NULL;
 264 |         if (index < nbox) {
 265 |           box = boxa->box[index];
 266 |         }
 267 |         if (pixaReplacePix(ctx->classer->pixat, newIndex, copied_pix, box)) {
 268 |           fprintf(stderr, "uniting - unable to replace pix %d in pixat\n", newIndex);
 269 |           return 2;
 270 |         }
 271 |         // reindexing
 272 |         for (int i = 0; i < ctx->classer->naclass->n; i++) {
 273 |           int n;
 274 |           numaGetIValue(ctx->classer->naclass, i, &n);
 275 |           if (n == index) {
 276 |             numaSetValue(ctx->classer->naclass, i, newIndex);
 277 |           }
 278 |         }
 279 |       }
 280 |       it++;
 281 |     }
 282 |     if (pixaRemovePix(ctx->classer->pixat, index)) {
 283 |       fprintf(stderr, "uniting - unable to remove pix with index %d from pixat\n", index);
 284 |       return 3;
 285 |     }
 286 |     ctx->classer->nclass--;
 287 |   }
 288 |   return 0;
 289 | }
 290 | 
 291 | // -----------------------------------------------------------------------------
 292 | // unites two templates to one template by reassigning indexes in numa struct
 293 | // and replacing deleted template by the last one
 294 | // -----------------------------------------------------------------------------
 295 | static int
 296 | unite_templates_with_indexes(struct jbig2ctx *ctx, int firstTemplateIndex,
 297 |                              int second_template_index) {
 298 |   if (!ctx) {
 299 |     fprintf(stderr, "ctx doesn't exist");
 300 |     return 1;
 301 |   }
 302 | 
 303 |   if ((ctx->classer->pixat->n < firstTemplateIndex) ||
 304 |       (ctx->classer->pixat->n < second_template_index)) {
 305 |     fprintf(stderr, "index doesn't point to templates array");
 306 |     return 1;
 307 |   }
 308 | 
 309 |   // reindexing
 310 |   for (int i = 0; i < ctx->classer->naclass->n; i++) {
 311 |     int n;
 312 |     numaGetIValue(ctx->classer->naclass, i, &n);
 313 |     if (n == second_template_index) {
 314 |       numaSetValue(ctx->classer->naclass, i, firstTemplateIndex);
 315 |     }
 316 |   }
 317 | 
 318 |   PIX * end_pix;
 319 |   PIX * copied_pix;
 320 |   BOXA * boxa;
 321 |   int index = pixaGetCount(ctx->classer->pixat) - 1;
 322 |   if (index != second_template_index) {
 323 |     end_pix = ctx->classer->pixat->pix[index];
 324 |     copied_pix = pixCopy(NULL, end_pix);
 325 |     boxa = ctx->classer->pixat->boxa;
 326 |     l_int32 nbox = boxaGetCount(boxa);
 327 |     BOX *box = NULL;
 328 |     if (index < nbox) {
 329 |       box = boxa->box[index];
 330 |     }
 331 |     if (pixaReplacePix(ctx->classer->pixat, second_template_index, copied_pix, box)) {
 332 |       fprintf(stderr, "uniting - unable to replace pix %d\n", second_template_index);
 333 |       return 2;
 334 |     }
 335 | 
 336 |     // reindexing
 337 |     for (int i = 0; i < ctx->classer->naclass->n; i++) {
 338 |       int n;
 339 |       numaGetIValue(ctx->classer->naclass, i, &n);
 340 |       if (n == index) {
 341 |         numaSetValue(ctx->classer->naclass, i, second_template_index);
 342 |       }
 343 |     }
 344 |   }
 345 | 
 346 |   if (pixaRemovePix(ctx->classer->pixat, index)) {
 347 |      fprintf(stderr, "uniting - unable to remove pix from pixat\n");
 348 |      return 3;
 349 |   }
 350 |   ctx->classer->nclass--;
 351 | 
 352 |   return 0;
 353 | }
 354 | 
 355 | // see comments in .h file
 356 | void
 357 | jbig2enc_auto_threshold(struct jbig2ctx *ctx) {
 358 |   if (!ctx) {
 359 |     fprintf(stderr, "jbig2ctx not given");
 360 |     return;
 361 |   }
 362 | 
 363 |   PIXA *pixa = ctx->classer->pixat;
 364 |   for (int i = 0; i < pixaGetCount(pixa); i++) {
 365 |     PIX *pix = pixa->pix[i];
 366 | 
 367 |     // The code only looks forward because jbig2enc_are_equivalent is
 368 |     // symmetric.
 369 |     for (int j = i+1; j < pixaGetCount(pixa); j++) {
 370 |       if (jbig2enc_are_equivalent(pix, pixa->pix[j])) {
 371 |         unite_templates_with_indexes(ctx, i, j);
 372 |         j--;
 373 |       }
 374 |     }
 375 |   }
 376 | }
 377 | 
 378 | #if defined(HASH_DEBUGGING)
 379 | static void
 380 | print_hash_map(std::map<unsigned int, list<int> > &hashed_templates) {
 381 |   std::map<unsigned int, list<int> >::iterator it;
 382 |   std::list<int>::iterator it_representants;
 383 | 
 384 |   for (it = hashed_templates.begin(); it != hashed_templates.end(); it++) {
 385 |     fprintf(stderr, "for hash %d:\n", it->first);
 386 |     fprintf(stderr, "  -- ");
 387 |     for (it_representants = it->second.begin(); it_representants != it->second.end(); it_representants++) {
 388 |       fprintf(stderr, "%d ", (*it_representants));
 389 |     }
 390 |     fprintf(stderr, "\n");
 391 |   }
 392 | }
 393 | #endif
 394 | 
 395 | static int
 396 | count_hash(PIX * pix, std::map<unsigned int, std::list<int> > &m, int template_index) {
 397 |   if (!pix) {
 398 |     fprintf(stderr, "no pix to count hash for\n");
 399 |     return 1;
 400 |   }
 401 | 
 402 |   l_uint32 w = pixGetWidth(pix);
 403 |   l_uint32 h = pixGetHeight(pix);
 404 | 
 405 |   // find number of holes.
 406 |   l_int32 holes;
 407 |   pixCountConnComp(pix, 4, &holes);
 408 | 
 409 |   unsigned int hash = (holes + 10 * h + 10000 * w) % 10000000;
 410 | 
 411 |   std::map<unsigned int, std::list<int> >::iterator it = m.find(hash);
 412 | 
 413 |   if (it == m.end()) {
 414 |     // Create new bin.
 415 |     it = m.begin();
 416 |     std::list<int> representants;
 417 |     representants.push_back(template_index);
 418 |     m.insert(std::pair<unsigned int, std::list<int> >(hash, representants));
 419 |   } else {
 420 |     // Add to existing bin.
 421 |     it->second.push_back(template_index);
 422 |   }
 423 |   return 0;
 424 | }
 425 | 
 426 | // see comments in .h file
 427 | void
 428 | jbig2enc_auto_threshold_using_hash(struct jbig2ctx *ctx) {
 429 |   if (!ctx) {
 430 |     fprintf(stderr, "jbig2ctx not given\n");
 431 |     return;
 432 |   }
 433 | 
 434 |   std::map<unsigned int, std::list<int> > hashed_templates;
 435 | 
 436 |   PIXA *pixa = ctx->classer->pixat;
 437 |   for (int i = 0; i < pixaGetCount(pixa); i++) {
 438 |     count_hash(pixa->pix[i], hashed_templates, i);
 439 |   }
 440 | 
 441 |   #ifdef HASH_DEBUGGING
 442 |     print_hash_map(hashed_templates);
 443 |   #endif
 444 | 
 445 |   // new_representant maps from a symbol to the list of symbols that should be
 446 |   // replaced by it.
 447 |   std::map<unsigned int, std::list<int> > new_representants;
 448 | 
 449 |   // going through representants with the same hash
 450 |   std::map<unsigned int, std::list<int> >::iterator it;
 451 |   std::list<int>::iterator first_template_it;
 452 |   std::list<int>::iterator second_template_it;
 453 | 
 454 |   for (it = hashed_templates.begin(); it != hashed_templates.end(); it++) {
 455 |     // compare all the templates with same hash.
 456 |     for (first_template_it = it->second.begin(); first_template_it != it->second.end();) {
 457 |       std::list<int> templates;
 458 |       second_template_it = first_template_it;
 459 | 
 460 |       for (++second_template_it; second_template_it != it->second.end();) {
 461 |         if (jbig2enc_are_equivalent(pixa->pix[(*first_template_it)], pixa->pix[(*second_template_it)])) {
 462 |           // unite templates without removing (just reindexing) but add to
 463 |           // array for later removal.
 464 |           templates.push_back(*second_template_it);
 465 |           second_template_it = (it->second.erase(second_template_it));
 466 |         } else {
 467 |           second_template_it++;
 468 |         }
 469 |       }
 470 |       if (!templates.empty()) {
 471 |         new_representants.insert(std::pair<unsigned int, std::list<int> >((*first_template_it), templates));
 472 |       }
 473 |       first_template_it++;
 474 |     }
 475 |   }
 476 | 
 477 |   std::list<int> templates_to_remove;
 478 |   for (it = new_representants.begin(); it != new_representants.end(); it++) {
 479 |     if (!unite_templates(ctx, it->first, it->second)) {
 480 |       templates_to_remove.merge(it->second);
 481 |     }
 482 |   }
 483 | 
 484 |   if (remove_templates(ctx, templates_to_remove)) {
 485 |     fprintf(stderr, "warning: removing united templates wasn't fully successful");
 486 |   }
 487 | }
 488 | 
 489 | // see comments in .h file
 490 | void
 491 | jbig2_destroy(struct jbig2ctx *ctx) {
 492 |   if (ctx->avg_templates) pixaDestroy(&ctx->avg_templates);
 493 |   jbClasserDestroy(&ctx->classer);
 494 |   delete ctx;
 495 | }
 496 | 
 497 | // see comments in .h file
 498 | void
 499 | jbig2_add_page(struct jbig2ctx *ctx, struct Pix *input) {
 500 |   PIX *bw;
 501 | 
 502 |   if (false /*ctx->xres >= 300*/) {
 503 |     bw = remove_flyspecks(input, (int) (0.0084*ctx->xres));
 504 |   } else {
 505 |     bw = pixClone(input);
 506 |   }
 507 | 
 508 |   if (ctx->refinement) {
 509 |     ctx->baseindexes.push_back(ctx->classer->baseindex);
 510 |   }
 511 | 
 512 |   jbAddPage(ctx->classer, bw);
 513 |   ctx->page_width.push_back(bw->w);
 514 |   ctx->page_height.push_back(bw->h);
 515 |   ctx->page_xres.push_back(bw->xres);
 516 |   ctx->page_yres.push_back(bw->yres);
 517 | 
 518 |   if (ctx->refinement) {
 519 |     // This code is broken by (my) recent changes to Leptonica. Needs to be
 520 |     // fixed at some point, but not too important at the moment since we don't
 521 |     // use refinement.
 522 | 
 523 |     /*BOXA *boxes = boxaCopy(ctx->classer->boxas, L_CLONE);
 524 |     ctx->boxes.push_back(boxes);
 525 |     PIXA *comps = pixaCopy(ctx->classer->pixas, L_CLONE);
 526 |     ctx->comps.push_back(comps);*/
 527 |   }
 528 | 
 529 |   pixDestroy(&bw);
 530 | }
 531 | 
 532 | #define F(x) memcpy(ret + offset, &x, sizeof(x)) ; offset += sizeof(x)
 533 | #define G(x, y) memcpy(ret + offset, x, y); offset += y;
 534 | #define SEGMENT(x) x.write(ret + offset); offset += x.size();
 535 | 
 536 | // see comments in .h file
 537 | uint8_t *
 538 | jbig2_pages_complete(struct jbig2ctx *ctx, int *const length, bool verbose) {
 539 |   /*
 540 |      Graying support - disabled.
 541 |      It's not very clear that graying actually buys you much extra quality
 542 |      above pick-the-first. Also, aligning the gray glyphs requires the
 543 |      original source image.
 544 | 
 545 |      Remember that you need the Init without WithoutComponents to use this */
 546 | 
 547 | 
 548 |   /*NUMA *samples_per_composition;
 549 |   PTA *grayed_centroids;
 550 |   PIXA *grayed;
 551 | 
 552 |   grayed = jbAccumulateComposites(ctx->classer->pixaa, &samples_per_composition,
 553 |                                   &grayed_centroids);
 554 | 
 555 |   if (!grayed || grayed->n != ctx->classer->pixaa->n) {
 556 |     fprintf(stderr, "Graying failed\n");
 557 |     return NULL;
 558 |   }
 559 | 
 560 |   ctx->avg_templates = pixaCreate(0);
 561 |   for (int i = 0; i < grayed->n; ++i) {
 562 |     int samples;
 563 |     numaGetIValue(samples_per_composition, i, &samples);
 564 |     PIX *avg = pixFinalAccumulateThreshold(grayed->pix[i], 0,
 565 |                                            (samples + 1) >> 1);
 566 |     pixaAddPix(ctx->avg_templates, avg, L_INSERT);
 567 |     //char b[512];
 568 |     //sprintf(b, "gray-%d/th.png", i);
 569 |     //pixWrite(b, avg, IFF_PNG);
 570 |   }
 571 | 
 572 |   pixaDestroy(&grayed);
 573 |   numaDestroy(&samples_per_composition);*/
 574 | 
 575 |   // We find the symbols which only appear on a single page and encode them in
 576 |   // a symbol dictionary just for that page. This is because we want to keep
 577 |   // the size of the global dictionary down as some PDF readers appear to
 578 |   // decode it for every page (!)
 579 | 
 580 |   // (as a short cut, we just pick the symbols which are only used once since,
 581 |   // in testing, all the symbols which appear on only one page appear only once
 582 |   // on that page)
 583 | 
 584 |   const bool single_page = ctx->classer->npages == 1;
 585 | 
 586 |   // maps symbol number to the number of times it has been used
 587 |   // pixat->n is the number of symbols
 588 |   // naclass->n is the number of connected components
 589 | 
 590 |   std::vector<unsigned> symbol_used(ctx->classer->pixat->n);
 591 |   for (int i = 0; i < ctx->classer->naclass->n; ++i) {
 592 |     int n;
 593 |     numaGetIValue(ctx->classer->naclass, i, &n);
 594 |     symbol_used[n]++;
 595 |   }
 596 | 
 597 |   // the multiuse symbols are the ones which go into the global dictionary
 598 |   std::vector<unsigned> multiuse_symbols;
 599 |   for (int i = 0; i < ctx->classer->pixat->n; ++i) {
 600 |     if (symbol_used[i] == 0) abort();
 601 |     if (symbol_used[i] > 1 || single_page) multiuse_symbols.push_back(i);
 602 |   }
 603 |   ctx->num_global_symbols = multiuse_symbols.size();
 604 | 
 605 |   // build the pagecomps map: a map from page number to the list of connected
 606 |   // components for that page. The classer gives us an array from connected
 607 |   // component number to page number - we just have to reverse it
 608 |   for (int i = 0; i < ctx->classer->napage->n; ++i) {
 609 |     int page_num;
 610 |     numaGetIValue(ctx->classer->napage, i, &page_num);
 611 |     ctx->pagecomps[page_num].push_back(i);
 612 |     int symbol;
 613 |     numaGetIValue(ctx->classer->naclass, i, &symbol);
 614 |     if (symbol_used[symbol] == 1 && !single_page) {
 615 |       ctx->single_use_symbols[page_num].push_back(symbol);
 616 |     }
 617 |   }
 618 | 
 619 | #ifdef DUMP_SYMBOL_GRAPH
 620 |   for (int p = 0; p < ctx->classer->npages; ++p) {
 621 |     for (std::vector<int>::const_iterator i = ctx->pagecomps[p].begin();
 622 |          i != ctx->pagecomps[p].end(); ++i) {
 623 |       const int sym = (int) ctx->classer->naclass->array[*i];
 624 |       fprintf(stderr, "S: %d %d\n", p, sym);
 625 |     }
 626 |   }
 627 | #endif
 628 | 
 629 | #ifdef SYMBOL_COMPRESSION_DEBUGGING
 630 |   std::map<int, int> usecount;
 631 |   for (int i = 0; i < ctx->classer->naclass->n; ++i) {
 632 |     usecount[(int)ctx->classer->naclass->array[i]]++;
 633 |   }
 634 | 
 635 |   for (int p = 0; p < ctx->classer->npages; ++p) {
 636 |     const int numcomps = ctx->pagecomps[p].size();
 637 |     int unique_in_doc = 0;
 638 |     std::map<int, int> symcount;
 639 |     for (std::vector<int>::const_iterator i = ctx->pagecomps[p].begin();
 640 |          i != ctx->pagecomps[p].end(); ++i) {
 641 |       const int sym = (int) ctx->classer->naclass->array[*i];
 642 |       symcount[sym]++;
 643 |       if (usecount[sym] == 1) unique_in_doc++;
 644 |     }
 645 |     int unique_this_page = 0;
 646 |     for (std::map<int, int>::const_iterator i = symcount.begin();
 647 |          i != symcount.end(); ++i) {
 648 |       if (i->second == 1) unique_this_page++;
 649 |     }
 650 | 
 651 |     fprintf(stderr, "Page %d %d/%d/%d\n", p, numcomps, unique_this_page, unique_in_doc);
 652 |   }
 653 | #endif
 654 | 
 655 | #ifdef DUMP_ALL_SYMBOLS
 656 |   char filenamebuf[128];
 657 |   for (int i = 0; i < ctx->classer->pixat->n; ++i) {
 658 |     sprintf(filenamebuf, "sym-%d.png", i);
 659 |     pixWrite(filenamebuf, ctx->classer->pixat->pix[i], IFF_PNG);
 660 |   }
 661 | #endif
 662 |   if (verbose) {
 663 |     fprintf(stderr, "JBIG2 compression complete. pages:%d symbols:%d log2:%d\n",
 664 |             ctx->classer->npages, ctx->classer->pixat->n,
 665 |             log2up(ctx->classer->pixat->n));
 666 |   }
 667 |   jbGetLLCorners(ctx->classer);
 668 | 
 669 |   struct jbig2enc_ctx ectx;
 670 |   jbig2enc_init(&ectx);
 671 | 
 672 |   struct jbig2_file_header header;
 673 |   if (ctx->full_headers) {
 674 |     memset(&header, 0, sizeof(header));
 675 |     header.n_pages = htonl(ctx->classer->npages);
 676 |     header.organisation_type = 1;
 677 |     memcpy(&header.id, JBIG2_FILE_MAGIC, 8);
 678 |   }
 679 | 
 680 |   Segment seg;
 681 |   struct jbig2_symbol_dict symtab;
 682 |   memset(&symtab, 0, sizeof(symtab));
 683 | 
 684 |   jbig2enc_symboltable
 685 |     (&ectx, ctx->avg_templates ? ctx->avg_templates : ctx->classer->pixat,
 686 |      &multiuse_symbols, &ctx->symmap, ctx->avg_templates == NULL);
 687 |   const int symdatasize = jbig2enc_datasize(&ectx);
 688 | 
 689 |   symtab.a1x = 3;
 690 |   symtab.a1y = -1;
 691 |   symtab.a2x = -3;
 692 |   symtab.a2y = -1;
 693 |   symtab.a3x = 2;
 694 |   symtab.a3y = -2;
 695 |   symtab.a4x = -2;
 696 |   symtab.a4y = -2;
 697 |   symtab.exsyms = symtab.newsyms = htonl(multiuse_symbols.size());
 698 | 
 699 |   ctx->symtab_segment = ctx->segnum;
 700 |   seg.number = ctx->segnum;
 701 |   ctx->segnum++;
 702 |   seg.type = segment_symbol_table;
 703 |   seg.len = sizeof(symtab) + symdatasize;
 704 |   seg.page = 0;
 705 |   seg.retain_bits = 1;
 706 | 
 707 |   u8 *const ret = (u8 *) malloc((ctx->full_headers ? sizeof(header) : 0) +
 708 |                                 seg.size() + sizeof(symtab) + symdatasize);
 709 |   int offset = 0;
 710 |   if (ctx->full_headers) {
 711 |     F(header);
 712 |   }
 713 |   SEGMENT(seg);
 714 |   F(symtab);
 715 |   jbig2enc_tobuffer(&ectx, ret + offset);
 716 |   jbig2enc_dealloc(&ectx);
 717 |   offset += symdatasize;
 718 | 
 719 |   *length = offset;
 720 | 
 721 |   return ret;
 722 | }
 723 | 
 724 | // see comments in .h file
 725 | uint8_t *
 726 | jbig2_produce_page(struct jbig2ctx *ctx, int page_no,
 727 |                    int xres, int yres, int *const length) {
 728 |   const bool last_page = page_no == ctx->classer->npages;
 729 |   const bool include_trailer = last_page && ctx->full_headers;
 730 | 
 731 |   struct jbig2enc_ctx ectx;
 732 |   jbig2enc_init(&ectx);
 733 | 
 734 |   Segment seg, symseg;
 735 |   Segment endseg, trailerseg;
 736 |   struct jbig2_page_info pageinfo;
 737 |   memset(&pageinfo, 0, sizeof(pageinfo));
 738 |   struct jbig2_text_region textreg;
 739 |   memset(&textreg, 0, sizeof(textreg));
 740 |   struct jbig2_text_region_syminsts textreg_syminsts;
 741 |   memset(&textreg_syminsts, 0, sizeof(textreg_syminsts));
 742 |   struct jbig2_text_region_atflags textreg_atflags;
 743 |   memset(&textreg_atflags, 0, sizeof(textreg_atflags));
 744 |   Segment segr;
 745 | 
 746 |   // page information segment
 747 |   seg.number = ctx->segnum;
 748 |   ctx->segnum++;
 749 |   seg.type = segment_page_information;
 750 |   seg.page = ctx->pdf_page_numbering ? 1 : 1 + page_no;
 751 |   seg.len = sizeof(struct jbig2_page_info);
 752 |   pageinfo.width = htonl(ctx->page_width[page_no]);
 753 |   pageinfo.height = htonl(ctx->page_height[page_no]);
 754 |   pageinfo.xres = htonl(xres == -1 ? ctx->page_xres[page_no] : xres );
 755 |   pageinfo.yres = htonl(yres == -1 ? ctx->page_yres[page_no] : yres );
 756 |   pageinfo.is_lossless = ctx->refinement;
 757 | 
 758 |   std::map<int, int> second_symbol_map;
 759 |   // If we have single-use symbols on this page we make a new symbol table
 760 |   // containing just them.
 761 |   const bool extrasymtab = ctx->single_use_symbols[page_no].size() > 0;
 762 |   struct jbig2enc_ctx extrasymtab_ctx;
 763 | 
 764 |   struct jbig2_symbol_dict symtab;
 765 |   memset(&symtab, 0, sizeof(symtab));
 766 | 
 767 |   if (extrasymtab) {
 768 |     jbig2enc_init(&extrasymtab_ctx);
 769 |     symseg.number = ctx->segnum++;
 770 |     symseg.type = segment_symbol_table;
 771 |     symseg.page = ctx->pdf_page_numbering ? 1 : 1 + page_no;
 772 | 
 773 |     jbig2enc_symboltable
 774 |       (&extrasymtab_ctx,
 775 |        ctx->avg_templates ? ctx->avg_templates : ctx->classer->pixat,
 776 |        &ctx->single_use_symbols[page_no], &second_symbol_map,
 777 |        ctx->avg_templates == NULL);
 778 |     symtab.a1x = 3;
 779 |     symtab.a1y = -1;
 780 |     symtab.a2x = -3;
 781 |     symtab.a2y = -1;
 782 |     symtab.a3x = 2;
 783 |     symtab.a3y = -2;
 784 |     symtab.a4x = -2;
 785 |     symtab.a4y = -2;
 786 |     symtab.exsyms = symtab.newsyms =
 787 |       htonl(ctx->single_use_symbols[page_no].size());
 788 | 
 789 |     symseg.len = jbig2enc_datasize(&extrasymtab_ctx) + sizeof(symtab);
 790 |   }
 791 | 
 792 |   const int numsyms = ctx->num_global_symbols +
 793 |                       ctx->single_use_symbols[page_no].size();
 794 |   //BOXA *const boxes = ctx->refinement ? ctx->boxes[page_no] : NULL;
 795 |   int baseindex = ctx->refinement ? ctx->baseindexes[page_no] : 0;
 796 |   jbig2enc_textregion(&ectx, ctx->symmap, second_symbol_map,
 797 |                       ctx->pagecomps[page_no],
 798 |                       ctx->classer->ptall,
 799 |                       ctx->avg_templates ? ctx->avg_templates : ctx->classer->pixat,
 800 |                       ctx->classer->naclass, 1,
 801 |                       log2up(numsyms),
 802 |                       //ctx->refinement ? ctx->comps[page_no] : NULL,
 803 |                       NULL,
 804 |                       /* boxes */ NULL, baseindex, ctx->refine_level,
 805 |                       ctx->avg_templates == NULL);
 806 |   const int textdatasize = jbig2enc_datasize(&ectx);
 807 |   textreg.width = htonl(ctx->page_width[page_no]);
 808 |   textreg.height = htonl(ctx->page_height[page_no]);
 809 |   textreg.logsbstrips = 0;
 810 |   textreg.sbrefine = ctx->refinement;
 811 |   // refcorner = 0 -> bot left
 812 |   textreg_syminsts.sbnuminstances = htonl(ctx->pagecomps[page_no].size());
 813 | 
 814 |   textreg_atflags.a1x = -1;
 815 |   textreg_atflags.a1y = -1;
 816 |   textreg_atflags.a2x = -1;
 817 |   textreg_atflags.a2y = -1;
 818 | 
 819 |   segr.number = ctx->segnum;
 820 |   ctx->segnum++;
 821 |   segr.type = segment_imm_text_region;
 822 |   segr.referred_to.push_back(ctx->symtab_segment);
 823 |   if (extrasymtab) segr.referred_to.push_back(symseg.number);
 824 |   if (ctx->refinement) {
 825 |     segr.len = sizeof(textreg) + sizeof(textreg_syminsts) +
 826 |                sizeof(textreg_atflags) + textdatasize;
 827 |   } else {
 828 |     segr.len = sizeof(textreg) + sizeof(textreg_syminsts) + textdatasize;
 829 |   }
 830 | 
 831 |   segr.retain_bits = 2;
 832 |   segr.page = ctx->pdf_page_numbering ? 1 : 1 + page_no;
 833 | 
 834 |   const int extrasymtab_size = extrasymtab ?
 835 |     jbig2enc_datasize(&extrasymtab_ctx) : 0;
 836 | 
 837 |   if (ctx->full_headers) {
 838 |     endseg.number = ctx->segnum;
 839 |     ctx->segnum++;
 840 |     endseg.type = segment_end_of_page;
 841 |     endseg.page = ctx->pdf_page_numbering ? 1 : 1 + page_no;
 842 |   }
 843 | 
 844 |   if (include_trailer) {
 845 |     trailerseg.number = ctx->segnum;
 846 |     ctx->segnum++;
 847 |     trailerseg.type = segment_end_of_file;
 848 |     trailerseg.page = 0;
 849 |   }
 850 | 
 851 |   const int totalsize = seg.size() + sizeof(pageinfo) +
 852 |                         (extrasymtab ? (extrasymtab_size + symseg.size() +
 853 |                                         sizeof(symtab)) : 0) +
 854 |                         segr.size() +
 855 |                         sizeof(textreg) + sizeof(textreg_syminsts) +
 856 |                         (ctx->refinement ? sizeof(textreg_atflags) : 0) +
 857 |                         textdatasize +
 858 |                         (ctx->full_headers ? endseg.size() : 0) +
 859 |                         (include_trailer ? trailerseg.size() : 0);
 860 |   u8 *ret = (u8 *) malloc(totalsize);
 861 |   int offset = 0;
 862 | 
 863 |   SEGMENT(seg);
 864 |   F(pageinfo);
 865 |   if (extrasymtab) {
 866 |     SEGMENT(symseg);
 867 |     F(symtab);
 868 |     jbig2enc_tobuffer(&extrasymtab_ctx, ret + offset);
 869 |     offset += extrasymtab_size;
 870 |   }
 871 |   SEGMENT(segr);
 872 |   F(textreg);
 873 |   if (ctx->refinement) {
 874 |     F(textreg_atflags);
 875 |   }
 876 |   F(textreg_syminsts);
 877 |   jbig2enc_tobuffer(&ectx, ret + offset); offset += textdatasize;
 878 |   if (ctx->full_headers) {
 879 |     SEGMENT(endseg);
 880 |   }
 881 |   if (include_trailer) {
 882 |     SEGMENT(trailerseg);
 883 |   }
 884 | 
 885 |   if (totalsize != offset) abort();
 886 | 
 887 |   jbig2enc_dealloc(&ectx);
 888 |   if (extrasymtab) jbig2enc_dealloc(&extrasymtab_ctx);
 889 | 
 890 |   *length = offset;
 891 |   return ret;
 892 | }
 893 | 
 894 | #undef F
 895 | #undef G
 896 | 
 897 | // see comments in .h file
 898 | u8 *
 899 | jbig2_encode_generic(struct Pix *const bw, const bool full_headers, const int xres,
 900 |                      const int yres, const bool duplicate_line_removal,
 901 |                      int *const length) {
 902 |   int segnum = 0;
 903 | 
 904 |   if (!bw) return NULL;
 905 |   pixSetPadBits(bw, 0);
 906 | 
 907 |   struct jbig2_file_header header;
 908 |   if (full_headers) {
 909 |     memset(&header, 0, sizeof(header));
 910 |     header.n_pages = htonl(1);
 911 |     header.organisation_type = 1;
 912 |     memcpy(&header.id, JBIG2_FILE_MAGIC, 8);
 913 |   }
 914 | 
 915 |   // setup compression
 916 |   struct jbig2enc_ctx ctx;
 917 |   jbig2enc_init(&ctx);
 918 | 
 919 |   Segment seg, seg2, endseg;
 920 |   jbig2_page_info pageinfo;
 921 |   memset(&pageinfo, 0, sizeof(pageinfo));
 922 |   jbig2_generic_region genreg;
 923 |   memset(&genreg, 0, sizeof(genreg));
 924 | 
 925 |   seg.number = segnum;
 926 |   segnum++;
 927 |   seg.type = segment_page_information;
 928 |   seg.page = 1;
 929 |   seg.len = sizeof(struct jbig2_page_info);
 930 |   pageinfo.width = htonl(bw->w);
 931 |   pageinfo.height = htonl(bw->h);
 932 |   pageinfo.xres = htonl(xres ? xres : bw->xres);
 933 |   pageinfo.yres = htonl(yres ? yres : bw->yres);
 934 |   pageinfo.is_lossless = 1;
 935 | 
 936 | #ifdef SURPRISE_MAP
 937 |   dprintf(3, "P5\n%d %d 255\n", bw->w, bw->h);
 938 | #endif
 939 | 
 940 |   jbig2enc_bitimage(&ctx, (u8 *) bw->data, bw->w, bw->h, duplicate_line_removal);
 941 |   jbig2enc_final(&ctx);
 942 |   const int datasize = jbig2enc_datasize(&ctx);
 943 | 
 944 |   seg2.number = segnum;
 945 |   segnum++;
 946 |   seg2.type = segment_imm_generic_region;
 947 |   seg2.page = 1;
 948 |   seg2.len = sizeof(genreg) + datasize;
 949 | 
 950 |   endseg.number = segnum;
 951 |   segnum++;
 952 |   endseg.page = 1;
 953 | 
 954 |   genreg.width = htonl(bw->w);
 955 |   genreg.height = htonl(bw->h);
 956 |   if (duplicate_line_removal) {
 957 |     genreg.tpgdon = true;
 958 |   }
 959 |   genreg.a1x = 3;
 960 |   genreg.a1y = -1;
 961 |   genreg.a2x = -3;
 962 |   genreg.a2y = -1;
 963 |   genreg.a3x = 2;
 964 |   genreg.a3y = -2;
 965 |   genreg.a4x = -2;
 966 |   genreg.a4y = -2;
 967 | 
 968 |   const int totalsize = seg.size() + sizeof(pageinfo) + seg2.size() +
 969 |                         sizeof(genreg) + datasize +
 970 |                         (full_headers ? (sizeof(header) + 2*endseg.size()) : 0);
 971 |   u8 *const ret = (u8 *) malloc(totalsize);
 972 |   int offset = 0;
 973 | 
 974 | #define F(x) memcpy(ret + offset, &x, sizeof(x)) ; offset += sizeof(x)
 975 |   if (full_headers) {
 976 |     F(header);
 977 |   }
 978 |   SEGMENT(seg);
 979 |   F(pageinfo);
 980 |   SEGMENT(seg2);
 981 |   F(genreg);
 982 |   jbig2enc_tobuffer(&ctx, ret + offset);
 983 |   offset += datasize;
 984 | 
 985 |   if (full_headers) {
 986 |     endseg.type = segment_end_of_page;
 987 |     SEGMENT(endseg);
 988 |     endseg.number += 1;
 989 |     endseg.type = segment_end_of_file;
 990 |     SEGMENT(endseg);
 991 |   }
 992 | 
 993 |   if (totalsize != offset) abort();
 994 | 
 995 |   jbig2enc_dealloc(&ctx);
 996 | 
 997 |   *length = offset;
 998 | 
 999 |   return ret;
1000 | }
1001 | 
1002 | 


--------------------------------------------------------------------------------
/src/jbig2enc.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Google Inc. All Rights Reserved.
  2 | // Author: agl@imperialviolet.org (Adam Langley)
  3 | //
  4 | // Copyright (C) 2006 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #ifndef JBIG2ENC_JBIG2_H__
 19 | #define JBIG2ENC_JBIG2_H__
 20 | 
 21 | // -----------------------------------------------------------------------------
 22 | // Welcome gentle reader,
 23 | //
 24 | // This is an encoder for JBIG2:
 25 | // www.jpeg.org/public/fcd14492.pdf
 26 | //
 27 | // JBIG2 encodes bi-level (1 bpp) images using a number of clever tricks to get
 28 | // better compression than G4. This encoder can:
 29 | //    * Generate JBIG2 files, or fragments for embedding in PDFs
 30 | //    * Generic region encoding
 31 | //    * Symbol extraction, classification and text region coding
 32 | //
 33 | // It uses the (Apache-ish licensed) Leptonica library:
 34 | //   http://www.leptonica.com/
 35 | // -----------------------------------------------------------------------------
 36 | 
 37 | #if defined(sun)
 38 | #include <sys/types.h>
 39 | #else
 40 | #include <stdint.h>
 41 | #endif
 42 | 
 43 | // -----------------------------------------------------------------------------
 44 | // Returns the version identifier as a static string.
 45 | // -------------------------------------------------------------------------
 46 | char const *getVersion();
 47 | 
 48 | struct Pix;
 49 | // This is the (opaque) structure which handles multi-page compression.
 50 | struct jbig2ctx;
 51 | 
 52 | // -----------------------------------------------------------------------------
 53 | // Multipage compression.
 54 | //
 55 | // First call jbig2_init to setup the structure. This structure must be free'ed
 56 | // by calling jbig2_destroy when you are finished.
 57 | //
 58 | // Next, add all the pages with jbig2_add_page. This will collect all the
 59 | // information required. If refinement is on, it will also save all the
 60 | // component images, so this may take large amounts of memory.
 61 | //
 62 | // Then call jbig2_pages_complete. This returns a malloced buffer with the
 63 | // symbol table encoded.
 64 | //
 65 | // Then call jbig2_produce_page for each page. You must call it with pages
 66 | // numbered from zero, and for every page.
 67 | // -----------------------------------------------------------------------------
 68 | 
 69 | // -----------------------------------------------------------------------------
 70 | // Create a multi-page compression context structure
 71 | //
 72 | // thresh: The threshold for the classifier. The larger the number the larger
 73 | //         the number of different symbols, the more bits used and the closer
 74 | //         the resulting image is to the original. (0.85 is a good value)
 75 | // weight: Use 0.5
 76 | // xres: the ppi in the X direction. If 0, the ppi is taken from bw
 77 | // yres: see xres
 78 | // full_headers: if true a full JBIG2 file is produced, otherwise the data is
 79 | //               only good for embedding in PDFs
 80 | // refine_level: If < 0, disable refinement. Otherwise, the number of
 81 | //               incorrect pixels which will be accepted per symbol.
 82 | //               Enabling refinement increases memory use.
 83 | // -----------------------------------------------------------------------------
 84 | struct jbig2ctx *jbig2_init(float thresh, float weight, int xres, int yres,
 85 |                             bool full_headers, int refine_level);
 86 | 
 87 | // -----------------------------------------------------------------------------
 88 | // Delete a context returned by jbig2_init
 89 | // -----------------------------------------------------------------------------
 90 | void jbig2_destroy(struct jbig2ctx *);
 91 | // -----------------------------------------------------------------------------
 92 | // Classify and record information about a page.
 93 | //
 94 | // bw: A 1-bpp image
 95 | // -----------------------------------------------------------------------------
 96 | void jbig2_add_page(struct jbig2ctx *ctx, struct Pix *bw);
 97 | // -----------------------------------------------------------------------------
 98 | // Finalise information about the document and encode the symbol table.
 99 | //
100 | // WARNING: returns a malloced buffer which the caller must free
101 | // -----------------------------------------------------------------------------
102 | uint8_t *jbig2_pages_complete(struct jbig2ctx *ctx, int *const length,
103 |                               bool verbose=false);
104 | // -----------------------------------------------------------------------------
105 | // Encode a page.
106 | //
107 | // page_no: number of this page, indexed from 0. This *must* match the order of
108 | //          pages presented to jbig2_add_page.
109 | // xres, yres: if -1, use values given in _init. Otherwise, set the resolution
110 | //             for this page only
111 | //
112 | // WARNING: returns a malloced buffer which the caller must free
113 | // -----------------------------------------------------------------------------
114 | uint8_t *jbig2_produce_page(struct jbig2ctx *ctx, int page_no, int xres,
115 |                             int yres, int *const length);
116 | 
117 | // WARNING: returns a malloced buffer which the caller must free
118 | // -----------------------------------------------------------------------------
119 | 
120 | 
121 | // -----------------------------------------------------------------------------
122 | // Single page compression
123 | // -----------------------------------------------------------------------------
124 | 
125 | // -----------------------------------------------------------------------------
126 | // Encode an image as a single generic region. This is lossless. It should not
127 | // be used for images as half-tone coding is not implemented.
128 | //
129 | // see argument comments for jbig2_init
130 | // duplicate_line_removal: turning this on
131 | //    * Breaks ghostscript
132 | //    * Takes ever so slightly more bytes to encode
133 | //    * Cuts the encode time by half
134 | //
135 | // WARNING: returns a malloced buffer which the caller must free
136 | // -----------------------------------------------------------------------------
137 | uint8_t *
138 | jbig2_encode_generic(struct Pix *const bw, const bool full_headers,
139 |                      const int xres, const int yres,
140 |                      const bool duplicate_line_removal,
141 |                      int *const length);
142 | 
143 | // -------------------------------------------------------------------------------
144 | // jbig2enc_auto_threshold gathers classes of symbols and uses a single
145 | // representative to stand for them all.
146 | // -------------------------------------------------------------------------------
147 | void jbig2enc_auto_threshold(struct jbig2ctx *ctx);
148 | 
149 | // -------------------------------------------------------------------------------
150 | // auto_threshold_using_hash performs the same action as auto_threshold, but
151 | // uses a hash function to attempt to quickly discard improbable matches.
152 | // -------------------------------------------------------------------------------
153 | void jbig2enc_auto_threshold_using_hash(struct jbig2ctx *ctx);
154 | 
155 | #endif  // JBIG2ENC_JBIG2_H__
156 | 


--------------------------------------------------------------------------------
/src/jbig2segments.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Google Inc. All Rights Reserved.
  2 | // Author: agl@imperialviolet.org (Adam Langley)
  3 | //
  4 | // Copyright (C) 2006 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #ifndef THIRD_PARTY_JBIG2ENC_JBIG2SEGMENTS_H__
 19 | #define THIRD_PARTY_JBIG2ENC_JBIG2SEGMENTS_H__
 20 | 
 21 | #include <vector>
 22 | #ifdef WIN32
 23 | #include <winsock2.h>
 24 | #else
 25 | #include <netinet/in.h>
 26 | #endif
 27 | 
 28 | // -----------------------------------------------------------------------------
 29 | // See comments in jbig2structs.h about the bit packing in this structure.
 30 | // -----------------------------------------------------------------------------
 31 | #if defined(WIN32)
 32 | #pragma pack(1)
 33 | #endif
 34 | struct jbig2_segment {
 35 |   u32 number;
 36 | #ifndef __BIG_ENDIAN__
 37 |   unsigned char type : 6;
 38 |   unsigned char page_assoc_size : 1;
 39 |   unsigned char deferred_non_retain : 1;
 40 | #else
 41 |   unsigned char deferred_non_retain : 1;
 42 |   unsigned char page_assoc_size : 1;
 43 |   unsigned char type : 6;
 44 | #endif
 45 | 
 46 | #ifndef __BIG_ENDIAN__
 47 |   unsigned char retain_bits : 5;
 48 |   unsigned char segment_count : 3;
 49 | #else
 50 |   unsigned char segment_count : 3;
 51 |   unsigned char retain_bits : 5;
 52 | #endif
 53 | }
 54 | #if defined(WIN32)
 55 | ;
 56 | #pragma pack()
 57 | #else
 58 | __attribute__((packed));
 59 | #endif
 60 | ;
 61 | 
 62 | // -----------------------------------------------------------------------------
 63 | // This structure represents a JBIG2 segment header because they have too many
 64 | // variable length fields (number of referred to segments, page length etc).
 65 | // You should access and set the members directly. Endian swapping is carried
 66 | // out internally.
 67 | // -----------------------------------------------------------------------------
 68 | struct Segment {
 69 |   unsigned number;  // segment number
 70 |   int type;  // segment type (see enum in jbig2structs.h)
 71 |   int deferred_non_retain;  // see JBIG2 spec
 72 |   int retain_bits;
 73 |   std::vector<unsigned> referred_to;  // list of segment numbers referred to
 74 |   unsigned page;  // page number
 75 |   unsigned len;   // length of trailing data
 76 | 
 77 |   Segment()
 78 |       : number(0),
 79 |         type(0),
 80 |         deferred_non_retain(0),
 81 |         retain_bits(0),
 82 |         page(0),
 83 |         len(0) {}
 84 | 
 85 |   // ---------------------------------------------------------------------------
 86 |   // Return the size of the segment reference for this segment. Segments can
 87 |   // only refer to previous segments, so the bits needed is determined by the
 88 |   // number of this segment. (7.2.5)
 89 |   // ---------------------------------------------------------------------------
 90 |   unsigned reference_size() const {
 91 |     int refsize;
 92 |     if (number <= 256) {
 93 |       refsize = 1;
 94 |     } else if (number <= 65536) {
 95 |       refsize = 2;
 96 |     } else {
 97 |       refsize = 4;
 98 |     }
 99 | 
100 |     return refsize;
101 |   }
102 | 
103 |   // ---------------------------------------------------------------------------
104 |   // Return the size of the segment page association field for this segment.
105 |   // (7.2.6)
106 |   // ---------------------------------------------------------------------------
107 |   unsigned page_size() const {
108 |       return page <= 255 ? 1 : 4;
109 |   }
110 | 
111 |   // ---------------------------------------------------------------------------
112 |   // Return the number of bytes that this segment header will take up
113 |   // ---------------------------------------------------------------------------
114 |   unsigned size() const {
115 |     const int refsize = reference_size();
116 |     const int pagesize = page_size();
117 | 
118 |     return sizeof(struct jbig2_segment) + refsize * referred_to.size() +
119 |            pagesize + sizeof(u32);
120 |   }
121 | 
122 |   // ---------------------------------------------------------------------------
123 |   // Serialise this segment header into the memory pointed to by buf, which
124 |   // must be at least long enough to contain it (e.g. size() bytes)
125 |   // ---------------------------------------------------------------------------
126 |   void write(u8 *buf) {
127 |     struct jbig2_segment s;
128 |     memset(&s, 0, sizeof(s));
129 | #define F(x) s.x = x;
130 |     s.number = htonl(number);
131 |     s.type = type;
132 |     s.deferred_non_retain = deferred_non_retain;
133 |     s.retain_bits = retain_bits;
134 | #undef F
135 |     s.segment_count = referred_to.size();
136 | 
137 |     const int pagesize = page_size();
138 |     const int refsize = reference_size();
139 |     if (pagesize == 4) s.page_assoc_size = 1;
140 | 
141 |     unsigned j = 0;
142 | 
143 |     memcpy(buf, &s, sizeof(s));
144 |     j += sizeof(s);
145 | #define APPEND(type, val) type __i; __i = val; \
146 |     memcpy(&buf[j], &__i, sizeof(type)); \
147 |     j += sizeof(type)
148 | 
149 |     for (std::vector<unsigned>::const_iterator i = referred_to.begin();
150 |          i != referred_to.end(); ++i) {
151 |       if (refsize == 4) {
152 |         APPEND(u32, htonl(*i));
153 |       } else if (refsize == 2) {
154 |         APPEND(u16, htons(*i));
155 |       } else {
156 |         APPEND(u8, *i);
157 |       }
158 |     }
159 | 
160 |     if (pagesize == 4) {
161 |       APPEND(u32, htonl(page));
162 |     } else {
163 |       APPEND(u8, page);
164 |     }
165 | 
166 |     APPEND(u32, htonl(len));
167 | 
168 |     if (j != size()) abort();
169 |   }
170 | };
171 | 
172 | #endif  // THIRD_PARTY_JBIG2ENC_JBIG2SEGMENTS_H__
173 | 


--------------------------------------------------------------------------------
/src/jbig2structs.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Google Inc. All Rights Reserved.
  2 | // Author: agl@imperialviolet.org (Adam Langley)
  3 | //
  4 | // Copyright (C) 2006 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #ifndef JBIG2ENC_JBIG2STRUCTS_H__
 19 | #define JBIG2ENC_JBIG2STRUCTS_H__
 20 | 
 21 | // GCC packs bit fields in a different order on big endian machines
 22 | 
 23 | enum {
 24 |   segment_symbol_table = 0,
 25 |   segment_imm_generic_region = 38,
 26 |   segment_page_information = 48,
 27 |   segment_imm_text_region =  6,
 28 |   segment_end_of_page = 49,
 29 |   segment_end_of_file = 51
 30 | };
 31 | 
 32 | // note that the < 1 byte fields are packed from the LSB upwards - unless
 33 | // you're bigendian, in which case they are packed MSB downwards. Joy.
 34 | 
 35 | #define JBIG2_FILE_MAGIC "\x97\x4a\x42\x32\x0d\x0a\x1a\x0a"
 36 | 
 37 | #if defined(WIN32)
 38 | #pragma pack(1)
 39 | #define PACKED
 40 | #else
 41 | #define PACKED __attribute__((packed))
 42 | #endif
 43 | 
// Packed file-level header: an 8-byte id, one byte of flag bits and a 32-bit
// page count. The flag bits are declared in reverse order in the big-endian
// branch so that they occupy the same bits of the byte either way (see the
// note on bit-field packing above).
struct jbig2_file_header {
  u8 id[8];  // presumably filled with JBIG2_FILE_MAGIC - confirm in the writer
#ifndef __BIG_ENDIAN__
  u8 organisation_type : 1;
  u8 unknown_n_pages : 1;
  u8 reserved : 6;
#else
  u8 reserved : 6;
  u8 unknown_n_pages : 1;
  u8 organisation_type : 1;
#endif
  u32 n_pages;  // NOTE(review): likely stored in network byte order - confirm
} PACKED;
 57 | 
// Packed body of a page information segment: four 32-bit values (dimensions
// and resolution), one byte of page flags and a 16-bit trailing field.
// The flag byte is declared in reverse order in the big-endian branch so the
// fields occupy the same bits either way.
struct jbig2_page_info {
  u32 width;
  u32 height;
  u32 xres;
  u32 yres;
#ifndef __BIG_ENDIAN__
  u8 is_lossless : 1;
  u8 contains_refinements : 1;
  u8 default_pixel : 1;
  u8 default_operator : 2;
  u8 aux_buffers : 1;
  u8 operator_override : 1;
  u8 reserved : 1;
#else
  u8 reserved : 1;
  u8 operator_override : 1;
  u8 aux_buffers : 1;
  u8 default_operator : 2;
  u8 default_pixel : 1;
  u8 contains_refinements : 1;
  u8 is_lossless : 1;
#endif
  u16 segment_flags;
} PACKED;
 82 | 
// Packed header of a generic region segment: the region rectangle, a
// combination operator byte, one byte of coding flags and eight signed
// template pixel offsets. jbig2_encode_generic() stores width and height via
// htonl(), sets tpgdon when duplicate line removal is requested, and fills in
// all four (x, y) offset pairs.
struct jbig2_generic_region {
  u32 width;
  u32 height;
  u32 x;
  u32 y;
  u8 comb_operator;

  // One flag byte; declared in reverse order in the big-endian branch so the
  // fields occupy the same bits either way.
#ifndef __BIG_ENDIAN__
  u8 mmr : 1;
  u8 gbtemplate : 2;
  u8 tpgdon : 1;
  u8 reserved : 4;
#else
  u8 reserved : 4;
  u8 tpgdon : 1;
  u8 gbtemplate : 2;
  u8 mmr : 1;
#endif

  // Template pixel offsets, as (x, y) pairs a1..a4. You may not need to write
  // all 8 bytes here. If the template is 1..3 only the first two are needed.
  signed char a1x, a1y, a2x, a2y, a3x, a3y, a4x, a4y;
} PACKED ;
106 | 
// Packed header of a symbol dictionary segment: two bytes of flags, eight
// signed template pixel offsets and the exported / new symbol counts.
//
// NOTE(review): the two branches below do not agree on which flag byte comes
// first. The little-endian branch puts sdhuff..sdhuffagginst in the first
// byte, while the big-endian branch puts reserved..bmcontext in the first
// byte. Reversing bit order for a big-endian compiler should be done per
// byte, so one of the branches is presumably wrong - confirm against the
// segment flag layout in the JBIG2 spec before setting any of these flags to
// a non-zero value.
struct jbig2_symbol_dict {
#ifndef __BIG_ENDIAN__
  u8 sdhuff:1;
  u8 sdrefagg:1;
  u8 sdhuffdh:2;
  u8 sdhuffdw:2;
  u8 sdhuffbmsize:1;
  u8 sdhuffagginst:1;
  u8 bmcontext:1;
  u8 bmcontextretained:1;
  u8 sdtemplate:2;
  u8 sdrtemplate:1;
  u8 reserved:3;
#else
  u8 reserved:3;
  u8 sdrtemplate:1;
  u8 sdtemplate:2;
  u8 bmcontextretained:1;
  u8 bmcontext:1;
  u8 sdhuffagginst:1;
  u8 sdhuffbmsize:1;
  u8 sdhuffdw:2;
  u8 sdhuffdh:2;
  u8 sdrefagg:1;
  u8 sdhuff:1;
#endif

  // template pixel offsets, as (x, y) pairs a1..a4
  signed char a1x, a1y, a2x, a2y, a3x, a3y, a4x, a4y;

  // refinement AT flags omitted

  u32 exsyms;   // count of exported symbols
  u32 newsyms;  // count of new symbols
} PACKED;
141 | 
142 | struct jbig2_text_region {
143 |   u32 width;
144 |   u32 height;
145 |   u32 x;
146 |   u32 y;
147 |   u8 comb_operator;
148 | 
149 | #ifndef __BIG_ENDIAN__
150 |   u8 sbcombop2:1;
151 |   u8 sbdefpixel:1;
152 |   u8 sbdsoffset:5;
153 |   u8 sbrtemplate:1;
154 |   u8 sbhuff:1;
155 |   u8 sbrefine:1;
156 |   u8 logsbstrips:2;
157 |   u8 refcorner:2;
158 |   u8 transposed:1;
159 |   u8 sbcombop1:1;
160 | #else
161 |   u8 sbcombop1:1;
162 |   u8 transposed:1;
163 |   u8 refcorner:2;
164 |   u8 logsbstrips:2;
165 |   u8 sbrefine:1;
166 |   u8 sbhuff:1;
167 |   u8 sbrtemplate:1;
168 |   u8 sbdsoffset:5;
169 |   u8 sbdefpixel:1;
170 |   u8 sbcombop2:1;
171 | #endif
172 | 
173 |   // huffman flags omitted
174 | } PACKED;
175 | 
176 | 
// Four signed one-byte template pixel offsets, as (x, y) pairs a1 and a2, kept
// as a separate packed record from jbig2_text_region.
struct jbig2_text_region_atflags {
  signed char a1x, a1y, a2x, a2y;
} PACKED;
180 | 
// Packed 32-bit count of symbol instances in a text region.
// NOTE(review): presumably stored in network byte order like the other u32
// fields - confirm in the writer.
struct jbig2_text_region_syminsts {
  u32 sbnuminstances;
  // huffman decoding table omitted
} PACKED;
185 | 
186 | #if defined(WIN32)
187 | #pragma pack()
188 | #endif
189 | 
190 | #endif  // JBIG2ENC_JBIG2STRUCTS_H__
191 | 


--------------------------------------------------------------------------------
/src/jbig2sym.cc:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Google Inc. All Rights Reserved.
  2 | // Author: agl@imperialviolet.org (Adam Langley)
  3 | //
  4 | // Copyright (C) 2006 Google Inc.
  5 | //
  6 | // Licensed under the Apache License, Version 2.0 (the "License");
  7 | // you may not use this file except in compliance with the License.
  8 | // You may obtain a copy of the License at
  9 | //
 10 | //      http://www.apache.org/licenses/LICENSE-2.0
 11 | //
 12 | // Unless required by applicable law or agreed to in writing, software
 13 | // distributed under the License is distributed on an "AS IS" BASIS,
 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | // See the License for the specific language governing permissions and
 16 | // limitations under the License.
 17 | 
 18 | #include <map>
 19 | #include <algorithm>
 20 | 
 21 | #include "jbig2arith.h"
 22 | 
 23 | #ifdef _MSC_VER
 24 | #define restrict __restrict
 25 | #else
 26 | #define restrict __restrict__
 27 | #endif
 28 | 
 29 | #include <stdio.h>
 30 | 
 31 | #include <leptonica/allheaders.h>
 32 | #if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || LIBLEPT_MAJOR_VERSION > 1
 33 | #include "leptonica/pix_internal.h"
 34 | #include "leptonica/array_internal.h"
 35 | #endif
 36 | 
 37 | #include <math.h>
 38 | 
 39 | #define S(i) symbols->pix[i]
 40 | 
 41 | 
 42 | // -----------------------------------------------------------------------------
 43 | // iota isn't part of the STL standard, and it can be a pain to include even on
 44 | // gcc based systems. Thus we define it here and save the issues
 45 | // -----------------------------------------------------------------------------
 46 | template <class _ForwardIterator, class _Tp>
 47 | void
 48 | myiota(_ForwardIterator __first, _ForwardIterator __last, _Tp __val) {
 49 |   while (__first != __last) *__first++ = __val++;
 50 | }
 51 | 
 52 | // -----------------------------------------------------------------------------
 53 | // Sorts a vector of indexes into the symbols PIXA by height. This is needed
 54 | // because symbols are placed into the JBIG2 table in height order
 55 | // -----------------------------------------------------------------------------
 56 | class HeightSorter {  // concept: stl/StrictWeakOrdering
 57 |  public:
 58 |   HeightSorter(const PIXA *isymbols)
 59 |       : symbols(isymbols) {}
 60 | 
 61 |   bool operator() (int x, int y) {
 62 |     return S(x)->h < S(y)->h;
 63 |   }
 64 | 
 65 |  private:
 66 |   const PIXA *const symbols;
 67 | };
 68 | 
 69 | // -----------------------------------------------------------------------------
 70 | // Sorts a vector of indexes into the symbols PIXA by width. This is needed
 71 | // because symbols are placed into the JBIG2 table in width order (for a given
 72 | // height class)
 73 | // -----------------------------------------------------------------------------
 74 | class WidthSorter {  // concept: stl/StrictWeakOrdering
 75 |  public:
 76 |   WidthSorter(const PIXA *isymbols)
 77 |       : symbols(isymbols) {}
 78 | 
 79 |   bool operator() (int x, int y) {
 80 |     return S(x)->w < S(y)->w;
 81 |   }
 82 | 
 83 |  private:
 84 |   const PIXA *const symbols;
 85 | };
 86 | 
 87 | static const int kBorderSize = 6;
 88 | 
 89 | // see comment in .h file
 90 | void
 91 | jbig2enc_symboltable(struct jbig2enc_ctx *restrict ctx,
 92 |                      PIXA *restrict const symbols,
 93 |                      std::vector<unsigned> *__restrict__ symbol_list,
 94 |                      std::map<int, int> *symmap, const bool unborder_symbols) {
 95 |   const unsigned n = symbol_list->size();
 96 |   int number = 0;
 97 | 
 98 | #ifdef JBIG2_DEBUGGING
 99 |   fprintf(stderr, "  symbols: %d\n", n);
100 | #endif
101 | 
102 |   // this is a vector of indexes into symbols
103 |   std::vector<unsigned> syms(*symbol_list);
104 |   // now sort that vector by height
105 |   std::sort(syms.begin(), syms.end(), HeightSorter(symbols));
106 | 
107 |   // this is used for each height class to sort into increasing width
108 |   WidthSorter sorter(symbols);
109 | 
110 |   // this stores the indexes of the symbols for a given height class
111 |   std::vector<int> hc;
112 |   // this keeps the value of the height of the current class
113 |   unsigned hcheight = 0;
114 |   for (unsigned i = 0; i < n;) {
115 |     // height is the height of this class of symbols
116 |     const unsigned height = S(syms[i])->h - (unborder_symbols ? 2*kBorderSize : 0);
117 | #ifdef JBIG2_DEBUGGING
118 |     fprintf(stderr, "height is %d\n", height);
119 | #endif
120 |     unsigned j;
121 |     hc.clear();
122 |     hc.push_back(syms[i]);  // this is the first member of the new class
123 |     // walk the vector until we find a symbol with a different height
124 |     for (j = i + 1; j < n; ++j) {
125 |       if (S(syms[j])->h - (unborder_symbols ? 2*kBorderSize : 0) != height) break;
126 |       hc.push_back(syms[j]);  // add each symbol of the same height to the class
127 |     }
128 | #ifdef JBIG2_DEBUGGING
129 |     fprintf(stderr, "  hc (height: %d, members: %d)\n", height, hc.size());
130 | #endif
131 |     // all the symbols from i to j-1 are a height class
132 |     // now sort them into increasing width
133 |     sort(hc.begin(), hc.end(), sorter);
134 |     // encode the delta height
135 |     const int deltaheight = height - hcheight;
136 |     jbig2enc_int(ctx, JBIG2_IADH, deltaheight);
137 |     hcheight = height;
138 |     int symwidth = 0;
139 |     // encode each symbol
140 |     for (std::vector<int>::const_iterator k = hc.begin(); k != hc.end(); ++k) {
141 |       const int sym = *k;
142 |       const int thissymwidth = S(sym)->w - (unborder_symbols ? 2*kBorderSize : 0);
143 |       const int deltawidth = thissymwidth - symwidth;
144 | #ifdef JBIG2_DEBUGGING
145 |       fprintf(stderr, "    h: %d\n", S(sym)->w);
146 | #endif
147 |       symwidth += deltawidth;
148 |       //fprintf(stderr, "width is %d\n", S(sym)->w);
149 |       jbig2enc_int(ctx, JBIG2_IADW, deltawidth);
150 | 
151 |       PIX *unbordered;
152 |       if (unborder_symbols) {
153 |         // the exemplars are stored with a border
154 |         unbordered = pixRemoveBorder(S(sym), kBorderSize);
155 |         // encoding the bitmap requires that the pad bits be zero
156 |       } else {
157 |         unbordered = pixClone(S(sym));
158 |       }
159 |       pixSetPadBits(unbordered, 0);
160 |       jbig2enc_bitimage(ctx, (uint8_t *) unbordered->data, thissymwidth, height,
161 |                         false);
162 |       // add this symbol to the map
163 |       (*symmap)[sym] = number++;
164 |       pixDestroy(&unbordered);
165 |     }
166 |     // OOB marks the end of the height class
167 |     //fprintf(stderr, "OOB\n");
168 |     jbig2enc_oob(ctx, JBIG2_IADW);
169 |     i = j;
170 |   }
171 | 
172 |   // now we have the list of exported symbols (which is all of them)
173 |   // it's run length encoded and we have a run length of 0 (for all the symbols
174 |   // which aren't set) followed by a run length of the number of symbols
175 | 
176 |   jbig2enc_int(ctx, JBIG2_IAEX, 0);
177 |   jbig2enc_int(ctx, JBIG2_IAEX, n);
178 | 
179 |   jbig2enc_final(ctx);
180 | }
181 | 
182 | // sort by the bottom-left corner of the box
183 | class YSorter {  // concept: stl/StrictWeakOrdering
184 |  public:
185 |   YSorter(const PTA *ill)
186 |     : ll(ill) {}
187 | 
188 |   bool operator() (int x, int y) {
189 |     return ll->y[x] < ll->y[y];
190 |   }
191 | 
192 |  private:
193 |   const PTA *const ll;
194 | };
195 | 
196 | // sort by the bottom-left corner of the box
197 | class XSorter {  // concept: stl/StrictWeakOrdering
198 |  public:
199 |   XSorter(const PTA *ill)
200 |     : ll(ill) {}
201 | 
202 |   bool operator() (int x, int y) {
203 |     return ll->x[x] < ll->x[y];
204 |   }
205 | 
206 |  private:
207 |   const PTA *const ll;
208 | };
209 | 
210 | #if (__GNUC__ <= 2) || defined(sun)
211 | #define lrint(x) static_cast<int>(x)
212 | #endif
213 | 
214 | #define BY(x) (lrint(ll->y[x]))
215 | 
216 | // see comment in .h file
217 | void
218 | jbig2enc_textregion(struct jbig2enc_ctx *restrict ctx,
219 |                     /*const*/ std::map<int, int> &symmap,
220 |                     /*const*/ std::map<int, int> &symmap2,
221 |                     const std::vector<int> &comps,
222 |                     PTA *const in_ll,
223 |                     PIXA *const symbols,
224 |                     NUMA *assignments, int stripwidth, int symbits,
225 |                     PIXA *const source, BOXA *boxes, int baseindex,
226 |                     int refine_level, bool unborder_symbols) {
227 |   // these are the only valid values for stripwidth
228 |   if (stripwidth != 1 && stripwidth != 2 && stripwidth != 4 &&
229 |       stripwidth != 8) {
230 |     abort();
231 |   }
232 | 
233 |   PTA *ll;
234 | 
235 |   // In the case of refinement, we have to put the symbols where the original
236 |   // boxes were. So we make up an array of lower-left (ll) points from the
237 |   // boxes. Otherwise we take the points from the in_ll array we were given.
238 |   // However, the in_ll array is absolutely indexed and the boxes array is
239 |   // relative to this page so watch out below.
240 |   if (source) {
241 |     ll = ptaCreate(0);
242 |     for (int i = 0; i < boxes->n; ++i) {
243 |       ptaAddPt(ll, boxes->box[i]->x,
244 |                boxes->box[i]->y + boxes->box[i]->h - 1);
245 |     }
246 |   } else {
247 |     // if we aren't doing refinement - we just put the symbols where they
248 |     // matched best
249 |     ll = in_ll;
250 |   }
251 | 
252 |   const int n = comps.size();
253 | 
254 |   // sort each box by distance from the top of the page
255 |   // syms (a copy of comps) is a list of indexes into symmap and ll
256 |   // elements which are indexes into symmap and ll are labeled I
257 |   // indexes into the syms array are labeled II
258 |   std::vector<int> syms(n);
259 |   if (source) {
260 |     // refining: fill syms with the numbers 0..n because ll is relative to this
261 |     // page in this case
262 |     myiota(syms.begin(), syms.end(), 0);
263 |   } else {
264 |     // fill syms with the component numbers from the comps array because ll is
265 |     // absolutely indexed in this case (absolute: over the whole multi-page
266 |     // document)
267 |     syms = comps;
268 |   }
269 |   // sort into height order
270 |   sort(syms.begin(), syms.end(), YSorter(ll));
271 | 
272 |   XSorter sorter(ll);
273 | 
274 |   int stript = 0;
275 |   int firsts = 0;
276 |   int wibble = 0;
277 |   // this is the initial stript value. I don't see why encoding this as zero,
278 |   // then encoding the first stript value as the real start is any worst than
279 |   // encoding this value correctly and then having a 0 value for the first
280 |   // deltat
281 |   jbig2enc_int(ctx, JBIG2_IADT, 0);
282 | 
283 |   // for each symbol we group it into a strip, which is stripwidth px high
284 |   // for each strip we sort into left-right order
285 |   std::vector<int> strip; // elements of strip: I
286 |   for (int i = 0; i < n;) {   // i: II
287 |     const int height = (BY(syms[i]) / stripwidth) * stripwidth;
288 |     int j;
289 |     strip.clear();
290 |     strip.push_back(syms[i]);
291 | 
292 |     // now walk until we hit the first symbol which isn't in this strip
293 |     for (j = i + 1; j < n; ++j) {  // j: II
294 |       if (BY(syms[j]) < height) abort();
295 |       if (BY(syms[j]) >= height + stripwidth) {
296 |         // outside strip
297 |         break;
298 |       }
299 |       strip.push_back(syms[j]);
300 |     }
301 | 
302 |     // now sort the strip into left-right order
303 |     sort(strip.begin(), strip.end(), sorter);
304 |     const int deltat = height - stript;
305 | #ifdef SYM_DEBUGGING
306 |     fprintf(stderr, "deltat is %d\n", deltat);
307 | #endif
308 |     jbig2enc_int(ctx, JBIG2_IADT, deltat / stripwidth);
309 |     stript = height;
310 | #ifdef SYM_DEBUGGING
311 |     fprintf(stderr, "t now: %d\n", stript);
312 | #endif
313 | 
314 |     bool firstsymbol = true;
315 |     int curs = 0;
316 |     // k: iterator(I)
317 |     for (std::vector<int>::const_iterator k = strip.begin(); k != strip.end(); ++k) {
318 |       const int sym = *k;  // sym: I
319 |       if (firstsymbol) {
320 |         firstsymbol = false;
321 |         const int deltafs = lrint(ll->x[sym]) - firsts;
322 |         jbig2enc_int(ctx, JBIG2_IAFS, deltafs);
323 |         firsts += deltafs;
324 |         curs = firsts;
325 |       } else {
326 |         const int deltas = lrint(ll->x[sym]) - curs;
327 |         jbig2enc_int(ctx, JBIG2_IADS, deltas);
328 |         curs += deltas;
329 |       }
330 | 
331 |       // if stripwidth is 1, all the t values must be the same so they aren't
332 |       // even encoded
333 |       if (stripwidth > 1) {
334 |         const int deltat = BY(sym) - stript;
335 |         jbig2enc_int(ctx, JBIG2_IAIT, deltat);
336 |       }
337 | 
338 |       // The assignments array is absolutely indexed, but in the case that we
339 |       // are doing refinement (source != NULL) then the symbol number is
340 |       // relative to this page, so we have to add the baseindex to get an
341 |       // absolute index.
342 |       const int assigned = (int)assignments->array
343 |         [sym + (source ? baseindex : 0)];
344 | 
345 |       // the symmap maps the number of the symbol from the classifier to the
346 |       // order in which it was written in the symbol dict
347 | 
348 |       // We have two symbol dictionaries. A global one and a per-page one.
349 |       int symid;
350 |       std::map<int, int>::const_iterator symit = symmap.find(assigned);
351 |       if (symit != symmap.end()) {
352 |         symid = symit->second;
353 |       } else {
354 |         symit = symmap2.find(assigned);
355 |         if (symit != symmap2.end()) {
356 |           symid = symit->second + symmap.size();
357 |         } else {
358 |           for (symit = symmap.begin(); symit != symmap.end(); ++symit) {
359 |             fprintf(stderr, "%d ", symit->first);
360 |           }
361 |           for (symit = symmap2.begin(); symit != symmap2.end(); ++symit) {
362 |             fprintf(stderr, "%d ", symit->first);
363 |           }
364 |           fprintf(stderr, "\n%d\n", assigned);
365 |           abort();
366 |         }
367 |       }
368 | #ifdef SYM_DEBUGGING
369 |       fprintf(stderr, "sym: %d\n", symid);
370 | #endif
371 |       jbig2enc_iaid(ctx, symbits, symid);
372 | 
373 |       // refinement is enabled if the original source components are given
374 |       if (source) {
375 |         // the boxes array is indexed by the number of the symbol on this page,
376 |         // which is what sym already is here. in_ll is indexed absolutely, so
377 |         // we add the number of the first symbol to get that absolute number.
378 |         const int abssym = baseindex + sym;
379 | 
380 |         PIX *symbol;
381 |         if (unborder_symbols) {
382 |           // the symbol has a 6 px border around it, which we need to remove
383 |           symbol = pixRemoveBorder(S(assigned), kBorderSize);
384 |         } else {
385 |           symbol = pixClone(S(assigned));
386 |         }
387 |         pixSetPadBits(symbol, 0);
388 | 
389 |         const int targetw = boxes->box[sym]->w;
390 |         const int targeth = boxes->box[sym]->h;
391 |         const int targetx = boxes->box[sym]->x;
392 |         const int targety = boxes->box[sym]->y;
393 | 
394 |         const int symboly = (int) (in_ll->y[abssym] - symbol->h) + 1;
395 |         const int symbolx = (int) in_ll->x[abssym];
396 | 
397 |         const int deltaw = targetw - symbol->w;
398 |         const int deltah = targeth - symbol->h;
399 |         const int deltax = targetx - symbolx;
400 |         const int deltay = targety - symboly;
401 | 
402 |         pixSetPadBits(source->pix[sym], 0);
403 |         // now see how well the symbol matches
404 |         PIX *targetcopy = pixCopy(NULL, source->pix[sym]);
405 |         pixRasterop(targetcopy, deltax, deltay, symbol->w, symbol->h,
406 |                     PIX_SRC ^ PIX_DST,
407 |                     symbol, 0, 0);
408 |         int deltacount;
409 |         pixCountPixels(targetcopy, &deltacount, NULL);
410 | #ifdef SYMBOL_COMPRESSION_DEBUGGING
411 |         fprintf(stderr, "delta count: %d\n", deltacount);
412 | #endif
413 |         pixDestroy(&targetcopy);
414 | 
415 | #ifdef SYMBOL_COMPRESSION_DEBUGGING
416 |           fprintf(stderr, "refinement: dw:%d dh:%d dx:%d dy:%d w:%d h:%d\n",
417 |                   deltaw, deltah, deltax, deltay, targetw, targeth);
418 |           fprintf(stderr, "  box: %d %d symbol: %d %d h:%d ll:%f %f\n",
419 |                   targetx, targety, symbolx, symboly, symbol->h,
420 |                   in_ll->x[abssym], in_ll->y[abssym]);
421 | #endif
422 | 
423 |         // Note that the refinement encoding function can only cope with x
424 |         // offsets in [-1, 0, 1] so refinement is disabled if the offset is
425 |         // outside this range. This should be *very* rare.
426 |         if (deltacount <= refine_level || deltax < -1 || deltax > 1) {
427 |         //if (deltaw > 1 || deltaw < -1 || deltax || deltah || deltay) {
428 |           // refinement disabled.
429 |           jbig2enc_int(ctx, JBIG2_IARI, 0);
430 |           // update curs given the width of the bitmap
431 |           curs += (S(assigned)->w - (unborder_symbols ? 2*kBorderSize : 0)) - 1;
432 |         } else {
433 |           wibble++;
434 |           jbig2enc_int(ctx, JBIG2_IARI, 1);
435 | 
436 |           jbig2enc_int(ctx, JBIG2_IARDW, deltaw);
437 |           jbig2enc_int(ctx, JBIG2_IARDH, deltah);
438 |           jbig2enc_int(ctx, JBIG2_IARDX, deltax - (deltaw >> 1));
439 |           jbig2enc_int(ctx, JBIG2_IARDY, deltay - (deltah >> 1));
440 | 
441 |           jbig2enc_refine
442 |             (ctx, (uint8_t *) symbol->data, symbol->w, symbol->h,
443 |              (uint8_t *) source->pix[sym]->data, targetw, targeth,
444 |              deltax, -deltay);
445 | 
446 |           pixDestroy(&symbol);
447 |           curs += targetw - 1;
448 |         }
449 |       } else {
450 |         // update curs given the width of the bitmap
451 |         curs += (S(assigned)->w - (unborder_symbols ? 2*kBorderSize : 0)) - 1;
452 |       }
453 |     }
454 |     // terminate the strip
455 |     jbig2enc_oob(ctx, JBIG2_IADS);
456 |     i = j;
457 |   }
458 | 
459 |   jbig2enc_final(ctx);
460 |   if (ll != in_ll) ptaDestroy(&ll);
461 | }
462 | 


--------------------------------------------------------------------------------
/src/jbig2sym.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2006 Google Inc. All Rights Reserved.
 2 | // Author: agl@imperialviolet.org (Adam Langley)
 3 | //
 4 | // Copyright (C) 2006 Google Inc.
 5 | //
 6 | // Licensed under the Apache License, Version 2.0 (the "License");
 7 | // you may not use this file except in compliance with the License.
 8 | // You may obtain a copy of the License at
 9 | //
10 | //      http://www.apache.org/licenses/LICENSE-2.0
11 | //
12 | // Unless required by applicable law or agreed to in writing, software
13 | // distributed under the License is distributed on an "AS IS" BASIS,
14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | // See the License for the specific language governing permissions and
16 | // limitations under the License.
17 | 
18 | #ifndef JBIG2ENC_JBIG2SYM_H__
19 | #define JBIG2ENC_JBIG2SYM_H__
20 | 
21 | struct jbig2enc_ctx;
22 | 
23 | // -----------------------------------------------------------------------------
24 | // Write a symbol table.
25 | //
26 | // symbols: A 2d array. The first dimension is of different classes of symbols.
27 | //          Then, for each class, there are all the examples of that class. The
28 | //          first member of the class is taken as the exemplar.
29 | // symbol_list: a list of symbols to encode
30 | // symmap: an empty map which is filled. The symbols are written to the file in
31 | //         a different order than they are given in symbols. This maps an index
32 | //         into the symbols array to a symbol number in the file.
33 | // unborder_symbols: if true, remove a border from every element of symbols
34 | // -----------------------------------------------------------------------------
35 | void jbig2enc_symboltable(struct jbig2enc_ctx *__restrict__ ctx,
36 |                           PIXA *__restrict__ const symbols,
37 |                           std::vector<unsigned> *__restrict__ symbol_list,
38 |                           std::map<int, int> *symmap,
39 |                           bool unborder_symbols);
40 | 
41 | // -----------------------------------------------------------------------------
42 | // Write a text region.
43 | //
44 | // A text region is a list of placements of symbols. The symbols must already
45 | // have been coded.
46 | //
47 | // symmap: This maps class numbers to symbol numbers. Only symbol numbers
48 | //         appear in the JBIG2 data stream
49 | // symmap2: If not found in the first symmap, try this one
50 | // comps: a list of connected-component numbers for this page
51 | // ll: This is an array of the lower-left corners of the boxes for each symbol
52 | // assignments: an array, of the same length as boxes, mapping each box to a
53 | //              symbol
54 | // stripwidth: 1 is a safe default (one of [1, 2, 4, 8])
55 | // symbits: number of bits needed to code the symbol number (log2(number of
56 | //          symbols) - rounded up)
57 | // source: an array of the original images for all the connected components.
58 | //         If NULL, refinement is disabled. (page indexed)
59 | // boxes: if source is non-NULL, this is a page-based list of boxes of symbols
60 | //        on the page
61 | // baseindex: if source is non-NULL, this is the component number of the first
62 | //            component on this page
63 | // refine_level: the number of incorrect pixels allowed before refining.
64 | // unborder_symbols: if true, symbols have a 6px border around them
65 | // -----------------------------------------------------------------------------
66 | void jbig2enc_textregion(struct jbig2enc_ctx *__restrict__ ctx,
67 |                          /*const*/ std::map<int, int> &symmap,
68 |                          /*const*/ std::map<int, int> &symmap2,
69 |                          const std::vector<int> &comps,
70 |                          PTA *const ll, PIXA *const symbols,
71 |                          NUMA *assignments,
72 |                          int stripwidth, int symbits,
73 |                          PIXA *const source, BOXA *boxes, int baseindex,
74 |                          int refine_level, bool unborder_symbols);
75 | 
76 | #endif  // JBIG2ENC_JBIG2SYM_H__
77 | 


--------------------------------------------------------------------------------