├── caj2pdf
    ├── __init__.py
    ├── dep
    │   ├── __init__.py
    │   ├── decode_jbig2data.cc
    │   ├── JBigDecode.h
    │   ├── decode_jbig2data_x.cc
    │   ├── jbig2dec.py
    │   ├── jbigdec.py
    │   ├── jbigdec.cc
    │   └── JBigDecode.cc
    ├── version.py
    ├── exceptions.py
    ├── exe_convert.py
    ├── install.py
    ├── cli.py
    ├── HNParsePage.py
    ├── utils.py
    └── cajparser.py
├── screenshot1.png
├── .gitattributes
├── tests
    ├── c8_src.caj
    ├── caj_src.caj
    ├── hn_src.caj
    └── kdh_src.caj
├── dlls
    ├── libjbigdec-w32.dll
    ├── libjbigdec-w64.dll
    ├── libjbig2codec-w32.dll
    └── libjbig2codec-w64.dll
├── .flake8
├── .github
    ├── ISSUE_TEMPLATE
    │   └── bug_cn.md
    └── workflows
    │   ├── build.yaml
    │   └── test.yaml
├── LICENSE
├── pyproject.toml
├── .gitignore
├── README.md
└── pdm.lock


/caj2pdf/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/caj2pdf/dep/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/caj2pdf/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0a7"
2 | 


--------------------------------------------------------------------------------
/caj2pdf/exceptions.py:
--------------------------------------------------------------------------------
class Caj2PdfException(Exception):
    """Root exception type of the caj2pdf package."""
4 | 


--------------------------------------------------------------------------------
/screenshot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zombie110year/caj2pdf-restructured/HEAD/screenshot1.png


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.caj filter=lfs diff=lfs merge=lfs -text
2 | *.dll filter=lfs diff=lfs merge=lfs -text
3 | 


--------------------------------------------------------------------------------
/tests/c8_src.caj:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7e3f2c0faebb8ddac2f4feff3cfc9249476ab170b43250ac00610b10aa46c74b
3 | size 8786313
4 | 


--------------------------------------------------------------------------------
/tests/caj_src.caj:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:50a63a5e96858d17258149d260a335a7b216dbbc38f86f9ab6d5ef0c9b4dbab3
3 | size 2983075
4 | 


--------------------------------------------------------------------------------
/tests/hn_src.caj:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:166d0014792326570d9e8ca42fe13a4a44329e4ad095d3ac8dfd73c8ddb9f20e
3 | size 163779
4 | 


--------------------------------------------------------------------------------
/tests/kdh_src.caj:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ef0d77b2cdb2b9eeea105312bef7eef7727e7d8a055e7828d51fbd9a571cb17a
3 | size 192452
4 | 


--------------------------------------------------------------------------------
/dlls/libjbigdec-w32.dll:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0acbe501c2d6711fc11fdfec3ccda45b73cc44da978915e833ab16d750e4f108
3 | size 61142
4 | 


--------------------------------------------------------------------------------
/dlls/libjbigdec-w64.dll:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8517169c45a8d3f7823dc47d2b98a102a61f0f463dc0df4eb48cb90b9541a5f2
3 | size 66401
4 | 


--------------------------------------------------------------------------------
/dlls/libjbig2codec-w32.dll:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7fe46b81dbd839afc9ad7b797a0378292635928149bb3de87c92f4e3b3811684
3 | size 180634
4 | 


--------------------------------------------------------------------------------
/dlls/libjbig2codec-w64.dll:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:afba105509cc454c976f9b7835fc6c70b3e4fd4d1c8a297d46fb69f0077b301b
3 | size 193739
4 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | max-line-length=120
 3 | max-doc-length=120
 4 | ignore=
 5 |     # : 前可以有空格，因为 [expr : expr]
 6 |     E203,
 7 |     # 双元运算符放在行首，为了让部分比较长的算术看起来像竖式
 8 |     W503,
 9 | exclude=caj2pdf/pdfwutils.py
10 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_cn.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: CAJ2PDF 程序问题
 3 | about: "提交一个 CAJ2PDF 的程序问题报告。"
 4 | ---
 5 | 
 6 | 除非特殊情况，请完整填写所有问题。不按模板发的 issue 将直接被关闭。
 7 | 如果你遇到的问题不是 bug，比如你不清楚要如何配置，请使用[Discussion](https://github.com/caj2pdf/discussion/issues)进行讨论。
 8 | 
 9 | 1) 你正在使用哪个版本的 CAJ2PDF？
10 | 
11 | 2) 你的使用场景是什么？比如convert还是其他。
12 | 
13 | 3) 你看到的不正常的现象是什么？（请描述具体现象，比如PDF打不开等）
14 | 
15 | 4) 你期待看到的正确表现是怎样的？
16 | 
17 | 5) 请附上你的配置（Python版本、Python包版本以及mutools版本）。
18 | 
 19 | 6) 请使用 **ZIP** 格式压缩CAJ文件，附在此处。
 20 | 
 21 | 7) 请附上出错时软件输出的错误信息。
22 | 
23 | 错误信息:
24 | 
25 | ```python
26 |     # 软件输出的错误信息
27 | ```
28 | 
29 | 请预览一下你填的内容再提交。
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |                GLWT(Good Luck With That) Public License
 2 |                  Copyright (c) Everyone, except Author
 3 | 
 4 | Everyone is permitted to copy, distribute, modify, merge, sell, publish,
 5 | sublicense or whatever they want with this software but at their OWN RISK.
 6 | 
 7 |                             Preamble
 8 | 
 9 | The author has absolutely no clue what the code in this project does.
10 | It might just work or not, there is no third option.
11 | 
12 | 
13 |                 GOOD LUCK WITH THAT PUBLIC LICENSE
14 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION, AND MODIFICATION
15 | 
16 |   0. You just DO WHATEVER YOU WANT TO as long as you NEVER LEAVE A
17 | TRACE TO TRACK THE AUTHOR of the original product to blame for or hold
18 | responsible.
19 | 
20 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 | DEALINGS IN THE SOFTWARE.
24 | 
25 | Good luck and Godspeed.


--------------------------------------------------------------------------------
/caj2pdf/dep/decode_jbig2data.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Copyright 2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
 3 |   See The FreeType Project LICENSE for license terms.
 4 | 
 5 |   This is a small wrapper around libpoppler to provide a python
 6 |   interface to decode JBIG2 stream.
 7 | 
 8 |   To build:
 9 | 
10 |       cc -Wall `pkg-config --cflags poppler` -fPIC -shared -o libjbig2codec.so decode_jbig2data.cc `pkg-config --libs poppler`
11 | */
12 | 
13 | #include <JBIG2Stream.h>
14 | 
// Defined further down; declared here so the C-linkage shim can forward to it.
int decode_jbig2data(char*, int, char*, int, int, int, int);

// Unmangled entry point: forwards every argument unchanged to
// decode_jbig2data() and returns its result.
extern "C" int decode_jbig2data_c(char* inbuf, int bufsize, char* outptr,
                                  int width, int height,
                                  int width_in_padded_4bytes,
                                  int width_in_padded_bytes)
{
  return decode_jbig2data(inbuf, bufsize, outptr,
                          width, height,
                          width_in_padded_4bytes, width_in_padded_bytes);
}
23 | 
24 | int decode_jbig2data(char* inbuf, int bufsize, char* outptr, int width, int height, int width_in_padded_4bytes, int width_in_padded_bytes)
25 | {
26 |   int v12;
27 | 
28 |   Object globals;
29 |   MemStream *v10 = new MemStream(inbuf, 0, bufsize, Object(objNull));
30 |   Stream *v11 = new JBIG2Stream(v10, Object(objNull), &globals);
31 |   v11->reset(); // required
32 |   if ( height > 0 )
33 |   {
34 |     v12 = 0;
35 |     char* v13 = outptr + (height - 1) * width_in_padded_4bytes;
36 |     do
37 |     {
38 |       ++v12;
39 |       for (int i = 0; i < width_in_padded_bytes; i++)
40 |         {
41 |           *(v13 + i) = 0xFF & (v11->getChar() ^ 0xFF);
42 |         }
43 |       v13 -= width_in_padded_4bytes;
44 |     }
45 |     while ( v12 != height );
46 |   }
47 |   return 0;
48 | }
49 | 


--------------------------------------------------------------------------------
/caj2pdf/exe_convert.py:
--------------------------------------------------------------------------------
 1 | """提供给上下文菜单调用
 2 | 
 3 | 目前仅支持 Windows
 4 | """
 5 | import pathlib
 6 | import sys
 7 | import threading
 8 | 
 9 | from .cajparser import CAJParser
10 | from .exceptions import Caj2PdfException
11 | 
12 | 
def main():
    """Entry point of the ``caj2pdf-ec`` console script.

    Runs :func:`app` and reports any failure on stderr instead of silently
    discarding it.

    Returns:
        ``0`` when the conversion finished, ``1`` when it failed.  Callers
        that ignore the return value behave exactly as before.
    """
    import traceback

    try:
        app()
    except Caj2PdfException as err:
        # Expected, user-facing problems (missing argument, missing file):
        # the message alone is enough.
        print(f"caj2pdf: {err}", file=sys.stderr)
        return 1
    except Exception:
        # Anything else is unexpected; keep the traceback so it can be reported.
        traceback.print_exc()
        return 1
    return 0
18 | 
19 | 
def app():
    """Convert the CAJ file named by ``sys.argv[1]`` into ``<input>.pdf``.

    The conversion runs in a worker thread while an :class:`AliveStatus`
    thread animates a status line on stderr.

    Raises:
        Caj2PdfException: if no path was passed on the command line or the
            path does not exist.
    """
    if len(sys.argv) < 2:
        raise Caj2PdfException(f"找不到caj文件，输入参数为：{sys.argv!r}")

    cajfilepath = pathlib.Path(sys.argv[1])
    if not cajfilepath.exists():
        raise Caj2PdfException(f"caj文件不存在：{cajfilepath.as_posix()}")

    inputfile = str(cajfilepath)
    outputfile = f"{inputfile}.pdf"
    task = threading.Thread(
        group=None, target=convert_caj, args=(inputfile, outputfile)
    )
    alive = AliveStatus()
    task.start()
    alive.start()
    try:
        task.join()
    finally:
        # Stop the status thread even when join() is interrupted (e.g. Ctrl-C);
        # it is a non-daemon thread and would otherwise keep the process alive.
        alive.finish()
        alive.join()
41 | 
42 | 
def convert_caj(inputfile, outputfile):
    """Build a ``CAJParser`` for *inputfile* and call its ``convert(outputfile)``."""
    CAJParser(inputfile).convert(outputfile)
46 | 
47 | 
class AliveStatus(threading.Thread):
    """Thread that animates a two-frame status line on stderr.

    ``run()`` alternates between two messages, rewriting the same terminal
    line (``end="\\r"``), until :meth:`finish` is called.
    """

    def __init__(self):
        super().__init__(group=None)
        # Public flag polled by run(); setting it directly still stops the
        # loop, as before.
        self.finished = False
        # Lets finish() cut the pause between frames short instead of waiting
        # for a fixed sleep to elapse.
        self._finish_event = threading.Event()

    def run(self):
        """Print alternating frames every 0.2 s until finished."""
        frames = ("正在转换 <<<<<<", "正在转换 >>>>>>")
        index = 0
        while not self.finished:
            print(frames[index], end="\r", file=sys.stderr)
            index ^= 1
            self._finish_event.wait(0.2)

    def finish(self):
        """Ask the animation loop to stop and wake it up immediately."""
        self.finished = True
        self._finish_event.set()
65 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "caj2pdf-restructured"
 3 | dynamic = ["version", "entry-points"]
 4 | description = "caj2pdf 项目的重新组织，方便打包与安装"
 5 | authors = [
 6 |     { name = "Hin-Tak Leung", email = "htl10@users.sourceforge.net" },
 7 |     { name = "JeziL", email = "wangjinlithu@gmail.com" },
 8 | ]
 9 | maintainers = [{ name = "zombie110year", email = "zombie110year@outlook.com" }]
10 | dependencies = ["imagesize==1.3.0", "PyPDF2==2.2.0"]
11 | requires-python = ">=3.10"
12 | readme.content-type = "text/markdown"
13 | readme.file = "README.md"
14 | license = { text = "GLWTPL" }
15 | 
16 | classifiers = [
17 |     "Environment :: Console",
18 |     "Operating System :: Microsoft :: Windows",
19 |     "Operating System :: POSIX :: Linux",
20 | ]
21 | keywords = ["cnki", "caj", "pdf"]
22 | 
23 | [project.urls]
24 | repository = "https://github.com/zombie110year/caj2pdf-restructured/"
25 | 
26 | [project.scripts]
27 | caj2pdf = "caj2pdf.cli:main"
28 | caj2pdf-ec = "caj2pdf.exe_convert:main"
29 | 
30 | 
31 | [project.optional-dependencies]
32 | [build-system]
33 | requires = ["pdm-pep517>=1.0.0"]
34 | build-backend = "pdm.pep517.api"
35 | 
36 | [tool.pdm]
37 | version = { source = "file", path = "caj2pdf/version.py" }
38 | 
39 | [tool.pdm.build]
40 | excludes = ["tests/**"]
41 | setup-script = "build.py"
42 | run-setuptools = true
43 | 
44 | [[tool.pdm.source]]
45 | name = "pypi"
46 | url = "https://mirrors.aliyun.com/pypi/simple"
47 | verify_ssl = true
48 | 
49 | [tool.pdm.dev-dependencies]
50 | dev = [
51 |     "black>=22.10.0",
52 |     "isort>=5.10.1",
53 |     "flake8>=5.0.4",
54 |     "setuptools>=65.6.0",
55 | ]
56 | 
57 | [tool.setuptools.package-data]
58 | caj2pdf = ["*.dll", "*.so"]
59 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
 1 | name: Upload caj2pdf to GitHub Actions Artifacts
 2 | on:
 3 |   workflow_dispatch:
 4 |   push:
 5 |     branches:
 6 |       - 'releases/**'
 7 | 
 8 | jobs:
 9 |   windows_pub:
10 |     name: Publish from Windows Server 2019
11 |     runs-on: windows-2019
12 |     steps:
13 |     - name: Fetch Source Code
14 |       uses: actions/checkout@v2
15 |       with:
16 |         lfs: true
17 |         fetch-depth: 1
18 |     - name: Setup Python 3
19 |       uses: actions/setup-python@v4
20 |       with:
21 |         python-version: "3.10"
22 |     - name: Install Python Dependencies
23 |       run: |
24 |         python -m pip install -U pip && pip install pdm
25 |         pdm install
26 |     - name: Build Package
27 |       run: pdm build --no-sdist
28 |     - name: Upload to github action artifact
29 |       uses: actions/upload-artifact@v3
30 |       with:
31 |         path: dist/*.whl
32 | 
33 |   linux_pub:
34 |     name: Publish from Ubuntu, no JBIG2DEC
35 |     runs-on: ubuntu-latest
36 |     steps:
37 |     - name: Fetch Source Code
38 |       uses: actions/checkout@v2
39 |       with:
40 |         lfs: true
41 |         fetch-depth: 1
42 |     - name: Setup Python 3
43 |       uses: actions/setup-python@v4
44 |       with:
45 |         python-version: "3.10"
46 |     - name: Install Python Dependencies
47 |       run: |
48 |         python -m pip install -U pip && pip install pdm
49 |         pdm install -d
50 |     - name: Build Package
51 |       run: pdm build
52 |     - name: Get Glibc version
53 |       run: ldd --version > dist/ldd.version.txt
54 |     - name: Upload to github action artifact
55 |       uses: actions/upload-artifact@v3
56 |       with:
57 |         path: |
58 |           dist/*.whl
59 |           dist/*.tar.gz
60 |           dist/ldd.version.txt
61 | 


--------------------------------------------------------------------------------
/caj2pdf/dep/JBigDecode.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Copyright 2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
 3 |   See The FreeType Project LICENSE for license terms.
 4 | 
 5 |   Decode-only part of JBigCodec. Drop-in compatible with LibReaderEx's.
 6 | 
 7 |   Note: MPS/ST are very wasteful, as only 1-bit is used, and
 8 |         array of length 0x20 (5 contexts) instead of 0x4000 (14 contexts).
 9 | 
10 |         GetBit() has /3 instead of >> 3, GetCX() only 5 contexts instead of 10/14.
11 |         SLNTP / LNTP is neither the three-line template nor the two-line template
12 |         form (and GetBit() is strange anyway).
13 | 
 14 |         LpsExchange/MpsExchange/RenormDe/ByteIn/InitDecode are essentially
 15 |         identical to their T-82 counterparts, as are Decode1() and Decode().
16 | 
17 | 
18 | */
/*
  Decoder class declared by this header (decode-only, see the notes above).

  NOTE(review): the header describes this class as drop-in compatible with
  LibReaderEx's, so the order and sizes of the data members below presumably
  must stay as they are - confirm before reordering or resizing anything.
*/
class JBigCodec {
public:
  void  ByteIn();
  void* ClearLine(char*, unsigned int);
  void* CopyLine(char*, char*, unsigned int);
  int   Decode1(int);
  /* Two overloads of Decode(): one taking buffers and sizes, one taking a single int. */
  int   Decode(char*, unsigned int, unsigned int, unsigned int, unsigned int, char*);
  int   Decode(int);
  void* DupLine(char*, unsigned int, unsigned int, unsigned int);
  int   GetBit(int, int);
  int   GetCX(int, int);
  void  InitDecode(char*, unsigned int);
  int   LowestDecode();
  int   LowestDecodeLine(unsigned int, char*, char*, unsigned int, char*);
  void  LpsExchange(int, unsigned int, unsigned int);
  void* MakeTypicalLine(int);
  void  MpsExchange(int, unsigned int, unsigned int);
  void  RenormDe();
private:
  unsigned int A_interval;
  int CT;
  int SC; /* Only used by Encode */
  unsigned int inbuf_length;
  int read_count;
  unsigned char *inbuf;
  /* Per-context tables; the header notes above call them wasteful (1 bit used per entry). */
  unsigned int MPS[0x1000];
  unsigned int ST[0x1000];
  unsigned int C_register;
  int PIX;
  int BUFFER; /* Only used by Encode */
  int bitwidth;
  int height;
  int width_in_padded_bytes;
  char *outptr;
};
54 | 


--------------------------------------------------------------------------------
/caj2pdf/dep/decode_jbig2data_x.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Copyright 2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
 3 |   See The FreeType Project LICENSE for license terms.
 4 | 
 5 |   This is a small wrapper around libjbig2dec to provide a python
 6 |   interface to decode JBIG2 stream.
 7 | 
 8 |   To build:
 9 | 
10 |       cc -Wall `pkg-config --cflags jbig2dec` -fPIC -shared -o libjbig2codec.so decode_jbig2data_x.cc `pkg-config --libs jbig2dec`
11 | 
12 |   Or, if you have jbig2dec in source form, in its directory
 13 |   (see "jbig2dec/Makefile.am" - everything except "jbig2_image_pbm.c" and "memento.c"):
14 | 
15 |       cc -I . -Wall -fPIC -shared -o ${CAJ2PDF_SRC}/libjbig2codec.so ${CAJ2PDF_SRC}/decode_jbig2data_x.cc \
16 | 	jbig2.c \
17 | 	jbig2_arith.c jbig2_arith_int.c jbig2_arith_iaid.c jbig2_huffman.c jbig2_hufftab.c \
18 | 	jbig2_segment.c jbig2_page.c \
19 | 	jbig2_symbol_dict.c jbig2_text.c \
20 | 	jbig2_generic.c jbig2_refinement.c jbig2_mmr.c \
21 | 	jbig2_halftone.c \
22 | 	jbig2_image.c
23 | */
24 | 
25 | #include <cstdint>
26 | #include <cstring>
27 | #include <jbig2.h>
28 | 
// Defined further down; declared here so the C-linkage shim can forward to it.
int decode_jbig2data(char*, int, char*, int, int, int, int);

// Unmangled entry point: forwards every argument unchanged to
// decode_jbig2data() and returns its result.
extern "C" int decode_jbig2data_c(char* inbuf, int bufsize, char* outptr,
                                  int width, int height,
                                  int width_in_padded_4bytes,
                                  int width_in_padded_bytes)
{
  return decode_jbig2data(inbuf, bufsize, outptr,
                          width, height,
                          width_in_padded_4bytes, width_in_padded_bytes);
}
37 | 
38 | int decode_jbig2data(char* inbuf, int bufsize, char* outptr, int width, int height, int width_in_padded_4bytes, int width_in_padded_bytes)
39 | {
40 |   int v12;
41 | 
42 |   Jbig2Ctx *ctx = jbig2_ctx_new(NULL, JBIG2_OPTIONS_EMBEDDED, NULL, NULL, NULL);
43 |   jbig2_data_in(ctx, (const unsigned char*)inbuf, bufsize);
44 |   jbig2_complete_page(ctx); // Required, apparently this is "work around broken CVision embedded streams",
45 |                             // "simulating an end-of-page segment (for broken streams)"
46 |   Jbig2Image *image = jbig2_page_out(ctx);
47 |   if ( height > 0 )
48 |   {
49 |     v12 = 0;
50 |     char* v13 = outptr + (height - 1) * width_in_padded_4bytes;
51 |     unsigned char *data = image->data;
52 |     do
53 |     {
54 |       ++v12;
55 |       memcpy(v13, data, width_in_padded_bytes);
56 |       v13 -= width_in_padded_4bytes;
57 |       data += image->stride;
58 |     }
59 |     while ( v12 != height );
60 |   }
61 |   return 0;
62 | }
63 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .pdm.toml
  2 | /caj2pdf/dep/bin/*.dll
  3 | 
  4 | # Created by .ignore support plugin (hsz.mobi)
  5 | ### JetBrains template
  6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
  7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  8 | 
  9 | # User-specific stuff:
 10 | .idea/
 11 | .vscode/
 12 | *.caj
 13 | *.tmp
 14 | *.pdf
 15 | cajs/
 16 | mutool.exe
 17 | 
 18 | ## File-based project format:
 19 | *.iws
 20 | 
 21 | ## Plugin-specific files:
 22 | 
 23 | # IntelliJ
 24 | /out/
 25 | 
 26 | # mpeltonen/sbt-idea plugin
 27 | .idea_modules/
 28 | 
 29 | # JIRA plugin
 30 | atlassian-ide-plugin.xml
 31 | 
 32 | # Crashlytics plugin (for Android Studio and IntelliJ)
 33 | com_crashlytics_export_strings.xml
 34 | crashlytics.properties
 35 | crashlytics-build.properties
 36 | fabric.properties
 37 | ### Python template
 38 | # Byte-compiled / optimized / DLL files
 39 | __pycache__/
 40 | *.py[cod]
 41 | *$py.class
 42 | 
 43 | # C extensions
 44 | *.so
 45 | 
 46 | # Distribution / packaging
 47 | .Python
 48 | env/
 49 | build/
 50 | develop-eggs/
 51 | dist/
 52 | downloads/
 53 | eggs/
 54 | .eggs/
 55 | parts/
 56 | sdist/
 57 | var/
 58 | wheels/
 59 | *.egg-info/
 60 | .installed.cfg
 61 | *.egg
 62 | 
 63 | # PyInstaller
 64 | #  Usually these files are written by a python script from a template
 65 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 66 | *.manifest
 67 | *.spec
 68 | 
 69 | # Installer logs
 70 | pip-log.txt
 71 | pip-delete-this-directory.txt
 72 | 
 73 | # Unit test / coverage reports
 74 | htmlcov/
 75 | .tox/
 76 | .coverage
 77 | .coverage.*
 78 | .cache
 79 | nosetests.xml
 80 | coverage.xml
 81 | *,cover
 82 | .hypothesis/
 83 | 
 84 | # Translations
 85 | *.mo
 86 | *.pot
 87 | 
 88 | # Django stuff:
 89 | *.log
 90 | local_settings.py
 91 | 
 92 | # Flask stuff:
 93 | instance/
 94 | .webassets-cache
 95 | 
 96 | # Scrapy stuff:
 97 | .scrapy
 98 | 
 99 | # Sphinx documentation
100 | docs/_build/
101 | 
102 | # PyBuilder
103 | target/
104 | 
105 | # Jupyter Notebook
106 | .ipynb_checkpoints
107 | 
108 | # pyenv
109 | .python-version
110 | 
111 | # celery beat schedule file
112 | celerybeat-schedule
113 | 
114 | # SageMath parsed files
115 | *.sage.py
116 | 
117 | # dotenv
118 | .env
119 | 
120 | # virtualenv
121 | .venv
122 | venv/
123 | ENV/
124 | 
125 | # Spyder project settings
126 | .spyderproject
127 | 
128 | # Rope project settings
129 | .ropeproject
130 | 
131 | !/tests/caj_src.caj
132 | !/tests/c8_src.caj
133 | !/tests/kdh_src.caj
134 | !/tests/hn_src.caj
135 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
 1 | name: Test caj2pdf bin
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - dev
 6 | 
 7 | jobs:
 8 |   test_ubuntu:
 9 |     strategy:
10 |       matrix:
11 |         ubuntu-version: ["ubuntu-20.04"]
12 |         python-version: ["3.10"]
13 |         dep-lib: ["libpoppler-dev"]
14 |         # dep-lib: ["libpoppler-dev", "libjbig2dec0-dev"] # libjbig2dec in Ubuntu has lots of problems.
15 |         # src-caj: ["caj", "kdh"]
 16 |         src-caj: ["c8", "caj", "hn", "kdh"] # only the caj and kdh samples convert successfully
17 |     name: Test on ${{ matrix.ubuntu-version }}
18 |     runs-on: ${{ matrix.ubuntu-version }}
19 |     steps:
20 |     - name: Fetch Source Code
21 |       uses: actions/checkout@v2
22 |       with:
23 |         lfs: true
24 |         fetch-depth: 1
25 |     - name: Setup Python ${{ matrix.python-version }}
26 |       uses: actions/setup-python@v4
27 |       with:
28 |         python-version: ${{ matrix.python-version }}
29 |     - name: Install C++ Dependencies
30 |       run: sudo apt -y install build-essential ${{ matrix.dep-lib }} mupdf-tools pkg-config
31 |     - name: Install Python Project with "poppler"
32 |       if: ${{ matrix.dep-lib == 'libpoppler-dev' }}
33 |       run: |
34 |         python -m pip install -U pip && pip install pdm
35 |         pdm install
36 |     - name: Install Python Project with "jbig2dec"
37 |       if: ${{ matrix.dep-lib == 'libjbig2dec0-dev' }}
38 |       run: |
39 |         python -m pip install -U pip && pip install pdm
40 |         LIBJBIG2DEC=1 pdm install
41 |     - name: Convert Test Caj
42 |       run: pdm run caj2pdf convert tests/${{ matrix.src-caj }}_src.caj -o tests/${{ matrix.src-caj }}_dst.pdf
43 | 
44 |   test_windows:
45 |     name: Test on Windows Server 2019
46 |     runs-on: windows-2019
47 |     strategy:
48 |       matrix:
49 |         python-version: ["3.10"]
50 |         # src-caj: ["caj", "kdh"]
 51 |         src-caj: ["c8", "caj", "hn", "kdh"] # only the caj and kdh samples convert successfully
52 |     steps:
53 |     - name: Fetch Source Code
54 |       uses: actions/checkout@v2
55 |       with:
56 |         lfs: true
57 |         fetch-depth: 1
58 |     - name: Setup Python ${{ matrix.python-version }}
59 |       uses: actions/setup-python@v4
60 |       with:
61 |         python-version: ${{ matrix.python-version }}
62 |     - name: Install Python Dependencies
63 |       run: |
64 |         python -m pip install -U pip && pip install pdm
65 |         pdm install
66 |     - name: Install mutool.exe
67 |       run: |
68 |         curl.exe -L https://mupdf.com/downloads/archive/mupdf-1.18.0-windows.zip -o mupdf.zip
69 |         Expand-Archive -Force mupdf.zip .
70 |         Copy-Item mupdf-1.18.0-windows/mutool.exe C:\WINDOWS\system32\mutool.exe
71 |       shell: pwsh
72 |     - name: Convert Test Caj
73 |       run: pdm run caj2pdf convert tests\\${{ matrix.src-caj }}_src.caj -o tests\\${{ matrix.src-caj }}_dst.pdf
74 | 


--------------------------------------------------------------------------------
/caj2pdf/install.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pathlib
 3 | import sys
 4 | 
 5 | 
 6 | def install_context_windows(dry_run: bool):
 7 |     binary = sys.argv[0]
 8 |     binary_dir = pathlib.Path(binary).parent
 9 |     exe = binary_dir / "caj2pdf-ec.exe"
10 | 
11 |     description = "Convert CAJ to PDF"
12 |     command = f'"{exe}" "%1"'
13 |     if dry_run:
14 |         regedit = f"""\
15 |             Windows Registry Editor Version 5.00
16 | 
17 |             [HKEY_CLASSES_ROOT\\.caj\\shell\\caj2pdf]
18 |             @="{description}"
19 | 
20 |             [HKEY_CLASSES_ROOT\\.caj\\shell\\caj2pdf\\command]
21 |             @="{command}"
22 |             """
23 |         print(regedit, file=sys.stderr)
24 |     else:
25 |         import ctypes
26 |         import winreg
27 | 
28 |         if 1 != ctypes.windll.shell32.IsUserAnAdmin():
29 |             # https://docs.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-shellexecutew
30 |             # https://support.microsoft.com/zh-cn/topic/wd2000-%E5%A6%82%E4%BD%95%E8%B0%83%E7%94%A8-shellexecute-windows-api-%E5%87%BD%E6%95%B0-80da207b-2fa3-ac60-e871-f0a63164bad7
31 |             apifn = ctypes.windll.shell32.ShellExecuteW
32 |             args = (
33 |                 # 没有主窗口
34 |                 None,
35 |                 # 运行 runas 命令
36 |                 "runas",
37 |                 # 可执行文件
38 |                 binary,
39 |                 # 运行该程序时的参数
40 |                 "install",
41 |                 # 工作目录
42 |                 os.getcwd(),
43 |                 # https://docs.microsoft.com/en-us/windows/win32/api/winuser/nf-winuser-showwindow
44 |                 # 激活并显示窗口
45 |                 1,
46 |             )
47 |             # TODO 权限获取失败，弹窗消失太快看不清报错。
48 |             print(apifn, args)
49 |             status = apifn(*args)
50 |             if status <= 32:
51 |                 raise WindowsError((f"win32api错误，返回 {status}", ("ShellExecuteW", args)))
52 |         else:
53 |             # 如果拥有管理员权限
54 |             reg = winreg.ConnectRegistry(None, winreg.HKEY_CLASSES_ROOT)
55 |             cajshell = winreg.CreateKeyEx(
56 |                 key=winreg.HKEY_CLASSES_ROOT,
57 |                 sub_key=".caj\\shell\\caj2pdf",
58 |                 reserved=0,
59 |                 access=winreg.KEY_WRITE,
60 |             )
61 |             winreg.SetValue(
62 |                 winreg.HKEY_CLASSES_ROOT,
63 |                 ".caj\\shell\\caj2pdf",
64 |                 winreg.REG_SZ,
65 |                 description,
66 |             )
67 |             cajshellcmd = winreg.CreateKeyEx(
68 |                 key=winreg.HKEY_CLASSES_ROOT,
69 |                 sub_key=".caj\\shell\\caj2pdf\\command",
70 |                 reserved=0,
71 |                 access=winreg.KEY_WRITE,
72 |             )
73 |             winreg.SetValue(
74 |                 winreg.HKEY_CLASSES_ROOT,
75 |                 ".caj\\shell\\caj2pdf\\command",
76 |                 winreg.REG_SZ,
77 |                 command,
78 |             )
79 |             cajshellcmd.Close()
80 |             cajshell.Close()
81 |             reg.Close()
82 | 


--------------------------------------------------------------------------------
/caj2pdf/dep/jbig2dec.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | #  Copyright 2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
 4 | #  See The FreeType Project LICENSE for license terms.
 5 | #
 6 | #  python ctypes module and short program to decode JBIG2 image data in a CAJ file.
 7 | 
 8 | #  To build, either libpoppler-based, or libjbig2dec-based (pick only one!):
 9 | #
10 | #      cc -Wall `pkg-config --cflags poppler`  -fPIC -shared -o libjbig2codec.so decode_jbig2data.cc   `pkg-config --libs poppler`
11 | #
12 | #      cc -Wall `pkg-config --cflags jbig2dec` -fPIC -shared -o libjbig2codec.so decode_jbig2data_x.cc `pkg-config --libs jbig2dec`
13 | #
14 | #  NOTE(zombie110year,2021/04/20): in this project, just compile them with script file `build.py`
15 | 
16 | import importlib.resources
17 | import platform
18 | import struct
19 | from ctypes import *
20 | 
# Load the JBIG2 decoder library that sits in this package's "bin" directory:
# a prebuilt DLL on Windows (32- or 64-bit to match the interpreter), a locally
# built shared object everywhere else.
arch = platform.architecture()
if arch[1] == 'WindowsPE':
    if arch[0] == '64bit':
        _libjbig2codec_name = "libjbig2codec-w64.dll"
    else:
        _libjbig2codec_name = "libjbig2codec-w32.dll"
else:
    _libjbig2codec_name = "libjbig2codec.so"

# importlib.resources.files() returns a Traversable, which is not a context
# manager (using a pathlib.Path as one stopped working in Python 3.13).
# as_file() is the supported way to obtain a real filesystem path for it.
with importlib.resources.as_file(
    importlib.resources.files(__package__) / "bin" / _libjbig2codec_name
) as _libjbig2codec_path:
    libjbig2codec = cdll.LoadLibrary(str(_libjbig2codec_path))

# int decode_jbig2data_c(char* inbuf, int bufsize, char* outptr, int width,
#                        int height, int width_in_padded_4bytes,
#                        int width_in_padded_bytes)
decode_jbig2data_c = libjbig2codec.decode_jbig2data_c
decode_jbig2data_c.restype = c_int
decode_jbig2data_c.argtypes = [c_void_p, c_int, c_void_p, c_int, c_int, c_int, c_int]
37 | 
class CImage:
    """One JBIG2-compressed image record taken from a CAJ file.

    The record starts with a header; four little-endian fields are read from
    bytes 4..15 (width, height, number of planes, bits per pixel) and the
    compressed payload starts at byte 48.
    """

    def __init__(self, buffer):
        """Parse the header fields out of *buffer* (the raw record bytes).

        Raises:
            struct.error: if *buffer* is too short to hold the header fields.
        """
        self.buffer = buffer
        self.buffer_size = len(buffer)
        (
            self.width,
            self.height,
            self.num_planes,
            self.bits_per_pixel,
        ) = struct.unpack("<IIHH", buffer[4:16])
        # Row length in bytes, rounded up to a multiple of 4 bytes (32 bits).
        self.bytes_per_line = ((self.width * self.bits_per_pixel + 31) >> 5) << 2

    def DecodeJbig2(self):
        """Decode the payload and return it as a ctypes character buffer.

        The buffer holds ``height`` rows of ``bytes_per_line`` bytes each.

        Raises:
            RuntimeError: if the native decoder reports a non-zero status
                (this used to be ignored).
        """
        out = create_string_buffer(self.height * self.bytes_per_line)
        # Row length rounded up to whole bytes only (no 4-byte padding).
        width_in_bytes = (self.width * self.bits_per_pixel + 7) >> 3
        status = decode_jbig2data_c(
            self.buffer[48:],
            self.buffer_size - 48,
            out,
            self.width,
            self.height,
            self.bytes_per_line,
            width_in_bytes,
        )
        if status != 0:
            raise RuntimeError("decode_jbig2data_c failed with status %d" % status)
        return out
51 | 
# Command-line use: decode one image record file and write it out as a PBM (P4).
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 3:
        print("Usage: %s input output" % sys.argv[0])
        sys.exit()

    # Read the whole record; the ``with`` block closes the file, which was
    # previously left open.
    with open(sys.argv[1], "rb") as f:
        buffer = f.read()

    cimage = CImage(buffer)
    out = cimage.DecodeJbig2()

    # PBM is only padded to 8 rather than 32.
    # If the padding is larger, write padded file.
    if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
        # Declare the padded width so that every row of `out` is exactly one
        # PBM row.
        cimage.width = cimage.bytes_per_line << 3

    with open(sys.argv[2], "wb") as fout:
        fout.write("P4\n".encode("ascii"))
        fout.write(("%d %d\n" % (cimage.width, cimage.height)).encode("ascii"))
        fout.write(out)
77 | 


--------------------------------------------------------------------------------
/caj2pdf/dep/jbigdec.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | #  Copyright 2020-2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
 4 | #  See The FreeType Project LICENSE for license terms.
 5 | #
 6 | #  python ctypes module and short program decodes the image data in a CAJ file.
 7 | 
 8 | #  To build, copy "libreaderex_x64.so" from the Ubuntu AppImage
 9 | #  to the current directory.
10 | #  (See "Analysing libreaderex" in the Wiki on how to)
11 | #
12 | #  Then, run
13 | #
14 | #       cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc
15 | 
16 | import importlib.resources
17 | import os
18 | import platform
19 | import struct
20 | from ctypes import *
21 | 
# Pick the shared library that matches the running interpreter: Windows gets
# the 32- or 64-bit DLL, every other platform gets the ".so" file.
arch = platform.architecture()
if arch[1] == 'WindowsPE':
    _libname = "libjbigdec-w64.dll" if arch[0] == '64bit' else "libjbigdec-w32.dll"
else:
    _libname = "libjbigdec.so"

# importlib.resources.files() returns a Traversable, which is not a context
# manager (using it in a ``with`` statement only worked by accident of
# pathlib.Path and fails on Python 3.13+).  as_file() is the supported way to
# obtain a real filesystem path for a packaged resource.
with importlib.resources.as_file(
        importlib.resources.files(__package__) / "bin" / _libname) as _libpath:
    dllpath = _libpath.as_posix()
    libjbigdec = cdll.LoadLibrary(dllpath)

#SaveJbigAsBmp = libjbigdec.SaveJbigAsBmp
#SaveJbigAsBmp.restype = None
#SaveJbigAsBmp.argtypes = [c_void_p, c_int, c_char_p]

#SaveJbig2AsBmp = libjbigdec.SaveJbig2AsBmp
#SaveJbig2AsBmp.restype = None
#SaveJbig2AsBmp.argtypes = [c_void_p, c_int, c_char_p]

# Prototype of the C entry point.  CImage.DecodeJbig in this module calls it
# as (input, input size, height, width, bytes_per_line, output).
jbigDecode = libjbigdec.jbigDecode
jbigDecode.restype = None
jbigDecode.argtypes = [c_void_p, c_int, c_int, c_int, c_int, c_void_p]
48 | 
class CImage:
    """One image record whose payload is decoded with ``jbigDecode``.

    Width, height, plane count and bits per pixel are read from bytes 4..15
    of the record (little-endian ``IIHH``); the compressed payload is taken
    to start at byte 48.
    """

    def __init__(self, buffer):
        self.buffer = buffer
        self.buffer_size = len(buffer)
        fields = struct.unpack("<IIHH", buffer[4:16])
        self.width = fields[0]
        self.height = fields[1]
        self.num_planes = fields[2]
        self.bits_per_pixel = fields[3]
        # Rows are padded to whole 32-bit words, expressed here in bytes.
        words_per_line = (self.width * self.bits_per_pixel + 31) // 32
        self.bytes_per_line = words_per_line * 4

    def DecodeJbig(self):
        """Decode the payload and return the ctypes buffer holding the bitmap."""
        result = create_string_buffer(self.height * self.bytes_per_line)
        compressed = self.buffer[48:]
        jbigDecode(compressed, self.buffer_size - 48,
                   self.height, self.width, self.bytes_per_line, result)
        return result
61 | 
if __name__ == '__main__':
    # Small command line driver: decode one dumped image record and write the
    # result as a raw PBM ("P4") file.
    import sys

    if len(sys.argv) < 3:
        print("Usage: %s input output" % sys.argv[0])
        sys.exit()

    # Read the whole record; the context manager guarantees the file is closed.
    with open(sys.argv[1], "rb") as f:
        buffer = f.read()

    #SaveJbigAsBmp(buffer, len(buffer), sys.argv[2].encode("ascii"))

    cimage = CImage(buffer)
    out = cimage.DecodeJbig()

    # PBM is only padded to 8 rather than 32.
    # If the padding is larger, write padded file.
    width = cimage.width
    if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
        width = cimage.bytes_per_line << 3

    # The output is PBM data, so a ".bmp" name is rewritten to ".pbm".
    with open(sys.argv[2].replace(".bmp", ".pbm"), "wb") as fout:
        fout.write("P4\n".encode("ascii"))
        fout.write(("%d %d\n" % (width, cimage.height)).encode("ascii"))
        fout.write(out)
90 | 


--------------------------------------------------------------------------------
/caj2pdf/cli.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import os
 5 | import platform
 6 | 
 7 | from .cajparser import CAJParser
 8 | from .install import install_context_windows
 9 | from .utils import add_outlines
10 | from .version import __version__
11 | 
12 | 
def main():
    """Entry point of the ``caj2pdf`` command line tool.

    Parses ``sys.argv`` and runs one of the subcommands ``show``, ``convert``,
    ``outlines``, ``parse``, ``text-extract`` or ``install``.  When no
    subcommand is given the usage text is printed.

    Raises:
        SystemExit: for ``outlines`` on a file whose format is PDF or KDH
            (and from argparse on invalid arguments).
        NotImplementedError: for ``install`` on a non-Windows platform.
    """
    parser = argparse.ArgumentParser(prog="caj2pdf")
    parser.add_argument("--version", action="version", version=f"%(prog)s v{__version__}")
    subparsers = parser.add_subparsers(help="commands", dest="command")

    show_parser = subparsers.add_parser("show", help="Show the information of the CAJ file.")
    show_parser.add_argument("input", help="Path to the CAJ file.")

    convert_parser = subparsers.add_parser("convert", help="Convert the CAJ file to PDF file.")
    convert_parser.add_argument("input", help="Path to the CAJ file.")
    convert_parser.add_argument("-o", "--output", help="Output path to the PDF file.", required=False)

    outlines_parser = subparsers.add_parser("outlines", help="Extract outlines from the CAJ file and add it to PDF file.")
    outlines_parser.add_argument("input", help="Path to the CAJ file.")
    outlines_parser.add_argument("-o", "--output", help="Path to the PDF file.", required=True)

    parse_parser = subparsers.add_parser("parse", help="Parse CAJ file for debugging/development")
    parse_parser.add_argument("input", help="Path to the CAJ file.")

    text_extract_parser = subparsers.add_parser("text-extract", help="Parse CAJ file for debugging/development")
    text_extract_parser.add_argument("input", help="Path to the CAJ file.")

    install_parser = subparsers.add_parser("install", help="install some system features, may need admin permission.")
    install_parser.add_argument("--dry-run", help="not do actually, show the effect.", action="store_true")
    args = parser.parse_args()

    if args.command is None:
        # Nothing to do: show the usage instead of exiting silently.
        parser.print_help()
        return

    if args.command == "show":
        caj = CAJParser(args.input)
        if caj.format == "PDF" or caj.format == "KDH":
            print("File: {0}\nType: {1}\n".format(args.input, caj.format))
        else:
            print("File: {0}\nType: {1}\nPage count: {2}\nOutlines count: {3}\n".format(
                args.input,
                caj.format,
                caj.page_num,
                caj.toc_num
            ))

    elif args.command == "convert":
        caj = CAJParser(args.input)
        if args.output is None:
            if args.input.endswith(".caj"):
                # Swap only the trailing extension; str.replace() would also
                # rewrite any ".caj" that appears earlier in the path.
                args.output = args.input[:-len(".caj")] + ".pdf"
            elif (len(args.input) > 4 and (args.input[-4] == '.' or args.input[-3] == '.') and not args.input.endswith(".pdf")):
                args.output = os.path.splitext(args.input)[0] + ".pdf"
            else:
                args.output = args.input + ".pdf"
        caj.convert(args.output)

    elif args.command == "outlines":
        caj = CAJParser(args.input)
        if caj.format == "PDF" or caj.format == "KDH":
            raise SystemExit("Unsupported file type: {0}.".format(caj.format))
        toc = caj.get_toc()
        # Write next to the target so os.replace() stays on one filesystem and
        # an unrelated "tmp.pdf" in the working directory is never clobbered.
        tmp_path = args.output + ".tmp"
        try:
            add_outlines(toc, args.output, tmp_path)
            os.replace(tmp_path, args.output)
        finally:
            # Only present if something failed before the replace.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    elif args.command == "text-extract":
        caj = CAJParser(args.input)
        caj.text_extract()

    elif args.command == "parse":
        caj = CAJParser(args.input)
        caj.parse()

    elif args.command == "install":
        if platform.system() == "Windows":
            install_context_windows(args.dry_run)
        else:
            raise NotImplementedError("Only support Windows now.")


--------------------------------------------------------------------------------
/caj2pdf/HNParsePage.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
  2 | #  See The FreeType Project LICENSE for license terms.
  3 | #
  4 | #  HNParsePage class, for extracting text and image positions
  5 | import struct
  6 | 
  7 | class HNParsePage(object):
  8 |     def __init__(self, data, old_style=False):
  9 |         self.data = data
 10 |         self.data_length = len(data)
 11 |         self.characters = []
 12 |         self.figures = []
 13 |         self.stats = {}
 14 |         self.offset = 0
 15 |         def Text(self, code):
 16 |             try:
 17 |                 self.characters.append(bytes([self.data[self.offset+5],self.data[self.offset+4]]).decode("gbk"))
 18 |             except IndexError: # short data, nothing to do
 19 |                 pass
 20 |             except UnicodeDecodeError:
 21 |                 # HTL: When cut-and-paste on Linux, these transform to GB18030,
 22 |                 # but I believe they are OCR artifacts. Where they occur,
 23 |                 # 0xA38D 0xA38a (always together) are line-breaks, and 0xA389, 0xA3A0
 24 |                 # are tabs and spaces.
 25 |                 hash = {
 26 |                     0xA389 : "\t",
 27 |                     0xA38a : "\n",
 28 |                     0xA38D : "\r",
 29 |                     0xA3A0 : " ",
 30 |                     # # GB18030
 31 |                     #0xA389 : "",
 32 |                     #0xA38a : "",
 33 |                     #0xA38D : "",
 34 |                     #0xA3A0 : "",
 35 |                 }
 36 |                 code = self.data[self.offset+5] * 256 + self.data[self.offset+4]
 37 |                 try:
 38 |                     #self.characters.append("<0x%04X>\n" % code)
 39 |                     self.characters.append(hash[code])
 40 |                 except KeyError:
 41 |                     self.characters.append("<0x%04X>\n" % code)
 42 |             self.offset += 6
 43 | 
 44 |         def TextMulti(self, code):
 45 |             self.offset += 2
 46 |             if (code == 0x8001):
 47 |                 self.characters.append("\n")
 48 |             while (1):
 49 |                 try:
 50 |                     if (self.data[self.offset+1] == 0x80):
 51 |                         break
 52 |                 except IndexError: # short data, nothing to do
 53 |                     return
 54 |                 try:
 55 |                     self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk"))
 56 |                 except UnicodeDecodeError:
 57 |                     self.characters.append("<0x%04X>\n" % (self.data[self.offset+3] * 256 + self.data[self.offset+2]))
 58 |                 except IndexError: # short data, nothing to do
 59 |                     return
 60 |                 self.offset += 4
 61 | 
 62 |         def Figure(self, code):
 63 |             try:
 64 |                 self.data[self.offset+25]
 65 |             except IndexError: # short data, nothing to do
 66 |                 return
 67 |             (ignore1, offset_x, offset_y, size_x, size_y, int2, int3, int4, int5)= struct.unpack("<HHHHHIIII", self.data[self.offset:self.offset+26])
 68 |             # in units of 1/2.473 pixels
 69 |             self.figures.append([offset_x, offset_y, size_x, size_y])
 70 |             self.offset += 26
 71 | 
 72 |         if (not old_style):
 73 |             dispatch = {
 74 |                 0x8001 : Text,
 75 |                 0x800A : Figure,
 76 |             }
 77 |         else:
 78 |             dispatch = {
 79 |                 0x8001 : TextMulti,
 80 |                 0x8070 : TextMulti,
 81 |                 0x800A : Figure,
 82 |             }
 83 |         dispatch_keys = dispatch.keys()
 84 | 
 85 |         while (self.offset <= self.data_length - 2):
 86 |             (dispatch_code,) = struct.unpack("H", self.data[self.offset:self.offset+2])
 87 |             self.offset += 2
 88 |             if (dispatch_code in dispatch_keys):
 89 |                 dispatch[dispatch_code](self, dispatch_code)
 90 |             else:
 91 |                 self.offset +=2
 92 |                 if (dispatch_code in self.stats.keys()):
 93 |                     self.stats[dispatch_code] +=1
 94 |                 else:
 95 |                     self.stats[dispatch_code] = 1
 96 | 
 97 |     @property
 98 |     def texts(self):
 99 |         text = ''.join(self.characters)
100 |         text.replace('\x00', '')
101 |         text.replace('\r', '')
102 |         return text
103 | 


--------------------------------------------------------------------------------
/caj2pdf/dep/jbigdec.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Copyright 2020-2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
  3 |   See The FreeType Project LICENSE for license terms.
  4 | 
  5 |   This short program decodes the image data in a CAJ file.
  6 | 
  7 |   To build, copy "libreaderex_x64.so" from the Ubuntu AppImage
  8 |   to the current directory.
  9 |   (See "Analysing libreaderex" in the Wiki on how to)
 10 | 
 11 |   Then, run
 12 | 
 13 |       cc -DHAVE_MAIN -Wall -o jbigdec jbigdec.cc -Wl,-rpath,. -L. -lreaderex_x64
 14 | 
 15 |   For the python module, also:
 16 | 
 17 |       cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc
 18 | 
 19 |   and to generate the "image_dump_*.dat":
 20 | 
 21 |       ./caj2pdf parse thesis.caj
 22 | 
 23 |   Identify which ones are DIB and which ones are JPG with:
 24 | 
 25 |       file image_dump_*.dat
 26 | 
 27 |   Usage example (Page 1 / Cover is likely JPG!):
 28 | 
 29 |       ./jbigdec image_dump_0002.dat page_0002.bmp
 30 |       ./jbigdec image_dump_0003.dat page_0003.bmp
 31 |       ...
 32 | 
 33 |   Note: The program outputs a few "string to int" while it is working.
 34 |         This is an anomaly with "libreaderex_x64.so".
 35 | */
 36 | 
 37 | #include <cstdio>
 38 | #include <cstdlib>
 39 | #include <cctype>
 40 | #include <cstring>
 41 | 
 42 | extern "C" {
 43 |   class JBigCodec {
 44 |   public:
 45 |     void ByteIn();
 46 |     void ClearLine(char*, unsigned int);
 47 |     void CopyLine(char*, char*, unsigned int);
 48 |     int Decode1(int);
 49 |     void Decode(char* inbuf, unsigned int size, unsigned int height, unsigned int bitwidth, unsigned int bitwidth_in_bytes /* rounded up to x4 */, char*outbuf);
 50 |     int Decode(int);
 51 |     void DupLine(char*, unsigned int, unsigned int, unsigned int);
 52 |     int GetBit(int, int);
 53 |     unsigned int GetCX(int, int);
 54 |     void InitDecode(char*, unsigned int);
 55 |     void LowestDecode();
 56 |     int LowestDecodeLine(unsigned int, char*, char*, unsigned int, char*);
 57 |     void MakeTypicalLine(int);
 58 |     void RenormDe();
 59 |   };
 60 | #ifdef HAVE_MAIN
 61 |   class CImage {
 62 |   public:
 63 |     static CImage* DecodeJbig(void*, unsigned int, unsigned int*);
 64 |     static CImage* DecodeJbig2(void*, unsigned int, unsigned int*);
 65 |     int SaveAsBmp(char const*);
 66 |   };
 67 | 
 68 | void SaveJbigAsBmp(void* in, unsigned int len, char const* outfile)
 69 | {
 70 |   CImage* x = CImage::DecodeJbig(in, len, NULL);
 71 |   x->SaveAsBmp(outfile);
 72 | }
 73 | 
 74 | void SaveJbig2AsBmp(void* in, unsigned int len, char const* outfile)
 75 | {
 76 |   CImage* x = CImage::DecodeJbig2(in, len, NULL);
 77 |   x->SaveAsBmp(outfile);
 78 | }
 79 | #endif
 80 | 
 81 | void jbigDecode(char* inbuf, unsigned int size, unsigned int height,
 82 |                 unsigned int bitwidth, unsigned int bitwidth_in_bytes /* rounded up to x4 */, char*outbuf)
 83 | {
 84 |   JBigCodec *jbig = (JBigCodec *)calloc(0x8040, 1); // 0x8040 is linux 64-bit specific
 85 |   jbig->Decode(inbuf, size, height, bitwidth, bitwidth_in_bytes, outbuf);
 86 |   free(jbig);
 87 | }
 88 | 
 89 | }
 90 | 
 91 | #ifdef HAVE_MAIN
 92 | int main(int argc, char *argv[])
 93 | {
 94 |   size_t buflen = 80000; // large number - should be large enough to hold the whole input file.
 95 |   char *in = (char *)calloc(buflen, 1);
 96 | 
 97 |   FILE *fin = fopen(argv[1], "rb");
 98 | 
 99 |   size_t len = fread(in, 1, buflen, fin);
100 | 
101 |   unsigned int intout = 0;
102 |   CImage* x = CImage::DecodeJbig(in, len, &intout);
103 |   x->SaveAsBmp(argv[2]);
104 | 
105 |   int width  = in[4] | (in[5] << 8) | (in[6]  << 16) | (in[7]  << 24);
106 |   int height = in[8] | (in[9] << 8) | (in[10] << 16) | (in[11] << 24);
107 |   int bits_per_pixel = in[14] | (in[15] << 8);
108 |   // padding to multiple of 4 bytes.
109 |   int bytes_per_line = ((width * bits_per_pixel + 31) >> 5) << 2;
110 | 
111 |   char *out = (char *)calloc(height * bytes_per_line, 1);
112 | 
113 |   JBigCodec *jbig = (JBigCodec *)calloc(0x8040, 1); // 0x8040 is linux 64-bit specific
114 |   jbig->Decode(in+48, len-48, height, width, bytes_per_line, out);
115 |   free(jbig);
116 | 
117 |   FILE *fout = fopen("test.pbm", "wb");
118 |   fprintf(fout, "P4\n");
119 |   // PBM is padded to 8 rather than 32.
120 |   // If the padding is larger, write padded file.
121 |   if (bytes_per_line > ((width +7) >> 3))
122 |     width = bytes_per_line << 3;
123 |   fprintf(fout, "%d %d\n", width, height);
124 |   fwrite(out, 1, bytes_per_line * height, fout);
125 |   fclose(fout); // "cmp -i 62:13 x.bmp x.pbm" shows nothing - identical.
126 | }
127 | #endif
128 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # caj2pdf
  2 | 
  3 | 本项目由 [caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf) 重构而来，仅仅修改了 Python 包的组织方式，以便使用包管理工具进行简便地安装和调用。
  4 | 
  5 | 1. 可以使用 build.py 脚本编译二进制依赖
  6 | 2. 可以在任何工作目录下使用 caj2pdf 命令，而无需移动到同一目录
  7 | 3. 如果存在任何关于 CAJ 文件格式而导致的问题，请到 [caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf/issues) 提交反馈。如果存在本项目无法安装、调用出错或者版本过于落后等问题，可到 [issues](issues/) 提交反馈。
  8 | 
  9 | ## Why
 10 | 
 11 | [中国知网](http://cnki.net/)的某些文献（多为学位论文）仅提供其专有的 CAJ 格式下载，仅能使用知网提供的软件（如 [CAJViewer](http://cajviewer.cnki.net/) 等）打开，给文献的阅读和管理带来了不便（尤其是在非 Windows 系统上）。
 12 | 
 13 | 若要将 CAJ 文件转换为 PDF 文件，可以使用 CAJViewer 的打印功能。但这样得到的 PDF 文件的内容为图片，无法进行文字的选择，且原文献的大纲列表也会丢失。本项目希望可以解决上述两问题。
 14 | 
 15 | ## How to use
 16 | 
 17 | ### 环境和依赖
 18 | 
 19 | - Python 3.10+ （使用了 `importlib.resources` 模块，以提供在任意目录下工作的能力）
 20 | - [PyPDF2](https://github.com/mstamy2/PyPDF2)
 21 | - [mutool](https://mupdf.com/index.html)
 22 | 
 23 | 在 Microsoft Windows 以外的系统上（我们已为 Windows 提供预编译的 32-bit/64-bit DLL），转换 HN 格式还需要：
 24 | 
 25 | - C/C++编译器
 26 | - libpoppler开发包，或libjbig2dec开发包
 27 | 
 28 | ### 安装
 29 | 
 30 | #### ArchLinux
 31 | 
 32 | ```sh
 33 | # poppler 库
 34 | sudo pacman -S base-devel poppler mupdf-tools
 35 | pip install caj2pdf-restructured
 36 | 
 37 | # jbig2dec 库
 38 | sudo pacman -S base-devel jbig2dec mupdf-tools
 39 | LIBJBIG2DEC=1 pip install caj2pdf-restructured
 40 | ```
 41 | 
 42 | 或使用 [pipx](https://github.com/pipxproject/pipx)
 43 | 
 44 | ```sh
 45 | # poppler 库
 46 | sudo pacman -S base-devel poppler mupdf-tools
 47 | pipx install caj2pdf-restructured
 48 | 
 49 | # jbig2dec 库
 50 | sudo pacman -S base-devel jbig2dec mupdf-tools
 51 | LIBJBIG2DEC=1 pipx install caj2pdf-restructured
 52 | ```
 53 | 
 54 | #### Debian, Ubuntu 等 Linux
 55 | 
 56 | ```sh
 57 | # poppler 库
 58 | sudo apt install build-essential libpoppler-dev mupdf-tools
 59 | pip install caj2pdf-restructured
 60 | ```
 61 | 
 62 | 或使用 [pipx](https://github.com/pipxproject/pipx)
 63 | 
 64 | ```sh
 65 | # poppler 库
 66 | sudo apt install build-essential libpoppler-dev mupdf-tools
 67 | pipx install caj2pdf-restructured
 68 | ```
 69 | 
 70 | **注意**：
 71 | 
 72 | 1. jbig2dec 库在 Ubuntu/Debian 上的安装存在依赖问题，但是 poppler 库可能无法解析 HN 文件，建议能配置好依赖的尽量使用 `LIBJBIG2DEC=1` 进行构建。
 73 | 2. Ubuntu 16.04 的 poppler 库版本过于落后，建议在较新的系统上安装。
 74 | 
 75 | #### Windows
 76 | 
 77 | 可以直接通过 pip 或 pipx 安装：
 78 | 
 79 | ```sh
 80 | pip install caj2pdf-restructured
 81 | 
 82 | pipx install caj2pdf-restructured
 83 | ```
 84 | 
 85 | 然后，从 [mutool](https://mupdf.com/index.html) 下载 mupdf-1.18.0-windows.zip 并解压，将其中的 mutool.exe 添加到 `PATH` 变量中的路径下，以便从任意位置调用。
 86 | 
 87 | 如果你使用 [choco](https://chocolatey.org) 或 [scoop](https://scoop.sh/) 作为 Windows 下的包管理工具，则可一键式安装：
 88 | 
 89 | ```sh
 90 | choco install mupdf
 91 | ```
 92 | 
 93 | 或者
 94 | 
 95 | ```sh
 96 | scoop install mupdf
 97 | ```
 98 | 
 99 | ### 用法
100 | 
101 | ```
102 | # 打印文件基本信息（文件类型、页面数、大纲项目数）
103 | caj2pdf show [input_file]
104 | 
105 | # 转换文件
106 | caj2pdf convert [input_file] -o/--output [output_file]
107 | 
108 | # 从 CAJ 文件中提取大纲信息并添加至 PDF 文件
109 | ## 遇到不支持的文件类型或 Bug 时，可用 CAJViewer 打印 PDF 文件，并用这条命令为其添加大纲
110 | caj2pdf outlines [input_file] -o/--output [pdf_file]
111 | ```
112 | ### 例
113 | 
114 | ```
115 | caj2pdf show test.caj
116 | caj2pdf convert test.caj -o output.pdf
117 | caj2pdf outlines test.caj -o printed.pdf
118 | ```
119 | 
120 | #### 右键菜单
121 | 
122 | 0.1.0a4 版本后，可以在 Windows 系统上使用右键菜单转换 CAJ 文件了。
123 | 
124 | ![](screenshot1.png)
125 | 
126 | 需要在命令行中调用命令 `caj2pdf install` 安装注册表，然后才能使用此功能。
127 | 如果卸载程序，注册表 **不会被清理**，待研究 pip，看看能不能在 uninstall 之前加 HOOK。
128 | 
129 | TODO: 清理注册表的功能。
130 | 
131 | ### 异常输出（IMPORTANT!!!）
132 | 
133 | 尽管这个项目目前有不少同学关注到了，但它**仍然只支持部分 caj 文件的转换**，必须承认这完全不是一个对普通用户足够友好的成熟项目。具体支持哪些不支持哪些，在前文也已经说了，但似乎很多同学并没有注意到。所以**如果你遇到以下输出，本项目目前无法帮助到你**。与此相关的 issue 不再回复。
134 | 
135 | - `Unknown file type.`：未知文件类型；
136 | 
137 | ## How far we've come
138 | 
139 | 知网下载到的后缀为 `caj` 的文件内部结构其实分为两类：CAJ 格式和 HN 格式（受考察样本所限可能还有更多）。目前本项目支持 CAJ 格式文件的转换，HN 格式的转换未完善，并且需要建立两个新的共享库（除了Microsoft Windows：我们提供Microsoft Windows 32-bit/64-bit DLLs），详情如下：
140 | 
141 | ```
142 | cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc
143 | cc -Wall `pkg-config --cflags poppler` -fPIC -shared -o libjbig2codec.so decode_jbig2data.cc `pkg-config --libs poppler`
144 | ```
145 | 
146 | 或者，如果您更倾向于使用 libjbig2dec，也可以用它替换 libpoppler：
147 | 
148 | ```
149 | cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc
150 | cc -Wall `pkg-config --cflags jbig2dec` -fPIC -shared -o libjbig2codec.so decode_jbig2data_x.cc `pkg-config --libs jbig2dec`
151 | ```
152 | 
153 | **NOTE（zombie110year,2021/04/20）**：现在可以使用 `python build.py` 指令来编译链接库了。并且源代码和输出文件的路径移动到了 `caj2pdf/dep` 之中，和上面的命令不同。
154 | 
155 | 1. 默认使用 libpoppler 作为依赖编译：
156 | 
157 | ```sh
158 | python build.py
159 | ```
160 | 
161 | 2. 或者，使用 jbig2dec 作为依赖编译：
162 | 
163 | ```sh
164 | LIBJBIG2DEC=1 python build.py
165 | ```
166 | 
167 | **关于两种格式文件结构的分析进展和本项目的实现细节，请查阅[项目 Wiki](https://github.com/JeziL/caj2pdf/wiki)。**
168 | 
169 | ## How to contribute
170 | 
171 | 受测试样本数量所限，即使转换 CAJ 格式的文件也可能（或者说几乎一定）存在 Bug。如遇到这种情况，欢迎在 [Issue](https://github.com/JeziL/caj2pdf/issues) 中提出，**并提供可重现 Bug 的 caj 文件**——可以将样本文件上传到网盘等处<del>，也可直接提供知网链接</del>（作者已滚出校园网，提 issue 请提供可下载的 caj 文件）。
172 | 
173 | 如果你对二进制文件分析、图像/文字压缩算法、逆向工程等领域中的一个或几个有所了解，欢迎帮助完善此项目。你可以从阅读[项目 Wiki](https://github.com/JeziL/caj2pdf/wiki) 开始，看看是否有可以发挥你特长的地方。**Pull requests are always welcome**.
174 | 
175 | ## License
176 | 
177 | 本项目基于 [GLWTPL](https://github.com/me-shaon/GLWTPL)  (Good Luck With That Public License) 许可证开源。
178 | 


--------------------------------------------------------------------------------
/caj2pdf/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import struct
  3 | import sys
  4 | 
  5 | import PyPDF2.generic as PDF
  6 | from PyPDF2 import PdfFileReader, PdfFileWriter
  7 | 
  8 | 
  9 | class Node(object):
 10 |     def __init__(self, data, parent=None, lchild=None, rchild=None):
 11 |         self.data = data
 12 |         self.parent = parent
 13 |         self.lchild = lchild
 14 |         self.rchild = rchild
 15 | 
 16 |     @property
 17 |     def level(self):
 18 |         return self.data["level"]
 19 | 
 20 |     @property
 21 |     def index(self):
 22 |         return self.data["index"]
 23 | 
 24 |     def real_parent(self):
 25 |         p = self
 26 |         while True:
 27 |             c = p
 28 |             p = p.parent
 29 |             if p.lchild == c:
 30 |                 return p
 31 |             if p.parent is None:
 32 |                 return None
 33 | 
 34 |     def prev(self):
 35 |         if self.parent.rchild == self:
 36 |             return self.parent
 37 |         else:
 38 |             return None
 39 | 
 40 |     def next(self):
 41 |         return self.rchild
 42 | 
 43 |     def first(self):
 44 |         return self.lchild
 45 | 
 46 |     def last(self):
 47 |         f = self.first()
 48 |         if f is None:
 49 |             return None
 50 |         r = f
 51 |         while r.rchild is not None:
 52 |             r = r.rchild
 53 |         return r
 54 | 
 55 | 
 56 | class BTree(object):
 57 |     def __init__(self):
 58 |         self.root = Node({"level": 0, "index": 0}, None)
 59 |         self.cursor = self.root
 60 | 
 61 |     @property
 62 |     def current_level(self):
 63 |         return self.cursor.level
 64 | 
 65 |     def insert_as_lchild(self, node):
 66 |         self.cursor.lchild = node
 67 |         node.parent = self.cursor
 68 |         self.cursor = node
 69 | 
 70 |     def insert_as_rchild(self, node):
 71 |         self.cursor.rchild = node
 72 |         node.parent = self.cursor
 73 |         self.cursor = node
 74 | 
 75 | 
 76 | def fnd(f, s, start=0):
 77 |     fsize = f.seek(0, os.SEEK_END)
 78 |     f.seek(0)
 79 |     bsize = 4096
 80 |     buffer = None
 81 |     if start > 0:
 82 |         f.seek(start)
 83 |     overlap = len(s) - 1
 84 |     while True:
 85 |         if overlap <= f.tell() < fsize:
 86 |             f.seek(f.tell() - overlap)
 87 |         buffer = f.read(bsize)
 88 |         if buffer:
 89 |             pos = buffer.find(s)
 90 |             if pos >= 0:
 91 |                 return f.tell() - (len(buffer) - pos)
 92 |         else:
 93 |             return -1
 94 | 
 95 | 
 96 | def fnd_rvrs(f, s, end=sys.maxsize):
 97 |     # find target in reverse direction
 98 |     fsize = f.seek(0, os.SEEK_END)
 99 |     bsize = 4096
100 |     if len(s) > end:
101 |         raise SystemExit("Too large string size for search.")
102 |     f.seek(fsize - bsize)
103 |     buffer = None
104 |     size = bsize
105 |     if bsize <= end < fsize:
106 |         f.seek(end - bsize)
107 |     elif 0 < end < bsize:
108 |         size = end
109 |         f.seek(0)
110 |     overlap = len(s) - 1
111 |     s = s[::-1]
112 |     while True:
113 |         buffer = f.read(size)
114 |         if buffer:
115 |             buffer = buffer[::-1]
116 |             pos = buffer.find(s)
117 |             if pos >= 0:
118 |                 return f.tell() - pos
119 |         if (2 * bsize - overlap) < f.tell():
120 |             f.seek(f.tell() - (2 * bsize - overlap))
121 |             size = bsize
122 |         elif (bsize - overlap) < f.tell():
123 |             size = f.tell() - (bsize - overlap)
124 |             f.seek(0)
125 |         else:
126 |             return -1
127 | 
128 | 
def fnd_all(f, s):
    """Return the offsets of every occurrence of `s` in `f` reported by `fnd`.

    Each search resumes ``len(s)`` bytes after the previous hit.
    """
    hits = []
    addr = fnd(f, s, start=0)
    while addr != -1:
        hits.append(addr)
        addr = fnd(f, s, start=addr + len(s))
    return hits
139 | 
140 | 
def fnd_unuse_no(nos1, nos2):
    """Return the largest number in 1..99999 contained in neither collection.

    Raises:
        SystemExit: if every number from 1 to 99999 is already taken.
    """
    # Count down from the top so the highest free number wins.
    for candidate in range(99999, 0, -1):
        if candidate not in nos1 and candidate not in nos2:
            return candidate
    raise SystemExit("Error on PDF objects numbering.")
150 | 
151 | 
def make_dest(pdfw, pg):
    """Build a ``/XYZ`` destination array pointing at page `pg` of `pdfw`.

    The array holds the page's indirect reference, the ``/XYZ`` name and
    three null objects.
    """
    page = pdfw.getPage(pg)
    try:
        page_ref = page.indirect_ref
    except AttributeError:
        # Fall back to the camelCase attribute name.
        page_ref = page.indirectRef
    dest = PDF.ArrayObject()
    dest.append(page_ref)
    dest.append(PDF.NameObject("/XYZ"))
    for _ in range(3):
        dest.append(PDF.NullObject())
    return dest
163 | 
164 | 
def build_outlines_btree(toc):
    """Arrange the TOC entries into a left-child / right-sibling tree.

    Every entry of `toc` (a list of dicts with at least ``"level"`` and
    ``"page"``) is modified in place: ``"page"`` becomes zero-based,
    ``"index"`` is set to the entry's 1-based position and ``"node"`` to the
    tree node created for it.

    A deeper entry becomes the first child of the previous one and an entry
    of the same level becomes its next sibling.  For a shallower entry the
    tree is climbed to the ancestor of the same level; if the TOC skipped
    that level (e.g. levels 1, 3, 2) the entry is appended to the children
    of the nearest shallower ancestor instead of climbing past the root.
    """
    tree = BTree()
    for i, t in enumerate(toc):
        t["page"] -= 1  # Page starts at 0.
        t["index"] = i + 1
        node = Node(t)
        if t["level"] > tree.current_level:
            tree.insert_as_lchild(node)
        elif t["level"] == tree.current_level:
            tree.insert_as_rchild(node)
        else:
            # Climb until an ancestor that is not deeper than the new entry.
            p = tree.cursor
            while p.level > t["level"] and p.parent is not None:
                ancestor = p.real_parent()
                if ancestor is None:
                    break
                p = ancestor
            if p.level == t["level"]:
                # Found the sibling at the same level: continue its chain.
                tree.cursor = p
                tree.insert_as_rchild(node)
            else:
                # No ancestor has exactly this level: append to p's children.
                last_child = p.last()
                if last_child is None:
                    tree.cursor = p
                    tree.insert_as_lchild(node)
                else:
                    tree.cursor = last_child
                    tree.insert_as_rchild(node)
        t["node"] = node
183 | 
184 | 
def add_outlines(toc, filename, output):
    """Copy the PDF `filename` to `output`, adding the outline given by `toc`.

    Args:
        toc: list of dicts with ``"title"`` (UTF-8 encoded bytes),
            ``"page"`` (1-based) and ``"level"``.  The entries are modified
            in place by ``build_outlines_btree``.
        filename: path of the PDF to read.
        output: path of the PDF to write.  The input stays open while the
            output is written, so the two paths must differ.

    With an empty `toc` the pages are copied without an outline.  Both files
    are closed even when an error occurs.
    """
    build_outlines_btree(toc)
    pdf_out = PdfFileWriter()
    with open(filename, "rb") as input_file:
        pdf_in = PdfFileReader(input_file)
        # Support both the snake_case and the camelCase PyPDF2 method names.
        add_page = getattr(pdf_out, "add_page", None) or pdf_out.addPage
        for p in pdf_in.pages:
            add_page(p)

        toc_num = len(toc)
        if toc_num > 0:
            add_object = getattr(pdf_out, "_add_object", None) or pdf_out._addObject
            # Reserve object numbers up front: idorefs[0] is the outline
            # root and idorefs[i] the i-th TOC entry (matching t["index"]).
            idoix = len(pdf_out._objects) + 1
            idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out) for x in range(toc_num + 1)]
            ol = PDF.DictionaryObject()
            ol.update(
                {
                    PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
                    PDF.NameObject("/First"): idorefs[1],
                    PDF.NameObject("/Last"): idorefs[-1],
                    PDF.NameObject("/Count"): PDF.NumberObject(toc_num),
                }
            )
            # Tree navigation method -> key of the outline item dictionary.
            opt_keys = {
                "real_parent": "/Parent",
                "prev": "/Prev",
                "next": "/Next",
                "first": "/First",
                "last": "/Last",
            }
            olitems = []
            for t in toc:
                oli = PDF.DictionaryObject()
                oli.update(
                    {
                        PDF.NameObject("/Title"): PDF.TextStringObject(
                            t["title"].decode("utf-8")
                        ),
                        PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"]),
                    }
                )
                for k, v in opt_keys.items():
                    n = getattr(t["node"], k)()
                    if n is not None:
                        oli.update({PDF.NameObject(v): idorefs[n.index]})
                olitems.append(oli)
            # Objects must be added in the order the references were reserved.
            add_object(ol)
            for item in olitems:
                add_object(item)
            pdf_out._root_object.update({PDF.NameObject("/Outlines"): idorefs[0]})

        with open(output, "wb") as output_file:
            pdf_out.write(output_file)
250 | 
251 | 
252 | # See if the page is N * N images, N images written N times,
253 | # by checking image sizes and within 1 < N <= 10.
254 | # Return True and N if that's the case.
def find_redundant_images(caj, initial_offset, images_per_page):
    """Detect a page whose N images were written N times over (N * N records).

    Walks ``images_per_page`` consecutive image records starting at
    ``initial_offset`` in the open binary file ``caj``.  Each record starts
    with three native ints (type, data offset, data size); the next record
    is looked for right after the current record's image data.

    Returns ``(True, N)`` when ``images_per_page`` is N * N for 1 < N <= 10
    and the data sizes repeat with period N; otherwise
    ``(False, images_per_page)``.
    """
    # Perfect squares N * N for N in 2..10, mapped back to N.
    roots = {n * n: n for n in range(2, 11)}
    stride = roots.get(images_per_page)
    if stride is None:
        return (False, images_per_page)

    seen_sizes = []
    position = initial_offset
    for index in range(images_per_page):
        caj.seek(position)
        header = caj.read(32)
        _kind, data_offset, data_size = struct.unpack("iii", header[:12])
        # From the second group onwards every size must equal the size
        # seen one stride earlier.
        if index >= stride and seen_sizes[index - stride] != data_size:
            return (False, images_per_page)
        seen_sizes.append(data_size)
        position = data_offset + data_size
    # Sizes seen were [A, B, ... N] repeated exactly N times.
    return (True, stride)
285 | 


--------------------------------------------------------------------------------
/caj2pdf/dep/JBigDecode.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Copyright 2021 (c) Hin-Tak Leung <htl10@users.sourceforge.net>
  3 |   See The FreeType Project LICENSE for license terms.
  4 | 
  5 |   Decode-only part of JBigCodec. Drop-in compatible with LibReaderEx's.
  6 | */
  7 | 
  8 | #include <cstdlib>
  9 | #include <cstring>
 10 | #include "JBigDecode.h"
 11 | 
 12 | void JBigCodec::ByteIn()
 13 | {
 14 |   unsigned int v3;
 15 |   unsigned int v1 = this->read_count;
 16 |   int v2 = 0;
 17 |   if ( v1 < this->inbuf_length )
 18 |   {
 19 |     v3 = *(this->inbuf + v1); // Needs to be unsigned!
 20 |     this->read_count = v1 + 1;
 21 |     v2 = v3 << 8;
 22 |   }
 23 |   this->C_register += v2;
 24 |   this->CT = 8;
 25 | }
 26 | 
 27 | /* size in number of ints! */
 28 | void* JBigCodec::ClearLine(char* dest, unsigned int size)
 29 | {
 30 |   return memset(dest, 0, 4 * size);
 31 | }
 32 | 
 33 | /* size in number of ints! */
 34 | void* JBigCodec::CopyLine(char* dest, char* src, unsigned int size)
 35 | {
 36 |   return memcpy(dest, src, 4 * size);
 37 | }
 38 | 
/* Table 24 on page 45 of ITU-T REC T-82 */

/* LSZ: interval size looked up by the per-context state (used as
   LSZ[ST[CX]] in Decode/Decode1, where it is subtracted from A_interval).
   Only entries 0..112 are given explicitly; the array is declared with 256
   slots, so the remaining entries are zero-initialized. */
static int LSZ[256] = {
0x5a1d,
0x2586, 0x1114, 0x080b, 0x03d8, 0x01da, 0x00e5, 0x006f, 0x0036,
0x001a, 0x000d, 0x0006, 0x0003, 0x0001, 0x5a7f, 0x3f25, 0x2cf2,
0x207c, 0x17b9, 0x1182, 0x0cef, 0x09a1, 0x072f, 0x055c, 0x0406,
0x0303, 0x0240, 0x01b1, 0x0144, 0x00f5, 0x00b7, 0x008a, 0x0068,
0x004e, 0x003b, 0x002c, 0x5ae1, 0x484c, 0x3a0d, 0x2ef1, 0x261f,
0x1f33, 0x19a8, 0x1518, 0x1177, 0x0e74, 0x0bfb, 0x09f8, 0x0861,
0x0706, 0x05cd, 0x04de, 0x040f, 0x0363, 0x02d4, 0x025c, 0x01f8,

0x01a4, 0x0160, 0x0125, 0x00f6, 0x00cb, 0x00ab, 0x008f, 0x5b12,
0x4d04, 0x412c, 0x37d8, 0x2fe8, 0x293c, 0x2379, 0x1edf, 0x1aa9,
0x174e, 0x1424, 0x119c, 0x0f6b, 0x0d51, 0x0bb6, 0x0a40, 0x5832,
0x4d1c, 0x438e, 0x3bdd, 0x34ee, 0x2eae, 0x299a, 0x2516, 0x5570,
0x4ca9, 0x44d9, 0x3e22, 0x3824, 0x32b4, 0x2e17, 0x56a8, 0x4f46,
0x47e5, 0x41cf, 0x3c3d, 0x375e, 0x5231, 0x4c0f, 0x4639, 0x415e,
0x5627, 0x50e7, 0x4b85, 0x5597, 0x504f, 0x5a10, 0x5522, 0x59eb,
};
 59 | 
/* NLPS: next state, indexed by the current state.  LpsExchange and
   MpsExchange store NLPS[ST_CX] into ST[CX] on the branch where the
   decoded pixel is the complement of MPS[CX].  Entries 0..112 are
   explicit; the rest of the 256 slots are zero-initialized. */
static int NLPS[256] = {
 1,
14, 16, 18, 20, 23, 25, 28, 30,
33, 35,  9, 10, 12, 15, 36, 38,
39, 40, 42, 43, 45, 46, 48, 49,
51, 52, 54, 56, 57, 59, 60, 62,
63, 32, 33, 37, 64, 65, 67, 68,
69, 70, 72, 73, 74, 75, 77, 78,
79, 48, 50, 50, 51, 52, 53, 54,

55, 56, 57, 58, 59, 61, 61, 65,
80, 81, 82, 83, 84, 86, 87, 87,
72, 72, 74, 74, 75, 77, 77, 80,
88, 89, 90, 91, 92, 93, 86, 88,
95, 96, 97, 99, 99, 93, 95, 101,
102, 103, 104,  99, 105, 106, 107, 103,
105, 108, 109, 110, 111, 110, 112, 112,
};
 78 | 
/* NMPS: next state, indexed by the current state.  LpsExchange and
   MpsExchange store NMPS[ST_CX] into ST[CX] on the branch where the
   decoded pixel equals MPS[CX].  Entries 0..112 are explicit; the rest of
   the 256 slots are zero-initialized. */
static int NMPS[256] = {
 1,
 2,  3,  4,  5,  6,  7,  8,  9,
10, 11, 12, 13, 13, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 33,
34, 35,  9, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49,
50, 51, 52, 53, 54, 55, 56, 57,

 58,  59,  60,  61,  62,  63,  32,  65,
 66,  67,  68,  69,  70,  71,  72,  73,
 74,  75,  76,  77,  78,  79,  48,  81,
 82,  83,  84,  85,  86,  87,  71,  89,
 90,  91,  92,  93,  94,  86,  96,  97,
 98,  99, 100,  93, 102, 103, 104,  99,
106, 107, 103, 109, 107, 111, 109, 111,
};
 97 | 
/* SWITCH: indexed by the current state.  Where the entry is 1,
   LpsExchange/MpsExchange also flip MPS[CX] on the branch that takes the
   NLPS transition.  Entries 0..112 are explicit; the rest of the 256
   slots are zero-initialized. */
static int SWITCH[256] = {
1,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1,
};
116 | 
117 | void JBigCodec::LpsExchange(int CX, unsigned int ST_CX, unsigned int LSZ_ST_CX)
118 | {
119 |   int v6;
120 | 
121 |   if ( A_interval < LSZ_ST_CX )
122 |   {
123 |     PIX = MPS[CX];
124 |     ST[CX] = NMPS[ST_CX];
125 |   }
126 |   else
127 |   {
128 |     v6 = (MPS[CX] ^ 1)& 1; // 1 - MPS[CX]
129 |     PIX = v6;
130 |     ST[CX] = NLPS[ST_CX];
131 |     if ( SWITCH[ST_CX] == 1 )
132 |       MPS[CX] = v6;
133 |   }
134 |   C_register -= A_interval << 16;
135 |   A_interval = LSZ_ST_CX;
136 | }
137 | 
138 | void JBigCodec::MpsExchange(int CX, unsigned int ST_CX, unsigned int LSZ_ST_CX)
139 | {
140 |   int v6;
141 | 
142 |   if ( A_interval >= LSZ_ST_CX )
143 |   {
144 |     PIX = MPS[CX];
145 |     ST[CX] = NMPS[ST_CX];
146 |   }
147 |   else
148 |   {
149 |     v6 = (MPS[CX] ^ 1) & 1;
150 |     PIX = v6;
151 |     ST[CX] = NLPS[ST_CX];
152 |     if ( SWITCH[ST_CX] == 1 )
153 |       MPS[CX] = v6;
154 |   }
155 | }
156 | 
157 | int JBigCodec::Decode1(int CX)
158 | {
159 |   A_interval -= LSZ[ST[CX]];
160 |   if ( A_interval <= C_register >> 16 )
161 |   {
162 |     LpsExchange(CX, ST[CX], LSZ[ST[CX]]);
163 |   }
164 |   else
165 |   {
166 |     PIX = MPS[CX];                   // difference
167 |     if ( A_interval > 0x7FFF )
168 |       return PIX;
169 |     MpsExchange(CX, ST[CX], LSZ[ST[CX]]);
170 |   }
171 |   this->RenormDe();
172 |   return PIX;
173 | }
174 | 
175 | int JBigCodec::Decode(char* inbuf, unsigned int size, unsigned int height, unsigned int bitwidth, unsigned int bitwidth_in_padded_bytes, char*outbuf)
176 | {
177 |   this->bitwidth = bitwidth;
178 |   this->height = height;
179 |   this->width_in_padded_bytes = bitwidth_in_padded_bytes;
180 |   memset(outbuf, 0, height * bitwidth_in_padded_bytes);
181 |   this->outptr = outbuf;
182 |   this->InitDecode(inbuf, size);
183 |   this->LowestDecode();
184 |   return 0;
185 | }
186 | 
187 | int JBigCodec::Decode(int CX)
188 | {
189 |   A_interval -= LSZ[ST[CX]];
190 |   if ( A_interval <= C_register >> 16 )
191 |   {
192 |     LpsExchange(CX, ST[CX], LSZ[ST[CX]]);
193 |     this->RenormDe();
194 |   }
195 |   else
196 |     {
197 |       if ( A_interval <= 0x7FFF )
198 |         {
199 |           MpsExchange(CX, ST[CX], LSZ[ST[CX]]);
200 |           this->RenormDe();
201 |         }
202 |       else
203 |         PIX = MPS[CX]; // difference
204 |     }
205 |   return PIX;
206 | }
207 | 
208 | /* size in ints! */
209 | void* JBigCodec::DupLine(char* buf, unsigned int dest_offset, unsigned int src_offset, unsigned int size)
210 | {
211 |   return memcpy(buf + dest_offset, buf + src_offset, 4 * size);
212 | }
213 | 
214 | int JBigCodec::GetBit(int line_offset, int bit_offset)
215 | {
216 |   static const unsigned char bitmask[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
217 | 
218 |   if (bit_offset < 0 || bit_offset >= this->bitwidth || line_offset <0)
219 |     return 0;
220 | 
221 |   if (line_offset >= this->height)
222 |     line_offset = this->height -1;
223 | 
224 |   return (*(char *)(this->outptr
225 |                     + this->width_in_padded_bytes * (this->height - line_offset - 1)
226 |                     + bit_offset / 3) & bitmask[bit_offset & 7]) != 0;
227 | }
228 | 
229 | int JBigCodec::GetCX(int a2, int a3)
230 | {
231 |   int v3;
232 |   int v4;
233 |   int v5;
234 |   int v6;
235 |   int v7;
236 | 
237 |   v3 = a3;
238 |   v4 = 2 * GetBit(a2 - 1, a3 + 2);
239 |   v5 = 2 * (GetBit(a2 - 1, v3 + 1) + v4);
240 |   v6 = 8 * (GetBit(a2 - 1, v3) + v5);
241 |   v7 = 2 * (GetBit(a2 - 2, v3 + 1) + v6);
242 |   return 2 * (GetBit(a2 - 2, v3) + v7);
243 | }
244 | 
245 | void JBigCodec::InitDecode(char* inbuf, unsigned int buflen)
246 | {
247 |   this->inbuf_length = buflen;
248 |   this->read_count = 0;
249 |   this->inbuf = (unsigned char*)inbuf;
250 |   memset((void *)this->MPS,   0, 0x4000u);
251 |   memset((void *)ST, 0, 0x4000u);
252 |   this->ByteIn();
253 |   this->C_register <<=8;
254 |   this->ByteIn();
255 |   this->C_register <<=8;
256 |   this->ByteIn();
257 |   this->A_interval = 0x10000;
258 | }
259 | 
260 | int JBigCodec::LowestDecode()
261 | {
262 |   int v2 = this->width_in_padded_bytes;
263 |   int v3 = v2 + 2;
264 |   int v4 = 3 * (v2 + 2);
265 |   int v5 = 2 * v2;
266 |   char *v15 = (char *)malloc(24 * (v2 + 2));
267 |   this->ClearLine(v15, 2 * v4);
268 |   int v6 = this->height;
269 |   if ( v6 )
270 |   {
271 |     char *v7 = v15 + 8 * v3;
272 |     char *v8 = v15 + 16 * v3;
273 |     int v9 = this->width_in_padded_bytes * (v6 - 1);
274 |     int v10 = 0;
275 |     char *v13;
276 |     for ( char *i = v15; ; i = v13 )
277 |     {
278 |       if ( this->Decode(0x29c) )
279 |       {
280 |         this->MakeTypicalLine(v10);
281 |         this->CopyLine(v8, v7, v5);
282 |       }
283 |       else
284 |       {
285 |         this->ClearLine(v8, v5);
286 |         unsigned int v14 = this->GetCX(v10, 0);
287 |         this->LowestDecodeLine(v9, v7, i, v14, v8);
288 |       }
289 |       ++v10;
290 |       if ( v10 >= this->height )
291 |         break;
292 |       v9 -= this->width_in_padded_bytes;
293 |       v13 = v7;
294 |       v7 = v8;
295 |       v8 = i;
296 |     }
297 |   }
298 |   if ( v15 )
299 |     free(v15);
300 |   return 0;
301 | }
302 | 
/* Decode one line pixel by pixel.
   scanline_offset: byte offset of this line's row inside outptr.
   a3, a4:          one-byte-per-pixel scratch lines; LowestDecode passes the
                    previous line as a3 and the line before that as a4.
   cx:              context value for the first pixel (from GetCX).
   a6:              scratch line for the current line; receives 1 for every
                    set pixel (the caller clears it beforehand).
   Always returns 0.  The loop body runs at least once, even if bitwidth
   is 0. */
int JBigCodec::LowestDecodeLine(unsigned int scanline_offset, char* a3, char* a4, unsigned int cx, char* a6)
{
  char *v7 = a3;
  char *v8 = a4;
  unsigned int v9 = cx;       /* running context */
  int v10 = 0;                /* pixel column */
  int v11;
  int result = 0;
  int v13;

  do
  {
    this->Decode1(v9);        /* decoded pixel is left in this->PIX */
    /* shift the context right by one and clear bit 9 */
    v13 = (v9 >> 1) & 0xFDFF;
    if ( (this->PIX & 0xFF) == 1 )
    {
      /* set the pixel in the packed output (MSB first within each byte) */
      *(this->outptr + (v10 >> 3) + scanline_offset) |= 1 << (~(char)v10 & 7);
      v13 |= 0x200u;          /* the pixel just decoded enters at bit 9 */
      *(a6 + v10) = 1;
    }
    /* bit 2 of the next context comes from a4 at column v10 + 2 */
    v11 = v13 | 4;
    if ( *(v8 + v10 + 2) != 1 )
      v11 &= 0xFFFBu;
    /* bit 7 of the next context comes from a3 at column v10 + 3 */
    v9 = v11 | 0x80;
    if ( *(v7 + v10 + 3) != 1 )
      v9 &= 0xFF7Fu;
    ++v10;
  }
  while ( v10 < this->bitwidth );
  return result;
}
334 | 
335 | /* this routine copies one line from the bottom upwards */
336 | void* JBigCodec::MakeTypicalLine(int number)
337 | {
338 |   if (number > 0)
339 |     {
340 |       int max = this->height - 1;
341 |       if (number <= max)
342 |         {
343 |           return this->DupLine(this->outptr,
344 |                         this->width_in_padded_bytes * (max-number),
345 |                         this->width_in_padded_bytes * (max-number) + this->width_in_padded_bytes,
346 |                         this->width_in_padded_bytes >> 2);       /* bytes / 4 */
347 |         }
348 |     }
349 |   return NULL;
350 | }
351 | 
352 | void JBigCodec::RenormDe()
353 | {
354 |   do
355 |   {
356 |     if ( !this->CT )
357 |     {
358 |       this->ByteIn();
359 |     }
360 |     this->A_interval *= 2;
361 |     this->C_register *= 2;
362 |     -- this->CT;
363 |   }
364 |   while ( this->A_interval <= 0x7FFF );
365 |   if ( !this->CT )
366 |     this->ByteIn();
367 |   return;
368 | }
369 | 


--------------------------------------------------------------------------------
/pdm.lock:
--------------------------------------------------------------------------------
  1 | [[package]]
  2 | name = "black"
  3 | version = "22.10.0"
  4 | requires_python = ">=3.7"
  5 | summary = "The uncompromising code formatter."
  6 | dependencies = [
  7 |     "click>=8.0.0",
  8 |     "mypy-extensions>=0.4.3",
  9 |     "pathspec>=0.9.0",
 10 |     "platformdirs>=2",
 11 |     "tomli>=1.1.0; python_full_version < \"3.11.0a7\"",
 12 | ]
 13 | 
 14 | [[package]]
 15 | name = "click"
 16 | version = "8.1.3"
 17 | requires_python = ">=3.7"
 18 | summary = "Composable command line interface toolkit"
 19 | dependencies = [
 20 |     "colorama; platform_system == \"Windows\"",
 21 | ]
 22 | 
 23 | [[package]]
 24 | name = "colorama"
 25 | version = "0.4.6"
 26 | requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
 27 | summary = "Cross-platform colored terminal text."
 28 | 
 29 | [[package]]
 30 | name = "flake8"
 31 | version = "5.0.4"
 32 | requires_python = ">=3.6.1"
 33 | summary = "the modular source code checker: pep8 pyflakes and co"
 34 | dependencies = [
 35 |     "mccabe<0.8.0,>=0.7.0",
 36 |     "pycodestyle<2.10.0,>=2.9.0",
 37 |     "pyflakes<2.6.0,>=2.5.0",
 38 | ]
 39 | 
 40 | [[package]]
 41 | name = "imagesize"
 42 | version = "1.3.0"
 43 | requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 44 | summary = "Getting image size from png/jpeg/jpeg2000/gif file"
 45 | 
 46 | [[package]]
 47 | name = "isort"
 48 | version = "5.10.1"
 49 | requires_python = ">=3.6.1,<4.0"
 50 | summary = "A Python utility / library to sort Python imports."
 51 | 
 52 | [[package]]
 53 | name = "mccabe"
 54 | version = "0.7.0"
 55 | requires_python = ">=3.6"
 56 | summary = "McCabe checker, plugin for flake8"
 57 | 
 58 | [[package]]
 59 | name = "mypy-extensions"
 60 | version = "0.4.3"
 61 | summary = "Experimental type system extensions for programs checked with the mypy typechecker."
 62 | 
 63 | [[package]]
 64 | name = "pathspec"
 65 | version = "0.10.2"
 66 | requires_python = ">=3.7"
 67 | summary = "Utility library for gitignore style pattern matching of file paths."
 68 | 
 69 | [[package]]
 70 | name = "platformdirs"
 71 | version = "2.5.4"
 72 | requires_python = ">=3.7"
 73 | summary = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
 74 | 
 75 | [[package]]
 76 | name = "pycodestyle"
 77 | version = "2.9.1"
 78 | requires_python = ">=3.6"
 79 | summary = "Python style guide checker"
 80 | 
 81 | [[package]]
 82 | name = "pyflakes"
 83 | version = "2.5.0"
 84 | requires_python = ">=3.6"
 85 | summary = "passive checker of Python programs"
 86 | 
 87 | [[package]]
 88 | name = "pypdf2"
 89 | version = "2.2.0"
 90 | requires_python = ">=3.6"
 91 | summary = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
 92 | 
 93 | [[package]]
 94 | name = "setuptools"
 95 | version = "65.6.0"
 96 | requires_python = ">=3.7"
 97 | summary = "Easily download, build, install, upgrade, and uninstall Python packages"
 98 | 
 99 | [[package]]
100 | name = "tomli"
101 | version = "2.0.1"
102 | requires_python = ">=3.7"
103 | summary = "A lil' TOML parser"
104 | 
105 | [metadata]
106 | lock_version = "4.0"
107 | content_hash = "sha256:1831b76c8f8fb125c8f108835e9163ec885af94991206055c9aeba6c98c5468e"
108 | 
109 | [metadata.files]
110 | "black 22.10.0" = [
111 |     {url = "https://mirrors.aliyun.com/pypi/packages/2c/11/f2737cd3b458d91401801e83a014e87c63e8904dc063200f77826c352f54/black-22.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2039230db3c6c639bd84efe3292ec7b06e9214a2992cd9beb293d639c6402edb"},
112 |     {url = "https://mirrors.aliyun.com/pypi/packages/3d/c5/b3ab9b563f35fb284d37ab2b14acaed9a27d8cdea9c31364766eb54946a7/black-22.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:9311e99228ae10023300ecac05be5a296f60d2fd10fff31cf5c1fa4ca4b1988d"},
113 |     {url = "https://mirrors.aliyun.com/pypi/packages/56/df/913d71817c7034edba25d596c54f782c2f809b6af30367d2f00309e8890a/black-22.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:21199526696b8f09c3997e2b4db8d0b108d801a348414264d2eb8eb2532e540d"},
114 |     {url = "https://mirrors.aliyun.com/pypi/packages/69/21/846c95710cc6561ba980bd6c72479dbcdde742e927ff5ef7340916d003ac/black-22.10.0-1fixedarch-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:2644b5d63633702bc2c5f3754b1b475378fbbfb481f62319388235d0cd104c2d"},
115 |     {url = "https://mirrors.aliyun.com/pypi/packages/69/84/903cdf41514088d5a716538cb189c471ab34e56ae9a1c2da6b8bfe8e4dbf/black-22.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:974308c58d057a651d182208a484ce80a26dac0caef2895836a92dd6ebd725e0"},
116 |     {url = "https://mirrors.aliyun.com/pypi/packages/71/f8/57e47ea67f59613c4368a952062bc3429131249920cffbb8362fd404b733/black-22.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fba8a281e570adafb79f7755ac8721b6cf1bbf691186a287e990c7929c7692ff"},
117 |     {url = "https://mirrors.aliyun.com/pypi/packages/86/da/edebcc6c13441d91eff6761e50512bc6d6886a556dc5357b399694122b4f/black-22.10.0-1fixedarch-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:197df8509263b0b8614e1df1756b1dd41be6738eed2ba9e9769f3880c2b9d7b6"},
118 |     {url = "https://mirrors.aliyun.com/pypi/packages/91/e6/d9b78987d7d903369ba1a0b795bce4de06f0155be6609f15e8950aef8f7e/black-22.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:444ebfb4e441254e87bad00c661fe32df9969b2bf224373a448d8aca2132b395"},
119 |     {url = "https://mirrors.aliyun.com/pypi/packages/a3/89/629fca2eea0899c06befaa58dc0f49d56807d454202bb2e54bd0d98c77f3/black-22.10.0.tar.gz", hash = "sha256:f513588da599943e0cde4e32cc9879e825d58720d6557062d1098c5ad80080e1"},
120 |     {url = "https://mirrors.aliyun.com/pypi/packages/a5/5f/9cfc6dd95965f8df30194472543e6f0515a10d78ea5378426ef1546735c7/black-22.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ff67aec0a47c424bc99b71005202045dc09270da44a27848d534600ac64fc7"},
121 |     {url = "https://mirrors.aliyun.com/pypi/packages/a6/84/5c3f3ffc4143fa7e208d745d2239d915e74d3709fdbc64c3e98d3fd27e56/black-22.10.0-1fixedarch-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5d8f74030e67087b219b032aa33a919fae8806d49c867846bfacde57f43972ef"},
122 |     {url = "https://mirrors.aliyun.com/pypi/packages/ab/15/61119d166a44699827c112d7c4726421f14323c2cb7aa9f4c26628f237f9/black-22.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:432247333090c8c5366e69627ccb363bc58514ae3e63f7fc75c54b1ea80fa7de"},
123 |     {url = "https://mirrors.aliyun.com/pypi/packages/ae/49/ea03c318a25be359b8e5178a359d47e2da8f7524e1522c74b8f74c66b6f8/black-22.10.0-1fixedarch-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5cc42ca67989e9c3cf859e84c2bf014f6633db63d1cbdf8fdb666dcd9e77e3fa"},
124 |     {url = "https://mirrors.aliyun.com/pypi/packages/b0/9e/fa912c5ae4b8eb6d36982fc8ac2d779cf944dbd7c3c1fe7a28acf462c1ed/black-22.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b49776299fece66bffaafe357d929ca9451450f5466e997a7285ab0fe28e3b"},
125 |     {url = "https://mirrors.aliyun.com/pypi/packages/b9/51/403b0b0eb9fb412ca02b79dc38472469f2f88c9aacc6bb5262143e4ff0bc/black-22.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72ef3925f30e12a184889aac03d77d031056860ccae8a1e519f6cbb742736383"},
126 |     {url = "https://mirrors.aliyun.com/pypi/packages/ce/6f/74492b8852ee4f2ad2178178f6b65bc8fc80ad539abe56c1c23eab6732e2/black-22.10.0-py3-none-any.whl", hash = "sha256:c957b2b4ea88587b46cf49d1dc17681c1e672864fd7af32fc1e9664d572b3458"},
127 |     {url = "https://mirrors.aliyun.com/pypi/packages/d0/5a/5f31494e3acbb6319ee60c3a3a09d3e536a3fd2353f76af9cbff799c4999/black-22.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:915ace4ff03fdfff953962fa672d44be269deb2eaf88499a0f8805221bc68c87"},
128 |     {url = "https://mirrors.aliyun.com/pypi/packages/e2/2f/a8406a9e337a213802aa90a3e9fbf90c86f3edce92f527255fd381309b77/black-22.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b9b29da4f564ba8787c119f37d174f2b69cdfdf9015b7d8c5c16121ddc054ae"},
129 |     {url = "https://mirrors.aliyun.com/pypi/packages/e3/b4/9203f1a0c99aa30389b61fa8cb54bc9f4bf16ac3aa74630c6b974ed3f3b0/black-22.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e464456d24e23d11fced2bc8c47ef66d471f845c7b7a42f3bd77bf3d1789650"},
130 |     {url = "https://mirrors.aliyun.com/pypi/packages/f2/23/f4278377cabf882298b4766e977fd04377f288d1ccef706953076a1e0598/black-22.10.0-1fixedarch-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:e41a86c6c650bcecc6633ee3180d80a025db041a8e2398dcc059b3afa8382cd4"},
131 |     {url = "https://mirrors.aliyun.com/pypi/packages/ff/ce/22281871536b3d79474fd44d48dad48f7cbc5c3982bddf6a7495e7079d00/black-22.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:819dc789f4498ecc91438a7de64427c73b45035e2e3680c92e18795a839ebb66"},
132 | ]
133 | "click 8.1.3" = [
134 |     {url = "https://mirrors.aliyun.com/pypi/packages/59/87/84326af34517fca8c58418d148f2403df25303e02736832403587318e9e8/click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
135 |     {url = "https://mirrors.aliyun.com/pypi/packages/c2/f1/df59e28c642d583f7dacffb1e0965d0e00b218e0186d7858ac5233dce840/click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
136 | ]
137 | "colorama 0.4.6" = [
138 |     {url = "https://mirrors.aliyun.com/pypi/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
139 |     {url = "https://mirrors.aliyun.com/pypi/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
140 | ]
141 | "flake8 5.0.4" = [
142 |     {url = "https://mirrors.aliyun.com/pypi/packages/ad/00/9808c62b2d529cefc69ce4e4a1ea42c0f855effa55817b7327ec5b75e60a/flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"},
143 |     {url = "https://mirrors.aliyun.com/pypi/packages/cf/a0/b881b63a17a59d9d07f5c0cc91a29182c8e8a9aa2bde5b3b2b16519c02f4/flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"},
144 | ]
145 | "imagesize 1.3.0" = [
146 |     {url = "https://mirrors.aliyun.com/pypi/packages/60/d6/5e803b17f4d42e085c365b44fda34deb0d8675a1a910635930b831c43f07/imagesize-1.3.0-py2.py3-none-any.whl", hash = "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c"},
147 |     {url = "https://mirrors.aliyun.com/pypi/packages/f6/27/b147794d43249e8303a06f427e407a090696b65b81045e36f8873d8d8a42/imagesize-1.3.0.tar.gz", hash = "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"},
148 | ]
149 | "isort 5.10.1" = [
150 |     {url = "https://mirrors.aliyun.com/pypi/packages/ab/e9/964cb0b2eedd80c92f5172f1f8ae0443781a9d461c1372a3ce5762489593/isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"},
151 |     {url = "https://mirrors.aliyun.com/pypi/packages/b8/5b/f18e227df38b94b4ee30d2502fd531bebac23946a2497e5595067a561274/isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"},
152 | ]
153 | "mccabe 0.7.0" = [
154 |     {url = "https://mirrors.aliyun.com/pypi/packages/27/1a/1f68f9ba0c207934b35b86a8ca3aad8395a3d6dd7921c0686e23853ff5a9/mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
155 |     {url = "https://mirrors.aliyun.com/pypi/packages/e7/ff/0ffefdcac38932a54d2b5eed4e0ba8a408f215002cd178ad1df0f2806ff8/mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
156 | ]
157 | "mypy-extensions 0.4.3" = [
158 |     {url = "https://mirrors.aliyun.com/pypi/packages/5c/eb/975c7c080f3223a5cdaff09612f3a5221e4ba534f7039db34c35d95fa6a5/mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
159 |     {url = "https://mirrors.aliyun.com/pypi/packages/63/60/0582ce2eaced55f65a4406fc97beba256de4b7a95a0034c6576458c6519f/mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
160 | ]
161 | "pathspec 0.10.2" = [
162 |     {url = "https://mirrors.aliyun.com/pypi/packages/42/79/94b21d5fabb97749ca94590315abe150a750483c87add8543781bcb6cd26/pathspec-0.10.2-py3-none-any.whl", hash = "sha256:88c2606f2c1e818b978540f73ecc908e13999c6c3a383daf3705652ae79807a5"},
163 |     {url = "https://mirrors.aliyun.com/pypi/packages/a2/29/959c72e1a6c3c25eaa46b9bfcc7fd401f65af83163d4796af09272c83c8a/pathspec-0.10.2.tar.gz", hash = "sha256:8f6bf73e5758fd365ef5d58ce09ac7c27d2833a8d7da51712eac6e27e35141b0"},
164 | ]
165 | "platformdirs 2.5.4" = [
166 |     {url = "https://mirrors.aliyun.com/pypi/packages/61/e0/15ba41c6716acb033c3793be3a02f26c53914ecd9bdd6b315001f8f5f581/platformdirs-2.5.4-py3-none-any.whl", hash = "sha256:af0276409f9a02373d540bf8480021a048711d572745aef4b7842dad245eba10"},
167 |     {url = "https://mirrors.aliyun.com/pypi/packages/cb/5f/dda8451435f17ed8043eab5ffe04e47d703debe8fe845eb074f42260e50a/platformdirs-2.5.4.tar.gz", hash = "sha256:1006647646d80f16130f052404c6b901e80ee4ed6bef6792e1f238a8969106f7"},
168 | ]
169 | "pycodestyle 2.9.1" = [
170 |     {url = "https://mirrors.aliyun.com/pypi/packages/67/e4/fc77f1039c34b3612c4867b69cbb2b8a4e569720b1f19b0637002ee03aff/pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
171 |     {url = "https://mirrors.aliyun.com/pypi/packages/b6/83/5bcaedba1f47200f0665ceb07bcb00e2be123192742ee0edfb66b600e5fd/pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
172 | ]
173 | "pyflakes 2.5.0" = [
174 |     {url = "https://mirrors.aliyun.com/pypi/packages/07/92/f0cb5381f752e89a598dd2850941e7f570ac3cb8ea4a344854de486db152/pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
175 |     {url = "https://mirrors.aliyun.com/pypi/packages/dc/13/63178f59f74e53acc2165aee4b002619a3cfa7eeaeac989a9eb41edf364e/pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
176 | ]
177 | "pypdf2 2.2.0" = [
178 |     {url = "https://mirrors.aliyun.com/pypi/packages/3f/1e/9204070476be4b6f598e4c042590385341c6019c862cc73892f31f17b45c/PyPDF2-2.2.0-py3-none-any.whl", hash = "sha256:6167a965a2b44f6e763f0bb2028810898bace2caae5ddd5040a8b57e1c6aaa5a"},
179 |     {url = "https://mirrors.aliyun.com/pypi/packages/b8/f4/8bbb7a9fa8b6bf7eb55c0d14f31396d2b7812c270454ab02e582c281ad68/PyPDF2-2.2.0.tar.gz", hash = "sha256:12a289d7be1cac0b066b05854ebc40dfaaeea31244ec45ea02682e51deefb7e8"},
180 | ]
181 | "setuptools 65.6.0" = [
182 |     {url = "https://mirrors.aliyun.com/pypi/packages/09/b6/33512596fb92ba68f7c45e9bbc5e1bb9b24fbd941f9aece250fb420c2f5c/setuptools-65.6.0.tar.gz", hash = "sha256:d1eebf881c6114e51df1664bc2c9133d022f78d12d5f4f665b9191f084e2862d"},
183 |     {url = "https://mirrors.aliyun.com/pypi/packages/1f/97/c03668380f278f1f8b0486d820c142cf224bba1bd78416e1797b52e0e81c/setuptools-65.6.0-py3-none-any.whl", hash = "sha256:6211d2f5eddad8757bd0484923ca7c0a6302ebc4ab32ea5e94357176e0ca0840"},
184 | ]
185 | "tomli 2.0.1" = [
186 |     {url = "https://mirrors.aliyun.com/pypi/packages/97/75/10a9ebee3fd790d20926a90a2547f0bf78f371b2f13aa822c759680ca7b9/tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
187 |     {url = "https://mirrors.aliyun.com/pypi/packages/c0/3f/d7af728f075fb08564c5949a9c95e44352e23dee646869fa104a3b2060a3/tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
188 | ]
189 | 


--------------------------------------------------------------------------------
/caj2pdf/cajparser.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import struct
  3 | from shutil import copy
  4 | from subprocess import STDOUT, CalledProcessError, check_output
  5 | 
  6 | from PyPDF2.errors import PdfReadError
  7 | 
  8 | from .utils import (
  9 |     add_outlines,
 10 |     fnd,
 11 |     fnd_all,
 12 |     fnd_rvrs,
 13 |     fnd_unuse_no,
 14 |     find_redundant_images,
 15 | )
 16 | 
 17 | KDH_PASSPHRASE = b"FZHMEI"
 18 | 
 19 | printables = "".join(
 20 |     [
 21 |         (len(repr(chr(x))) == 3) and (x != 47) and (x < 128) and chr(x) or "."
 22 |         for x in range(256)
 23 |     ]
 24 | )
 25 | 
 26 | image_type = {0: "JBIG", 1: "JPEG", 2: "JPEG", 3: "JBIG2"}  # up-side-down
 27 | 
 28 | 
 29 | class CAJParser(object):
    def __init__(self, filename):
        """Identify the container format of *filename* from its first bytes.

        Sets ``self.format`` to one of "C8", "HN", "CAJ", "PDF", "KDH" or
        "TEB" and, depending on the format, the header offsets used by the
        other methods:

        * C8 and both HN variants: ``_PAGE_NUMBER_OFFSET``,
          ``_TOC_NUMBER_OFFSET``, ``_TOC_END_OFFSET``, ``_PAGEDATA_OFFSET``.
        * CAJ: only ``_PAGE_NUMBER_OFFSET`` and ``_TOC_NUMBER_OFFSET``.
        * PDF, KDH, TEB: no offsets are set here.

        Raises ``SystemExit("Unknown file type.")`` when the signature is not
        recognised or cannot be decoded as gb18030.
        """
        self.filename = filename
        try:
            with open(filename, "rb") as caj:
                caj_read4 = caj.read(4)
                # Signature byte 0xC8: "C8" layout.
                if caj_read4[0:1] == b"\xc8":
                    self.format = "C8"
                    self._PAGE_NUMBER_OFFSET = 0x08
                    self._TOC_NUMBER_OFFSET = 0  # No TOC
                    self._TOC_END_OFFSET = 0x50
                    # self.page_num re-opens the file and reads the count at
                    # _PAGE_NUMBER_OFFSET, so that offset must be set first.
                    self._PAGEDATA_OFFSET = self._TOC_END_OFFSET + 20 * self.page_num
                    return
                if caj_read4[0:2] == b"HN":
                    # "HN" whose bytes 4-5 are c8 00: HN layout without a TOC.
                    if (
                        caj.read(2) == b"\xc8\x00"
                    ):  # Most of them are: 90 01, handled later
                        self.format = "HN"
                        self._PAGE_NUMBER_OFFSET = 0x90
                        self._TOC_NUMBER_OFFSET = 0
                        self._TOC_END_OFFSET = 0xD8
                        self._PAGEDATA_OFFSET = (
                            self._TOC_END_OFFSET + 20 * self.page_num
                        )
                        return
                # Generic case: the 4 signature bytes with NULs removed,
                # decoded as gb18030 (may raise UnicodeDecodeError).
                fmt = (
                    struct.unpack("4s", caj_read4)[0]
                    .replace(b"\x00", b"")
                    .decode("gb18030")
                )
            if fmt == "CAJ":
                self.format = "CAJ"
                self._PAGE_NUMBER_OFFSET = 0x10
                self._TOC_NUMBER_OFFSET = 0x110
            elif fmt == "HN":
                self.format = "HN"
                self._PAGE_NUMBER_OFFSET = 0x90
                self._TOC_NUMBER_OFFSET = 0x158

                # TOC = [toc_num] followed by [toc_entry * toc_num]
                # followed by [Page Info struct (20-byte) * page_num], followed by Page Data
                self._TOC_END_OFFSET = (
                    self._TOC_NUMBER_OFFSET + 4 + 0x134 * self.toc_num
                )
                self._PAGEDATA_OFFSET = self._TOC_END_OFFSET + 20 * self.page_num
            elif fmt == "%PDF":
                self.format = "PDF"
            elif fmt == "KDH ":
                self.format = "KDH"
            elif fmt == "TEB":
                self.format = "TEB"
            else:
                self.format = None
                raise SystemExit("Unknown file type.")
        except UnicodeDecodeError:
            # Signature bytes are not valid gb18030 text.
            raise SystemExit("Unknown file type.")
 85 | 
 86 |     @property
 87 |     def page_num(self):
 88 |         with open(self.filename, "rb") as caj:
 89 |             caj.seek(self._PAGE_NUMBER_OFFSET)
 90 |             [page_num] = struct.unpack("i", caj.read(4))
 91 |             return page_num
 92 | 
 93 |     @property
 94 |     def toc_num(self):
 95 |         if self._TOC_NUMBER_OFFSET == 0:
 96 |             return 0
 97 |         with open(self.filename, "rb") as caj:
 98 |             caj.seek(self._TOC_NUMBER_OFFSET)
 99 |             [toc_num] = struct.unpack("i", caj.read(4))
100 |             return toc_num
101 | 
102 |     def get_toc(self, verbose=False):
103 |         toc = []
104 |         if self._TOC_NUMBER_OFFSET == 0:
105 |             return toc
106 |         with open(self.filename, "rb") as caj:
107 |             for i in range(self.toc_num):
108 |                 caj.seek(self._TOC_NUMBER_OFFSET + 4 + 0x134 * i)
109 |                 toc_bytes = struct.unpack("256s24s12s12si", caj.read(0x134))
110 |                 ttl_end = toc_bytes[0].find(b"\x00")
111 |                 title = toc_bytes[0][0:ttl_end].decode("gb18030").encode("utf-8")
112 |                 pg_end = toc_bytes[2].find(b"\x00")
113 |                 page = int(toc_bytes[2][0:pg_end])
114 |                 level = toc_bytes[4]
115 |                 toc_entry = {"title": title, "page": page, "level": level}
116 |                 if verbose:
117 |                     print("   " * (level - 1), title.decode("utf-8"))
118 |                 toc.append(toc_entry)
119 |             if verbose:
120 |                 print(
121 |                     "TOC END: 0x%04X"
122 |                     % (self._TOC_NUMBER_OFFSET + 4 + 0x134 * self.toc_num)
123 |                 )
124 |         return toc
125 | 
126 |     def output_toc(self, dest):
127 |         toc_items = self.get_toc()
128 |         with open(dest, "wb") as f:
129 |             for toc in toc_items:
130 |                 f.write(
131 |                     b"    " * (toc["level"] - 1)
132 |                     + toc["title"]
133 |                     + b"    "
134 |                     + str(toc["page"]).encode("utf-8")
135 |                     + b"\n"
136 |                 )
137 | 
138 |     def convert(self, dest):
139 |         if self.format == "CAJ":
140 |             self._convert_caj(dest)
141 |         elif self.format == "HN":
142 |             self._convert_hn(dest)
143 |         elif self.format == "C8":
144 |             self._convert_hn(dest)
145 |         elif self.format == "PDF":
146 |             self._convert_pdf(dest)
147 |         elif self.format == "KDH":
148 |             self._convert_kdh(dest)
149 | 
150 |     def parse(self):
151 |         if self.format == "CAJ":
152 |             pass
153 |         elif self.format == "HN":
154 |             self._parse_hn()
155 |         elif self.format == "C8":
156 |             self._parse_hn()
157 |         elif self.format == "PDF":
158 |             pass
159 |         elif self.format == "KDH":
160 |             pass
161 | 
162 |     def text_extract(self):
163 |         if self.format == "CAJ":
164 |             pass
165 |         if self.format == "HN":
166 |             self._text_extract_hn()
167 |         elif self.format == "C8":
168 |             self._text_extract_hn()
169 |         elif self.format == "PDF":
170 |             pass
171 |         elif self.format == "KDH":
172 |             pass
173 | 
174 |     def _convert_caj(self, dest):
175 |         caj = open(self.filename, "rb")
176 | 
177 |         # Extract original PDF data (and add header)
178 |         caj.seek(self._PAGE_NUMBER_OFFSET + 4)
179 |         [pdf_start_pointer] = struct.unpack("i", caj.read(4))
180 |         caj.seek(pdf_start_pointer)
181 |         [pdf_start] = struct.unpack("i", caj.read(4))
182 |         pdf_end = fnd_all(caj, b"endobj")[-1] + 6
183 |         pdf_length = pdf_end - pdf_start
184 |         caj.seek(pdf_start)
185 |         pdf_data = b"%PDF-1.3\r\n" + caj.read(pdf_length) + b"\r\n"
186 |         with open("pdf.tmp", "wb") as f:
187 |             f.write(pdf_data)
188 |         pdf = open("pdf.tmp", "rb")
189 | 
190 |         # deal with disordered PDF data
191 |         endobj_addr = fnd_all(pdf, b"endobj")
192 |         obj_no = []
193 |         for addr in endobj_addr:
194 |             startobj = fnd_rvrs(pdf, b" 0 obj", addr)
195 |             startobj1 = fnd_rvrs(pdf, b"\r", startobj)
196 |             startobj2 = fnd_rvrs(pdf, b"\n", startobj)
197 |             startobj = max(startobj1, startobj2)
198 |             length = fnd(pdf, b" ", startobj) - startobj
199 |             pdf.seek(startobj)
200 |             [no] = struct.unpack(str(length) + "s", pdf.read(length))
201 |             if int(no) not in obj_no:
202 |                 obj_no.append(int(no))
203 |                 obj_len = addr - startobj + 6
204 |                 pdf.seek(startobj)
205 |                 [obj] = struct.unpack(str(obj_len) + "s", pdf.read(obj_len))
206 | 
207 |         # Add Catalog (find obj_no of pages)
208 |         inds_addr = [i + 8 for i in fnd_all(pdf, b"/Parent")]
209 |         inds = []
210 |         for addr in inds_addr:
211 |             length = fnd(pdf, b" ", addr) - addr
212 |             pdf.seek(addr)
213 |             [ind] = struct.unpack(str(length) + "s", pdf.read(length))
214 |             inds.append(int(ind))
215 |         # get pages_obj_no list containing distinct elements
216 |         # & find missing pages object(s) -- top pages object(s) in pages_obj_no
217 |         pages_obj_no = []
218 |         top_pages_obj_no = []
219 |         for ind in inds:
220 |             if (ind not in pages_obj_no) and (ind not in top_pages_obj_no):
221 |                 if fnd(pdf, bytes("\r{0} 0 obj".format(ind), "utf-8")) == -1:
222 |                     top_pages_obj_no.append(ind)
223 |                 else:
224 |                     pages_obj_no.append(ind)
225 |         single_pages_obj_missed = len(top_pages_obj_no) == 1
226 |         multi_pages_obj_missed = len(top_pages_obj_no) > 1
227 |         # generate catalog object
228 |         catalog_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no)
229 |         obj_no.append(catalog_obj_no)
230 |         root_pages_obj_no = None
231 |         if multi_pages_obj_missed:
232 |             root_pages_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no)
233 |         elif single_pages_obj_missed:
234 |             root_pages_obj_no = top_pages_obj_no[0]
235 |             top_pages_obj_no = pages_obj_no
236 |         else:  # root pages object exists, then find the root pages object #
237 |             found = False
238 |             for pon in pages_obj_no:
239 |                 tmp_addr = fnd(pdf, bytes("\r{0} 0 obj".format(pon), "utf-8"))
240 |                 while True:
241 |                     pdf.seek(tmp_addr)
242 |                     [_str] = struct.unpack("6s", pdf.read(6))
243 |                     if _str == b"Parent":
244 |                         break
245 |                     elif _str == b"endobj":
246 |                         root_pages_obj_no = pon
247 |                         found = True
248 |                         break
249 |                     tmp_addr = tmp_addr + 1
250 |                 if found:
251 |                     break
252 |         catalog = bytes(
253 |             "{0} 0 obj\r<</Type /Catalog\r/Pages {1} 0 R\r>>\rendobj\r".format(
254 |                 catalog_obj_no, root_pages_obj_no
255 |             ),
256 |             "utf-8",
257 |         )
258 |         pdf_data += catalog
259 |         pdf.close()
260 |         with open("pdf.tmp", "wb") as f:
261 |             f.write(pdf_data)
262 |         pdf = open("pdf.tmp", "rb")
263 | 
264 |         # Add Pages obj and EOF mark
265 |         # if root pages object exist, pass
266 |         # deal with single missing pages object
267 |         if single_pages_obj_missed or multi_pages_obj_missed:
268 |             inds_str = ["{0} 0 R".format(i) for i in top_pages_obj_no]
269 |             kids_str = "[{0}]".format(" ".join(inds_str))
270 |             pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format(
271 |                 root_pages_obj_no, kids_str, self.page_num
272 |             )
273 |             pdf_data += bytes(pages_str, "utf-8")
274 |             pdf.close()
275 |             with open("pdf.tmp", "wb") as f:
276 |                 f.write(pdf_data)
277 |             pdf = open("pdf.tmp", "rb")
278 |         # deal with multiple missing pages objects
279 |         if multi_pages_obj_missed:
280 |             kids_dict = {i: [] for i in top_pages_obj_no}
281 |             count_dict = {i: 0 for i in top_pages_obj_no}
282 |             for tpon in top_pages_obj_no:
283 |                 kids_addr = fnd_all(pdf, bytes("/Parent {0} 0 R".format(tpon), "utf-8"))
284 |                 for kid in kids_addr:
285 |                     ind = fnd_rvrs(pdf, b"obj", kid) - 4
286 |                     addr = fnd_rvrs(pdf, b"\r", ind)
287 |                     length = fnd(pdf, b" ", addr) - addr
288 |                     pdf.seek(addr)
289 |                     [ind] = struct.unpack(str(length) + "s", pdf.read(length))
290 |                     kids_dict[tpon].append(int(ind))
291 |                     type_addr = fnd(pdf, b"/Type", addr) + 5
292 |                     tmp_addr = fnd(pdf, b"/", type_addr) + 1
293 |                     pdf.seek(tmp_addr)
294 |                     [_type] = struct.unpack("5s", pdf.read(5))
295 |                     if _type == b"Pages":
296 |                         cnt_addr = fnd(pdf, b"/Count ", addr) + 7
297 |                         pdf.seek(cnt_addr)
298 |                         [_str] = struct.unpack("1s", pdf.read(1))
299 |                         cnt_len = 0
300 |                         while _str not in [b" ", b"\r", b"/"]:
301 |                             cnt_len += 1
302 |                             pdf.seek(cnt_addr + cnt_len)
303 |                             [_str] = struct.unpack("1s", pdf.read(1))
304 |                         pdf.seek(cnt_addr)
305 |                         [cnt] = struct.unpack(str(cnt_len) + "s", pdf.read(cnt_len))
306 |                         count_dict[tpon] += int(cnt)
307 |                     else:  # _type == b"Page"
308 |                         count_dict[tpon] += 1
309 |                 kids_no_str = ["{0} 0 R".format(i) for i in kids_dict[tpon]]
310 |                 kids_str = "[{0}]".format(" ".join(kids_no_str))
311 |                 pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format(
312 |                     tpon, kids_str, count_dict[tpon]
313 |                 )
314 |                 pdf_data += bytes(pages_str, "utf-8")
315 |         pdf_data += bytes("\n%%EOF\r", "utf-8")
316 |         pdf.close()
317 |         with open("pdf.tmp", "wb") as f:
318 |             f.write(pdf_data)
319 | 
320 |         # Use mutool to repair xref
321 |         try:
322 |             check_output(["mutool", "clean", "pdf.tmp", "pdf_toc.pdf"], stderr=STDOUT)
323 |         except CalledProcessError as e:
324 |             print(e.output.decode("utf-8"))
325 |             raise SystemExit(
326 |                 "Command mutool returned non-zero exit status " + str(e.returncode)
327 |             )
328 | 
329 |         # Add Outlines
330 |         try:
331 |             add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
332 |         except PdfReadError as e:
333 |             print("PdfReadError:", str(e))
334 |             copy("pdf_toc.pdf", dest)
335 |             pass
336 |         os.remove("pdf.tmp")
337 |         os.remove("pdf_toc.pdf")
338 | 
339 |     def _convert_hn(self, dest):
340 |         caj = open(self.filename, "rb")
341 |         image_list = []
342 | 
343 |         import zlib
344 | 
345 |         from .pdfwutils import Colorspace, ImageFormat, convert_ImageList
346 | 
347 |         for i in range(self.page_num):
348 |             caj.seek(self._TOC_END_OFFSET + i * 20)
349 |             [
350 |                 page_data_offset,
351 |                 size_of_text_section,
352 |                 images_per_page,
353 |                 page_no,
354 |                 unk2,
355 |                 next_page_data_offset,
356 |             ] = struct.unpack("iihhii", caj.read(20))
357 |             caj.seek(page_data_offset)
358 |             text_header_read32 = caj.read(32)
359 |             if (text_header_read32[8:20] == b"COMPRESSTEXT") or (
360 |                 text_header_read32[0:12] == b"COMPRESSTEXT"
361 |             ):
362 |                 coff = 8
363 |                 if text_header_read32[0:12] == b"COMPRESSTEXT":
364 |                     coff = 0
365 |                 [expanded_text_size] = struct.unpack(
366 |                     "i", text_header_read32[12 + coff : 16 + coff]
367 |                 )
368 |                 caj.seek(page_data_offset + 16 + coff)
369 |                 data = caj.read(size_of_text_section - 16 - coff)
370 |                 output = zlib.decompress(data, bufsize=expanded_text_size)
371 |                 if len(output) != expanded_text_size:
372 |                     raise SystemExit("Unexpected:", len(output), expanded_text_size)
373 |             else:
374 |                 caj.seek(page_data_offset)
375 |                 output = caj.read(size_of_text_section)
376 |             from .HNParsePage import HNParsePage
377 | 
378 |             page_style = next_page_data_offset > page_data_offset
379 |             page_data = HNParsePage(output, page_style)
380 | 
381 |             current_offset = page_data_offset + size_of_text_section
382 |             (found, images_per_page) = find_redundant_images(
383 |                 caj, current_offset, images_per_page
384 |             )
385 |             if found:
386 |                 print(
387 |                     "Page %d, skipping %d redundant images"
388 |                     % (i + 1, images_per_page * (images_per_page - 1))
389 |                 )
390 | 
391 |             if images_per_page > 1:
392 |                 if len(page_data.figures) == images_per_page:
393 |                     if (page_data.figures[0][0] == 0) and (
394 |                         page_data.figures[0][1] == 0
395 |                     ):
396 |                         image_list.append(None)
397 |                         image_list.append(page_data.figures)
398 |                     else:
399 |                         print(
400 |                             "Page %d, Image Count %d, first image not at origin, expanding to %d pages"
401 |                             % (i + 1, len(page_data.figures), images_per_page)
402 |                         )
403 |                 else:
404 |                     print(
405 |                         "Page %d, Image Count %d != %d"
406 |                         % (i + 1, len(page_data.figures), images_per_page)
407 |                     )
408 |                     if len(page_data.figures) > images_per_page:
409 |                         print("\tTruncating Page %d," % (i + 1), page_data.figures)
410 |                         image_list.append(None)
411 |                         image_list.append(page_data.figures[0:images_per_page])
412 |                     else:
413 |                         print(
414 |                             "Page %d expanding to %d separate image pages"
415 |                             % (i + 1, images_per_page)
416 |                         )
417 |             elif images_per_page == 1:
418 |                 if (len(page_data.figures) == 0) or (
419 |                     (len(page_data.figures) > 0)
420 |                     and (
421 |                         not (
422 |                             (page_data.figures[0][0] == 0)
423 |                             and (page_data.figures[0][1] == 0)
424 |                         )
425 |                     )
426 |                 ):
427 |                     print(
428 |                         "Page %d possibly text-only + single figure(%d)"
429 |                         % (i + 1, len(page_data.figures))
430 |                     )
431 |             else:
432 |                 # don't care about images_per_page == 0
433 |                 pass
434 |             for j in range(images_per_page):
435 |                 caj.seek(current_offset)
436 |                 read32 = caj.read(32)
437 |                 [
438 |                     image_type_enum,
439 |                     offset_to_image_data,
440 |                     size_of_image_data,
441 |                 ] = struct.unpack("iii", read32[0:12])
442 |                 if offset_to_image_data != current_offset + 12:
443 |                     raise SystemExit("unusual image offset")
444 |                 caj.seek(offset_to_image_data)
445 |                 image_data = caj.read(size_of_image_data)
446 |                 current_offset = offset_to_image_data + size_of_image_data
447 |                 if image_type[image_type_enum] == "JBIG":
448 |                     from .dep.jbigdec import CImage
449 | 
450 |                     cimage = CImage(image_data)
451 |                     out = cimage.DecodeJbig()
452 |                     # PBM is only padded to 8 rather than 32.
453 |                     # If the padding is larger, write padded file.
454 |                     width = cimage.width
455 |                     if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
456 |                         width = cimage.bytes_per_line << 3
457 |                     image_item = (
458 |                         Colorspace.P,
459 |                         (300, 300),
460 |                         ImageFormat.PBM,
461 |                         zlib.compress(out),
462 |                         width,
463 |                         cimage.height,
464 |                         [0xFFFFFF, 0],
465 |                         False,
466 |                         1,
467 |                         0,
468 |                     )
469 |                 elif image_type[image_type_enum] == "JBIG2":
470 |                     from .dep.jbig2dec import CImage
471 | 
472 |                     cimage = CImage(image_data)
473 |                     out = cimage.DecodeJbig2()
474 |                     # PBM is only padded to 8 rather than 32.
475 |                     # If the padding is larger, write padded file.
476 |                     width = cimage.width
477 |                     if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
478 |                         width = cimage.bytes_per_line << 3
479 |                     image_item = (
480 |                         Colorspace.P,
481 |                         (300, 300),
482 |                         ImageFormat.PBM,
483 |                         zlib.compress(out),
484 |                         width,
485 |                         cimage.height,
486 |                         [0xFFFFFF, 0],
487 |                         False,
488 |                         1,
489 |                         0,
490 |                     )
491 |                 elif image_type[image_type_enum] == "JPEG":
492 |                     colorspace = Colorspace.RGB
493 |                     component = 3
494 |                     # stock libjpeg location
495 |                     (
496 |                         SOFn,
497 |                         frame_length,
498 |                         bits_per_pixel,
499 |                         height,
500 |                         width,
501 |                         component,
502 |                     ) = struct.unpack(">HHBHHB", image_data[158:168])
503 |                     if SOFn != 0xFFC0:
504 |                         # "Intel(R) JPEG Library" location
505 |                         (
506 |                             SOFn,
507 |                             frame_length,
508 |                             bits_per_pixel,
509 |                             height,
510 |                             width,
511 |                             component,
512 |                         ) = struct.unpack(">HHBHHB", image_data[0x272:0x27C])
513 |                         if SOFn != 0xFFC0:
514 |                             # neither works, try brute-force
515 |                             import imagesize
516 |                             from PIL import Image as pilimage
517 | 
518 |                             with open(".tmp.jpg", "wb") as f:
519 |                                 f.write(image_data)
520 |                                 (width, height) = imagesize.get(".tmp.jpg")
521 |                                 pim = pilimage.open(".tmp.jpg")
522 |                                 if pim.mode == "L":
523 |                                     component = 1
524 |                             os.remove(".tmp.jpg")
525 |                     if image_type_enum == 1:
526 |                         # non-inverted JPEG Images
527 |                         height = -height
528 |                     if component == 1:
529 |                         colorspace = Colorspace.L
530 |                     image_item = (
531 |                         colorspace,
532 |                         (300, 300),
533 |                         ImageFormat.JPEG,
534 |                         image_data,
535 |                         width,
536 |                         height,
537 |                         [],
538 |                         False,
539 |                         8,
540 |                         0,
541 |                     )
542 |                 else:
543 |                     raise SystemExit("Unknown Image Type %d" % (image_type_enum))
544 |                 image_list.append(image_item)
545 |         if len(image_list) == 0:
546 |             raise SystemExit("File is pure-text HN; cannot convert to pdf")
547 |         pdf_data = convert_ImageList(image_list)
548 |         with open("pdf_toc.pdf", "wb") as f:
549 |             f.write(pdf_data)
550 |         # Add Outlines
551 |         add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
552 |         os.remove("pdf_toc.pdf")
553 | 
554 |     def _text_extract_hn(self):
555 |         if self._TOC_NUMBER_OFFSET > 0:
556 |             self.get_toc(verbose=True)
557 |         caj = open(self.filename, "rb")
558 | 
559 |         for i in range(self.page_num):
560 |             caj.seek(self._TOC_END_OFFSET + i * 20)
561 |             [
562 |                 page_data_offset,
563 |                 size_of_text_section,
564 |                 images_per_page,
565 |                 page_no,
566 |                 unk2,
567 |                 next_page_data_offset,
568 |             ] = struct.unpack("iihhii", caj.read(20))
569 |             caj.seek(page_data_offset)
570 |             text_header_read32 = caj.read(32)
571 |             if (text_header_read32[8:20] == b"COMPRESSTEXT") or (
572 |                 text_header_read32[0:12] == b"COMPRESSTEXT"
573 |             ):
574 |                 coff = 8
575 |                 if text_header_read32[0:12] == b"COMPRESSTEXT":
576 |                     coff = 0
577 |                 [expanded_text_size] = struct.unpack(
578 |                     "i", text_header_read32[12 + coff : 16 + coff]
579 |                 )
580 |                 import zlib
581 | 
582 |                 caj.seek(page_data_offset + 16 + coff)
583 |                 data = caj.read(size_of_text_section - 16 - coff)
584 |                 output = zlib.decompress(data, bufsize=expanded_text_size)
585 |                 if len(output) != expanded_text_size:
586 |                     raise SystemExit("Unexpected:", len(output), expanded_text_size)
587 |             else:
588 |                 caj.seek(page_data_offset)
589 |                 output = caj.read(size_of_text_section)
590 |             from .HNParsePage import HNParsePage
591 | 
592 |             page_style = next_page_data_offset > page_data_offset
593 |             page_data = HNParsePage(output, page_style)
594 |             print("Text on Page %d:" % (i + 1))
595 |             print(page_data.texts)
596 |             # print("Figures:\n", page_data.figures)
597 | 
598 |     def _parse_hn(self):
599 |         if self._TOC_NUMBER_OFFSET > 0:
600 |             self.get_toc(verbose=True)
601 |         caj = open(self.filename, "rb")
602 | 
603 |         for i in range(self.page_num):
604 |             caj.seek(self._TOC_END_OFFSET + i * 20)
605 |             print(
606 |                 "Reading Page Info struct #%d at offset 0x%04X"
607 |                 % (i + 1, self._TOC_END_OFFSET + i * 20)
608 |             )
609 |             [
610 |                 page_data_offset,
611 |                 size_of_text_section,
612 |                 images_per_page,
613 |                 page_no,
614 |                 unk2,
615 |                 next_page_data_offset,
616 |             ] = struct.unpack("iihhii", caj.read(20))
617 |             print(
618 |                 "unknown page struct members = (%d %d)" % (unk2, next_page_data_offset)
619 |             )
620 |             # All 71: 1,0,0
621 |             print("Page Number %d Data offset = 0x%04X" % (page_no, page_data_offset))
622 |             caj.seek(page_data_offset)
623 |             text_header_read32 = caj.read(32)
624 |             print("Page Text Header dump:\n", self.dump(text_header_read32), sep="")
625 |             # The first 8 bytes are always: 03 80 XX 16 03 80 XX XX,
626 |             # the last one 20 or 21, but the first two can be any.
627 |             # 48/71 has: 03 80 E0 16 03 80 F7 20, the rest uniq
628 |             if (text_header_read32[8:20] == b"COMPRESSTEXT") or (
629 |                 text_header_read32[0:12] == b"COMPRESSTEXT"
630 |             ):
631 |                 coff = 8
632 |                 if text_header_read32[0:12] == b"COMPRESSTEXT":
633 |                     coff = 0
634 |                 # expanded_text_size seems to be always about 2-3 times size_of_text_section, so this is a guess.
635 |                 [expanded_text_size] = struct.unpack(
636 |                     "i", text_header_read32[12 + coff : 16 + coff]
637 |                 )
638 |                 import zlib
639 | 
640 |                 caj.seek(page_data_offset + 16 + coff)
641 |                 data = caj.read(size_of_text_section - 16 - coff)
642 |                 output = zlib.decompress(data, bufsize=expanded_text_size)
643 |                 if len(output) != expanded_text_size:
644 |                     print("Unexpected:", len(output), expanded_text_size)
645 |                 print(
646 |                     "Page Text Header COMPRESSTEXT:\n",
647 |                     self.dump(output, GB=True),
648 |                     sep="",
649 |                 )
650 |                 for x in range(len(output) >> 4):
651 |                     try:
652 |                         print(
653 |                             bytes([output[(x << 4) + 7], output[(x << 4) + 6]]).decode(
654 |                                 "gbk"
655 |                             ),
656 |                             end="",
657 |                         )
658 |                     except UnicodeDecodeError:
659 |                         print(self.dump(output[x << 4 : (x + 1) << 4]))
660 |                 print()
661 |             else:
662 |                 caj.seek(page_data_offset)
663 |                 output = caj.read(size_of_text_section)
664 |                 print(
665 |                     "Page Text Header non-COMPRESSTEXT:\n",
666 |                     self.dump(output, GB=True),
667 |                     sep="",
668 |                 )
669 |             from .HNParsePage import HNParsePage
670 | 
671 |             page_style = next_page_data_offset > page_data_offset
672 |             page_data = HNParsePage(output, page_style)
673 |             print("Text:\n", page_data.texts)
674 |             print("Figures:\n", page_data.figures)
675 |             current_offset = page_data_offset + size_of_text_section
676 |             for j in range(images_per_page):
677 |                 caj.seek(current_offset)
678 |                 read32 = caj.read(32)
679 |                 [
680 |                     image_type_enum,
681 |                     offset_to_image_data,
682 |                     size_of_image_data,
683 |                 ] = struct.unpack("iii", read32[0:12])
684 |                 if image_type[image_type_enum] != "JPEG":
685 |                     read32 += caj.read(64)
686 |                 print(
687 |                     "size of image data = %d (%s)"
688 |                     % (size_of_image_data, image_type[image_type_enum])
689 |                 )
690 |                 if offset_to_image_data != current_offset + 12:
691 |                     raise SystemExit("unusual image offset")
692 |                 print("Page Image Header dump:\n", self.dump(read32), sep="")
693 |                 print(
694 |                     "Expected End of Page #%d: 0x%08X"
695 |                     % (i + 1, current_offset + size_of_image_data + 12)
696 |                 )
697 |                 caj.seek(offset_to_image_data)
698 |                 image_data = caj.read(size_of_image_data)
699 |                 current_offset = offset_to_image_data + size_of_image_data
700 |                 image_name = "image_dump_%04d" % (i + 1)
701 |                 if j > 0:
702 |                     image_name = "image_dump_%04d_%04d" % (i + 1, j)
703 |                 with open(image_name + ".dat", "wb") as f:
704 |                     f.write(image_data)
705 |                 if image_type[image_type_enum] == "JBIG":
706 |                     try:
707 |                         from .dep.jbigdec import SaveJbigAsBmp
708 | 
709 |                         SaveJbigAsBmp(
710 |                             image_data,
711 |                             size_of_image_data,
712 |                             (image_name + ".bmp").encode("ascii"),
713 |                         )
714 |                     except ImportError:
715 |                         pass
716 |                 elif image_type[image_type_enum] == "JBIG2":
717 |                     try:
718 |                         from .dep.jbigdec import SaveJbig2AsBmp
719 | 
720 |                         SaveJbig2AsBmp(
721 |                             image_data,
722 |                             size_of_image_data,
723 |                             (image_name + ".bmp").encode("ascii"),
724 |                         )
725 |                     except ImportError:
726 |                         pass
727 |                 elif image_type[image_type_enum] == "JPEG":
728 |                     with open(image_name + ".jpg", "wb") as f:
729 |                         f.write(image_data)
730 |         print("end 0x%08x" % self._PAGEDATA_OFFSET)
731 | 
732 |     def dump(self, src, GB=False):
733 |         N = 0
734 |         result = []
735 |         while src:
736 |             s, src = src[:16], src[16:]
737 |             hexa = " ".join(["%02X" % x for x in s])
738 |             gb = ""
739 |             if GB:
740 |                 gb += "    "
741 |                 for x in range(len(s) >> 1):
742 |                     try:
743 |                         if s[(x << 1) + 1] < 128 and s[(x << 1) + 0] < 128:
744 |                             gb += ".."
745 |                         else:
746 |                             gb += bytes([s[(x << 1) + 1], s[(x << 1) + 0]]).decode(
747 |                                 "gbk"
748 |                             )
749 |                     except UnicodeDecodeError:
750 |                         gb += ".."
751 |             s = "".join(printables[x] for x in s)
752 |             result += "%04X   %-*s   %s%s\n" % (N, 16 * 3, hexa, s, gb)
753 |             N += 16
754 |         return "".join(result)
755 | 
756 |     def _convert_pdf(self, dest):
757 |         copy(self.filename, dest)
758 | 
759 |     def _convert_kdh(self, dest):
760 |         #  Read KDH file.
761 |         fp = open(self.filename, "rb")
762 |         origin = fp.read()
763 |         fp.close()
764 | 
765 |         #  Decrypt.
766 |         origin = origin[254:]
767 |         output = []
768 |         keycursor = 0
769 |         for origin_byte in origin:
770 |             output.append(origin_byte ^ KDH_PASSPHRASE[keycursor])
771 |             keycursor += 1
772 |             if keycursor >= len(KDH_PASSPHRASE):
773 |                 keycursor = 0
774 |         output = bytes(output)
775 | 
776 |         #  Remove useless tail data.
777 |         eofpos = output.rfind(b"%%EOF")
778 |         if eofpos < 0:
779 |             raise Exception("%%EOF mark can't be found.")
780 |         output = output[: eofpos + 5]
781 | 
782 |         #  Write output file.
783 |         fp = open(dest + ".tmp", "wb")
784 |         fp.write(output)
785 |         fp.close()
786 | 
787 |         # Use mutool to repair xref
788 |         try:
789 |             check_output(["mutool", "clean", dest + ".tmp", dest], stderr=STDOUT)
790 |         except CalledProcessError as e:
791 |             print(e.output.decode("utf-8"))
792 |             raise SystemExit(
793 |                 "Command mutool returned non-zero exit status " + str(e.returncode)
794 |             )
795 | 
796 |         os.remove(dest + ".tmp")
797 | 


--------------------------------------------------------------------------------