├── caj2pdf ├── __init__.py ├── dep │ ├── __init__.py │ ├── decode_jbig2data.cc │ ├── JBigDecode.h │ ├── decode_jbig2data_x.cc │ ├── jbig2dec.py │ ├── jbigdec.py │ ├── jbigdec.cc │ └── JBigDecode.cc ├── version.py ├── exceptions.py ├── exe_convert.py ├── install.py ├── cli.py ├── HNParsePage.py ├── utils.py └── cajparser.py ├── screenshot1.png ├── .gitattributes ├── tests ├── c8_src.caj ├── caj_src.caj ├── hn_src.caj └── kdh_src.caj ├── dlls ├── libjbigdec-w32.dll ├── libjbigdec-w64.dll ├── libjbig2codec-w32.dll └── libjbig2codec-w64.dll ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ └── bug_cn.md └── workflows │ ├── build.yaml │ └── test.yaml ├── LICENSE ├── pyproject.toml ├── .gitignore ├── README.md └── pdm.lock /caj2pdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /caj2pdf/dep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /caj2pdf/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0a7" 2 | -------------------------------------------------------------------------------- /caj2pdf/exceptions.py: -------------------------------------------------------------------------------- 1 | class Caj2PdfException(Exception): 2 | "root exception type" 3 | pass 4 | -------------------------------------------------------------------------------- /screenshot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zombie110year/caj2pdf-restructured/HEAD/screenshot1.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.caj filter=lfs diff=lfs merge=lfs -text 2 | *.dll filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /tests/c8_src.caj: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7e3f2c0faebb8ddac2f4feff3cfc9249476ab170b43250ac00610b10aa46c74b 3 | size 8786313 4 | -------------------------------------------------------------------------------- /tests/caj_src.caj: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:50a63a5e96858d17258149d260a335a7b216dbbc38f86f9ab6d5ef0c9b4dbab3 3 | size 2983075 4 | -------------------------------------------------------------------------------- /tests/hn_src.caj: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:166d0014792326570d9e8ca42fe13a4a44329e4ad095d3ac8dfd73c8ddb9f20e 3 | size 163779 4 | -------------------------------------------------------------------------------- /tests/kdh_src.caj: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ef0d77b2cdb2b9eeea105312bef7eef7727e7d8a055e7828d51fbd9a571cb17a 3 | size 192452 4 | -------------------------------------------------------------------------------- /dlls/libjbigdec-w32.dll: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0acbe501c2d6711fc11fdfec3ccda45b73cc44da978915e833ab16d750e4f108 3 | size 61142 4 | -------------------------------------------------------------------------------- /dlls/libjbigdec-w64.dll: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8517169c45a8d3f7823dc47d2b98a102a61f0f463dc0df4eb48cb90b9541a5f2 3 | size 66401 4 | -------------------------------------------------------------------------------- /dlls/libjbig2codec-w32.dll: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7fe46b81dbd839afc9ad7b797a0378292635928149bb3de87c92f4e3b3811684 3 | size 180634 4 | -------------------------------------------------------------------------------- /dlls/libjbig2codec-w64.dll: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:afba105509cc454c976f9b7835fc6c70b3e4fd4d1c8a297d46fb69f0077b301b 3 | size 193739 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=120 3 | max-doc-length=120 4 | ignore= 5 | # : 前可以有空格,因为 [expr : expr] 6 | E203, 7 | # 双元运算符放在行首,为了让部分比较长的算术看起来像竖式 8 | W503, 9 | exclude=caj2pdf/pdfwutils.py 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_cn.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: CAJ2PDF 程序问题 3 | about: "提交一个 CAJ2PDF 的程序问题报告。" 4 | --- 5 | 6 | 除非特殊情况,请完整填写所有问题。不按模板发的 issue 将直接被关闭。 7 | 如果你遇到的问题不是 bug,比如你不清楚要如何配置,请使用[Discussion](https://github.com/caj2pdf/discussion/issues)进行讨论。 8 | 9 | 1) 你正在使用哪个版本的 CAJ2PDF? 10 | 11 | 2) 你的使用场景是什么?比如convert还是其他。 12 | 13 | 3) 你看到的不正常的现象是什么?(请描述具体现象,比如PDF打不开等) 14 | 15 | 4) 你期待看到的正确表现是怎样的? 16 | 17 | 5) 请附上你的配置(Python版本、Python包版本以及mutools版本)。 18 | 19 | 7) 请使用××ZIP××格式压缩CAJ文件,附在此处。 20 | 21 | 8) 请附上出错时软件输出的错误信息。 22 | 23 | 错误信息: 24 | 25 | ```python 26 | # 软件输出的错误信息 27 | ``` 28 | 29 | 请预览一下你填的内容再提交。 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GLWT(Good Luck With That) Public License 2 | Copyright (c) Everyone, except Author 3 | 4 | Everyone is permitted to copy, distribute, modify, merge, sell, publish, 5 | sublicense or whatever they want with this software but at their OWN RISK. 6 | 7 | Preamble 8 | 9 | The author has absolutely no clue what the code in this project does. 10 | It might just work or not, there is no third option. 11 | 12 | 13 | GOOD LUCK WITH THAT PUBLIC LICENSE 14 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION, AND MODIFICATION 15 | 16 | 0. You just DO WHATEVER YOU WANT TO as long as you NEVER LEAVE A 17 | TRACE TO TRACK THE AUTHOR of the original product to blame for or hold 18 | responsible. 19 | 20 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | 25 | Good luck and Godspeed. -------------------------------------------------------------------------------- /caj2pdf/dep/decode_jbig2data.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 (c) Hin-Tak Leung 3 | See The FreeType Project LICENSE for license terms. 4 | 5 | This is a small wrapper around libpoppler to provide a python 6 | interface to decode JBIG2 stream. 7 | 8 | To build: 9 | 10 | cc -Wall `pkg-config --cflags poppler` -fPIC -shared -o libjbig2codec.so decode_jbig2data.cc `pkg-config --libs poppler` 11 | */ 12 | 13 | #include 14 | 15 | int decode_jbig2data(char*, int, char*, int, int, int, int); 16 | 17 | extern "C" { 18 | int decode_jbig2data_c(char* inbuf, int bufsize, char* outptr, int width, int height, int width_in_padded_4bytes, int width_in_padded_bytes) 19 | { 20 | return decode_jbig2data(inbuf, bufsize, outptr, width, height, width_in_padded_4bytes, width_in_padded_bytes); 21 | } 22 | } 23 | 24 | int decode_jbig2data(char* inbuf, int bufsize, char* outptr, int width, int height, int width_in_padded_4bytes, int width_in_padded_bytes) 25 | { 26 | int v12; 27 | 28 | Object globals; 29 | MemStream *v10 = new MemStream(inbuf, 0, bufsize, Object(objNull)); 30 | Stream *v11 = new JBIG2Stream(v10, Object(objNull), &globals); 31 | v11->reset(); // required 32 | if ( height > 0 ) 33 | { 34 | v12 = 0; 35 | char* v13 = outptr + (height - 1) * width_in_padded_4bytes; 36 | do 37 | { 38 | ++v12; 39 | for (int i = 0; i < width_in_padded_bytes; i++) 40 | { 41 | *(v13 + i) = 0xFF & (v11->getChar() ^ 0xFF); 42 | } 43 | v13 -= width_in_padded_4bytes; 44 | } 45 | while ( v12 != height ); 46 | } 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /caj2pdf/exe_convert.py: -------------------------------------------------------------------------------- 1 | """提供给上下文菜单调用 2 | 3 | 目前仅支持 Windows 4 | """ 5 | import pathlib 6 | import sys 7 | import threading 8 | 9 | from .cajparser import CAJParser 10 | from .exceptions import Caj2PdfException 11 | 12 | 13 | def main(): 14 | try: 15 | app() 16 | except Exception: 17 | pass 18 | 19 | 20 | def app(): 21 | try: 22 | cajfilepath_str = sys.argv[1] 23 | except IndexError: 24 | raise Caj2PdfException(f"找不到caj文件,输入参数为:{sys.argv!r}") 25 | 26 | cajfilepath = pathlib.Path(cajfilepath_str) 27 | if not cajfilepath.exists(): 28 | raise Caj2PdfException(f"caj文件不存在:{cajfilepath.as_posix()}") 29 | 30 | inputfile = str(cajfilepath) 31 | outputfile = f"{inputfile}.pdf" 32 | task = threading.Thread( 33 | group=None, target=convert_caj, args=(inputfile, outputfile) 34 | ) 35 | alive = AliveStatus() 36 | task.start() 37 | alive.start() 38 | task.join() 39 | alive.finish() 40 | alive.join() 41 | 42 | 43 | def convert_caj(inputfile, outputfile): 44 | caj = CAJParser(inputfile) 45 | caj.convert(outputfile) 46 | 47 | 48 | class AliveStatus(threading.Thread): 49 | def __init__(self): 50 | super().__init__(group=None) 51 | self.finished = False 52 | 53 | def run(self): 54 | from time import sleep 55 | 56 | status = ("正在转换 <<<<<<", "正在转换 >>>>>>") 57 | i = 0b0 58 | while not self.finished: 59 | print(status[i & 0b1], end="\r", file=sys.stderr) 60 | i ^= 0b1 61 | sleep(0.2) 62 | 63 | def finish(self): 64 | self.finished = True 65 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "caj2pdf-restructured" 3 | dynamic = ["version", "entry-points"] 4 | description = "caj2pdf 项目的重新组织,方便打包与安装" 5 | authors = [ 6 | { name = "Hin-Tak Leung", email = "htl10@users.sourceforge.net" }, 7 | { name = "JeziL", email = "wangjinlithu@gmail.com" }, 8 | ] 9 | maintainers = [{ name = "zombie110year", email = "zombie110year@outlook.com" }] 10 | dependencies = ["imagesize==1.3.0", "PyPDF2==2.2.0"] 11 | requires-python = ">=3.10" 12 | readme.content-type = "text/markdown" 13 | readme.file = "README.md" 14 | license = { text = "GLWTPL" } 15 | 16 | classifiers = [ 17 | "Environment :: Console", 18 | "Operating System :: Microsoft :: Windows", 19 | "Operating System :: POSIX :: Linux", 20 | ] 21 | keywords = ["cnki", "caj", "pdf"] 22 | 23 | [project.urls] 24 | repository = "https://github.com/zombie110year/caj2pdf-restructured/" 25 | 26 | [project.scripts] 27 | caj2pdf = "caj2pdf.cli:main" 28 | caj2pdf-ec = "caj2pdf.exe_convert:main" 29 | 30 | 31 | [project.optional-dependencies] 32 | [build-system] 33 | requires = ["pdm-pep517>=1.0.0"] 34 | build-backend = "pdm.pep517.api" 35 | 36 | [tool.pdm] 37 | version = { source = "file", path = "caj2pdf/version.py" } 38 | 39 | [tool.pdm.build] 40 | excludes = ["tests/**"] 41 | setup-script = "build.py" 42 | run-setuptools = true 43 | 44 | [[tool.pdm.source]] 45 | name = "pypi" 46 | url = "https://mirrors.aliyun.com/pypi/simple" 47 | verify_ssl = true 48 | 49 | [tool.pdm.dev-dependencies] 50 | dev = [ 51 | "black>=22.10.0", 52 | "isort>=5.10.1", 53 | "flake8>=5.0.4", 54 | "setuptools>=65.6.0", 55 | ] 56 | 57 | [tool.setuptools.package-data] 58 | caj2pdf = ["*.dll", "*.so"] 59 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Upload caj2pdf to GitHub Actions Artifacts 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - 'releases/**' 7 | 8 | jobs: 9 | windows_pub: 10 | name: Publish from Windows Server 2019 11 | runs-on: windows-2019 12 | steps: 13 | - name: Fetch Source Code 14 | uses: actions/checkout@v2 15 | with: 16 | lfs: true 17 | fetch-depth: 1 18 | - name: Setup Python 3 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: "3.10" 22 | - name: Install Python Dependencies 23 | run: | 24 | python -m pip install -U pip && pip install pdm 25 | pdm install 26 | - name: Build Package 27 | run: pdm build --no-sdist 28 | - name: Upload to github action artifact 29 | uses: actions/upload-artifact@v3 30 | with: 31 | path: dist/*.whl 32 | 33 | linux_pub: 34 | name: Publish from Ubuntu, no JBIG2DEC 35 | runs-on: ubuntu-latest 36 | steps: 37 | - name: Fetch Source Code 38 | uses: actions/checkout@v2 39 | with: 40 | lfs: true 41 | fetch-depth: 1 42 | - name: Setup Python 3 43 | uses: actions/setup-python@v4 44 | with: 45 | python-version: "3.10" 46 | - name: Install Python Dependencies 47 | run: | 48 | python -m pip install -U pip && pip install pdm 49 | pdm install -d 50 | - name: Build Package 51 | run: pdm build 52 | - name: Get Glibc version 53 | run: ldd --version > dist/ldd.version.txt 54 | - name: Upload to github action artifact 55 | uses: actions/upload-artifact@v3 56 | with: 57 | path: | 58 | dist/*.whl 59 | dist/*.tar.gz 60 | dist/ldd.version.txt 61 | -------------------------------------------------------------------------------- /caj2pdf/dep/JBigDecode.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 (c) Hin-Tak Leung 3 | See The FreeType Project LICENSE for license terms. 4 | 5 | Decode-only part of JBigCodec. Drop-in compatible with LibReaderEx's. 6 | 7 | Note: MPS/ST are very wasteful, as only 1-bit is used, and 8 | array of length 0x20 (5 contexts) instead of 0x4000 (14 contexts). 9 | 10 | GetBit() has /3 instead of >> 3, GetCX() only 5 contexts instead of 10/14. 11 | SLNTP / LNTP is neither the three-line template nor the two-line template 12 | form (and GetBit() is strange anyway). 13 | 14 | LpsExchange/MpsExchange/RenormDe/ByteIn/InitDecode are essentially 15 | identical as in T-82, as well as Decode1() and Decode(). 16 | 17 | 18 | */ 19 | class JBigCodec { 20 | public: 21 | void ByteIn(); 22 | void* ClearLine(char*, unsigned int); 23 | void* CopyLine(char*, char*, unsigned int); 24 | int Decode1(int); 25 | int Decode(char*, unsigned int, unsigned int, unsigned int, unsigned int, char*); 26 | int Decode(int); 27 | void* DupLine(char*, unsigned int, unsigned int, unsigned int); 28 | int GetBit(int, int); 29 | int GetCX(int, int); 30 | void InitDecode(char*, unsigned int); 31 | int LowestDecode(); 32 | int LowestDecodeLine(unsigned int, char*, char*, unsigned int, char*); 33 | void LpsExchange(int, unsigned int, unsigned int); 34 | void* MakeTypicalLine(int); 35 | void MpsExchange(int, unsigned int, unsigned int); 36 | void RenormDe(); 37 | private: 38 | unsigned int A_interval; 39 | int CT; 40 | int SC; /* Only used by Encode */ 41 | unsigned int inbuf_length; 42 | int read_count; 43 | unsigned char *inbuf; 44 | unsigned int MPS[0x1000]; 45 | unsigned int ST[0x1000]; 46 | unsigned int C_register; 47 | int PIX; 48 | int BUFFER; /* Only used by Encode */ 49 | int bitwidth; 50 | int height; 51 | int width_in_padded_bytes; 52 | char *outptr; 53 | }; 54 | -------------------------------------------------------------------------------- /caj2pdf/dep/decode_jbig2data_x.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 (c) Hin-Tak Leung 3 | See The FreeType Project LICENSE for license terms. 4 | 5 | This is a small wrapper around libjbig2dec to provide a python 6 | interface to decode JBIG2 stream. 7 | 8 | To build: 9 | 10 | cc -Wall `pkg-config --cflags jbig2dec` -fPIC -shared -o libjbig2codec.so decode_jbig2data_x.cc `pkg-config --libs jbig2dec` 11 | 12 | Or, if you have jbig2dec in source form, in its directory 13 | (see "jbig2dec/Makefile.am" - everything exept "jbig2_image_pbm.c" and "memento.c"): 14 | 15 | cc -I . -Wall -fPIC -shared -o ${CAJ2PDF_SRC}/libjbig2codec.so ${CAJ2PDF_SRC}/decode_jbig2data_x.cc \ 16 | jbig2.c \ 17 | jbig2_arith.c jbig2_arith_int.c jbig2_arith_iaid.c jbig2_huffman.c jbig2_hufftab.c \ 18 | jbig2_segment.c jbig2_page.c \ 19 | jbig2_symbol_dict.c jbig2_text.c \ 20 | jbig2_generic.c jbig2_refinement.c jbig2_mmr.c \ 21 | jbig2_halftone.c \ 22 | jbig2_image.c 23 | */ 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | int decode_jbig2data(char*, int, char*, int, int, int, int); 30 | 31 | extern "C" { 32 | int decode_jbig2data_c(char* inbuf, int bufsize, char* outptr, int width, int height, int width_in_padded_4bytes, int width_in_padded_bytes) 33 | { 34 | return decode_jbig2data(inbuf, bufsize, outptr, width, height, width_in_padded_4bytes, width_in_padded_bytes); 35 | } 36 | } 37 | 38 | int decode_jbig2data(char* inbuf, int bufsize, char* outptr, int width, int height, int width_in_padded_4bytes, int width_in_padded_bytes) 39 | { 40 | int v12; 41 | 42 | Jbig2Ctx *ctx = jbig2_ctx_new(NULL, JBIG2_OPTIONS_EMBEDDED, NULL, NULL, NULL); 43 | jbig2_data_in(ctx, (const unsigned char*)inbuf, bufsize); 44 | jbig2_complete_page(ctx); // Required, apparently this is "work around broken CVision embedded streams", 45 | // "simulating an end-of-page segment (for broken streams)" 46 | Jbig2Image *image = jbig2_page_out(ctx); 47 | if ( height > 0 ) 48 | { 49 | v12 = 0; 50 | char* v13 = outptr + (height - 1) * width_in_padded_4bytes; 51 | unsigned char *data = image->data; 52 | do 53 | { 54 | ++v12; 55 | memcpy(v13, data, width_in_padded_bytes); 56 | v13 -= width_in_padded_4bytes; 57 | data += image->stride; 58 | } 59 | while ( v12 != height ); 60 | } 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .pdm.toml 2 | /caj2pdf/dep/bin/*.dll 3 | 4 | # Created by .ignore support plugin (hsz.mobi) 5 | ### JetBrains template 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff: 10 | .idea/ 11 | .vscode/ 12 | *.caj 13 | *.tmp 14 | *.pdf 15 | cajs/ 16 | mutool.exe 17 | 18 | ## File-based project format: 19 | *.iws 20 | 21 | ## Plugin-specific files: 22 | 23 | # IntelliJ 24 | /out/ 25 | 26 | # mpeltonen/sbt-idea plugin 27 | .idea_modules/ 28 | 29 | # JIRA plugin 30 | atlassian-ide-plugin.xml 31 | 32 | # Crashlytics plugin (for Android Studio and IntelliJ) 33 | com_crashlytics_export_strings.xml 34 | crashlytics.properties 35 | crashlytics-build.properties 36 | fabric.properties 37 | ### Python template 38 | # Byte-compiled / optimized / DLL files 39 | __pycache__/ 40 | *.py[cod] 41 | *$py.class 42 | 43 | # C extensions 44 | *.so 45 | 46 | # Distribution / packaging 47 | .Python 48 | env/ 49 | build/ 50 | develop-eggs/ 51 | dist/ 52 | downloads/ 53 | eggs/ 54 | .eggs/ 55 | parts/ 56 | sdist/ 57 | var/ 58 | wheels/ 59 | *.egg-info/ 60 | .installed.cfg 61 | *.egg 62 | 63 | # PyInstaller 64 | # Usually these files are written by a python script from a template 65 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 66 | *.manifest 67 | *.spec 68 | 69 | # Installer logs 70 | pip-log.txt 71 | pip-delete-this-directory.txt 72 | 73 | # Unit test / coverage reports 74 | htmlcov/ 75 | .tox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *,cover 82 | .hypothesis/ 83 | 84 | # Translations 85 | *.mo 86 | *.pot 87 | 88 | # Django stuff: 89 | *.log 90 | local_settings.py 91 | 92 | # Flask stuff: 93 | instance/ 94 | .webassets-cache 95 | 96 | # Scrapy stuff: 97 | .scrapy 98 | 99 | # Sphinx documentation 100 | docs/_build/ 101 | 102 | # PyBuilder 103 | target/ 104 | 105 | # Jupyter Notebook 106 | .ipynb_checkpoints 107 | 108 | # pyenv 109 | .python-version 110 | 111 | # celery beat schedule file 112 | celerybeat-schedule 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # dotenv 118 | .env 119 | 120 | # virtualenv 121 | .venv 122 | venv/ 123 | ENV/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | !/tests/caj_src.caj 132 | !/tests/c8_src.caj 133 | !/tests/kdh_src.caj 134 | !/tests/hn_src.caj 135 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Test caj2pdf bin 2 | on: 3 | push: 4 | branches: 5 | - dev 6 | 7 | jobs: 8 | test_ubuntu: 9 | strategy: 10 | matrix: 11 | ubuntu-version: ["ubuntu-20.04"] 12 | python-version: ["3.10"] 13 | dep-lib: ["libpoppler-dev"] 14 | # dep-lib: ["libpoppler-dev", "libjbig2dec0-dev"] # libjbig2dec in Ubuntu has lots of problems. 15 | # src-caj: ["caj", "kdh"] 16 | src-caj: ["c8", "caj", "hn", "kdh"] # only caj, kdh in sample successed 17 | name: Test on ${{ matrix.ubuntu-version }} 18 | runs-on: ${{ matrix.ubuntu-version }} 19 | steps: 20 | - name: Fetch Source Code 21 | uses: actions/checkout@v2 22 | with: 23 | lfs: true 24 | fetch-depth: 1 25 | - name: Setup Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install C++ Dependencies 30 | run: sudo apt -y install build-essential ${{ matrix.dep-lib }} mupdf-tools pkg-config 31 | - name: Install Python Project with "poppler" 32 | if: ${{ matrix.dep-lib == 'libpoppler-dev' }} 33 | run: | 34 | python -m pip install -U pip && pip install pdm 35 | pdm install 36 | - name: Install Python Project with "jbig2dec" 37 | if: ${{ matrix.dep-lib == 'libjbig2dec0-dev' }} 38 | run: | 39 | python -m pip install -U pip && pip install pdm 40 | LIBJBIG2DEC=1 pdm install 41 | - name: Convert Test Caj 42 | run: pdm run caj2pdf convert tests/${{ matrix.src-caj }}_src.caj -o tests/${{ matrix.src-caj }}_dst.pdf 43 | 44 | test_windows: 45 | name: Test on Windows Server 2019 46 | runs-on: windows-2019 47 | strategy: 48 | matrix: 49 | python-version: ["3.10"] 50 | # src-caj: ["caj", "kdh"] 51 | src-caj: ["c8", "caj", "hn", "kdh"] # only caj, kdh in sample successed 52 | steps: 53 | - name: Fetch Source Code 54 | uses: actions/checkout@v2 55 | with: 56 | lfs: true 57 | fetch-depth: 1 58 | - name: Setup Python ${{ matrix.python-version }} 59 | uses: actions/setup-python@v4 60 | with: 61 | python-version: ${{ matrix.python-version }} 62 | - name: Install Python Dependencies 63 | run: | 64 | python -m pip install -U pip && pip install pdm 65 | pdm install 66 | - name: Install mutool.exe 67 | run: | 68 | curl.exe -L https://mupdf.com/downloads/archive/mupdf-1.18.0-windows.zip -o mupdf.zip 69 | Expand-Archive -Force mupdf.zip . 70 | Copy-Item mupdf-1.18.0-windows/mutool.exe C:\WINDOWS\system32\mutool.exe 71 | shell: pwsh 72 | - name: Convert Test Caj 73 | run: pdm run caj2pdf convert tests\\${{ matrix.src-caj }}_src.caj -o tests\\${{ matrix.src-caj }}_dst.pdf 74 | -------------------------------------------------------------------------------- /caj2pdf/install.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import sys 4 | 5 | 6 | def install_context_windows(dry_run: bool): 7 | binary = sys.argv[0] 8 | binary_dir = pathlib.Path(binary).parent 9 | exe = binary_dir / "caj2pdf-ec.exe" 10 | 11 | description = "Convert CAJ to PDF" 12 | command = f'"{exe}" "%1"' 13 | if dry_run: 14 | regedit = f"""\ 15 | Windows Registry Editor Version 5.00 16 | 17 | [HKEY_CLASSES_ROOT\\.caj\\shell\\caj2pdf] 18 | @="{description}" 19 | 20 | [HKEY_CLASSES_ROOT\\.caj\\shell\\caj2pdf\\command] 21 | @="{command}" 22 | """ 23 | print(regedit, file=sys.stderr) 24 | else: 25 | import ctypes 26 | import winreg 27 | 28 | if 1 != ctypes.windll.shell32.IsUserAnAdmin(): 29 | # https://docs.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-shellexecutew 30 | # https://support.microsoft.com/zh-cn/topic/wd2000-%E5%A6%82%E4%BD%95%E8%B0%83%E7%94%A8-shellexecute-windows-api-%E5%87%BD%E6%95%B0-80da207b-2fa3-ac60-e871-f0a63164bad7 31 | apifn = ctypes.windll.shell32.ShellExecuteW 32 | args = ( 33 | # 没有主窗口 34 | None, 35 | # 运行 runas 命令 36 | "runas", 37 | # 可执行文件 38 | binary, 39 | # 运行该程序时的参数 40 | "install", 41 | # 工作目录 42 | os.getcwd(), 43 | # https://docs.microsoft.com/en-us/windows/win32/api/winuser/nf-winuser-showwindow 44 | # 激活并显示窗口 45 | 1, 46 | ) 47 | # TODO 权限获取失败,弹窗消失太快看不清报错。 48 | print(apifn, args) 49 | status = apifn(*args) 50 | if status <= 32: 51 | raise WindowsError((f"win32api错误,返回 {status}", ("ShellExecuteW", args))) 52 | else: 53 | # 如果拥有管理员权限 54 | reg = winreg.ConnectRegistry(None, winreg.HKEY_CLASSES_ROOT) 55 | cajshell = winreg.CreateKeyEx( 56 | key=winreg.HKEY_CLASSES_ROOT, 57 | sub_key=".caj\\shell\\caj2pdf", 58 | reserved=0, 59 | access=winreg.KEY_WRITE, 60 | ) 61 | winreg.SetValue( 62 | winreg.HKEY_CLASSES_ROOT, 63 | ".caj\\shell\\caj2pdf", 64 | winreg.REG_SZ, 65 | description, 66 | ) 67 | cajshellcmd = winreg.CreateKeyEx( 68 | key=winreg.HKEY_CLASSES_ROOT, 69 | sub_key=".caj\\shell\\caj2pdf\\command", 70 | reserved=0, 71 | access=winreg.KEY_WRITE, 72 | ) 73 | winreg.SetValue( 74 | winreg.HKEY_CLASSES_ROOT, 75 | ".caj\\shell\\caj2pdf\\command", 76 | winreg.REG_SZ, 77 | command, 78 | ) 79 | cajshellcmd.Close() 80 | cajshell.Close() 81 | reg.Close() 82 | -------------------------------------------------------------------------------- /caj2pdf/dep/jbig2dec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2021 (c) Hin-Tak Leung 4 | # See The FreeType Project LICENSE for license terms. 5 | # 6 | # python ctypes module and short program to decode JBIG2 image data in a CAJ file. 7 | 8 | # To build, either libpoppler-based, or libjbig2dec-based (pick only one!): 9 | # 10 | # cc -Wall `pkg-config --cflags poppler` -fPIC -shared -o libjbig2codec.so decode_jbig2data.cc `pkg-config --libs poppler` 11 | # 12 | # cc -Wall `pkg-config --cflags jbig2dec` -fPIC -shared -o libjbig2codec.so decode_jbig2data_x.cc `pkg-config --libs jbig2dec` 13 | # 14 | # NOTE(zombie110year,2021/04/20): in this project, just compile them with script file `build.py` 15 | 16 | import importlib.resources 17 | import platform 18 | import struct 19 | from ctypes import * 20 | 21 | arch = platform.architecture() 22 | if (arch[1] == 'WindowsPE'): 23 | if (arch[0] == '64bit'): 24 | with importlib.resources.files(__package__) as pkg_dir: 25 | libjbig2codec = cdll.LoadLibrary(str(pkg_dir / "bin/libjbig2codec-w64.dll")) 26 | else: 27 | with importlib.resources.files(__package__) as pkg_dir: 28 | libjbig2codec = cdll.LoadLibrary(str(pkg_dir / "bin/libjbig2codec-w32.dll")) 29 | else: 30 | with importlib.resources.files(__package__) as pkg_dir: 31 | libjbig2codec = cdll.LoadLibrary(pkg_dir / "bin/libjbig2codec.so") 32 | 33 | decode_jbig2data_c = libjbig2codec.decode_jbig2data_c 34 | 35 | decode_jbig2data_c.restype = c_int 36 | decode_jbig2data_c.argtypes = [c_void_p, c_int, c_void_p, c_int, c_int, c_int, c_int] 37 | 38 | class CImage: 39 | def __init__(self, buffer): 40 | self.buffer = buffer 41 | self.buffer_size=len(buffer) 42 | (self.width, self.height, 43 | self.num_planes, self.bits_per_pixel) = struct.unpack("> 5) << 2 45 | 46 | def DecodeJbig2(self): 47 | out = create_string_buffer(self.height * self.bytes_per_line) 48 | width_in_bytes = (self.width * self.bits_per_pixel + 7) >> 3 49 | decode_jbig2data_c(self.buffer[48:], self.buffer_size-48, out, self.width, self.height, self.bytes_per_line, width_in_bytes) 50 | return out 51 | 52 | if __name__ == '__main__': 53 | import os 54 | import sys 55 | 56 | if len(sys.argv) < 3: 57 | print("Usage: %s input output" % sys.argv[0]) 58 | sys.exit() 59 | 60 | f = open(sys.argv[1], "rb") 61 | buffer_size = os.stat(sys.argv[1]).st_size 62 | buffer = f.read() 63 | 64 | cimage = CImage(buffer) 65 | out = cimage.DecodeJbig2() 66 | 67 | # PBM is only padded to 8 rather than 32. 68 | # If the padding is larger, write padded file. 69 | if (cimage.bytes_per_line > ((cimage.width +7) >> 3)): 70 | #! bytes_per_line doesn't defined 71 | cimage.width = cimage.bytes_per_line << 3 72 | 73 | with open(sys.argv[2], "wb") as fout: 74 | fout.write("P4\n".encode("ascii")) 75 | fout.write(("%d %d\n" % (cimage.width, cimage.height)).encode("ascii")) 76 | fout.write(out) 77 | -------------------------------------------------------------------------------- /caj2pdf/dep/jbigdec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020-2021 (c) Hin-Tak Leung 4 | # See The FreeType Project LICENSE for license terms. 5 | # 6 | # python ctypes module and short program decodes the image data in a CAJ file. 7 | 8 | # To build, copy "libreaderex_x64.so" from the Ubuntu AppImage 9 | # to the current directory. 10 | # (See "Analysing libreaderex" in the Wiki on how to) 11 | # 12 | # Then, run 13 | # 14 | # cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc 15 | 16 | import importlib.resources 17 | import os 18 | import platform 19 | import struct 20 | from ctypes import * 21 | 22 | arch = platform.architecture() 23 | if (arch[1] == 'WindowsPE'): 24 | if (arch[0] == '64bit'): 25 | with importlib.resources.files(__package__) as pkg_dir: 26 | dllpath = (pkg_dir / "bin/libjbigdec-w64.dll").as_posix() 27 | libjbigdec = cdll.LoadLibrary(dllpath) 28 | else: 29 | with importlib.resources.files(__package__) as pkg_dir: 30 | dllpath = (pkg_dir / "bin/libjbigdec-w32.dll").as_posix() 31 | libjbigdec = cdll.LoadLibrary(dllpath) 32 | else: 33 | with importlib.resources.files(__package__) as pkg_dir: 34 | dllpath = (pkg_dir / "bin/libjbigdec.so").as_posix() 35 | libjbigdec = cdll.LoadLibrary(dllpath) 36 | 37 | #SaveJbigAsBmp = libjbigdec.SaveJbigAsBmp 38 | #SaveJbigAsBmp.restype = None 39 | #SaveJbigAsBmp.argtypes = [c_void_p, c_int, c_char_p] 40 | 41 | #SaveJbig2AsBmp = libjbigdec.SaveJbig2AsBmp 42 | #SaveJbig2AsBmp.restype = None 43 | #SaveJbig2AsBmp.argtypes = [c_void_p, c_int, c_char_p] 44 | 45 | jbigDecode = libjbigdec.jbigDecode 46 | jbigDecode.restype = None 47 | jbigDecode.argtypes = [c_void_p, c_int, c_int, c_int, c_int, c_void_p] 48 | 49 | class CImage: 50 | def __init__(self, buffer): 51 | self.buffer = buffer 52 | self.buffer_size=len(buffer) 53 | (self.width, self.height, 54 | self.num_planes, self.bits_per_pixel) = struct.unpack("> 5) << 2 56 | 57 | def DecodeJbig(self): 58 | out = create_string_buffer(self.height * self.bytes_per_line) 59 | jbigDecode(self.buffer[48:], self.buffer_size-48, self.height, self.width, self.bytes_per_line, out) 60 | return out 61 | 62 | if __name__ == '__main__': 63 | import os 64 | import sys 65 | 66 | if len(sys.argv) < 3: 67 | print("Usage: %s input output" % sys.argv[0]) 68 | sys.exit() 69 | 70 | f = open(sys.argv[1], "rb") 71 | buffer_size = os.stat(sys.argv[1]).st_size 72 | buffer = f.read() 73 | 74 | #SaveJbigAsBmp(buffer, buffer_size, sys.argv[2].encode("ascii")) 75 | 76 | cimage = CImage(buffer) 77 | out = cimage.DecodeJbig() 78 | 79 | # PBM is only padded to 8 rather than 32. 80 | # If the padding is larger, write padded file. 81 | width = cimage.width 82 | if (cimage.bytes_per_line > ((cimage.width +7) >> 3)): 83 | width = cimage.bytes_per_line << 3 84 | 85 | fout = open(sys.argv[2].replace(".bmp", ".pbm"), "wb") 86 | fout.write("P4\n".encode("ascii")) 87 | fout.write(("%d %d\n" % (width, cimage.height)).encode("ascii")) 88 | fout.write(out) 89 | fout.close() 90 | -------------------------------------------------------------------------------- /caj2pdf/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import platform 6 | 7 | from .cajparser import CAJParser 8 | from .install import install_context_windows 9 | from .utils import add_outlines 10 | from .version import __version__ 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser(prog="caj2pdf") 15 | parser.add_argument("--version", action="version", version=f"%(prog)s v{__version__}") 16 | subparsers = parser.add_subparsers(help="commands", dest="command") 17 | 18 | show_parser = subparsers.add_parser("show", help="Show the information of the CAJ file.") 19 | show_parser.add_argument("input", help="Path to the CAJ file.") 20 | 21 | convert_parser = subparsers.add_parser("convert", help="Convert the CAJ file to PDF file.") 22 | convert_parser.add_argument("input", help="Path to the CAJ file.") 23 | convert_parser.add_argument("-o", "--output", help="Output path to the PDF file.", required=False) 24 | 25 | outlines_parser = subparsers.add_parser("outlines", help="Extract outlines from the CAJ file and add it to PDF file.") 26 | outlines_parser.add_argument("input", help="Path to the CAJ file.") 27 | outlines_parser.add_argument("-o", "--output", help="Path to the PDF file.", required=True) 28 | 29 | parse_parser = subparsers.add_parser("parse", help="Parse CAJ file for debugging/development") 30 | parse_parser.add_argument("input", help="Path to the CAJ file.") 31 | 32 | text_extract_parser = subparsers.add_parser("text-extract", help="Parse CAJ file for debugging/development") 33 | text_extract_parser.add_argument("input", help="Path to the CAJ file.") 34 | 35 | install_parser = subparsers.add_parser("install", help="install some system features, may need admin permission.") 36 | install_parser.add_argument("--dry-run", help="not do actually, show the effect.", action="store_true") 37 | args = parser.parse_args() 38 | 39 | if args.command == "show": 40 | caj = CAJParser(args.input) 41 | if caj.format == "PDF" or caj.format == "KDH": 42 | print("File: {0}\nType: {1}\n".format(args.input, caj.format)) 43 | else: 44 | print("File: {0}\nType: {1}\nPage count: {2}\nOutlines count: {3}\n".format( 45 | args.input, 46 | caj.format, 47 | caj.page_num, 48 | caj.toc_num 49 | )) 50 | 51 | if args.command == "convert": 52 | caj = CAJParser(args.input) 53 | if args.output is None: 54 | if args.input.endswith(".caj"): 55 | args.output = args.input.replace(".caj", ".pdf") 56 | elif (len(args.input) > 4 and (args.input[-4] == '.' or args.input[-3] == '.') and not args.input.endswith(".pdf")): 57 | args.output = os.path.splitext(args.input)[0] + ".pdf" 58 | else: 59 | args.output = args.input + ".pdf" 60 | caj.convert(args.output) 61 | 62 | if args.command == "outlines": 63 | caj = CAJParser(args.input) 64 | if caj.format == "PDF" or caj.format == "KDH": 65 | raise SystemExit("Unsupported file type: {0}.".format(caj.format)) 66 | toc = caj.get_toc() 67 | add_outlines(toc, args.output, "tmp.pdf") 68 | os.replace("tmp.pdf", args.output) 69 | 70 | if args.command == "text-extract": 71 | caj = CAJParser(args.input) 72 | caj.text_extract() 73 | 74 | if args.command == "parse": 75 | caj = CAJParser(args.input) 76 | caj.parse() 77 | 78 | if args.command == "install": 79 | if platform.system() == "Windows": 80 | install_context_windows(args.dry_run) 81 | else: 82 | raise NotImplementedError("Only support Windows now.") 83 | -------------------------------------------------------------------------------- /caj2pdf/HNParsePage.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 (c) Hin-Tak Leung 2 | # See The FreeType Project LICENSE for license terms. 3 | # 4 | # HNParsePage class, for extracting text and image positions 5 | import struct 6 | 7 | class HNParsePage(object): 8 | def __init__(self, data, old_style=False): 9 | self.data = data 10 | self.data_length = len(data) 11 | self.characters = [] 12 | self.figures = [] 13 | self.stats = {} 14 | self.offset = 0 15 | def Text(self, code): 16 | try: 17 | self.characters.append(bytes([self.data[self.offset+5],self.data[self.offset+4]]).decode("gbk")) 18 | except IndexError: # short data, nothing to do 19 | pass 20 | except UnicodeDecodeError: 21 | # HTL: When cut-and-paste on Linux, these transform to GB18030, 22 | # but I believe they are OCR artifacts. Where they occur, 23 | # 0xA38D 0xA38a (always together) are line-breaks, and 0xA389, 0xA3A0 24 | # are tabs and spaces. 25 | hash = { 26 | 0xA389 : "\t", 27 | 0xA38a : "\n", 28 | 0xA38D : "\r", 29 | 0xA3A0 : " ", 30 | # # GB18030 31 | #0xA389 : "", 32 | #0xA38a : "", 33 | #0xA38D : "", 34 | #0xA3A0 : "", 35 | } 36 | code = self.data[self.offset+5] * 256 + self.data[self.offset+4] 37 | try: 38 | #self.characters.append("<0x%04X>\n" % code) 39 | self.characters.append(hash[code]) 40 | except KeyError: 41 | self.characters.append("<0x%04X>\n" % code) 42 | self.offset += 6 43 | 44 | def TextMulti(self, code): 45 | self.offset += 2 46 | if (code == 0x8001): 47 | self.characters.append("\n") 48 | while (1): 49 | try: 50 | if (self.data[self.offset+1] == 0x80): 51 | break 52 | except IndexError: # short data, nothing to do 53 | return 54 | try: 55 | self.characters.append(bytes([self.data[self.offset+3],self.data[self.offset+2]]).decode("gbk")) 56 | except UnicodeDecodeError: 57 | self.characters.append("<0x%04X>\n" % (self.data[self.offset+3] * 256 + self.data[self.offset+2])) 58 | except IndexError: # short data, nothing to do 59 | return 60 | self.offset += 4 61 | 62 | def Figure(self, code): 63 | try: 64 | self.data[self.offset+25] 65 | except IndexError: # short data, nothing to do 66 | return 67 | (ignore1, offset_x, offset_y, size_x, size_y, int2, int3, int4, int5)= struct.unpack(" 3 | See The FreeType Project LICENSE for license terms. 4 | 5 | This short program decodes the image data in a CAJ file. 6 | 7 | To build, copy "libreaderex_x64.so" from the Ubuntu AppImage 8 | to the current directory. 9 | (See "Analysing libreaderex" in the Wiki on how to) 10 | 11 | Then, run 12 | 13 | cc -DHAVE_MAIN -Wall -o jbigdec jbigdec.cc -Wl,-rpath,. -L. -lreaderex_x64 14 | 15 | For the python module, also: 16 | 17 | cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc 18 | 19 | and to generate the "image_dump_*.dat": 20 | 21 | ./caj2pdf parse thesis.caj 22 | 23 | Identify which ones are DIB and which ones are JPG with: 24 | 25 | file image_dump_*.dat 26 | 27 | Usage example (Page 1 / Cover is likely JPG!): 28 | 29 | ./jbigdec image_dump_0002.dat page_0002.bmp 30 | ./jbigdec image_dump_0003.dat page_0003.bmp 31 | ... 32 | 33 | Note: The program outputs a few "string to int" while it is working. 34 | This is an anomaly with "libreaderex_x64.so". 35 | */ 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | extern "C" { 43 | class JBigCodec { 44 | public: 45 | void ByteIn(); 46 | void ClearLine(char*, unsigned int); 47 | void CopyLine(char*, char*, unsigned int); 48 | int Decode1(int); 49 | void Decode(char* inbuf, unsigned int size, unsigned int height, unsigned int bitwidth, unsigned int bitwidth_in_bytes /* rounded up to x4 */, char*outbuf); 50 | int Decode(int); 51 | void DupLine(char*, unsigned int, unsigned int, unsigned int); 52 | int GetBit(int, int); 53 | unsigned int GetCX(int, int); 54 | void InitDecode(char*, unsigned int); 55 | void LowestDecode(); 56 | int LowestDecodeLine(unsigned int, char*, char*, unsigned int, char*); 57 | void MakeTypicalLine(int); 58 | void RenormDe(); 59 | }; 60 | #ifdef HAVE_MAIN 61 | class CImage { 62 | public: 63 | static CImage* DecodeJbig(void*, unsigned int, unsigned int*); 64 | static CImage* DecodeJbig2(void*, unsigned int, unsigned int*); 65 | int SaveAsBmp(char const*); 66 | }; 67 | 68 | void SaveJbigAsBmp(void* in, unsigned int len, char const* outfile) 69 | { 70 | CImage* x = CImage::DecodeJbig(in, len, NULL); 71 | x->SaveAsBmp(outfile); 72 | } 73 | 74 | void SaveJbig2AsBmp(void* in, unsigned int len, char const* outfile) 75 | { 76 | CImage* x = CImage::DecodeJbig2(in, len, NULL); 77 | x->SaveAsBmp(outfile); 78 | } 79 | #endif 80 | 81 | void jbigDecode(char* inbuf, unsigned int size, unsigned int height, 82 | unsigned int bitwidth, unsigned int bitwidth_in_bytes /* rounded up to x4 */, char*outbuf) 83 | { 84 | JBigCodec *jbig = (JBigCodec *)calloc(0x8040, 1); // 0x8040 is linux 64-bit specific 85 | jbig->Decode(inbuf, size, height, bitwidth, bitwidth_in_bytes, outbuf); 86 | free(jbig); 87 | } 88 | 89 | } 90 | 91 | #ifdef HAVE_MAIN 92 | int main(int argc, char *argv[]) 93 | { 94 | size_t buflen = 80000; // large number - should be large enough to hold the whole input file. 95 | char *in = (char *)calloc(buflen, 1); 96 | 97 | FILE *fin = fopen(argv[1], "rb"); 98 | 99 | size_t len = fread(in, 1, buflen, fin); 100 | 101 | unsigned int intout = 0; 102 | CImage* x = CImage::DecodeJbig(in, len, &intout); 103 | x->SaveAsBmp(argv[2]); 104 | 105 | int width = in[4] | (in[5] << 8) | (in[6] << 16) | (in[7] << 24); 106 | int height = in[8] | (in[9] << 8) | (in[10] << 16) | (in[11] << 24); 107 | int bits_per_pixel = in[14] | (in[15] << 8); 108 | // padding to multiple of 4 bytes. 109 | int bytes_per_line = ((width * bits_per_pixel + 31) >> 5) << 2; 110 | 111 | char *out = (char *)calloc(height * bytes_per_line, 1); 112 | 113 | JBigCodec *jbig = (JBigCodec *)calloc(0x8040, 1); // 0x8040 is linux 64-bit specific 114 | jbig->Decode(in+48, len-48, height, width, bytes_per_line, out); 115 | free(jbig); 116 | 117 | FILE *fout = fopen("test.pbm", "wb"); 118 | fprintf(fout, "P4\n"); 119 | // PBM is padded to 8 rather than 32. 120 | // If the padding is larger, write padded file. 121 | if (bytes_per_line > ((width +7) >> 3)) 122 | width = bytes_per_line << 3; 123 | fprintf(fout, "%d %d\n", width, height); 124 | fwrite(out, 1, bytes_per_line * height, fout); 125 | fclose(fout); // "cmp -i 62:13 x.bmp x.pbm" shows nothing - identical. 126 | } 127 | #endif 128 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # caj2pdf 2 | 3 | 本项目由 [caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf) 重构而来,仅仅修改了 Python 包的组织方式,以便使用包管理工具进行简便地安装和调用。 4 | 5 | 1. 可以使用 build.py 脚本编译二进制依赖 6 | 2. 可以在任何工作目录下使用 caj2pdf 命令,而无需移动到同一目录 7 | 3. 如果存在任何关于 CAJ 文件格式而导致的问题,请到 [caj2pdf/caj2pdf](https://github.com/caj2pdf/caj2pdf/issues) 提交反馈。如果存在本项目无法安装、调用出错或者版本过于落后等问题,可到 [issues](issues/) 提交反馈。 8 | 9 | ## Why 10 | 11 | [中国知网](http://cnki.net/)的某些文献(多为学位论文)仅提供其专有的 CAJ 格式下载,仅能使用知网提供的软件(如 [CAJViewer](http://cajviewer.cnki.net/) 等)打开,给文献的阅读和管理带来了不便(尤其是在非 Windows 系统上)。 12 | 13 | 若要将 CAJ 文件转换为 PDF 文件,可以使用 CAJViewer 的打印功能。但这样得到的 PDF 文件的内容为图片,无法进行文字的选择,且原文献的大纲列表也会丢失。本项目希望可以解决上述两问题。 14 | 15 | ## How to use 16 | 17 | ### 环境和依赖 18 | 19 | - Python 3.10+ (使用了 `importlib.resources` 模块,以提供在任意目录下工作的能力) 20 | - [PyPDF2](https://github.com/mstamy2/PyPDF2) 21 | - [mutool](https://mupdf.com/index.html) 22 | 23 | 除了Microsoft Windows:我们提供Microsoft Windows 32-bit/64-bit DLLs,HN 格式需要 24 | 25 | - C/C++编译器 26 | - libpoppler开发包,或libjbig2dec开发包 27 | 28 | ### 安装 29 | 30 | #### ArchLinux 31 | 32 | ```sh 33 | # poppler 库 34 | sudo pacman -S base-devel poppler mupdf-tools 35 | pip install caj2pdf-restructured 36 | 37 | # jbig2dec 库 38 | sudo pacman -S base-devel jbig2dec mupdf-tools 39 | LIBJBIG2DEC=1 pip install caj2pdf-restructured 40 | ``` 41 | 42 | 或使用 [pipx](https://github.com/pipxproject/pipx) 43 | 44 | ```sh 45 | # poppler 库 46 | sudo pacman -S base-devel poppler mupdf-tools 47 | pipx install caj2pdf-restructured 48 | 49 | # jbig2dec 库 50 | sudo pacman -S base-devel jbig2dec mupdf-tools 51 | LIBJBIG2DEC=1 pipx install caj2pdf-restructured 52 | ``` 53 | 54 | #### Debian, Ubuntu 等 Linux 55 | 56 | ```sh 57 | # poppler 库 58 | sudo apt install build-essential libpoppler-dev mupdf-tools 59 | pip install caj2pdf-restructured 60 | ``` 61 | 62 | 或使用 [pipx](https://github.com/pipxproject/pipx) 63 | 64 | ```sh 65 | # poppler 库 66 | sudo apt install build-essential libpoppler-dev mupdf-tools 67 | pipx install caj2pdf-restructured 68 | ``` 69 | 70 | **注意**: 71 | 72 | 1. jbig2dec 库在 Ubuntu/Debian 上的安装存在依赖问题,但是 poppler 库可能无法解析 HN 文件,建议能配置好依赖的尽量使用 `LIBJBIG2DEC=1` 进行构建。 73 | 2. Ubuntu 16.04 的 poppler 库版本过于落后,建议在较新的系统上安装。 74 | 75 | #### Windows 76 | 77 | 可以直接通过 pip 或 pipx 安装: 78 | 79 | ```sh 80 | pip install caj2pdf-restructured 81 | 82 | pipx install caj2pdf-restructured 83 | ``` 84 | 85 | 然后,从 [mutool](https://mupdf.com/index.html) 下载 mupdf-1.18.0-windows.zip 并解压,将其中的 mutool.exe 添加到 `PATH` 变量中的路径下,以便从任意位置调用。 86 | 87 | 如果你使用 [choco](https://chocolatey.org) 或 [scoop](https://scoop.sh/) 作为 Windows 下的包管理工具,则可一键式安装: 88 | 89 | ```sh 90 | choco install mupdf 91 | ``` 92 | 93 | 或者 94 | 95 | ```sh 96 | scoop install mupdf 97 | ``` 98 | 99 | ### 用法 100 | 101 | ``` 102 | # 打印文件基本信息(文件类型、页面数、大纲项目数) 103 | caj2pdf show [input_file] 104 | 105 | # 转换文件 106 | caj2pdf convert [input_file] -o/--output [output_file] 107 | 108 | # 从 CAJ 文件中提取大纲信息并添加至 PDF 文件 109 | ## 遇到不支持的文件类型或 Bug 时,可用 CAJViewer 打印 PDF 文件,并用这条命令为其添加大纲 110 | caj2pdf outlines [input_file] -o/--output [pdf_file] 111 | ``` 112 | ### 例 113 | 114 | ``` 115 | caj2pdf show test.caj 116 | caj2pdf convert test.caj -o output.pdf 117 | caj2pdf outlines test.caj -o printed.pdf 118 | ``` 119 | 120 | #### 右键菜单 121 | 122 | 0.1.0a4 版本后,可以在 Windows 系统上使用右键菜单转换 CAJ 文件了。 123 | 124 | ![](screenshot1.png) 125 | 126 | 需要在命令行中调用命令 `caj2pdf install` 安装注册表,然后才能使用此功能。 127 | 如果卸载程序,注册表 **不会被清理**,待研究 pip,看看能不能在 uninstall 之前加 HOOK。 128 | 129 | TODO: 清理注册表的功能。 130 | 131 | ### 异常输出(IMPORTANT!!!) 132 | 133 | 尽管这个项目目前有不少同学关注到了,但它**仍然只支持部分 caj 文件的转换**,必须承认这完全不是一个对普通用户足够友好的成熟项目。具体支持哪些不支持哪些,在前文也已经说了,但似乎很多同学并没有注意到。所以**如果你遇到以下两种输出,本项目目前无法帮助到你**。与此相关的 issue 不再回复。 134 | 135 | - `Unknown file type.`:未知文件类型; 136 | 137 | ## How far we've come 138 | 139 | 知网下载到的后缀为 `caj` 的文件内部结构其实分为两类:CAJ 格式和 HN 格式(受考察样本所限可能还有更多)。目前本项目支持 CAJ 格式文件的转换,HN 格式的转换未完善,并且需要建立两个新的共享库(除了Microsoft Windows:我们提供Microsoft Windows 32-bit/64-bit DLLs),详情如下: 140 | 141 | ``` 142 | cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc 143 | cc -Wall `pkg-config --cflags poppler` -fPIC -shared -o libjbig2codec.so decode_jbig2data.cc `pkg-config --libs poppler` 144 | ``` 145 | 146 | 抑或和libpoppler 相比,还是取决于您是否更喜欢libjbig2dec一点,可以替换libpoppler: 147 | 148 | ``` 149 | cc -Wall -fPIC --shared -o libjbigdec.so jbigdec.cc JBigDecode.cc 150 | cc -Wall `pkg-config --cflags jbig2dec` -fPIC -shared -o libjbig2codec.so decode_jbig2data_x.cc `pkg-config --libs jbig2dec` 151 | ``` 152 | 153 | **NOTE(zombie110year,2021/04/20)**:现在可以使用 `python build.py` 指令来编译链接库了。并且源代码和输出文件的路径移动到了 `caj2pdf/dep` 之中,和上面的命令不同。 154 | 155 | 1. 默认使用 libpoppler 作为依赖编译: 156 | 157 | ```sh 158 | python build.py 159 | ``` 160 | 161 | 2. 或者,使用 jbig2dec 作为依赖编译: 162 | 163 | ```sh 164 | LIBJBIG2DEC=1 python build.py 165 | ``` 166 | 167 | **关于两种格式文件结构的分析进展和本项目的实现细节,请查阅[项目 Wiki](https://github.com/JeziL/caj2pdf/wiki)。** 168 | 169 | ## How to contribute 170 | 171 | 受测试样本数量所限,即使转换 CAJ 格式的文件也可能(或者说几乎一定)存在 Bug。如遇到这种情况,欢迎在 [Issue](https://github.com/JeziL/caj2pdf/issues) 中提出,**并提供可重现 Bug 的 caj 文件**——可以将样本文件上传到网盘等处,也可直接提供知网链接(作者已滚出校园网,提 issue 请提供可下载的 caj 文件)。 172 | 173 | 如果你对二进制文件分析、图像/文字压缩算法、逆向工程等领域中的一个或几个有所了解,欢迎帮助完善此项目。你可以从阅读[项目 Wiki](https://github.com/JeziL/caj2pdf/wiki) 开始,看看是否有可以发挥你特长的地方。**Pull requests are always welcome**. 174 | 175 | ## License 176 | 177 | 本项目基于 [GLWTPL](https://github.com/me-shaon/GLWTPL) (Good Luck With That Public License) 许可证开源。 178 | -------------------------------------------------------------------------------- /caj2pdf/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | import sys 4 | 5 | import PyPDF2.generic as PDF 6 | from PyPDF2 import PdfFileReader, PdfFileWriter 7 | 8 | 9 | class Node(object): 10 | def __init__(self, data, parent=None, lchild=None, rchild=None): 11 | self.data = data 12 | self.parent = parent 13 | self.lchild = lchild 14 | self.rchild = rchild 15 | 16 | @property 17 | def level(self): 18 | return self.data["level"] 19 | 20 | @property 21 | def index(self): 22 | return self.data["index"] 23 | 24 | def real_parent(self): 25 | p = self 26 | while True: 27 | c = p 28 | p = p.parent 29 | if p.lchild == c: 30 | return p 31 | if p.parent is None: 32 | return None 33 | 34 | def prev(self): 35 | if self.parent.rchild == self: 36 | return self.parent 37 | else: 38 | return None 39 | 40 | def next(self): 41 | return self.rchild 42 | 43 | def first(self): 44 | return self.lchild 45 | 46 | def last(self): 47 | f = self.first() 48 | if f is None: 49 | return None 50 | r = f 51 | while r.rchild is not None: 52 | r = r.rchild 53 | return r 54 | 55 | 56 | class BTree(object): 57 | def __init__(self): 58 | self.root = Node({"level": 0, "index": 0}, None) 59 | self.cursor = self.root 60 | 61 | @property 62 | def current_level(self): 63 | return self.cursor.level 64 | 65 | def insert_as_lchild(self, node): 66 | self.cursor.lchild = node 67 | node.parent = self.cursor 68 | self.cursor = node 69 | 70 | def insert_as_rchild(self, node): 71 | self.cursor.rchild = node 72 | node.parent = self.cursor 73 | self.cursor = node 74 | 75 | 76 | def fnd(f, s, start=0): 77 | fsize = f.seek(0, os.SEEK_END) 78 | f.seek(0) 79 | bsize = 4096 80 | buffer = None 81 | if start > 0: 82 | f.seek(start) 83 | overlap = len(s) - 1 84 | while True: 85 | if overlap <= f.tell() < fsize: 86 | f.seek(f.tell() - overlap) 87 | buffer = f.read(bsize) 88 | if buffer: 89 | pos = buffer.find(s) 90 | if pos >= 0: 91 | return f.tell() - (len(buffer) - pos) 92 | else: 93 | return -1 94 | 95 | 96 | def fnd_rvrs(f, s, end=sys.maxsize): 97 | # find target in reverse direction 98 | fsize = f.seek(0, os.SEEK_END) 99 | bsize = 4096 100 | if len(s) > end: 101 | raise SystemExit("Too large string size for search.") 102 | f.seek(fsize - bsize) 103 | buffer = None 104 | size = bsize 105 | if bsize <= end < fsize: 106 | f.seek(end - bsize) 107 | elif 0 < end < bsize: 108 | size = end 109 | f.seek(0) 110 | overlap = len(s) - 1 111 | s = s[::-1] 112 | while True: 113 | buffer = f.read(size) 114 | if buffer: 115 | buffer = buffer[::-1] 116 | pos = buffer.find(s) 117 | if pos >= 0: 118 | return f.tell() - pos 119 | if (2 * bsize - overlap) < f.tell(): 120 | f.seek(f.tell() - (2 * bsize - overlap)) 121 | size = bsize 122 | elif (bsize - overlap) < f.tell(): 123 | size = f.tell() - (bsize - overlap) 124 | f.seek(0) 125 | else: 126 | return -1 127 | 128 | 129 | def fnd_all(f, s): 130 | results = [] 131 | last_addr = -len(s) 132 | while True: 133 | addr = fnd(f, s, start=last_addr + len(s)) 134 | if addr != -1: 135 | results.append(addr) 136 | last_addr = addr 137 | else: 138 | return results 139 | 140 | 141 | def fnd_unuse_no(nos1, nos2): 142 | unuse_no = -1 143 | for i in range(99999): 144 | if (99999 - i not in nos1) and (99999 - i not in nos2): 145 | unuse_no = 99999 - i 146 | break 147 | if unuse_no == -1: 148 | raise SystemExit("Error on PDF objects numbering.") 149 | return unuse_no 150 | 151 | 152 | def make_dest(pdfw, pg): 153 | d = PDF.ArrayObject() 154 | try: 155 | d.append(pdfw.getPage(pg).indirect_ref) 156 | except AttributeError: 157 | d.append(pdfw.getPage(pg).indirectRef) 158 | d.append(PDF.NameObject("/XYZ")) 159 | d.append(PDF.NullObject()) 160 | d.append(PDF.NullObject()) 161 | d.append(PDF.NullObject()) 162 | return d 163 | 164 | 165 | def build_outlines_btree(toc): 166 | tree = BTree() 167 | for i, t in enumerate(toc): 168 | t["page"] -= 1 # Page starts at 0. 169 | t["index"] = i + 1 170 | node = Node(t) 171 | if t["level"] > tree.current_level: 172 | tree.insert_as_lchild(node) 173 | elif t["level"] == tree.current_level: 174 | tree.insert_as_rchild(node) 175 | else: 176 | while True: 177 | p = tree.cursor.real_parent() 178 | tree.cursor = p 179 | if p.level == t["level"]: 180 | tree.insert_as_rchild(node) 181 | break 182 | t["node"] = node 183 | 184 | 185 | def add_outlines(toc, filename, output): 186 | build_outlines_btree(toc) 187 | pdf_out = PdfFileWriter() 188 | inputFile = open(filename, "rb") 189 | pdf_in = PdfFileReader(inputFile) 190 | for p in pdf_in.pages: 191 | try: 192 | pdf_out.add_page(p) 193 | except AttributeError: 194 | pdf_out.addPage(p) 195 | toc_num = len(toc) 196 | if toc_num == 0: # Just copy if toc empty 197 | outputFile = open(output, "wb") 198 | pdf_out.write(outputFile) 199 | inputFile.close() 200 | outputFile.close() 201 | return 202 | idoix = len(pdf_out._objects) + 1 203 | idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out) for x in range(toc_num + 1)] 204 | ol = PDF.DictionaryObject() 205 | ol.update( 206 | { 207 | PDF.NameObject("/Type"): PDF.NameObject("/Outlines"), 208 | PDF.NameObject("/First"): idorefs[1], 209 | PDF.NameObject("/Last"): idorefs[-1], 210 | PDF.NameObject("/Count"): PDF.NumberObject(toc_num), 211 | } 212 | ) 213 | olitems = [] 214 | for t in toc: 215 | oli = PDF.DictionaryObject() 216 | oli.update( 217 | { 218 | PDF.NameObject("/Title"): PDF.TextStringObject( 219 | t["title"].decode("utf-8") 220 | ), 221 | PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"]), 222 | } 223 | ) 224 | opt_keys = { 225 | "real_parent": "/Parent", 226 | "prev": "/Prev", 227 | "next": "/Next", 228 | "first": "/First", 229 | "last": "/Last", 230 | } 231 | for k, v in opt_keys.items(): 232 | n = getattr(t["node"], k)() 233 | if n is not None: 234 | oli.update({PDF.NameObject(v): idorefs[n.index]}) 235 | olitems.append(oli) 236 | try: 237 | pdf_out._add_object(ol) 238 | except AttributeError: 239 | pdf_out._addObject(ol) 240 | for i in olitems: 241 | try: 242 | pdf_out._add_object(i) 243 | except AttributeError: 244 | pdf_out._addObject(i) 245 | pdf_out._root_object.update({PDF.NameObject("/Outlines"): idorefs[0]}) 246 | outputFile = open(output, "wb") 247 | pdf_out.write(outputFile) 248 | inputFile.close() 249 | outputFile.close() 250 | 251 | 252 | # See if the page is N * N images, N images written N times, 253 | # by checking image sizes and within 1 < N <= 10. 254 | # Return True and N if that's the case. 255 | def find_redundant_images(caj, initial_offset, images_per_page): 256 | sqrts = { 257 | 4: 2, 258 | 9: 3, 259 | 16: 4, 260 | 25: 5, 261 | 36: 6, 262 | 49: 7, 263 | 64: 8, 264 | 81: 9, 265 | 100: 10, 266 | } 267 | 268 | if not (images_per_page in sqrts.keys()): 269 | return (False, images_per_page) 270 | stride = sqrts[images_per_page] 271 | sizes = [] 272 | current_offset = initial_offset 273 | for j in range(images_per_page): 274 | caj.seek(current_offset) 275 | read32 = caj.read(32) 276 | [image_type_enum, offset_to_image_data, size_of_image_data] = struct.unpack( 277 | "iii", read32[0:12] 278 | ) 279 | if (j >= stride) and (size_of_image_data != sizes[j - stride]): 280 | return (False, images_per_page) 281 | sizes.append(size_of_image_data) 282 | current_offset = offset_to_image_data + size_of_image_data 283 | # if we reach here, the image sizes seen are [A, B, C ... N, ..., A, B, C ... N] exactly N times. 284 | return (True, stride) 285 | -------------------------------------------------------------------------------- /caj2pdf/dep/JBigDecode.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2021 (c) Hin-Tak Leung 3 | See The FreeType Project LICENSE for license terms. 4 | 5 | Decode-only part of JBigCodec. Drop-in compatible with LibReaderEx's. 6 | */ 7 | 8 | #include 9 | #include 10 | #include "JBigDecode.h" 11 | 12 | void JBigCodec::ByteIn() 13 | { 14 | unsigned int v3; 15 | unsigned int v1 = this->read_count; 16 | int v2 = 0; 17 | if ( v1 < this->inbuf_length ) 18 | { 19 | v3 = *(this->inbuf + v1); // Needs to be unsigned! 20 | this->read_count = v1 + 1; 21 | v2 = v3 << 8; 22 | } 23 | this->C_register += v2; 24 | this->CT = 8; 25 | } 26 | 27 | /* size in number of ints! */ 28 | void* JBigCodec::ClearLine(char* dest, unsigned int size) 29 | { 30 | return memset(dest, 0, 4 * size); 31 | } 32 | 33 | /* size in number of ints! */ 34 | void* JBigCodec::CopyLine(char* dest, char* src, unsigned int size) 35 | { 36 | return memcpy(dest, src, 4 * size); 37 | } 38 | 39 | /* Table 24 on page 45 of ITU-T REC T-82 */ 40 | 41 | static int LSZ[256] = { 42 | 0x5a1d, 43 | 0x2586, 0x1114, 0x080b, 0x03d8, 0x01da, 0x00e5, 0x006f, 0x0036, 44 | 0x001a, 0x000d, 0x0006, 0x0003, 0x0001, 0x5a7f, 0x3f25, 0x2cf2, 45 | 0x207c, 0x17b9, 0x1182, 0x0cef, 0x09a1, 0x072f, 0x055c, 0x0406, 46 | 0x0303, 0x0240, 0x01b1, 0x0144, 0x00f5, 0x00b7, 0x008a, 0x0068, 47 | 0x004e, 0x003b, 0x002c, 0x5ae1, 0x484c, 0x3a0d, 0x2ef1, 0x261f, 48 | 0x1f33, 0x19a8, 0x1518, 0x1177, 0x0e74, 0x0bfb, 0x09f8, 0x0861, 49 | 0x0706, 0x05cd, 0x04de, 0x040f, 0x0363, 0x02d4, 0x025c, 0x01f8, 50 | 51 | 0x01a4, 0x0160, 0x0125, 0x00f6, 0x00cb, 0x00ab, 0x008f, 0x5b12, 52 | 0x4d04, 0x412c, 0x37d8, 0x2fe8, 0x293c, 0x2379, 0x1edf, 0x1aa9, 53 | 0x174e, 0x1424, 0x119c, 0x0f6b, 0x0d51, 0x0bb6, 0x0a40, 0x5832, 54 | 0x4d1c, 0x438e, 0x3bdd, 0x34ee, 0x2eae, 0x299a, 0x2516, 0x5570, 55 | 0x4ca9, 0x44d9, 0x3e22, 0x3824, 0x32b4, 0x2e17, 0x56a8, 0x4f46, 56 | 0x47e5, 0x41cf, 0x3c3d, 0x375e, 0x5231, 0x4c0f, 0x4639, 0x415e, 57 | 0x5627, 0x50e7, 0x4b85, 0x5597, 0x504f, 0x5a10, 0x5522, 0x59eb, 58 | }; 59 | 60 | static int NLPS[256] = { 61 | 1, 62 | 14, 16, 18, 20, 23, 25, 28, 30, 63 | 33, 35, 9, 10, 12, 15, 36, 38, 64 | 39, 40, 42, 43, 45, 46, 48, 49, 65 | 51, 52, 54, 56, 57, 59, 60, 62, 66 | 63, 32, 33, 37, 64, 65, 67, 68, 67 | 69, 70, 72, 73, 74, 75, 77, 78, 68 | 79, 48, 50, 50, 51, 52, 53, 54, 69 | 70 | 55, 56, 57, 58, 59, 61, 61, 65, 71 | 80, 81, 82, 83, 84, 86, 87, 87, 72 | 72, 72, 74, 74, 75, 77, 77, 80, 73 | 88, 89, 90, 91, 92, 93, 86, 88, 74 | 95, 96, 97, 99, 99, 93, 95, 101, 75 | 102, 103, 104, 99, 105, 106, 107, 103, 76 | 105, 108, 109, 110, 111, 110, 112, 112, 77 | }; 78 | 79 | static int NMPS[256] = { 80 | 1, 81 | 2, 3, 4, 5, 6, 7, 8, 9, 82 | 10, 11, 12, 13, 13, 15, 16, 17, 83 | 18, 19, 20, 21, 22, 23, 24, 25, 84 | 26, 27, 28, 29, 30, 31, 32, 33, 85 | 34, 35, 9, 37, 38, 39, 40, 41, 86 | 42, 43, 44, 45, 46, 47, 48, 49, 87 | 50, 51, 52, 53, 54, 55, 56, 57, 88 | 89 | 58, 59, 60, 61, 62, 63, 32, 65, 90 | 66, 67, 68, 69, 70, 71, 72, 73, 91 | 74, 75, 76, 77, 78, 79, 48, 81, 92 | 82, 83, 84, 85, 86, 87, 71, 89, 93 | 90, 91, 92, 93, 94, 86, 96, 97, 94 | 98, 99, 100, 93, 102, 103, 104, 99, 95 | 106, 107, 103, 109, 107, 111, 109, 111, 96 | }; 97 | 98 | static int SWITCH[256] = { 99 | 1, 100 | 0, 0, 0, 0, 0, 0, 0, 0, 101 | 0, 0, 0, 0, 0, 1, 0, 0, 102 | 0, 0, 0, 0, 0, 0, 0, 0, 103 | 0, 0, 0, 0, 0, 0, 0, 0, 104 | 0, 0, 0, 1, 0, 0, 0, 0, 105 | 0, 0, 0, 0, 0, 0, 0, 0, 106 | 0, 0, 0, 0, 0, 0, 0, 0, 107 | 108 | 0, 0, 0, 0, 0, 0, 0, 1, 109 | 0, 0, 0, 0, 0, 0, 0, 0, 110 | 0, 0, 0, 0, 0, 0, 0, 1, 111 | 0, 0, 0, 0, 0, 0, 0, 1, 112 | 0, 0, 0, 0, 0, 0, 1, 0, 113 | 0, 0, 0, 0, 0, 0, 0, 0, 114 | 1, 0, 0, 0, 0, 1, 0, 1, 115 | }; 116 | 117 | void JBigCodec::LpsExchange(int CX, unsigned int ST_CX, unsigned int LSZ_ST_CX) 118 | { 119 | int v6; 120 | 121 | if ( A_interval < LSZ_ST_CX ) 122 | { 123 | PIX = MPS[CX]; 124 | ST[CX] = NMPS[ST_CX]; 125 | } 126 | else 127 | { 128 | v6 = (MPS[CX] ^ 1)& 1; // 1 - MPS[CX] 129 | PIX = v6; 130 | ST[CX] = NLPS[ST_CX]; 131 | if ( SWITCH[ST_CX] == 1 ) 132 | MPS[CX] = v6; 133 | } 134 | C_register -= A_interval << 16; 135 | A_interval = LSZ_ST_CX; 136 | } 137 | 138 | void JBigCodec::MpsExchange(int CX, unsigned int ST_CX, unsigned int LSZ_ST_CX) 139 | { 140 | int v6; 141 | 142 | if ( A_interval >= LSZ_ST_CX ) 143 | { 144 | PIX = MPS[CX]; 145 | ST[CX] = NMPS[ST_CX]; 146 | } 147 | else 148 | { 149 | v6 = (MPS[CX] ^ 1) & 1; 150 | PIX = v6; 151 | ST[CX] = NLPS[ST_CX]; 152 | if ( SWITCH[ST_CX] == 1 ) 153 | MPS[CX] = v6; 154 | } 155 | } 156 | 157 | int JBigCodec::Decode1(int CX) 158 | { 159 | A_interval -= LSZ[ST[CX]]; 160 | if ( A_interval <= C_register >> 16 ) 161 | { 162 | LpsExchange(CX, ST[CX], LSZ[ST[CX]]); 163 | } 164 | else 165 | { 166 | PIX = MPS[CX]; // difference 167 | if ( A_interval > 0x7FFF ) 168 | return PIX; 169 | MpsExchange(CX, ST[CX], LSZ[ST[CX]]); 170 | } 171 | this->RenormDe(); 172 | return PIX; 173 | } 174 | 175 | int JBigCodec::Decode(char* inbuf, unsigned int size, unsigned int height, unsigned int bitwidth, unsigned int bitwidth_in_padded_bytes, char*outbuf) 176 | { 177 | this->bitwidth = bitwidth; 178 | this->height = height; 179 | this->width_in_padded_bytes = bitwidth_in_padded_bytes; 180 | memset(outbuf, 0, height * bitwidth_in_padded_bytes); 181 | this->outptr = outbuf; 182 | this->InitDecode(inbuf, size); 183 | this->LowestDecode(); 184 | return 0; 185 | } 186 | 187 | int JBigCodec::Decode(int CX) 188 | { 189 | A_interval -= LSZ[ST[CX]]; 190 | if ( A_interval <= C_register >> 16 ) 191 | { 192 | LpsExchange(CX, ST[CX], LSZ[ST[CX]]); 193 | this->RenormDe(); 194 | } 195 | else 196 | { 197 | if ( A_interval <= 0x7FFF ) 198 | { 199 | MpsExchange(CX, ST[CX], LSZ[ST[CX]]); 200 | this->RenormDe(); 201 | } 202 | else 203 | PIX = MPS[CX]; // difference 204 | } 205 | return PIX; 206 | } 207 | 208 | /* size in ints! */ 209 | void* JBigCodec::DupLine(char* buf, unsigned int dest_offset, unsigned int src_offset, unsigned int size) 210 | { 211 | return memcpy(buf + dest_offset, buf + src_offset, 4 * size); 212 | } 213 | 214 | int JBigCodec::GetBit(int line_offset, int bit_offset) 215 | { 216 | static const unsigned char bitmask[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; 217 | 218 | if (bit_offset < 0 || bit_offset >= this->bitwidth || line_offset <0) 219 | return 0; 220 | 221 | if (line_offset >= this->height) 222 | line_offset = this->height -1; 223 | 224 | return (*(char *)(this->outptr 225 | + this->width_in_padded_bytes * (this->height - line_offset - 1) 226 | + bit_offset / 3) & bitmask[bit_offset & 7]) != 0; 227 | } 228 | 229 | int JBigCodec::GetCX(int a2, int a3) 230 | { 231 | int v3; 232 | int v4; 233 | int v5; 234 | int v6; 235 | int v7; 236 | 237 | v3 = a3; 238 | v4 = 2 * GetBit(a2 - 1, a3 + 2); 239 | v5 = 2 * (GetBit(a2 - 1, v3 + 1) + v4); 240 | v6 = 8 * (GetBit(a2 - 1, v3) + v5); 241 | v7 = 2 * (GetBit(a2 - 2, v3 + 1) + v6); 242 | return 2 * (GetBit(a2 - 2, v3) + v7); 243 | } 244 | 245 | void JBigCodec::InitDecode(char* inbuf, unsigned int buflen) 246 | { 247 | this->inbuf_length = buflen; 248 | this->read_count = 0; 249 | this->inbuf = (unsigned char*)inbuf; 250 | memset((void *)this->MPS, 0, 0x4000u); 251 | memset((void *)ST, 0, 0x4000u); 252 | this->ByteIn(); 253 | this->C_register <<=8; 254 | this->ByteIn(); 255 | this->C_register <<=8; 256 | this->ByteIn(); 257 | this->A_interval = 0x10000; 258 | } 259 | 260 | int JBigCodec::LowestDecode() 261 | { 262 | int v2 = this->width_in_padded_bytes; 263 | int v3 = v2 + 2; 264 | int v4 = 3 * (v2 + 2); 265 | int v5 = 2 * v2; 266 | char *v15 = (char *)malloc(24 * (v2 + 2)); 267 | this->ClearLine(v15, 2 * v4); 268 | int v6 = this->height; 269 | if ( v6 ) 270 | { 271 | char *v7 = v15 + 8 * v3; 272 | char *v8 = v15 + 16 * v3; 273 | int v9 = this->width_in_padded_bytes * (v6 - 1); 274 | int v10 = 0; 275 | char *v13; 276 | for ( char *i = v15; ; i = v13 ) 277 | { 278 | if ( this->Decode(0x29c) ) 279 | { 280 | this->MakeTypicalLine(v10); 281 | this->CopyLine(v8, v7, v5); 282 | } 283 | else 284 | { 285 | this->ClearLine(v8, v5); 286 | unsigned int v14 = this->GetCX(v10, 0); 287 | this->LowestDecodeLine(v9, v7, i, v14, v8); 288 | } 289 | ++v10; 290 | if ( v10 >= this->height ) 291 | break; 292 | v9 -= this->width_in_padded_bytes; 293 | v13 = v7; 294 | v7 = v8; 295 | v8 = i; 296 | } 297 | } 298 | if ( v15 ) 299 | free(v15); 300 | return 0; 301 | } 302 | 303 | int JBigCodec::LowestDecodeLine(unsigned int scanline_offset, char* a3, char* a4, unsigned int cx, char* a6) 304 | { 305 | char *v7 = a3; 306 | char *v8 = a4; 307 | unsigned int v9 = cx; 308 | int v10 = 0; 309 | int v11; 310 | int result = 0; 311 | int v13; 312 | 313 | do 314 | { 315 | this->Decode1(v9); 316 | v13 = (v9 >> 1) & 0xFDFF; 317 | if ( (this->PIX & 0xFF) == 1 ) 318 | { 319 | *(this->outptr + (v10 >> 3) + scanline_offset) |= 1 << (~(char)v10 & 7); 320 | v13 |= 0x200u; 321 | *(a6 + v10) = 1; 322 | } 323 | v11 = v13 | 4; 324 | if ( *(v8 + v10 + 2) != 1 ) 325 | v11 &= 0xFFFBu; 326 | v9 = v11 | 0x80; 327 | if ( *(v7 + v10 + 3) != 1 ) 328 | v9 &= 0xFF7Fu; 329 | ++v10; 330 | } 331 | while ( v10 < this->bitwidth ); 332 | return result; 333 | } 334 | 335 | /* this routine copies one line from the bottom upwards */ 336 | void* JBigCodec::MakeTypicalLine(int number) 337 | { 338 | if (number > 0) 339 | { 340 | int max = this->height - 1; 341 | if (number <= max) 342 | { 343 | return this->DupLine(this->outptr, 344 | this->width_in_padded_bytes * (max-number), 345 | this->width_in_padded_bytes * (max-number) + this->width_in_padded_bytes, 346 | this->width_in_padded_bytes >> 2); /* bytes / 4 */ 347 | } 348 | } 349 | return NULL; 350 | } 351 | 352 | void JBigCodec::RenormDe() 353 | { 354 | do 355 | { 356 | if ( !this->CT ) 357 | { 358 | this->ByteIn(); 359 | } 360 | this->A_interval *= 2; 361 | this->C_register *= 2; 362 | -- this->CT; 363 | } 364 | while ( this->A_interval <= 0x7FFF ); 365 | if ( !this->CT ) 366 | this->ByteIn(); 367 | return; 368 | } 369 | -------------------------------------------------------------------------------- /pdm.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "black" 3 | version = "22.10.0" 4 | requires_python = ">=3.7" 5 | summary = "The uncompromising code formatter." 6 | dependencies = [ 7 | "click>=8.0.0", 8 | "mypy-extensions>=0.4.3", 9 | "pathspec>=0.9.0", 10 | "platformdirs>=2", 11 | "tomli>=1.1.0; python_full_version < \"3.11.0a7\"", 12 | ] 13 | 14 | [[package]] 15 | name = "click" 16 | version = "8.1.3" 17 | requires_python = ">=3.7" 18 | summary = "Composable command line interface toolkit" 19 | dependencies = [ 20 | "colorama; platform_system == \"Windows\"", 21 | ] 22 | 23 | [[package]] 24 | name = "colorama" 25 | version = "0.4.6" 26 | requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 27 | summary = "Cross-platform colored terminal text." 28 | 29 | [[package]] 30 | name = "flake8" 31 | version = "5.0.4" 32 | requires_python = ">=3.6.1" 33 | summary = "the modular source code checker: pep8 pyflakes and co" 34 | dependencies = [ 35 | "mccabe<0.8.0,>=0.7.0", 36 | "pycodestyle<2.10.0,>=2.9.0", 37 | "pyflakes<2.6.0,>=2.5.0", 38 | ] 39 | 40 | [[package]] 41 | name = "imagesize" 42 | version = "1.3.0" 43 | requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 44 | summary = "Getting image size from png/jpeg/jpeg2000/gif file" 45 | 46 | [[package]] 47 | name = "isort" 48 | version = "5.10.1" 49 | requires_python = ">=3.6.1,<4.0" 50 | summary = "A Python utility / library to sort Python imports." 51 | 52 | [[package]] 53 | name = "mccabe" 54 | version = "0.7.0" 55 | requires_python = ">=3.6" 56 | summary = "McCabe checker, plugin for flake8" 57 | 58 | [[package]] 59 | name = "mypy-extensions" 60 | version = "0.4.3" 61 | summary = "Experimental type system extensions for programs checked with the mypy typechecker." 62 | 63 | [[package]] 64 | name = "pathspec" 65 | version = "0.10.2" 66 | requires_python = ">=3.7" 67 | summary = "Utility library for gitignore style pattern matching of file paths." 68 | 69 | [[package]] 70 | name = "platformdirs" 71 | version = "2.5.4" 72 | requires_python = ">=3.7" 73 | summary = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 74 | 75 | [[package]] 76 | name = "pycodestyle" 77 | version = "2.9.1" 78 | requires_python = ">=3.6" 79 | summary = "Python style guide checker" 80 | 81 | [[package]] 82 | name = "pyflakes" 83 | version = "2.5.0" 84 | requires_python = ">=3.6" 85 | summary = "passive checker of Python programs" 86 | 87 | [[package]] 88 | name = "pypdf2" 89 | version = "2.2.0" 90 | requires_python = ">=3.6" 91 | summary = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" 92 | 93 | [[package]] 94 | name = "setuptools" 95 | version = "65.6.0" 96 | requires_python = ">=3.7" 97 | summary = "Easily download, build, install, upgrade, and uninstall Python packages" 98 | 99 | [[package]] 100 | name = "tomli" 101 | version = "2.0.1" 102 | requires_python = ">=3.7" 103 | summary = "A lil' TOML parser" 104 | 105 | [metadata] 106 | lock_version = "4.0" 107 | content_hash = "sha256:1831b76c8f8fb125c8f108835e9163ec885af94991206055c9aeba6c98c5468e" 108 | 109 | [metadata.files] 110 | "black 22.10.0" = [ 111 | {url = "https://mirrors.aliyun.com/pypi/packages/2c/11/f2737cd3b458d91401801e83a014e87c63e8904dc063200f77826c352f54/black-22.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2039230db3c6c639bd84efe3292ec7b06e9214a2992cd9beb293d639c6402edb"}, 112 | {url = "https://mirrors.aliyun.com/pypi/packages/3d/c5/b3ab9b563f35fb284d37ab2b14acaed9a27d8cdea9c31364766eb54946a7/black-22.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:9311e99228ae10023300ecac05be5a296f60d2fd10fff31cf5c1fa4ca4b1988d"}, 113 | {url = "https://mirrors.aliyun.com/pypi/packages/56/df/913d71817c7034edba25d596c54f782c2f809b6af30367d2f00309e8890a/black-22.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:21199526696b8f09c3997e2b4db8d0b108d801a348414264d2eb8eb2532e540d"}, 114 | {url = "https://mirrors.aliyun.com/pypi/packages/69/21/846c95710cc6561ba980bd6c72479dbcdde742e927ff5ef7340916d003ac/black-22.10.0-1fixedarch-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:2644b5d63633702bc2c5f3754b1b475378fbbfb481f62319388235d0cd104c2d"}, 115 | {url = "https://mirrors.aliyun.com/pypi/packages/69/84/903cdf41514088d5a716538cb189c471ab34e56ae9a1c2da6b8bfe8e4dbf/black-22.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:974308c58d057a651d182208a484ce80a26dac0caef2895836a92dd6ebd725e0"}, 116 | {url = "https://mirrors.aliyun.com/pypi/packages/71/f8/57e47ea67f59613c4368a952062bc3429131249920cffbb8362fd404b733/black-22.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fba8a281e570adafb79f7755ac8721b6cf1bbf691186a287e990c7929c7692ff"}, 117 | {url = "https://mirrors.aliyun.com/pypi/packages/86/da/edebcc6c13441d91eff6761e50512bc6d6886a556dc5357b399694122b4f/black-22.10.0-1fixedarch-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:197df8509263b0b8614e1df1756b1dd41be6738eed2ba9e9769f3880c2b9d7b6"}, 118 | {url = "https://mirrors.aliyun.com/pypi/packages/91/e6/d9b78987d7d903369ba1a0b795bce4de06f0155be6609f15e8950aef8f7e/black-22.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:444ebfb4e441254e87bad00c661fe32df9969b2bf224373a448d8aca2132b395"}, 119 | {url = "https://mirrors.aliyun.com/pypi/packages/a3/89/629fca2eea0899c06befaa58dc0f49d56807d454202bb2e54bd0d98c77f3/black-22.10.0.tar.gz", hash = "sha256:f513588da599943e0cde4e32cc9879e825d58720d6557062d1098c5ad80080e1"}, 120 | {url = "https://mirrors.aliyun.com/pypi/packages/a5/5f/9cfc6dd95965f8df30194472543e6f0515a10d78ea5378426ef1546735c7/black-22.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ff67aec0a47c424bc99b71005202045dc09270da44a27848d534600ac64fc7"}, 121 | {url = "https://mirrors.aliyun.com/pypi/packages/a6/84/5c3f3ffc4143fa7e208d745d2239d915e74d3709fdbc64c3e98d3fd27e56/black-22.10.0-1fixedarch-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5d8f74030e67087b219b032aa33a919fae8806d49c867846bfacde57f43972ef"}, 122 | {url = "https://mirrors.aliyun.com/pypi/packages/ab/15/61119d166a44699827c112d7c4726421f14323c2cb7aa9f4c26628f237f9/black-22.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:432247333090c8c5366e69627ccb363bc58514ae3e63f7fc75c54b1ea80fa7de"}, 123 | {url = "https://mirrors.aliyun.com/pypi/packages/ae/49/ea03c318a25be359b8e5178a359d47e2da8f7524e1522c74b8f74c66b6f8/black-22.10.0-1fixedarch-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5cc42ca67989e9c3cf859e84c2bf014f6633db63d1cbdf8fdb666dcd9e77e3fa"}, 124 | {url = "https://mirrors.aliyun.com/pypi/packages/b0/9e/fa912c5ae4b8eb6d36982fc8ac2d779cf944dbd7c3c1fe7a28acf462c1ed/black-22.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b49776299fece66bffaafe357d929ca9451450f5466e997a7285ab0fe28e3b"}, 125 | {url = "https://mirrors.aliyun.com/pypi/packages/b9/51/403b0b0eb9fb412ca02b79dc38472469f2f88c9aacc6bb5262143e4ff0bc/black-22.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72ef3925f30e12a184889aac03d77d031056860ccae8a1e519f6cbb742736383"}, 126 | {url = "https://mirrors.aliyun.com/pypi/packages/ce/6f/74492b8852ee4f2ad2178178f6b65bc8fc80ad539abe56c1c23eab6732e2/black-22.10.0-py3-none-any.whl", hash = "sha256:c957b2b4ea88587b46cf49d1dc17681c1e672864fd7af32fc1e9664d572b3458"}, 127 | {url = "https://mirrors.aliyun.com/pypi/packages/d0/5a/5f31494e3acbb6319ee60c3a3a09d3e536a3fd2353f76af9cbff799c4999/black-22.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:915ace4ff03fdfff953962fa672d44be269deb2eaf88499a0f8805221bc68c87"}, 128 | {url = "https://mirrors.aliyun.com/pypi/packages/e2/2f/a8406a9e337a213802aa90a3e9fbf90c86f3edce92f527255fd381309b77/black-22.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b9b29da4f564ba8787c119f37d174f2b69cdfdf9015b7d8c5c16121ddc054ae"}, 129 | {url = "https://mirrors.aliyun.com/pypi/packages/e3/b4/9203f1a0c99aa30389b61fa8cb54bc9f4bf16ac3aa74630c6b974ed3f3b0/black-22.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e464456d24e23d11fced2bc8c47ef66d471f845c7b7a42f3bd77bf3d1789650"}, 130 | {url = "https://mirrors.aliyun.com/pypi/packages/f2/23/f4278377cabf882298b4766e977fd04377f288d1ccef706953076a1e0598/black-22.10.0-1fixedarch-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:e41a86c6c650bcecc6633ee3180d80a025db041a8e2398dcc059b3afa8382cd4"}, 131 | {url = "https://mirrors.aliyun.com/pypi/packages/ff/ce/22281871536b3d79474fd44d48dad48f7cbc5c3982bddf6a7495e7079d00/black-22.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:819dc789f4498ecc91438a7de64427c73b45035e2e3680c92e18795a839ebb66"}, 132 | ] 133 | "click 8.1.3" = [ 134 | {url = "https://mirrors.aliyun.com/pypi/packages/59/87/84326af34517fca8c58418d148f2403df25303e02736832403587318e9e8/click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, 135 | {url = "https://mirrors.aliyun.com/pypi/packages/c2/f1/df59e28c642d583f7dacffb1e0965d0e00b218e0186d7858ac5233dce840/click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, 136 | ] 137 | "colorama 0.4.6" = [ 138 | {url = "https://mirrors.aliyun.com/pypi/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 139 | {url = "https://mirrors.aliyun.com/pypi/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 140 | ] 141 | "flake8 5.0.4" = [ 142 | {url = "https://mirrors.aliyun.com/pypi/packages/ad/00/9808c62b2d529cefc69ce4e4a1ea42c0f855effa55817b7327ec5b75e60a/flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, 143 | {url = "https://mirrors.aliyun.com/pypi/packages/cf/a0/b881b63a17a59d9d07f5c0cc91a29182c8e8a9aa2bde5b3b2b16519c02f4/flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, 144 | ] 145 | "imagesize 1.3.0" = [ 146 | {url = "https://mirrors.aliyun.com/pypi/packages/60/d6/5e803b17f4d42e085c365b44fda34deb0d8675a1a910635930b831c43f07/imagesize-1.3.0-py2.py3-none-any.whl", hash = "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c"}, 147 | {url = "https://mirrors.aliyun.com/pypi/packages/f6/27/b147794d43249e8303a06f427e407a090696b65b81045e36f8873d8d8a42/imagesize-1.3.0.tar.gz", hash = "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"}, 148 | ] 149 | "isort 5.10.1" = [ 150 | {url = "https://mirrors.aliyun.com/pypi/packages/ab/e9/964cb0b2eedd80c92f5172f1f8ae0443781a9d461c1372a3ce5762489593/isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, 151 | {url = "https://mirrors.aliyun.com/pypi/packages/b8/5b/f18e227df38b94b4ee30d2502fd531bebac23946a2497e5595067a561274/isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, 152 | ] 153 | "mccabe 0.7.0" = [ 154 | {url = "https://mirrors.aliyun.com/pypi/packages/27/1a/1f68f9ba0c207934b35b86a8ca3aad8395a3d6dd7921c0686e23853ff5a9/mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, 155 | {url = "https://mirrors.aliyun.com/pypi/packages/e7/ff/0ffefdcac38932a54d2b5eed4e0ba8a408f215002cd178ad1df0f2806ff8/mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, 156 | ] 157 | "mypy-extensions 0.4.3" = [ 158 | {url = "https://mirrors.aliyun.com/pypi/packages/5c/eb/975c7c080f3223a5cdaff09612f3a5221e4ba534f7039db34c35d95fa6a5/mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, 159 | {url = "https://mirrors.aliyun.com/pypi/packages/63/60/0582ce2eaced55f65a4406fc97beba256de4b7a95a0034c6576458c6519f/mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, 160 | ] 161 | "pathspec 0.10.2" = [ 162 | {url = "https://mirrors.aliyun.com/pypi/packages/42/79/94b21d5fabb97749ca94590315abe150a750483c87add8543781bcb6cd26/pathspec-0.10.2-py3-none-any.whl", hash = "sha256:88c2606f2c1e818b978540f73ecc908e13999c6c3a383daf3705652ae79807a5"}, 163 | {url = "https://mirrors.aliyun.com/pypi/packages/a2/29/959c72e1a6c3c25eaa46b9bfcc7fd401f65af83163d4796af09272c83c8a/pathspec-0.10.2.tar.gz", hash = "sha256:8f6bf73e5758fd365ef5d58ce09ac7c27d2833a8d7da51712eac6e27e35141b0"}, 164 | ] 165 | "platformdirs 2.5.4" = [ 166 | {url = "https://mirrors.aliyun.com/pypi/packages/61/e0/15ba41c6716acb033c3793be3a02f26c53914ecd9bdd6b315001f8f5f581/platformdirs-2.5.4-py3-none-any.whl", hash = "sha256:af0276409f9a02373d540bf8480021a048711d572745aef4b7842dad245eba10"}, 167 | {url = "https://mirrors.aliyun.com/pypi/packages/cb/5f/dda8451435f17ed8043eab5ffe04e47d703debe8fe845eb074f42260e50a/platformdirs-2.5.4.tar.gz", hash = "sha256:1006647646d80f16130f052404c6b901e80ee4ed6bef6792e1f238a8969106f7"}, 168 | ] 169 | "pycodestyle 2.9.1" = [ 170 | {url = "https://mirrors.aliyun.com/pypi/packages/67/e4/fc77f1039c34b3612c4867b69cbb2b8a4e569720b1f19b0637002ee03aff/pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, 171 | {url = "https://mirrors.aliyun.com/pypi/packages/b6/83/5bcaedba1f47200f0665ceb07bcb00e2be123192742ee0edfb66b600e5fd/pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, 172 | ] 173 | "pyflakes 2.5.0" = [ 174 | {url = "https://mirrors.aliyun.com/pypi/packages/07/92/f0cb5381f752e89a598dd2850941e7f570ac3cb8ea4a344854de486db152/pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, 175 | {url = "https://mirrors.aliyun.com/pypi/packages/dc/13/63178f59f74e53acc2165aee4b002619a3cfa7eeaeac989a9eb41edf364e/pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, 176 | ] 177 | "pypdf2 2.2.0" = [ 178 | {url = "https://mirrors.aliyun.com/pypi/packages/3f/1e/9204070476be4b6f598e4c042590385341c6019c862cc73892f31f17b45c/PyPDF2-2.2.0-py3-none-any.whl", hash = "sha256:6167a965a2b44f6e763f0bb2028810898bace2caae5ddd5040a8b57e1c6aaa5a"}, 179 | {url = "https://mirrors.aliyun.com/pypi/packages/b8/f4/8bbb7a9fa8b6bf7eb55c0d14f31396d2b7812c270454ab02e582c281ad68/PyPDF2-2.2.0.tar.gz", hash = "sha256:12a289d7be1cac0b066b05854ebc40dfaaeea31244ec45ea02682e51deefb7e8"}, 180 | ] 181 | "setuptools 65.6.0" = [ 182 | {url = "https://mirrors.aliyun.com/pypi/packages/09/b6/33512596fb92ba68f7c45e9bbc5e1bb9b24fbd941f9aece250fb420c2f5c/setuptools-65.6.0.tar.gz", hash = "sha256:d1eebf881c6114e51df1664bc2c9133d022f78d12d5f4f665b9191f084e2862d"}, 183 | {url = "https://mirrors.aliyun.com/pypi/packages/1f/97/c03668380f278f1f8b0486d820c142cf224bba1bd78416e1797b52e0e81c/setuptools-65.6.0-py3-none-any.whl", hash = "sha256:6211d2f5eddad8757bd0484923ca7c0a6302ebc4ab32ea5e94357176e0ca0840"}, 184 | ] 185 | "tomli 2.0.1" = [ 186 | {url = "https://mirrors.aliyun.com/pypi/packages/97/75/10a9ebee3fd790d20926a90a2547f0bf78f371b2f13aa822c759680ca7b9/tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 187 | {url = "https://mirrors.aliyun.com/pypi/packages/c0/3f/d7af728f075fb08564c5949a9c95e44352e23dee646869fa104a3b2060a3/tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 188 | ] 189 | -------------------------------------------------------------------------------- /caj2pdf/cajparser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | from shutil import copy 4 | from subprocess import STDOUT, CalledProcessError, check_output 5 | 6 | from PyPDF2.errors import PdfReadError 7 | 8 | from .utils import ( 9 | add_outlines, 10 | fnd, 11 | fnd_all, 12 | fnd_rvrs, 13 | fnd_unuse_no, 14 | find_redundant_images, 15 | ) 16 | 17 | KDH_PASSPHRASE = b"FZHMEI" 18 | 19 | printables = "".join( 20 | [ 21 | (len(repr(chr(x))) == 3) and (x != 47) and (x < 128) and chr(x) or "." 22 | for x in range(256) 23 | ] 24 | ) 25 | 26 | image_type = {0: "JBIG", 1: "JPEG", 2: "JPEG", 3: "JBIG2"} # up-side-down 27 | 28 | 29 | class CAJParser(object): 30 | def __init__(self, filename): 31 | self.filename = filename 32 | try: 33 | with open(filename, "rb") as caj: 34 | caj_read4 = caj.read(4) 35 | if caj_read4[0:1] == b"\xc8": 36 | self.format = "C8" 37 | self._PAGE_NUMBER_OFFSET = 0x08 38 | self._TOC_NUMBER_OFFSET = 0 # No TOC 39 | self._TOC_END_OFFSET = 0x50 40 | self._PAGEDATA_OFFSET = self._TOC_END_OFFSET + 20 * self.page_num 41 | return 42 | if caj_read4[0:2] == b"HN": 43 | if ( 44 | caj.read(2) == b"\xc8\x00" 45 | ): # Most of them are: 90 01, handled later 46 | self.format = "HN" 47 | self._PAGE_NUMBER_OFFSET = 0x90 48 | self._TOC_NUMBER_OFFSET = 0 49 | self._TOC_END_OFFSET = 0xD8 50 | self._PAGEDATA_OFFSET = ( 51 | self._TOC_END_OFFSET + 20 * self.page_num 52 | ) 53 | return 54 | fmt = ( 55 | struct.unpack("4s", caj_read4)[0] 56 | .replace(b"\x00", b"") 57 | .decode("gb18030") 58 | ) 59 | if fmt == "CAJ": 60 | self.format = "CAJ" 61 | self._PAGE_NUMBER_OFFSET = 0x10 62 | self._TOC_NUMBER_OFFSET = 0x110 63 | elif fmt == "HN": 64 | self.format = "HN" 65 | self._PAGE_NUMBER_OFFSET = 0x90 66 | self._TOC_NUMBER_OFFSET = 0x158 67 | 68 | # TOC = [toc_num] followed by [toc_entry * toc_num] 69 | # followed by [Page Info struct (20-byte) * page_num], followed by Page Data 70 | self._TOC_END_OFFSET = ( 71 | self._TOC_NUMBER_OFFSET + 4 + 0x134 * self.toc_num 72 | ) 73 | self._PAGEDATA_OFFSET = self._TOC_END_OFFSET + 20 * self.page_num 74 | elif fmt == "%PDF": 75 | self.format = "PDF" 76 | elif fmt == "KDH ": 77 | self.format = "KDH" 78 | elif fmt == "TEB": 79 | self.format = "TEB" 80 | else: 81 | self.format = None 82 | raise SystemExit("Unknown file type.") 83 | except UnicodeDecodeError: 84 | raise SystemExit("Unknown file type.") 85 | 86 | @property 87 | def page_num(self): 88 | with open(self.filename, "rb") as caj: 89 | caj.seek(self._PAGE_NUMBER_OFFSET) 90 | [page_num] = struct.unpack("i", caj.read(4)) 91 | return page_num 92 | 93 | @property 94 | def toc_num(self): 95 | if self._TOC_NUMBER_OFFSET == 0: 96 | return 0 97 | with open(self.filename, "rb") as caj: 98 | caj.seek(self._TOC_NUMBER_OFFSET) 99 | [toc_num] = struct.unpack("i", caj.read(4)) 100 | return toc_num 101 | 102 | def get_toc(self, verbose=False): 103 | toc = [] 104 | if self._TOC_NUMBER_OFFSET == 0: 105 | return toc 106 | with open(self.filename, "rb") as caj: 107 | for i in range(self.toc_num): 108 | caj.seek(self._TOC_NUMBER_OFFSET + 4 + 0x134 * i) 109 | toc_bytes = struct.unpack("256s24s12s12si", caj.read(0x134)) 110 | ttl_end = toc_bytes[0].find(b"\x00") 111 | title = toc_bytes[0][0:ttl_end].decode("gb18030").encode("utf-8") 112 | pg_end = toc_bytes[2].find(b"\x00") 113 | page = int(toc_bytes[2][0:pg_end]) 114 | level = toc_bytes[4] 115 | toc_entry = {"title": title, "page": page, "level": level} 116 | if verbose: 117 | print(" " * (level - 1), title.decode("utf-8")) 118 | toc.append(toc_entry) 119 | if verbose: 120 | print( 121 | "TOC END: 0x%04X" 122 | % (self._TOC_NUMBER_OFFSET + 4 + 0x134 * self.toc_num) 123 | ) 124 | return toc 125 | 126 | def output_toc(self, dest): 127 | toc_items = self.get_toc() 128 | with open(dest, "wb") as f: 129 | for toc in toc_items: 130 | f.write( 131 | b" " * (toc["level"] - 1) 132 | + toc["title"] 133 | + b" " 134 | + str(toc["page"]).encode("utf-8") 135 | + b"\n" 136 | ) 137 | 138 | def convert(self, dest): 139 | if self.format == "CAJ": 140 | self._convert_caj(dest) 141 | elif self.format == "HN": 142 | self._convert_hn(dest) 143 | elif self.format == "C8": 144 | self._convert_hn(dest) 145 | elif self.format == "PDF": 146 | self._convert_pdf(dest) 147 | elif self.format == "KDH": 148 | self._convert_kdh(dest) 149 | 150 | def parse(self): 151 | if self.format == "CAJ": 152 | pass 153 | elif self.format == "HN": 154 | self._parse_hn() 155 | elif self.format == "C8": 156 | self._parse_hn() 157 | elif self.format == "PDF": 158 | pass 159 | elif self.format == "KDH": 160 | pass 161 | 162 | def text_extract(self): 163 | if self.format == "CAJ": 164 | pass 165 | if self.format == "HN": 166 | self._text_extract_hn() 167 | elif self.format == "C8": 168 | self._text_extract_hn() 169 | elif self.format == "PDF": 170 | pass 171 | elif self.format == "KDH": 172 | pass 173 | 174 | def _convert_caj(self, dest): 175 | caj = open(self.filename, "rb") 176 | 177 | # Extract original PDF data (and add header) 178 | caj.seek(self._PAGE_NUMBER_OFFSET + 4) 179 | [pdf_start_pointer] = struct.unpack("i", caj.read(4)) 180 | caj.seek(pdf_start_pointer) 181 | [pdf_start] = struct.unpack("i", caj.read(4)) 182 | pdf_end = fnd_all(caj, b"endobj")[-1] + 6 183 | pdf_length = pdf_end - pdf_start 184 | caj.seek(pdf_start) 185 | pdf_data = b"%PDF-1.3\r\n" + caj.read(pdf_length) + b"\r\n" 186 | with open("pdf.tmp", "wb") as f: 187 | f.write(pdf_data) 188 | pdf = open("pdf.tmp", "rb") 189 | 190 | # deal with disordered PDF data 191 | endobj_addr = fnd_all(pdf, b"endobj") 192 | obj_no = [] 193 | for addr in endobj_addr: 194 | startobj = fnd_rvrs(pdf, b" 0 obj", addr) 195 | startobj1 = fnd_rvrs(pdf, b"\r", startobj) 196 | startobj2 = fnd_rvrs(pdf, b"\n", startobj) 197 | startobj = max(startobj1, startobj2) 198 | length = fnd(pdf, b" ", startobj) - startobj 199 | pdf.seek(startobj) 200 | [no] = struct.unpack(str(length) + "s", pdf.read(length)) 201 | if int(no) not in obj_no: 202 | obj_no.append(int(no)) 203 | obj_len = addr - startobj + 6 204 | pdf.seek(startobj) 205 | [obj] = struct.unpack(str(obj_len) + "s", pdf.read(obj_len)) 206 | 207 | # Add Catalog (find obj_no of pages) 208 | inds_addr = [i + 8 for i in fnd_all(pdf, b"/Parent")] 209 | inds = [] 210 | for addr in inds_addr: 211 | length = fnd(pdf, b" ", addr) - addr 212 | pdf.seek(addr) 213 | [ind] = struct.unpack(str(length) + "s", pdf.read(length)) 214 | inds.append(int(ind)) 215 | # get pages_obj_no list containing distinct elements 216 | # & find missing pages object(s) -- top pages object(s) in pages_obj_no 217 | pages_obj_no = [] 218 | top_pages_obj_no = [] 219 | for ind in inds: 220 | if (ind not in pages_obj_no) and (ind not in top_pages_obj_no): 221 | if fnd(pdf, bytes("\r{0} 0 obj".format(ind), "utf-8")) == -1: 222 | top_pages_obj_no.append(ind) 223 | else: 224 | pages_obj_no.append(ind) 225 | single_pages_obj_missed = len(top_pages_obj_no) == 1 226 | multi_pages_obj_missed = len(top_pages_obj_no) > 1 227 | # generate catalog object 228 | catalog_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no) 229 | obj_no.append(catalog_obj_no) 230 | root_pages_obj_no = None 231 | if multi_pages_obj_missed: 232 | root_pages_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no) 233 | elif single_pages_obj_missed: 234 | root_pages_obj_no = top_pages_obj_no[0] 235 | top_pages_obj_no = pages_obj_no 236 | else: # root pages object exists, then find the root pages object # 237 | found = False 238 | for pon in pages_obj_no: 239 | tmp_addr = fnd(pdf, bytes("\r{0} 0 obj".format(pon), "utf-8")) 240 | while True: 241 | pdf.seek(tmp_addr) 242 | [_str] = struct.unpack("6s", pdf.read(6)) 243 | if _str == b"Parent": 244 | break 245 | elif _str == b"endobj": 246 | root_pages_obj_no = pon 247 | found = True 248 | break 249 | tmp_addr = tmp_addr + 1 250 | if found: 251 | break 252 | catalog = bytes( 253 | "{0} 0 obj\r<>\rendobj\r".format( 254 | catalog_obj_no, root_pages_obj_no 255 | ), 256 | "utf-8", 257 | ) 258 | pdf_data += catalog 259 | pdf.close() 260 | with open("pdf.tmp", "wb") as f: 261 | f.write(pdf_data) 262 | pdf = open("pdf.tmp", "rb") 263 | 264 | # Add Pages obj and EOF mark 265 | # if root pages object exist, pass 266 | # deal with single missing pages object 267 | if single_pages_obj_missed or multi_pages_obj_missed: 268 | inds_str = ["{0} 0 R".format(i) for i in top_pages_obj_no] 269 | kids_str = "[{0}]".format(" ".join(inds_str)) 270 | pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format( 271 | root_pages_obj_no, kids_str, self.page_num 272 | ) 273 | pdf_data += bytes(pages_str, "utf-8") 274 | pdf.close() 275 | with open("pdf.tmp", "wb") as f: 276 | f.write(pdf_data) 277 | pdf = open("pdf.tmp", "rb") 278 | # deal with multiple missing pages objects 279 | if multi_pages_obj_missed: 280 | kids_dict = {i: [] for i in top_pages_obj_no} 281 | count_dict = {i: 0 for i in top_pages_obj_no} 282 | for tpon in top_pages_obj_no: 283 | kids_addr = fnd_all(pdf, bytes("/Parent {0} 0 R".format(tpon), "utf-8")) 284 | for kid in kids_addr: 285 | ind = fnd_rvrs(pdf, b"obj", kid) - 4 286 | addr = fnd_rvrs(pdf, b"\r", ind) 287 | length = fnd(pdf, b" ", addr) - addr 288 | pdf.seek(addr) 289 | [ind] = struct.unpack(str(length) + "s", pdf.read(length)) 290 | kids_dict[tpon].append(int(ind)) 291 | type_addr = fnd(pdf, b"/Type", addr) + 5 292 | tmp_addr = fnd(pdf, b"/", type_addr) + 1 293 | pdf.seek(tmp_addr) 294 | [_type] = struct.unpack("5s", pdf.read(5)) 295 | if _type == b"Pages": 296 | cnt_addr = fnd(pdf, b"/Count ", addr) + 7 297 | pdf.seek(cnt_addr) 298 | [_str] = struct.unpack("1s", pdf.read(1)) 299 | cnt_len = 0 300 | while _str not in [b" ", b"\r", b"/"]: 301 | cnt_len += 1 302 | pdf.seek(cnt_addr + cnt_len) 303 | [_str] = struct.unpack("1s", pdf.read(1)) 304 | pdf.seek(cnt_addr) 305 | [cnt] = struct.unpack(str(cnt_len) + "s", pdf.read(cnt_len)) 306 | count_dict[tpon] += int(cnt) 307 | else: # _type == b"Page" 308 | count_dict[tpon] += 1 309 | kids_no_str = ["{0} 0 R".format(i) for i in kids_dict[tpon]] 310 | kids_str = "[{0}]".format(" ".join(kids_no_str)) 311 | pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format( 312 | tpon, kids_str, count_dict[tpon] 313 | ) 314 | pdf_data += bytes(pages_str, "utf-8") 315 | pdf_data += bytes("\n%%EOF\r", "utf-8") 316 | pdf.close() 317 | with open("pdf.tmp", "wb") as f: 318 | f.write(pdf_data) 319 | 320 | # Use mutool to repair xref 321 | try: 322 | check_output(["mutool", "clean", "pdf.tmp", "pdf_toc.pdf"], stderr=STDOUT) 323 | except CalledProcessError as e: 324 | print(e.output.decode("utf-8")) 325 | raise SystemExit( 326 | "Command mutool returned non-zero exit status " + str(e.returncode) 327 | ) 328 | 329 | # Add Outlines 330 | try: 331 | add_outlines(self.get_toc(), "pdf_toc.pdf", dest) 332 | except PdfReadError as e: 333 | print("PdfReadError:", str(e)) 334 | copy("pdf_toc.pdf", dest) 335 | pass 336 | os.remove("pdf.tmp") 337 | os.remove("pdf_toc.pdf") 338 | 339 | def _convert_hn(self, dest): 340 | caj = open(self.filename, "rb") 341 | image_list = [] 342 | 343 | import zlib 344 | 345 | from .pdfwutils import Colorspace, ImageFormat, convert_ImageList 346 | 347 | for i in range(self.page_num): 348 | caj.seek(self._TOC_END_OFFSET + i * 20) 349 | [ 350 | page_data_offset, 351 | size_of_text_section, 352 | images_per_page, 353 | page_no, 354 | unk2, 355 | next_page_data_offset, 356 | ] = struct.unpack("iihhii", caj.read(20)) 357 | caj.seek(page_data_offset) 358 | text_header_read32 = caj.read(32) 359 | if (text_header_read32[8:20] == b"COMPRESSTEXT") or ( 360 | text_header_read32[0:12] == b"COMPRESSTEXT" 361 | ): 362 | coff = 8 363 | if text_header_read32[0:12] == b"COMPRESSTEXT": 364 | coff = 0 365 | [expanded_text_size] = struct.unpack( 366 | "i", text_header_read32[12 + coff : 16 + coff] 367 | ) 368 | caj.seek(page_data_offset + 16 + coff) 369 | data = caj.read(size_of_text_section - 16 - coff) 370 | output = zlib.decompress(data, bufsize=expanded_text_size) 371 | if len(output) != expanded_text_size: 372 | raise SystemExit("Unexpected:", len(output), expanded_text_size) 373 | else: 374 | caj.seek(page_data_offset) 375 | output = caj.read(size_of_text_section) 376 | from .HNParsePage import HNParsePage 377 | 378 | page_style = next_page_data_offset > page_data_offset 379 | page_data = HNParsePage(output, page_style) 380 | 381 | current_offset = page_data_offset + size_of_text_section 382 | (found, images_per_page) = find_redundant_images( 383 | caj, current_offset, images_per_page 384 | ) 385 | if found: 386 | print( 387 | "Page %d, skipping %d redundant images" 388 | % (i + 1, images_per_page * (images_per_page - 1)) 389 | ) 390 | 391 | if images_per_page > 1: 392 | if len(page_data.figures) == images_per_page: 393 | if (page_data.figures[0][0] == 0) and ( 394 | page_data.figures[0][1] == 0 395 | ): 396 | image_list.append(None) 397 | image_list.append(page_data.figures) 398 | else: 399 | print( 400 | "Page %d, Image Count %d, first image not at origin, expanding to %d pages" 401 | % (i + 1, len(page_data.figures), images_per_page) 402 | ) 403 | else: 404 | print( 405 | "Page %d, Image Count %d != %d" 406 | % (i + 1, len(page_data.figures), images_per_page) 407 | ) 408 | if len(page_data.figures) > images_per_page: 409 | print("\tTruncating Page %d," % (i + 1), page_data.figures) 410 | image_list.append(None) 411 | image_list.append(page_data.figures[0:images_per_page]) 412 | else: 413 | print( 414 | "Page %d expanding to %d separate image pages" 415 | % (i + 1, images_per_page) 416 | ) 417 | elif images_per_page == 1: 418 | if (len(page_data.figures) == 0) or ( 419 | (len(page_data.figures) > 0) 420 | and ( 421 | not ( 422 | (page_data.figures[0][0] == 0) 423 | and (page_data.figures[0][1] == 0) 424 | ) 425 | ) 426 | ): 427 | print( 428 | "Page %d possibly text-only + single figure(%d)" 429 | % (i + 1, len(page_data.figures)) 430 | ) 431 | else: 432 | # don't care about images_per_page == 0 433 | pass 434 | for j in range(images_per_page): 435 | caj.seek(current_offset) 436 | read32 = caj.read(32) 437 | [ 438 | image_type_enum, 439 | offset_to_image_data, 440 | size_of_image_data, 441 | ] = struct.unpack("iii", read32[0:12]) 442 | if offset_to_image_data != current_offset + 12: 443 | raise SystemExit("unusual image offset") 444 | caj.seek(offset_to_image_data) 445 | image_data = caj.read(size_of_image_data) 446 | current_offset = offset_to_image_data + size_of_image_data 447 | if image_type[image_type_enum] == "JBIG": 448 | from .dep.jbigdec import CImage 449 | 450 | cimage = CImage(image_data) 451 | out = cimage.DecodeJbig() 452 | # PBM is only padded to 8 rather than 32. 453 | # If the padding is larger, write padded file. 454 | width = cimage.width 455 | if cimage.bytes_per_line > ((cimage.width + 7) >> 3): 456 | width = cimage.bytes_per_line << 3 457 | image_item = ( 458 | Colorspace.P, 459 | (300, 300), 460 | ImageFormat.PBM, 461 | zlib.compress(out), 462 | width, 463 | cimage.height, 464 | [0xFFFFFF, 0], 465 | False, 466 | 1, 467 | 0, 468 | ) 469 | elif image_type[image_type_enum] == "JBIG2": 470 | from .dep.jbig2dec import CImage 471 | 472 | cimage = CImage(image_data) 473 | out = cimage.DecodeJbig2() 474 | # PBM is only padded to 8 rather than 32. 475 | # If the padding is larger, write padded file. 476 | width = cimage.width 477 | if cimage.bytes_per_line > ((cimage.width + 7) >> 3): 478 | width = cimage.bytes_per_line << 3 479 | image_item = ( 480 | Colorspace.P, 481 | (300, 300), 482 | ImageFormat.PBM, 483 | zlib.compress(out), 484 | width, 485 | cimage.height, 486 | [0xFFFFFF, 0], 487 | False, 488 | 1, 489 | 0, 490 | ) 491 | elif image_type[image_type_enum] == "JPEG": 492 | colorspace = Colorspace.RGB 493 | component = 3 494 | # stock libjpeg location 495 | ( 496 | SOFn, 497 | frame_length, 498 | bits_per_pixel, 499 | height, 500 | width, 501 | component, 502 | ) = struct.unpack(">HHBHHB", image_data[158:168]) 503 | if SOFn != 0xFFC0: 504 | # "Intel(R) JPEG Library" location 505 | ( 506 | SOFn, 507 | frame_length, 508 | bits_per_pixel, 509 | height, 510 | width, 511 | component, 512 | ) = struct.unpack(">HHBHHB", image_data[0x272:0x27C]) 513 | if SOFn != 0xFFC0: 514 | # neither works, try brute-force 515 | import imagesize 516 | from PIL import Image as pilimage 517 | 518 | with open(".tmp.jpg", "wb") as f: 519 | f.write(image_data) 520 | (width, height) = imagesize.get(".tmp.jpg") 521 | pim = pilimage.open(".tmp.jpg") 522 | if pim.mode == "L": 523 | component = 1 524 | os.remove(".tmp.jpg") 525 | if image_type_enum == 1: 526 | # non-inverted JPEG Images 527 | height = -height 528 | if component == 1: 529 | colorspace = Colorspace.L 530 | image_item = ( 531 | colorspace, 532 | (300, 300), 533 | ImageFormat.JPEG, 534 | image_data, 535 | width, 536 | height, 537 | [], 538 | False, 539 | 8, 540 | 0, 541 | ) 542 | else: 543 | raise SystemExit("Unknown Image Type %d" % (image_type_enum)) 544 | image_list.append(image_item) 545 | if len(image_list) == 0: 546 | raise SystemExit("File is pure-text HN; cannot convert to pdf") 547 | pdf_data = convert_ImageList(image_list) 548 | with open("pdf_toc.pdf", "wb") as f: 549 | f.write(pdf_data) 550 | # Add Outlines 551 | add_outlines(self.get_toc(), "pdf_toc.pdf", dest) 552 | os.remove("pdf_toc.pdf") 553 | 554 | def _text_extract_hn(self): 555 | if self._TOC_NUMBER_OFFSET > 0: 556 | self.get_toc(verbose=True) 557 | caj = open(self.filename, "rb") 558 | 559 | for i in range(self.page_num): 560 | caj.seek(self._TOC_END_OFFSET + i * 20) 561 | [ 562 | page_data_offset, 563 | size_of_text_section, 564 | images_per_page, 565 | page_no, 566 | unk2, 567 | next_page_data_offset, 568 | ] = struct.unpack("iihhii", caj.read(20)) 569 | caj.seek(page_data_offset) 570 | text_header_read32 = caj.read(32) 571 | if (text_header_read32[8:20] == b"COMPRESSTEXT") or ( 572 | text_header_read32[0:12] == b"COMPRESSTEXT" 573 | ): 574 | coff = 8 575 | if text_header_read32[0:12] == b"COMPRESSTEXT": 576 | coff = 0 577 | [expanded_text_size] = struct.unpack( 578 | "i", text_header_read32[12 + coff : 16 + coff] 579 | ) 580 | import zlib 581 | 582 | caj.seek(page_data_offset + 16 + coff) 583 | data = caj.read(size_of_text_section - 16 - coff) 584 | output = zlib.decompress(data, bufsize=expanded_text_size) 585 | if len(output) != expanded_text_size: 586 | raise SystemExit("Unexpected:", len(output), expanded_text_size) 587 | else: 588 | caj.seek(page_data_offset) 589 | output = caj.read(size_of_text_section) 590 | from .HNParsePage import HNParsePage 591 | 592 | page_style = next_page_data_offset > page_data_offset 593 | page_data = HNParsePage(output, page_style) 594 | print("Text on Page %d:" % (i + 1)) 595 | print(page_data.texts) 596 | # print("Figures:\n", page_data.figures) 597 | 598 | def _parse_hn(self): 599 | if self._TOC_NUMBER_OFFSET > 0: 600 | self.get_toc(verbose=True) 601 | caj = open(self.filename, "rb") 602 | 603 | for i in range(self.page_num): 604 | caj.seek(self._TOC_END_OFFSET + i * 20) 605 | print( 606 | "Reading Page Info struct #%d at offset 0x%04X" 607 | % (i + 1, self._TOC_END_OFFSET + i * 20) 608 | ) 609 | [ 610 | page_data_offset, 611 | size_of_text_section, 612 | images_per_page, 613 | page_no, 614 | unk2, 615 | next_page_data_offset, 616 | ] = struct.unpack("iihhii", caj.read(20)) 617 | print( 618 | "unknown page struct members = (%d %d)" % (unk2, next_page_data_offset) 619 | ) 620 | # All 71: 1,0,0 621 | print("Page Number %d Data offset = 0x%04X" % (page_no, page_data_offset)) 622 | caj.seek(page_data_offset) 623 | text_header_read32 = caj.read(32) 624 | print("Page Text Header dump:\n", self.dump(text_header_read32), sep="") 625 | # The first 8 bytes are always: 03 80 XX 16 03 80 XX XX, 626 | # the last one 20 or 21, but the first two can be any. 627 | # 48/71 has: 03 80 E0 16 03 80 F7 20, the rest uniq 628 | if (text_header_read32[8:20] == b"COMPRESSTEXT") or ( 629 | text_header_read32[0:12] == b"COMPRESSTEXT" 630 | ): 631 | coff = 8 632 | if text_header_read32[0:12] == b"COMPRESSTEXT": 633 | coff = 0 634 | # expanded_text_size seems to be always about 2-3 times size_of_text_section, so this is a guess. 635 | [expanded_text_size] = struct.unpack( 636 | "i", text_header_read32[12 + coff : 16 + coff] 637 | ) 638 | import zlib 639 | 640 | caj.seek(page_data_offset + 16 + coff) 641 | data = caj.read(size_of_text_section - 16 - coff) 642 | output = zlib.decompress(data, bufsize=expanded_text_size) 643 | if len(output) != expanded_text_size: 644 | print("Unexpected:", len(output), expanded_text_size) 645 | print( 646 | "Page Text Header COMPRESSTEXT:\n", 647 | self.dump(output, GB=True), 648 | sep="", 649 | ) 650 | for x in range(len(output) >> 4): 651 | try: 652 | print( 653 | bytes([output[(x << 4) + 7], output[(x << 4) + 6]]).decode( 654 | "gbk" 655 | ), 656 | end="", 657 | ) 658 | except UnicodeDecodeError: 659 | print(self.dump(output[x << 4 : (x + 1) << 4])) 660 | print() 661 | else: 662 | caj.seek(page_data_offset) 663 | output = caj.read(size_of_text_section) 664 | print( 665 | "Page Text Header non-COMPRESSTEXT:\n", 666 | self.dump(output, GB=True), 667 | sep="", 668 | ) 669 | from .HNParsePage import HNParsePage 670 | 671 | page_style = next_page_data_offset > page_data_offset 672 | page_data = HNParsePage(output, page_style) 673 | print("Text:\n", page_data.texts) 674 | print("Figures:\n", page_data.figures) 675 | current_offset = page_data_offset + size_of_text_section 676 | for j in range(images_per_page): 677 | caj.seek(current_offset) 678 | read32 = caj.read(32) 679 | [ 680 | image_type_enum, 681 | offset_to_image_data, 682 | size_of_image_data, 683 | ] = struct.unpack("iii", read32[0:12]) 684 | if image_type[image_type_enum] != "JPEG": 685 | read32 += caj.read(64) 686 | print( 687 | "size of image data = %d (%s)" 688 | % (size_of_image_data, image_type[image_type_enum]) 689 | ) 690 | if offset_to_image_data != current_offset + 12: 691 | raise SystemExit("unusual image offset") 692 | print("Page Image Header dump:\n", self.dump(read32), sep="") 693 | print( 694 | "Expected End of Page #%d: 0x%08X" 695 | % (i + 1, current_offset + size_of_image_data + 12) 696 | ) 697 | caj.seek(offset_to_image_data) 698 | image_data = caj.read(size_of_image_data) 699 | current_offset = offset_to_image_data + size_of_image_data 700 | image_name = "image_dump_%04d" % (i + 1) 701 | if j > 0: 702 | image_name = "image_dump_%04d_%04d" % (i + 1, j) 703 | with open(image_name + ".dat", "wb") as f: 704 | f.write(image_data) 705 | if image_type[image_type_enum] == "JBIG": 706 | try: 707 | from .dep.jbigdec import SaveJbigAsBmp 708 | 709 | SaveJbigAsBmp( 710 | image_data, 711 | size_of_image_data, 712 | (image_name + ".bmp").encode("ascii"), 713 | ) 714 | except ImportError: 715 | pass 716 | elif image_type[image_type_enum] == "JBIG2": 717 | try: 718 | from .dep.jbigdec import SaveJbig2AsBmp 719 | 720 | SaveJbig2AsBmp( 721 | image_data, 722 | size_of_image_data, 723 | (image_name + ".bmp").encode("ascii"), 724 | ) 725 | except ImportError: 726 | pass 727 | elif image_type[image_type_enum] == "JPEG": 728 | with open(image_name + ".jpg", "wb") as f: 729 | f.write(image_data) 730 | print("end 0x%08x" % self._PAGEDATA_OFFSET) 731 | 732 | def dump(self, src, GB=False): 733 | N = 0 734 | result = [] 735 | while src: 736 | s, src = src[:16], src[16:] 737 | hexa = " ".join(["%02X" % x for x in s]) 738 | gb = "" 739 | if GB: 740 | gb += " " 741 | for x in range(len(s) >> 1): 742 | try: 743 | if s[(x << 1) + 1] < 128 and s[(x << 1) + 0] < 128: 744 | gb += ".." 745 | else: 746 | gb += bytes([s[(x << 1) + 1], s[(x << 1) + 0]]).decode( 747 | "gbk" 748 | ) 749 | except UnicodeDecodeError: 750 | gb += ".." 751 | s = "".join(printables[x] for x in s) 752 | result += "%04X %-*s %s%s\n" % (N, 16 * 3, hexa, s, gb) 753 | N += 16 754 | return "".join(result) 755 | 756 | def _convert_pdf(self, dest): 757 | copy(self.filename, dest) 758 | 759 | def _convert_kdh(self, dest): 760 | # Read KDH file. 761 | fp = open(self.filename, "rb") 762 | origin = fp.read() 763 | fp.close() 764 | 765 | # Decrypt. 766 | origin = origin[254:] 767 | output = [] 768 | keycursor = 0 769 | for origin_byte in origin: 770 | output.append(origin_byte ^ KDH_PASSPHRASE[keycursor]) 771 | keycursor += 1 772 | if keycursor >= len(KDH_PASSPHRASE): 773 | keycursor = 0 774 | output = bytes(output) 775 | 776 | # Remove useless tail data. 777 | eofpos = output.rfind(b"%%EOF") 778 | if eofpos < 0: 779 | raise Exception("%%EOF mark can't be found.") 780 | output = output[: eofpos + 5] 781 | 782 | # Write output file. 783 | fp = open(dest + ".tmp", "wb") 784 | fp.write(output) 785 | fp.close() 786 | 787 | # Use mutool to repair xref 788 | try: 789 | check_output(["mutool", "clean", dest + ".tmp", dest], stderr=STDOUT) 790 | except CalledProcessError as e: 791 | print(e.output.decode("utf-8")) 792 | raise SystemExit( 793 | "Command mutool returned non-zero exit status " + str(e.returncode) 794 | ) 795 | 796 | os.remove(dest + ".tmp") 797 | --------------------------------------------------------------------------------