├── src ├── .gitignore ├── Makefile.am ├── tounicode.hh ├── regex_dispatcher_m.hh ├── pdf-handler.hh ├── build_table.py ├── main.cc ├── pdf-handler.cc ├── tounicode_table.cc ├── tounicode.cc └── cmdlineparse.hh ├── test ├── bfchar-expected.txt ├── bfchar-garbled.txt ├── bfrange-expected.txt ├── bfrange-garbled.txt ├── bfrange2-expected.txt ├── bfrange2-garbled.txt ├── bfchar.tex ├── test-pdftotext.sh ├── bfchar-qdfcontents.txt ├── bfrange-qdfcontents.txt ├── test-qdfcontents.sh ├── bfrange2-qdfcontents.txt ├── SourceHanSerif-Regular-UTF16 ├── build_bfrange2.py ├── build_bfrange.py └── Makefile.am ├── autogen.sh ├── Makefile.am ├── .gitignore ├── .dir-locals.el ├── LICENSE ├── Readme.md ├── configure.ac ├── Readme.ja.md └── m4 └── ax_cxx_compile_stdcxx.m4 /src/.gitignore: -------------------------------------------------------------------------------- 1 | aj17-kanji.txt 2 | -------------------------------------------------------------------------------- /test/bfchar-expected.txt: -------------------------------------------------------------------------------- 1 | 初見はハ長調で高音の白玉があった 2 | -------------------------------------------------------------------------------- /test/bfchar-garbled.txt: -------------------------------------------------------------------------------- 1 | 初⾒はハ⻑調で⾼⾳の⽩⽟があった 2 | -------------------------------------------------------------------------------- /test/bfrange-expected.txt: -------------------------------------------------------------------------------- 1 | 初見はハ長調で高音の白玉があった 2 | -------------------------------------------------------------------------------- /test/bfrange-garbled.txt: -------------------------------------------------------------------------------- 1 | 初⾒はハ⻑調で⾼⾳の⽩⽟があった 2 | -------------------------------------------------------------------------------- /test/bfrange2-expected.txt: -------------------------------------------------------------------------------- 1 | 初見はハ長調で高音の白玉があった 2 | -------------------------------------------------------------------------------- /test/bfrange2-garbled.txt: -------------------------------------------------------------------------------- 1 | 初⾒はハ⻑調で⾼⾳の⽩⽟があった 2 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | autoreconf -v -i 4 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src test 2 | 3 | dist_doc_DATA = Readme.md Readme.ja.md LICENSE 4 | EXTRA_DIST = autogen.sh 5 | 6 | ACLOCAL_AMFLAGS = -I m4 7 | -------------------------------------------------------------------------------- /test/bfchar.tex: -------------------------------------------------------------------------------- 1 | % plain upTeX 2 | \special{dvipdfmx:config z 0} 3 | \special{pdf:mapline uprml-h unicode !SourceHanSerif-Regular.otf} 4 | \nopagenumbers 5 | 初見はハ長調で高音の白玉があった 6 | \bye 7 | -------------------------------------------------------------------------------- /test/test-pdftotext.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TESTFILE="$1" 4 | BASENAME=`basename ${TESTFILE} | sed -e "s/-testtext\.pdftotext$//"` 5 | EXPECTED=${BASENAME}-expected.txt 6 | 7 | if [ ! -e "${EXPECTED}" ]; then 8 | EXPECTED=${srcdir}/${BASENAME}-expected.txt 9 | fi 10 | 11 | ${DIFF} -ubwBE ${EXPECTED} ${TESTFILE} 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .\#* 3 | \#* 4 | \#.* 5 | *.o 6 | *.a 7 | *.exe 8 | pdf-fix-tuc 9 | build/ 10 | .deps/ 11 | Makefile 12 | Makefile.in 13 | aclocal.m4 14 | autom4te.cache/ 15 | config.h 16 | config.h.in 17 | config.log 18 | config.status 19 | configure 20 | depcomp 21 | install-sh 22 | missing 23 | stamp-h1 24 | test-driver 25 | -------------------------------------------------------------------------------- /test/bfchar-qdfcontents.txt: -------------------------------------------------------------------------------- 1 | 16 beginbfchar 2 | <05B5> <3042> 3 | <05BF> <304C> 4 | <05D2> <305F> 5 | <05D6> <3063> 6 | <05DA> <3067> 7 | <05E1> <306E> 8 | <05E2> <306F> 9 | <0640> <30CF> 10 | <2BB4> <521D> 11 | <691D> <7389> 12 | <6EAE> <767D> 13 | <9536> <898B> 14 | <97AA> <8ABF> 15 | <9577> 16 | <97F3> 17 | <9AD8> 18 | endbfchar 19 | -------------------------------------------------------------------------------- /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ;;; Directory Local Variables 2 | ;;; See Info node `(emacs) Directory Variables' for more information. 3 | 4 | ((c++-mode . 5 | ((c-file-style . "gnu") 6 | (indent-tabs-mode . nil))) 7 | (autoconf-mode . 8 | ((indent-tabs-mode . nil))) 9 | (python-mode . 10 | ((indent-tabs-mode . nil))) 11 | (shellscript-mode . 12 | ((indent-tabs-mode . nil)))) 13 | -------------------------------------------------------------------------------- /test/bfrange-qdfcontents.txt: -------------------------------------------------------------------------------- 1 | 16 beginbfrange 2 | <05B5> <05B5> <3042> 3 | <05BF> <05BF> <304C> 4 | <05D2> <05D2> <305F> 5 | <05D6> <05D6> <3063> 6 | <05DA> <05DA> <3067> 7 | <05E1> <05E1> <306E> 8 | <05E2> <05E2> <306F> 9 | <0640> <0640> <30CF> 10 | <2BB4> <2BB4> <521D> 11 | <691D> <691D> <7389> 12 | <6EAE> <6EAE> <767D> 13 | <9536> <9536> <898B> 14 | <97AA> <97AA> <8ABF> 15 | <9577> 16 | <97F3> 17 | <9AD8> 18 | endbfrange 19 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = pdf-fix-tuc 2 | pdf_fix_tuc_SOURCES = main.cc \ 3 | pdf-handler.cc pdf-handler.hh \ 4 | tounicode.cc tounicode.hh tounicode_table.cc \ 5 | cmdlineparse.hh regex_dispatcher_m.hh 6 | pdf_fix_tuc_LDADD = $(LIBQPDF_LIBS) 7 | 8 | EXTRA_DIST = build_table.py 9 | 10 | if ENABLE_BUILD_TABLE 11 | tounicode_table.cc: aj17-kanji.txt 12 | $(srcdir)/build_table.py $< > $@ 13 | endif 14 | 15 | aj17-kanji.txt: 16 | $(WGET) https://github.com/adobe-type-tools/Adobe-Japan1/raw/master/aj17-kanji.txt 17 | -------------------------------------------------------------------------------- /test/test-qdfcontents.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TESTFILE="$1" 4 | BASENAME=`basename ${TESTFILE} | sed -e "s/-testqdf\.qdfcontents$//"` 5 | EXPECTED=${BASENAME}-qdfcontents.txt 6 | 7 | if [ ! -e "${EXPECTED}" ]; then 8 | EXPECTED=${srcdir}/${BASENAME}-qdfcontents.txt 9 | fi 10 | 11 | NOTFOUND=no 12 | 13 | function check () { 14 | if "${GREP}" -q "$1" "${TESTFILE}"; then 15 | echo \"$1\" is found 16 | else 17 | echo \"$1\" is not found 18 | NOTFOUND=yes 19 | fi 20 | } 21 | 22 | cat "${EXPECTED}" | while read line; do 23 | check "${line}" 24 | done 25 | 26 | if [ "x${NOTFOUND}" = "xyes" ]; then 27 | exit 1 28 | fi 29 | 30 | exit 0 31 | -------------------------------------------------------------------------------- /test/bfrange2-qdfcontents.txt: -------------------------------------------------------------------------------- 1 | 29 beginbfrange 2 | <0001> <0002> <2E81> 3 | <0003> <0003> <4E5A> 4 | <0004> <0004> <6535> 5 | <0005> <0005> <2E9A> 6 | <0006> <0006> <65E1> 7 | <0007> <0007> <4E80> 8 | <0008> <0009> <2EF3> 9 | <000A> <000A> <2EFF> 10 | <000B> <000B> <4E00> 11 | <000C> <000C> <4E28> 12 | <000D> <000D> <9F9C> 13 | <000E> <000E> <9FA0> 14 | <000F> <000F> <2FD6> 15 | <05B5> <05B5> <3042> 16 | <05BF> <05BF> <304C> 17 | <05D2> <05D2> <305F> 18 | <05D6> <05D6> <3063> 19 | <05DA> <05DA> <3067> 20 | <05E1> <05E1> <306E> 21 | <05E2> <05E2> <306F> 22 | <0640> <0640> <30CF> 23 | <2BB4> <2BB4> <521D> 24 | <691D> <691D> <7389> 25 | <6EAE> <6EAE> <767D> 26 | <9536> <9536> <898B> 27 | <97AA> <97AA> <8ABF> 28 | <9577> 29 | <97F3> 30 | <9AD8> 31 | endbfrange 32 | -------------------------------------------------------------------------------- /test/SourceHanSerif-Regular-UTF16: -------------------------------------------------------------------------------- 1 | %!PS-Adobe-3.0 Resource-CMap 2 | % 3 | % *** Caution *** 4 | % For test only. 5 | % This file contains garbled Unicode codepoints. 6 | % *** Caution *** 7 | % 8 | /CIDInit /ProcSet findresource begin 9 | 12 dict begin 10 | begincmap 11 | /CMapName /SourceHanSerif-Regular-UTF16 def 12 | /CMapType 2 def 13 | /CIDSystemInfo << 14 | /Registry (Adobe) 15 | /Ordering (UCS) 16 | /Supplement 0 17 | >> def 18 | 1 begincodespacerange 19 | <0000> 20 | endcodespacerange 21 | 16 beginbfchar 22 | <05B5> <3042> 23 | <05BF> <304C> 24 | <05D2> <305F> 25 | <05D6> <3063> 26 | <05DA> <3067> 27 | <05E1> <306E> 28 | <05E2> <306F> 29 | <0640> <30CF> 30 | <2BB4> <521D> 31 | % <691D> <7389> 32 | <691D> <2F5F> 33 | % <6EAE> <767D> 34 | <6EAE> <2F69> 35 | % <9536> <898B> 36 | <9536> <2F92> 37 | <97AA> <8ABF> 38 | % <9577> 39 | <2ED1> 40 | % <97F3> 41 | <2FB3> 42 | % <9AD8> 43 | <2FBC> 44 | endbfchar 45 | endcmap 46 | CMapName currentdict /CMap defineresource pop 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (C) 2021 Masamichi Hosoda. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # A tool to fix ToUnicod CMap in PDF to prevent extracted text from being garbled 3 | 4 | [ English / [日本語 (Japanese)](Readme.ja.md) ] 5 | 6 | When copying and pasting text from a PDF file, depending on the PDF, kanji characters such as "見" and "高" are often garbled into similar but different characters (e.g. special characters such as Kangxi Radical and CJK Radicals Supplement). 7 | This tool fixes such PDF that raises the garbled text extraction and generates a PDF that does not raise the garbled. 8 | 9 | ## Caution 10 | 11 | It is not possible to reverse the conversion from the PDF generated by this tool to the original PDF (it is irreversible). 12 | It is also possible that the generated PDF may be corrupted without you noticing it. 13 | Be sure to keep a backup of the original PDF. 14 | 15 | ## Usage 16 | 17 | ``` 18 | $ pdf-fix-tuc 19 | Fix ToUnicode CMap in PDF 0.0.0 20 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved. 21 | License: BSD-2-Clause 22 | 23 | https://github.com/trueroad/pdf-fix-tuc 24 | 25 | Usage: pdf-fix-tuc [options] [INPUT.pdf OUTPUT.pdf] ... 26 | 27 | -h, --help 28 | Print help and exit 29 | -V, --version 30 | Print version and exit 31 | --verbose 32 | Verbose 33 | 34 | Output PDF settings (QPDF): 35 | --linearize 36 | Output linearized (web-optimized) PDF 37 | --object-streams=[preserve|disable|generate] (default=preserve) 38 | Settings for object streams 39 | --newline-before-endstream 40 | Output newline before endstream 41 | --qdf 42 | Output QDF 43 | 44 | $ 45 | ``` 46 | 47 | ## Install 48 | 49 | ### Required 50 | 51 | * C++11 compiler (g++ 4.9+ etc.) 52 | * libqpdf 53 | * pkg-config etc. 54 | * Autoconf 2.69+ 55 | * Automake 56 | 57 | When you would like to use packages for preparing the required library, the following might be convenient. 58 | 59 | * Debian / Ubuntu 60 | + libqpdf-dev 61 | * Fedora 62 | + qpdf-devel 63 | * Cygwin 64 | + libqpdf-devel 65 | 66 | ### Build & install 67 | 68 | ``` 69 | $ git clone https://github.com/trueroad/pdf-fix-tuc.git 70 | $ cd pdf-fix-tuc 71 | $ ./autogen.sh 72 | $ mkdir build 73 | $ cd build 74 | $ ../configure 75 | $ make 76 | $ make install 77 | ``` 78 | 79 | ## License 80 | 81 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved. 82 | 83 | License: BSD-2-Clause 84 | 85 | See [LICENSE](./LICENSE). 86 | -------------------------------------------------------------------------------- /test/build_bfrange2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # Fix ToUnicode CMap in PDF 5 | # https://github.com/trueroad/pdf-fix-tuc 6 | # 7 | # build_bfrange2.py: 8 | # Build bfrange2 PDF. 9 | # 10 | # Copyright (C) 2021 Masamichi Hosoda. 11 | # All rights reserved. 12 | # 13 | # Redistribution and use in source and binary forms, with or without 14 | # modification, are permitted provided that the following conditions 15 | # are met: 16 | # 17 | # * Redistributions of source code must retain the above copyright notice, 18 | # this list of conditions and the following disclaimer. 19 | # 20 | # * Redistributions in binary form must reproduce the above copyright notice, 21 | # this list of conditions and the following disclaimer in the documentation 22 | # and/or other materials provided with the distribution. 23 | # 24 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 25 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 | # ARE DISCLAIMED. 28 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 29 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 | # SUCH DAMAGE. 36 | # 37 | 38 | import re 39 | import sys 40 | 41 | def main (): 42 | re_begin = re.compile (r"(\d+)\s+beginbfrange\r?\n?") 43 | 44 | for line in sys.stdin.buffer: 45 | try: 46 | line = line.decode ("utf-8") 47 | except UnicodeDecodeError: 48 | sys.stdout.buffer.write (line) 49 | continue 50 | 51 | m = re_begin.match (line) 52 | if m: 53 | print ("{} beginbfrange".format (int (m.group (1)) + 5)) 54 | print ("""\ 55 | <0001> <0003> <2E81> 56 | <0004> <0006> <2E99> 57 | <0007> <0009> <2EF2> 58 | <000A> <000C> <2EFF> 59 | <000D> <000F> <2FD4>\ 60 | """) 61 | continue 62 | 63 | print (line, end = "", flush = True) 64 | 65 | if __name__ == "__main__": 66 | main () 67 | -------------------------------------------------------------------------------- /test/build_bfrange.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # Fix ToUnicode CMap in PDF 5 | # https://github.com/trueroad/pdf-fix-tuc 6 | # 7 | # build_bfrange.py: 8 | # Build bfrange PDF. 9 | # 10 | # Copyright (C) 2021 Masamichi Hosoda. 11 | # All rights reserved. 12 | # 13 | # Redistribution and use in source and binary forms, with or without 14 | # modification, are permitted provided that the following conditions 15 | # are met: 16 | # 17 | # * Redistributions of source code must retain the above copyright notice, 18 | # this list of conditions and the following disclaimer. 19 | # 20 | # * Redistributions in binary form must reproduce the above copyright notice, 21 | # this list of conditions and the following disclaimer in the documentation 22 | # and/or other materials provided with the distribution. 23 | # 24 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 25 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 | # ARE DISCLAIMED. 28 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 29 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 | # SUCH DAMAGE. 36 | # 37 | 38 | import re 39 | import sys 40 | 41 | def main (): 42 | b_bfchar = False 43 | re_begin = re.compile (r"(\d+)\s+beginbfchar\r?\n?") 44 | re_bfchar = re.compile (r"<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\r?\n?") 45 | re_end = re.compile (r"endbfchar\r?\n?") 46 | 47 | for line in sys.stdin.buffer: 48 | try: 49 | line = line.decode ("utf-8") 50 | except UnicodeDecodeError: 51 | sys.stdout.buffer.write (line) 52 | continue 53 | 54 | if b_bfchar: 55 | if re_end.match (line): 56 | line = "endbfrange\n" 57 | b_bfchar = False 58 | else: 59 | m = re_bfchar.match (line) 60 | if m: 61 | line = "<{}> <{}> <{}>\n".\ 62 | format (m.group (1), m.group (1), m.group (2)) 63 | else: 64 | m = re_begin.match (line) 65 | if m: 66 | line = "{} beginbfrange\n".format (m.group (1)) 67 | b_bfchar = True 68 | 69 | print (line, end = "", flush = True) 70 | 71 | if __name__ == "__main__": 72 | main () 73 | -------------------------------------------------------------------------------- /test/Makefile.am: -------------------------------------------------------------------------------- 1 | TESTS = $(TESTPDFTOTEXT) $(TESTQDFCONTENTS) 2 | 3 | TEST_EXTENSIONS = .pdftotext .qdfcontents 4 | 5 | PDFTOTEXT_LOG_COMPILER = $(srcdir)/test-pdftotext.sh 6 | QDFCONTENTS_LOG_COMPILER = $(srcdir)/test-qdfcontents.sh 7 | 8 | TESTS_ENVIRONMENT = top_builddir=$(top_builddir) \ 9 | srcdir=$(srcdir) \ 10 | DIFF=$(DIFF) \ 11 | PDFTOTEXT=$(PDFTOTEXT) \ 12 | GREP=$(GREP) 13 | 14 | # Source files for test PDFs 15 | TESTTEXS = bfchar.tex 16 | # Test PDFs 17 | TESTPDFS = $(TESTTEXS:.tex=.pdf) bfrange.pdf bfrange2.pdf 18 | # Garbled text 19 | TESTGARBLED = $(TESTPDFS:.pdf=-garbled.txt) 20 | # Expected text 21 | TESTEXPECTED = $(TESTPDFS:.pdf=-expected.txt) 22 | # Expected QDF contents 23 | TESTCONTENTS = $(TESTPDFS:.pdf=-qdfcontents.txt) 24 | 25 | # Tests by pdftotext 26 | TESTPDFTOTEXT = $(TESTPDFS:.pdf=-testtext.pdftotext) 27 | 28 | # Tests by QDF contents 29 | TESTQDFCONTENTS = $(TESTPDFS:.pdf=-testqdf.qdfcontents) 30 | 31 | # Test scripts and test PDFs 32 | EXTRA_DIST = test-pdftotext.sh test-qdfcontents.sh \ 33 | $(TESTTEXS) $(TESTPDFS) \ 34 | $(TESTGARBLED) $(TESTEXPECTED) $(TESTCONTENTS) \ 35 | SourceHanSerif-Regular-UTF16 \ 36 | build_bfrange.py build_bfrange2.py 37 | 38 | # For `make clean' 39 | CLEANFILES = $(TESTTEXS:.tex=.log) \ 40 | $(TESTPDFTOTEXT) \ 41 | $(TESTQDFCONTENTS) \ 42 | $(TESTPDFS:.pdf=-test.pdf) 43 | 44 | if GENERATE_TEST_PDF 45 | # Generate garbled PDFs 46 | # Tarball contains them for convenience. 47 | %.dvi: %.tex 48 | $(UPTEX) -halt-on-error -interaction=nonstopmode -file-line-error $< 49 | 50 | %.pdf: %.dvi %-garbled.txt SourceHanSerif-Regular-UTF16 51 | if [ ! -e SourceHanSerif-Regular-UTF16 ]; then \ 52 | ln -s $(srcdir)/SourceHanSerif-Regular-UTF16 ; \ 53 | fi 54 | $(DVIPDFMX) $< -o $@ 55 | $(PDFTOTEXT) $@ $(@:.pdf=-extracted.txt) 56 | $(DIFF) -ubwBE $(word 2,$^) $(@:.pdf=-extracted.txt) 57 | 58 | # Generate bfrange PDF 59 | # Tarball contains them for convenience. 60 | bfchar.qdf: bfchar.pdf 61 | $(QPDF) --qdf $< $@ 62 | 63 | bfrange.qdf: bfchar.qdf 64 | $(srcdir)/build_bfrange.py < $< > $@ 65 | 66 | bfrange.pdf: bfrange.qdf 67 | $(FIX_QDF) < $< > $@ 68 | 69 | # Generate bfrange2 PDF 70 | # Tarball contains them for convenience. 71 | bfrange2.qdf: bfrange.qdf 72 | $(srcdir)/build_bfrange2.py < $< > $@ 73 | 74 | bfrange2.pdf: bfrange2.qdf 75 | $(FIX_QDF) < $< > $@ 76 | endif 77 | 78 | # Test PDFs 79 | %-test.pdf: %.pdf 80 | ${top_builddir}/src/pdf-fix-tuc $< $@ 81 | 82 | # Test pdftotext 83 | %-testtext.pdftotext: %-test.pdf %-expected.txt 84 | $(PDFTOTEXT) $< $@ 85 | 86 | # Test QDFs 87 | %-test.qdf: %.pdf 88 | ${top_builddir}/src/pdf-fix-tuc --qdf $< $@ 89 | 90 | # Test qdfcontents 91 | %-testqdf.qdfcontents: %-test.qdf %-qdfcontents.txt 92 | ln -s $< $@ 93 | 94 | .PRECIOUS: %.dvi %.pdf %-test.pdf %-testtext.pdftotext \ 95 | %-test.qdf %-testqdf.qdfcontents 96 | -------------------------------------------------------------------------------- /src/tounicode.hh: -------------------------------------------------------------------------------- 1 | // 2 | // Fix ToUnicode CMap in PDF 3 | // https://github.com/trueroad/pdf-fix-tuc 4 | // 5 | // tounicode.hh: ToUnicode CMap handler class 6 | // 7 | // Copyright (C) 2021 Masamichi Hosoda. 8 | // All rights reserved. 9 | // 10 | // Redistribution and use in source and binary forms, with or without 11 | // modification, are permitted provided that the following conditions 12 | // are met: 13 | // 14 | // * Redistributions of source code must retain the above copyright notice, 15 | // this list of conditions and the following disclaimer. 16 | // 17 | // * Redistributions in binary form must reproduce the above copyright notice, 18 | // this list of conditions and the following disclaimer in the documentation 19 | // and/or other materials provided with the distribution. 20 | // 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | // ARE DISCLAIMED. 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 | // SUCH DAMAGE. 33 | // 34 | 35 | #ifndef INCLUDE_GUARD_TOUNICODE_HH 36 | #define INCLUDE_GUARD_TOUNICODE_HH 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #include "regex_dispatcher_m.hh" 44 | 45 | class tounicode: public regex_dispatcher::member_table 46 | { 47 | public: 48 | tounicode (); 49 | 50 | std::string get (void); 51 | 52 | private: 53 | bool bfchar_hex (const std::smatch &); 54 | void bfrange_output (const int, const int, const int, 55 | const std::string &, const std::string &, 56 | const std::string &); 57 | bool bfrange_hex (const std::smatch &); 58 | bool beginbfchar (const std::smatch &); 59 | bool endbfchar (const std::smatch &); 60 | bool beginbfrange (const std::smatch &); 61 | bool endbfrange (const std::smatch &); 62 | bool other (const std::smatch &); 63 | 64 | std::stringstream ss_; 65 | std::stringstream ss_bfrange_; 66 | int bfranges_; 67 | std::string beginbfrange_space_; 68 | std::string beginbfrange_cr_; 69 | bool is_bfchar_ = false; 70 | bool is_bfrange_ = false; 71 | 72 | const static std::unordered_map table_; 73 | }; 74 | 75 | #endif // INCLUDE_GUARD_TOUNICODE_HH 76 | -------------------------------------------------------------------------------- /src/regex_dispatcher_m.hh: -------------------------------------------------------------------------------- 1 | // 2 | // Regex dispatcher table for C++11 3 | // https://gist.github.com/trueroad/bdb81622c083ee544698c4bc55cda6a9 4 | // 5 | // regex_dispatcher_m.hh: Regex dispatcher table class by member function 6 | // 7 | // Copyright (C) 2018 Masamichi Hosoda. 8 | // All rights reserved. 9 | // 10 | // Redistribution and use in source and binary forms, with or without 11 | // modification, are permitted provided that the following conditions 12 | // are met: 13 | // 14 | // * Redistributions of source code must retain the above copyright notice, 15 | // this list of conditions and the following disclaimer. 16 | // 17 | // * Redistributions in binary form must reproduce the above copyright notice, 18 | // this list of conditions and the following disclaimer in the documentation 19 | // and/or other materials provided with the distribution. 20 | // 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | // ARE DISCLAIMED. 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 | // SUCH DAMAGE. 33 | // 34 | 35 | #ifndef INCLUDE_GUARD_REGEX_DISPATCHER_M_HH_ 36 | #define INCLUDE_GUARD_REGEX_DISPATCHER_M_HH_ 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | namespace regex_dispatcher 45 | { 46 | template 47 | class member_table 48 | { 49 | using callback_type = bool (Derived::*) (const std::smatch &); 50 | using table_type = 51 | std::vector>; 52 | 53 | public: 54 | member_table (table_type t): 55 | table_ (t) 56 | { 57 | } 58 | 59 | void process_line (const std::string &s) 60 | { 61 | std::smatch sm; 62 | for (const auto &t: table_) 63 | { 64 | if (std::regex_match (s, sm, t.first)) 65 | { 66 | if ((static_cast (this)->*(t.second)) (sm)) 67 | break; 68 | } 69 | } 70 | } 71 | 72 | void process_stream (std::istream &is) 73 | { 74 | std::string line; 75 | while (std::getline (is, line)) 76 | { 77 | process_line (line); 78 | } 79 | } 80 | 81 | private: 82 | table_type table_; 83 | }; 84 | } 85 | 86 | #endif // INCLUDE_GUARD_REGEX_DISPATCHER_M_HH_ 87 | -------------------------------------------------------------------------------- /src/pdf-handler.hh: -------------------------------------------------------------------------------- 1 | // 2 | // Fix ToUnicode CMap in PDF 3 | // https://github.com/trueroad/pdf-fix-tuc 4 | // 5 | // pdf-handler.hh: PDF handler class 6 | // 7 | // Copyright (C) 2021 Masamichi Hosoda. 8 | // All rights reserved. 9 | // 10 | // Redistribution and use in source and binary forms, with or without 11 | // modification, are permitted provided that the following conditions 12 | // are met: 13 | // 14 | // * Redistributions of source code must retain the above copyright notice, 15 | // this list of conditions and the following disclaimer. 16 | // 17 | // * Redistributions in binary form must reproduce the above copyright notice, 18 | // this list of conditions and the following disclaimer in the documentation 19 | // and/or other materials provided with the distribution. 20 | // 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | // ARE DISCLAIMED. 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 | // SUCH DAMAGE. 33 | // 34 | 35 | #ifndef INCLUDE_GUARD_PDF_HANDLER_HH 36 | #define INCLUDE_GUARD_PDF_HANDLER_HH 37 | 38 | #include 39 | 40 | #include 41 | #include 42 | 43 | class pdf_handler 44 | { 45 | public: 46 | pdf_handler () 47 | { 48 | } 49 | void set_verbose (bool v) 50 | { 51 | verbose_ = v; 52 | } 53 | void set_filename_in (const std::string &f) 54 | { 55 | filename_in_ = f; 56 | } 57 | void set_filename_out (const std::string &f) 58 | { 59 | filename_out_ = f; 60 | } 61 | void set_linearize (bool l) 62 | { 63 | linearize_ = l; 64 | } 65 | void set_object_streams (qpdf_object_stream_e os) 66 | { 67 | object_streams_ = os; 68 | } 69 | void set_newline_before_endstream (bool n) 70 | { 71 | newline_before_endstream_ = n; 72 | } 73 | void set_qdf (bool q) 74 | { 75 | qdf_ = q; 76 | } 77 | 78 | void load (); 79 | void process (); 80 | void save (); 81 | 82 | private: 83 | void process_obj (QPDFObjectHandle oh); 84 | void process_font (QPDFObjectHandle oh); 85 | void process_tounicode (QPDFObjectHandle oh); 86 | 87 | std::string filename_in_; 88 | std::string filename_out_; 89 | 90 | QPDF qpdf_; 91 | 92 | bool verbose_ = false; 93 | 94 | bool linearize_ = false; 95 | qpdf_object_stream_e object_streams_ = qpdf_o_preserve; 96 | bool newline_before_endstream_ = false; 97 | bool qdf_ = false; 98 | }; 99 | 100 | #endif // INCLUDE_GUARD_PDF_HANDLER_HH 101 | -------------------------------------------------------------------------------- /src/build_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # Fix ToUnicode CMap in PDF 5 | # https://github.com/trueroad/pdf-fix-tuc 6 | # 7 | # build_table.py: 8 | # Build radical to ideograph table. 9 | # 10 | # Copyright (C) 2021 Masamichi Hosoda. 11 | # All rights reserved. 12 | # 13 | # Redistribution and use in source and binary forms, with or without 14 | # modification, are permitted provided that the following conditions 15 | # are met: 16 | # 17 | # * Redistributions of source code must retain the above copyright notice, 18 | # this list of conditions and the following disclaimer. 19 | # 20 | # * Redistributions in binary form must reproduce the above copyright notice, 21 | # this list of conditions and the following disclaimer in the documentation 22 | # and/or other materials provided with the distribution. 23 | # 24 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 25 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 | # ARE DISCLAIMED. 28 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 29 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 | # SUCH DAMAGE. 36 | # 37 | 38 | import sys 39 | 40 | # See tt_cmap.c in dvipdfmx source. 41 | def is_PUA_or_presentation (unicode): 42 | uni = int (unicode.lstrip ('U+'), base = 16) 43 | 44 | if uni >= 0x2E80 and uni <= 0x2EF3: 45 | return True 46 | if uni >= 0x2F00 and uni <= 0x2FD5: 47 | return True 48 | if uni >= 0xE000 and uni <= 0xF8FF: 49 | return True 50 | if uni >= 0xFB00 and uni <= 0xFB4F: 51 | return True 52 | if uni >= 0xF900 and uni <= 0xFAFF: 53 | return True 54 | if uni >= 0x2F800 and uni <= 0x2FA1F: 55 | return True 56 | if uni >= 0xF0000 and uni <= 0xFFFFD: 57 | return True 58 | if uni >= 0x100000 and uni <= 0x10FFFD: 59 | return True 60 | if uni == 0x00AD: 61 | return True 62 | return False 63 | 64 | def process (file): 65 | with open (file, "r") as f: 66 | for line in f: 67 | if line.startswith ("#"): 68 | continue 69 | items = line.split () 70 | unijis2004 = items[18] 71 | if ':' in unijis2004: 72 | unis = unijis2004.split (':') 73 | lows = [] 74 | highs = [] 75 | for u in unis: 76 | if is_PUA_or_presentation (u): 77 | lows.append (u) 78 | else: 79 | highs.append (u) 80 | if len (lows) > 0 and len (highs) > 0: 81 | for low in lows: 82 | key = int (low.lstrip ('U+'), base = 16) 83 | value = int (highs[0].lstrip ('U+'), base = 16) 84 | if key < 0x10000 and value >= 0x10000: 85 | print (" // { " + \ 86 | "0x{:x}, 0x{:x}".format (key, value) + \ 87 | " },") 88 | else: 89 | print (" { " + \ 90 | "0x{:x}, 0x{:x}".format (key, value) + \ 91 | " },") 92 | 93 | def main (): 94 | input_filename = "aj17-kanji.txt" 95 | 96 | if len (sys.argv) > 1: 97 | input_filename = sys.argv[1] 98 | 99 | print ('''\ 100 | // Generated by build_table.py 101 | // Do not edit this file. 102 | 103 | #include 104 | 105 | #include "tounicode.hh" 106 | 107 | const std::unordered_map tounicode::table_ = 108 | {\ 109 | ''') 110 | 111 | process (input_filename) 112 | 113 | print ('''\ 114 | };\ 115 | ''') 116 | 117 | if __name__ == "__main__": 118 | main () 119 | -------------------------------------------------------------------------------- /src/main.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Fix ToUnicode CMap in PDF 3 | // https://github.com/trueroad/pdf-fix-tuc 4 | // 5 | // Copyright (C) 2021 Masamichi Hosoda. 6 | // All rights reserved. 7 | // 8 | // Redistribution and use in source and binary forms, with or without 9 | // modification, are permitted provided that the following conditions 10 | // are met: 11 | // 12 | // * Redistributions of source code must retain the above copyright notice, 13 | // this list of conditions and the following disclaimer. 14 | // 15 | // * Redistributions in binary form must reproduce the above copyright notice, 16 | // this list of conditions and the following disclaimer in the documentation 17 | // and/or other materials provided with the distribution. 18 | // 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | // ARE DISCLAIMED. 23 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 | // SUCH DAMAGE. 31 | // 32 | 33 | #include "config.h" 34 | 35 | #include 36 | #include 37 | #include 38 | 39 | #include "cmdlineparse.hh" 40 | #include "pdf-handler.hh" 41 | 42 | bool parse_options (int argc, char *argv[], pdf_handler *pph) 43 | { 44 | cmdlineparse::parser cmd; 45 | 46 | cmd.set_usage_unamed_opts ("INPUT.pdf OUTPUT.pdf"); 47 | cmd.add_default (); 48 | 49 | bool verbose; 50 | cmd.add_flag (0, "verbose", &verbose, " Verbose"); 51 | 52 | bool linearize; 53 | std::string object_streams; 54 | bool newline_before_endstream; 55 | bool qdf; 56 | cmd.add_flag (0, "linearize", &linearize, 57 | " Output linearized (web-optimized) PDF", 58 | "Output PDF settings (QPDF)"); 59 | cmd.add_string (0, "object-streams", &object_streams, "preserve", 60 | " Settings for object streams", 61 | "[preserve|disable|generate]", 62 | "Output PDF settings (QPDF)"); 63 | cmd.add_flag (0, "newline-before-endstream", &newline_before_endstream, 64 | " Output newline before endstream", 65 | "Output PDF settings (QPDF)"); 66 | cmd.add_flag (0, "qdf", &qdf, 67 | " Output QDF", 68 | "Output PDF settings (QPDF)"); 69 | 70 | if (!cmd.parse (argc, argv)) 71 | return false; 72 | 73 | auto uargs {cmd.get_unamed_args ()}; 74 | if (uargs.size () != 2) 75 | { 76 | std::cout << cmd.build_help (); 77 | return false; 78 | } 79 | 80 | std::cout << cmd.get_version_string () << std::endl; 81 | 82 | pph->set_verbose (verbose); 83 | 84 | pph->set_linearize (linearize); 85 | if (object_streams == "preserve") 86 | pph->set_object_streams (qpdf_o_preserve); 87 | else if (object_streams == "generate") 88 | pph->set_object_streams (qpdf_o_generate); 89 | else if (object_streams == "disable") 90 | pph->set_object_streams (qpdf_o_disable); 91 | else 92 | { 93 | std::cerr << "unknwon --object-streams mode" << std::endl; 94 | return false; 95 | } 96 | 97 | pph->set_newline_before_endstream (newline_before_endstream); 98 | pph->set_qdf (qdf); 99 | 100 | pph->set_filename_in (uargs[0]); 101 | pph->set_filename_out (uargs[1]); 102 | 103 | return true; 104 | } 105 | 106 | int main (int argc, char *argv[]) 107 | { 108 | try 109 | { 110 | pdf_handler ph; 111 | if (!parse_options (argc, argv, &ph)) 112 | return false; 113 | 114 | ph.load (); 115 | ph.process (); 116 | ph.save (); 117 | 118 | std::cout << "Done." << std::endl; 119 | } 120 | catch (std::exception &e) 121 | { 122 | std::cerr << e.what () << std::endl; 123 | return 1; 124 | } 125 | 126 | return 0; 127 | } 128 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # -*- Autoconf -*- 2 | # Process this file with autoconf to produce a configure script. 3 | 4 | AC_PREREQ([2.69]) 5 | AC_INIT([Fix ToUnicode CMap in PDF], [0.0.0], , 6 | [pdf-fix-tuc], [https://github.com/trueroad/pdf-fix-tuc]) 7 | AM_INIT_AUTOMAKE([foreign]) 8 | AC_CONFIG_SRCDIR([src/main.cc]) 9 | AC_CONFIG_HEADERS([config.h]) 10 | 11 | PACKAGE_COPYRIGHT="Copyright (C) 2021 Masamichi Hosoda. All rights reserved." 12 | PACKAGE_LICENSE="License: BSD-2-Clause" 13 | 14 | AC_SUBST([PACKAGE_COPYRIGHT]) 15 | AC_SUBST([PACKAGE_LICENSE]) 16 | AC_DEFINE_UNQUOTED([PACKAGE_COPYRIGHT], ["$PACKAGE_COPYRIGHT"], 17 | [Define to the copyright of this package.]) 18 | AC_DEFINE_UNQUOTED([PACKAGE_LICENSE], ["$PACKAGE_LICENSE"], 19 | [Define to the license of this package.]) 20 | 21 | # Checks for programs. 22 | AC_PROG_CXX 23 | 24 | AC_PATH_PROG([DIFF], [diff]) 25 | AS_IF([test "x$DIFF" = x], 26 | [AC_MSG_WARN([diff isn't found. `make check' will fail tests.])]) 27 | AC_SUBST([DIFF]) 28 | 29 | AC_PATH_PROG([PDFTOTEXT], [pdftotext]) 30 | AS_IF([test "x$PDFTOTEXT" = x], 31 | [AC_MSG_WARN([pdftotext isn't found. `make check' will fail tests.])]) 32 | AC_SUBST([PDFTOTEXT]) 33 | 34 | AC_PATH_PROG([GREP], [grep]) 35 | AS_IF([test "x$GREP" = x], 36 | [AC_MSG_WARN([grep isn't found. `make check' will fail tests.])]) 37 | AC_SUBST([GREP]) 38 | 39 | AC_MSG_CHECKING([for pre-generated test PDFs]) 40 | AS_IF([test ! -e ${srcdir}/test/bfchar.pdf], 41 | [AC_MSG_RESULT([no]) 42 | AC_PATH_PROG([UPTEX], [uptex]) 43 | AS_IF([test "x$UPTEX" = x], 44 | [AC_MSG_WARN( 45 | [uptex isn't found. Test PDFs cannot be generated.]) 46 | AC_MSG_WARN( 47 | [You cannot use `make check'.])]) 48 | AC_PATH_PROG([DVIPDFMX], [dvipdfmx]) 49 | AS_IF([test "x$DVIPDFMX" = x], 50 | [AC_MSG_WARN( 51 | [dvipdfmx isn't found. Test PDFs cannot be generated.]) 52 | AC_MSG_WARN( 53 | [You cannot use `make check'.])]) 54 | AC_PATH_PROG([QPDF], [qpdf]) 55 | AS_IF([test "x$QPDF" = x], 56 | [AC_MSG_WARN( 57 | [qpdf isn't found. Test PDFs cannot be generated.]) 58 | AC_MSG_WARN( 59 | [You cannot use `make check'.])]) 60 | AC_PATH_PROG([FIX_QDF], [fix-qdf]) 61 | AS_IF([test "x$FIX_QDF" = x], 62 | [AC_MSG_WARN( 63 | [fix-qdf isn't found. Test PDFs cannot be generated.]) 64 | AC_MSG_WARN( 65 | [You cannot use `make check'.])]) 66 | AC_PATH_PROG([PYTHON3], [python3]) 67 | AS_IF([test "x$PYTHON3" = x], 68 | [AC_MSG_WARN( 69 | [python3 isn't found. Test PDFs cannot be generated.]) 70 | AC_MSG_WARN( 71 | [You cannot use `make check'.])])], 72 | [AC_MSG_RESULT([yes])]) 73 | AM_CONDITIONAL([GENERATE_TEST_PDF], 74 | [test ! -e ${srcdir}/test/bfchar.pdf]) 75 | AC_SUBST([UPTEX]) 76 | AC_SUBST([DVIPDFMX]) 77 | AC_SUBST([QPDF]) 78 | AC_SUBST([FIX_QDF]) 79 | 80 | AC_ARG_ENABLE([build-table], 81 | [AS_HELP_STRING([--enable-build-table], 82 | [Enable building table (default is disable)])], 83 | [enable_build_table=yes 84 | AC_MSG_CHECKING([for build-table source aj17-kanji.txt]) 85 | AS_IF([test ! -e ${srcdir}/src/aj17-kanji.txt], 86 | [AC_MSG_RESULT([no]) 87 | AC_PATH_PROG([WGET], [wget]) 88 | AS_IF([test "x$WGET" = x], 89 | [AC_MSG_ERROR( 90 | [wget isn't found. aj17-kanji.txt cannot be retrieved.])], 91 | [AC_MSG_NOTICE( 92 | [wget is found. aj17-kanji.txt will be retrieved.])])], 93 | [AC_MSG_RESULT([yes])]) 94 | AC_PATH_PROG([PYTHON3], [python3]) 95 | AS_IF([test "x$PYTHON3" = x], 96 | [AC_MSG_ERROR([python3 isn't found. You cannot enable build-table.])], 97 | [AC_MSG_NOTICE([python3 is found. The table will build.])])], 98 | [enable_build_table=no]) 99 | AM_CONDITIONAL([ENABLE_BUILD_TABLE], [test "x$enable_build_table" = xyes]) 100 | AC_SUBST([WGET]) 101 | 102 | # Checks for libraries. 103 | PKG_PROG_PKG_CONFIG 104 | PKG_CHECK_MODULES([LIBQPDF], [libqpdf]) 105 | 106 | # Checks for header files. 107 | 108 | # Checks for typedefs, structures, and compiler characteristics. 109 | AX_CXX_COMPILE_STDCXX(11, [noext], [mandatory]) 110 | 111 | # Checks for library functions. 112 | 113 | AC_CONFIG_FILES([Makefile 114 | src/Makefile 115 | test/Makefile]) 116 | AC_OUTPUT 117 | -------------------------------------------------------------------------------- /src/pdf-handler.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Fix ToUnicode CMap in PDF 3 | // https://github.com/trueroad/pdf-fix-tuc 4 | // 5 | // pdf-handler.cc: PDF handler class 6 | // 7 | // Copyright (C) 2021 Masamichi Hosoda. 8 | // All rights reserved. 9 | // 10 | // Redistribution and use in source and binary forms, with or without 11 | // modification, are permitted provided that the following conditions 12 | // are met: 13 | // 14 | // * Redistributions of source code must retain the above copyright notice, 15 | // this list of conditions and the following disclaimer. 16 | // 17 | // * Redistributions in binary form must reproduce the above copyright notice, 18 | // this list of conditions and the following disclaimer in the documentation 19 | // and/or other materials provided with the distribution. 20 | // 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | // ARE DISCLAIMED. 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 | // SUCH DAMAGE. 33 | // 34 | 35 | #include "config.h" 36 | 37 | #include "pdf-handler.hh" 38 | 39 | #include 40 | #include 41 | #include 42 | 43 | #include 44 | #include 45 | #include 46 | 47 | #include "tounicode.hh" 48 | 49 | void pdf_handler::load () 50 | { 51 | if (verbose_) 52 | std::cout << "Loading \"" << filename_in_ << "\"" << std::endl; 53 | 54 | qpdf_.processFile (filename_in_.c_str ()); 55 | } 56 | 57 | void pdf_handler::process () 58 | { 59 | auto objs {qpdf_.getAllObjects ()}; 60 | for (auto o: objs) 61 | process_obj (o); 62 | } 63 | 64 | void pdf_handler::save () 65 | { 66 | if (verbose_) 67 | std::cout << "Writing \"" << filename_out_ << "\"" << std::endl; 68 | 69 | QPDFWriter w (qpdf_, filename_out_.c_str ()); 70 | w.setLinearization (linearize_); 71 | w.setObjectStreamMode (object_streams_); 72 | w.setNewlineBeforeEndstream (newline_before_endstream_); 73 | w.setQDFMode (qdf_); 74 | w.write (); 75 | } 76 | 77 | void pdf_handler::process_obj (QPDFObjectHandle oh) 78 | { 79 | if (verbose_) 80 | { 81 | std::cout << "Object: ID " << oh.getObjectID () 82 | << ", Generation " << oh.getGeneration () 83 | << ", Type " << oh.getTypeName () 84 | << std::endl; 85 | } 86 | 87 | if (oh.isDictionary () && oh.hasKey ("/Type")) 88 | { 89 | if (verbose_) 90 | std::cout << " /Type "; 91 | 92 | auto type {oh.getKey ("/Type")}; 93 | if (type.isName ()) 94 | { 95 | if (verbose_) 96 | std::cout << type.getName () << std::endl; 97 | if (std::string ("/Font") == type.getName ()) 98 | process_font (oh); 99 | } 100 | else 101 | { 102 | if(verbose_) 103 | std::cout << "does not contain a name" << std::endl; 104 | } 105 | } 106 | } 107 | 108 | void pdf_handler::process_font (QPDFObjectHandle oh) 109 | { 110 | if (oh.hasKey ("/ToUnicode")) 111 | { 112 | if (verbose_) 113 | std::cout << " /ToUnicode "; 114 | 115 | auto tounicode {oh.getKey ("/ToUnicode")}; 116 | if (tounicode.isStream ()) 117 | { 118 | if (verbose_) 119 | std::cout << "stream" << std::endl; 120 | process_tounicode (tounicode); 121 | } 122 | else 123 | { 124 | if(verbose_) 125 | std::cout << "does not contain a stream" << std::endl; 126 | } 127 | } 128 | } 129 | 130 | void pdf_handler::process_tounicode (QPDFObjectHandle oh) 131 | { 132 | std::stringstream ss_in; 133 | { 134 | auto pb {oh.getStreamData ()}; 135 | std::string buff (reinterpret_cast (pb->getBuffer ()), 136 | pb->getSize ()); 137 | ss_in.str (buff + "\n"); 138 | } 139 | 140 | tounicode tu; 141 | tu.process_stream (ss_in); 142 | 143 | auto in {ss_in.str ()}; 144 | in.pop_back (); 145 | auto out {tu.get ()}; 146 | out.pop_back (); 147 | 148 | if (in != out) 149 | { 150 | if (verbose_) 151 | std::cout << " replacing..." << std::endl; 152 | 153 | oh.replaceStreamData (out, 154 | QPDFObjectHandle::newNull(), 155 | QPDFObjectHandle::newNull()); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/tounicode_table.cc: -------------------------------------------------------------------------------- 1 | // Generated by build_table.py 2 | // Do not edit this file. 3 | 4 | #include 5 | 6 | #include "tounicode.hh" 7 | 8 | const std::unordered_map tounicode::table_ = 9 | { 10 | { 0x2f90, 0x8863 }, 11 | { 0x2f00, 0x4e00 }, 12 | { 0x2f7b, 0x7fbd }, 13 | { 0x2fac, 0x96e8 }, 14 | { 0x2f85, 0x81fc }, 15 | { 0x2f60, 0x74dc }, 16 | { 0x2ee9, 0x9ec4 }, 17 | { 0x2f04, 0x4e59 }, 18 | { 0x2fb3, 0x97f3 }, 19 | { 0x2f55, 0x706b }, 20 | { 0x2f72, 0x79be }, 21 | { 0x2f99, 0x8c9d }, 22 | { 0x2f93, 0x89d2 }, 23 | { 0x2fb0, 0x9769 }, 24 | { 0x2f61, 0x74e6 }, 25 | { 0x2f32, 0x5e72 }, 26 | { 0x2f62, 0x7518 }, 27 | { 0x2f78, 0x7f36 }, 28 | { 0x2ee4, 0x9b3c }, 29 | { 0x2fc1, 0x9b3c }, 30 | { 0x2ef2, 0x4e80 }, 31 | { 0x2fc9, 0x9ecd }, 32 | { 0x2f38, 0x5f13 }, 33 | { 0x2f5c, 0x725b }, 34 | { 0x2fc2, 0x9b5a }, 35 | { 0x2f5f, 0x7389 }, 36 | { 0x2f31, 0x5dfe }, 37 | { 0x2f44, 0x65a4 }, 38 | { 0x2fa6, 0x91d1 }, 39 | { 0x2f4b, 0x6b20 }, 40 | { 0x2f73, 0x7a74 }, 41 | { 0x2f8e, 0x8840 }, 42 | { 0x2f49, 0x6708 }, 43 | { 0x2f5d, 0x72ac }, 44 | { 0x2f92, 0x898b }, 45 | { 0x2f5e, 0x7384 }, 46 | { 0x2f94, 0x8a00 }, 47 | { 0x2f30, 0x5df1 }, 48 | { 0x2f3e, 0x6238 }, 49 | { 0x2ec1, 0x864e }, 50 | { 0x2fce, 0x9f13 }, 51 | { 0x2f1d, 0x53e3 }, 52 | { 0x2f2f, 0x5de5 }, 53 | { 0x2f8f, 0x884c }, 54 | { 0x2fb9, 0x9999 }, 55 | { 0x2fbc, 0x9ad8 }, 56 | { 0x2fca, 0x9ed2 }, 57 | { 0x2fbb, 0x9aa8 }, 58 | { 0x2f89, 0x826e }, 59 | { 0x2f6b, 0x76bf }, 60 | { 0x2f2d, 0x5c71 }, 61 | { 0x2f20, 0x58eb }, 62 | { 0x2f26, 0x5b50 }, 63 | { 0x2f40, 0x652f }, 64 | { 0x2f4c, 0x6b62 }, 65 | { 0x2f52, 0x6c0f }, 66 | { 0x2f77, 0x7cf8 }, 67 | { 0x2f84, 0x81f3 }, 68 | { 0x2eed, 0x6b6f }, 69 | { 0x2f70, 0x793a }, 70 | { 0x2f7d, 0x800c }, 71 | { 0x2f7f, 0x8033 }, 72 | { 0x2f83, 0x81ea }, 73 | { 0x2fc5, 0x9e7f }, 74 | { 0x2f9e, 0x8eca }, 75 | { 0x2f3f, 0x624b }, 76 | { 0x2fb8, 0x9996 }, 77 | { 0x2f88, 0x821f }, 78 | { 0x2f17, 0x5341 }, 79 | { 0x2f25, 0x5973 }, 80 | { 0x2f29, 0x5c0f }, 81 | { 0x2f8a, 0x8272 }, 82 | { 0x2fb7, 0x98df }, 83 | { 0x2f3c, 0x5fc3 }, 84 | { 0x2f82, 0x81e3 }, 85 | { 0x2f9d, 0x8eab }, 86 | { 0x2f9f, 0x8f9b }, 87 | { 0x2f08, 0x4eba }, 88 | { 0x2f54, 0x6c34 }, 89 | { 0x2f28, 0x5bf8 }, 90 | { 0x2f63, 0x751f }, 91 | { 0x2ec4, 0x897f }, 92 | { 0x2ed8, 0x9752 }, 93 | { 0x2eeb, 0x6589 }, 94 | { 0x2f6f, 0x77f3 }, 95 | { 0x2f9a, 0x8d64 }, 96 | { 0x2f86, 0x820c }, 97 | { 0x2f87, 0x821b }, 98 | { 0x2fcf, 0x9f20 }, 99 | { 0x2f9b, 0x8d70 }, 100 | { 0x2f9c, 0x8db3 }, 101 | { 0x2f24, 0x5927 }, 102 | { 0x2fa0, 0x8fb0 }, 103 | { 0x2f95, 0x8c37 }, 104 | { 0x2f75, 0x7af9 }, 105 | { 0x2f8d, 0x866b }, 106 | { 0x2ed1, 0x9577 }, 107 | { 0x2fa7, 0x9577 }, 108 | { 0x2fc3, 0x9ce5 }, 109 | { 0x2f56, 0x722a }, 110 | { 0x2fcd, 0x9f0e }, 111 | { 0x2f65, 0x7530 }, 112 | { 0x2f43, 0x6597 }, 113 | { 0x2f1f, 0x571f }, 114 | { 0x2f11, 0x5200 }, 115 | { 0x2f96, 0x8c46 }, 116 | { 0x2fa3, 0x9149 }, 117 | { 0x2f06, 0x4e8c }, 118 | { 0x2f81, 0x8089 }, 119 | { 0x2f47, 0x65e5 }, 120 | { 0x2f0a, 0x5165 }, 121 | { 0x2fba, 0x99ac }, 122 | { 0x2f69, 0x767d }, 123 | { 0x2ee8, 0x9ea6 }, 124 | { 0x2f0b, 0x516b }, 125 | { 0x2fa4, 0x91c6 }, 126 | { 0x2f50, 0x6bd4 }, 127 | { 0x2f6a, 0x76ae }, 128 | { 0x2fae, 0x975e }, 129 | { 0x2fb6, 0x98db }, 130 | { 0x2fd0, 0x9f3b }, 131 | { 0x2f66, 0x758b }, 132 | { 0x2f57, 0x7236 }, 133 | { 0x2fa9, 0x961c }, 134 | { 0x2fb5, 0x98a8 }, 135 | { 0x2f42, 0x6587 }, 136 | { 0x2f76, 0x7c73 }, 137 | { 0x2fb4, 0x9801 }, 138 | { 0x2f5a, 0x7247 }, 139 | { 0x2e9f, 0x6bcd }, 140 | { 0x2f45, 0x65b9 }, 141 | { 0x2f18, 0x535c }, 142 | { 0x2fc7, 0x9ebb }, 143 | { 0x2f1c, 0x53c8 }, 144 | { 0x2e92, 0x5df3 }, 145 | { 0x2ea0, 0x6c11 }, 146 | { 0x2f6d, 0x77db }, 147 | { 0x2faf, 0x9762 }, 148 | { 0x2f51, 0x6bdb }, 149 | { 0x2f4a, 0x6728 }, 150 | { 0x2f6c, 0x76ee }, 151 | { 0x2fa8, 0x9580 }, 152 | { 0x2f6e, 0x77e2 }, 153 | { 0x2fa2, 0x9091 }, 154 | { 0x2f23, 0x5915 }, 155 | { 0x2f64, 0x7528 }, 156 | { 0x2f7a, 0x7f8a }, 157 | { 0x2fa5, 0x91cc }, 158 | { 0x2f74, 0x7acb }, 159 | { 0x2eef, 0x7adc }, 160 | { 0x2fd3, 0x9f8d }, 161 | { 0x2f12, 0x529b }, 162 | { 0x2f7c, 0x8001 }, 163 | { 0x2f02, 0x4e36 }, 164 | { 0x2f03, 0x4e3f }, 165 | { 0x2f05, 0x4e85 }, 166 | { 0x2f07, 0x4ea0 }, 167 | { 0x2f09, 0x513f }, 168 | { 0x2e8e, 0x5140 }, 169 | { 0x2f0c, 0x5182 }, 170 | { 0x2f0d, 0x5196 }, 171 | { 0x2f0e, 0x51ab }, 172 | { 0x2f0f, 0x51e0 }, 173 | { 0x2f10, 0x51f5 }, 174 | { 0x2f13, 0x52f9 }, 175 | { 0x2f14, 0x5315 }, 176 | { 0x2f15, 0x531a }, 177 | { 0x2f16, 0x5338 }, 178 | { 0x2f19, 0x5369 }, 179 | { 0x2f1a, 0x5382 }, 180 | { 0x2f1b, 0x53b6 }, 181 | { 0x2f1e, 0x56d7 }, 182 | { 0x2f21, 0x5902 }, 183 | { 0x2f22, 0x590a }, 184 | { 0x2f27, 0x5b80 }, 185 | { 0x2e90, 0x5c22 }, 186 | { 0x2f2a, 0x5c22 }, 187 | { 0x2f2b, 0x5c38 }, 188 | { 0x2f2c, 0x5c6e }, 189 | { 0x2f2e, 0x5ddb }, 190 | { 0x2e93, 0x5e7a }, 191 | { 0x2f33, 0x5e7a }, 192 | { 0x2f34, 0x5e7f }, 193 | { 0x2f35, 0x5ef4 }, 194 | { 0x2f36, 0x5efe }, 195 | { 0x2f37, 0x5f0b }, 196 | { 0x2e94, 0x5f51 }, 197 | { 0x2f3a, 0x5f61 }, 198 | { 0x2f3b, 0x5f73 }, 199 | { 0x2f3d, 0x6208 }, 200 | { 0x2f41, 0x6534 }, 201 | { 0x2e99, 0x6535 }, 202 | { 0x2f46, 0x65e0 }, 203 | { 0x2e9b, 0x65e1 }, 204 | { 0x2f48, 0x66f0 }, 205 | { 0x2f4d, 0x6b79 }, 206 | { 0x2f4e, 0x6bb3 }, 207 | { 0x2f4f, 0x6bcb }, 208 | { 0x2f53, 0x6c14 }, 209 | { 0x2f58, 0x723b }, 210 | { 0x2f59, 0x723f }, 211 | { 0x2f68, 0x7676 }, 212 | { 0x2f79, 0x7f51 }, 213 | { 0x2f7e, 0x8012 }, 214 | { 0x2f80, 0x807f }, 215 | { 0x2f8b, 0x8278 }, 216 | { 0x2f8c, 0x864d }, 217 | { 0x2f91, 0x897e }, 218 | { 0x2f97, 0x8c55 }, 219 | { 0x2f98, 0x8c78 }, 220 | { 0x2faa, 0x96b6 }, 221 | { 0x2fab, 0x96b9 }, 222 | { 0x2fb1, 0x97cb }, 223 | { 0x2fb2, 0x97ed }, 224 | { 0x2fbd, 0x9adf }, 225 | { 0x2fbe, 0x9b25 }, 226 | { 0x2fbf, 0x9b2f }, 227 | { 0x2fc0, 0x9b32 }, 228 | { 0x2fc4, 0x9e75 }, 229 | { 0x2fc6, 0x9ea5 }, 230 | { 0x2fcb, 0x9ef9 }, 231 | { 0x2fcc, 0x9efd }, 232 | { 0x2fd1, 0x9f4a }, 233 | { 0x2fd2, 0x9f52 }, 234 | { 0x2fd4, 0x9f9c }, 235 | { 0x2fd5, 0x9fa0 }, 236 | { 0x2f8ed, 0x6adb }, 237 | { 0x2f8dc, 0x6753 }, 238 | { 0xf9ec, 0x6eba }, 239 | { 0x2f877, 0x5c60 }, 240 | { 0xf992, 0x6f23 }, 241 | { 0xf993, 0x7149 }, 242 | { 0x2f818, 0x51a4 }, 243 | { 0x2f5b, 0x7259 }, 244 | { 0x2f01, 0x4e28 }, 245 | { 0x2fad, 0x9751 }, 246 | { 0x2fc8, 0x9ec3 }, 247 | // { 0x2ebd, 0x26951 }, 248 | // { 0x2e8d, 0x2d544 }, 249 | { 0x2edf, 0x98e0 }, 250 | // { 0x2ede, 0x2967f }, 251 | { 0x2e97, 0x38fa }, 252 | { 0x2e85, 0x4ebb }, 253 | { 0x2ec3, 0x8980 }, 254 | // { 0x2eca, 0x27fb7 }, 255 | // { 0x2eae, 0x25ad7 }, 256 | // { 0x2eaa, 0x24d14 }, 257 | { 0x2eb9, 0x8002 }, 258 | // { 0x2e87, 0x20628 }, 259 | { 0x2f82c, 0x20984 }, 260 | { 0x2e8b, 0x353e }, 261 | { 0x2f8db, 0x233cc }, 262 | { 0x2ea6, 0x4e2c }, 263 | // { 0x2eb3, 0x2626a }, 264 | { 0x2ebe, 0x8279 }, 265 | { 0x2e83, 0x4e5a }, 266 | { 0x2e89, 0x5202 }, 267 | { 0x2e8f, 0x5c23 }, 268 | { 0x2f39, 0x5f50 }, 269 | { 0x2e96, 0x5fc4 }, 270 | { 0x2e98, 0x624c }, 271 | { 0x2ea1, 0x6c35 }, 272 | { 0x2ea3, 0x706c }, 273 | { 0x2ea8, 0x72ad }, 274 | { 0x2f67, 0x7592 }, 275 | { 0x2ead, 0x793b }, 276 | { 0x2f71, 0x79b8 }, 277 | { 0x2eab, 0x7f52 }, 278 | { 0x2eb2, 0x7f52 }, 279 | { 0x2eb1, 0x7f53 }, 280 | { 0x2ec2, 0x8864 }, 281 | { 0x2fa1, 0x8fb5 }, 282 | { 0x2ecd, 0x8fb6 }, 283 | { 0x2ed2, 0x9578 }, 284 | { 0x2ecf, 0x961d }, 285 | { 0x2ed6, 0x961d }, 286 | { 0x2e9e, 0x6b7a }, 287 | }; 288 | -------------------------------------------------------------------------------- /Readme.ja.md: -------------------------------------------------------------------------------- 1 | 2 | # PDF 内の ToUnicod CMap を修正してコピペの文字化けを防ぐツール 3 | 4 | [ [English](Readme.md) / 日本語 (Japanese) ] 5 | 6 | PDF から文字をコピペするとき、 7 | PDF によっては「見」とか「高」などの漢字が 8 | よく似ているけど違う文字(康熙部首や CJK 部首補助などの特殊な文字) 9 | に化けてしまう、なんてことが結構あります。 10 | このツールはそうした文字化けする PDF を修正し、 11 | 文字化けしない PDF を生成します。 12 | 13 | ## 注意 14 | 15 | このツールで生成した PDF から元の 16 | PDF へ逆変換することはできません(不可逆です)。 17 | また、気が付かないところで PDF が壊れている可能性も否定できません。 18 | 元の PDF は必ずバックアップを残しておいてください。 19 | 20 | ## 使い方 21 | 22 | ``` 23 | $ pdf-fix-tuc 24 | Fix ToUnicode CMap in PDF 0.0.0 25 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved. 26 | License: BSD-2-Clause 27 | 28 | https://github.com/trueroad/pdf-fix-tuc 29 | 30 | Usage: pdf-fix-tuc [options] [INPUT.pdf OUTPUT.pdf] ... 31 | 32 | -h, --help 33 | Print help and exit 34 | -V, --version 35 | Print version and exit 36 | --verbose 37 | Verbose 38 | 39 | Output PDF settings (QPDF): 40 | --linearize 41 | Output linearized (web-optimized) PDF 42 | --object-streams=[preserve|disable|generate] (default=preserve) 43 | Settings for object streams 44 | --newline-before-endstream 45 | Output newline before endstream 46 | --qdf 47 | Output QDF 48 | 49 | $ 50 | ``` 51 | 52 | ## インストール 53 | 54 | ### 必要なもの 55 | 56 | * C++11 コンパイラ(g++ 4.9 以降等) 57 | * [libqpdf](http://qpdf.sourceforge.net/) 58 | * pkg-config 等 59 | * Autoconf 2.69 以降 60 | * Automake 61 | 62 | パッケージを使って準備したい場合には、以下が便利でしょう。 63 | 64 | * Debian / Ubuntu 65 | + libqpdf-dev 66 | * Fedora 67 | + qpdf-devel 68 | * Cygwin 69 | + libqpdf-devel 70 | 71 | ### ビルド・インストール方法 72 | 73 | ``` 74 | $ git clone https://github.com/trueroad/pdf-fix-tuc.git 75 | $ cd pdf-fix-tuc 76 | $ ./autogen.sh 77 | $ mkdir build 78 | $ cd build 79 | $ ../configure 80 | $ make 81 | $ make install 82 | ``` 83 | 84 | ## しくみ 85 | 86 | ### PDF のテキスト抽出(コピペ) 87 | 88 | 多くの PDF は文字を表現する際に、 89 | Unicode のような文字コードではなく 90 | CID (Character ID) や GID (Glyph ID) と呼ばれる 91 | フォント内のグリフ(字形)を識別するコードが用いられています。 92 | 93 | PDF を作るときには、使用するフォントの cmap テーブルを使って 94 | Unicode のコードポイントを CID/GID へ変換し、それを PDF 内に記述します。 95 | 例えば「見」という漢字は Unicode では U+898B と表現されます。 96 | この CID/GID が何になるかはフォントによって違うのですが、 97 | [原ノ味フォント](https://github.com/trueroad/HaranoAjiFonts) 98 | のような 99 | [Adobe-Japan1](https://github.com/adobe-type-tools/Adobe-Japan1) 100 | に従っているフォントの場合、対応する CID は CID+1887 になります。 101 | つまり、PDF にする前は U+898B と表現されていた「見」を PDF にすると、 102 | PDF 内部では CID+1887 として記録されます。 103 | この PDF を表示や印刷する際には、CID+1887 を元に 104 | フォントから必要なグリフを取り出して表示や印刷をします。 105 | 106 | 一方で、PDF からテキスト抽出(文字をコピペしたり検索したり等)する際には、 107 | CID+1887 だけでは元の文字コードがわかりません。 108 | そこで CID/GID から Unicode コードポイントへ変換できる 109 | ToUnicode CMap という変換テーブルが用いられます。 110 | 多くの PDF では PDF 生成時に使用するフォント毎に ToUnicode CMap が生成され 111 | PDF 内部に埋め込まれるため、これによってテキスト抽出が可能になっています。 112 | 113 | ### 文字化けの原因 114 | 115 | 「見」などをコピペすると文字化けする PDF は、 116 | この ToUnicode CMap の生成方法に問題がある、 117 | つまり PDF を作るのに使ったツールに問題があります。 118 | ですが、メジャーどころも含めて多くのツールが同じ問題を抱えており、 119 | こうしたツールで生成した PDF は同じ文字化けを起こしてしまいます 120 | (こうした問題に配慮してコピペが文字化けしない PDF 生成ツールもあります)。 121 | 122 | 問題の一端は cmap テーブルの構造にあります。 123 | cmap テーブルは Unicode コードポイントから CID/GID 124 | へ変換するためのテーブルです。 125 | これを逆変換するテーブルを作れば ToUnicode CMap にすることができます。 126 | しかし、cmap テーブルは 1 対 1 対応になっていないため、 127 | 簡単に逆変換することができません。 128 | 「見」の場合、U+898B → CID+1887 というマッピングの他に、 129 | U+2F92 → CID+1887 というマッピングが書かれています。 130 | CID+1887 から逆変換しようとすると U+898B にすればよいのか 131 | U+2F92 にすればよいのか、わからなくて困ってしまいます。 132 | 「見」の他にも、複数の Unicode コードポイントが一つの CID/GID 133 | にマッピングされているものはたくさんあり、n 対 1 対応になっています。 134 | なぜ 1 対 1 対応になっていないのかというと、Unicode 135 | は見た目が同じでも由来が異なれば異なるコードポイントを割り当てるのに対して、 136 | CID/GID はフォントの仕組みなので見た目(字形)が同じであれば 137 | 同じ ID を割り当てる、という方針の違いがあるからと言えると思います。 138 | 139 | こうした n 対 1 対応がある、ということを考慮せず逆変換しようとして 140 | CID+1887 に対応する Unicode コードポイントを 141 | cmap テーブルから探すとどうなるでしょうか。 142 | 恐らく cmap テーブルを上から順番に検索して最初に見つかった CID+1887 143 | に対応する Unicode コードポイントを返すでしょう。 144 | cmap テーブルは Unicode コードポイントの昇順に並んでいますから、 145 | 最初に見つかるのは番号の小さい U+2F92 の方ということになります。 146 | というわけで、元々は cmap テーブルで U+898B → CID+1887 を使ったのに 147 | ToUnicode CMap には CID+1887 → U+2F92 が書かれてしまい、 148 | 元々 U+898B だった「見」が違う文字である U+2F92 に化けてしまう、 149 | ということになります。 150 | 151 | この U+2F92 というのは何かというと、 152 | Unicode では 'KANGXI RADICAL SEE' という名前がついていて 153 | 'Kangxi Radicals' というブロックに入っています。 154 | Kangxi Radicals というのは康熙部首(康煕部首)と呼ばれるもので、 155 | 普通の漢字ではなく部首を表す特殊な文字が収められているブロックになります。 156 | 特殊な文字ですから普通は使いませんし、 157 | これらが収録されていないフォントもあります。 158 | 収録されていないフォントで表示しようとすると他のフォントで代替されたりして、 159 | そこだけ変な表示になってしまいます。 160 | また、普通の「見」で検索をかけると U+898B を探すことになり、 161 | U+2F92 は違う文字なのでヒットしなくなってしまいます。 162 | 163 | 一方の U+898B は 'CJK Unified Ideographs' つまり 164 | CJK 統合漢字のブロックに入っている普通の漢字です。 165 | この問題のもう一つの一端は、 166 | 見た目が同じ文字に複数の Unicode 167 | コードポイントが割り当てられているのは仕方ないにしても、 168 | 普通は使わない文字の方が小さい番号、 169 | 普通に使う漢字の方が大きい番号になっており、 170 | 逆変換で単純に小さい番号の方を採用すると化けてしまう、 171 | というところにあります。 172 | 173 | ### 文字化けしない PDF 生成ツール 174 | 175 | cmap テーブルの逆変換に頼らずに 176 | 元の Unicode コードポイントと CID/GID の関係を覚えておいて 177 | ToUnicode CMap を生成するような PDF 生成ツールであれば、 178 | コピペで文字化けしない PDF ができます 179 | (ただし、一つの PDF で同じフォントに対して 180 | U+2F92 と U+898B の両方が使われていたら区別できませんので、 181 | どちらかに寄せなければならなくなります)。 182 | 183 | また、cmap テーブルの逆変換で ToUnicode CMap を生成するような 184 | PDF 生成ツールでも、n 対 1 対応を考慮して、 185 | 闇雲に最初に見つかった番号の小さい Unicode コードポイントを返すのではなく、 186 | 複数の候補の中から何らかの基準で優先順位をつけて、 187 | よりふさわしい Unicode コードポイントを返すようにすれば、 188 | ほとんどの場合でコピペが文字化けしなくなるハズです。 189 | 190 | 例えば XeTeX で使われる xdvipdfmx が PDF を生成する場合は、 191 | xdvipdfmx の入力ファイル (.xdv) は既に CID/GID しか記載されておらず 192 | 元の Unicode コードポイントが失われた状態になっているため、 193 | 後者の方法でコピペが文字化けしない PDF を生成しています 194 | (他にもフォントが Adobe-Japan1 の場合は ToUnicode CMap を埋め込まない 195 | という動作でもコピペの文字化けを防いでおり、これと同様になるよう 196 | PDF から Adobe-Japan1 フォントの ToUnicode CMap を削除するツール[ 197 | pdf-rm-tuc 198 | ](https://github.com/trueroad/pdf-rm-tuc) 199 | もあります)。 200 | 優先度の決定は Unicode のコードポイントの範囲を区切って行っています。 201 | ソースコードに入っている[ 202 | tt_cmap.c 203 | ](https://github.com/texjporg/tex-jp-build/blob/master/source/texk/dvipdfm-x/tt_cmap.c) 204 | にある `is_PUA_or_presentation` 関数を見ると 205 | 以下のようになっていてその範囲がわかります。 206 | 207 | ```c 208 | /* Soft-hyphen (U+00AD) to lower its priority... added here for convenience */ 209 | static int is_PUA_or_presentation (unsigned int uni) 210 | { 211 | /* Some of CJK Radicals Supplement and Kangxi Radicals 212 | * are commonly double encoded, lower the priority. 213 | * CJK Compatibility Ideographs & Supplement added. 214 | */ 215 | return ((uni >= 0x2E80 && uni <= 0x2EF3) || (uni >= 0x2F00 && uni <= 0x2FD5) || 216 | (uni >= 0xE000 && uni <= 0xF8FF) || (uni >= 0xFB00 && uni <= 0xFB4F) || 217 | (uni >= 0xF900 && uni <= 0xFAFF) || (uni >= 0x2F800 && uni <= 0x2FA1F) || 218 | (uni >= 0xF0000 && uni <= 0xFFFFD) || (uni >= 0x100000 && uni <= 0x10FFFD) || 219 | (uni == 0x00AD)); 220 | } 221 | ``` 222 | 223 | 「見」が化ける先の U+2F92 は康熙部首のブロックに入っており、 224 | これを判定する `(uni >= 0x2F00 && uni <= 0x2FD5)` 225 | に含まれているので優先度が低くなり採用されません。 226 | その結果、ToUnicode CMap には正しい方の 227 | CID+1887 → U+898B 228 | が記載されるためコピペが文字化けしなくなるというわけです。 229 | 他にも康煕部首と同様の部首が収められている 230 | 'CJK Radicals Supplement' CJK 部首補助のブロックや、 231 | 'CJK Compatibility Ideographs' CJK 互換漢字、 232 | 'CJK Compatibility Ideographs Supplement' CJK 互換漢字補助、 233 | 'Private Use Area' 私用領域、 234 | などが除外されるようになっています。 235 | 236 | ### 本ツールの動作 237 | 238 | 本ツールは、これまでに説明したようなコピペで文字化けする PDF の 239 | ToUnicode CMap に手を入れてコピペが文字化けしないようにします。 240 | 具体的には ToUnicode CMap に 241 | CID+1887 → U+2F92 242 | のような変換先が優先度の低い Unicode になっているマッピングを見つけたら、 243 | 変換先を優先度が高い Unicode に書き換え 244 | CID+1887 → U+898B 245 | のようにした PDF を出力します。 246 | 247 | この書き換えを行うテーブルは自動生成したものをソースの[ 248 | tounicode_table.cc 249 | ](https://github.com/trueroad/pdf-fix-tuc/blob/master/src/tounicode_table.cc) 250 | に入れてあります。これを自動生成するスクリプトもソースの[ 251 | build_table.py 252 | ](https://github.com/trueroad/pdf-fix-tuc/blob/master/src/build_table.py) 253 | に同梱してあります(ただし、普通にビルドしても自動生成は発動しません)。 254 | 自動生成の方法は、[ 255 | Adobe-Japan1 256 | ](https://github.com/adobe-type-tools/Adobe-Japan1) 257 | で配布されている漢字グリフの一覧表[ 258 | aj17-kanji.txt 259 | ](https://github.com/adobe-type-tools/Adobe-Japan1/blob/master/aj17-kanji.txt) 260 | を使い、 261 | 同じ CID に複数の Unicode コードポイントが割り当てられてるものについて、 262 | xdvipdfmx と同じ方法で優先度の低い Unicode コードポイントを判定し、 263 | 同じグリフ内で優先度が低い Unicode コードポイントを優先度が低くない 264 | Unicode コードポイントへ書き換える、 265 | というテーブルを生成しています。 266 | ただし、今のところ BMP から 267 | BMP 外へ書き換えるようなテーブルは生成しないようにしています。 268 | 269 | Adobe-Japan1 を元にしてテーブルを生成しているため 270 | 非 Adobe-Japan1 フォントではうまくいかなくなる可能性もありますが、 271 | ほとんどの日本語フォントは非 Adobe-Japan1 であっても 272 | 漢字グリフは Adobe-Japan1 のグリフセットをベースにしていると思いますので、 273 | 大抵は大丈夫だろうと思っています。 274 | 275 | ## ライセンス 276 | 277 | Copyright (C) 2021 Masamichi Hosoda. All rights reserved. 278 | 279 | License: BSD-2-Clause 280 | 281 | [LICENSE](./LICENSE) をご覧ください。 282 | -------------------------------------------------------------------------------- /src/tounicode.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Fix ToUnicode CMap in PDF 3 | // https://github.com/trueroad/pdf-fix-tuc 4 | // 5 | // tounicode.cc: ToUnicode CMap handler class 6 | // 7 | // Copyright (C) 2021 Masamichi Hosoda. 8 | // All rights reserved. 9 | // 10 | // Redistribution and use in source and binary forms, with or without 11 | // modification, are permitted provided that the following conditions 12 | // are met: 13 | // 14 | // * Redistributions of source code must retain the above copyright notice, 15 | // this list of conditions and the following disclaimer. 16 | // 17 | // * Redistributions in binary form must reproduce the above copyright notice, 18 | // this list of conditions and the following disclaimer in the documentation 19 | // and/or other materials provided with the distribution. 20 | // 21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | // ARE DISCLAIMED. 25 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 26 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 | // SUCH DAMAGE. 33 | // 34 | 35 | #include "tounicode.hh" 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #include "regex_dispatcher_m.hh" 45 | 46 | tounicode::tounicode (): 47 | regex_dispatcher::member_table 48 | ({ 49 | { std::regex (R"((<[\da-fA-F]+>\s*)<([\da-fA-F]+)>(\r?))"), 50 | &tounicode::bfchar_hex}, 51 | { std::regex (R"(<([\da-fA-F]+)>(\s*)<([\da-fA-F]+)>(\s*)<([\da-fA-F]+)>(\r?))"), 52 | &tounicode::bfrange_hex}, 53 | { std::regex (R"((\d+)(\s*)beginbfchar(\r?))"), 54 | &tounicode::beginbfchar}, 55 | { std::regex (R"((\d+)(\s*)beginbfrange(\r?))"), 56 | &tounicode::beginbfrange}, 57 | { std::regex (R"(endbfchar\r?)"), 58 | &tounicode::endbfchar}, 59 | { std::regex (R"(endbfrange\r?)"), 60 | &tounicode::endbfrange}, 61 | { std::regex (R"(.*\r?)"), 62 | &tounicode::other}, 63 | }) 64 | { 65 | } 66 | 67 | std::string tounicode::get (void) 68 | { 69 | return ss_.str (); 70 | } 71 | 72 | bool tounicode::bfchar_hex (const std::smatch &sm) 73 | { 74 | const auto prefix {sm[1].str ()}; 75 | auto uni {std::stoi (sm[2].str (), nullptr, 16)}; 76 | const auto cr {sm[3].str ()}; 77 | 78 | if (is_bfchar_ && table_.find (uni) != table_.end ()) 79 | { 80 | uni = table_.at (uni); 81 | 82 | ss_ << prefix 83 | << "<" 84 | << std::hex << std::uppercase 85 | << std::setw (4) << std::setfill ('0') 86 | << uni 87 | << std::dec << std::nouppercase 88 | << ">" 89 | << cr 90 | << std::endl; 91 | } 92 | else 93 | { 94 | ss_ << sm[0].str () << std::endl; 95 | } 96 | 97 | return true; 98 | } 99 | 100 | void tounicode::bfrange_output (const int cid_start, 101 | const int cid_end, 102 | const int uni_start, 103 | const std::string &space1, 104 | const std::string &space2, 105 | const std::string &cr) 106 | { 107 | ss_bfrange_ << "<" 108 | << std::hex << std::uppercase 109 | << std::setw (4) << std::setfill ('0') 110 | << cid_start 111 | << ">" 112 | << space1 113 | << "<" 114 | << std::setw (4) << std::setfill ('0') 115 | << cid_end 116 | << ">" 117 | << space2 118 | << "<" 119 | << std::setw (4) << std::setfill ('0') 120 | << uni_start 121 | << std::dec << std::nouppercase 122 | << ">" 123 | << cr 124 | << std::endl; 125 | ++bfranges_; 126 | } 127 | 128 | bool tounicode::bfrange_hex (const std::smatch &sm) 129 | { 130 | if (!is_bfrange_) 131 | { 132 | ss_ << sm[0].str () << std::endl; 133 | return true; 134 | } 135 | 136 | const auto cid_start {std::stoi (sm[1].str (), nullptr, 16)}; 137 | const auto cid_end {std::stoi (sm[3].str (), nullptr, 16)}; 138 | const auto uni_start {std::stoi (sm[5].str (), nullptr, 16)}; 139 | const auto space1 {sm[2].str ()}; 140 | const auto space2 {sm[4].str ()}; 141 | const auto cr {sm[6].str ()}; 142 | 143 | auto cid_new_range_start = cid_start; 144 | auto uni_new_range_start = uni_start; 145 | auto cid = cid_start; 146 | auto uni = uni_start; 147 | for (; cid <= cid_end; ++cid, ++uni) 148 | { 149 | if (table_.find (uni) != table_.end ()) 150 | { 151 | if (cid_new_range_start < cid) 152 | bfrange_output (cid_new_range_start, cid - 1, 153 | uni_new_range_start, 154 | space1, space2, cr); 155 | 156 | const auto uni_new = table_.at (uni); 157 | bfrange_output (cid, cid, uni_new, space1, space2, cr); 158 | 159 | cid_new_range_start = cid + 1; 160 | uni_new_range_start = uni + 1; 161 | } 162 | } 163 | 164 | if (cid_new_range_start == cid_start) 165 | { 166 | ss_bfrange_ << sm[0].str () << std::endl; 167 | ++bfranges_; 168 | } 169 | else if (cid_new_range_start < cid) 170 | bfrange_output (cid_new_range_start, cid - 1, 171 | uni_new_range_start, 172 | space1, space2, cr); 173 | 174 | return true; 175 | } 176 | 177 | bool tounicode::beginbfchar (const std::smatch &sm) 178 | { 179 | const auto chars {std::stoi (sm[1].str ())}; 180 | const auto space {sm[2].str ()}; 181 | const auto cr {sm[3].str ()}; 182 | 183 | if (is_bfrange_) 184 | { 185 | std::cerr << "warning: invalid ToUnicode: no endbfrange" << std::endl; 186 | is_bfrange_ = false; 187 | } 188 | if (is_bfchar_) 189 | { 190 | std::cerr << "warning: invalid ToUnicode: no endbfchar" << std::endl; 191 | } 192 | 193 | ss_ << chars 194 | << space 195 | << "beginbfchar" 196 | << cr 197 | << std::endl; 198 | 199 | is_bfchar_ = true; 200 | 201 | return true; 202 | } 203 | 204 | bool tounicode::endbfchar (const std::smatch &sm) 205 | { 206 | if (is_bfrange_) 207 | { 208 | std::cerr << "warning: invalid ToUnicode: no endbfrange" << std::endl; 209 | is_bfrange_ = false; 210 | } 211 | if (is_bfchar_) 212 | { 213 | is_bfchar_ = false; 214 | } 215 | else 216 | { 217 | std::cerr << "warning: invalid ToUnicode: no beginbfchar" << std::endl; 218 | } 219 | 220 | ss_ << sm[0].str () 221 | << std::endl; 222 | 223 | return true; 224 | } 225 | 226 | bool tounicode::beginbfrange (const std::smatch &sm) 227 | { 228 | beginbfrange_space_ = sm[2].str (); 229 | beginbfrange_cr_ = sm[3].str (); 230 | 231 | if (is_bfchar_) 232 | { 233 | std::cerr << "warning: invalid ToUnicode: no endbfchar" << std::endl; 234 | is_bfchar_ = false; 235 | } 236 | if (is_bfrange_) 237 | { 238 | std::cerr << "warning: invalid ToUnicode: no endbfrange" << std::endl; 239 | } 240 | 241 | is_bfrange_ = true; 242 | ss_bfrange_.str (""); 243 | ss_bfrange_.clear (); 244 | bfranges_ = 0; 245 | 246 | return true; 247 | } 248 | 249 | bool tounicode::endbfrange (const std::smatch &sm) 250 | { 251 | if (is_bfchar_) 252 | { 253 | std::cerr << "warning: invalid ToUnicode: no endbfchar" << std::endl; 254 | is_bfchar_ = false; 255 | } 256 | if (is_bfrange_) 257 | { 258 | is_bfrange_ = false; 259 | } 260 | else 261 | { 262 | std::cerr << "warning: invalid ToUnicode: no beginbfrange" << std::endl; 263 | } 264 | 265 | ss_ << bfranges_ 266 | << beginbfrange_space_ 267 | << "beginbfrange" 268 | << beginbfrange_cr_ 269 | << std::endl 270 | << ss_bfrange_.str () 271 | << sm[0].str () 272 | << std::endl; 273 | ss_bfrange_.str (""); 274 | ss_bfrange_.clear (); 275 | bfranges_ = 0; 276 | 277 | return true; 278 | } 279 | 280 | bool tounicode::other (const std::smatch &sm) 281 | { 282 | ss_ << sm[0].str () 283 | << std::endl; 284 | 285 | return true; 286 | } 287 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the specified 12 | # version of the C++ standard. If necessary, add switches to CXX and 13 | # CXXCPP to enable support. VERSION may be '11' (for the C++11 standard) 14 | # or '14' (for the C++14 standard). 15 | # 16 | # The second argument, if specified, indicates whether you insist on an 17 | # extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. 18 | # -std=c++11). If neither is specified, you get whatever works, with 19 | # preference for an extended mode. 20 | # 21 | # The third argument, if specified 'mandatory' or if left unspecified, 22 | # indicates that baseline support for the specified C++ standard is 23 | # required and that the macro should error out if no mode with that 24 | # support is found. If specified 'optional', then configuration proceeds 25 | # regardless, after defining HAVE_CXX${VERSION} if and only if a 26 | # supporting mode is found. 27 | # 28 | # LICENSE 29 | # 30 | # Copyright (c) 2008 Benjamin Kosnik 31 | # Copyright (c) 2012 Zack Weinberg 32 | # Copyright (c) 2013 Roy Stogner 33 | # Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov 34 | # Copyright (c) 2015 Paul Norman 35 | # Copyright (c) 2015 Moritz Klammler 36 | # Copyright (c) 2016, 2018 Krzesimir Nowak 37 | # Copyright (c) 2019 Enji Cooper 38 | # 39 | # Copying and distribution of this file, with or without modification, are 40 | # permitted in any medium without royalty provided the copyright notice 41 | # and this notice are preserved. This file is offered as-is, without any 42 | # warranty. 43 | 44 | #serial 11 45 | 46 | dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro 47 | dnl (serial version number 13). 48 | 49 | AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl 50 | m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"], 51 | [$1], [14], [ax_cxx_compile_alternatives="14 1y"], 52 | [$1], [17], [ax_cxx_compile_alternatives="17 1z"], 53 | [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl 54 | m4_if([$2], [], [], 55 | [$2], [ext], [], 56 | [$2], [noext], [], 57 | [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl 58 | m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true], 59 | [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true], 60 | [$3], [optional], [ax_cxx_compile_cxx$1_required=false], 61 | [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])]) 62 | AC_LANG_PUSH([C++])dnl 63 | ac_success=no 64 | 65 | m4_if([$2], [noext], [], [dnl 66 | if test x$ac_success = xno; then 67 | for alternative in ${ax_cxx_compile_alternatives}; do 68 | switch="-std=gnu++${alternative}" 69 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 70 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 71 | $cachevar, 72 | [ac_save_CXX="$CXX" 73 | CXX="$CXX $switch" 74 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 75 | [eval $cachevar=yes], 76 | [eval $cachevar=no]) 77 | CXX="$ac_save_CXX"]) 78 | if eval test x\$$cachevar = xyes; then 79 | CXX="$CXX $switch" 80 | if test -n "$CXXCPP" ; then 81 | CXXCPP="$CXXCPP $switch" 82 | fi 83 | ac_success=yes 84 | break 85 | fi 86 | done 87 | fi]) 88 | 89 | m4_if([$2], [ext], [], [dnl 90 | if test x$ac_success = xno; then 91 | dnl HP's aCC needs +std=c++11 according to: 92 | dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf 93 | dnl Cray's crayCC needs "-h std=c++11" 94 | for alternative in ${ax_cxx_compile_alternatives}; do 95 | for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do 96 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 97 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 98 | $cachevar, 99 | [ac_save_CXX="$CXX" 100 | CXX="$CXX $switch" 101 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 102 | [eval $cachevar=yes], 103 | [eval $cachevar=no]) 104 | CXX="$ac_save_CXX"]) 105 | if eval test x\$$cachevar = xyes; then 106 | CXX="$CXX $switch" 107 | if test -n "$CXXCPP" ; then 108 | CXXCPP="$CXXCPP $switch" 109 | fi 110 | ac_success=yes 111 | break 112 | fi 113 | done 114 | if test x$ac_success = xyes; then 115 | break 116 | fi 117 | done 118 | fi]) 119 | AC_LANG_POP([C++]) 120 | if test x$ax_cxx_compile_cxx$1_required = xtrue; then 121 | if test x$ac_success = xno; then 122 | AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.]) 123 | fi 124 | fi 125 | if test x$ac_success = xno; then 126 | HAVE_CXX$1=0 127 | AC_MSG_NOTICE([No compiler with C++$1 support was found]) 128 | else 129 | HAVE_CXX$1=1 130 | AC_DEFINE(HAVE_CXX$1,1, 131 | [define if the compiler supports basic C++$1 syntax]) 132 | fi 133 | AC_SUBST(HAVE_CXX$1) 134 | ]) 135 | 136 | 137 | dnl Test body for checking C++11 support 138 | 139 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11], 140 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 141 | ) 142 | 143 | 144 | dnl Test body for checking C++14 support 145 | 146 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14], 147 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 148 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 149 | ) 150 | 151 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17], 152 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 153 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 154 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 155 | ) 156 | 157 | dnl Tests for new features in C++11 158 | 159 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[ 160 | 161 | // If the compiler admits that it is not ready for C++11, why torture it? 162 | // Hopefully, this will speed up the test. 163 | 164 | #ifndef __cplusplus 165 | 166 | #error "This is not a C++ compiler" 167 | 168 | #elif __cplusplus < 201103L 169 | 170 | #error "This is not a C++11 compiler" 171 | 172 | #else 173 | 174 | namespace cxx11 175 | { 176 | 177 | namespace test_static_assert 178 | { 179 | 180 | template 181 | struct check 182 | { 183 | static_assert(sizeof(int) <= sizeof(T), "not big enough"); 184 | }; 185 | 186 | } 187 | 188 | namespace test_final_override 189 | { 190 | 191 | struct Base 192 | { 193 | virtual ~Base() {} 194 | virtual void f() {} 195 | }; 196 | 197 | struct Derived : public Base 198 | { 199 | virtual ~Derived() override {} 200 | virtual void f() override {} 201 | }; 202 | 203 | } 204 | 205 | namespace test_double_right_angle_brackets 206 | { 207 | 208 | template < typename T > 209 | struct check {}; 210 | 211 | typedef check single_type; 212 | typedef check> double_type; 213 | typedef check>> triple_type; 214 | typedef check>>> quadruple_type; 215 | 216 | } 217 | 218 | namespace test_decltype 219 | { 220 | 221 | int 222 | f() 223 | { 224 | int a = 1; 225 | decltype(a) b = 2; 226 | return a + b; 227 | } 228 | 229 | } 230 | 231 | namespace test_type_deduction 232 | { 233 | 234 | template < typename T1, typename T2 > 235 | struct is_same 236 | { 237 | static const bool value = false; 238 | }; 239 | 240 | template < typename T > 241 | struct is_same 242 | { 243 | static const bool value = true; 244 | }; 245 | 246 | template < typename T1, typename T2 > 247 | auto 248 | add(T1 a1, T2 a2) -> decltype(a1 + a2) 249 | { 250 | return a1 + a2; 251 | } 252 | 253 | int 254 | test(const int c, volatile int v) 255 | { 256 | static_assert(is_same::value == true, ""); 257 | static_assert(is_same::value == false, ""); 258 | static_assert(is_same::value == false, ""); 259 | auto ac = c; 260 | auto av = v; 261 | auto sumi = ac + av + 'x'; 262 | auto sumf = ac + av + 1.0; 263 | static_assert(is_same::value == true, ""); 264 | static_assert(is_same::value == true, ""); 265 | static_assert(is_same::value == true, ""); 266 | static_assert(is_same::value == false, ""); 267 | static_assert(is_same::value == true, ""); 268 | return (sumf > 0.0) ? sumi : add(c, v); 269 | } 270 | 271 | } 272 | 273 | namespace test_noexcept 274 | { 275 | 276 | int f() { return 0; } 277 | int g() noexcept { return 0; } 278 | 279 | static_assert(noexcept(f()) == false, ""); 280 | static_assert(noexcept(g()) == true, ""); 281 | 282 | } 283 | 284 | namespace test_constexpr 285 | { 286 | 287 | template < typename CharT > 288 | unsigned long constexpr 289 | strlen_c_r(const CharT *const s, const unsigned long acc) noexcept 290 | { 291 | return *s ? strlen_c_r(s + 1, acc + 1) : acc; 292 | } 293 | 294 | template < typename CharT > 295 | unsigned long constexpr 296 | strlen_c(const CharT *const s) noexcept 297 | { 298 | return strlen_c_r(s, 0UL); 299 | } 300 | 301 | static_assert(strlen_c("") == 0UL, ""); 302 | static_assert(strlen_c("1") == 1UL, ""); 303 | static_assert(strlen_c("example") == 7UL, ""); 304 | static_assert(strlen_c("another\0example") == 7UL, ""); 305 | 306 | } 307 | 308 | namespace test_rvalue_references 309 | { 310 | 311 | template < int N > 312 | struct answer 313 | { 314 | static constexpr int value = N; 315 | }; 316 | 317 | answer<1> f(int&) { return answer<1>(); } 318 | answer<2> f(const int&) { return answer<2>(); } 319 | answer<3> f(int&&) { return answer<3>(); } 320 | 321 | void 322 | test() 323 | { 324 | int i = 0; 325 | const int c = 0; 326 | static_assert(decltype(f(i))::value == 1, ""); 327 | static_assert(decltype(f(c))::value == 2, ""); 328 | static_assert(decltype(f(0))::value == 3, ""); 329 | } 330 | 331 | } 332 | 333 | namespace test_uniform_initialization 334 | { 335 | 336 | struct test 337 | { 338 | static const int zero {}; 339 | static const int one {1}; 340 | }; 341 | 342 | static_assert(test::zero == 0, ""); 343 | static_assert(test::one == 1, ""); 344 | 345 | } 346 | 347 | namespace test_lambdas 348 | { 349 | 350 | void 351 | test1() 352 | { 353 | auto lambda1 = [](){}; 354 | auto lambda2 = lambda1; 355 | lambda1(); 356 | lambda2(); 357 | } 358 | 359 | int 360 | test2() 361 | { 362 | auto a = [](int i, int j){ return i + j; }(1, 2); 363 | auto b = []() -> int { return '0'; }(); 364 | auto c = [=](){ return a + b; }(); 365 | auto d = [&](){ return c; }(); 366 | auto e = [a, &b](int x) mutable { 367 | const auto identity = [](int y){ return y; }; 368 | for (auto i = 0; i < a; ++i) 369 | a += b--; 370 | return x + identity(a + b); 371 | }(0); 372 | return a + b + c + d + e; 373 | } 374 | 375 | int 376 | test3() 377 | { 378 | const auto nullary = [](){ return 0; }; 379 | const auto unary = [](int x){ return x; }; 380 | using nullary_t = decltype(nullary); 381 | using unary_t = decltype(unary); 382 | const auto higher1st = [](nullary_t f){ return f(); }; 383 | const auto higher2nd = [unary](nullary_t f1){ 384 | return [unary, f1](unary_t f2){ return f2(unary(f1())); }; 385 | }; 386 | return higher1st(nullary) + higher2nd(nullary)(unary); 387 | } 388 | 389 | } 390 | 391 | namespace test_variadic_templates 392 | { 393 | 394 | template 395 | struct sum; 396 | 397 | template 398 | struct sum 399 | { 400 | static constexpr auto value = N0 + sum::value; 401 | }; 402 | 403 | template <> 404 | struct sum<> 405 | { 406 | static constexpr auto value = 0; 407 | }; 408 | 409 | static_assert(sum<>::value == 0, ""); 410 | static_assert(sum<1>::value == 1, ""); 411 | static_assert(sum<23>::value == 23, ""); 412 | static_assert(sum<1, 2>::value == 3, ""); 413 | static_assert(sum<5, 5, 11>::value == 21, ""); 414 | static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, ""); 415 | 416 | } 417 | 418 | // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae 419 | // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function 420 | // because of this. 421 | namespace test_template_alias_sfinae 422 | { 423 | 424 | struct foo {}; 425 | 426 | template 427 | using member = typename T::member_type; 428 | 429 | template 430 | void func(...) {} 431 | 432 | template 433 | void func(member*) {} 434 | 435 | void test(); 436 | 437 | void test() { func(0); } 438 | 439 | } 440 | 441 | } // namespace cxx11 442 | 443 | #endif // __cplusplus >= 201103L 444 | 445 | ]]) 446 | 447 | 448 | dnl Tests for new features in C++14 449 | 450 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[ 451 | 452 | // If the compiler admits that it is not ready for C++14, why torture it? 453 | // Hopefully, this will speed up the test. 454 | 455 | #ifndef __cplusplus 456 | 457 | #error "This is not a C++ compiler" 458 | 459 | #elif __cplusplus < 201402L 460 | 461 | #error "This is not a C++14 compiler" 462 | 463 | #else 464 | 465 | namespace cxx14 466 | { 467 | 468 | namespace test_polymorphic_lambdas 469 | { 470 | 471 | int 472 | test() 473 | { 474 | const auto lambda = [](auto&&... args){ 475 | const auto istiny = [](auto x){ 476 | return (sizeof(x) == 1UL) ? 1 : 0; 477 | }; 478 | const int aretiny[] = { istiny(args)... }; 479 | return aretiny[0]; 480 | }; 481 | return lambda(1, 1L, 1.0f, '1'); 482 | } 483 | 484 | } 485 | 486 | namespace test_binary_literals 487 | { 488 | 489 | constexpr auto ivii = 0b0000000000101010; 490 | static_assert(ivii == 42, "wrong value"); 491 | 492 | } 493 | 494 | namespace test_generalized_constexpr 495 | { 496 | 497 | template < typename CharT > 498 | constexpr unsigned long 499 | strlen_c(const CharT *const s) noexcept 500 | { 501 | auto length = 0UL; 502 | for (auto p = s; *p; ++p) 503 | ++length; 504 | return length; 505 | } 506 | 507 | static_assert(strlen_c("") == 0UL, ""); 508 | static_assert(strlen_c("x") == 1UL, ""); 509 | static_assert(strlen_c("test") == 4UL, ""); 510 | static_assert(strlen_c("another\0test") == 7UL, ""); 511 | 512 | } 513 | 514 | namespace test_lambda_init_capture 515 | { 516 | 517 | int 518 | test() 519 | { 520 | auto x = 0; 521 | const auto lambda1 = [a = x](int b){ return a + b; }; 522 | const auto lambda2 = [a = lambda1(x)](){ return a; }; 523 | return lambda2(); 524 | } 525 | 526 | } 527 | 528 | namespace test_digit_separators 529 | { 530 | 531 | constexpr auto ten_million = 100'000'000; 532 | static_assert(ten_million == 100000000, ""); 533 | 534 | } 535 | 536 | namespace test_return_type_deduction 537 | { 538 | 539 | auto f(int& x) { return x; } 540 | decltype(auto) g(int& x) { return x; } 541 | 542 | template < typename T1, typename T2 > 543 | struct is_same 544 | { 545 | static constexpr auto value = false; 546 | }; 547 | 548 | template < typename T > 549 | struct is_same 550 | { 551 | static constexpr auto value = true; 552 | }; 553 | 554 | int 555 | test() 556 | { 557 | auto x = 0; 558 | static_assert(is_same::value, ""); 559 | static_assert(is_same::value, ""); 560 | return x; 561 | } 562 | 563 | } 564 | 565 | } // namespace cxx14 566 | 567 | #endif // __cplusplus >= 201402L 568 | 569 | ]]) 570 | 571 | 572 | dnl Tests for new features in C++17 573 | 574 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[ 575 | 576 | // If the compiler admits that it is not ready for C++17, why torture it? 577 | // Hopefully, this will speed up the test. 578 | 579 | #ifndef __cplusplus 580 | 581 | #error "This is not a C++ compiler" 582 | 583 | #elif __cplusplus < 201703L 584 | 585 | #error "This is not a C++17 compiler" 586 | 587 | #else 588 | 589 | #include 590 | #include 591 | #include 592 | 593 | namespace cxx17 594 | { 595 | 596 | namespace test_constexpr_lambdas 597 | { 598 | 599 | constexpr int foo = [](){return 42;}(); 600 | 601 | } 602 | 603 | namespace test::nested_namespace::definitions 604 | { 605 | 606 | } 607 | 608 | namespace test_fold_expression 609 | { 610 | 611 | template 612 | int multiply(Args... args) 613 | { 614 | return (args * ... * 1); 615 | } 616 | 617 | template 618 | bool all(Args... args) 619 | { 620 | return (args && ...); 621 | } 622 | 623 | } 624 | 625 | namespace test_extended_static_assert 626 | { 627 | 628 | static_assert (true); 629 | 630 | } 631 | 632 | namespace test_auto_brace_init_list 633 | { 634 | 635 | auto foo = {5}; 636 | auto bar {5}; 637 | 638 | static_assert(std::is_same, decltype(foo)>::value); 639 | static_assert(std::is_same::value); 640 | } 641 | 642 | namespace test_typename_in_template_template_parameter 643 | { 644 | 645 | template typename X> struct D; 646 | 647 | } 648 | 649 | namespace test_fallthrough_nodiscard_maybe_unused_attributes 650 | { 651 | 652 | int f1() 653 | { 654 | return 42; 655 | } 656 | 657 | [[nodiscard]] int f2() 658 | { 659 | [[maybe_unused]] auto unused = f1(); 660 | 661 | switch (f1()) 662 | { 663 | case 17: 664 | f1(); 665 | [[fallthrough]]; 666 | case 42: 667 | f1(); 668 | } 669 | return f1(); 670 | } 671 | 672 | } 673 | 674 | namespace test_extended_aggregate_initialization 675 | { 676 | 677 | struct base1 678 | { 679 | int b1, b2 = 42; 680 | }; 681 | 682 | struct base2 683 | { 684 | base2() { 685 | b3 = 42; 686 | } 687 | int b3; 688 | }; 689 | 690 | struct derived : base1, base2 691 | { 692 | int d; 693 | }; 694 | 695 | derived d1 {{1, 2}, {}, 4}; // full initialization 696 | derived d2 {{}, {}, 4}; // value-initialized bases 697 | 698 | } 699 | 700 | namespace test_general_range_based_for_loop 701 | { 702 | 703 | struct iter 704 | { 705 | int i; 706 | 707 | int& operator* () 708 | { 709 | return i; 710 | } 711 | 712 | const int& operator* () const 713 | { 714 | return i; 715 | } 716 | 717 | iter& operator++() 718 | { 719 | ++i; 720 | return *this; 721 | } 722 | }; 723 | 724 | struct sentinel 725 | { 726 | int i; 727 | }; 728 | 729 | bool operator== (const iter& i, const sentinel& s) 730 | { 731 | return i.i == s.i; 732 | } 733 | 734 | bool operator!= (const iter& i, const sentinel& s) 735 | { 736 | return !(i == s); 737 | } 738 | 739 | struct range 740 | { 741 | iter begin() const 742 | { 743 | return {0}; 744 | } 745 | 746 | sentinel end() const 747 | { 748 | return {5}; 749 | } 750 | }; 751 | 752 | void f() 753 | { 754 | range r {}; 755 | 756 | for (auto i : r) 757 | { 758 | [[maybe_unused]] auto v = i; 759 | } 760 | } 761 | 762 | } 763 | 764 | namespace test_lambda_capture_asterisk_this_by_value 765 | { 766 | 767 | struct t 768 | { 769 | int i; 770 | int foo() 771 | { 772 | return [*this]() 773 | { 774 | return i; 775 | }(); 776 | } 777 | }; 778 | 779 | } 780 | 781 | namespace test_enum_class_construction 782 | { 783 | 784 | enum class byte : unsigned char 785 | {}; 786 | 787 | byte foo {42}; 788 | 789 | } 790 | 791 | namespace test_constexpr_if 792 | { 793 | 794 | template 795 | int f () 796 | { 797 | if constexpr(cond) 798 | { 799 | return 13; 800 | } 801 | else 802 | { 803 | return 42; 804 | } 805 | } 806 | 807 | } 808 | 809 | namespace test_selection_statement_with_initializer 810 | { 811 | 812 | int f() 813 | { 814 | return 13; 815 | } 816 | 817 | int f2() 818 | { 819 | if (auto i = f(); i > 0) 820 | { 821 | return 3; 822 | } 823 | 824 | switch (auto i = f(); i + 4) 825 | { 826 | case 17: 827 | return 2; 828 | 829 | default: 830 | return 1; 831 | } 832 | } 833 | 834 | } 835 | 836 | namespace test_template_argument_deduction_for_class_templates 837 | { 838 | 839 | template 840 | struct pair 841 | { 842 | pair (T1 p1, T2 p2) 843 | : m1 {p1}, 844 | m2 {p2} 845 | {} 846 | 847 | T1 m1; 848 | T2 m2; 849 | }; 850 | 851 | void f() 852 | { 853 | [[maybe_unused]] auto p = pair{13, 42u}; 854 | } 855 | 856 | } 857 | 858 | namespace test_non_type_auto_template_parameters 859 | { 860 | 861 | template 862 | struct B 863 | {}; 864 | 865 | B<5> b1; 866 | B<'a'> b2; 867 | 868 | } 869 | 870 | namespace test_structured_bindings 871 | { 872 | 873 | int arr[2] = { 1, 2 }; 874 | std::pair pr = { 1, 2 }; 875 | 876 | auto f1() -> int(&)[2] 877 | { 878 | return arr; 879 | } 880 | 881 | auto f2() -> std::pair& 882 | { 883 | return pr; 884 | } 885 | 886 | struct S 887 | { 888 | int x1 : 2; 889 | volatile double y1; 890 | }; 891 | 892 | S f3() 893 | { 894 | return {}; 895 | } 896 | 897 | auto [ x1, y1 ] = f1(); 898 | auto& [ xr1, yr1 ] = f1(); 899 | auto [ x2, y2 ] = f2(); 900 | auto& [ xr2, yr2 ] = f2(); 901 | const auto [ x3, y3 ] = f3(); 902 | 903 | } 904 | 905 | namespace test_exception_spec_type_system 906 | { 907 | 908 | struct Good {}; 909 | struct Bad {}; 910 | 911 | void g1() noexcept; 912 | void g2(); 913 | 914 | template 915 | Bad 916 | f(T*, T*); 917 | 918 | template 919 | Good 920 | f(T1*, T2*); 921 | 922 | static_assert (std::is_same_v); 923 | 924 | } 925 | 926 | namespace test_inline_variables 927 | { 928 | 929 | template void f(T) 930 | {} 931 | 932 | template inline T g(T) 933 | { 934 | return T{}; 935 | } 936 | 937 | template<> inline void f<>(int) 938 | {} 939 | 940 | template<> int g<>(int) 941 | { 942 | return 5; 943 | } 944 | 945 | } 946 | 947 | } // namespace cxx17 948 | 949 | #endif // __cplusplus < 201703L 950 | 951 | ]]) 952 | -------------------------------------------------------------------------------- /src/cmdlineparse.hh: -------------------------------------------------------------------------------- 1 | // 2 | // One header file Commandline Parse for C++11 2019-11-17.07 3 | // https://github.com/trueroad/cmdlineparse 4 | // 5 | // Copyright (C) 2016, 2017, 2019 Masamichi Hosoda. All rights reserved. 6 | // 7 | // Redistribution and use in source and binary forms, with or without 8 | // modification, are permitted provided that the following conditions 9 | // are met: 10 | // 11 | // * Redistributions of source code must retain the above copyright notice, 12 | // this list of conditions and the following disclaimer. 13 | // 14 | // * Redistributions in binary form must reproduce the above copyright notice, 15 | // this list of conditions and the following disclaimer in the documentation 16 | // and/or other materials provided with the distribution. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | // ARE DISCLAIMED. 22 | // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 | // SUCH DAMAGE. 30 | // 31 | 32 | #ifndef INCLUDE_GUARD_CMDLINEPARSE_HH 33 | #define INCLUDE_GUARD_CMDLINEPARSE_HH 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | // gettext macros 43 | #ifndef _ 44 | 45 | #define _(DUMMY) DUMMY 46 | #define CMDLINEPARSE_HH_DEFINE_DUMMY_GETTEXT 47 | 48 | #endif 49 | 50 | namespace cmdlineparse 51 | { 52 | 53 | enum class arg_mode 54 | { 55 | no_argument = 0, 56 | required_argument = 1, 57 | optional_argument = 2 58 | }; 59 | 60 | enum class abort_reason 61 | { 62 | option_handler = -1, 63 | no_abort = 0, 64 | error_extra_arg, 65 | error_no_arg, 66 | error_ambiguous_option, 67 | error_unknown_option, 68 | error_no_arg_short, 69 | error_unknown_option_short, 70 | }; 71 | 72 | class parser 73 | { 74 | public: 75 | // Add an option handler 76 | bool add_handler (char short_name, const std::string &long_name, 77 | arg_mode has_arg, 78 | std::function option_handler, 79 | const std::string &description = "", 80 | const std::string &typestr = "", 81 | const std::string &header = "", 82 | const std::string &group = ""); 83 | 84 | // Add a flag option 85 | bool add_flag (char short_name, const std::string &long_name, 86 | bool *flag, 87 | const std::string &description = "", 88 | const std::string &group = ""); 89 | 90 | // Add a string option 91 | bool add_string (char short_name, const std::string &long_name, 92 | std::string *var, const std::string &defval, 93 | const std::string &description = "", 94 | const std::string &typestr = "STRING", 95 | const std::string &group = ""); 96 | 97 | // Add help description 98 | void add_description (char short_name, const std::string &long_name, 99 | arg_mode has_arg, 100 | const std::string &description, 101 | const std::string &typestr = "", 102 | const std::string &header = "", 103 | const std::string &group = ""); 104 | 105 | // Add default handler 106 | bool add_default_help (void); 107 | bool add_default_version (void); 108 | bool add_default (void) 109 | { 110 | if (!add_default_help ()) 111 | return false; 112 | return add_default_version (); 113 | } 114 | 115 | // Parse options 116 | bool parse (int argc, char const* const* argv, int optind = 1); 117 | 118 | // Build default strings 119 | std::string build_usage (void) const; 120 | std::string build_help (void) const; 121 | 122 | // Get version_string 123 | const std::string &get_version_string () const noexcept 124 | { 125 | return version_string; 126 | } 127 | 128 | // Set version_string 129 | void set_version_string (const std::string &s) 130 | { 131 | version_string = s; 132 | } 133 | 134 | // Get unamed_args 135 | const std::vector &get_unamed_args () const noexcept 136 | { 137 | return unamed_args; 138 | } 139 | 140 | // Set usage_unamed_opts 141 | void set_usage_unamed_opts (const std::string &s) 142 | { 143 | usage_unamed_opts = s; 144 | } 145 | 146 | // Get abort reason 147 | abort_reason get_abort_reason () const noexcept 148 | { 149 | return abort; 150 | } 151 | 152 | // Get abort option 153 | const std::string &get_abort_option () const noexcept 154 | { 155 | return abort_option; 156 | } 157 | 158 | // Set opterr 159 | void set_opterr (bool flag) noexcept 160 | { 161 | opterr = flag; 162 | } 163 | 164 | // Set long_only 165 | void set_long_only (bool flag) noexcept 166 | { 167 | long_only = flag; 168 | } 169 | 170 | // Set continue_on_error 171 | void set_continue_on_error (bool flag) noexcept 172 | { 173 | continue_on_error = flag; 174 | } 175 | 176 | // Set abbreviated_long_name 177 | void set_abbreviated_long_name (bool flag) noexcept 178 | { 179 | abbreviated_long_name = flag; 180 | } 181 | 182 | // Constructor 183 | parser (); 184 | 185 | // Const 186 | const std::string h_indent {" "}; 187 | const std::string h_space {" "}; 188 | const std::string d_indent {" "}; 189 | 190 | private: 191 | // Internal functions 192 | void add_short (char short_name, arg_mode has_arg, 193 | std::function option_handler); 194 | void add_long (const std::string &long_name, arg_mode has_arg, 195 | std::function option_handler); 196 | bool find_unique_long_name (const std::string &long_name, 197 | std::pair> 199 | *target, 200 | bool *ambiguous); 201 | bool parse_long_name (std::vector::const_iterator *it); 202 | bool parse_short_name (std::vector::const_iterator *it); 203 | 204 | // Error handlers 205 | std::function error_extra_arg 207 | { 208 | [this](const std::string &long_name, const std::string &)->bool 209 | { 210 | if (opterr) 211 | std::cerr << argvs[0] 212 | << _(": option doesn't take an argument -- ") 213 | << long_name << std::endl; 214 | return continue_on_error; 215 | } 216 | }; 217 | std::function error_no_arg 218 | { 219 | [this](const std::string &long_name)->bool 220 | { 221 | if (opterr) 222 | std::cerr << argvs[0] 223 | << _(": option requires an argument -- ") 224 | << long_name << std::endl; 225 | return continue_on_error; 226 | } 227 | }; 228 | std::function error_ambiguous_option 229 | { 230 | [this](const std::string &optchars)->bool 231 | { 232 | if (opterr) 233 | std::cerr << argvs[0] 234 | << _(": ambiguous option -- ") 235 | << optchars << std::endl; 236 | return continue_on_error; 237 | } 238 | }; 239 | std::function error_unknown_option 240 | { 241 | [this](const std::string &optchars)->bool 242 | { 243 | if (opterr) 244 | std::cerr << argvs[0] 245 | << _(": unknown option -- ") 246 | << optchars << std::endl; 247 | return continue_on_error; 248 | } 249 | }; 250 | std::function error_no_arg_short 251 | { 252 | [this](char optchar)->bool 253 | { 254 | if (opterr) 255 | std::cerr << argvs[0] 256 | << _(": option requires an argument -- ") 257 | << optchar << std::endl; 258 | return continue_on_error; 259 | } 260 | }; 261 | std::function error_unknown_option_short 262 | { 263 | [this](char optchar)->bool 264 | { 265 | if (opterr) 266 | std::cerr << argvs[0] 267 | << _(": unknown option -- ") 268 | << optchar << std::endl; 269 | return continue_on_error; 270 | } 271 | }; 272 | 273 | // Help strings 274 | std::string version_string; 275 | std::string usage_unamed_opts; 276 | 277 | // Flags 278 | bool opterr {true}; 279 | bool continue_on_error {false}; 280 | bool long_only {false}; 281 | bool abbreviated_long_name {true}; 282 | 283 | // Abort reason 284 | abort_reason abort {abort_reason::no_abort}; 285 | std::string abort_option; 286 | 287 | // Arguments 288 | std::vector argvs; 289 | std::vector unamed_args; 290 | 291 | // Maps 292 | std::map>> short_map; 295 | std::map>> long_map; 298 | std::map> help_map; 299 | }; 300 | 301 | inline 302 | parser::parser () 303 | { 304 | #ifdef PACKAGE_STRING 305 | // Build version_string 306 | std::stringstream ss; 307 | ss << PACKAGE_STRING << std::endl; 308 | 309 | #ifdef PACKAGE_COPYRIGHT 310 | ss << PACKAGE_COPYRIGHT << std::endl; 311 | #ifdef PACKAGE_LICENSE 312 | ss << PACKAGE_LICENSE << std::endl; 313 | #endif // PACKAGE_LICENSE 314 | #endif // PACKAGE_COPYRIGHT 315 | 316 | #ifdef PACKAGE_URL 317 | ss << std::endl << PACKAGE_URL << std::endl; 318 | #endif // PACKAGE_URL 319 | 320 | version_string = ss.str (); 321 | #endif // PACKAGE_STRING 322 | } 323 | 324 | inline void 325 | parser::add_short (char short_name, arg_mode has_arg, 326 | std::function option_handler) 327 | { 328 | if (short_name) 329 | { 330 | short_map[short_name] = 331 | std::pair> 332 | (has_arg, option_handler); 333 | } 334 | } 335 | 336 | inline void 337 | parser::add_long (const std::string &long_name, arg_mode has_arg, 338 | std::function option_handler) 339 | { 340 | if (!long_name.empty ()) 341 | { 342 | long_map[long_name] = 343 | std::pair> 344 | (has_arg, option_handler); 345 | } 346 | } 347 | 348 | inline void 349 | parser::add_description (char short_name, const std::string &long_name, 350 | arg_mode has_arg, 351 | const std::string &description, 352 | const std::string &typestr, 353 | const std::string &header, 354 | const std::string &group) 355 | { 356 | std::stringstream ss; 357 | 358 | if (short_name || !long_name.empty () || !header.empty ()) 359 | { 360 | ss << h_indent; 361 | if (short_name) 362 | { 363 | ss << "-" << short_name; 364 | if (long_name.empty () && !typestr.empty ()) 365 | ss << typestr; 366 | } 367 | if (!long_name.empty ()) 368 | { 369 | if (short_name) 370 | ss << ", "; 371 | ss << "--" << long_name; 372 | if (!typestr.empty ()) 373 | { 374 | switch (has_arg) 375 | { 376 | case arg_mode::no_argument: 377 | // Nothing to do 378 | break; 379 | case arg_mode::required_argument: 380 | ss << "=" << typestr; 381 | break; 382 | case arg_mode::optional_argument: 383 | ss << "[=" << typestr << "]"; 384 | break; 385 | } 386 | } 387 | } 388 | if (!header.empty ()) 389 | ss << header; 390 | ss << std::endl; 391 | } 392 | 393 | if (!description.empty ()) 394 | ss << description << std::endl; 395 | 396 | help_map[group].push_back (ss.str ()); 397 | } 398 | 399 | inline bool 400 | parser::add_handler (char short_name, const std::string &long_name, 401 | arg_mode has_arg, 402 | std::function option_handler, 403 | const std::string &description, 404 | const std::string &typestr, 405 | const std::string &header, 406 | const std::string &group) 407 | { 408 | if (short_name && short_map.find (short_name) != short_map.end ()) 409 | { 410 | // The short name is already registerd. 411 | return false; 412 | } 413 | if (!long_name.empty() && long_map.find (long_name) != long_map.end ()) 414 | { 415 | // The long name is already registerd. 416 | return false; 417 | } 418 | 419 | add_short (short_name, has_arg, option_handler); 420 | add_long (long_name, has_arg, option_handler); 421 | 422 | if (!description.empty () || !typestr.empty ()) 423 | add_description (short_name, long_name, has_arg, 424 | description, typestr, header, group); 425 | 426 | return true; 427 | } 428 | 429 | inline bool 430 | parser::add_flag (char short_name, const std::string &long_name, 431 | bool *flag, 432 | const std::string &description, 433 | const std::string &group) 434 | { 435 | *flag = false; 436 | return add_handler (short_name, long_name, arg_mode::no_argument, 437 | [flag](const std::string &)->bool 438 | { 439 | *flag = true; 440 | return true; 441 | }, 442 | description, "", "", group); 443 | } 444 | 445 | inline bool 446 | parser::add_string (char short_name, const std::string &long_name, 447 | std::string *var, const std::string &defval, 448 | const std::string &description, 449 | const std::string &typestr, 450 | const std::string &group) 451 | { 452 | *var = defval; 453 | std::string header; 454 | 455 | if(!defval.empty ()) 456 | { 457 | // Three spaces for separator (same as h_space) 458 | header = _(" (default=") + defval + ")"; 459 | } 460 | return add_handler (short_name, long_name, arg_mode::required_argument, 461 | [var](const std::string &optarg)->bool 462 | { 463 | *var = optarg; 464 | return true; 465 | }, 466 | description, typestr, header, group); 467 | } 468 | 469 | inline bool 470 | parser::add_default_help (void) 471 | { 472 | return add_handler ('h', "help", arg_mode::no_argument, 473 | [this](const std::string &)->bool 474 | { 475 | std::cout << build_help (); 476 | return false; 477 | }, 478 | // Four spaces for indent (same as d_indent) 479 | _(" Print help and exit")); 480 | } 481 | 482 | inline bool 483 | parser::add_default_version (void) 484 | { 485 | return add_handler ('V', "version", arg_mode::no_argument, 486 | [this](const std::string &)->bool 487 | { 488 | std::cout << version_string; 489 | return false; 490 | }, 491 | // Four spaces for indent (same as d_indent) 492 | _(" Print version and exit")); 493 | } 494 | 495 | inline bool 496 | parser::find_unique_long_name (const std::string &long_name, 497 | std::pair> 499 | *target, 500 | bool *ambiguous) 501 | { 502 | if (long_map.find (long_name) != long_map.end ()) 503 | { 504 | // Long option name found 505 | *target = long_map[long_name]; 506 | return true; 507 | } 508 | if (abbreviated_long_name) 509 | { 510 | // Search abbreviated long option name 511 | for (decltype (long_map.cbegin()) it = 512 | long_map.upper_bound (long_name); 513 | it != long_map.cend (); 514 | ++it) 515 | { 516 | if (it->first.substr (0, long_name.size ()) == long_name) 517 | { 518 | auto next_it = it; 519 | ++next_it; 520 | if (next_it != long_map.cend () && 521 | next_it->first.substr (0, long_name.size ()) == long_name) 522 | { 523 | // Failed: ambiguous option 524 | *ambiguous = true; 525 | return false; 526 | } 527 | // Unique abbreviated long option name fount 528 | *target = it->second; 529 | return true; 530 | } 531 | } 532 | } 533 | // Failed: unknown long option name 534 | *ambiguous = false; 535 | return false; 536 | } 537 | 538 | inline bool 539 | parser::parse_long_name (std::vector::const_iterator *it) 540 | { 541 | size_t optchars_pos = (*it)->at(1) == '-' ? 542 | 2: // Double hyphen (--long_name) style 543 | 1; // Single hyphen (-long_name) style 544 | auto delimiter_pos = (*it)->find ('='); 545 | std::string long_name = (*it)->substr (optchars_pos, 546 | delimiter_pos - optchars_pos); 547 | std::string optarg; 548 | 549 | if (delimiter_pos != std::string::npos) 550 | { 551 | // Option characters have an option argument 552 | // (something like --long_name=optarg) 553 | optarg = (*it)->substr (delimiter_pos + 1, std::string::npos); 554 | } 555 | 556 | std::pair> target; 557 | bool ambiguous; 558 | if (find_unique_long_name (long_name, &target, &ambiguous)) 559 | { 560 | // Long option name found 561 | switch (target.first) 562 | { 563 | case arg_mode::no_argument: 564 | if (delimiter_pos != std::string::npos) 565 | { 566 | if (!error_extra_arg (long_name, optarg)) 567 | { 568 | abort = abort_reason::error_extra_arg; 569 | abort_option = long_name; 570 | return false; 571 | } 572 | return true; 573 | } 574 | break; 575 | case arg_mode::required_argument: 576 | if (delimiter_pos == std::string::npos) 577 | { 578 | if ((*it + 1) != argvs.cend ()) 579 | { 580 | // Next argv-element is an option argument 581 | // (something like --long_name optarg) 582 | optarg = *(++(*it)); 583 | } 584 | else 585 | { 586 | if (!error_no_arg (long_name)) 587 | { 588 | abort = abort_reason::error_no_arg; 589 | abort_option = long_name; 590 | return false; 591 | } 592 | return true; 593 | } 594 | } 595 | break; 596 | case arg_mode::optional_argument: 597 | // Nothing to do 598 | break; 599 | } 600 | // Call option handler 601 | if (!target.second (optarg)) 602 | { 603 | abort = abort_reason::option_handler; 604 | abort_option = long_name; 605 | return false; 606 | } 607 | return true; 608 | } 609 | 610 | // Long option name did not find 611 | if (optchars_pos == 1) 612 | { 613 | // Fallback to short option name 614 | return parse_short_name (it); 615 | } 616 | if (ambiguous) 617 | { 618 | if (!error_ambiguous_option ((*it)->substr (optchars_pos, 619 | std::string::npos))) 620 | { 621 | abort = abort_reason::error_ambiguous_option; 622 | abort_option = long_name; 623 | return false; 624 | } 625 | return true; 626 | } 627 | if (!error_unknown_option ((*it)->substr (optchars_pos, 628 | std::string::npos))) 629 | { 630 | abort = abort_reason::error_unknown_option; 631 | abort_option = long_name; 632 | return false; 633 | } 634 | return true; 635 | } 636 | 637 | inline bool 638 | parser::parse_short_name (std::vector::const_iterator *it) 639 | { 640 | for (auto name_it = (*it)->cbegin () + 1; 641 | name_it != (*it)->cend (); 642 | ++name_it) 643 | { 644 | if (short_map.find (*name_it) != short_map.end()) 645 | { 646 | // Short option name found 647 | auto target = short_map[*name_it]; 648 | switch (target.first) 649 | { 650 | case arg_mode::no_argument: 651 | // Option characters doesn't have an option argument 652 | // (something like -o) 653 | if (!target.second ("")) // Call option handler 654 | { 655 | abort = abort_reason::option_handler; 656 | abort_option = *name_it; 657 | return false; 658 | } 659 | break; 660 | case arg_mode::required_argument: 661 | if ((name_it + 1) != (*it)->cend ()) 662 | { 663 | // Option characters have an option argument 664 | // (something like -ooptarg) 665 | std::string optarg (name_it + 1, (*it)->cend ()); 666 | if (!target.second (optarg)) // Call option handler 667 | { 668 | abort = abort_reason::option_handler; 669 | abort_option = *name_it; 670 | return false; 671 | } 672 | return true; 673 | } 674 | else if ((*it + 1) != argvs.cend ()) 675 | { 676 | // Next argv-element is an option argument 677 | // (something like -o optarg) 678 | std::string optarg = *(++(*it)); 679 | if (!target.second (optarg)) // Call option handler 680 | { 681 | abort = abort_reason::option_handler; 682 | abort_option = *name_it; 683 | return false; 684 | } 685 | return true; 686 | } 687 | else 688 | { 689 | if (!error_no_arg_short (*name_it)) 690 | { 691 | abort = abort_reason::error_no_arg_short; 692 | abort_option = *name_it; 693 | return false; 694 | } 695 | } 696 | break; 697 | case arg_mode::optional_argument: 698 | if ((name_it + 1) != (*it)->cend ()) 699 | { 700 | // Option characters have an option argument 701 | // (something like -ooptarg) 702 | std::string optarg (name_it + 1, (*it)->cend ()); 703 | if (!target.second (optarg)) // Call option handler 704 | { 705 | abort = abort_reason::option_handler; 706 | abort_option = *name_it; 707 | return false; 708 | } 709 | return true; 710 | } 711 | // Option characters doesn't have an option argument 712 | // (something like -o) 713 | if (!target.second ("")) // Call option handler 714 | { 715 | abort = abort_reason::option_handler; 716 | abort_option = *name_it; 717 | return false; 718 | } 719 | break; 720 | } 721 | } 722 | else 723 | { 724 | if (!error_unknown_option_short (*name_it)) 725 | { 726 | abort = abort_reason::error_unknown_option_short; 727 | abort_option = *name_it; 728 | return false; 729 | } 730 | } 731 | } 732 | return true; 733 | } 734 | 735 | inline bool 736 | parser::parse (int argc, char const* const* argv, int optind) 737 | { 738 | argvs.assign (argv, argv + argc); 739 | 740 | if (version_string.empty ()) 741 | version_string = argvs[0] + "\n"; 742 | 743 | bool all_skip = false; 744 | 745 | for (auto argv_it = argvs.cbegin () + optind; 746 | argv_it != argvs.cend (); 747 | ++argv_it) 748 | { 749 | // Check skip 750 | if (all_skip) 751 | { 752 | unamed_args.push_back (*argv_it); 753 | continue; 754 | } 755 | 756 | // Check "--" and "-" 757 | // They are not option element. 758 | if (*argv_it == "--") 759 | { 760 | all_skip = true; 761 | continue; 762 | } 763 | else if (*argv_it == "-") 764 | { 765 | unamed_args.push_back (*argv_it); 766 | continue; 767 | } 768 | 769 | // Check option element 770 | if (argv_it->substr (0, 2) == "--") 771 | { 772 | // Long option 773 | if (!parse_long_name (&argv_it)) 774 | return false; 775 | } 776 | else if (argv_it->substr (0, 1) == "-") 777 | { 778 | if (long_only) 779 | { 780 | // Long option 781 | if (!parse_long_name (&argv_it)) 782 | return false; 783 | } 784 | else 785 | { 786 | // Short option 787 | if (!parse_short_name (&argv_it)) 788 | return false; 789 | } 790 | } 791 | else 792 | { 793 | // It is not an option element. 794 | unamed_args.push_back (*argv_it); 795 | } 796 | } 797 | return true; 798 | } 799 | 800 | inline std::string 801 | parser::build_usage (void) const 802 | { 803 | std::stringstream ss; 804 | 805 | ss << _("Usage: ") << argvs[0] << _(" [options] "); 806 | if (!usage_unamed_opts.empty ()) 807 | ss << "[" << usage_unamed_opts << "] "; 808 | ss << "..." << std::endl; 809 | 810 | return ss.str (); 811 | } 812 | 813 | inline std::string 814 | parser::build_help (void) const 815 | { 816 | std::stringstream ss; 817 | 818 | ss << version_string << std::endl << build_usage () << std::endl; 819 | 820 | for (const auto &group: help_map) 821 | { 822 | if (!group.first.empty ()) 823 | ss << std::endl << group.first << ":" << std::endl; 824 | for (const auto &description: group.second) 825 | { 826 | ss << description; 827 | } 828 | } 829 | 830 | return ss.str (); 831 | } 832 | 833 | } 834 | 835 | #if defined (_) && defined (CMDLINEPARSE_HH_DEFINE_DUMMY_GETTEXT) 836 | #undef _ 837 | #undef CMDLINEPARSE_HH_DEFINE_DUMMY_GETTEXT 838 | #endif 839 | 840 | #endif 841 | --------------------------------------------------------------------------------